hammer2 - stabilization, improvements
[dragonfly.git] / sys / vfs / hammer2 / hammer2_chain.c
1 /*
2  * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem implements most of the core support functions for
37  * the hammer2_chain structure.
38  *
39  * Chains are the in-memory version of media objects (volume header, inodes,
40  * indirect blocks, data blocks, etc).  Chains represent a portion of the
41  * HAMMER2 topology.
42  *
43  * A chain is topologically stable once it has been inserted into the
44  * in-memory topology.  Modifications which copy, move, or resize the chain
45  * are handled via the DELETE-DUPLICATE mechanic where the original chain
46  * stays intact but is marked deleted and a new chain is allocated which
47  * shares the old chain's children.
48  *
49  * This sharing is handled via the hammer2_chain_core structure.
50  *
51  * The DELETE-DUPLICATE mechanism allows the same topological level to contain
52  * many overloadings.  However, our RBTREE mechanics require that there be
53  * no overlaps so we accomplish the overloading by moving conflicting chains
54  * with smaller or equal radii into a sub-RBTREE under the chain being
55  * overloaded.
56  *
57  * DELETE-DUPLICATE is also used when a modification to a chain crosses a
58  * flush synchronization boundary, allowing the flush code to continue flushing
59  * the older version of the topology and not be disrupted by new frontend
60  * operations.
61  *
62  *                              LIVE VS FLUSH VIEW
63  *
64  * All lookup and iterate operations and most modifications are done on the
65  * live view.  During flushes lookups are not normally done and modifications
66  * may be run on the flush view.  However, flushes often need to allocate
67  * blocks and the freemap_alloc/free code issues lookups.  This code is
68  * special cased to use the live view when called from a flush.
69  *
70  * General chain lookup/iteration functions are NOT aware of the flush view,
71  * they only know about live views.
72  */
73 #include <sys/cdefs.h>
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/types.h>
77 #include <sys/lock.h>
78 #include <sys/kern_syscall.h>
79 #include <sys/uuid.h>
80
81 #include "hammer2.h"
82
83 static int hammer2_indirect_optimize;   /* XXX SYSCTL */
84
85 static hammer2_chain_t *hammer2_chain_create_indirect(
86                 hammer2_trans_t *trans, hammer2_chain_t *parent,
87                 hammer2_key_t key, int keybits, int for_type, int *errorp);
88 static void hammer2_chain_drop_data(hammer2_chain_t *chain, int lastdrop);
89 static void adjreadcounter(hammer2_blockref_t *bref, size_t bytes);
90 static hammer2_chain_t *hammer2_combined_find(
91                 hammer2_chain_t *parent,
92                 hammer2_blockref_t *base, int count,
93                 int *cache_indexp, hammer2_key_t *key_nextp,
94                 hammer2_key_t key_beg, hammer2_key_t key_end,
95                 hammer2_blockref_t **bresp);
96 static void hammer2_chain_assert_not_present(hammer2_chain_core_t *above,
97                                  hammer2_chain_t *chain);
98
99 hammer2_chain_t *XXChain;
100 int XXChainWhy;
101 /*
102  * Basic RBTree for chains.  Chains cannot overlap within any given
103  * core->rbtree without recursing through chain->rbtree.  We effectively
104  * guarantee this by checking the full range rather than just the first
105  * key element.  By matching on the full range callers can detect when
106  * recursion through chain->rbtree is needed.
107  *
108  * NOTE: This also means that a delete-duplicate on the same key will
109  *       overload by placing the deleted element in the new element's
110  *       chain->rbtree (when doing a direct replacement).
111  */
112 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
113
114 int
115 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
116 {
117         hammer2_key_t c1_beg;
118         hammer2_key_t c1_end;
119         hammer2_key_t c2_beg;
120         hammer2_key_t c2_end;
121
122         c1_beg = chain1->bref.key;
123         c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1;
124         c2_beg = chain2->bref.key;
125         c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1;
126
127         if (c1_end < c2_beg)    /* fully to the left */
128                 return(-1);
129         if (c1_beg > c2_end)    /* fully to the right */
130                 return(1);
131         return(0);              /* overlap (must not cross edge boundary) */
132 }
133
134 static __inline
135 int
136 hammer2_isclusterable(hammer2_chain_t *chain)
137 {
138         if (hammer2_cluster_enable) {
139                 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
140                     chain->bref.type == HAMMER2_BREF_TYPE_INODE ||
141                     chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
142                         return(1);
143                 }
144         }
145         return(0);
146 }
147
148 /*
149  * Recursively set the update_hi flag up to the root starting at chain's
150  * parent->core.  update_hi is not set in chain's core.
151  *
152  * This controls top-down visibility for flushes.  The child has just one
153  * 'above' core, but the core itself can be multi-homed with parents iterated
154  * via core->ownerq.
155  *
156  * This function is not used during a flush (except when the flush is
157  * allocating which requires the live tree).  The flush keeps track of its
158  * recursion itself.
159  *
160  * XXX needs to be optimized to use roll-up TIDs.  update_hi is only really
161  * compared against bref.mirror_tid which itself is only updated by a flush.
162  */
163 void
164 hammer2_chain_setsubmod(hammer2_trans_t *trans, hammer2_chain_t *chain)
165 {
166         hammer2_chain_core_t *above;
167
168         while ((above = chain->above) != NULL) {
169                 spin_lock(&above->cst.spin);
170                 /* XXX optimize */
171                 if (above->update_hi < trans->sync_tid)
172                         above->update_hi = trans->sync_tid;
173                 chain = TAILQ_LAST(&above->ownerq, h2_core_list);
174 #if 0
175                 TAILQ_FOREACH_REVERSE(chain, &above->ownerq,
176                                       h2_core_list, core_entry) {
177                         if (trans->sync_tid >= chain->modify_tid &&
178                             trans->sync_tid <= chain->delete_tid) {
179                                 break;
180                         }
181                 }
182 #endif
183                 spin_unlock(&above->cst.spin);
184         }
185 }
186
187 /*
188  * Allocate a new disconnected chain element representing the specified
189  * bref.  chain->refs is set to 1 and the passed bref is copied to
190  * chain->bref.  chain->bytes is derived from the bref.
191  *
192  * chain->core is NOT allocated and the media data and bp pointers are left
193  * NULL.  The caller must call chain_core_alloc() to allocate or associate
194  * a core with the chain.
195  *
196  * NOTE: Returns a referenced but unlocked (because there is no core) chain.
197  */
198 hammer2_chain_t *
199 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_pfsmount_t *pmp,
200                     hammer2_trans_t *trans, hammer2_blockref_t *bref)
201 {
202         hammer2_chain_t *chain;
203         u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
204
205         /*
206          * Construct the appropriate system structure.
207          */
208         switch(bref->type) {
209         case HAMMER2_BREF_TYPE_INODE:
210         case HAMMER2_BREF_TYPE_INDIRECT:
211         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
212         case HAMMER2_BREF_TYPE_DATA:
213         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
214                 /*
215                  * Chain's are really only associated with the hmp but we
216                  * maintain a pmp association for per-mount memory tracking
217                  * purposes.  The pmp can be NULL.
218                  */
219                 chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
220                 if (pmp)
221                         chain->pmp = pmp;
222                 break;
223         case HAMMER2_BREF_TYPE_VOLUME:
224         case HAMMER2_BREF_TYPE_FREEMAP:
225                 chain = NULL;
226                 panic("hammer2_chain_alloc volume type illegal for op");
227         default:
228                 chain = NULL;
229                 panic("hammer2_chain_alloc: unrecognized blockref type: %d",
230                       bref->type);
231         }
232
233         chain->hmp = hmp;
234         chain->bref = *bref;
235         chain->bytes = bytes;
236         chain->refs = 1;
237         chain->flags = HAMMER2_CHAIN_ALLOCATED;
238         chain->delete_tid = HAMMER2_MAX_TID;
239
240         /*
241          * Set modify_tid if a transaction is creating the chain.  When
242          * loading a chain from backing store trans is passed as NULL and
243          * modify_tid is left set to 0.
244          */
245         if (trans)
246                 chain->modify_tid = trans->sync_tid;
247
248         return (chain);
249 }
250
251 /*
252  * Associate an existing core with the chain or allocate a new core.
253  *
254  * The core is not locked.  No additional refs on the chain are made.
255  * (trans) must not be NULL if (core) is not NULL.
256  *
257  * When chains are delete-duplicated during flushes we insert nchain on
258  * the ownerq after ochain instead of at the end in order to give the
259  * drop code visibility in the correct order, otherwise drops can be missed.
260  */
/*
 * (trans) may be NULL only when (ochain) is NULL; when sharing ochain's
 * core the DUPLICATED/ownerq handling below assumes an active transaction.
 */
void
hammer2_chain_core_alloc(hammer2_trans_t *trans,
			 hammer2_chain_t *nchain, hammer2_chain_t *ochain)
{
	hammer2_chain_core_t *core;

	/* nchain must not already have a core */
	KKASSERT(nchain->core == NULL);

	if (ochain == NULL) {
		/*
		 * Fresh core under nchain (no multi-homing of ochain's
		 * sub-tree).  With no transaction, update_hi is seeded
		 * from the media blockref's mirror_tid instead.
		 */
		core = kmalloc(sizeof(*core), nchain->hmp->mchain,
			       M_WAITOK | M_ZERO);
		TAILQ_INIT(&core->layerq);
		TAILQ_INIT(&core->ownerq);
		core->sharecnt = 1;
		core->good = 0x1234;	/* magic for corruption detection */
		if (trans)
			core->update_hi = trans->sync_tid;
		else
			core->update_hi = nchain->bref.mirror_tid;
		nchain->core = core;
		ccms_cst_init(&core->cst, nchain);
		TAILQ_INSERT_TAIL(&core->ownerq, nchain, core_entry);
	} else {
		/*
		 * Propagate the PFSROOT flag which we set on all subdirs
		 * under the super-root.
		 */
		atomic_set_int(&nchain->flags,
			       ochain->flags & HAMMER2_CHAIN_PFSROOT);

		/*
		 * Duplicating ochain -> nchain.  Set the DUPLICATED flag on
		 * ochain if nchain is not a snapshot.
		 *
		 * It is possible for the DUPLICATED flag to already be
		 * set when called via a flush operation because flush
		 * operations may have to work on elements with delete_tid's
		 * beyond the flush sync_tid.  In this situation we must
		 * ensure that nchain is placed just after ochain in the
		 * ownerq and that the DUPLICATED flag is set on nchain so
		 * 'live' operations skip past it to the correct chain.
		 *
		 * The flusher understands the blockref synchronization state
		 * for any stale chains by observing bref.mirror_tid, which
		 * delete-duplicate replicates.
		 *
		 * WARNING! However, the case is disallowed when the flusher
		 *          is allocating freemap space because this entails
		 *          more than just adjusting a block table.
		 */
		if (ochain->flags & HAMMER2_CHAIN_DUPLICATED) {
			/* only legal from a flush that is not allocating */
			KKASSERT((trans->flags &
				  (HAMMER2_TRANS_ISFLUSH |
				   HAMMER2_TRANS_ISALLOCATING)) ==
				 HAMMER2_TRANS_ISFLUSH);
			atomic_set_int(&nchain->flags,
				       HAMMER2_CHAIN_DUPLICATED);
		}
		if ((nchain->flags & HAMMER2_CHAIN_SNAPSHOT) == 0) {
			atomic_set_int(&ochain->flags,
				       HAMMER2_CHAIN_DUPLICATED);
		}
		core = ochain->core;
		atomic_add_int(&core->sharecnt, 1);

		spin_lock(&core->cst.spin);
		nchain->core = core;

#if 0
		if (core->update_hi < trans->sync_tid)
			core->update_hi = trans->sync_tid;
#endif

		/*
		 * Maintain ordering for refactor test so we don't skip over
		 * a snapshot.  Also, during flushes, delete-duplications
		 * for block-table updates can occur on blocks already
		 * deleted (delete-duplicated by a later transaction).  We
		 * must insert nchain after ochain but before the later
		 * transaction's copy.
		 */
		TAILQ_INSERT_AFTER(&core->ownerq, ochain, nchain, core_entry);

		spin_unlock(&core->cst.spin);
	}
}
351
/*
 * Add a reference to a chain element, preventing its destruction.
 *
 * NOTE(review): a bare increment cannot resurrect a chain racing its
 * 1->0 lastdrop transition; callers presumably already hold a ref or
 * the relevant core spinlock -- confirm against call sites.
 */
void
hammer2_chain_ref(hammer2_chain_t *chain)
{
	atomic_add_int(&chain->refs, 1);
}
360
361 /*
362  * Insert the chain in the core rbtree at the first layer
363  * which accepts it (for now we don't sort layers by the transaction tid)
364  */
365 #define HAMMER2_CHAIN_INSERT_SPIN       0x0001
366 #define HAMMER2_CHAIN_INSERT_LIVE       0x0002
367 #define HAMMER2_CHAIN_INSERT_RACE       0x0004
368
/*
 * Insert (chain) into (above)'s RBTREE structure.
 *
 * If (layer) is non-NULL the chain is placed in the layer just prior to
 * it (allocating an intermediate layer if needed); otherwise the first
 * layer that accepts the chain without collision is used, again
 * allocating a new layer on collision.
 *
 * flags: INSERT_SPIN  - acquire/release above->cst.spin here.
 *        INSERT_LIVE  - account the chain in above->live_count if it is
 *                       not DELETED.
 *        INSERT_RACE  - fail with EAGAIN if above->generation no longer
 *                       matches (generation); mutually exclusive with a
 *                       non-NULL (layer).
 *
 * Returns 0 on success or EAGAIN on a detected SMP race.
 */
static
int
hammer2_chain_insert(hammer2_chain_core_t *above, hammer2_chain_layer_t *layer,
		     hammer2_chain_t *chain, int flags, int generation)
{
	hammer2_chain_t *xchain;
	hammer2_chain_layer_t *nlayer;
	int error = 0;

	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
		spin_lock(&above->cst.spin);
	hammer2_chain_assert_not_present(above, chain);

	/*
	 * Special case, place the chain in the next most-recent layer as the
	 * specified layer, inserting a layer in between if necessary.
	 */
	if (layer) {
		KKASSERT((flags & HAMMER2_CHAIN_INSERT_RACE) == 0);
		nlayer = TAILQ_PREV(layer, h2_layer_list, entry);
		if (nlayer && RB_INSERT(hammer2_chain_tree,
					&nlayer->rbtree, chain) == NULL) {
			layer = nlayer;
			goto done;
		}

		/*
		 * Must drop the spinlock to kmalloc.  No generation check
		 * is possible here (RACE is asserted off above), so the
		 * insertion is retried unconditionally after relocking.
		 */
		spin_unlock(&above->cst.spin);
		KKASSERT((flags & HAMMER2_CHAIN_INSERT_LIVE) == 0);
		nlayer = kmalloc(sizeof(*nlayer), chain->hmp->mchain,
				 M_WAITOK | M_ZERO);
		RB_INIT(&nlayer->rbtree);
		nlayer->good = 0xABCD;	/* layer magic */
		spin_lock(&above->cst.spin);

		TAILQ_INSERT_BEFORE(layer, nlayer, entry);
		RB_INSERT(hammer2_chain_tree, &nlayer->rbtree, chain);
		layer = nlayer;
		goto done;
	}

	/*
	 * Interlocked by spinlock, check for race
	 */
	if ((flags & HAMMER2_CHAIN_INSERT_RACE) &&
	    above->generation != generation) {
		error = EAGAIN;
		goto failed;
	}

	/*
	 * Try to insert, allocate a new layer if a nominal collision
	 * occurs (a collision is different from a SMP race).
	 */
	layer = TAILQ_FIRST(&above->layerq);
	xchain = NULL;

	if (layer == NULL ||
	    (xchain = RB_INSERT(hammer2_chain_tree,
				&layer->rbtree, chain)) != NULL) {

		/*
		 * Allocate a new layer to resolve the issue.  The spinlock
		 * must be dropped for the allocation, so the generation is
		 * re-checked afterwards and the layer freed on a race.
		 */
		spin_unlock(&above->cst.spin);
		layer = kmalloc(sizeof(*layer), chain->hmp->mchain,
				M_WAITOK | M_ZERO);
		RB_INIT(&layer->rbtree);
		layer->good = 0xABCD;	/* layer magic */
		spin_lock(&above->cst.spin);

		if ((flags & HAMMER2_CHAIN_INSERT_RACE) &&
		    above->generation != generation) {
			spin_unlock(&above->cst.spin);
			kfree(layer, chain->hmp->mchain);
			spin_lock(&above->cst.spin);
			error = EAGAIN;
			goto failed;
		}

		TAILQ_INSERT_HEAD(&above->layerq, layer, entry);
		RB_INSERT(hammer2_chain_tree, &layer->rbtree, chain);
	}
done:
	chain->above = above;
	chain->inlayer = layer;
	++above->chain_count;
	++above->generation;	/* invalidate concurrent RACE inserters */
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);

	/*
	 * We have to keep track of the effective live-view blockref count
	 * so the create code knows when to push an indirect block.
	 */
	if ((flags & HAMMER2_CHAIN_INSERT_LIVE) &&
	    (chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
		atomic_add_int(&above->live_count, 1);
	}
failed:
	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
		spin_unlock(&above->cst.spin);
	return error;
}
471
472 /*
473  * Drop the caller's reference to the chain.  When the ref count drops to
474  * zero this function will disassociate the chain from its parent and
475  * deallocate it, then recursely drop the parent using the implied ref
476  * from the chain's chain->parent.
477  *
478  * WARNING! Just because we are able to deallocate a chain doesn't mean
479  *          that chain->core->rbtree is empty.  There can still be a sharecnt
480  *          on chain->core and RBTREE entries that refer to different parents.
481  */
482 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain,
483                                                struct h2_core_list *delayq);
484
void
hammer2_chain_drop(hammer2_chain_t *chain)
{
	struct h2_core_list delayq;	/* dead chains placeholding extra drops */
	hammer2_chain_t *scan;
	u_int refs;
	u_int need = 0;

	if (hammer2_debug & 0x200000)
		Debugger("drop");

	/*
	 * MOVED and MODIFIED each imply an implied ref, so the caller's
	 * ref being dropped must not be the last one in that case.
	 */
	if (chain->flags & HAMMER2_CHAIN_MOVED)
		++need;
	if (chain->flags & HAMMER2_CHAIN_MODIFIED)
		++need;
	KKASSERT(chain->refs > need);

	TAILQ_INIT(&delayq);

	while (chain) {
		refs = chain->refs;
		cpu_ccfence();	/* force refs re-read each iteration */
		KKASSERT(refs > 0);

		if (refs == 1) {
			/* 1->0 attempt; may return a parent to drop next */
			chain = hammer2_chain_lastdrop(chain, &delayq);
		} else {
			if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
				break;
			/* retry the same chain */
		}

		/*
		 * When we've exhausted lastdrop chaining pull off of delayq.
		 * chains on delayq are dead but are used to placehold other
		 * chains which we added a ref to for the purpose of dropping.
		 *
		 * NOTE(review): the break above exits the loop even if
		 * delayq is non-empty; presumably unreachable with queued
		 * entries in practice -- verify, else placeholders leak.
		 */
		if (chain == NULL) {
			hammer2_mount_t *hmp;

			if ((scan = TAILQ_FIRST(&delayq)) != NULL) {
				/* data field was repurposed to hold the target */
				chain = (void *)scan->data;
				TAILQ_REMOVE(&delayq, scan, core_entry);
				scan->flags &= ~HAMMER2_CHAIN_ALLOCATED;
				hmp = scan->hmp;
				scan->hmp = NULL;
				kfree(scan, hmp->mchain);
			}
		}
	}
}
536
537 /*
538  * Safe handling of the 1->0 transition on chain.  Returns a chain for
539  * recursive drop or NULL, possibly returning the same chain if the atomic
540  * op fails.
541  *
542  * When two chains need to be recursively dropped we use the chain
543  * we would otherwise free to placehold the additional chain.  It's a bit
544  * convoluted but we can't just recurse without potentially blowing out
545  * the kernel stack.
546  *
547  * The cst spinlock is allowed nest child-to-parent (not parent-to-child).
548  */
549 static
550 hammer2_chain_t *
551 hammer2_chain_lastdrop(hammer2_chain_t *chain, struct h2_core_list *delayq)
552 {
553         hammer2_pfsmount_t *pmp;
554         hammer2_mount_t *hmp;
555         hammer2_chain_core_t *above;
556         hammer2_chain_core_t *core;
557         hammer2_chain_layer_t *layer;
558         hammer2_chain_t *rdrop1;
559         hammer2_chain_t *rdrop2;
560
561         /*
562          * Spinlock the core and check to see if it is empty.  If it is
563          * not empty we leave chain intact with refs == 0.  The elements
564          * in core->rbtree are associated with other chains contemporary
565          * with ours but not with our chain directly.
566          */
567         if ((core = chain->core) != NULL) {
568                 spin_lock(&core->cst.spin);
569
570                 /*
571                  * We can't free non-stale chains with children until we are
572                  * able to free the children because there might be a flush
573                  * dependency.  Flushes of stale children (which should also
574                  * have their deleted flag set) short-cut recursive flush
575                  * dependencies and can be freed here.  Any flushes which run
576                  * through stale children due to the flush synchronization
577                  * point should have the MOVED bit set in the chain and not
578                  * reach lastdrop at this time.
579                  *
580                  * NOTE: We return (chain) on failure to retry.
581                  */
582                 if (core->chain_count &&
583                     (chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0) {
584                         if (atomic_cmpset_int(&chain->refs, 1, 0))
585                                 chain = NULL;   /* success */
586                         spin_unlock(&core->cst.spin);
587                         return(chain);
588                 }
589                 /* no chains left under us */
590
591                 /*
592                  * Various parts of the code might be holding a ref on a
593                  * stale chain as a placemarker which must be iterated to
594                  * locate a later non-stale (live) chain.  We must be sure
595                  * NOT to free the later non-stale chain (which might have
596                  * no refs).  Otherwise mass confusion may result.
597                  *
598                  * The DUPLICATED flag tells us whether the chain is stale
599  * or not, so the rule is that any chain whose DUPLICATED flag
600                  * is NOT set must also be at the head of the ownerq.
601                  *
602                  * Note that the DELETED flag is not involved.  That is, a
603                  * live chain can represent a deletion that has not yet been
604                  * flushed (or still has refs).
605                  */
606 #if 0
607                 if (TAILQ_NEXT(chain, core_entry) == NULL &&
608                     TAILQ_FIRST(&core->ownerq) != chain) {
609 #endif
610                 if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0 &&
611                     TAILQ_FIRST(&core->ownerq) != chain) {
612                         if (atomic_cmpset_int(&chain->refs, 1, 0))
613                                 chain = NULL;   /* success */
614                         spin_unlock(&core->cst.spin);
615                         return(chain);
616                 }
617         }
618
619         /*
620          * chain->core has no children left so no accessors can get to our
621          * chain from there.  Now we have to lock the above core to interlock
622          * remaining possible accessors that might bump chain's refs before
623          * we can safely drop chain's refs with intent to free the chain.
624          */
625         hmp = chain->hmp;
626         pmp = chain->pmp;       /* can be NULL */
627         rdrop1 = NULL;
628         rdrop2 = NULL;
629         layer = NULL;
630
631         /*
632          * Spinlock the parent and try to drop the last ref on chain.
633          * On success remove chain from its parent, otherwise return NULL.
634          *
635          * (normal core locks are top-down recursive but we define core
636          *  spinlocks as bottom-up recursive, so this is safe).
637          */
638         if ((above = chain->above) != NULL) {
639                 spin_lock(&above->cst.spin);
640                 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
641                         /* 1->0 transition failed */
642                         spin_unlock(&above->cst.spin);
643                         if (core)
644                                 spin_unlock(&core->cst.spin);
645                         return(chain);  /* retry */
646                 }
647
648                 /*
649                  * 1->0 transition successful, remove chain from its
650                  * above core.  Track layer for removal/freeing.
651                  */
652                 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
653                 layer = chain->inlayer;
654                 RB_REMOVE(hammer2_chain_tree, &layer->rbtree, chain);
655                 --above->chain_count;
656                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
657                 chain->above = NULL;
658                 chain->inlayer = NULL;
659
660                 if (RB_EMPTY(&layer->rbtree) && layer->refs == 0) {
661                         TAILQ_REMOVE(&above->layerq, layer, entry);
662                 } else {
663                         layer = NULL;
664                 }
665
666                 /*
667                  * If our chain was the last chain in the parent's core the
668                  * core is now empty and its parents might now be droppable.
669                  * Try to drop the first multi-homed parent by gaining a
670                  * ref on it here and then dropping it below.
671                  */
672                 if (above->chain_count == 0) {
673                         rdrop1 = TAILQ_FIRST(&above->ownerq);
674                         if (rdrop1 &&
675                             atomic_cmpset_int(&rdrop1->refs, 0, 1) == 0) {
676                                 rdrop1 = NULL;
677                         }
678                 }
679                 spin_unlock(&above->cst.spin);
680                 above = NULL;   /* safety */
681         }
682
683         /*
684          * Successful 1->0 transition and the chain can be destroyed now.
685          *
686          * We still have the core spinlock (if core is non-NULL), and core's
687          * chain_count is 0.  The above spinlock is gone.
688          *
689          * Remove chain from ownerq.  Once core has no more owners (and no
690          * children which is already the case) we can destroy core.
691          *
692          * If core has more owners we may be able to continue a bottom-up
693          * drop with our next sibling.
694          */
695         if (core) {
696                 chain->core = NULL;
697
698                 TAILQ_REMOVE(&core->ownerq, chain, core_entry);
699                 rdrop2 = TAILQ_FIRST(&core->ownerq);
700                 if (rdrop2 && atomic_cmpset_int(&rdrop2->refs, 0, 1) == 0)
701                         rdrop2 = NULL;
702                 spin_unlock(&core->cst.spin);
703
704                 /*
705                  * We can do the final 1->0 transition with an atomic op
706                  * after releasing core's spinlock.
707                  */
708                 if (atomic_fetchadd_int(&core->sharecnt, -1) == 1) {
709                         /*
710                          * On the 1->0 transition of core we can destroy
711                          * it.  Any remaining layers should no longer be
712                          * referenced or visibile to other threads.
713                          */
714                         KKASSERT(TAILQ_EMPTY(&core->ownerq));
715                         if (layer) {
716                                 layer->good = 0xEF00;
717                                 kfree(layer, hmp->mchain);
718                         }
719                         while ((layer = TAILQ_FIRST(&core->layerq)) != NULL) {
720                                 KKASSERT(layer->refs == 0 &&
721                                          RB_EMPTY(&layer->rbtree));
722                                 TAILQ_REMOVE(&core->layerq, layer, entry);
723                                 layer->good = 0xEF01;
724                                 kfree(layer, hmp->mchain);
725                         }
726                         /* layer now NULL */
727                         KKASSERT(core->cst.count == 0);
728                         KKASSERT(core->cst.upgrade == 0);
729                         core->good = 0x5678;
730                         kfree(core, hmp->mchain);
731                 }
732                 core = NULL;    /* safety */
733         }
734
735         /*
736          * All spin locks are gone, finish freeing stuff.
737          */
738         KKASSERT((chain->flags & (HAMMER2_CHAIN_MOVED |
739                                   HAMMER2_CHAIN_MODIFIED)) == 0);
740         hammer2_chain_drop_data(chain, 1);
741
742         KKASSERT(chain->dio == NULL);
743
744         /*
745          * Free saved empty layer and return chained drop.
746          */
747         if (layer) {
748                 layer->good = 0xEF02;
749                 kfree(layer, hmp->mchain);
750         }
751
752         /*
753          * Once chain resources are gone we can use the now dead chain
754          * structure to placehold what might otherwise require a recursive
755          * drop, because we have potentially two things to drop and can only
756          * return one directly.
757          */
758         if (rdrop1 && rdrop2) {
759                 KKASSERT(chain->flags & HAMMER2_CHAIN_ALLOCATED);
760                 chain->data = (void *)rdrop1;
761                 TAILQ_INSERT_TAIL(delayq, chain, core_entry);
762                 rdrop1 = NULL;
763         } else if (chain->flags & HAMMER2_CHAIN_ALLOCATED) {
764                 chain->flags &= ~HAMMER2_CHAIN_ALLOCATED;
765                 chain->hmp = NULL;
766                 kfree(chain, hmp->mchain);
767         }
768
769         /*
770          * Either or both can be NULL.  We already handled the case where
771          * both might not have been NULL.
772          */
773         if (rdrop1)
774                 return(rdrop1);
775         else
776                 return(rdrop2);
777 }
778
779 /*
780  * On either last lock release or last drop
781  */
782 static void
783 hammer2_chain_drop_data(hammer2_chain_t *chain, int lastdrop)
784 {
785         hammer2_mount_t *hmp = chain->hmp;
786
787         switch(chain->bref.type) {
788         case HAMMER2_BREF_TYPE_VOLUME:
789         case HAMMER2_BREF_TYPE_FREEMAP:
790                 if (lastdrop)
791                         chain->data = NULL;
792                 break;
793         case HAMMER2_BREF_TYPE_INODE:
794                 if (chain->data) {
795                         kfree(chain->data, hmp->mchain);
796                         chain->data = NULL;
797                 }
798                 break;
799         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
800                 if (chain->data) {
801                         kfree(chain->data, hmp->mchain);
802                         chain->data = NULL;
803                 }
804                 break;
805         default:
806                 KKASSERT(chain->data == NULL);
807                 break;
808         }
809 }
810
811 /*
812  * Ref and lock a chain element, acquiring its data with I/O if necessary,
813  * and specify how you would like the data to be resolved.
814  *
815  * Returns 0 on success or an error code if the data could not be acquired.
816  * The chain element is locked on return regardless of whether an error
817  * occurred or not.
818  *
819  * The lock is allowed to recurse, multiple locking ops will aggregate
820  * the requested resolve types.  Once data is assigned it will not be
821  * removed until the last unlock.
822  *
823  * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
824  *                         (typically used to avoid device/logical buffer
825  *                          aliasing for data)
826  *
827  * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
828  *                         the INITIAL-create state (indirect blocks only).
829  *
830  *                         Do not resolve data elements for DATA chains.
831  *                         (typically used to avoid device/logical buffer
832  *                          aliasing for data)
833  *
834  * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
835  *
836  * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
837  *                         it will be locked exclusive.
838  *
839  * NOTE: Embedded elements (volume header, inodes) are always resolved
840  *       regardless.
841  *
842  * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
843  *       element will instantiate and zero its buffer, and flush it on
844  *       release.
845  *
846  * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
847  *       so as not to instantiate a device buffer, which could alias against
848  *       a logical file buffer.  However, if ALWAYS is specified the
849  *       device buffer will be instantiated anyway.
850  *
851  * WARNING! If data must be fetched a shared lock will temporarily be
852  *          upgraded to exclusive.  However, a deadlock can occur if
853  *          the caller owns more than one shared lock.
854  */
int
hammer2_chain_lock(hammer2_chain_t *chain, int how)
{
	hammer2_mount_t *hmp;
	hammer2_chain_core_t *core;
	hammer2_blockref_t *bref;
	ccms_state_t ostate;
	char *bdata;
	int error;

	/*
	 * Ref and lock the element.  Recursive locks are allowed.
	 *
	 * lockcnt is the per-chain lock recursion count; the core->cst
	 * lock can be shared by several chains so it cannot track this
	 * by itself.  NOREF callers already hold a ref.
	 */
	if ((how & HAMMER2_RESOLVE_NOREF) == 0)
		hammer2_chain_ref(chain);
	atomic_add_int(&chain->lockcnt, 1);

	hmp = chain->hmp;
	KKASSERT(hmp != NULL);

	/*
	 * Get the appropriate lock.
	 */
	core = chain->core;
	if (how & HAMMER2_RESOLVE_SHARED)
		ccms_thread_lock(&core->cst, CCMS_STATE_SHARED);
	else
		ccms_thread_lock(&core->cst, CCMS_STATE_EXCLUSIVE);

	/*
	 * If we already have a valid data pointer no further action is
	 * necessary.
	 */
	if (chain->data)
		return (0);

	/*
	 * Do we have to resolve the data?  All the non-resolving cases
	 * return with the chain locked but the data unresolved.
	 */
	switch(how & HAMMER2_RESOLVE_MASK) {
	case HAMMER2_RESOLVE_NEVER:
		return(0);
	case HAMMER2_RESOLVE_MAYBE:
		if (chain->flags & HAMMER2_CHAIN_INITIAL)
			return(0);
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
			return(0);
#if 0
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
			return(0);
#endif
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
			return(0);
		/* fall through */
	case HAMMER2_RESOLVE_ALWAYS:
		break;
	}

	/*
	 * Upgrade to an exclusive lock so we can safely manipulate the
	 * buffer cache.  If another thread got to it before us we
	 * can just return.
	 *
	 * WARNING (from the function header): this temporary upgrade can
	 * deadlock if the caller holds more than one shared lock.
	 */
	ostate = ccms_thread_lock_upgrade(&core->cst);
	if (chain->data) {
		ccms_thread_lock_downgrade(&core->cst, ostate);
		return (0);
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;

	/*
	 * The getblk() optimization can only be used on newly created
	 * elements if the physical block size matches the request.
	 *
	 * NOTE(review): unlike hammer2_chain_load_async() there is no
	 *		 explicit devblksize match check here; presumably
	 *		 hammer2_io_new() handles a sub-block zero-fill
	 *		 internally — confirm against hammer2_io.c.
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		error = hammer2_io_new(hmp, bref->data_off, chain->bytes,
					&chain->dio);
	} else {
		error = hammer2_io_bread(hmp, bref->data_off, chain->bytes,
					 &chain->dio);
		adjreadcounter(&chain->bref, chain->bytes);
	}

	/*
	 * On I/O error release the (possibly partially instantiated) dio
	 * and return with the chain still locked, per the API contract.
	 */
	if (error) {
		kprintf("hammer2_chain_lock: I/O error %016jx: %d\n",
			(intmax_t)bref->data_off, error);
		hammer2_io_bqrelse(&chain->dio);
		ccms_thread_lock_downgrade(&core->cst, ostate);
		return (error);
	}

	/*
	 * We can clear the INITIAL state now, we've resolved the buffer
	 * to zeros and marked it dirty with hammer2_io_new().
	 */
	bdata = hammer2_io_data(chain->dio, chain->bref.data_off);
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
	}

	/*
	 * Setup the data pointer, either pointing it to an embedded data
	 * structure and copying the data from the buffer, or pointing it
	 * into the buffer.
	 *
	 * The buffer is not retained when copying to an embedded data
	 * structure in order to avoid potential deadlocks or recursions
	 * on the same physical buffer.
	 */
	switch (bref->type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * The volume header is expected to be resolved elsewhere;
		 * reaching here unresolved is a logic error.
		 */
		panic("hammer2_chain_lock: called on unresolved volume header");
		break;
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Copy data from dio to embedded buffer, do not retain the
		 * device buffer.
		 */
		KKASSERT(chain->bytes == sizeof(chain->data->ipdata));
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_EMBEDDED);
		chain->data = kmalloc(sizeof(chain->data->ipdata),
				      hmp->mchain, M_WAITOK | M_ZERO);
		bcopy(bdata, &chain->data->ipdata, chain->bytes);
		hammer2_io_bqrelse(&chain->dio);
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		/*
		 * Same treatment as inodes: copy into an embedded
		 * structure and release the device buffer.
		 */
		KKASSERT(chain->bytes == sizeof(chain->data->bmdata));
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_EMBEDDED);
		chain->data = kmalloc(sizeof(chain->data->bmdata),
				      hmp->mchain, M_WAITOK | M_ZERO);
		bcopy(bdata, &chain->data->bmdata, chain->bytes);
		hammer2_io_bqrelse(&chain->dio);
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	default:
		/*
		 * Point data at the device buffer and leave bp intact.
		 */
		chain->data = (void *)bdata;
		break;
	}
	ccms_thread_lock_downgrade(&core->cst, ostate);
	return (0);
}
1019
1020 /*
1021  * This basically calls hammer2_io_breadcb() but does some pre-processing
1022  * of the chain first to handle certain cases.
1023  */
void
hammer2_chain_load_async(hammer2_chain_t *chain,
			 void (*callback)(hammer2_io_t *dio,
					  hammer2_chain_t *chain,
					  void *arg_p, off_t arg_o),
			 void *arg_p, off_t arg_o)
{
	hammer2_mount_t *hmp;
	struct hammer2_io *dio;
	hammer2_blockref_t *bref;
	int error;

	/*
	 * Data already resolved: invoke the callback synchronously with
	 * a NULL dio.
	 */
	if (chain->data) {
		callback(NULL, chain, arg_p, arg_o);
		return;
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;
	hmp = chain->hmp;

	/*
	 * The getblk() optimization can only be used on newly created
	 * elements if the physical block size matches the request.
	 * The zero-fill buffer is created synchronously here and handed
	 * to the callback without issuing any read I/O.
	 *
	 * NOTE(review): the dio is passed to the callback, which
	 *		 presumably owns its release — confirm against
	 *		 the callback contract.
	 */
	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
	    chain->bytes == hammer2_devblksize(chain->bytes)) {
		error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
		KKASSERT(error == 0);
		callback(dio, chain, arg_p, arg_o);
		return;
	}

	/*
	 * Otherwise issue an asynchronous read; the callback runs on
	 * completion.
	 */
	adjreadcounter(&chain->bref, chain->bytes);
	hammer2_io_breadcb(hmp, bref->data_off, chain->bytes,
			   callback, chain, arg_p, arg_o);
}
1076
1077 /*
1078  * Unlock and deref a chain element.
1079  *
1080  * On the last lock release any non-embedded data (chain->dio) will be
1081  * retired.
1082  */
void
hammer2_chain_unlock(hammer2_chain_t *chain)
{
	hammer2_chain_core_t *core = chain->core;
	ccms_state_t ostate;
	long *counterp;
	u_int lockcnt;

	/*
	 * The core->cst lock can be shared across several chains so we
	 * need to track the per-chain lockcnt separately.
	 *
	 * If multiple locks are present (or being attempted) on this
	 * particular chain we can just unlock, drop refs, and return.
	 *
	 * Otherwise fall-through on the 1->0 transition.
	 */
	for (;;) {
		lockcnt = chain->lockcnt;
		KKASSERT(lockcnt > 0);
		cpu_ccfence();
		if (lockcnt > 1) {
			if (atomic_cmpset_int(&chain->lockcnt,
					      lockcnt, lockcnt - 1)) {
				ccms_thread_unlock(&core->cst);
				hammer2_chain_drop(chain);
				return;
			}
		} else {
			if (atomic_cmpset_int(&chain->lockcnt, 1, 0))
				break;
		}
		/* retry, the cmpset raced another thread */
	}

	/*
	 * On the 1->0 transition we upgrade the core lock (if necessary)
	 * to exclusive for terminal processing.  If after upgrading we find
	 * that lockcnt is non-zero, another thread is racing us and will
	 * handle the unload for us later on, so just cleanup and return
	 * leaving the data/io intact.
	 *
	 * Otherwise if lockcnt is still 0 it is possible for it to become
	 * non-zero and race, but since we hold the core->cst lock
	 * exclusively all that will happen is that the chain will be
	 * reloaded after we unload it.
	 */
	ostate = ccms_thread_lock_upgrade(&core->cst);
	if (chain->lockcnt) {
		ccms_thread_unlock_upgraded(&core->cst, ostate);
		hammer2_chain_drop(chain);
		return;
	}

	/*
	 * Shortcut the case if the data is embedded or not resolved.
	 *
	 * Do NOT NULL out chain->data (e.g. inode data), it might be
	 * dirty.
	 */
	if (chain->dio == NULL) {
		if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
			hammer2_chain_drop_data(chain, 0);
		ccms_thread_unlock_upgraded(&core->cst, ostate);
		hammer2_chain_drop(chain);
		return;
	}

	/*
	 * Statistics.  Dirty dios are accumulated by bref type into the
	 * hammer2_ioa_* counters when IOFLUSH is set and into the
	 * hammer2_iod_* counters otherwise.
	 */
	if (hammer2_io_isdirty(chain->dio) == 0) {
		/* not dirty, nothing to account */
		;
	} else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_DATA:
			counterp = &hammer2_ioa_file_write;
			break;
		case HAMMER2_BREF_TYPE_INODE:
			counterp = &hammer2_ioa_meta_write;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			counterp = &hammer2_ioa_indr_write;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
			counterp = &hammer2_ioa_fmap_write;
			break;
		default:
			counterp = &hammer2_ioa_volu_write;
			break;
		}
		*counterp += chain->bytes;
	} else {
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_DATA:
			counterp = &hammer2_iod_file_write;
			break;
		case HAMMER2_BREF_TYPE_INODE:
			counterp = &hammer2_iod_meta_write;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			counterp = &hammer2_iod_indr_write;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
			counterp = &hammer2_iod_fmap_write;
			break;
		default:
			counterp = &hammer2_iod_volu_write;
			break;
		}
		*counterp += chain->bytes;
	}

	/*
	 * Clean out the dio.
	 *
	 * If a device buffer was used for data be sure to destroy the
	 * buffer when we are done to avoid aliases (XXX what about the
	 * underlying VM pages?).
	 *
	 * NOTE: Freemap leaf's use reserved blocks and thus no aliasing
	 *       is possible.
	 *
	 * NOTE: The isdirty check tracks whether we have to bdwrite() the
	 *       buffer or not.  The buffer might already be dirty.  The
	 *       flag is re-set when chain_modify() is called, even if
	 *       MODIFIED is already set, allowing the OS to retire the
	 *       buffer independent of a hammer2 flush.
	 */
	chain->data = NULL;
	if ((chain->flags & HAMMER2_CHAIN_IOFLUSH) &&
	    hammer2_io_isdirty(chain->dio)) {
		hammer2_io_bawrite(&chain->dio);
	} else {
		hammer2_io_bqrelse(&chain->dio);
	}
	ccms_thread_unlock_upgraded(&core->cst, ostate);
	hammer2_chain_drop(chain);
}
1224
1225 /*
1226  * This counts the number of live blockrefs in a block array and
1227  * also calculates the point at which all remaining blockrefs are empty.
1228  * This routine can only be called on a live chain (DUPLICATED flag not set).
1229  *
1230  * NOTE: Flag is not set until after the count is complete, allowing
1231  *       callers to test the flag without holding the spinlock.
1232  *
1233  * NOTE: If base is NULL the related chain is still in the INITIAL
1234  *       state and there are no blockrefs to count.
1235  *
1236  * NOTE: live_count may already have some counts accumulated due to
1237  *       creation and deletion and could even be initially negative.
1238  */
1239 void
1240 hammer2_chain_countbrefs(hammer2_chain_t *chain,
1241                          hammer2_blockref_t *base, int count)
1242 {
1243         hammer2_chain_core_t *core = chain->core;
1244
1245         KKASSERT((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0);
1246
1247         spin_lock(&core->cst.spin);
1248         if ((core->flags & HAMMER2_CORE_COUNTEDBREFS) == 0) {
1249                 if (base) {
1250                         while (--count >= 0) {
1251                                 if (base[count].type)
1252                                         break;
1253                         }
1254                         core->live_zero = count + 1;
1255                         while (count >= 0) {
1256                                 if (base[count].type)
1257                                         atomic_add_int(&core->live_count, 1);
1258                                 --count;
1259                         }
1260                 } else {
1261                         core->live_zero = 0;
1262                 }
1263                 /* else do not modify live_count */
1264                 atomic_set_int(&core->flags, HAMMER2_CORE_COUNTEDBREFS);
1265         }
1266         spin_unlock(&core->cst.spin);
1267 }
1268
1269 /*
1270  * Resize the chain's physical storage allocation in-place.  This may
1271  * replace the passed-in chain with a new chain.
1272  *
1273  * Chains can be resized smaller without reallocating the storage.
1274  * Resizing larger will reallocate the storage.
1275  *
1276  * Must be passed an exclusively locked parent and chain, returns a new
1277  * exclusively locked chain at the same index and unlocks the old chain.
1278  * Flushes the buffer if necessary.
1279  *
1280  * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
1281  * to avoid instantiating a device buffer that conflicts with the vnode
1282  * data buffer.  That is, the passed-in bp is a logical buffer, whereas
1283  * any chain-oriented bp would be a device buffer.
1284  *
1285  * XXX return error if cannot resize.
1286  */
void
hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
		     hammer2_chain_t *parent, hammer2_chain_t **chainp,
		     int nradix, int flags)
{
	hammer2_mount_t *hmp;
	hammer2_chain_t *chain;
	size_t obytes;
	size_t nbytes;

	/*
	 * nradix is the log2 of the new size (nbytes = 1 << nradix).
	 *
	 * NOTE(review): ip and flags are not referenced by the live code
	 *		 below (ip only appears in commented-out code);
	 *		 apparently retained for API compatibility.
	 */
	chain = *chainp;
	hmp = chain->hmp;

	/*
	 * Only data and indirect blocks can be resized for now.
	 * (The volu root, inodes, and freemap elements use a fixed size).
	 */
	KKASSERT(chain != &hmp->vchain);
	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);

	/*
	 * Nothing to do if the element is already the proper size
	 */
	obytes = chain->bytes;
	nbytes = 1U << nradix;
	if (obytes == nbytes)
		return;

	/*
	 * Delete the old chain and duplicate it at the same (parent, index),
	 * returning a new chain.  This allows the old chain to still be
	 * used by the flush code.  The new chain will be returned in a
	 * modified state.
	 *
	 * The parent does not have to be locked for the delete/duplicate call,
	 * but is in this particular code path.
	 *
	 * NOTE: If we are not crossing a synchronization point the
	 *       duplication code will simply reuse the existing chain
	 *       structure.
	 */
	hammer2_chain_delete_duplicate(trans, &chain, 0);

	/*
	 * Relocate the block, even if making it smaller (because different
	 * block sizes may be in different regions).
	 */
	hammer2_freemap_alloc(trans, chain, nbytes);
	chain->bytes = nbytes;
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_FORCECOW);
	/*ip->delta_dcount += (ssize_t)(nbytes - obytes);*/ /* XXX atomic */

	/*
	 * For now just support it on DATA chains (and not on indirect
	 * blocks).
	 */
	KKASSERT(chain->dio == NULL);

#if 0
	/*
	 * Make sure the chain is marked MOVED and propagate the update
	 * to the root for flush.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		hammer2_chain_ref(chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	}
	hammer2_chain_setsubmod(trans, chain);
#endif
	/* return the (possibly replaced) chain to the caller */
	*chainp = chain;
}
1359
1360 /*
1361  * Set a chain modified, making it read-write and duplicating it if necessary.
1362  * This function will assign a new physical block to the chain if necessary
1363  *
1364  * Duplication of already-modified chains is possible when the modification
1365  * crosses a flush synchronization boundary.
1366  *
1367  * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
1368  *                   level or the COW operation will not work.
1369  *
1370  * Data blocks     - The chain is usually locked RESOLVE_NEVER so as not to
1371  *                   run the data through the device buffers.
1372  *
1373  * This function may return a different chain than was passed, in which case
1374  * the old chain will be unlocked and the new chain will be locked.
1375  *
1376  * ip->chain may be adjusted by hammer2_chain_modify_ip().
1377  */
1378 hammer2_inode_data_t *
1379 hammer2_chain_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
1380                         hammer2_chain_t **chainp, int flags)
1381 {
1382         atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
1383         hammer2_chain_modify(trans, chainp, flags);
1384         if (ip->chain != *chainp)
1385                 hammer2_inode_repoint(ip, NULL, *chainp);
1386         if (ip->vp)
1387                 vsetisdirty(ip->vp);
1388         return(&ip->chain->data->ipdata);
1389 }
1390
1391 void
1392 hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
1393                      int flags)
1394 {
1395         hammer2_mount_t *hmp;
1396         hammer2_chain_t *chain;
1397         hammer2_io_t *dio;
1398         int error;
1399         int wasinitial;
1400         char *bdata;
1401
1402         chain = *chainp;
1403         hmp = chain->hmp;
1404
1405         KKASSERT(chain->bref.mirror_tid != trans->sync_tid ||
1406                  (chain->flags & HAMMER2_CHAIN_MODIFIED));
1407 #if 0
1408         if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP ||
1409             chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
1410             chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
1411                 kprintf("trans %04jx/%08x MODIFY1 %p.%d [%08x] %016jx/%d %016jx C/D %016jx/%016jx\n",
1412                         trans->sync_tid, trans->flags,
1413                         chain, chain->bref.type, chain->flags,
1414                         chain->bref.key, chain->bref.keybits,
1415                         chain->bref.data_off,
1416                         chain->modify_tid, chain->delete_tid);
1417         }
1418 #endif
1419 #if 0
1420         kprintf("MODIFY %p.%d flags %08x mod=%016jx del=%016jx\n", chain, chain->bref.type, chain->flags, chain->modify_tid, chain->delete_tid);
1421 #endif
1422         /*
1423          * Data must be resolved if already assigned unless explicitly
1424          * flagged otherwise.
1425          */
1426         if (chain->data == NULL && (flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
1427             (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
1428                 hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
1429                 hammer2_chain_unlock(chain);
1430         }
1431
1432         /*
1433          * data is not optional for freemap chains (we must always be sure
1434          * to copy the data on COW storage allocations).
1435          */
1436         if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
1437             chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
1438                 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) ||
1439                          (flags & HAMMER2_MODIFY_OPTDATA) == 0);
1440         }
1441
1442         /*
1443          * Determine if a delete-duplicate is needed.
1444          *
1445          * (a) Modify_tid is part of a prior flush
1446          * (b) Transaction is concurrent with a flush (has higher tid)
1447          * (c) and chain is not in the initial state (freshly created)
1448          * (d) and caller didn't request an in-place modification.
1449          *
1450          * The freemap and volume header special chains are never D-Dd.
1451          */
1452         if (chain->modify_tid != trans->sync_tid &&        /* cross boundary */
1453             (flags & HAMMER2_MODIFY_INPLACE) == 0) {       /* from d-d */
1454                 if (chain != &hmp->fchain && chain != &hmp->vchain) {
1455                         KKASSERT((flags & HAMMER2_MODIFY_ASSERTNOCOPY) == 0);
1456                         hammer2_chain_delete_duplicate(trans, chainp, 0);
1457                         chain = *chainp;
1458 #if 0
1459         if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP ||
1460             chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
1461             chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
1462                 kprintf("trans %04jx/%08x MODIFY2 %p.%d [%08x] %016jx/%d %016jx\n",
1463                         trans->sync_tid, trans->flags,
1464                         chain, chain->bref.type, chain->flags,
1465                         chain->bref.key, chain->bref.keybits,
1466                         chain->bref.data_off);
1467                         return;
1468                 }
1469 #endif
1470                 }
1471
1472                 /*
1473                  * Fall through if fchain or vchain, clearing the CHAIN_FLUSHED
1474                  * flag.  Basically other chains are delete-duplicated and so
1475                  * the duplicated chains of course will not have the FLUSHED
1476                  * flag set, but fchain and vchain are special-cased and the
1477                  * flag must be cleared when changing modify_tid.
1478                  */
1479                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_FLUSHED);
1480         }
1481
1482         /*
1483          * Otherwise do initial-chain handling
1484          */
1485         if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
1486                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
1487                 hammer2_chain_ref(chain);
1488                 hammer2_chain_memory_inc(chain->pmp);
1489         }
1490 #if 0
1491         /* shouldn't be needed */
1492         if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
1493                 hammer2_chain_ref(chain);
1494                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1495         }
1496 #endif
1497
1498         /*
1499          * The modification or re-modification requires an allocation and
1500          * possible COW.
1501          *
1502          * We normally always allocate new storage here.  If storage exists
1503          * and MODIFY_NOREALLOC is passed in, we do not allocate new storage.
1504          */
1505         if (chain != &hmp->vchain && chain != &hmp->fchain) {
1506                 if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
1507                      ((flags & HAMMER2_MODIFY_NOREALLOC) == 0 &&
1508                       chain->modify_tid != trans->sync_tid)
1509                 ) {
1510                         hammer2_freemap_alloc(trans, chain, chain->bytes);
1511                         /* XXX failed allocation */
1512                 } else if (chain->flags & HAMMER2_CHAIN_FORCECOW) {
1513                         hammer2_freemap_alloc(trans, chain, chain->bytes);
1514                         /* XXX failed allocation */
1515                 }
1516                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_FORCECOW);
1517         }
1518
1519         /*
1520          * Update modify_tid.  XXX special-case vchain/fchain because they
1521          * are always modified in-place.  Otherwise the chain being modified
1522          * must not be part of a future transaction.
1523          */
1524         if (chain == &hmp->vchain || chain == &hmp->fchain) {
1525                 if (chain->modify_tid <= trans->sync_tid)
1526                         chain->modify_tid = trans->sync_tid;
1527         } else {
1528                 KKASSERT(chain->modify_tid <= trans->sync_tid);
1529                 chain->modify_tid = trans->sync_tid;
1530         }
1531
1532         if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
1533                 chain->bref.modify_tid = trans->sync_tid;
1534
1535         /*
1536          * Do not COW if OPTDATA is set.  INITIAL flag remains unchanged.
1537          * (OPTDATA does not prevent [re]allocation of storage, only the
1538          * related copy-on-write op).
1539          */
1540         if (flags & HAMMER2_MODIFY_OPTDATA)
1541                 goto skip2;
1542
1543         /*
1544          * Clearing the INITIAL flag (for indirect blocks) indicates that
1545          * we've processed the uninitialized storage allocation.
1546          *
1547          * If this flag is already clear we are likely in a copy-on-write
1548          * situation but we have to be sure NOT to bzero the storage if
1549          * no data is present.
1550          */
1551         if (chain->flags & HAMMER2_CHAIN_INITIAL) {
1552                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1553                 wasinitial = 1;
1554         } else {
1555                 wasinitial = 0;
1556         }
1557
1558         /*
1559          * Instantiate data buffer and possibly execute COW operation
1560          */
1561         switch(chain->bref.type) {
1562         case HAMMER2_BREF_TYPE_VOLUME:
1563         case HAMMER2_BREF_TYPE_FREEMAP:
1564         case HAMMER2_BREF_TYPE_INODE:
1565         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1566                 /*
1567                  * The data is embedded, no copy-on-write operation is
1568                  * needed.
1569                  */
1570                 KKASSERT(chain->dio == NULL);
1571                 break;
1572         case HAMMER2_BREF_TYPE_DATA:
1573         case HAMMER2_BREF_TYPE_INDIRECT:
1574         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1575                 /*
1576                  * Perform the copy-on-write operation
1577                  */
1578                 KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);
1579
1580                 if (wasinitial) {
1581                         error = hammer2_io_new(hmp, chain->bref.data_off,
1582                                                chain->bytes, &dio);
1583                 } else {
1584                         error = hammer2_io_bread(hmp, chain->bref.data_off,
1585                                                  chain->bytes, &dio);
1586                 }
1587                 adjreadcounter(&chain->bref, chain->bytes);
1588                 KKASSERT(error == 0);
1589
1590                 bdata = hammer2_io_data(dio, chain->bref.data_off);
1591
1592                 /*
1593                  * Copy or zero-fill on write depending on whether
1594                  * chain->data exists or not and set the dirty state for
1595                  * the new buffer.  Retire the existing buffer.
1596                  */
1597                 if (chain->data) {
1598                         KKASSERT(chain->dio != NULL);
1599                         if (chain->data != (void *)bdata) {
1600                                 bcopy(chain->data, bdata, chain->bytes);
1601                         }
1602                 } else if (wasinitial == 0) {
1603                         /*
1604                          * We have a problem.  We were asked to COW but
1605                          * we don't have any data to COW with!
1606                          */
1607                         panic("hammer2_chain_modify: having a COW %p\n",
1608                               chain);
1609                 }
1610                 hammer2_io_brelse(&chain->dio);
1611                 chain->data = (void *)bdata;
1612                 chain->dio = dio;
1613                 hammer2_io_setdirty(dio);       /* modified by bcopy above */
1614                 break;
1615         default:
1616                 panic("hammer2_chain_modify: illegal non-embedded type %d",
1617                       chain->bref.type);
1618                 break;
1619
1620         }
1621 skip2:
1622 #if 0
1623         kprintf("RET2 %p.%d flags %08x mod=%016jx del=%016jx\n", chain, chain->bref.type, chain->flags, chain->modify_tid, chain->delete_tid);
1624 #endif
1625         hammer2_chain_setsubmod(trans, chain);
1626 }
1627
1628 /*
1629  * Mark the volume as having been modified.  This short-cut version
1630  * does not have to lock the volume's chain, which allows the ioctl
1631  * code to make adjustments to connections without deadlocking.  XXX
1632  *
1633  * No ref is made on vchain when flagging it MODIFIED.
1634  */
void
hammer2_modify_volume(hammer2_mount_t *hmp)
{
	hammer2_voldata_lock(hmp);
	/*
	 * Unlock with modified=1; per the comment above this flags the
	 * volume header as modified without locking vchain or making
	 * a ref on it.
	 */
	hammer2_voldata_unlock(hmp, 1);
}
1641
1642 /*
1643  * This function returns the chain at the nearest key within the specified
1644  * range with the highest delete_tid.  The core spinlock must be held on
1645  * call and the returned chain will be referenced but not locked.
1646  *
1647  * The returned chain may or may not be in a deleted state.  Note that
1648  * live chains have a delete_tid = MAX_TID.
1649  *
1650  * This function will recurse through chain->rbtree as necessary and will
1651  * return a *key_nextp suitable for iteration.  *key_nextp is only set if
1652  * the iteration value is less than the current value of *key_nextp.
1653  *
1654  * The caller should use (*key_nextp) to calculate the actual range of
1655  * the returned element, which will be (key_beg to *key_nextp - 1), because
1656  * there might be another element which is superior to the returned element
1657  * and overlaps it.
1658  *
1659  * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL
1660  * chains continue to be returned.  On EOF (*key_nextp) may overflow since
1661  * it will wind up being (key_end + 1).
1662  */
/*
 * Shared scan state for the hammer2_chain_find*() RB_SCAN passes.
 */
struct hammer2_chain_find_info {
	hammer2_chain_t		*best;		/* best candidate found so far */
	hammer2_key_t		key_beg;	/* inclusive start of search range */
	hammer2_key_t		key_end;	/* inclusive end of search range */
	hammer2_key_t		key_next;	/* iteration key, only ever reduced */
};
1669
1670 static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
1671 static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);
1672
1673 /*
1674  * DEBUGGING - Assert that the chain will not collide.
1675  */
1676 static
1677 void
1678 hammer2_chain_assert_not_present(hammer2_chain_core_t *core,
1679                                  hammer2_chain_t *chain)
1680 {
1681         struct hammer2_chain_find_info info;
1682         hammer2_chain_layer_t *layer;
1683
1684         if (chain->flags & HAMMER2_CHAIN_DELETED)
1685                 return;
1686
1687         info.best = NULL;
1688         info.key_beg = chain->bref.key;
1689         info.key_end = chain->bref.key +
1690                        ((hammer2_key_t)1 << chain->bref.keybits) - 1;
1691         info.key_next = HAMMER2_MAX_KEY;
1692
1693         TAILQ_FOREACH(layer, &core->layerq, entry) {
1694                 KKASSERT(layer->good == 0xABCD);
1695                 RB_SCAN(hammer2_chain_tree, &layer->rbtree,
1696                         hammer2_chain_find_cmp, hammer2_chain_find_callback,
1697                         &info);
1698         }
1699         if (info.best && (info.best->flags & HAMMER2_CHAIN_DELETED) == 0)
1700                 panic("hammer2_chain_assert_not_present: %p/%p\n",
1701                         chain, info.best);
1702 }
1703
1704 static
1705 hammer2_chain_t *
1706 hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
1707                           hammer2_key_t key_beg, hammer2_key_t key_end)
1708 {
1709         struct hammer2_chain_find_info info;
1710         hammer2_chain_layer_t *layer;
1711
1712         info.best = NULL;
1713         info.key_beg = key_beg;
1714         info.key_end = key_end;
1715         info.key_next = *key_nextp;
1716
1717         KKASSERT(parent->core->good == 0x1234);
1718         TAILQ_FOREACH(layer, &parent->core->layerq, entry) {
1719                 KKASSERT(layer->good == 0xABCD);
1720                 RB_SCAN(hammer2_chain_tree, &layer->rbtree,
1721                         hammer2_chain_find_cmp, hammer2_chain_find_callback,
1722                         &info);
1723         }
1724         *key_nextp = info.key_next;
1725 #if 0
1726         kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
1727                 parent, key_beg, key_end, *key_nextp);
1728 #endif
1729
1730         return (info.best);
1731 }
1732
1733 static
1734 int
1735 hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
1736 {
1737         struct hammer2_chain_find_info *info = data;
1738         hammer2_key_t child_beg;
1739         hammer2_key_t child_end;
1740
1741         child_beg = child->bref.key;
1742         child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1;
1743
1744         if (child_end < info->key_beg)
1745                 return(-1);
1746         if (child_beg > info->key_end)
1747                 return(1);
1748         return(0);
1749 }
1750
/*
 * RB_SCAN callback for the hammer2_chain_find*() functions.  Reduces
 * the set of overlapping children to a single best candidate: the
 * nearest key to key_beg, with ties broken by the highest delete_tid.
 * info->key_next is clipped as we go so the caller's effective range
 * for the returned element is (key_beg, *key_nextp - 1).
 */
static
int
hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_chain_t *best;
	hammer2_key_t child_end;

	/*
	 * WARNING! Do not discard DUPLICATED chains, it is possible that
	 *          we are catching an insertion half-way done.  If a
	 *          duplicated chain turns out to be the best choice the
	 *          caller will re-check its flags after locking it.
	 *
	 * WARNING! Layerq is scanned forwards, exact matches should keep
	 *          the existing info->best.
	 */
	if ((best = info->best) == NULL) {
		/*
		 * No previous best.  Assign best
		 */
		info->best = child;
	} else if (best->bref.key <= info->key_beg &&
		   child->bref.key <= info->key_beg) {
		/*
		 * If our current best is flush with key_beg and child is
		 * also flush with key_beg choose based on delete_tid.
		 *
		 * key_next will automatically be limited to the smaller of
		 * the two end-points.
		 */
		if (child->delete_tid > best->delete_tid)
			info->best = child;
	} else if (child->bref.key < best->bref.key) {
		/*
		 * Child has a nearer key and best is not flush with key_beg.
		 * Truncate key_next to the old best key iff it had a better
		 * delete_tid.
		 */
		info->best = child;
		if (best->delete_tid >= child->delete_tid &&
		    (info->key_next > best->bref.key || info->key_next == 0))
			info->key_next = best->bref.key;
	} else if (child->bref.key == best->bref.key) {
		/*
		 * If our current best is flush with the child then choose
		 * based on delete_tid.
		 *
		 * key_next will automatically be limited to the smaller of
		 * the two end-points.
		 */
		if (child->delete_tid > best->delete_tid)
			info->best = child;
	} else {
		/*
		 * Keep the current best but truncate key_next to the child's
		 * base iff the child has a higher delete_tid.
		 *
		 * key_next will also automatically be limited to the smaller
		 * of the two end-points (probably not necessary for this case
		 * but we do it anyway).
		 */
		if (child->delete_tid >= best->delete_tid &&
		    (info->key_next > child->bref.key || info->key_next == 0))
			info->key_next = child->bref.key;
	}

	/*
	 * Always truncate key_next based on child's end-of-range.
	 *
	 * NOTE: child_end is one-past-the-end (no -1) and may wrap to 0
	 *	 when the child's range reaches the top of the key space;
	 *	 the (child_end != 0) test guards against clipping
	 *	 key_next to 0 in that case.
	 */
	child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits);
	if (child_end && (info->key_next > child_end || info->key_next == 0))
		info->key_next = child_end;

	return(0);
}
1827
1828 /*
1829  * Retrieve the specified chain from a media blockref, creating the
1830  * in-memory chain structure which reflects it.  modify_tid will be
1831  * left 0 which forces any modifications to issue a delete-duplicate.
1832  *
1833  * To handle insertion races pass the INSERT_RACE flag along with the
1834  * generation number of the core.  NULL will be returned if the generation
1835  * number changes before we have a chance to insert the chain.  Insert
1836  * races can occur because the parent might be held shared.
1837  *
1838  * Caller must hold the parent locked shared or exclusive since we may
1839  * need the parent's bref array to find our block.
1840  */
1841 hammer2_chain_t *
1842 hammer2_chain_get(hammer2_chain_t *parent, hammer2_blockref_t *bref,
1843                   int generation)
1844 {
1845         hammer2_mount_t *hmp = parent->hmp;
1846         hammer2_chain_core_t *above = parent->core;
1847         hammer2_chain_t *chain;
1848         int error;
1849
1850         /*
1851          * Allocate a chain structure representing the existing media
1852          * entry.  Resulting chain has one ref and is not locked.
1853          */
1854         chain = hammer2_chain_alloc(hmp, parent->pmp, NULL, bref);
1855         hammer2_chain_core_alloc(NULL, chain, NULL);
1856         /* ref'd chain returned */
1857         chain->modify_tid = chain->bref.mirror_tid;
1858
1859         /*
1860          * Link the chain into its parent.  A spinlock is required to safely
1861          * access the RBTREE, and it is possible to collide with another
1862          * hammer2_chain_get() operation because the caller might only hold
1863          * a shared lock on the parent.
1864          */
1865         KKASSERT(parent->refs > 0);
1866         error = hammer2_chain_insert(above, NULL, chain,
1867                                      HAMMER2_CHAIN_INSERT_SPIN |
1868                                      HAMMER2_CHAIN_INSERT_RACE,
1869                                      generation);
1870         if (error) {
1871                 KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
1872                 kprintf("chain %p get race\n", chain);
1873                 hammer2_chain_drop(chain);
1874                 chain = NULL;
1875         } else {
1876                 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
1877         }
1878
1879         /*
1880          * Return our new chain referenced but not locked, or NULL if
1881          * a race occurred.
1882          */
1883         return (chain);
1884 }
1885
1886 /*
1887  * Lookup initialization/completion API
1888  */
1889 hammer2_chain_t *
1890 hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
1891 {
1892         if (flags & HAMMER2_LOOKUP_SHARED) {
1893                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
1894                                            HAMMER2_RESOLVE_SHARED);
1895         } else {
1896                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
1897         }
1898         return (parent);
1899 }
1900
1901 void
1902 hammer2_chain_lookup_done(hammer2_chain_t *parent)
1903 {
1904         if (parent)
1905                 hammer2_chain_unlock(parent);
1906 }
1907
/*
 * Replace (*parentp) with its parent chain, locked with 'how', and
 * unlock/replace the old parent.  The first element of the parent
 * core's ownerq is used as the starting point and DUPLICATED entries
 * are skipped; if the selected parent turns out to be DUPLICATED
 * after we manage to lock it (we raced a delete-duplicate) the scan
 * is retried.
 *
 * Returns the new parent, which is also stored in *parentp, locked
 * and referenced.
 */
static
hammer2_chain_t *
hammer2_chain_getparent(hammer2_chain_t **parentp, int how)
{
	hammer2_chain_t *oparent;	/* old parent, to be unlocked */
	hammer2_chain_t *bparent;	/* base of the ownerq scan */
	hammer2_chain_t *nparent;	/* candidate new parent */
	hammer2_chain_core_t *above;

	oparent = *parentp;
	above = oparent->above;

	spin_lock(&above->cst.spin);
	bparent = TAILQ_FIRST(&above->ownerq);
	hammer2_chain_ref(bparent);

	/*
	 * Be careful of order, oparent must be unlocked before nparent
	 * is locked below to avoid a deadlock.  We might as well delay its
	 * unlocking until we conveniently no longer have the spinlock (instead
	 * of cycling the spinlock).
	 *
	 * Theoretically our ref on bparent should prevent elements of the
	 * following chain from going away and prevent above from going away,
	 * but we still need the spinlock to safely scan the list.
	 */
	for (;;) {
		/* skip delete-duplicated entries to find the live one */
		nparent = bparent;
		while (nparent->flags & HAMMER2_CHAIN_DUPLICATED)
			nparent = TAILQ_NEXT(nparent, core_entry);
		hammer2_chain_ref(nparent);
		spin_unlock(&above->cst.spin);

		if (oparent) {
			hammer2_chain_unlock(oparent);
			oparent = NULL;
		}
		/* NOREF: the lock consumes the ref we just made */
		hammer2_chain_lock(nparent, how | HAMMER2_RESOLVE_NOREF);
		hammer2_chain_drop(bparent);

		/*
		 * We might have raced a delete-duplicate.
		 */
		if ((nparent->flags & HAMMER2_CHAIN_DUPLICATED) == 0)
			break;
		/* raced: restart the scan from the chain we just locked */
		bparent = nparent;
		hammer2_chain_ref(bparent);
		hammer2_chain_unlock(nparent);
		spin_lock(&above->cst.spin);
		/* retry */
	}
	*parentp = nparent;

	return (nparent);
}
1963
1964 /*
 * Locate the first chain whose key range overlaps (key_beg, key_end) inclusive.
1966  * (*parentp) typically points to an inode but can also point to a related
1967  * indirect block and this function will recurse upwards and find the inode
1968  * again.
1969  *
1970  * (*parentp) must be exclusively locked and referenced and can be an inode
1971  * or an existing indirect block within the inode.
1972  *
1973  * On return (*parentp) will be modified to point at the deepest parent chain
1974  * element encountered during the search, as a helper for an insertion or
1975  * deletion.   The new (*parentp) will be locked and referenced and the old
1976  * will be unlocked and dereferenced (no change if they are both the same).
1977  *
1978  * The matching chain will be returned exclusively locked.  If NOLOCK is
1979  * requested the chain will be returned only referenced.
1980  *
1981  * NULL is returned if no match was found, but (*parentp) will still
1982  * potentially be adjusted.
1983  *
1984  * On return (*key_nextp) will point to an iterative value for key_beg.
1985  * (If NULL is returned (*key_nextp) is set to key_end).
1986  *
1987  * This function will also recurse up the chain if the key is not within the
1988  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
1989  * can simply allow (*parentp) to float inside the loop.
1990  *
1991  * NOTE!  chain->data is not always resolved.  By default it will not be
1992  *        resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF.  Use
1993  *        HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/
1994  *        BREF_TYPE_DATA as the device buffer can alias the logical file
1995  *        buffer).
1996  */
1997 hammer2_chain_t *
1998 hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp,
1999                      hammer2_key_t key_beg, hammer2_key_t key_end,
2000                      int *cache_indexp, int flags)
2001 {
2002         hammer2_mount_t *hmp;
2003         hammer2_chain_t *parent;
2004         hammer2_chain_t *chain;
2005         hammer2_blockref_t *base;
2006         hammer2_blockref_t *bref;
2007         hammer2_blockref_t bcopy;
2008         hammer2_key_t scan_beg;
2009         hammer2_key_t scan_end;
2010         hammer2_chain_core_t *above;
2011         int count = 0;
2012         int how_always = HAMMER2_RESOLVE_ALWAYS;
2013         int how_maybe = HAMMER2_RESOLVE_MAYBE;
2014         int how;
2015         int generation;
2016         int maxloops = 300000;
2017         int wasdup;
2018         hammer2_chain_t * volatile xxchain = NULL;
2019         volatile int xxchainwhy;
2020
2021         if (flags & HAMMER2_LOOKUP_ALWAYS) {
2022                 how_maybe = how_always;
2023                 how = HAMMER2_RESOLVE_ALWAYS;
2024         } else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) {
2025                 how = HAMMER2_RESOLVE_NEVER;
2026         } else {
2027                 how = HAMMER2_RESOLVE_MAYBE;
2028         }
2029         if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) {
2030                 how_maybe |= HAMMER2_RESOLVE_SHARED;
2031                 how_always |= HAMMER2_RESOLVE_SHARED;
2032                 how |= HAMMER2_RESOLVE_SHARED;
2033         }
2034
2035         /*
2036          * Recurse (*parentp) upward if necessary until the parent completely
2037          * encloses the key range or we hit the inode.
2038          *
2039          * This function handles races against the flusher doing a delete-
2040          * duplicate above us and re-homes the parent to the duplicate in
2041          * that case, otherwise we'd wind up recursing down a stale chain.
2042          */
2043         parent = *parentp;
2044         hmp = parent->hmp;
2045
2046         while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
2047                parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2048                 scan_beg = parent->bref.key;
2049                 scan_end = scan_beg +
2050                            ((hammer2_key_t)1 << parent->bref.keybits) - 1;
2051                 if (key_beg >= scan_beg && key_end <= scan_end)
2052                         break;
2053                 parent = hammer2_chain_getparent(parentp, how_maybe);
2054         }
2055
2056 again:
2057         if (--maxloops == 0)
2058                 panic("hammer2_chain_lookup: maxloops");
2059         /*
2060          * Locate the blockref array.  Currently we do a fully associative
2061          * search through the array.
2062          */
2063         switch(parent->bref.type) {
2064         case HAMMER2_BREF_TYPE_INODE:
2065                 /*
2066                  * Special shortcut for embedded data returns the inode
2067                  * itself.  Callers must detect this condition and access
2068                  * the embedded data (the strategy code does this for us).
2069                  *
2070                  * This is only applicable to regular files and softlinks.
2071                  */
2072                 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
2073                         if (flags & HAMMER2_LOOKUP_NOLOCK)
2074                                 hammer2_chain_ref(parent);
2075                         else
2076                                 hammer2_chain_lock(parent, how_always);
2077                         *key_nextp = key_end + 1;
2078                         return (parent);
2079                 }
2080                 base = &parent->data->ipdata.u.blockset.blockref[0];
2081                 count = HAMMER2_SET_COUNT;
2082                 break;
2083         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2084         case HAMMER2_BREF_TYPE_INDIRECT:
2085                 /*
2086                  * Handle MATCHIND on the parent
2087                  */
2088                 if (flags & HAMMER2_LOOKUP_MATCHIND) {
2089                         scan_beg = parent->bref.key;
2090                         scan_end = scan_beg +
2091                                ((hammer2_key_t)1 << parent->bref.keybits) - 1;
2092                         if (key_beg == scan_beg && key_end == scan_end) {
2093                                 chain = parent;
2094                                 hammer2_chain_lock(chain, how_maybe);
2095                                 *key_nextp = scan_end + 1;
2096                                 goto done;
2097                         }
2098                 }
2099                 /*
2100                  * Optimize indirect blocks in the INITIAL state to avoid
2101                  * I/O.
2102                  */
2103                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
2104                         base = NULL;
2105                 } else {
2106                         if (parent->data == NULL)
2107                                 panic("parent->data is NULL");
2108                         base = &parent->data->npdata[0];
2109                 }
2110                 count = parent->bytes / sizeof(hammer2_blockref_t);
2111                 break;
2112         case HAMMER2_BREF_TYPE_VOLUME:
2113                 base = &hmp->voldata.sroot_blockset.blockref[0];
2114                 count = HAMMER2_SET_COUNT;
2115                 break;
2116         case HAMMER2_BREF_TYPE_FREEMAP:
2117                 base = &hmp->voldata.freemap_blockset.blockref[0];
2118                 count = HAMMER2_SET_COUNT;
2119                 break;
2120         default:
2121                 panic("hammer2_chain_lookup: unrecognized blockref type: %d",
2122                       parent->bref.type);
2123                 base = NULL;    /* safety */
2124                 count = 0;      /* safety */
2125         }
2126
2127         /*
2128          * Merged scan to find next candidate.
2129          *
2130          * hammer2_base_*() functions require the above->live_* fields
2131          * to be synchronized.
2132          *
2133          * We need to hold the spinlock to access the block array and RB tree
2134          * and to interlock chain creation.
2135          */
2136         above = parent->core;
2137         if ((parent->core->flags & HAMMER2_CORE_COUNTEDBREFS) == 0)
2138                 hammer2_chain_countbrefs(parent, base, count);
2139
2140         /*
2141          * Combined search
2142          */
2143         spin_lock(&above->cst.spin);
2144         chain = hammer2_combined_find(parent, base, count,
2145                                       cache_indexp, key_nextp,
2146                                       key_beg, key_end, &bref);
2147         generation = above->generation;
2148
2149         /*
2150          * Exhausted parent chain, iterate.
2151          */
2152         if (bref == NULL) {
2153                 spin_unlock(&above->cst.spin);
2154                 if (key_beg == key_end) /* short cut single-key case */
2155                         return (NULL);
2156                 return (hammer2_chain_next(parentp, NULL, key_nextp,
2157                                            key_beg, key_end,
2158                                            cache_indexp, flags));
2159         }
2160
2161         /*
2162          * Selected from blockref or in-memory chain.
2163          */
2164         if (chain == NULL) {
2165                 bcopy = *bref;
2166                 spin_unlock(&above->cst.spin);
2167                 chain = hammer2_chain_get(parent, &bcopy, generation);
2168                 if (chain == NULL) {
2169                         kprintf("retry lookup parent %p keys %016jx:%016jx\n",
2170                                 parent, key_beg, key_end);
2171                         goto again;
2172                 }
2173                 if (bcmp(&bcopy, bref, sizeof(bcopy))) {
2174                         xxchain = chain;
2175                         xxchainwhy = 1;
2176                         hammer2_chain_drop(chain);
2177                         goto again;
2178                 }
2179                 wasdup = 0;
2180         } else {
2181                 hammer2_chain_ref(chain);
2182                 wasdup = ((chain->flags & HAMMER2_CHAIN_DUPLICATED) != 0);
2183                 spin_unlock(&above->cst.spin);
2184         }
2185
2186         /*
2187          * chain is referenced but not locked.  We must lock the chain
2188          * to obtain definitive DUPLICATED/DELETED state
2189          */
2190         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
2191             chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2192                 hammer2_chain_lock(chain, how_maybe | HAMMER2_RESOLVE_NOREF);
2193         } else {
2194                 hammer2_chain_lock(chain, how | HAMMER2_RESOLVE_NOREF);
2195         }
2196
2197         /*
2198          * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
2199          *
2200          * NOTE: Chain's key range is not relevant as there might be
2201          *       one-offs within the range that are not deleted.
2202          *
2203          * NOTE: Lookups can race delete-duplicate because
2204          *       delete-duplicate does not lock the parent's core
2205          *       (they just use the spinlock on the core).  We must
2206          *       check for races by comparing the DUPLICATED flag before
2207          *       releasing the spinlock with the flag after locking the
2208          *       chain.
2209          */
2210         if (chain->flags & HAMMER2_CHAIN_DELETED) {
2211                 hammer2_chain_unlock(chain);
2212                 if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0 || wasdup) {
2213                         key_beg = *key_nextp;
2214                         if (key_beg == 0 || key_beg > key_end)
2215                                 return(NULL);
2216                 }
2217                 xxchain = chain;
2218                         xxchainwhy = 2;
2219                 goto again;
2220         }
2221
2222         /*
2223          * If the chain element is an indirect block it becomes the new
2224          * parent and we loop on it.  We must maintain our top-down locks
2225          * to prevent the flusher from interfering (i.e. doing a
2226          * delete-duplicate and leaving us recursing down a deleted chain).
2227          *
2228          * The parent always has to be locked with at least RESOLVE_MAYBE
2229          * so we can access its data.  It might need a fixup if the caller
2230          * passed incompatible flags.  Be careful not to cause a deadlock
2231          * as a data-load requires an exclusive lock.
2232          *
2233          * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
2234          * range is within the requested key range we return the indirect
2235          * block and do NOT loop.  This is usually only used to acquire
2236          * freemap nodes.
2237          */
2238         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
2239             chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2240                 hammer2_chain_unlock(parent);
2241                 *parentp = parent = chain;
2242                 xxchain = chain;
2243                         xxchainwhy = 3;
2244                 goto again;
2245         }
2246 done:
2247         /*
2248          * All done, return the chain
2249          */
2250         XXChain = chain;
2251         XXChainWhy = xxchainwhy;
2252         return (chain);
2253 }
2254
2255 /*
2256  * After having issued a lookup we can iterate all matching keys.
2257  *
2258  * If chain is non-NULL we continue the iteration from just after it's index.
2259  *
2260  * If chain is NULL we assume the parent was exhausted and continue the
2261  * iteration at the next parent.
2262  *
2263  * parent must be locked on entry and remains locked throughout.  chain's
2264  * lock status must match flags.  Chain is always at least referenced.
2265  *
2266  * WARNING!  The MATCHIND flag does not apply to this function.
2267  */
2268 hammer2_chain_t *
2269 hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
2270                    hammer2_key_t *key_nextp,
2271                    hammer2_key_t key_beg, hammer2_key_t key_end,
2272                    int *cache_indexp, int flags)
2273 {
2274         hammer2_chain_t *parent;
2275         int how_maybe;
2276
2277         /*
2278          * Calculate locking flags for upward recursion.
2279          */
2280         how_maybe = HAMMER2_RESOLVE_MAYBE;
2281         if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
2282                 how_maybe |= HAMMER2_RESOLVE_SHARED;
2283
2284         parent = *parentp;
2285
2286         /*
2287          * Calculate the next index and recalculate the parent if necessary.
2288          */
2289         if (chain) {
2290                 key_beg = chain->bref.key +
2291                           ((hammer2_key_t)1 << chain->bref.keybits);
2292                 if (flags & HAMMER2_LOOKUP_NOLOCK)
2293                         hammer2_chain_drop(chain);
2294                 else
2295                         hammer2_chain_unlock(chain);
2296
2297                 /*
2298                  * Any scan where the lookup returned degenerate data embedded
2299                  * in the inode has an invalid index and must terminate.
2300                  */
2301                 if (chain == parent)
2302                         return(NULL);
2303                 if (key_beg == 0 || key_beg > key_end)
2304                         return(NULL);
2305                 chain = NULL;
2306         } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
2307                    parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
2308                 /*
2309                  * We reached the end of the iteration.
2310                  */
2311                 return (NULL);
2312         } else {
2313                 /*
2314                  * Continue iteration with next parent unless the current
2315                  * parent covers the range.
2316                  */
2317                 key_beg = parent->bref.key +
2318                           ((hammer2_key_t)1 << parent->bref.keybits);
2319                 if (key_beg == 0 || key_beg > key_end)
2320                         return (NULL);
2321                 parent = hammer2_chain_getparent(parentp, how_maybe);
2322         }
2323
2324         /*
2325          * And execute
2326          */
2327         return (hammer2_chain_lookup(parentp, key_nextp,
2328                                      key_beg, key_end,
2329                                      cache_indexp, flags));
2330 }
2331
2332 /*
2333  * Raw scan functions are similar to lookup/next but do not seek the parent
2334  * chain and do not skip stale chains.  These functions are primarily used
2335  * by the recovery code.
2336  *
2337  * Parent and chain are locked, parent's data must be resolved.  To acquire
2338  * the first sub-chain under parent pass chain == NULL.
2339  */
2340 hammer2_chain_t *
2341 hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t *chain,
2342                    int *cache_indexp, int flags)
2343 {
2344         hammer2_mount_t *hmp;
2345         hammer2_blockref_t *base;
2346         hammer2_blockref_t *bref;
2347         hammer2_blockref_t bcopy;
2348         hammer2_chain_core_t *above;
2349         hammer2_key_t key;
2350         hammer2_key_t next_key;
2351         int count = 0;
2352         int how_always = HAMMER2_RESOLVE_ALWAYS;
2353         int how_maybe = HAMMER2_RESOLVE_MAYBE;
2354         int how;
2355         int generation;
2356         int maxloops = 300000;
2357         int wasdup;
2358
2359         hmp = parent->hmp;
2360
2361         /*
2362          * Scan flags borrowed from lookup
2363          */
2364         if (flags & HAMMER2_LOOKUP_ALWAYS) {
2365                 how_maybe = how_always;
2366                 how = HAMMER2_RESOLVE_ALWAYS;
2367         } else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) {
2368                 how = HAMMER2_RESOLVE_NEVER;
2369         } else {
2370                 how = HAMMER2_RESOLVE_MAYBE;
2371         }
2372         if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) {
2373                 how_maybe |= HAMMER2_RESOLVE_SHARED;
2374                 how_always |= HAMMER2_RESOLVE_SHARED;
2375                 how |= HAMMER2_RESOLVE_SHARED;
2376         }
2377
2378         /*
2379          * Calculate key to locate first/next element, unlocking the previous
2380          * element as we go.  Be careful, the key calculation can overflow.
2381          */
2382         if (chain) {
2383                 key = chain->bref.key +
2384                       ((hammer2_key_t)1 << chain->bref.keybits);
2385                 hammer2_chain_unlock(chain);
2386                 chain = NULL;
2387                 if (key == 0)
2388                         goto done;
2389         } else {
2390                 key = 0;
2391         }
2392
2393 again:
2394         if (--maxloops == 0)
2395                 panic("hammer2_chain_scan: maxloops");
2396         /*
2397          * Locate the blockref array.  Currently we do a fully associative
2398          * search through the array.
2399          */
2400         switch(parent->bref.type) {
2401         case HAMMER2_BREF_TYPE_INODE:
2402                 /*
2403                  * An inode with embedded data has no sub-chains.
2404                  */
2405                 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA)
2406                         goto done;
2407                 base = &parent->data->ipdata.u.blockset.blockref[0];
2408                 count = HAMMER2_SET_COUNT;
2409                 break;
2410         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2411         case HAMMER2_BREF_TYPE_INDIRECT:
2412                 /*
2413                  * Optimize indirect blocks in the INITIAL state to avoid
2414                  * I/O.
2415                  */
2416                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
2417                         base = NULL;
2418                 } else {
2419                         if (parent->data == NULL)
2420                                 panic("parent->data is NULL");
2421                         base = &parent->data->npdata[0];
2422                 }
2423                 count = parent->bytes / sizeof(hammer2_blockref_t);
2424                 break;
2425         case HAMMER2_BREF_TYPE_VOLUME:
2426                 base = &hmp->voldata.sroot_blockset.blockref[0];
2427                 count = HAMMER2_SET_COUNT;
2428                 break;
2429         case HAMMER2_BREF_TYPE_FREEMAP:
2430                 base = &hmp->voldata.freemap_blockset.blockref[0];
2431                 count = HAMMER2_SET_COUNT;
2432                 break;
2433         default:
2434                 panic("hammer2_chain_lookup: unrecognized blockref type: %d",
2435                       parent->bref.type);
2436                 base = NULL;    /* safety */
2437                 count = 0;      /* safety */
2438         }
2439
2440         /*
2441          * Merged scan to find next candidate.
2442          *
2443          * hammer2_base_*() functions require the above->live_* fields
2444          * to be synchronized.
2445          *
2446          * We need to hold the spinlock to access the block array and RB tree
2447          * and to interlock chain creation.
2448          */
2449         if ((parent->core->flags & HAMMER2_CORE_COUNTEDBREFS) == 0)
2450                 hammer2_chain_countbrefs(parent, base, count);
2451
2452         above = parent->core;
2453         next_key = 0;
2454         spin_lock(&above->cst.spin);
2455         chain = hammer2_combined_find(parent, base, count,
2456                                       cache_indexp, &next_key,
2457                                       key, HAMMER2_MAX_KEY, &bref);
2458         generation = above->generation;
2459
2460         /*
2461          * Exhausted parent chain, we're done.
2462          */
2463         if (bref == NULL) {
2464                 spin_unlock(&above->cst.spin);
2465                 KKASSERT(chain == NULL);
2466                 goto done;
2467         }
2468
2469         /*
2470          * Selected from blockref or in-memory chain.
2471          */
2472         if (chain == NULL) {
2473                 bcopy = *bref;
2474                 spin_unlock(&above->cst.spin);
2475                 chain = hammer2_chain_get(parent, &bcopy, generation);
2476                 if (chain == NULL) {
2477                         kprintf("retry scan parent %p keys %016jx\n",
2478                                 parent, key);
2479                         goto again;
2480                 }
2481                 if (bcmp(&bcopy, bref, sizeof(bcopy))) {
2482                         hammer2_chain_drop(chain);
2483                         chain = NULL;
2484                         goto again;
2485                 }
2486                 wasdup = 0;
2487         } else {
2488                 hammer2_chain_ref(chain);
2489                 wasdup = ((chain->flags & HAMMER2_CHAIN_DUPLICATED) != 0);
2490                 spin_unlock(&above->cst.spin);
2491         }
2492
2493         /*
2494          * chain is referenced but not locked.  We must lock the chain
2495          * to obtain definitive DUPLICATED/DELETED state
2496          */
2497         hammer2_chain_lock(chain, how | HAMMER2_RESOLVE_NOREF);
2498
2499         /*
2500          * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
2501          *
2502          * NOTE: chain's key range is not relevant as there might be
2503          *       one-offs within the range that are not deleted.
2504          *
2505          * NOTE: XXX this could create problems with scans used in
2506          *       situations other than mount-time recovery.
2507          *
2508          * NOTE: Lookups can race delete-duplicate because
2509          *       delete-duplicate does not lock the parent's core
2510          *       (they just use the spinlock on the core).  We must
2511          *       check for races by comparing the DUPLICATED flag before
2512          *       releasing the spinlock with the flag after locking the
2513          *       chain.
2514          */
2515         if (chain->flags & HAMMER2_CHAIN_DELETED) {
2516                 hammer2_chain_unlock(chain);
2517                 chain = NULL;
2518
2519                 if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0 || wasdup) {
2520                         key = next_key;
2521                         if (key == 0)
2522                                 goto done;
2523                 }
2524                 goto again;
2525         }
2526
2527 done:
2528         /*
2529          * All done, return the chain or NULL
2530          */
2531         return (chain);
2532 }
2533
2534 /*
2535  * Create and return a new hammer2 system memory structure of the specified
2536  * key, type and size and insert it under (*parentp).  This is a full
2537  * insertion, based on the supplied key/keybits, and may involve creating
2538  * indirect blocks and moving other chains around via delete/duplicate.
2539  *
2540  * (*parentp) must be exclusive locked and may be replaced on return
2541  * depending on how much work the function had to do.
2542  *
2543  * (*chainp) usually starts out NULL and returns the newly created chain,
2544  * but if the caller desires the caller may allocate a disconnected chain
2545  * and pass it in instead.  (It is also possible for the caller to use
2546  * chain_duplicate() to create a disconnected chain, manipulate it, then
2547  * pass it into this function to insert it).
2548  *
2549  * This function should NOT be used to insert INDIRECT blocks.  It is
2550  * typically used to create/insert inodes and data blocks.
2551  *
2552  * Caller must pass-in an exclusively locked parent the new chain is to
2553  * be inserted under, and optionally pass-in a disconnected, exclusively
2554  * locked chain to insert (else we create a new chain).  The function will
2555  * adjust (*parentp) as necessary, create or connect the chain, and
2556  * return an exclusively locked chain in *chainp.
2557  */
2558 int
2559 hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp,
2560                      hammer2_chain_t **chainp,
2561                      hammer2_key_t key, int keybits, int type, size_t bytes)
2562 {
2563         hammer2_mount_t *hmp;
2564         hammer2_chain_t *chain;
2565         hammer2_chain_t *parent = *parentp;
2566         hammer2_chain_core_t *above;
2567         hammer2_blockref_t *base;
2568         hammer2_blockref_t dummy;
2569         int allocated = 0;
2570         int error = 0;
2571         int count;
2572         int maxloops = 300000;
2573
2574         above = parent->core;
2575         KKASSERT(ccms_thread_lock_owned(&above->cst));
2576         hmp = parent->hmp;
2577         chain = *chainp;
2578
2579         if (chain == NULL) {
2580                 /*
2581                  * First allocate media space and construct the dummy bref,
2582                  * then allocate the in-memory chain structure.  Set the
2583                  * INITIAL flag for fresh chains which do not have embedded
2584                  * data.
2585                  */
2586                 bzero(&dummy, sizeof(dummy));
2587                 dummy.type = type;
2588                 dummy.key = key;
2589                 dummy.keybits = keybits;
2590                 dummy.data_off = hammer2_getradix(bytes);
2591                 dummy.methods = parent->bref.methods;
2592                 chain = hammer2_chain_alloc(hmp, parent->pmp, trans, &dummy);
2593                 hammer2_chain_core_alloc(trans, chain, NULL);
2594
2595                 /*
2596                  * Lock the chain manually, chain_lock will load the chain
2597                  * which we do NOT want to do.  (note: chain->refs is set
2598                  * to 1 by chain_alloc() for us, but lockcnt is not).
2599                  */
2600                 chain->lockcnt = 1;
2601                 ccms_thread_lock(&chain->core->cst, CCMS_STATE_EXCLUSIVE);
2602                 allocated = 1;
2603
2604                 /*
2605                  * We do NOT set INITIAL here (yet).  INITIAL is only
2606                  * used for indirect blocks.
2607                  *
2608                  * Recalculate bytes to reflect the actual media block
2609                  * allocation.
2610                  */
2611                 bytes = (hammer2_off_t)1 <<
2612                         (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
2613                 chain->bytes = bytes;
2614
2615                 switch(type) {
2616                 case HAMMER2_BREF_TYPE_VOLUME:
2617                 case HAMMER2_BREF_TYPE_FREEMAP:
2618                         panic("hammer2_chain_create: called with volume type");
2619                         break;
2620                 case HAMMER2_BREF_TYPE_INODE:
2621                         KKASSERT(bytes == HAMMER2_INODE_BYTES);
2622                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_EMBEDDED);
2623                         chain->data = kmalloc(sizeof(chain->data->ipdata),
2624                                               hmp->mchain, M_WAITOK | M_ZERO);
2625                         break;
2626                 case HAMMER2_BREF_TYPE_INDIRECT:
2627                         panic("hammer2_chain_create: cannot be used to"
2628                               "create indirect block");
2629                         break;
2630                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2631                         panic("hammer2_chain_create: cannot be used to"
2632                               "create freemap root or node");
2633                         break;
2634                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2635                         KKASSERT(bytes == sizeof(chain->data->bmdata));
2636                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_EMBEDDED);
2637                         chain->data = kmalloc(sizeof(chain->data->bmdata),
2638                                               hmp->mchain, M_WAITOK | M_ZERO);
2639                         break;
2640                 case HAMMER2_BREF_TYPE_DATA:
2641                 default:
2642                         /*
2643                          * leave chain->data NULL, set INITIAL
2644                          */
2645                         KKASSERT(chain->data == NULL);
2646                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
2647                         break;
2648                 }
2649         } else {
2650                 /*
2651                  * We are reattaching a chain that has been duplicated and
2652                  * left disconnected under a DIFFERENT parent with potentially
2653                  * different key/keybits.
2654                  *
2655                  * The chain must be modified in the current transaction
2656                  * (the duplication code should have done that for us),
2657                  * and it's modify_tid should be greater than the parent's
2658                  * bref.mirror_tid.  This should cause it to be created under
2659                  * the new parent.
2660                  *
2661                  * If deleted in the same transaction, the create/delete TIDs
2662                  * will be the same and effective the chain will not have
2663                  * existed at all from the point of view of the parent.
2664                  *
2665                  * Do NOT mess with the current state of the INITIAL flag.
2666                  */
2667                 KKASSERT(chain->modify_tid > parent->bref.mirror_tid);
2668                 KKASSERT(chain->modify_tid == trans->sync_tid);
2669                 chain->bref.key = key;
2670                 chain->bref.keybits = keybits;
2671                 /* chain->modify_tid = chain->bref.mirror_tid; */
2672                 KKASSERT(chain->above == NULL);
2673         }
2674
2675         /*
2676          * Calculate how many entries we have in the blockref array and
2677          * determine if an indirect block is required.
2678          */
2679 again:
2680         if (--maxloops == 0)
2681                 panic("hammer2_chain_create: maxloops");
2682         above = parent->core;
2683
2684         switch(parent->bref.type) {
2685         case HAMMER2_BREF_TYPE_INODE:
2686                 KKASSERT((parent->data->ipdata.op_flags &
2687                           HAMMER2_OPFLAG_DIRECTDATA) == 0);
2688                 KKASSERT(parent->data != NULL);
2689                 base = &parent->data->ipdata.u.blockset.blockref[0];
2690                 count = HAMMER2_SET_COUNT;
2691                 break;
2692         case HAMMER2_BREF_TYPE_INDIRECT:
2693         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2694                 if (parent->flags & HAMMER2_CHAIN_INITIAL)
2695                         base = NULL;
2696                 else
2697                         base = &parent->data->npdata[0];
2698                 count = parent->bytes / sizeof(hammer2_blockref_t);
2699                 break;
2700         case HAMMER2_BREF_TYPE_VOLUME:
2701                 KKASSERT(parent->data != NULL);
2702                 base = &hmp->voldata.sroot_blockset.blockref[0];
2703                 count = HAMMER2_SET_COUNT;
2704                 break;
2705         case HAMMER2_BREF_TYPE_FREEMAP:
2706                 KKASSERT(parent->data != NULL);
2707                 base = &hmp->voldata.freemap_blockset.blockref[0];
2708                 count = HAMMER2_SET_COUNT;
2709                 break;
2710         default:
2711                 panic("hammer2_chain_create: unrecognized blockref type: %d",
2712                       parent->bref.type);
2713                 base = NULL;
2714                 count = 0;
2715                 break;
2716         }
2717
2718         /*
2719          * Make sure we've counted the brefs
2720          */
2721         if ((parent->core->flags & HAMMER2_CORE_COUNTEDBREFS) == 0)
2722                 hammer2_chain_countbrefs(parent, base, count);
2723
2724         KKASSERT(above->live_count >= 0 && above->live_count <= count);
2725
2726         /*
2727          * If no free blockref could be found we must create an indirect
2728          * block and move a number of blockrefs into it.  With the parent
2729          * locked we can safely lock each child in order to delete+duplicate
2730          * it without causing a deadlock.
2731          *
2732          * This may return the new indirect block or the old parent depending
2733          * on where the key falls.  NULL is returned on error.
2734          */
2735         if (above->live_count == count) {
2736                 hammer2_chain_t *nparent;
2737
2738                 nparent = hammer2_chain_create_indirect(trans, parent,
2739                                                         key, keybits,
2740                                                         type, &error);
2741                 if (nparent == NULL) {
2742                         if (allocated)
2743                                 hammer2_chain_drop(chain);
2744                         chain = NULL;
2745                         goto done;
2746                 }
2747                 if (parent != nparent) {
2748                         hammer2_chain_unlock(parent);
2749                         parent = *parentp = nparent;
2750                 }
2751                 goto again;
2752         }
2753
2754         /*
2755          * Link the chain into its parent.  Later on we will have to set
2756          * the MOVED bit in situations where we don't mark the new chain
2757          * as being modified.
2758          */
2759         if (chain->above != NULL)
2760                 panic("hammer2: hammer2_chain_create: chain already connected");
2761         KKASSERT(chain->above == NULL);
2762         KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
2763         hammer2_chain_insert(above, NULL, chain,
2764                              HAMMER2_CHAIN_INSERT_SPIN |
2765                              HAMMER2_CHAIN_INSERT_LIVE,
2766                              0);
2767
2768         if (allocated) {
2769                 /*
2770                  * Mark the newly created chain modified.
2771                  *
2772                  * Device buffers are not instantiated for DATA elements
2773                  * as these are handled by logical buffers.
2774                  *
2775                  * Indirect and freemap node indirect blocks are handled
2776                  * by hammer2_chain_create_indirect() and not by this
2777                  * function.
2778                  *
2779                  * Data for all other bref types is expected to be
2780                  * instantiated (INODE, LEAF).
2781                  */
2782                 switch(chain->bref.type) {
2783                 case HAMMER2_BREF_TYPE_DATA:
2784                         hammer2_chain_modify(trans, &chain,
2785                                              HAMMER2_MODIFY_OPTDATA |
2786                                              HAMMER2_MODIFY_ASSERTNOCOPY);
2787                         break;
2788                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2789                 case HAMMER2_BREF_TYPE_INODE:
2790                         hammer2_chain_modify(trans, &chain,
2791                                              HAMMER2_MODIFY_ASSERTNOCOPY);
2792                         break;
2793                 default:
2794                         /*
2795                          * Remaining types are not supported by this function.
2796                          * In particular, INDIRECT and LEAF_NODE types are
2797                          * handled by create_indirect().
2798                          */
2799                         panic("hammer2_chain_create: bad type: %d",
2800                               chain->bref.type);
2801                         /* NOT REACHED */
2802                         break;
2803                 }
2804         } else {
2805                 /*
2806                  * When reconnecting a chain we must set MOVED and setsubmod
2807                  * so the flush recognizes that it must update the bref in
2808                  * the parent.
2809                  */
2810                 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
2811                         hammer2_chain_ref(chain);
2812                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2813                 }
2814         }
2815         hammer2_chain_setsubmod(trans, chain);
2816
2817 done:
2818         *chainp = chain;
2819
2820         return (error);
2821 }
2822
2823 /*
2824  * Replace (*chainp) with a duplicate in-memory chain structure which shares
2825  * the same core and media state as the orignal.  The original *chainp is
2826  * unlocked and the replacement will be returned locked.
2827  *
2828  * The old chain must be in a DELETED state unless snapshot is non-zero.
2829  *
2830  * The new chain will be live (i.e. not deleted), and modified.
2831  *
2832  * If (parent) is non-NULL then the new duplicated chain is inserted under
2833  * the parent.
2834  *
2835  * If (parent) is NULL then the newly duplicated chain is not inserted
2836  * anywhere, similar to if it had just been chain_alloc()'d (suitable for
2837  * passing into hammer2_chain_create() after this function returns).
2838  *
2839  * WARNING! This function cannot take snapshots all by itself.  The caller
2840  *          needs to do other massaging for snapshots.
2841  *
2842  * WARNING! This function calls create which means it can insert indirect
2843  *          blocks.  Callers may have to refactor locked chains held across
2844  *          the call (other than the ones passed into the call).
2845  */
2846 static void hammer2_chain_dup_fixup(hammer2_chain_t *ochain,
2847                                     hammer2_chain_t *nchain);
2848
2849 void
2850 hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
2851                         hammer2_chain_t **chainp, hammer2_blockref_t *bref,
2852                         int snapshot, int duplicate_reason)
2853 {
2854         hammer2_mount_t *hmp;
2855         hammer2_chain_t *parent;
2856         hammer2_chain_t *ochain;
2857         hammer2_chain_t *nchain;
2858         hammer2_chain_core_t *above;
2859         size_t bytes;
2860
2861         /*
2862          * We want nchain to be our go-to live chain, but ochain may be in
2863          * a MODIFIED state within the current flush synchronization segment.
2864          * Force any further modifications of ochain to do another COW
2865          * operation even if modify_tid indicates that one is not needed.
2866          *
2867          * We don't want to set FORCECOW on nchain simply as an optimization,
2868          * as many duplication calls simply move chains into ichains and
2869          * then delete the original.
2870          *
2871          * WARNING!  We should never resolve DATA to device buffers
2872          *           (XXX allow it if the caller did?), and since
2873          *           we currently do not have the logical buffer cache
2874          *           buffer in-hand to fix its cached physical offset
2875          *           we also force the modify code to not COW it. XXX
2876          */
2877         ochain = *chainp;
2878         hmp = ochain->hmp;
2879         KKASSERT(snapshot == 1 || (ochain->flags & HAMMER2_CHAIN_DELETED));
2880
2881         atomic_set_int(&ochain->flags, HAMMER2_CHAIN_FORCECOW);
2882
2883         /*
2884          * Now create a duplicate of the chain structure, associating
2885          * it with the same core, making it the same size, pointing it
2886          * to the same bref (the same media block).
2887          *
2888          * Give the duplicate the same modify_tid that we previously
2889          * ensured was sufficiently advanced to trigger a block table
2890          * insertion on flush.
2891          *
2892          * NOTE: bref.mirror_tid duplicated by virtue of bref copy in
2893          *       hammer2_chain_alloc()
2894          */
2895         if (bref == NULL)
2896                 bref = &ochain->bref;
2897         if (snapshot) {
2898                 nchain = hammer2_chain_alloc(hmp, NULL, trans, bref);
2899                 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_SNAPSHOT);
2900         } else {
2901                 nchain = hammer2_chain_alloc(hmp, ochain->pmp, trans, bref);
2902         }
2903         hammer2_chain_core_alloc(trans, nchain, ochain);
2904         bytes = (hammer2_off_t)1 <<
2905                 (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
2906         nchain->bytes = bytes;
2907         nchain->modify_tid = ochain->modify_tid;
2908         nchain->inode_reason = ochain->inode_reason + 0x100000;
2909         if (ochain->flags & HAMMER2_CHAIN_INITIAL)
2910                 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_INITIAL);
2911
2912         /*
2913          * Fixup (copy) any embedded data.  Non-embedded data relies on the
2914          * media block.  We must unlock ochain before we can access nchain's
2915          * media block because they might share the same bp and deadlock if
2916          * we don't.
2917          */
2918         hammer2_chain_lock(nchain, HAMMER2_RESOLVE_NEVER |
2919                                    HAMMER2_RESOLVE_NOREF);
2920         hammer2_chain_dup_fixup(ochain, nchain);
2921         /* nchain has 1 ref */
2922         hammer2_chain_unlock(ochain);
2923         KKASSERT((ochain->flags & HAMMER2_CHAIN_EMBEDDED) ||
2924                  ochain->data == NULL);
2925
2926         /*
2927          * Place nchain in the modified state, instantiate media data
2928          * if necessary.  Because modify_tid is already completely
2929          * synchronized this should not result in a delete-duplicate.
2930          *
2931          * We want nchain at the target to look like a new insertion.
2932          * Forcing the modification to be INPLACE accomplishes this
2933          * because we get the same nchain with an updated modify_tid.
2934          */
2935         if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
2936                 hammer2_chain_modify(trans, &nchain,
2937                                      HAMMER2_MODIFY_OPTDATA |
2938                                      HAMMER2_MODIFY_NOREALLOC |
2939                                      HAMMER2_MODIFY_INPLACE);
2940         } else if (nchain->flags & HAMMER2_CHAIN_INITIAL) {
2941                 hammer2_chain_modify(trans, &nchain,
2942                                      HAMMER2_MODIFY_OPTDATA |
2943                                      HAMMER2_MODIFY_INPLACE);
2944         } else {
2945                 hammer2_chain_modify(trans, &nchain,
2946                                      HAMMER2_MODIFY_INPLACE);
2947         }
2948
2949         /*
2950          * If parent is not NULL the duplicated chain will be entered under
2951          * the parent and the MOVED bit set.
2952          *
	 * Having both chains locked is extremely important for atomicity.
2954          */
2955         if (parentp && (parent = *parentp) != NULL) {
2956                 above = parent->core;
2957                 KKASSERT(ccms_thread_lock_owned(&above->cst));
2958                 KKASSERT((nchain->flags & HAMMER2_CHAIN_DELETED) == 0);
2959                 KKASSERT(parent->refs > 0);
2960
2961                 hammer2_chain_create(trans, parentp, &nchain,
2962                                      nchain->bref.key, nchain->bref.keybits,
2963                                      nchain->bref.type, nchain->bytes);
2964                 parent = NULL;
2965
2966                 if ((nchain->flags & HAMMER2_CHAIN_MOVED) == 0) {
2967                         hammer2_chain_ref(nchain);
2968                         atomic_set_int(&nchain->flags, HAMMER2_CHAIN_MOVED);
2969                 }
2970                 hammer2_chain_setsubmod(trans, nchain);
2971         }
2972
2973 #if 0
2974         /*
2975          * Unconditionally set MOVED to force the parent blockrefs to
2976          * update, and adjust update_hi below nchain so nchain's
2977          * blockrefs are updated with the new attachment.
2978          */
2979         if (nchain->core->update_hi < trans->sync_tid) {
2980                 spin_lock(&nchain->core->cst.spin);
2981                 if (nchain->core->update_hi < trans->sync_tid)
2982                         nchain->core->update_hi = trans->sync_tid;
2983                 spin_unlock(&nchain->core->cst.spin);
2984         }
2985 #endif
2986
2987         *chainp = nchain;
2988 }
2989
2990 /*
2991  * Special in-place delete-duplicate sequence which does not require a
2992  * locked parent.  (*chainp) is marked DELETED and atomically replaced
 * with a duplicate.  Atomicity is at the very-fine spin-lock level in
2994  * order to ensure that lookups do not race us.
2995  *
2996  * If the old chain is already marked deleted the new chain will also be
2997  * marked deleted.  This case can occur when an inode is removed from the
2998  * filesystem but programs still have an open descriptor to it, and during
2999  * flushes when the flush needs to operate on a chain that is deleted in
3000  * the live view but still alive in the flush view.
3001  *
3002  * The new chain will be marked modified for the current transaction.
3003  */
void
hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
			       int flags)
{
	hammer2_mount_t *hmp;
	hammer2_chain_t *ochain;	/* original chain, replaced on return */
	hammer2_chain_t *nchain;	/* duplicate chain, returned in *chainp */
	hammer2_chain_core_t *above;	/* parent core nchain is inserted under */
	size_t bytes;

	/* Optional debug hook, gated by the hammer2_debug sysctl mask */
	if (hammer2_debug & 0x20000)
		Debugger("dd");

	/*
	 * Note that we do not have to call setsubmod on ochain, calling it
	 * on nchain is sufficient.
	 */
	ochain = *chainp;
	hmp = ochain->hmp;

	/*
	 * An inode's embedded data must be resolved so the dup_fixup
	 * below can copy it into nchain.
	 */
	if (ochain->bref.type == HAMMER2_BREF_TYPE_INODE) {
		KKASSERT(ochain->data);
	}

	/*
	 * First create a duplicate of the chain structure.
	 * (nchain is allocated with one ref).
	 *
	 * In the case where nchain inherits ochain's core, nchain is
	 * effectively locked due to ochain being locked (and sharing the
	 * core), until we can give nchain its own official lock.
	 * HAMMER2_DELDUP_RECORE requests a fresh core instead.
	 */
	nchain = hammer2_chain_alloc(hmp, ochain->pmp, trans, &ochain->bref);
	if (flags & HAMMER2_DELDUP_RECORE)
		hammer2_chain_core_alloc(trans, nchain, NULL);
	else
		hammer2_chain_core_alloc(trans, nchain, ochain);
	above = ochain->above;

	/* nchain covers the same media extent; size decoded from data_off */
	bytes = (hammer2_off_t)1 <<
		(int)(ochain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
	nchain->bytes = bytes;

	/*
	 * Duplicate inherits ochain's live state including its modification
	 * state.  This function disposes of the original.  Because we are
	 * doing this in-place under the same parent the block array
	 * inserted/deleted state does not change.
	 *
	 * The caller isn't expected to make further modifications of ochain
	 * but set the FORCECOW bit anyway, just in case it does.  If ochain
	 * was previously marked FORCECOW we also flag nchain FORCECOW
	 * (used during hardlink splits).  FORCECOW forces a reallocation
	 * of the block when we modify the chain a little later, it does
	 * not force another delete-duplicate.
	 *
	 * NOTE: bref.mirror_tid duplicated by virtue of bref copy in
	 *	 hammer2_chain_alloc()
	 */
	nchain->data_count += ochain->data_count;
	nchain->inode_count += ochain->inode_count;
	atomic_set_int(&nchain->flags,
		       ochain->flags & (HAMMER2_CHAIN_INITIAL |
					HAMMER2_CHAIN_FORCECOW));
	atomic_set_int(&ochain->flags, HAMMER2_CHAIN_FORCECOW);
	nchain->inode_reason = ochain->inode_reason + 0x1000;

	/*
	 * Lock nchain so both chains are now locked (extremely important
	 * for atomicity).  Mark ochain deleted and reinsert into the
	 * topology and insert nchain all in one go.
	 *
	 * If the ochain is already deleted it is left alone and nchain
	 * is inserted into the topology as a deleted chain.  This is
	 * important because it allows ongoing operations to be executed
	 * on a deleted inode which still has open descriptors.
	 *
	 * The deleted case can also occur when a flush delete-duplicates
	 * a node which is being concurrently modified by ongoing operations
	 * in a later transaction.  This creates a problem because the flush
	 * is intended to update blockrefs which then propagate, allowing
	 * the original covering in-memory chains to be freed up.  In this
	 * situation the flush code does NOT free the original covering
	 * chains and will re-apply them to successive copies.
	 */
	hammer2_chain_lock(nchain, HAMMER2_RESOLVE_NEVER);
	hammer2_chain_dup_fixup(ochain, nchain);
	/* extra ref still present from original allocation */

	KKASSERT(ochain->flags & HAMMER2_CHAIN_ONRBTREE);
	spin_lock(&above->cst.spin);
	/* re-assert ONRBTREE now that we hold the parent core's spinlock */
	KKASSERT(ochain->flags & HAMMER2_CHAIN_ONRBTREE);

	/*
	 * Ultimately nchain->modify_tid will be set to trans->sync_tid,
	 * but we can't do that here because we want to call
	 * hammer2_chain_modify() to reallocate the block (if necessary).
	 */
	nchain->modify_tid = ochain->modify_tid;

	if (ochain->flags & HAMMER2_CHAIN_DELETED) {
		/*
		 * ochain was deleted; nchain enters the topology deleted too.
		 */
		atomic_set_int(&nchain->flags, HAMMER2_CHAIN_DELETED);
		if (ochain->delete_tid > trans->sync_tid) {
			/*
			 * delete-duplicate a chain deleted in a later
			 * transaction.  Only allowed on chains created
			 * before or during the current transaction (flush
			 * code should filter out chains created after the
			 * current transaction).
			 *
			 * To make this work is a bit of a hack.  We convert
			 * ochain's delete_tid to the current sync_tid and
			 * create a nchain which sets up ochain's original
			 * delete_tid.
			 *
			 * This effectively forces ochain to flush as a
			 * deletion and nchain as a creation.  Thus MOVED
			 * must be set in ochain (it should already be
			 * set since its original delete_tid could not
			 * have been flushed yet).  Since ochain's delete_tid
			 * has been moved down to sync_tid, a re-flush at
			 * sync_tid won't try to delete-duplicate ochain
			 * again.
			 */
			KKASSERT(ochain->modify_tid <= trans->sync_tid);
			nchain->delete_tid = ochain->delete_tid;
			ochain->delete_tid = trans->sync_tid;
			KKASSERT(ochain->flags & HAMMER2_CHAIN_MOVED);
		} else if (ochain->delete_tid == trans->sync_tid) {
			/*
			 * ochain was deleted in the current transaction
			 */
			nchain->delete_tid = trans->sync_tid;
		} else {
			/*
			 * ochain was deleted in a prior transaction.
			 * create and delete nchain in the current
			 * transaction.
			 *
			 * (delete_tid might represent a deleted inode
			 *  which still has an open descriptor).
			 */
			nchain->delete_tid = trans->sync_tid;
		}
		hammer2_chain_insert(above, ochain->inlayer, nchain, 0, 0);
	} else {
		/*
		 * ochain was not deleted, delete it in the current
		 * transaction.
		 */
		KKASSERT(trans->sync_tid >= ochain->modify_tid);
		ochain->delete_tid = trans->sync_tid;
		atomic_set_int(&ochain->flags, HAMMER2_CHAIN_DELETED);
		atomic_add_int(&above->live_count, -1);
		hammer2_chain_insert(above, NULL, nchain,
				     HAMMER2_CHAIN_INSERT_LIVE, 0);
	}

	/* the MOVED flag carries its own chain ref; acquire it if not set */
	if ((ochain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		hammer2_chain_ref(ochain);
		atomic_set_int(&ochain->flags, HAMMER2_CHAIN_MOVED);
	}
	spin_unlock(&above->cst.spin);

	/*
	 * ochain must be unlocked because ochain and nchain might share
	 * a buffer cache buffer, so we need to release it so nchain can
	 * potentially obtain it.
	 */
	hammer2_chain_unlock(ochain);

	/*
	 * Finish fixing up nchain.  A new block will be allocated if
	 * crossing a synchronization point (meta-data only).
	 *
	 * Calling hammer2_chain_modify() will update modify_tid to
	 * (typically) trans->sync_tid.
	 */
	if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
		hammer2_chain_modify(trans, &nchain,
				     HAMMER2_MODIFY_OPTDATA |
				     HAMMER2_MODIFY_NOREALLOC |
				     HAMMER2_MODIFY_INPLACE);
	} else if (nchain->flags & HAMMER2_CHAIN_INITIAL) {
		hammer2_chain_modify(trans, &nchain,
				     HAMMER2_MODIFY_OPTDATA |
				     HAMMER2_MODIFY_INPLACE);
	} else {
		hammer2_chain_modify(trans, &nchain,
				     HAMMER2_MODIFY_INPLACE);
	}
	/* drop the extra ref left over from hammer2_chain_alloc() */
	hammer2_chain_drop(nchain);

	/*
	 * Unconditionally set MOVED to force the parent blockrefs to
	 * update as the chain_modify() above won't necessarily do it.
	 *
	 * Adjust update_hi below nchain so nchain's blockrefs are updated
	 * with the new attachment.
	 */
	if ((nchain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		atomic_set_int(&nchain->flags, HAMMER2_CHAIN_MOVED);
		hammer2_chain_ref(nchain);
	}
#if 0
	if (nchain->core->update_hi < trans->sync_tid) {
		spin_lock(&nchain->core->cst.spin);
		if (nchain->core->update_hi < trans->sync_tid)
			nchain->core->update_hi = trans->sync_tid;
		spin_unlock(&nchain->core->cst.spin);
	}
#endif
	hammer2_chain_setsubmod(trans, nchain);
	*chainp = nchain;
}
3222
3223 /*
3224  * Helper function to fixup inodes.  The caller procedure stack may hold
3225  * multiple locks on ochain if it represents an inode, preventing our
3226  * unlock from retiring its state to the buffer cache.
3227  *
3228  * In this situation any attempt to access the buffer cache could result
3229  * either in stale data or a deadlock.  Work around the problem by copying
3230  * the embedded data directly.
3231  */
3232 static
3233 void
3234 hammer2_chain_dup_fixup(hammer2_chain_t *ochain, hammer2_chain_t *nchain)
3235 {
3236         if (ochain->data == NULL)
3237                 return;
3238         switch(ochain->bref.type) {
3239         case HAMMER2_BREF_TYPE_INODE:
3240                 KKASSERT(nchain->data == NULL);
3241                 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_EMBEDDED);
3242                 nchain->data = kmalloc(sizeof(nchain->data->ipdata),
3243                                        ochain->hmp->mchain, M_WAITOK | M_ZERO);
3244                 nchain->data->ipdata = ochain->data->ipdata;
3245                 break;
3246         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
3247                 KKASSERT(nchain->data == NULL);
3248                 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_EMBEDDED);
3249                 nchain->data = kmalloc(sizeof(nchain->data->bmdata),
3250                                        ochain->hmp->mchain, M_WAITOK | M_ZERO);
3251                 bcopy(ochain->data->bmdata,
3252                       nchain->data->bmdata,
3253                       sizeof(nchain->data->bmdata));
3254                 break;
3255         default:
3256                 break;
3257         }
3258 }
3259
3260 /*
3261  * Create a snapshot of the specified {parent, ochain} with the specified
3262  * label.  The originating hammer2_inode must be exclusively locked for
3263  * safety.
3264  *
3265  * The ioctl code has already synced the filesystem.
3266  */
3267 int
3268 hammer2_chain_snapshot(hammer2_trans_t *trans, hammer2_chain_t **ochainp,
3269                        hammer2_ioc_pfs_t *pfs)
3270 {
3271         hammer2_mount_t *hmp;
3272         hammer2_chain_t *ochain = *ochainp;
3273         hammer2_chain_t *nchain;
3274         hammer2_inode_data_t *ipdata;
3275         hammer2_inode_t *nip;
3276         size_t name_len;
3277         hammer2_key_t lhc;
3278         struct vattr vat;
3279         uuid_t opfs_clid;
3280         int error;
3281
3282         kprintf("snapshot %s ochain->refs %d ochain->flags %08x\n",
3283                 pfs->name, ochain->refs, ochain->flags);
3284
3285         name_len = strlen(pfs->name);
3286         lhc = hammer2_dirhash(pfs->name, name_len);
3287
3288         hmp = ochain->hmp;
3289         opfs_clid = ochain->data->ipdata.pfs_clid;
3290
3291         *ochainp = ochain;
3292
3293         /*
3294          * Create the snapshot directory under the super-root
3295          *
3296          * Set PFS type, generate a unique filesystem id, and generate
3297          * a cluster id.  Use the same clid when snapshotting a PFS root,
3298          * which theoretically allows the snapshot to be used as part of
3299          * the same cluster (perhaps as a cache).
3300          *
3301          * Copy the (flushed) ochain's blockref array.  Theoretically we
3302          * could use chain_duplicate() but it becomes difficult to disentangle
3303          * the shared core so for now just brute-force it.
3304          */
3305         VATTR_NULL(&vat);
3306         vat.va_type = VDIR;
3307         vat.va_mode = 0755;
3308         nchain = NULL;
3309         nip = hammer2_inode_create(trans, hmp->sroot, &vat, proc0.p_ucred,
3310                                    pfs->name, name_len, &nchain, &error);
3311
3312         if (nip) {
3313                 ipdata = hammer2_chain_modify_ip(trans, nip, &nchain, 0);
3314                 ipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
3315                 kern_uuidgen(&ipdata->pfs_fsid, 1);
3316                 if (ochain->flags & HAMMER2_CHAIN_PFSROOT)
3317                         ipdata->pfs_clid = opfs_clid;
3318                 else
3319                         kern_uuidgen(&ipdata->pfs_clid, 1);
3320                 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_PFSROOT);
3321                 ipdata->u.blockset = ochain->data->ipdata.u.blockset;
3322
3323                 hammer2_inode_unlock_ex(nip, nchain);
3324         }
3325         return (error);
3326 }
3327
3328 /*
3329  * Create an indirect block that covers one or more of the elements in the
3330  * current parent.  Either returns the existing parent with no locking or
3331  * ref changes or returns the new indirect block locked and referenced
3332  * and leaving the original parent lock/ref intact as well.
3333  *
3334  * If an error occurs, NULL is returned and *errorp is set to the error.
3335  *
3336  * The returned chain depends on where the specified key falls.
3337  *
 * The key/keybits for the indirect mode only needs to follow four rules:
3339  *
3340  * (1) That all elements underneath it fit within its key space and
3341  *
3342  * (2) That all elements outside it are outside its key space.
3343  *
3344  * (3) When creating the new indirect block any elements in the current
3345  *     parent that fit within the new indirect block's keyspace must be
3346  *     moved into the new indirect block.
3347  *
3348  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
 *     keyspace than the current parent, but lookup/iteration rules will
3350  *     ensure (and must ensure) that rule (2) for all parents leading up
3351  *     to the nearest inode or the root volume header is adhered to.  This
3352  *     is accomplished by always recursing through matching keyspaces in
3353  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
3354  *
3355  * The current implementation calculates the current worst-case keyspace by
3356  * iterating the current parent and then divides it into two halves, choosing
3357  * whichever half has the most elements (not necessarily the half containing
3358  * the requested key).
3359  *
3360  * We can also opt to use the half with the least number of elements.  This
3361  * causes lower-numbered keys (aka logical file offsets) to recurse through
3362  * fewer indirect blocks and higher-numbered keys to recurse through more.
3363  * This also has the risk of not moving enough elements to the new indirect
3364  * block and being forced to create several indirect blocks before the element
3365  * can be inserted.
3366  *
3367  * Must be called with an exclusively locked parent.
3368  */
3369 static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent,
3370                                 hammer2_key_t *keyp, int keybits,
3371                                 hammer2_blockref_t *base, int count);
3372 static int hammer2_chain_indkey_normal(hammer2_chain_t *parent,
3373                                 hammer2_key_t *keyp, int keybits,
3374                                 hammer2_blockref_t *base, int count);
3375 static
3376 hammer2_chain_t *
3377 hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
3378                               hammer2_key_t create_key, int create_bits,
3379                               int for_type, int *errorp)
3380 {
3381         hammer2_mount_t *hmp;
3382         hammer2_chain_core_t *above;
3383         hammer2_chain_core_t *icore;
3384         hammer2_blockref_t *base;
3385         hammer2_blockref_t *bref;
3386         hammer2_blockref_t bcopy;
3387         hammer2_chain_t *chain;
3388         hammer2_chain_t *ichain;
3389         hammer2_chain_t dummy;
3390         hammer2_key_t key = create_key;
3391         hammer2_key_t key_beg;
3392         hammer2_key_t key_end;
3393         hammer2_key_t key_next;
3394         int keybits = create_bits;
3395         int count;
3396         int nbytes;
3397         int cache_index;
3398         int loops;
3399         int reason;
3400         int generation;
3401         int maxloops = 300000;
3402         int retry_same;
3403         int wasdup;
3404         hammer2_chain_t * volatile xxchain = NULL;
3405
3406         /*
3407          * Calculate the base blockref pointer or NULL if the chain
3408          * is known to be empty.  We need to calculate the array count
3409          * for RB lookups either way.
3410          */
3411         hmp = parent->hmp;
3412         *errorp = 0;
3413         KKASSERT(ccms_thread_lock_owned(&parent->core->cst));
3414         above = parent->core;
3415
3416         /*hammer2_chain_modify(trans, &parent, HAMMER2_MODIFY_OPTDATA);*/
3417         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
3418                 base = NULL;
3419
3420                 switch(parent->bref.type) {
3421                 case HAMMER2_BREF_TYPE_INODE:
3422                         count = HAMMER2_SET_COUNT;
3423                         break;
3424                 case HAMMER2_BREF_TYPE_INDIRECT:
3425                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3426                         count = parent->bytes / sizeof(hammer2_blockref_t);
3427                         break;
3428                 case HAMMER2_BREF_TYPE_VOLUME:
3429                         count = HAMMER2_SET_COUNT;
3430                         break;
3431                 case HAMMER2_BREF_TYPE_FREEMAP:
3432                         count = HAMMER2_SET_COUNT;
3433                         break;
3434                 default:
3435                         panic("hammer2_chain_create_indirect: "
3436                               "unrecognized blockref type: %d",
3437                               parent->bref.type);
3438                         count = 0;
3439                         break;
3440                 }
3441         } else {
3442                 switch(parent->bref.type) {
3443                 case HAMMER2_BREF_TYPE_INODE:
3444                         base = &parent->data->ipdata.u.blockset.blockref[0];
3445                         count = HAMMER2_SET_COUNT;
3446                         break;
3447                 case HAMMER2_BREF_TYPE_INDIRECT:
3448                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3449                         base = &parent->data->npdata[0];
3450                         count = parent->bytes / sizeof(hammer2_blockref_t);
3451                         break;
3452                 case HAMMER2_BREF_TYPE_VOLUME:
3453                         base = &hmp->voldata.sroot_blockset.blockref[0];
3454                         count = HAMMER2_SET_COUNT;
3455                         break;
3456                 case HAMMER2_BREF_TYPE_FREEMAP:
3457                         base = &hmp->voldata.freemap_blockset.blockref[0];
3458                         count = HAMMER2_SET_COUNT;
3459                         break;
3460                 default:
3461                         panic("hammer2_chain_create_indirect: "
3462                               "unrecognized blockref type: %d",
3463                               parent->bref.type);
3464                         count = 0;
3465                         break;
3466                 }
3467         }
3468
3469         /*
3470          * dummy used in later chain allocation (no longer used for lookups).
3471          */
3472         bzero(&dummy, sizeof(dummy));
3473         dummy.delete_tid = HAMMER2_MAX_TID;
3474
3475         /*
3476          * When creating an indirect block for a freemap node or leaf
3477          * the key/keybits must be fitted to static radix levels because
3478          * particular radix levels use particular reserved blocks in the
3479          * related zone.
3480          *
3481          * This routine calculates the key/radix of the indirect block
3482          * we need to create, and whether it is on the high-side or the
3483          * low-side.
3484          */
3485         if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
3486             for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
3487                 keybits = hammer2_chain_indkey_freemap(parent, &key, keybits,
3488                                                        base, count);
3489         } else {
3490                 keybits = hammer2_chain_indkey_normal(parent, &key, keybits,
3491                                                       base, count);
3492         }
3493
3494         /*
3495          * Normalize the key for the radix being represented, keeping the
3496          * high bits and throwing away the low bits.
3497          */
3498         key &= ~(((hammer2_key_t)1 << keybits) - 1);
3499
3500         /*
3501          * How big should our new indirect block be?  It has to be at least
3502          * as large as its parent.
3503          */
3504         if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
3505                 nbytes = HAMMER2_IND_BYTES_MIN;
3506         else
3507                 nbytes = HAMMER2_IND_BYTES_MAX;
3508         if (nbytes < count * sizeof(hammer2_blockref_t))
3509                 nbytes = count * sizeof(hammer2_blockref_t);
3510
3511         /*
3512          * Ok, create our new indirect block
3513          */
3514         if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
3515             for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
3516                 dummy.bref.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
3517         } else {
3518                 dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
3519         }
3520         dummy.bref.key = key;
3521         dummy.bref.keybits = keybits;
3522         dummy.bref.data_off = hammer2_getradix(nbytes);
3523         dummy.bref.methods = parent->bref.methods;
3524
3525         ichain = hammer2_chain_alloc(hmp, parent->pmp, trans, &dummy.bref);
3526         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
3527         hammer2_chain_core_alloc(trans, ichain, NULL);
3528         icore = ichain->core;
3529         hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
3530         hammer2_chain_drop(ichain);     /* excess ref from alloc */
3531
3532         /*
3533          * We have to mark it modified to allocate its block, but use
3534          * OPTDATA to allow it to remain in the INITIAL state.  Otherwise
3535          * it won't be acted upon by the flush code.
3536          *
3537          * XXX leave the node unmodified, depend on the update_hi
3538          * flush to assign and modify parent blocks.
3539          */
3540         hammer2_chain_modify(trans, &ichain, HAMMER2_MODIFY_OPTDATA);
3541
3542         /*
3543          * Iterate the original parent and move the matching brefs into
3544          * the new indirect block.
3545          *
3546          * XXX handle flushes.
3547          */
3548         key_beg = 0;
3549         key_end = HAMMER2_MAX_KEY;
3550         cache_index = 0;
3551         spin_lock(&above->cst.spin);
3552         loops = 0;
3553         reason = 0;
3554         retry_same = 0;
3555
3556         for (;;) {
3557                 if (++loops > 100000) {
3558                     spin_unlock(&above->cst.spin);
3559                     panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n",
3560                           reason, parent, base, count, key_next);
3561                 }
3562
3563                 /*
3564                  * NOTE: spinlock stays intact, returned chain (if not NULL)
3565                  *       is not referenced or locked which means that we
3566                  *       cannot safely check its flagged / deletion status
3567                  *       until we lock it.
3568                  */
3569                 chain = hammer2_combined_find(parent, base, count,
3570                                               &cache_index, &key_next,
3571                                               key_beg, key_end,
3572                                               &bref);
3573                 generation = above->generation;
3574                 if (bref == NULL)
3575                         break;
3576                 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
3577
3578                 /*
3579                  * Skip keys that are not within the key/radix of the new
3580                  * indirect block.  They stay in the parent.
3581                  */
3582                 if ((~(((hammer2_key_t)1 << keybits) - 1) &
3583                     (key ^ bref->key)) != 0) {
3584                         goto next_key_spinlocked;
3585                 }
3586
3587                 /*
3588                  * Load the new indirect block by acquiring the related
3589                  * chains (potentially from media as it might not be
3590                  * in-memory).  Then move it to the new parent (ichain)
3591                  * via DELETE-DUPLICATE.
3592                  *
3593                  * chain is referenced but not locked.  We must lock the
3594                  * chain to obtain definitive DUPLICATED/DELETED state
3595                  */
3596                 if (chain) {
3597                         /*
3598                          * Use chain already present in the RBTREE
3599                          */
3600                         hammer2_chain_ref(chain);
3601                         wasdup = ((chain->flags &
3602                                    HAMMER2_CHAIN_DUPLICATED) != 0);
3603                         spin_unlock(&above->cst.spin);
3604                         hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER |
3605                                                   HAMMER2_RESOLVE_NOREF);
3606                 } else {
3607                         /*
3608                          * Get chain for blockref element.  _get returns NULL
3609                          * on insertion race.
3610                          */
3611                         bcopy = *bref;
3612                         spin_unlock(&above->cst.spin);
3613                         chain = hammer2_chain_get(parent, &bcopy, generation);
3614                         if (chain == NULL) {
3615                                 reason = 1;
3616                                 spin_lock(&above->cst.spin);
3617                                 continue;
3618                         }
3619                         if (bcmp(&bcopy, bref, sizeof(bcopy))) {
3620                                 reason = 2;
3621                                 hammer2_chain_drop(chain);
3622                                 spin_lock(&above->cst.spin);
3623                                 continue;
3624                         }
3625                         hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER |
3626                                                   HAMMER2_RESOLVE_NOREF);
3627                         wasdup = 0;
3628                 }
3629
3630                 /*
3631                  * This is always live so if the chain has been delete-
3632                  * duplicated we raced someone and we have to retry.
3633                  *
3634                  * NOTE: Lookups can race delete-duplicate because
3635                  *       delete-duplicate does not lock the parent's core
3636                  *       (they just use the spinlock on the core).  We must
3637                  *       check for races by comparing the DUPLICATED flag before
3638                  *       releasing the spinlock with the flag after locking the
3639                  *       chain.
3640                  *
3641                  *       (note reversed logic for this one)
3642                  */
3643                 if (chain->flags & HAMMER2_CHAIN_DELETED) {
3644                         hammer2_chain_unlock(chain);
3645                         if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) &&
3646                             wasdup == 0) {
3647                                 retry_same = 1;
3648                                 xxchain = chain;
3649                         }
3650                         goto next_key;
3651                 }
3652
3653                 /*
3654                  * Shift the chain to the indirect block.
3655                  *
3656                  * WARNING! Can cause held-over chains to require a refactor.
3657                  *          Fortunately we have none (our locked chains are
3658                  *          passed into and modified by the call).
3659                  */
3660                 hammer2_chain_delete(trans, chain, 0);
3661                 hammer2_chain_duplicate(trans, &ichain, &chain, NULL, 0, 1);
3662                 hammer2_chain_unlock(chain);
3663                 KKASSERT(parent->refs > 0);
3664                 chain = NULL;
3665 next_key:
3666                 spin_lock(&above->cst.spin);
3667 next_key_spinlocked:
3668                 if (--maxloops == 0)
3669                         panic("hammer2_chain_create_indirect: maxloops");
3670                 reason = 4;
3671                 if (retry_same == 0) {
3672                         if (key_next == 0 || key_next > key_end)
3673                                 break;
3674                         key_beg = key_next;
3675                 }
3676                 /* loop */
3677         }
3678         spin_unlock(&above->cst.spin);
3679
3680         /*
3681          * Insert the new indirect block into the parent now that we've
3682          * cleared out some entries in the parent.  We calculated a good
3683          * insertion index in the loop above (ichain->index).
3684          *
3685          * We don't have to set MOVED here because we mark ichain modified
3686          * down below (so the normal modified -> flush -> set-moved sequence
3687          * applies).
3688          *
3689          * The insertion shouldn't race as this is a completely new block
3690          * and the parent is locked.
3691          */
3692         KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
3693         hammer2_chain_insert(above, NULL, ichain,
3694                              HAMMER2_CHAIN_INSERT_SPIN |
3695                              HAMMER2_CHAIN_INSERT_LIVE,
3696                              0);
3697
3698         /*
3699          * Mark the new indirect block modified after insertion, which
3700          * will propagate up through parent all the way to the root and
3701          * also allocate the physical block in ichain for our caller,
3702          * and assign ichain->data to a pre-zero'd space (because there
3703          * is not prior data to copy into it).
3704          *
3705          * We have to set update_hi in ichain's flags manually so the
3706          * flusher knows it has to recurse through it to get to all of
3707          * our moved blocks, then call setsubmod() to set the bit
3708          * recursively.
3709          */
3710         /*hammer2_chain_modify(trans, &ichain, HAMMER2_MODIFY_OPTDATA);*/
3711         if (ichain->core->update_hi < trans->sync_tid) {
3712                 spin_lock(&ichain->core->cst.spin);
3713                 if (ichain->core->update_hi < trans->sync_tid)
3714                         ichain->core->update_hi = trans->sync_tid;
3715                 spin_unlock(&ichain->core->cst.spin);
3716         }
3717         hammer2_chain_setsubmod(trans, ichain);
3718
3719         /*
3720          * Figure out what to return.
3721          */
3722         if (~(((hammer2_key_t)1 << keybits) - 1) &
3723                    (create_key ^ key)) {
3724                 /*
3725                  * Key being created is outside the key range,
3726                  * return the original parent.
3727                  */
3728                 hammer2_chain_unlock(ichain);
3729         } else {
3730                 /*
3731                  * Otherwise its in the range, return the new parent.
3732                  * (leave both the new and old parent locked).
3733                  */
3734                 parent = ichain;
3735         }
3736         XXChain = xxchain;
3737
3738         return(parent);
3739 }
3740
3741 /*
3742  * Calculate the keybits and highside/lowside of the freemap node the
3743  * caller is creating.
3744  *
3745  * This routine will specify the next higher-level freemap key/radix
3746  * representing the lowest-ordered set.  By doing so, eventually all
3747  * low-ordered sets will be moved one level down.
3748  *
3749  * We have to be careful here because the freemap reserves a limited
3750  * number of blocks for a limited number of levels.  So we can't just
3751  * push indiscriminately.
3752  */
3753 int
3754 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
3755                              int keybits, hammer2_blockref_t *base, int count)
3756 {
3757         hammer2_chain_core_t *above;
3758         hammer2_chain_t *chain;
3759         hammer2_blockref_t *bref;
3760         hammer2_key_t key;
3761         hammer2_key_t key_beg;
3762         hammer2_key_t key_end;
3763         hammer2_key_t key_next;
3764         int cache_index;
3765         int locount;
3766         int hicount;
3767         int maxloops = 300000;
3768
3769         key = *keyp;
3770         above = parent->core;
3771         locount = 0;
3772         hicount = 0;
3773         keybits = 64;
3774
3775         /*
3776          * Calculate the range of keys in the array being careful to skip
3777          * slots which are overridden with a deletion.
3778          */
3779         key_beg = 0;
3780         key_end = HAMMER2_MAX_KEY;
3781         cache_index = 0;
3782         spin_lock(&above->cst.spin);
3783
3784         for (;;) {
3785                 if (--maxloops == 0) {
3786                         panic("indkey_freemap shit %p %p:%d\n",
3787                               parent, base, count);
3788                 }
3789                 chain = hammer2_combined_find(parent, base, count,
3790                                               &cache_index, &key_next,
3791                                               key_beg, key_end, &bref);
3792
3793                 /*
3794                  * Exhausted search
3795                  */
3796                 if (bref == NULL)
3797                         break;
3798
3799                 /*
3800                  * NOTE: No need to check DUPLICATED here because we do
3801                  *       not release the spinlock.
3802                  */
3803                 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
3804                         if (key_next == 0 || key_next > key_end)
3805                                 break;
3806                         key_beg = key_next;
3807                         continue;
3808                 }
3809
3810                 /*
3811                  * Use the full live (not deleted) element for the scan
3812                  * iteration.  HAMMER2 does not allow partial replacements.
3813                  *
3814                  * XXX should be built into hammer2_combined_find().
3815                  */
3816                 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
3817
3818                 if (keybits > bref->keybits) {
3819                         key = bref->key;
3820                         keybits = bref->keybits;
3821                 } else if (keybits == bref->keybits && bref->key < key) {
3822                         key = bref->key;
3823                 }
3824                 if (key_next == 0)
3825                         break;
3826                 key_beg = key_next;
3827         }
3828         spin_unlock(&above->cst.spin);
3829
3830         /*
3831          * Return the keybits for a higher-level FREEMAP_NODE covering
3832          * this node.
3833          */
3834         switch(keybits) {
3835         case HAMMER2_FREEMAP_LEVEL0_RADIX:
3836                 keybits = HAMMER2_FREEMAP_LEVEL1_RADIX;
3837                 break;
3838         case HAMMER2_FREEMAP_LEVEL1_RADIX:
3839                 keybits = HAMMER2_FREEMAP_LEVEL2_RADIX;
3840                 break;
3841         case HAMMER2_FREEMAP_LEVEL2_RADIX:
3842                 keybits = HAMMER2_FREEMAP_LEVEL3_RADIX;
3843                 break;
3844         case HAMMER2_FREEMAP_LEVEL3_RADIX:
3845                 keybits = HAMMER2_FREEMAP_LEVEL4_RADIX;
3846                 break;
3847         case HAMMER2_FREEMAP_LEVEL4_RADIX:
3848                 panic("hammer2_chain_indkey_freemap: level too high");
3849                 break;
3850         default:
3851                 panic("hammer2_chain_indkey_freemap: bad radix");
3852                 break;
3853         }
3854         *keyp = key;
3855
3856         return (keybits);
3857 }
3858
3859 /*
3860  * Calculate the keybits and highside/lowside of the indirect block the
3861  * caller is creating.
3862  */
3863 static int
3864 hammer2_chain_indkey_normal(hammer2_chain_t *parent, hammer2_key_t *keyp,
3865                             int keybits, hammer2_blockref_t *base, int count)
3866 {
3867         hammer2_chain_core_t *above;
3868         hammer2_blockref_t *bref;
3869         hammer2_chain_t *chain;
3870         hammer2_key_t key_beg;
3871         hammer2_key_t key_end;
3872         hammer2_key_t key_next;
3873         hammer2_key_t key;
3874         int nkeybits;
3875         int locount;
3876         int hicount;
3877         int cache_index;
3878         int maxloops = 300000;
3879
3880         key = *keyp;
3881         above = parent->core;
3882         locount = 0;
3883         hicount = 0;
3884
3885         /*
3886          * Calculate the range of keys in the array being careful to skip
3887          * slots which are overridden with a deletion.  Once the scan
3888          * completes we will cut the key range in half and shift half the
3889          * range into the new indirect block.
3890          */
3891         key_beg = 0;
3892         key_end = HAMMER2_MAX_KEY;
3893         cache_index = 0;
3894         spin_lock(&above->cst.spin);
3895
3896         for (;;) {
3897                 if (--maxloops == 0) {
3898                         panic("indkey_freemap shit %p %p:%d\n",
3899                               parent, base, count);
3900                 }
3901                 chain = hammer2_combined_find(parent, base, count,
3902                                               &cache_index, &key_next,
3903                                               key_beg, key_end, &bref);
3904
3905                 /*
3906                  * Exhausted search
3907                  */
3908                 if (bref == NULL)
3909                         break;
3910
3911                 /*
3912                  * NOTE: No need to check DUPLICATED here because we do
3913                  *       not release the spinlock.
3914                  */
3915                 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
3916                         if (key_next == 0 || key_next > key_end)
3917                                 break;
3918                         key_beg = key_next;
3919                         continue;
3920                 }
3921
3922                 /*
3923                  * Use the full live (not deleted) element for the scan
3924                  * iteration.  HAMMER2 does not allow partial replacements.
3925                  *
3926                  * XXX should be built into hammer2_combined_find().
3927                  */
3928                 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
3929
3930                 /*
3931                  * Expand our calculated key range (key, keybits) to fit
3932                  * the scanned key.  nkeybits represents the full range
3933                  * that we will later cut in half (two halves @ nkeybits - 1).
3934                  */
3935                 nkeybits = keybits;
3936                 if (nkeybits < bref->keybits) {
3937                         if (bref->keybits > 64) {
3938                                 kprintf("bad bref chain %p bref %p\n",
3939                                         chain, bref);
3940                                 Debugger("fubar");
3941                         }
3942                         nkeybits = bref->keybits;
3943                 }
3944                 while (nkeybits < 64 &&
3945                        (~(((hammer2_key_t)1 << nkeybits) - 1) &
3946                         (key ^ bref->key)) != 0) {
3947                         ++nkeybits;
3948                 }
3949
3950                 /*
3951                  * If the new key range is larger we have to determine
3952                  * which side of the new key range the existing keys fall
3953                  * under by checking the high bit, then collapsing the
3954                  * locount into the hicount or vise-versa.
3955                  */
3956                 if (keybits != nkeybits) {
3957                         if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
3958                                 hicount += locount;
3959                                 locount = 0;
3960                         } else {
3961                                 locount += hicount;
3962                                 hicount = 0;
3963                         }
3964                         keybits = nkeybits;
3965                 }
3966
3967                 /*
3968                  * The newly scanned key will be in the lower half or the
3969                  * upper half of the (new) key range.
3970                  */
3971                 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
3972                         ++hicount;
3973                 else
3974                         ++locount;
3975
3976                 if (key_next == 0)
3977                         break;
3978                 key_beg = key_next;
3979         }
3980         spin_unlock(&above->cst.spin);
3981         bref = NULL;    /* now invalid (safety) */
3982
3983         /*
3984          * Adjust keybits to represent half of the full range calculated
3985          * above (radix 63 max)
3986          */
3987         --keybits;
3988
3989         /*
3990          * Select whichever half contains the most elements.  Theoretically
3991          * we can select either side as long as it contains at least one
3992          * element (in order to ensure that a free slot is present to hold
3993          * the indirect block).
3994          */
3995         if (hammer2_indirect_optimize) {
3996                 /*
3997                  * Insert node for least number of keys, this will arrange
3998                  * the first few blocks of a large file or the first few
3999                  * inodes in a directory with fewer indirect blocks when
4000                  * created linearly.
4001                  */
4002                 if (hicount < locount && hicount != 0)
4003                         key |= (hammer2_key_t)1 << keybits;
4004                 else
4005                         key &= ~(hammer2_key_t)1 << keybits;
4006         } else {
4007                 /*
4008                  * Insert node for most number of keys, best for heavily
4009                  * fragmented files.
4010                  */
4011                 if (hicount > locount)
4012                         key |= (hammer2_key_t)1 << keybits;
4013                 else
4014                         key &= ~(hammer2_key_t)1 << keybits;
4015         }
4016         *keyp = key;
4017
4018         return (keybits);
4019 }
4020
4021 /*
4022  * Sets CHAIN_DELETED and CHAIN_MOVED in the chain being deleted and
4023  * set chain->delete_tid.  The chain is not actually marked possibly-free
4024  * in the freemap until the deletion is completely flushed out (because
4025  * a flush which doesn't cover the entire deletion is flushing the deleted
4026  * chain as if it were live).
4027  *
4028  * This function does NOT generate a modification to the parent.  It
4029  * would be nearly impossible to figure out which parent to modify anyway.
4030  * Such modifications are handled top-down by the flush code and are
4031  * properly merged using the flush synchronization point.
4032  *
4033  * The find/get code will properly overload the RBTREE check on top of
4034  * the bref check to detect deleted entries.
4035  *
4036  * This function is NOT recursive.  Any entity already pushed into the
4037  * chain (such as an inode) may still need visibility into its contents,
4038  * as well as the ability to read and modify the contents.  For example,
4039  * for an unlinked file which is still open.
4040  *
4041  * NOTE: This function does NOT set chain->modify_tid, allowing future
4042  *       code to distinguish between live and deleted chains by testing
4043  *       trans->sync_tid vs chain->modify_tid and chain->delete_tid.
4044  *
4045  * NOTE: Deletions normally do not occur in the middle of a duplication
4046  *       chain but we use a trick for hardlink migration that refactors
4047  *       the originating inode without deleting it, so we make no assumptions
4048  *       here.
4049  */
void
hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *chain, int flags)
{
	/* caller must own the chain's core thread lock (flags unused here) */
	KKASSERT(ccms_thread_lock_owned(&chain->core->cst));

	/*
	 * Nothing to do if already marked.
	 */
	if (chain->flags & HAMMER2_CHAIN_DELETED)
		return;

	/*
	 * The setting of DELETED causes finds, lookups, and _next iterations
	 * to no longer recognize the chain.  RB_SCAN()s will still have
	 * visibility (needed for flush serialization points).
	 *
	 * We need the spinlock on the core whos RBTREE contains chain
	 * to protect against races.
	 */
	KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
	spin_lock(&chain->above->cst.spin);

	KKASSERT(trans->sync_tid >= chain->modify_tid);
	chain->delete_tid = trans->sync_tid;
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
	atomic_add_int(&chain->above->live_count, -1);
	++chain->above->generation;	/* invalidates concurrent lookups */

	/*
	 * We must set MOVED along with DELETED for the flush code to
	 * recognize the operation and properly disconnect the chain
	 * in-memory.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		hammer2_chain_ref(chain);	/* ref is held by MOVED flag */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	}
	spin_unlock(&chain->above->cst.spin);

	/* tell the flusher it must recurse through this chain's parents */
	hammer2_chain_setsubmod(trans, chain);
}
4091
4092 /*
4093  * Called with the core spinlock held to check for freeable layers.
4094  * Used by the flush code.  Layers can wind up not being freed due
4095  * to the temporary layer->refs count.  This function frees up any
4096  * layers that were missed.
4097  */
void
hammer2_chain_layer_check_locked(hammer2_mount_t *hmp,
				 hammer2_chain_core_t *core)
{
	hammer2_chain_layer_t *layer;
	hammer2_chain_layer_t *tmp;

	/*
	 * Walk the layer list and free any layer with no refs and an
	 * empty rbtree.  kfree() cannot be called while holding the core
	 * spinlock, so the lock is dropped around it; the next element
	 * (tmp) is pinned with a temporary ref so it cannot itself be
	 * freed by a concurrent caller while the lock is released.
	 */
	tmp = TAILQ_FIRST(&core->layerq);
	while ((layer = tmp) != NULL) {
		tmp = TAILQ_NEXT(tmp, entry);
		if (layer->refs == 0 && RB_EMPTY(&layer->rbtree)) {
			TAILQ_REMOVE(&core->layerq, layer, entry);
			if (tmp)
				++tmp->refs;	/* pin across unlock */
			spin_unlock(&core->cst.spin);
			kfree(layer, hmp->mchain);
			spin_lock(&core->cst.spin);
			if (tmp)
				--tmp->refs;	/* unpin */
		}
	}
}
4120
4121 /*
4122  * Returns the index of the nearest element in the blockref array >= elm.
4123  * Returns (count) if no element could be found.
4124  *
4125  * Sets *key_nextp to the next key for loop purposes but does not modify
4126  * it if the next key would be higher than the current value of *key_nextp.
4127  * Note that *key_nexp can overflow to 0, which should be tested by the
4128  * caller.
4129  *
4130  * (*cache_indexp) is a heuristic and can be any value without effecting
4131  * the result.
4132  *
4133  * The spin lock on the related chain must be held.
4134  */
int
hammer2_base_find(hammer2_chain_t *chain,
		  hammer2_blockref_t *base, int count,
		  int *cache_indexp, hammer2_key_t *key_nextp,
		  hammer2_key_t key_beg, hammer2_key_t key_end)
{
	hammer2_chain_core_t *core = chain->core;
	hammer2_blockref_t *scan;
	hammer2_key_t scan_end;
	int i;
	int limit;

	/*
	 * Require the live chain's already have their core's counted
	 * so we can optimize operations.
	 */
	KKASSERT((chain->flags & HAMMER2_CHAIN_DUPLICATED) ||
		 core->flags & HAMMER2_CORE_COUNTEDBREFS);

	/*
	 * Degenerate case
	 */
	if (count == 0 || base == NULL)
		return(count);

	/*
	 * Sequential optimization using *cache_indexp.  This is the most
	 * likely scenario.
	 *
	 * We can avoid trailing empty entries on live chains, otherwise
	 * we might have to check the whole block array.
	 */
	i = *cache_indexp;
	cpu_ccfence();
	if (chain->flags & HAMMER2_CHAIN_DUPLICATED)
		limit = count;
	else
		limit = core->live_zero;	/* first all-zero tail slot */
	if (i >= limit)
		i = limit - 1;
	if (i < 0)
		i = 0;
	KKASSERT(i < count);

	/*
	 * Search backwards from the cached index until the element at i
	 * is live and does not start beyond key_beg.
	 */
	scan = &base[i];
	while (i > 0 && (scan->type == 0 || scan->key > key_beg)) {
		--scan;
		--i;
	}
	*cache_indexp = i;

	/*
	 * Search forwards, stop when we find a scan element which
	 * encloses the key or until we know that there are no further
	 * elements.
	 */
	while (i < count) {
		if (scan->type != 0) {
			/* element starting at or past key_beg matches */
			if (scan->key > key_beg)
				break;
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits) - 1;
			/* element whose key range encloses key_beg matches */
			if (scan_end >= key_beg)
				break;
		}
		if (i >= limit)
			return (count);
		++scan;
		++i;
	}
	if (i != count) {
		*cache_indexp = i;
		if (i >= limit) {
			i = count;	/* nothing live past the limit */
		} else {
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits);
			/*
			 * Narrow *key_nextp to the end of the matched
			 * element.  scan_end == 0 means the key space
			 * overflowed; the caller tests for that.
			 */
			if (scan_end && (*key_nextp > scan_end ||
					 *key_nextp == 0)) {
				*key_nextp = scan_end;
			}
		}
	}
	return (i);
}
4223
4224 /*
4225  * Do a combined search and return the next match either from the blockref
4226  * array or from the in-memory chain.  Sets *bresp to the returned bref in
4227  * both cases, or sets it to NULL if the search exhausted.  Only returns
4228  * a non-NULL chain if the search matched from the in-memory chain.
4229  *
4230  * Must be called with above's spinlock held.  Spinlock remains held
4231  * through the operation.
4232  *
4233  * The returned chain is not locked or referenced.  Use the returned bref
4234  * to determine if the search exhausted or not.
4235  */
4236 static hammer2_chain_t *
4237 hammer2_combined_find(hammer2_chain_t *parent,
4238                       hammer2_blockref_t *base, int count,
4239                       int *cache_indexp, hammer2_key_t *key_nextp,
4240                       hammer2_key_t key_beg, hammer2_key_t key_end,
4241                       hammer2_blockref_t **bresp)
4242 {
4243         hammer2_blockref_t *bref;
4244         hammer2_chain_t *chain;
4245         int i;
4246
4247         *key_nextp = key_end + 1;
4248         i = hammer2_base_find(parent, base, count, cache_indexp,
4249                               key_nextp, key_beg, key_end);
4250         chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end);
4251
4252         /*
4253          * Neither matched
4254          */
4255         if (i == count && chain == NULL) {
4256                 *bresp = NULL;
4257                 return(chain);  /* NULL */
4258         }
4259
4260         /*
4261          * Only chain matched
4262          */
4263         if (i == count) {
4264                 bref = &chain->bref;
4265                 goto found;
4266         }
4267
4268         /*
4269          * Only blockref matched.
4270          */
4271         if (chain == NULL) {
4272                 bref = &base[i];
4273                 goto found;
4274         }
4275
4276         /*
4277          * Both in-memory and blockref match.
4278          *
4279          * If they are both flush with the left hand side select the chain.
4280          * If their starts match select the chain.
4281          * Otherwise the nearer element wins.
4282          */
4283         if (chain->bref.key <= key_beg && base[i].key <= key_beg) {
4284                 bref = &chain->bref;
4285                 goto found;
4286         }
4287         if (chain->bref.key <= base[i].key) {
4288                 bref = &chain->bref;
4289                 goto found;
4290                 return(chain);
4291         }
4292         bref = &base[i];
4293         chain = NULL;
4294
4295         /*
4296          * If the bref is out of bounds we've exhausted our search.
4297          */
4298 found:
4299         if (bref->key > key_end) {
4300                 *bresp = NULL;
4301                 chain = NULL;
4302         } else {
4303                 *bresp = bref;
4304         }
4305         return(chain);
4306 }
4307
4308 /*
4309  * Locate the specified block array element and delete it.  The element
4310  * must exist.
4311  *
4312  * The spin lock on the related chain must be held.
4313  *
4314  * NOTE: live_count was adjusted when the chain was deleted, so it does not
4315  *       need to be adjusted when we commit the media change.
4316  */
void
hammer2_base_delete(hammer2_trans_t *trans, hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    int *cache_indexp, hammer2_chain_t *child)
{
	hammer2_blockref_t *elm = &child->bref;
	hammer2_chain_core_t *core = parent->core;
	hammer2_key_t key_next;
	int i;

	/*
	 * Delete element.  Expect the element to exist.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, cache_indexp,
			      &key_next, elm->key, elm->key);
	if (i == count || base[i].type == 0 ||
	    base[i].key != elm->key || base[i].keybits != elm->keybits) {
		panic("delete base %p element not found at %d/%d elm %p\n",
		      base, i, count, elm);
		return;
	}
	bzero(&base[i], sizeof(*base));
	/*
	 * Leave debugging breadcrumbs in the now-zeroed slot so a later
	 * crash dump can identify which parent/child/transaction deleted
	 * the element.
	 */
	base[i].mirror_tid = (intptr_t)parent;		/* debug */
	base[i].modify_tid = (intptr_t)child;		/* debug */
	base[i].check.debug.sync_tid = trans->sync_tid;	/* debug */

	/*
	 * We can only optimize core->live_zero for live chains.
	 */
	if ((parent->flags & HAMMER2_CHAIN_DUPLICATED) == 0) {
		if (core->live_zero == i + 1) {
			/* pull live_zero back over any trailing empty slots */
			while (--i >= 0 && base[i].type == 0)
				;
			core->live_zero = i + 1;
		}
	}
}
4358
4359 /*
4360  * Insert the specified element.  The block array must not already have the
4361  * element and must have space available for the insertion.
4362  *
4363  * The spin lock on the related chain must be held.
4364  *
4365  * NOTE: live_count was adjusted when the chain was deleted, so it does not
4366  *       need to be adjusted when we commit the media change.
4367  */
void
hammer2_base_insert(hammer2_trans_t *trans __unused, hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    int *cache_indexp, hammer2_chain_t *child)
{
	hammer2_blockref_t *elm = &child->bref;
	hammer2_chain_core_t *core = parent->core;
	hammer2_key_t key_next;
	hammer2_key_t xkey;
	int i;
	int j;
	int k;
	int l;
	int u = 1;	/* debug tag reported by the validation panic below */

	/*
	 * Insert new element.  Expect the element to not already exist
	 * unless we are replacing it.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, cache_indexp,
			      &key_next, elm->key, elm->key);

	/*
	 * Shortcut fill optimization, typical ordered insertion(s) may not
	 * require a search.
	 */
	KKASSERT(i >= 0 && i <= count);

	/*
	 * We can only optimize core->live_zero for live chains.
	 */
	if (i == count && core->live_zero < count) {
		if ((parent->flags & HAMMER2_CHAIN_DUPLICATED) == 0) {
			/*
			 * Append past the last live element; no shifting
			 * of the array is required.
			 */
			i = core->live_zero++;
			base[i] = *elm;
			return;
		}
	}

	/*
	 * xkey is the inclusive end of the new element's key range.  Panic
	 * if the element at the insertion point would overlap the new range
	 * (or if the search returned an out-of-order slot).
	 */
	xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1;
	if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) {
		panic("insert base %p overlapping elements at %d elm %p\n",
		      base, i, elm);
	}

	/*
	 * Try to find an empty slot before or after.
	 */
	j = i;
	k = i;
	while (j > 0 || k < count) {
		--j;
		if (j >= 0 && base[j].type == 0) {
			/*
			 * Empty slot found below the insertion point.  Shift
			 * the intervening elements down one and place the
			 * new element just below index i.
			 */
			if (j == i - 1) {
				base[j] = *elm;
			} else {
				bcopy(&base[j+1], &base[j],
				      (i - j - 1) * sizeof(*base));
				base[i - 1] = *elm;
			}
			goto validate;
		}
		++k;
		if (k < count && base[k].type == 0) {
			/*
			 * Empty slot found above the insertion point.  Shift
			 * the intervening elements up one and drop the new
			 * element into index i.
			 */
			bcopy(&base[i], &base[i+1],
			      (k - i) * sizeof(hammer2_blockref_t));
			base[i] = *elm;

			/*
			 * We can only update core->live_zero for live
			 * chains.
			 */
			if ((parent->flags & HAMMER2_CHAIN_DUPLICATED) == 0) {
				if (core->live_zero <= k)
					core->live_zero = k + 1;
			}
			u = 2;	/* debug: took the shift-up path */
			goto validate;
		}
	}
	panic("hammer2_base_insert: no room!");

	/*
	 * Debugging: verify the live elements remain in strictly
	 * ascending, non-overlapping key order after the insertion.
	 */
validate:
	key_next = 0;
	for (l = 0; l < count; ++l) {
		if (base[l].type) {
			/* key_next = inclusive end of first live element */
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;
			break;
		}
	}
	while (++l < count) {
		if (base[l].type) {
			if (base[l].key <= key_next)
				panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l);
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;

		}
	}

}
4477
4478 #if 0
4479
/*
 * Sort the chain's blockref[] array by key; used by the flush code.
 *
 * The chain must be exclusively locked AND spin-locked.
 */
4486 typedef hammer2_blockref_t *hammer2_blockref_p;
4487
4488 static
4489 int
4490 hammer2_base_sort_callback(const void *v1, const void *v2)
4491 {
4492         hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1;
4493         hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2;
4494
4495         /*
4496          * Make sure empty elements are placed at the end of the array
4497          */
4498         if (bref1->type == 0) {
4499                 if (bref2->type == 0)
4500                         return(0);
4501                 return(1);
4502         } else if (bref2->type == 0) {
4503                 return(-1);
4504         }
4505
4506         /*
4507          * Sort by key
4508          */
4509         if (bref1->key < bref2->key)
4510                 return(-1);
4511         if (bref1->key > bref2->key)
4512                 return(1);
4513         return(0);
4514 }
4515
4516 void
4517 hammer2_base_sort(hammer2_chain_t *chain)
4518 {
4519         hammer2_blockref_t *base;
4520         int count;
4521
4522         switch(chain->bref.type) {
4523         case HAMMER2_BREF_TYPE_INODE:
4524                 /*
4525                  * Special shortcut for embedded data returns the inode
4526                  * itself.  Callers must detect this condition and access
4527                  * the embedded data (the strategy code does this for us).
4528                  *
4529                  * This is only applicable to regular files and softlinks.
4530                  */
4531                 if (chain->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA)
4532                         return;
4533                 base = &chain->data->ipdata.u.blockset.blockref[0];
4534                 count = HAMMER2_SET_COUNT;
4535                 break;
4536         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
4537         case HAMMER2_BREF_TYPE_INDIRECT:
4538                 /*
4539                  * Optimize indirect blocks in the INITIAL state to avoid
4540                  * I/O.
4541                  */
4542                 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0);
4543                 base = &chain->data->npdata[0];
4544                 count = chain->bytes / sizeof(hammer2_blockref_t);
4545                 break;
4546         case HAMMER2_BREF_TYPE_VOLUME:
4547                 base = &chain->hmp->voldata.sroot_blockset.blockref[0];
4548                 count = HAMMER2_SET_COUNT;
4549                 break;
4550         case HAMMER2_BREF_TYPE_FREEMAP:
4551                 base = &chain->hmp->voldata.freemap_blockset.blockref[0];
4552                 count = HAMMER2_SET_COUNT;
4553                 break;
4554         default:
4555                 panic("hammer2_chain_lookup: unrecognized blockref type: %d",
4556                       chain->bref.type);
4557                 base = NULL;    /* safety */
4558                 count = 0;      /* safety */
4559         }
4560         kqsort(base, count, sizeof(*base), hammer2_base_sort_callback);
4561 }
4562
4563 #endif
4564
4565 /*
4566  * Chain memory management
4567  */
void
hammer2_chain_wait(hammer2_chain_t *chain)
{
	/*
	 * Sleep on the chain's address for at most one tick ("chnflw").
	 * NOTE(review): the 1-tick timeout bounds the wait, so this acts
	 * as a short backoff even if no wakeup() ever fires on the chain.
	 */
	tsleep(chain, 0, "chnflw", 1);
}
4573
4574 /*
4575  * Manage excessive memory resource use for chain and related
4576  * structures.
4577  */
void
hammer2_chain_memory_wait(hammer2_pfsmount_t *pmp)
{
	long waiting;
	long count;
	long limit;
	static int zzticks;	/* rate-limits the debug kprintf below */

	/*
	 * Atomic check condition and wait.  Also do an early speedup of
	 * the syncer to try to avoid hitting the wait.
	 */
	for (;;) {
		waiting = pmp->inmem_dirty_chains;
		cpu_ccfence();
		/* dirty-chain count, excluding the WAITING flag bit(s) */
		count = waiting & HAMMER2_DIRTYCHAIN_MASK;

		/*
		 * The dirty-chain limit scales with the number of vnodes
		 * on the mount, floored by the configured limit and an
		 * absolute minimum of 1000.
		 */
		limit = pmp->mp->mnt_nvnodelistsize / 10;
		if (limit < hammer2_limit_dirty_chains)
			limit = hammer2_limit_dirty_chains;
		if (limit < 1000)
			limit = 1000;

		/*
		 * NOTE(review): unconditional debug output, throttled to
		 * roughly once per second via zzticks; consider removing
		 * or gating behind a debug sysctl before release.
		 */
		if ((int)(ticks - zzticks) > hz) {
			zzticks = ticks;
			kprintf("count %ld %ld\n", count, limit);
		}

		/*
		 * Block if there are too many dirty chains present, wait
		 * for the flush to clean some out.
		 */
		if (count > limit) {
			/*
			 * Interlocked sleep: arm the WAITING flag with an
			 * atomic cmpset so hammer2_chain_memory_wakeup()
			 * knows to wake us.  If the cmpset loses a race
			 * the state is re-sampled and we retry.
			 */
			tsleep_interlock(&pmp->inmem_dirty_chains, 0);
			if (atomic_cmpset_long(&pmp->inmem_dirty_chains,
					       waiting,
				       waiting | HAMMER2_DIRTYCHAIN_WAITING)) {
				speedup_syncer(pmp->mp);
				tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED,
				       "chnmem", hz);
			}
			continue;	/* loop on success or fail */
		}

		/*
		 * Try to start an early flush before we are forced to block.
		 */
		if (count > limit * 7 / 10)
			speedup_syncer(pmp->mp);
		break;
	}
}
4630
4631 void
4632 hammer2_chain_memory_inc(hammer2_pfsmount_t *pmp)
4633 {
4634         if (pmp)
4635                 atomic_add_long(&pmp->inmem_dirty_chains, 1);
4636 }
4637
4638 void
4639 hammer2_chain_memory_wakeup(hammer2_pfsmount_t *pmp)
4640 {
4641         long waiting;
4642
4643         if (pmp == NULL)
4644                 return;
4645
4646         for (;;) {
4647                 waiting = pmp->inmem_dirty_chains;
4648                 cpu_ccfence();
4649                 if (atomic_cmpset_long(&pmp->inmem_dirty_chains,
4650                                        waiting,
4651                                        (waiting - 1) &
4652                                         ~HAMMER2_DIRTYCHAIN_WAITING)) {
4653                         break;
4654                 }
4655         }
4656         if (waiting & HAMMER2_DIRTYCHAIN_WAITING)
4657                 wakeup(&pmp->inmem_dirty_chains);
4658 }
4659
4660 static
4661 void
4662 adjreadcounter(hammer2_blockref_t *bref, size_t bytes)
4663 {
4664         long *counterp;
4665
4666         switch(bref->type) {
4667         case HAMMER2_BREF_TYPE_DATA:
4668                 counterp = &hammer2_iod_file_read;
4669                 break;
4670         case HAMMER2_BREF_TYPE_INODE:
4671                 counterp = &hammer2_iod_meta_read;
4672                 break;
4673         case HAMMER2_BREF_TYPE_INDIRECT:
4674                 counterp = &hammer2_iod_indr_read;
4675                 break;
4676         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
4677         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
4678                 counterp = &hammer2_iod_fmap_read;
4679                 break;
4680         default:
4681                 counterp = &hammer2_iod_volu_read;
4682                 break;
4683         }
4684         *counterp += bytes;
4685 }