sys/vfs/hammer/hammer_cursor.c

   1 /*
   2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Matthew Dillon <dillon@backplane.com>
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  * 3. Neither the name of The DragonFly Project nor the names of its
  18  *    contributors may be used to endorse or promote products derived
  19  *    from this software without specific, prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  */
  34
  35 /*
  36  * HAMMER B-Tree index - cursor support routines
  37  */
  38 #include "hammer.h"
  39
  40 static int hammer_load_cursor_parent(hammer_cursor_t cursor, int try_exclusive);
  41
  42 /*
  43  * Initialize a fresh cursor using the B-Tree node cache.  If the cache
  44  * is not available initialize a fresh cursor at the root of the filesystem.
  45  */
  46 int
  47 hammer_init_cursor(hammer_transaction_t trans, hammer_cursor_t cursor,
  48                    hammer_node_cache_t cache, hammer_inode_t ip)
  49 {
  50         hammer_volume_t volume;
  51         hammer_node_t node;
  52         hammer_mount_t hmp;
  53         u_int tticks;
  54         int error;
  55
  56         bzero(cursor, sizeof(*cursor));
  57
  58         cursor->trans = trans;
  59         hmp = trans->hmp;
  60
  61         /*
  62          * As the number of inodes queued to the flusher increases we use
  63          * time-domain multiplexing to control read vs flush performance.
  64          * We have to do it here, before acquiring any ip or node locks,
  65          * to avoid deadlocking or excessively delaying the flusher.
  66          *
  67          * The full time period is hammer_tdmux_ticks, typically 1/5 of
  68          * a second.
  69          *
  70          * inode allocation begins to get restrained at 2/4 the limit
  71          * via the "hmrrcm" mechanism in hammer_inode.  We want to begin
  72          * limiting read activity before that to try to avoid processes
  73          * stalling out in "hmrrcm".
  74          */
  75         tticks = hammer_tdmux_ticks;
  76         if (trans->type != HAMMER_TRANS_FLS && tticks &&
  77             hmp->count_reclaims > hammer_limit_reclaims / tticks &&
  78             hmp->count_reclaims > hammer_autoflush * 2 &&
  79             hammer_flusher_running(hmp)) {
  80                 u_int rticks;
  81                 u_int xticks;
  82                 u_int dummy;
  83
  84                 /*
  85                  * 0 ... xticks ... tticks
  86                  *
  87                  * rticks is the calculated position, xticks is the demarc
  88                  * where values below xticks are reserved for the flusher
  89                  * and values >= to xticks may be used by the frontend.
  90                  *
  91                  * At least one tick is always made available for the
  92                  * frontend.
  93                  */
  94                 rticks = (u_int)ticks % tticks;
  95                 xticks = hmp->count_reclaims * tticks / hammer_limit_reclaims;
  96
  97                 /*
  98                  * Ensure rticks and xticks are stable
  99                  */
 100                 cpu_ccfence();
 101                 if (rticks < xticks) {
 102                         if (hammer_debug_general & 0x0004)
 103                                 hdkprintf("rt %3u, xt %3u, tt %3u\n",
 104                                         rticks, xticks, tticks);
 105                         tsleep(&dummy, 0, "htdmux", xticks - rticks);
 106                 }
 107         }
 108
 109         /*
 110          * If the cursor operation is on behalf of an inode, lock
 111          * the inode.
 112          *
 113          * When acquiring a shared lock on an inode on which the backend
 114          * flusher deadlocked, wait up to hammer_tdmux_ticks (1 second)
 115          * for the deadlock to clear.
 116          */
 117         if ((cursor->ip = ip) != NULL) {
 118                 ++ip->cursor_ip_refs;
 119                 if (trans->type == HAMMER_TRANS_FLS) {
 120                         hammer_lock_ex(&ip->lock);
 121                 } else {
 122 #if 0
 123                         if (ip->cursor_exclreq_count) {
 124                                 tsleep(&ip->cursor_exclreq_count, 0,
 125                                        "hstag1", hammer_tdmux_ticks);
 126                         }
 127 #endif
 128                         hammer_lock_sh(&ip->lock);
 129                 }
 130         }
 131
 132         /*
 133          * Step 1 - acquire a locked node from the cache if possible
 134          */
 135         if (cache && cache->node) {
 136                 node = hammer_ref_node_safe(trans, cache, &error);
 137                 if (error == 0) {
 138                         hammer_lock_sh(&node->lock);
 139                         if (node->flags & HAMMER_NODE_DELETED) {
 140                                 hammer_unlock(&node->lock);
 141                                 hammer_rel_node(node);
 142                                 node = NULL;
 143                         }
 144                 }
 145                 if (node == NULL)
 146                         ++hammer_stats_btree_root_iterations;
 147         } else {
 148                 node = NULL;
 149                 ++hammer_stats_btree_root_iterations;
 150         }
 151
 152         /*
 153          * Step 2 - If we couldn't get a node from the cache, get
 154          * the one from the root of the filesystem.
 155          */
 156         while (node == NULL) {
 157                 volume = hammer_get_root_volume(hmp, &error);
 158                 if (error)
 159                         break;
 160                 node = hammer_get_node(trans, volume->ondisk->vol0_btree_root,
 161                                        0, &error);
 162                 hammer_rel_volume(volume, 0);
 163                 if (error)
 164                         break;
 165                 /*
 166                  * When the frontend acquires the root b-tree node while the
 167                  * backend is deadlocked on it, wait up to hammer_tdmux_ticks
 168                  * (1 second) for the deadlock to clear.
 169                  */
 170 #if 0
 171                 if (node->cursor_exclreq_count &&
 172                     cursor->trans->type != HAMMER_TRANS_FLS) {
 173                         tsleep(&node->cursor_exclreq_count, 0,
 174                                "hstag3", hammer_tdmux_ticks);
 175                 }
 176 #endif
 177                 hammer_lock_sh(&node->lock);
 178
 179                 /*
 180                  * If someone got in before we could lock the node, retry.
 181                  */
 182                 if (node->flags & HAMMER_NODE_DELETED) {
 183                         hammer_unlock(&node->lock);
 184                         hammer_rel_node(node);
 185                         node = NULL;
 186                         continue;
 187                 }
 188                 if (volume->ondisk->vol0_btree_root != node->node_offset) {
 189                         hammer_unlock(&node->lock);
 190                         hammer_rel_node(node);
 191                         node = NULL;
 192                         continue;
 193                 }
 194         }
 195
 196         /*
 197          * Step 3 - finish initializing the cursor by acquiring the parent
 198          */
 199         cursor->node = node;
 200         if (error == 0)
 201                 error = hammer_load_cursor_parent(cursor, 0);
 202         KKASSERT(error == 0);
 203         /* if (error) hammer_done_cursor(cursor); */
 204         return(error);
 205 }
 206
 207 /*
 208  * Normalize a cursor.  Sometimes cursors can be left in a state
 209  * where node is NULL.  If the cursor is in this state, cursor up.
 210  */
 211 void
 212 hammer_normalize_cursor(hammer_cursor_t cursor)
 213 {
 214         if (cursor->node == NULL) {
 215                 KKASSERT(cursor->parent != NULL);
 216                 hammer_cursor_up(cursor);
 217         }
 218 }
 219
 220
 221 /*
 222  * We are finished with a cursor.  We NULL out various fields as sanity
 223  * check, in case the structure is inappropriately used afterwords.
 224  */
 225 void
 226 hammer_done_cursor(hammer_cursor_t cursor)
 227 {
 228         hammer_inode_t ip;
 229
 230         KKASSERT((cursor->flags & HAMMER_CURSOR_TRACKED) == 0);
 231         if (cursor->parent) {
 232                 hammer_unlock(&cursor->parent->lock);
 233                 hammer_rel_node(cursor->parent);
 234                 cursor->parent = NULL;
 235         }
 236         if (cursor->node) {
 237                 hammer_unlock(&cursor->node->lock);
 238                 hammer_rel_node(cursor->node);
 239                 cursor->node = NULL;
 240         }
 241         if (cursor->data_buffer) {
 242                 hammer_rel_buffer(cursor->data_buffer, 0);
 243                 cursor->data_buffer = NULL;
 244         }
 245         if ((ip = cursor->ip) != NULL) {
 246                 KKASSERT(ip->cursor_ip_refs > 0);
 247                 --ip->cursor_ip_refs;
 248                 hammer_unlock(&ip->lock);
 249                 cursor->ip = NULL;
 250         }
 251         if (cursor->iprec) {
 252                 hammer_rel_mem_record(cursor->iprec);
 253                 cursor->iprec = NULL;
 254         }
 255
 256         /*
 257          * If we deadlocked this node will be referenced.  Do a quick
 258          * lock/unlock to wait for the deadlock condition to clear.
 259          *
 260          * Maintain exclreq_count / wakeup as necessary to notify new
 261          * entrants into ip.  We continue to hold the fs_token so our
 262          * EDEADLK retry loop should get its chance before another thread
 263          * steals the lock.
 264          */
 265         if (cursor->deadlk_node) {
 266 #if 0
 267                 if (ip && cursor->trans->type == HAMMER_TRANS_FLS)
 268                         ++ip->cursor_exclreq_count;
 269                 ++cursor->deadlk_node->cursor_exclreq_count;
 270 #endif
 271                 hammer_lock_ex_ident(&cursor->deadlk_node->lock, "hmrdlk");
 272                 hammer_unlock(&cursor->deadlk_node->lock);
 273 #if 0
 274                 if (--cursor->deadlk_node->cursor_exclreq_count == 0)
 275                         wakeup(&cursor->deadlk_node->cursor_exclreq_count);
 276                 if (ip && cursor->trans->type == HAMMER_TRANS_FLS) {
 277                         if (--ip->cursor_exclreq_count == 0)
 278                                 wakeup(&ip->cursor_exclreq_count);
 279                 }
 280 #endif
 281                 hammer_rel_node(cursor->deadlk_node);
 282                 cursor->deadlk_node = NULL;
 283         }
 284         if (cursor->deadlk_rec) {
 285                 hammer_wait_mem_record_ident(cursor->deadlk_rec, "hmmdlr");
 286                 hammer_rel_mem_record(cursor->deadlk_rec);
 287                 cursor->deadlk_rec = NULL;
 288         }
 289
 290         cursor->data = NULL;
 291         cursor->leaf = NULL;
 292         cursor->left_bound = NULL;
 293         cursor->right_bound = NULL;
 294         cursor->trans = NULL;
 295 }
 296
 297 /*
 298  * Upgrade cursor->node and cursor->parent to exclusive locks.  This
 299  * function can return EDEADLK.
 300  *
 301  * The lock must already be either held shared or already held exclusively
 302  * by us.
 303  *
 304  * We upgrade the parent first as it is the most likely to collide first
 305  * with the downward traversal that the frontend typically does.
 306  *
 307  * If we fail to upgrade the lock and cursor->deadlk_node is NULL,
 308  * we add another reference to the node that failed and set
 309  * cursor->deadlk_node so hammer_done_cursor() can block on it.
 310  */
 311 int
 312 hammer_cursor_upgrade(hammer_cursor_t cursor)
 313 {
 314         int error;
 315
 316         if (cursor->parent) {
 317                 error = hammer_lock_upgrade(&cursor->parent->lock, 1);
 318                 if (error && cursor->deadlk_node == NULL) {
 319                         cursor->deadlk_node = cursor->parent;
 320                         hammer_ref_node(cursor->deadlk_node);
 321                 }
 322         } else {
 323                 error = 0;
 324         }
 325         if (error == 0) {
 326                 error = hammer_lock_upgrade(&cursor->node->lock, 1);
 327                 if (error && cursor->deadlk_node == NULL) {
 328                         cursor->deadlk_node = cursor->node;
 329                         hammer_ref_node(cursor->deadlk_node);
 330                 }
 331         }
 332 #if 0
 333         error = hammer_lock_upgrade(&cursor->node->lock, 1);
 334         if (error && cursor->deadlk_node == NULL) {
 335                 cursor->deadlk_node = cursor->node;
 336                 hammer_ref_node(cursor->deadlk_node);
 337         } else if (error == 0 && cursor->parent) {
 338                 error = hammer_lock_upgrade(&cursor->parent->lock, 1);
 339                 if (error && cursor->deadlk_node == NULL) {
 340                         cursor->deadlk_node = cursor->parent;
 341                         hammer_ref_node(cursor->deadlk_node);
 342                 }
 343         }
 344 #endif
 345         return(error);
 346 }
 347
 348 int
 349 hammer_cursor_upgrade_node(hammer_cursor_t cursor)
 350 {
 351         int error;
 352
 353         error = hammer_lock_upgrade(&cursor->node->lock, 1);
 354         if (error && cursor->deadlk_node == NULL) {
 355                 cursor->deadlk_node = cursor->node;
 356                 hammer_ref_node(cursor->deadlk_node);
 357         }
 358         return(error);
 359 }
 360
 361 /*
 362  * Downgrade cursor->node and cursor->parent to shared locks.
 363  */
 364 void
 365 hammer_cursor_downgrade(hammer_cursor_t cursor)
 366 {
 367         if (hammer_lock_excl_owned(&cursor->node->lock, curthread))
 368                 hammer_lock_downgrade(&cursor->node->lock, 1);
 369         if (cursor->parent &&
 370             hammer_lock_excl_owned(&cursor->parent->lock, curthread)) {
 371                 hammer_lock_downgrade(&cursor->parent->lock, 1);
 372         }
 373 }
 374
 375 /*
 376  * Upgrade and downgrade pairs of cursors.  This is used by the dedup
 377  * code which must deal with two cursors.  A special function is needed
 378  * because some of the nodes may be shared between the two cursors,
 379  * resulting in share counts > 1 which will normally cause an upgrade
 380  * to fail.
 381  */
 382 static __noinline
 383 int
 384 collect_node(hammer_node_t *array, int *counts, int n, hammer_node_t node)
 385 {
 386         int i;
 387
 388         for (i = 0; i < n; ++i) {
 389                 if (array[i] == node)
 390                         break;
 391         }
 392         if (i == n) {
 393                 array[i] = node;
 394                 counts[i] = 1;
 395                 ++i;
 396         } else {
 397                 ++counts[i];
 398         }
 399         return(i);
 400 }
 401
 402 int
 403 hammer_cursor_upgrade2(hammer_cursor_t cursor1, hammer_cursor_t cursor2)
 404 {
 405         hammer_node_t nodes[4];
 406         int counts[4];
 407         int error;
 408         int i;
 409         int n;
 410
 411         n = collect_node(nodes, counts, 0, cursor1->node);
 412         if (cursor1->parent)
 413                 n = collect_node(nodes, counts, n, cursor1->parent);
 414         n = collect_node(nodes, counts, n, cursor2->node);
 415         if (cursor2->parent)
 416                 n = collect_node(nodes, counts, n, cursor2->parent);
 417
 418         error = 0;
 419         for (i = 0; i < n; ++i) {
 420                 error = hammer_lock_upgrade(&nodes[i]->lock, counts[i]);
 421                 if (error)
 422                         break;
 423         }
 424         if (error) {
 425                 while (--i >= 0)
 426                         hammer_lock_downgrade(&nodes[i]->lock, counts[i]);
 427         }
 428         return (error);
 429 }
 430
 431 void
 432 hammer_cursor_downgrade2(hammer_cursor_t cursor1, hammer_cursor_t cursor2)
 433 {
 434         hammer_node_t nodes[4];
 435         int counts[4];
 436         int i;
 437         int n;
 438
 439         n = collect_node(nodes, counts, 0, cursor1->node);
 440         if (cursor1->parent)
 441                 n = collect_node(nodes, counts, n, cursor1->parent);
 442         n = collect_node(nodes, counts, n, cursor2->node);
 443         if (cursor2->parent)
 444                 n = collect_node(nodes, counts, n, cursor2->parent);
 445
 446         for (i = 0; i < n; ++i)
 447                 hammer_lock_downgrade(&nodes[i]->lock, counts[i]);
 448 }
 449
 450 /*
 451  * Seek the cursor to the specified node and index.
 452  *
 453  * The caller must ref the node prior to calling this routine and release
 454  * it after it returns.  If the seek succeeds the cursor will gain its own
 455  * ref on the node.
 456  */
 457 int
 458 hammer_cursor_seek(hammer_cursor_t cursor, hammer_node_t node, int index)
 459 {
 460         int error;
 461
 462         hammer_cursor_downgrade(cursor);
 463         error = 0;
 464
 465         if (cursor->node != node) {
 466                 hammer_unlock(&cursor->node->lock);
 467                 hammer_rel_node(cursor->node);
 468                 cursor->node = node;
 469                 hammer_ref_node(node);
 470                 hammer_lock_sh(&node->lock);
 471                 KKASSERT ((node->flags & HAMMER_NODE_DELETED) == 0);
 472
 473                 if (cursor->parent) {
 474                         hammer_unlock(&cursor->parent->lock);
 475                         hammer_rel_node(cursor->parent);
 476                         cursor->parent = NULL;
 477                         cursor->parent_index = 0;
 478                 }
 479                 error = hammer_load_cursor_parent(cursor, 0);
 480         }
 481         cursor->index = index;
 482         return (error);
 483 }
 484
 485 /*
 486  * Load the parent of cursor->node into cursor->parent.
 487  */
 488 static
 489 int
 490 hammer_load_cursor_parent(hammer_cursor_t cursor, int try_exclusive)
 491 {
 492         hammer_mount_t hmp;
 493         hammer_node_t parent;
 494         hammer_node_t node;
 495         hammer_btree_elm_t elm;
 496         int error;
 497         int parent_index;
 498
 499         hmp = cursor->trans->hmp;
 500
 501         if (cursor->node->ondisk->parent) {
 502                 node = cursor->node;
 503                 parent = hammer_btree_get_parent(cursor->trans, node,
 504                                                  &parent_index,
 505                                                  &error, try_exclusive);
 506                 if (error == 0) {
 507                         elm = &parent->ondisk->elms[parent_index];
 508                         cursor->parent = parent;
 509                         cursor->parent_index = parent_index;
 510                         cursor->left_bound = &elm[0].internal.base;
 511                         cursor->right_bound = &elm[1].internal.base;
 512                 }
 513         } else {
 514                 cursor->parent = NULL;
 515                 cursor->parent_index = 0;
 516                 cursor->left_bound = &hmp->root_btree_beg;
 517                 cursor->right_bound = &hmp->root_btree_end;
 518                 error = 0;
 519         }
 520         return(error);
 521 }
 522
 523 /*
 524  * Cursor up to our parent node.  Return ENOENT if we are at the root of
 525  * the filesystem.
 526  */
 527 int
 528 hammer_cursor_up(hammer_cursor_t cursor)
 529 {
 530         int error;
 531
 532         hammer_cursor_downgrade(cursor);
 533
 534         /*
 535          * If the parent is NULL we are at the root of the B-Tree and
 536          * return ENOENT.
 537          */
 538         if (cursor->parent == NULL)
 539                 return (ENOENT);
 540
 541         /*
 542          * Set the node to its parent.
 543          */
 544         hammer_unlock(&cursor->node->lock);
 545         hammer_rel_node(cursor->node);
 546         cursor->node = cursor->parent;
 547         cursor->index = cursor->parent_index;
 548         cursor->parent = NULL;
 549         cursor->parent_index = 0;
 550
 551         error = hammer_load_cursor_parent(cursor, 0);
 552         return(error);
 553 }
 554
 555 /*
 556  * Special cursor up given a locked cursor.  The orignal node is not
 557  * unlocked or released and the cursor is not downgraded.
 558  *
 559  * This function can fail with EDEADLK.
 560  *
 561  * This function is only run when recursively deleting parent nodes
 562  * to get rid of an empty leaf.
 563  */
 564 int
 565 hammer_cursor_up_locked(hammer_cursor_t cursor)
 566 {
 567         hammer_node_t save;
 568         int error;
 569         int save_index;
 570
 571         /*
 572          * If the parent is NULL we are at the root of the B-Tree and
 573          * return ENOENT.
 574          */
 575         if (cursor->parent == NULL)
 576                 return (ENOENT);
 577
 578         save = cursor->node;
 579         save_index = cursor->index;
 580
 581         /*
 582          * Set the node to its parent.
 583          */
 584         cursor->node = cursor->parent;
 585         cursor->index = cursor->parent_index;
 586         cursor->parent = NULL;
 587         cursor->parent_index = 0;
 588
 589         /*
 590          * load the new parent, attempt to exclusively lock it.  Note that
 591          * we are still holding the old parent (now cursor->node) exclusively
 592          * locked.
 593          *
 594          * This can return EDEADLK.  Undo the operation on any error.  These
 595          * up sequences can occur during iterations so be sure to restore
 596          * the index.
 597          */
 598         error = hammer_load_cursor_parent(cursor, 1);
 599         if (error) {
 600                 cursor->parent = cursor->node;
 601                 cursor->parent_index = cursor->index;
 602                 cursor->node = save;
 603                 cursor->index = save_index;
 604         }
 605         return(error);
 606 }
 607
 608
 609 /*
 610  * Cursor down through the current node, which must be an internal node.
 611  *
 612  * This routine adjusts the cursor and sets index to 0.
 613  */
 614 int
 615 hammer_cursor_down(hammer_cursor_t cursor)
 616 {
 617         hammer_node_t node;
 618         hammer_btree_elm_t elm;
 619         int error;
 620
 621         /*
 622          * The current node becomes the current parent
 623          */
 624         hammer_cursor_downgrade(cursor);
 625         node = cursor->node;
 626         KKASSERT(cursor->index >= 0 && cursor->index < node->ondisk->count);
 627         if (cursor->parent) {
 628                 hammer_unlock(&cursor->parent->lock);
 629                 hammer_rel_node(cursor->parent);
 630         }
 631         cursor->parent = node;
 632         cursor->parent_index = cursor->index;
 633         cursor->node = NULL;
 634         cursor->index = 0;
 635
 636         /*
 637          * Extract element to push into at (node,index), set bounds.
 638          */
 639         elm = &node->ondisk->elms[cursor->parent_index];
 640
 641         /*
 642          * Ok, push down into elm of an internal node.
 643          */
 644         KKASSERT(node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL);
 645         KKASSERT(elm->internal.subtree_offset != 0);
 646         cursor->left_bound = &elm[0].internal.base;
 647         cursor->right_bound = &elm[1].internal.base;
 648         node = hammer_get_node(cursor->trans,
 649                                elm->internal.subtree_offset, 0, &error);
 650         if (error == 0) {
 651                 KASSERT(elm->base.btype == node->ondisk->type,
 652                         ("BTYPE MISMATCH %c %c NODE %p",
 653                          elm->base.btype, node->ondisk->type, node));
 654                 if (node->ondisk->parent != cursor->parent->node_offset)
 655                         hpanic("node %p %016jx vs %016jx",
 656                                 node,
 657                                 (intmax_t)node->ondisk->parent,
 658                                 (intmax_t)cursor->parent->node_offset);
 659                 KKASSERT(node->ondisk->parent == cursor->parent->node_offset);
 660         }
 661
 662         /*
 663          * If no error occured we can lock the new child node.  If the
 664          * node is deadlock flagged wait up to hammer_tdmux_ticks (1 second)
 665          * for the deadlock to clear.  Otherwise a large number of concurrent
 666          * readers can continuously stall the flusher.
 667          *
 668          * We specifically do this in the cursor_down() code in order to
 669          * deal with frontend top-down searches smashing against bottom-up
 670          * flusher-based mirror updates.  These collisions typically occur
 671          * above the inode in the B-Tree and are not covered by the
 672          * ip->cursor_exclreq_count logic.
 673          */
 674         if (error == 0) {
 675 #if 0
 676                 if (node->cursor_exclreq_count &&
 677                     cursor->trans->type != HAMMER_TRANS_FLS) {
 678                         tsleep(&node->cursor_exclreq_count, 0,
 679                                "hstag2", hammer_tdmux_ticks);
 680                 }
 681 #endif
 682                 hammer_lock_sh(&node->lock);
 683                 KKASSERT ((node->flags & HAMMER_NODE_DELETED) == 0);
 684                 cursor->node = node;
 685                 cursor->index = 0;
 686         }
 687         return(error);
 688 }
 689
 690 /************************************************************************
 691  *                              DEADLOCK RECOVERY                       *
 692  ************************************************************************
 693  *
 694  * These are the new deadlock recovery functions.  Currently they are only
 695  * used for the mirror propagation and physical node removal cases but
 696  * ultimately the intention is to use them for all deadlock recovery
 697  * operations.
 698  *
 699  * WARNING!  The contents of the cursor may be modified while unlocked.
 700  *           passive modifications including adjusting the node, parent,
 701  *           indexes, and leaf pointer.
 702  *
 703  *           An outright removal of the element the cursor was pointing at
 704  *           will cause the HAMMER_CURSOR_TRACKED_RIPOUT flag to be set,
 705  *           which chains to causing the HAMMER_CURSOR_RETEST to be set
 706  *           when the cursor is locked again.
 707  */
 708 void
 709 hammer_unlock_cursor(hammer_cursor_t cursor)
 710 {
 711         hammer_node_t node;
 712
 713         KKASSERT((cursor->flags & HAMMER_CURSOR_TRACKED) == 0);
 714         KKASSERT(cursor->node);
 715
 716         /*
 717          * Release the cursor's locks and track B-Tree operations on node.
 718          * While being tracked our cursor can be modified by other threads
 719          * and the node may be replaced.
 720          */
 721         if (cursor->parent) {
 722                 hammer_unlock(&cursor->parent->lock);
 723                 hammer_rel_node(cursor->parent);
 724                 cursor->parent = NULL;
 725         }
 726         node = cursor->node;
 727         cursor->flags |= HAMMER_CURSOR_TRACKED;
 728         TAILQ_INSERT_TAIL(&node->cursor_list, cursor, deadlk_entry);
 729         hammer_unlock(&node->lock);
 730 }
 731
 732 /*
 733  * Get the cursor heated up again.  The cursor's node may have
 734  * changed and we might have to locate the new parent.
 735  *
 736  * If the exact element we were on got deleted RIPOUT will be
 737  * set and we must clear ATEDISK so an iteration does not skip
 738  * the element after it.
 739  */
 740 int
 741 hammer_lock_cursor(hammer_cursor_t cursor)
 742 {
 743         hammer_node_t node;
 744         int error;
 745
 746         KKASSERT(cursor->flags & HAMMER_CURSOR_TRACKED);
 747
 748         /*
 749          * Relock the node
 750          */
 751         for (;;) {
 752                 node = cursor->node;
 753                 hammer_ref_node(node);
 754                 hammer_lock_sh(&node->lock);
 755                 if (cursor->node == node) {
 756                         hammer_rel_node(node);
 757                         break;
 758                 }
 759                 hammer_unlock(&node->lock);
 760                 hammer_rel_node(node);
 761         }
 762
 763         /*
 764          * Untrack the cursor, clean up, and re-establish the parent node.
 765          */
 766         TAILQ_REMOVE(&node->cursor_list, cursor, deadlk_entry);
 767         cursor->flags &= ~HAMMER_CURSOR_TRACKED;
 768
 769         /*
 770          * If a ripout has occured iterations must re-test the (new)
 771          * current element.  Clearing ATEDISK prevents the element from
 772          * being skipped and RETEST causes it to be re-tested.
 773          */
 774         if (cursor->flags & HAMMER_CURSOR_TRACKED_RIPOUT) {
 775                 cursor->flags &= ~HAMMER_CURSOR_TRACKED_RIPOUT;
 776                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
 777                 cursor->flags |= HAMMER_CURSOR_RETEST;
 778         }
 779         error = hammer_load_cursor_parent(cursor, 0);
 780         return(error);
 781 }
 782
 783 /*
 784  * Recover from a deadlocked cursor, tracking any node removals or
 785  * replacements.  If the cursor's current node is removed by another
 786  * thread (via btree_remove()) the cursor will be seeked upwards.
 787  *
 788  * The caller is working a modifying operation and must be holding the
 789  * sync lock (shared).  We do not release the sync lock because this
 790  * would break atomicy.
 791  */
 792 int
 793 hammer_recover_cursor(hammer_cursor_t cursor)
 794 {
 795         hammer_transaction_t trans __debugvar;
 796 #if 0
 797         hammer_inode_t ip;
 798 #endif
 799         int error;
 800
 801         hammer_unlock_cursor(cursor);
 802
 803 #if 0
 804         ip = cursor->ip;
 805 #endif
 806         trans = cursor->trans;
 807         KKASSERT(trans->sync_lock_refs > 0);
 808
 809         /*
 810          * Wait for the deadlock to clear.
 811          *
 812          * Maintain exclreq_count / wakeup as necessary to notify new
 813          * entrants into ip.  We continue to hold the fs_token so our
 814          * EDEADLK retry loop should get its chance before another thread
 815          * steals the lock.
 816          */
 817         if (cursor->deadlk_node) {
 818 #if 0
 819                 if (ip && trans->type == HAMMER_TRANS_FLS)
 820                         ++ip->cursor_exclreq_count;
 821                 ++cursor->deadlk_node->cursor_exclreq_count;
 822 #endif
 823                 hammer_lock_ex_ident(&cursor->deadlk_node->lock, "hmrdlk");
 824                 hammer_unlock(&cursor->deadlk_node->lock);
 825 #if 0
 826                 if (--cursor->deadlk_node->cursor_exclreq_count == 0)
 827                         wakeup(&cursor->deadlk_node->cursor_exclreq_count);
 828                 if (ip && trans->type == HAMMER_TRANS_FLS) {
 829                         if (--ip->cursor_exclreq_count == 0)
 830                                 wakeup(&ip->cursor_exclreq_count);
 831                 }
 832 #endif
 833                 hammer_rel_node(cursor->deadlk_node);
 834                 cursor->deadlk_node = NULL;
 835         }
 836         if (cursor->deadlk_rec) {
 837                 hammer_wait_mem_record_ident(cursor->deadlk_rec, "hmmdlr");
 838                 hammer_rel_mem_record(cursor->deadlk_rec);
 839                 cursor->deadlk_rec = NULL;
 840         }
 841         error = hammer_lock_cursor(cursor);
 842         return(error);
 843 }
 844
 845 /*
 846  * Dup ocursor to ncursor.  ncursor inherits ocursor's locks and ocursor
 847  * is effectively unlocked and becomes tracked.  If ocursor was not locked
 848  * then ncursor also inherits the tracking.
 849  *
 850  * After the caller finishes working with ncursor it must be cleaned up
 851  * with hammer_done_cursor(), and the caller must re-lock ocursor.
 852  */
 853 hammer_cursor_t
 854 hammer_push_cursor(hammer_cursor_t ocursor)
 855 {
 856         hammer_cursor_t ncursor;
 857         hammer_inode_t ip;
 858         hammer_node_t node;
 859         hammer_mount_t hmp;
 860
 861         hmp = ocursor->trans->hmp;
 862         ncursor = kmalloc(sizeof(*ncursor), hmp->m_misc, M_WAITOK | M_ZERO);
 863         bcopy(ocursor, ncursor, sizeof(*ocursor));
 864
 865         node = ocursor->node;
 866         hammer_ref_node(node);
 867         if ((ocursor->flags & HAMMER_CURSOR_TRACKED) == 0) {
 868                 ocursor->flags |= HAMMER_CURSOR_TRACKED;
 869                 TAILQ_INSERT_TAIL(&node->cursor_list, ocursor, deadlk_entry);
 870         }
 871         if (ncursor->parent)
 872                 ocursor->parent = NULL;
 873         ocursor->data_buffer = NULL;
 874         ocursor->leaf = NULL;
 875         ocursor->data = NULL;
 876         if (ncursor->flags & HAMMER_CURSOR_TRACKED)
 877                 TAILQ_INSERT_TAIL(&node->cursor_list, ncursor, deadlk_entry);
 878         if ((ip = ncursor->ip) != NULL) {
 879                 ++ip->cursor_ip_refs;
 880         }
 881         if (ncursor->iprec)
 882                 hammer_ref(&ncursor->iprec->lock);
 883         return(ncursor);
 884 }
 885
 886 /*
 887  * Destroy ncursor and restore ocursor
 888  *
 889  * This is a temporary hack for the release.  We can't afford to lose
 890  * the IP lock until the IP object scan code is able to deal with it,
 891  * so have ocursor inherit it back.
 892  */
 893 void
 894 hammer_pop_cursor(hammer_cursor_t ocursor, hammer_cursor_t ncursor)
 895 {
 896         hammer_mount_t hmp;
 897         hammer_inode_t ip;
 898
 899         hmp = ncursor->trans->hmp;
 900         ip = ncursor->ip;
 901         ncursor->ip = NULL;
 902         if (ip)
 903                 --ip->cursor_ip_refs;
 904         hammer_done_cursor(ncursor);
 905         kfree(ncursor, hmp->m_misc);
 906         KKASSERT(ocursor->ip == ip);
 907         hammer_lock_cursor(ocursor);
 908 }
 909
 910 /*
 911  * onode is being replaced by nnode by the reblocking code.
 912  */
 913 void
 914 hammer_cursor_replaced_node(hammer_node_t onode, hammer_node_t nnode)
 915 {
 916         hammer_cursor_t cursor;
 917         hammer_node_ondisk_t ondisk;
 918         hammer_node_ondisk_t nndisk;
 919
 920         ondisk = onode->ondisk;
 921         nndisk = nnode->ondisk;
 922
 923         while ((cursor = TAILQ_FIRST(&onode->cursor_list)) != NULL) {
 924                 TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
 925                 TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
 926                 KKASSERT(cursor->node == onode);
 927                 if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
 928                         cursor->leaf = &nndisk->elms[cursor->index].leaf;
 929                 cursor->node = nnode;
 930                 hammer_ref_node(nnode);
 931                 hammer_rel_node(onode);
 932         }
 933 }
 934
 935 /*
 936  * We have removed <node> from the parent and collapsed the parent.
 937  *
 938  * Cursors in deadlock recovery are seeked upward to the parent so the
 939  * btree_remove() recursion works properly even though we have marked
 940  * the cursor as requiring a reseek.
 941  *
 942  * This is the only cursor function which sets HAMMER_CURSOR_ITERATE_CHECK,
 943  * meaning the cursor is no longer definitively pointing at an element
 944  * within its iteration (if the cursor is being used to iterate).  The
 945  * iteration code will take this into account instead of asserting if the
 946  * cursor is outside the iteration range.
 947  */
 948 void
 949 hammer_cursor_removed_node(hammer_node_t node, hammer_node_t parent, int index)
 950 {
 951         hammer_cursor_t cursor;
 952         hammer_node_ondisk_t ondisk;
 953
 954         KKASSERT(parent != NULL);
 955         ondisk = node->ondisk;
 956
 957         while ((cursor = TAILQ_FIRST(&node->cursor_list)) != NULL) {
 958                 KKASSERT(cursor->node == node);
 959                 KKASSERT(cursor->index == 0);
 960                 TAILQ_REMOVE(&node->cursor_list, cursor, deadlk_entry);
 961                 TAILQ_INSERT_TAIL(&parent->cursor_list, cursor, deadlk_entry);
 962                 if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
 963                         cursor->leaf = NULL;
 964                 cursor->flags |= HAMMER_CURSOR_TRACKED_RIPOUT;
 965                 cursor->flags |= HAMMER_CURSOR_ITERATE_CHECK;
 966                 cursor->node = parent;
 967                 cursor->index = index;
 968                 hammer_ref_node(parent);
 969                 hammer_rel_node(node);
 970         }
 971 }
 972
 973 /*
 974  * node was split at (onode, index) with elements >= index moved to nnode.
 975  */
 976 void
 977 hammer_cursor_split_node(hammer_node_t onode, hammer_node_t nnode, int index)
 978 {
 979         hammer_cursor_t cursor;
 980         hammer_node_ondisk_t ondisk;
 981         hammer_node_ondisk_t nndisk;
 982
 983         ondisk = onode->ondisk;
 984         nndisk = nnode->ondisk;
 985
 986 again:
 987         TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) {
 988                 KKASSERT(cursor->node == onode);
 989                 if (cursor->index < index)
 990                         continue;
 991                 TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
 992                 TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
 993                 if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
 994                         cursor->leaf = &nndisk->elms[cursor->index - index].leaf;
 995                 cursor->node = nnode;
 996                 cursor->index -= index;
 997                 hammer_ref_node(nnode);
 998                 hammer_rel_node(onode);
 999                 goto again;
1000         }
1001 }
1002
1003 /*
1004  * An element was moved from one node to another or within a node.  The
1005  * index may also represent the end of the node (index == numelements).
1006  *
1007  * {oparent,pindex} is the parent node's pointer to onode/oindex.
1008  *
1009  * This is used by the rebalancing code.  This is not an insertion or
1010  * deletion and any additional elements, including the degenerate case at
1011  * the end of the node, will be dealt with by additional distinct calls.
1012  */
1013 void
1014 hammer_cursor_moved_element(hammer_node_t oparent, int pindex,
1015                             hammer_node_t onode, int oindex,
1016                             hammer_node_t nnode, int nindex)
1017 {
1018         hammer_cursor_t cursor;
1019         hammer_node_ondisk_t ondisk;
1020         hammer_node_ondisk_t nndisk;
1021
1022         /*
1023          * Adjust any cursors pointing at the element
1024          */
1025         ondisk = onode->ondisk;
1026         nndisk = nnode->ondisk;
1027 again1:
1028         TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) {
1029                 KKASSERT(cursor->node == onode);
1030                 if (cursor->index != oindex)
1031                         continue;
1032                 TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
1033                 TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
1034                 if (cursor->leaf == &ondisk->elms[oindex].leaf)
1035                         cursor->leaf = &nndisk->elms[nindex].leaf;
1036                 cursor->node = nnode;
1037                 cursor->index = nindex;
1038                 hammer_ref_node(nnode);
1039                 hammer_rel_node(onode);
1040                 goto again1;
1041         }
1042
1043         /*
1044          * When moving the first element of onode to a different node any
1045          * cursor which is pointing at (oparent,pindex) must be repointed
1046          * to nnode and ATEDISK must be cleared.
1047          *
1048          * This prevents cursors from losing track due to insertions.
1049          * Insertions temporarily release the cursor in order to update
1050          * the mirror_tids.  It primarily effects the mirror_write code.
1051          * The other code paths generally only do a single insertion and
1052          * then relookup or drop the cursor.
1053          */
1054         if (onode == nnode || oindex)
1055                 return;
1056         ondisk = oparent->ondisk;
1057 again2:
1058         TAILQ_FOREACH(cursor, &oparent->cursor_list, deadlk_entry) {
1059                 KKASSERT(cursor->node == oparent);
1060                 if (cursor->index != pindex)
1061                         continue;
1062                 hkprintf("debug: shifted cursor pointing at parent\n"
1063                         "parent %016jx:%d onode %016jx:%d nnode %016jx:%d\n",
1064                         (intmax_t)oparent->node_offset, pindex,
1065                         (intmax_t)onode->node_offset, oindex,
1066                         (intmax_t)nnode->node_offset, nindex);
1067                 TAILQ_REMOVE(&oparent->cursor_list, cursor, deadlk_entry);
1068                 TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
1069                 if (cursor->leaf == &ondisk->elms[oindex].leaf)
1070                         cursor->leaf = &nndisk->elms[nindex].leaf;
1071                 cursor->node = nnode;
1072                 cursor->index = nindex;
1073                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1074                 hammer_ref_node(nnode);
1075                 hammer_rel_node(oparent);
1076                 goto again2;
1077         }
1078 }
1079
1080 /*
1081  * The B-Tree element pointing to the specified node was moved from (oparent)
1082  * to (nparent, nindex).  We must locate any tracked cursors pointing at
1083  * node and adjust their parent accordingly.
1084  *
1085  * This is used by the rebalancing code when packing elements causes an
1086  * element to shift from one node to another.
1087  */
1088 void
1089 hammer_cursor_parent_changed(hammer_node_t node, hammer_node_t oparent,
1090                              hammer_node_t nparent, int nindex)
1091 {
1092         hammer_cursor_t cursor;
1093
1094 again:
1095         TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) {
1096                 KKASSERT(cursor->node == node);
1097                 if (cursor->parent == oparent) {
1098                         cursor->parent = nparent;
1099                         cursor->parent_index = nindex;
1100                         hammer_ref_node(nparent);
1101                         hammer_rel_node(oparent);
1102                         goto again;
1103                 }
1104         }
1105 }
1106
1107 /*
1108  * Deleted element at (node, index)
1109  *
1110  * Shift indexes >= index
1111  */
1112 void
1113 hammer_cursor_deleted_element(hammer_node_t node, int index)
1114 {
1115         hammer_cursor_t cursor;
1116         hammer_node_ondisk_t ondisk;
1117
1118         ondisk = node->ondisk;
1119
1120         TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) {
1121                 KKASSERT(cursor->node == node);
1122                 if (cursor->index == index) {
1123                         cursor->flags |= HAMMER_CURSOR_TRACKED_RIPOUT;
1124                         if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
1125                                 cursor->leaf = NULL;
1126                 } else if (cursor->index > index) {
1127                         if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
1128                                 cursor->leaf = &ondisk->elms[cursor->index - 1].leaf;
1129                         --cursor->index;
1130                 }
1131         }
1132 }
1133
1134 /*
1135  * Inserted element at (node, index)
1136  *
1137  * Shift indexes >= index
1138  */
1139 void
1140 hammer_cursor_inserted_element(hammer_node_t node, int index)
1141 {
1142         hammer_cursor_t cursor;
1143         hammer_node_ondisk_t ondisk;
1144
1145         ondisk = node->ondisk;
1146
1147         TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) {
1148                 KKASSERT(cursor->node == node);
1149                 if (cursor->index >= index) {
1150                         if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
1151                                 cursor->leaf = &ondisk->elms[cursor->index + 1].leaf;
1152                         ++cursor->index;
1153                 }
1154         }
1155 }
1156
1157 /*
1158  * Invalidate the cached data buffer associated with a cursor.
1159  *
1160  * This needs to be done when the underlying block is being freed or
1161  * the referenced buffer can prevent the related buffer cache buffer
1162  * from being properly invalidated.
1163  */
1164 void
1165 hammer_cursor_invalidate_cache(hammer_cursor_t cursor)
1166 {
1167         if (cursor->data_buffer) {
1168                 hammer_rel_buffer(cursor->data_buffer, 0);
1169                 cursor->data_buffer = NULL;
1170                 cursor->data = NULL;
1171         }
1172 }
1173