/*
 * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.45 2008/07/31 04:42:04 dillon Exp $
 */
/*
 * HAMMER dependency flusher thread
 *
 * Meta-data updates create buffer dependencies which are arranged as a
 * hierarchy of lists.
 */

#include "hammer.h"

static void hammer_flusher_master_thread(void *arg);
static void hammer_flusher_slave_thread(void *arg);
static void hammer_flusher_flush(hammer_mount_t hmp);
static void hammer_flusher_flush_inode(hammer_inode_t ip,
                                        hammer_transaction_t trans);

/*
 * Support structures for the flusher threads.
 */
struct hammer_flusher_info {
        TAILQ_ENTRY(hammer_flusher_info) entry;
        struct hammer_mount *hmp;
        thread_t        td;
        int             runstate;
        int             count;
        hammer_flush_group_t flg;
        hammer_inode_t  work_array[HAMMER_FLUSH_GROUP_SIZE];
};

typedef struct hammer_flusher_info *hammer_flusher_info_t;

/*
 * Sync all inodes pending on the flusher.
 *
 * All flush groups will be flushed.  This does not queue dirty inodes
 * to the flush groups, it just flushes out what has already been queued!
 */
void
hammer_flusher_sync(hammer_mount_t hmp)
{
        int seq;

        seq = hammer_flusher_async(hmp, NULL);
        hammer_flusher_wait(hmp, seq);
}

/*
 * Sync all inodes pending on the flusher - return immediately.
 *
 * All flush groups will be flushed.
 */
int
hammer_flusher_async(hammer_mount_t hmp, hammer_flush_group_t close_flg)
{
        hammer_flush_group_t flg;
        int seq = hmp->flusher.next;

        TAILQ_FOREACH(flg, &hmp->flush_group_list, flush_entry) {
                if (flg->running == 0)
                        ++seq;
                flg->closed = 1;
                if (flg == close_flg)
                        break;
        }
        if (hmp->flusher.td) {
                if (hmp->flusher.signal++ == 0)
                        wakeup(&hmp->flusher.signal);
        } else {
                seq = hmp->flusher.done;
        }
        return(seq);
}

int
hammer_flusher_async_one(hammer_mount_t hmp)
{
        int seq;

        if (hmp->flusher.td) {
                seq = hmp->flusher.next;
                if (hmp->flusher.signal++ == 0)
                        wakeup(&hmp->flusher.signal);
        } else {
                seq = hmp->flusher.done;
        }
        return(seq);
}

/*
 * Wait for the flusher to get to the specified sequence number.
 * Signal the flusher as often as necessary to keep it going.
 */
void
hammer_flusher_wait(hammer_mount_t hmp, int seq)
{
        while ((int)(seq - hmp->flusher.done) > 0) {
                if (hmp->flusher.act != seq) {
                        if (hmp->flusher.signal++ == 0)
                                wakeup(&hmp->flusher.signal);
                }
                tsleep(&hmp->flusher.done, 0, "hmrfls", 0);
        }
}
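
/*
 * Editorial note (not in the original source): flusher sequence numbers
 * are plain ints that are allowed to wrap, so the wait loop above compares
 * them with (int)(seq - hmp->flusher.done) > 0 rather than with a direct
 * seq > hmp->flusher.done test.  As a worked example, once the counters
 * wrap past INT_MAX a value of seq = INT_MIN with done = INT_MAX yields a
 * difference of 1, so the wait correctly continues, whereas the direct
 * comparison would return early.
 */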

void
hammer_flusher_wait_next(hammer_mount_t hmp)
{
        int seq;

        seq = hammer_flusher_async_one(hmp);
        hammer_flusher_wait(hmp, seq);
}

void
hammer_flusher_create(hammer_mount_t hmp)
{
        hammer_flusher_info_t info;
        int i;

        hmp->flusher.signal = 0;
        hmp->flusher.act = 0;
        hmp->flusher.done = 0;
        hmp->flusher.next = 1;
        hammer_ref(&hmp->flusher.finalize_lock);
        TAILQ_INIT(&hmp->flusher.run_list);
        TAILQ_INIT(&hmp->flusher.ready_list);

        lwkt_create(hammer_flusher_master_thread, hmp,
                    &hmp->flusher.td, NULL, 0, -1, "hammer-M");
        for (i = 0; i < HAMMER_MAX_FLUSHERS; ++i) {
                info = kmalloc(sizeof(*info), hmp->m_misc, M_WAITOK|M_ZERO);
                info->hmp = hmp;
                TAILQ_INSERT_TAIL(&hmp->flusher.ready_list, info, entry);
                lwkt_create(hammer_flusher_slave_thread, info,
                            &info->td, NULL, 0, -1, "hammer-S%d", i);
        }
}

void
hammer_flusher_destroy(hammer_mount_t hmp)
{
        hammer_flusher_info_t info;

        /*
         * Kill the master
         */
        hmp->flusher.exiting = 1;
        while (hmp->flusher.td) {
                ++hmp->flusher.signal;
                wakeup(&hmp->flusher.signal);
                tsleep(&hmp->flusher.exiting, 0, "hmrwex", hz);
        }

        /*
         * Kill the slaves
         */
        while ((info = TAILQ_FIRST(&hmp->flusher.ready_list)) != NULL) {
                KKASSERT(info->runstate == 0);
                TAILQ_REMOVE(&hmp->flusher.ready_list, info, entry);
                info->runstate = -1;
                wakeup(&info->runstate);
                while (info->td)
                        tsleep(&info->td, 0, "hmrwwc", 0);
                kfree(info, hmp->m_misc);
        }
}

/*
 * The master flusher thread manages the flusher sequence id and
 * synchronization with the slave work threads.
 */
static void
hammer_flusher_master_thread(void *arg)
{
        hammer_flush_group_t flg;
        hammer_mount_t hmp;

        hmp = arg;

        for (;;) {
                /*
                 * Do at least one flush cycle.  We may have to update the
                 * UNDO FIFO even if no inodes are queued.
                 */
                for (;;) {
                        while (hmp->flusher.group_lock)
                                tsleep(&hmp->flusher.group_lock, 0, "hmrhld", 0);
                        hmp->flusher.act = hmp->flusher.next;
                        ++hmp->flusher.next;
                        hammer_flusher_clean_loose_ios(hmp);
                        hammer_flusher_flush(hmp);
                        hmp->flusher.done = hmp->flusher.act;
                        wakeup(&hmp->flusher.done);
                        flg = TAILQ_FIRST(&hmp->flush_group_list);
                        if (flg == NULL || flg->closed == 0)
                                break;
                        if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
                                break;
                }

                /*
                 * Wait for activity.
                 */
                if (hmp->flusher.exiting && TAILQ_EMPTY(&hmp->flush_group_list))
                        break;
                while (hmp->flusher.signal == 0)
                        tsleep(&hmp->flusher.signal, 0, "hmrwwa", 0);

                /*
                 * Flush once for each signal count, but only allow one
                 * extra flush request to build up.
                 */
                if (--hmp->flusher.signal != 0)
                        hmp->flusher.signal = 1;
        }

        /*
         * And we are done.
         */
        hmp->flusher.td = NULL;
        wakeup(&hmp->flusher.exiting);
        lwkt_exit();
}
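
/*
 * Editorial note (not in the original source): a rough sketch of how the
 * three sequence counters above relate.  The master copies flusher.next
 * into flusher.act at the start of each cycle and post-increments next,
 * then publishes flusher.done = act when the cycle completes.  In
 * wrap-safe ordering done <= act < next always holds, so every sequence
 * number handed out by hammer_flusher_async() is eventually covered by
 * flusher.done and sleepers in hammer_flusher_wait() are released.
 */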

/*
 * Flush all inodes in the current flush group.
 */
static void
hammer_flusher_flush(hammer_mount_t hmp)
{
        hammer_flusher_info_t info;
        hammer_flush_group_t flg;
        hammer_reserve_t resv;
        hammer_inode_t ip;
        hammer_inode_t next_ip;
        int slave_index;
        int count;

        /*
         * Just in case there is a flush race on mount.
         */
        if (TAILQ_FIRST(&hmp->flusher.ready_list) == NULL)
                return;

        /*
         * We only do one flg but we may have to loop/retry.
         */
        count = 0;
        while ((flg = TAILQ_FIRST(&hmp->flush_group_list)) != NULL) {
                ++count;
                if (hammer_debug_general & 0x0001) {
                        kprintf("hammer_flush %d ttl=%d recs=%d\n",
                                hmp->flusher.act,
                                flg->total_count, flg->refs);
                }
                if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
                        break;
                hammer_start_transaction_fls(&hmp->flusher.trans, hmp);

                /*
                 * If the previous flush cycle just about exhausted our
                 * UNDO space we may have to do a dummy cycle to move the
                 * first_offset up before actually digging into a new cycle,
                 * or the new cycle will not have sufficient undo space.
                 */
                if (hammer_flusher_undo_exhausted(&hmp->flusher.trans, 3))
                        hammer_flusher_finalize(&hmp->flusher.trans, 0);

                /*
                 * Ok, we are running this flush group now (this prevents new
                 * additions to it).
                 */
                flg->running = 1;
                if (hmp->next_flush_group == flg)
                        hmp->next_flush_group = TAILQ_NEXT(flg, flush_entry);

                /*
                 * Iterate the inodes in the flg's flush_list and assign
                 * them to slaves.
                 */
                slave_index = 0;
                info = TAILQ_FIRST(&hmp->flusher.ready_list);
                next_ip = TAILQ_FIRST(&flg->flush_list);

                while ((ip = next_ip) != NULL) {
                        next_ip = TAILQ_NEXT(ip, flush_entry);

                        if (++hmp->check_yield > hammer_yield_check) {
                                hmp->check_yield = 0;
                                lwkt_user_yield();
                        }

                        /*
                         * Add ip to the slave's work array.  The slave is
                         * not currently running.
                         */
                        info->work_array[info->count++] = ip;
                        if (info->count != HAMMER_FLUSH_GROUP_SIZE)
                                continue;

                        /*
                         * Get the slave running
                         */
                        TAILQ_REMOVE(&hmp->flusher.ready_list, info, entry);
                        TAILQ_INSERT_TAIL(&hmp->flusher.run_list, info, entry);
                        info->flg = flg;
                        info->runstate = 1;
                        wakeup(&info->runstate);

                        /*
                         * Get a new slave.  We may have to wait for one to
                         * finish running.
                         */
                        while ((info = TAILQ_FIRST(&hmp->flusher.ready_list)) == NULL) {
                                tsleep(&hmp->flusher.ready_list, 0, "hmrfcc", 0);
                        }
                }

                /*
                 * Run the current slave if necessary
                 */
                if (info->count) {
                        TAILQ_REMOVE(&hmp->flusher.ready_list, info, entry);
                        TAILQ_INSERT_TAIL(&hmp->flusher.run_list, info, entry);
                        info->flg = flg;
                        info->runstate = 1;
                        wakeup(&info->runstate);
                }

                /*
                 * Wait for all slaves to finish running
                 */
                while (TAILQ_FIRST(&hmp->flusher.run_list) != NULL)
                        tsleep(&hmp->flusher.ready_list, 0, "hmrfcc", 0);

                /*
                 * Do the final finalization, clean up
                 */
                hammer_flusher_finalize(&hmp->flusher.trans, 1);
                hmp->flusher.tid = hmp->flusher.trans.tid;

                hammer_done_transaction(&hmp->flusher.trans);

                /*
                 * Loop up on the same flg.  If the flg is done, clean it up
                 * and break out.  We only flush one flg.
                 */
                if (TAILQ_FIRST(&flg->flush_list) == NULL) {
                        KKASSERT(TAILQ_EMPTY(&flg->flush_list));
                        KKASSERT(flg->refs == 0);
                        TAILQ_REMOVE(&hmp->flush_group_list, flg, flush_entry);
                        kfree(flg, hmp->m_misc);
                        break;
                }
        }

        /*
         * We may have pure meta-data to flush, or we may have to finish
         * cycling the UNDO FIFO, even if there were no flush groups.
         */
        if (count == 0 && hammer_flusher_haswork(hmp)) {
                hammer_start_transaction_fls(&hmp->flusher.trans, hmp);
                hammer_flusher_finalize(&hmp->flusher.trans, 1);
                hammer_done_transaction(&hmp->flusher.trans);
        }

        /*
         * Clean up any freed big-blocks (typically zone-2).
         * resv->flush_group is typically set several flush groups ahead
         * of the free to ensure that the freed block is not reused until
         * it is safe to do so.
         */
        while ((resv = TAILQ_FIRST(&hmp->delay_list)) != NULL) {
                if (resv->flush_group != hmp->flusher.act)
                        break;
                hammer_reserve_clrdelay(hmp, resv);
        }
}
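
/*
 * Editorial note (not in the original source): the loop above hands inodes
 * to the slave threads in batches.  Each slave's work_array is filled with
 * up to HAMMER_FLUSH_GROUP_SIZE inodes before the slave is moved to the
 * run_list and woken; a final partial batch is kicked off after the loop,
 * and the master then sleeps until the run_list drains back onto the
 * ready_list before finalizing the flush group.
 */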


/*
 * The slave flusher thread pulls work off the master flush_list until no
 * work is left.
 */
static void
hammer_flusher_slave_thread(void *arg)
{
        hammer_flush_group_t flg;
        hammer_flusher_info_t info;
        hammer_mount_t hmp;
        hammer_inode_t ip;
        int i;

        info = arg;
        hmp = info->hmp;

        for (;;) {
                while (info->runstate == 0)
                        tsleep(&info->runstate, 0, "hmrssw", 0);
                if (info->runstate < 0)
                        break;
                flg = info->flg;

                for (i = 0; i < info->count; ++i) {
                        ip = info->work_array[i];
                        hammer_flusher_flush_inode(ip, &hmp->flusher.trans);
                        ++hammer_stats_inode_flushes;
                }
                info->count = 0;
                info->runstate = 0;
                TAILQ_REMOVE(&hmp->flusher.run_list, info, entry);
                TAILQ_INSERT_TAIL(&hmp->flusher.ready_list, info, entry);
                wakeup(&hmp->flusher.ready_list);
        }
        info->td = NULL;
        wakeup(&info->td);
        lwkt_exit();
}

void
hammer_flusher_clean_loose_ios(hammer_mount_t hmp)
{
        hammer_buffer_t buffer;
        hammer_io_t io;

        /*
         * loose ends - buffers without bp's aren't tracked by the kernel
         * and can build up, so clean them out.  This can occur when an
         * IO completes on a buffer with no references left.
         */
        if ((io = TAILQ_FIRST(&hmp->lose_list)) != NULL) {
                crit_enter();   /* biodone() race */
                while ((io = TAILQ_FIRST(&hmp->lose_list)) != NULL) {
                        KKASSERT(io->mod_list == &hmp->lose_list);
                        TAILQ_REMOVE(&hmp->lose_list, io, mod_entry);
                        io->mod_list = NULL;
                        if (io->lock.refs == 0)
                                ++hammer_count_refedbufs;
                        hammer_ref(&io->lock);
                        buffer = (void *)io;
                        hammer_rel_buffer(buffer, 0);
                }
                crit_exit();
        }
}

/*
 * Flush a single inode that is part of a flush group.
 *
 * Flusher errors are extremely serious; even ENOSPC shouldn't occur because
 * the front-end should have reserved sufficient space on the media.  Any
 * error other than EWOULDBLOCK will force the mount to be read-only.
 */
static
void
hammer_flusher_flush_inode(hammer_inode_t ip, hammer_transaction_t trans)
{
        hammer_mount_t hmp = ip->hmp;
        int error;

        hammer_flusher_clean_loose_ios(hmp);
        error = hammer_sync_inode(trans, ip);

        /*
         * EWOULDBLOCK can happen under normal operation; all other errors
         * are considered extremely serious.  We must set WOULDBLOCK
         * mechanics to deal with the mess left over from the abort of the
         * previous flush.
         */
        if (error) {
                ip->flags |= HAMMER_INODE_WOULDBLOCK;
                if (error == EWOULDBLOCK)
                        error = 0;
        }
        hammer_flush_inode_done(ip, error);
        while (hmp->flusher.finalize_want)
                tsleep(&hmp->flusher.finalize_want, 0, "hmrsxx", 0);
        if (hammer_flusher_undo_exhausted(trans, 1)) {
                kprintf("HAMMER: Warning: UNDO area too small!\n");
                hammer_flusher_finalize(trans, 1);
        } else if (hammer_flusher_meta_limit(trans->hmp)) {
                hammer_flusher_finalize(trans, 0);
        }
}

/*
 * Return non-zero if the UNDO area has less than (quarter / 4) of its
 * space left.
 *
 * 1/4 - Emergency free undo space level.  Below this point the flusher
 *       will finalize even if directory dependencies have not been resolved.
 *
 * 2/4 - Used by the pruning and reblocking code.  These functions may be
 *       running in parallel with a flush and cannot be allowed to drop
 *       available undo space to emergency levels.
 *
 * 3/4 - Used at the beginning of a flush to force-sync the volume header
 *       to give the flush plenty of runway to work in.
 */
int
hammer_flusher_undo_exhausted(hammer_transaction_t trans, int quarter)
{
        if (hammer_undo_space(trans) <
            hammer_undo_max(trans->hmp) * quarter / 4) {
                return(1);
        } else {
                return(0);
        }
}
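
/*
 * Worked example (editorial, not in the original source): if
 * hammer_undo_max() reports a 1GB UNDO FIFO, the quarter argument above
 * selects the following thresholds against hammer_undo_space():
 *
 *      quarter=1: exhausted below 256MB free (emergency finalize)
 *      quarter=2: exhausted below 512MB free (prune/reblock guard)
 *      quarter=3: exhausted below 768MB free (start-of-flush force-sync)
 */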

/*
 * Flush all pending UNDOs, wait for write completion, update the volume
 * header with the new UNDO end position, and flush it.  Then
 * asynchronously flush the meta-data.
 *
 * If this is the last finalization in a flush group we also synchronize
 * our cached blockmap and set hmp->flusher_undo_start and our cached undo
 * fifo first_offset so the next flush resets the FIFO pointers.
 *
 * If this is not final it is being called because too many dirty meta-data
 * buffers have built up and must be flushed with UNDO synchronization to
 * avoid a buffer cache deadlock.
 */
void
hammer_flusher_finalize(hammer_transaction_t trans, int final)
{
        hammer_volume_t root_volume;
        hammer_blockmap_t cundomap, dundomap;
        hammer_mount_t hmp;
        hammer_io_t io;
        int count;
        int i;

        hmp = trans->hmp;
        root_volume = trans->rootvol;

        /*
         * Exclusively lock the flusher.  This guarantees that all dirty
         * buffers will be idled (have a mod-count of 0).
         */
        ++hmp->flusher.finalize_want;
        hammer_lock_ex(&hmp->flusher.finalize_lock);

        /*
         * If this isn't the final sync, several threads may have hit the
         * meta-limit at the same time and raced.  Only sync if we really
         * have to, after acquiring the lock.
         */
        if (final == 0 && !hammer_flusher_meta_limit(hmp))
                goto done;

        if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
                goto done;

        /*
         * Flush data buffers.  This can occur asynchronously and at any
         * time.  We must interlock against the frontend direct-data write
         * but do not have to acquire the sync-lock yet.
         */
        count = 0;
        while ((io = TAILQ_FIRST(&hmp->data_list)) != NULL) {
                if (io->ioerror)
                        break;
                if (io->lock.refs == 0)
                        ++hammer_count_refedbufs;
                hammer_ref(&io->lock);
                hammer_io_write_interlock(io);
                KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME);
                hammer_io_flush(io, 0);
                hammer_io_done_interlock(io);
                hammer_rel_buffer((hammer_buffer_t)io, 0);
                ++count;
        }

        /*
         * The sync-lock is required for the remaining sequence.  This lock
         * prevents meta-data from being modified.
         */
        hammer_sync_lock_ex(trans);

        /*
         * If we have been asked to finalize the volume header, sync the
         * cached blockmap to the on-disk blockmap.  Generate an UNDO
         * record for the update.
         */
        if (final) {
                cundomap = &hmp->blockmap[0];
                dundomap = &root_volume->ondisk->vol0_blockmap[0];
                if (root_volume->io.modified) {
                        hammer_modify_volume(trans, root_volume,
                                             dundomap, sizeof(hmp->blockmap));
                        for (i = 0; i < HAMMER_MAX_ZONES; ++i)
                                hammer_crc_set_blockmap(&cundomap[i]);
                        bcopy(cundomap, dundomap, sizeof(hmp->blockmap));
                        hammer_modify_volume_done(root_volume);
                }
        }

        /*
         * Flush UNDOs
         */
        count = 0;
        while ((io = TAILQ_FIRST(&hmp->undo_list)) != NULL) {
                if (io->ioerror)
                        break;
                KKASSERT(io->modify_refs == 0);
                if (io->lock.refs == 0)
                        ++hammer_count_refedbufs;
                hammer_ref(&io->lock);
                KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME);
                hammer_io_flush(io, hammer_undo_reclaim(io));
                hammer_rel_buffer((hammer_buffer_t)io, 0);
                ++count;
        }

        /*
         * Wait for I/Os to complete and flush the cache on the target disk.
         */
        hammer_flusher_clean_loose_ios(hmp);
        hammer_io_wait_all(hmp, "hmrfl1");

        if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
                goto failed;

        /*
         * HAMMER VERSION < 4:
         *      Update the on-disk volume header with new UNDO FIFO end
         *      position (do not generate new UNDO records for this change).
         *      We have to do this for the UNDO FIFO whether (final) is
         *      set or not in order for the UNDOs to be recognized on
         *      recovery.
         *
         * HAMMER VERSION >= 4:
         *      The UNDO FIFO data written above will be recognized on
         *      recovery without us having to sync the volume header.
         *
         * Also update the on-disk next_tid field.  This does not require
         * an UNDO.  However, because our TID is generated before we get
         * the sync lock, another sync may have beat us to the punch.
         *
         * This also has the side effect of updating first_offset based on
         * a prior finalization when the first finalization of the next flush
         * cycle occurs, removing any undo info from the prior finalization
         * from consideration.
         *
         * The volume header will be flushed out synchronously.
         */
        dundomap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
        cundomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];

        if (dundomap->first_offset != cundomap->first_offset ||
                   dundomap->next_offset != cundomap->next_offset) {
                hammer_modify_volume(NULL, root_volume, NULL, 0);
                dundomap->first_offset = cundomap->first_offset;
                dundomap->next_offset = cundomap->next_offset;
                hammer_crc_set_blockmap(dundomap);
                hammer_modify_volume_done(root_volume);
        }

        /*
         * vol0_next_tid is used for TID selection and is updated without
         * an UNDO so we do not reuse a TID that may have been rolled-back.
         *
         * vol0_last_tid is the highest fully-synchronized TID.  It is
         * set-up when the UNDO fifo is fully synced, later on (not here).
         */
        if (root_volume->io.modified) {
                hammer_modify_volume(NULL, root_volume, NULL, 0);
                if (root_volume->ondisk->vol0_next_tid < trans->tid)
                        root_volume->ondisk->vol0_next_tid = trans->tid;
                hammer_crc_set_volume(root_volume->ondisk);
                hammer_modify_volume_done(root_volume);
                hammer_io_flush(&root_volume->io, 0);
        }

        /*
         * Wait for I/Os to complete.
         *
         * For HAMMER VERSION 4+ filesystems we do not have to wait for
         * the I/O to complete as the new UNDO FIFO entries are recognized
         * even without the volume header update.  This allows the volume
         * header to be flushed along with meta-data, significantly reducing
         * flush overheads.
         */
        hammer_flusher_clean_loose_ios(hmp);
        if (hmp->version < HAMMER_VOL_VERSION_FOUR)
                hammer_io_wait_all(hmp, "hmrfl2");

        if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
                goto failed;

        /*
         * Flush meta-data.  The meta-data will be undone if we crash
         * so we can safely flush it asynchronously.  There is no need
         * to wait for I/O to complete (or issue a synchronous disk flush).
         *
         * In fact, even if we did wait, the meta-data will still be undone
         * by a crash up until the next flush cycle due to the first_offset
         * in the volume header for the UNDO FIFO not being adjusted until
         * the following flush cycle.
         */
        count = 0;
        while ((io = TAILQ_FIRST(&hmp->meta_list)) != NULL) {
                if (io->ioerror)
                        break;
                KKASSERT(io->modify_refs == 0);
                if (io->lock.refs == 0)
                        ++hammer_count_refedbufs;
                hammer_ref(&io->lock);
                KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME);
                hammer_io_flush(io, 0);
                hammer_rel_buffer((hammer_buffer_t)io, 0);
                ++count;
        }

        /*
         * If this is the final finalization for the flush group, set
         * up for the next sequence by setting a new first_offset in
         * our cached blockmap and clearing the undo history.
         *
         * Even though we have updated our cached first_offset, the on-disk
         * first_offset still governs available-undo-space calculations.
         */
        if (final) {
                cundomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
                if (cundomap->first_offset == cundomap->next_offset) {
                        hmp->hflags &= ~HMNT_UNDO_DIRTY;
                } else {
                        cundomap->first_offset = cundomap->next_offset;
                        hmp->hflags |= HMNT_UNDO_DIRTY;
                }
                hammer_clear_undo_history(hmp);

                /*
                 * Flush tid sequencing.  flush_tid1 is fully synchronized,
                 * meaning a crash will not roll it back.  flush_tid2 has
                 * been written out asynchronously and a crash will roll
                 * it back.  flush_tid1 is used for all mirroring masters.
                 */
                if (hmp->flush_tid1 != hmp->flush_tid2) {
                        hmp->flush_tid1 = hmp->flush_tid2;
                        wakeup(&hmp->flush_tid1);
                }
                hmp->flush_tid2 = trans->tid;
        }

        /*
         * Cleanup.  Report any critical errors.
         */
failed:
        hammer_sync_unlock(trans);

        if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR) {
                kprintf("HAMMER(%s): Critical write error during flush, "
                        "refusing to sync UNDO FIFO\n",
                        root_volume->ondisk->vol_name);
        }

done:
        hammer_unlock(&hmp->flusher.finalize_lock);

        if (--hmp->flusher.finalize_want == 0)
                wakeup(&hmp->flusher.finalize_want);
        hammer_stats_commits += final;
}

/*
 * Return non-zero if too many dirty meta-data buffers have built up.
 *
 * Since we cannot allow such buffers to flush until we have dealt with
 * the UNDOs, we risk deadlocking the kernel's buffer cache.
 */
int
hammer_flusher_meta_limit(hammer_mount_t hmp)
{
        if (hmp->locked_dirty_space + hmp->io_running_space >
            hammer_limit_dirtybufspace) {
                return(1);
        }
        return(0);
}

/*
 * Return non-zero if too many dirty meta-data buffers have built up.
 *
 * This version is used by background operations (mirror, prune, reblock)
 * to leave room for foreground operations.
 */
int
hammer_flusher_meta_halflimit(hammer_mount_t hmp)
{
        if (hmp->locked_dirty_space + hmp->io_running_space >
            hammer_limit_dirtybufspace / 2) {
                return(1);
        }
        return(0);
}
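
#if 0
/*
 * Illustrative sketch only (editorial, not part of the original file):
 * how a background operation such as pruning or reblocking might use
 * hammer_flusher_meta_halflimit() together with the wait helpers above
 * to throttle itself.  work_remaining() and do_one_unit_of_work() are
 * hypothetical placeholders, not real HAMMER functions.
 */
static void
example_background_pass(hammer_mount_t hmp)
{
        while (work_remaining(hmp)) {
                if (hammer_flusher_meta_halflimit(hmp)) {
                        /*
                         * Too much dirty meta-data has accumulated; start
                         * a flush cycle and wait for it to complete before
                         * generating more.
                         */
                        hammer_flusher_wait_next(hmp);
                }
                do_one_unit_of_work(hmp);
        }
}
#endif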

/*
 * Return non-zero if the flusher still has something to flush.
 */
int
hammer_flusher_haswork(hammer_mount_t hmp)
{
        if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
                return(0);
        if (TAILQ_FIRST(&hmp->flush_group_list) ||      /* dirty inodes */
            TAILQ_FIRST(&hmp->volu_list) ||             /* dirty buffers */
            TAILQ_FIRST(&hmp->undo_list) ||
            TAILQ_FIRST(&hmp->data_list) ||
            TAILQ_FIRST(&hmp->meta_list) ||
            (hmp->hflags & HMNT_UNDO_DIRTY)             /* UNDO FIFO sync */
        ) {
                return(1);
        }
        return(0);
}