sys/vfs/hammer/hammer_flusher.c

   1 /*
   2  * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Matthew Dillon <dillon@backplane.com>
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  * 3. Neither the name of The DragonFly Project nor the names of its
  18  *    contributors may be used to endorse or promote products derived
  19  *    from this software without specific, prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.45 2008/07/31 04:42:04 dillon Exp $
  35  */
   36 /*
   37  * HAMMER dependency flusher thread
   38  *
   39  * Meta-data updates create buffer dependencies which are arranged as a
   40  * hierarchy of lists.
   41  */
  42
  43 #include "hammer.h"
  44
  45 static void hammer_flusher_master_thread(void *arg);
  46 static void hammer_flusher_slave_thread(void *arg);
  47 static void hammer_flusher_flush(hammer_mount_t hmp);
  48 static void hammer_flusher_flush_inode(hammer_inode_t ip,
  49                                         hammer_transaction_t trans);
  50
  51 /*
  52  * Support structures for the flusher threads.
  53  */
  54 struct hammer_flusher_info {
  55         TAILQ_ENTRY(hammer_flusher_info) entry;
  56         struct hammer_mount *hmp;
  57         thread_t        td;
  58         int             runstate;
  59         int             count;
  60         hammer_flush_group_t flg;
  61         hammer_inode_t  work_array[HAMMER_FLUSH_GROUP_SIZE];
  62 };
  63
  64 typedef struct hammer_flusher_info *hammer_flusher_info_t;
  65
  66 /*
  67  * Sync all inodes pending on the flusher.
  68  *
  69  * All flush groups will be flushed.  This does not queue dirty inodes
  70  * to the flush groups, it just flushes out what has already been queued!
  71  */
  72 void
  73 hammer_flusher_sync(hammer_mount_t hmp)
  74 {
  75         int seq;
  76
  77         seq = hammer_flusher_async(hmp, NULL);
  78         hammer_flusher_wait(hmp, seq);
  79 }
  80
  81 /*
  82  * Sync all inodes pending on the flusher - return immediately.
  83  *
  84  * All flush groups will be flushed.
  85  */
  86 int
  87 hammer_flusher_async(hammer_mount_t hmp, hammer_flush_group_t close_flg)
  88 {
  89         hammer_flush_group_t flg;
  90         int seq = hmp->flusher.next;
  91
  92         TAILQ_FOREACH(flg, &hmp->flush_group_list, flush_entry) {
  93                 if (flg->running == 0)
  94                         ++seq;
  95                 flg->closed = 1;
  96                 if (flg == close_flg)
  97                         break;
  98         }
  99         if (hmp->flusher.td) {
 100                 if (hmp->flusher.signal++ == 0)
 101                         wakeup(&hmp->flusher.signal);
 102         } else {
 103                 seq = hmp->flusher.done;
 104         }
 105         return(seq);
 106 }
 107
 108 int
 109 hammer_flusher_async_one(hammer_mount_t hmp)
 110 {
 111         int seq;
 112
 113         if (hmp->flusher.td) {
 114                 seq = hmp->flusher.next;
 115                 if (hmp->flusher.signal++ == 0)
 116                         wakeup(&hmp->flusher.signal);
 117         } else {
 118                 seq = hmp->flusher.done;
 119         }
 120         return(seq);
 121 }
 122
 123 /*
 124  * Wait for the flusher to get to the specified sequence number.
 125  * Signal the flusher as often as necessary to keep it going.
 126  */
 127 void
 128 hammer_flusher_wait(hammer_mount_t hmp, int seq)
 129 {
 130         while ((int)(seq - hmp->flusher.done) > 0) {
 131                 if (hmp->flusher.act != seq) {
 132                         if (hmp->flusher.signal++ == 0)
 133                                 wakeup(&hmp->flusher.signal);
 134                 }
 135                 tsleep(&hmp->flusher.done, 0, "hmrfls", 0);
 136         }
 137 }
 138
 139 void
 140 hammer_flusher_wait_next(hammer_mount_t hmp)
 141 {
 142         int seq;
 143
 144         seq = hammer_flusher_async_one(hmp);
 145         hammer_flusher_wait(hmp, seq);
 146 }
 147
 148 void
 149 hammer_flusher_create(hammer_mount_t hmp)
 150 {
 151         hammer_flusher_info_t info;
 152         int i;
 153
 154         hmp->flusher.signal = 0;
 155         hmp->flusher.act = 0;
 156         hmp->flusher.done = 0;
 157         hmp->flusher.next = 1;
 158         hammer_ref(&hmp->flusher.finalize_lock);
 159         TAILQ_INIT(&hmp->flusher.run_list);
 160         TAILQ_INIT(&hmp->flusher.ready_list);
 161
 162         lwkt_create(hammer_flusher_master_thread, hmp,
 163                     &hmp->flusher.td, NULL, 0, -1, "hammer-M");
 164         for (i = 0; i < HAMMER_MAX_FLUSHERS; ++i) {
 165                 info = kmalloc(sizeof(*info), hmp->m_misc, M_WAITOK|M_ZERO);
 166                 info->hmp = hmp;
 167                 TAILQ_INSERT_TAIL(&hmp->flusher.ready_list, info, entry);
 168                 lwkt_create(hammer_flusher_slave_thread, info,
 169                             &info->td, NULL, 0, -1, "hammer-S%d", i);
 170         }
 171 }
 172
 173 void
 174 hammer_flusher_destroy(hammer_mount_t hmp)
 175 {
 176         hammer_flusher_info_t info;
 177
 178         /*
 179          * Kill the master
 180          */
 181         hmp->flusher.exiting = 1;
 182         while (hmp->flusher.td) {
 183                 ++hmp->flusher.signal;
 184                 wakeup(&hmp->flusher.signal);
 185                 tsleep(&hmp->flusher.exiting, 0, "hmrwex", hz);
 186         }
 187
 188         /*
 189          * Kill the slaves
 190          */
 191         while ((info = TAILQ_FIRST(&hmp->flusher.ready_list)) != NULL) {
 192                 KKASSERT(info->runstate == 0);
 193                 TAILQ_REMOVE(&hmp->flusher.ready_list, info, entry);
 194                 info->runstate = -1;
 195                 wakeup(&info->runstate);
 196                 while (info->td)
 197                         tsleep(&info->td, 0, "hmrwwc", 0);
 198                 TAILQ_REMOVE(&hmp->flusher.ready_list, info, entry);
 199                 kfree(info, hmp->m_misc);
 200         }
 201 }
 202
 203 /*
 204  * The master flusher thread manages the flusher sequence id and
 205  * synchronization with the slave work threads.
 206  */
 207 static void
 208 hammer_flusher_master_thread(void *arg)
 209 {
 210         hammer_flush_group_t flg;
 211         hammer_mount_t hmp;
 212
 213         hmp = arg;
 214
 215         for (;;) {
 216                 /*
 217                  * Do at least one flush cycle.  We may have to update the
 218                  * UNDO FIFO even if no inodes are queued.
 219                  */
 220                 for (;;) {
 221                         while (hmp->flusher.group_lock)
 222                                 tsleep(&hmp->flusher.group_lock, 0, "hmrhld", 0);
 223                         hmp->flusher.act = hmp->flusher.next;
 224                         ++hmp->flusher.next;
 225                         hammer_flusher_clean_loose_ios(hmp);
 226                         hammer_flusher_flush(hmp);
 227                         hmp->flusher.done = hmp->flusher.act;
 228                         wakeup(&hmp->flusher.done);
 229                         flg = TAILQ_FIRST(&hmp->flush_group_list);
 230                         if (flg == NULL || flg->closed == 0)
 231                                 break;
 232                         if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
 233                                 break;
 234                 }
 235
 236                 /*
 237                  * Wait for activity.
 238                  */
 239                 if (hmp->flusher.exiting && TAILQ_EMPTY(&hmp->flush_group_list))
 240                         break;
 241                 while (hmp->flusher.signal == 0)
 242                         tsleep(&hmp->flusher.signal, 0, "hmrwwa", 0);
 243
 244                 /*
 245                  * Flush for each count on signal but only allow one extra
 246                  * flush request to build up.
 247                  */
 248                 if (--hmp->flusher.signal != 0)
 249                         hmp->flusher.signal = 1;
 250         }
 251
 252         /*
 253          * And we are done.
 254          */
 255         hmp->flusher.td = NULL;
 256         wakeup(&hmp->flusher.exiting);
 257         lwkt_exit();
 258 }
 259
 260 /*
 261  * Flush all inodes in the current flush group.
 262  */
 263 static void
 264 hammer_flusher_flush(hammer_mount_t hmp)
 265 {
 266         hammer_flusher_info_t info;
 267         hammer_flush_group_t flg;
 268         hammer_reserve_t resv;
 269         hammer_inode_t ip;
 270         hammer_inode_t next_ip;
 271         int slave_index;
 272         int count;
 273
 274         /*
 275          * Just in-case there's a flush race on mount
 276          */
 277         if (TAILQ_FIRST(&hmp->flusher.ready_list) == NULL)
 278                 return;
 279
 280         /*
 281          * We only do one flg but we may have to loop/retry.
 282          */
 283         count = 0;
 284         while ((flg = TAILQ_FIRST(&hmp->flush_group_list)) != NULL) {
 285                 ++count;
 286                 if (hammer_debug_general & 0x0001) {
 287                         kprintf("hammer_flush %d ttl=%d recs=%d\n",
 288                                 hmp->flusher.act,
 289                                 flg->total_count, flg->refs);
 290                 }
 291                 if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
 292                         break;
 293                 hammer_start_transaction_fls(&hmp->flusher.trans, hmp);
 294
 295                 /*
 296                  * If the previous flush cycle just about exhausted our
 297                  * UNDO space we may have to do a dummy cycle to move the
 298                  * first_offset up before actually digging into a new cycle,
 299                  * or the new cycle will not have sufficient undo space.
 300                  */
 301                 if (hammer_flusher_undo_exhausted(&hmp->flusher.trans, 3))
 302                         hammer_flusher_finalize(&hmp->flusher.trans, 0);
 303
 304                 /*
 305                  * Ok, we are running this flush group now (this prevents new
 306                  * additions to it).
 307                  */
 308                 flg->running = 1;
 309                 if (hmp->next_flush_group == flg)
 310                         hmp->next_flush_group = TAILQ_NEXT(flg, flush_entry);
 311
 312                 /*
 313                  * Iterate the inodes in the flg's flush_list and assign
 314                  * them to slaves.
 315                  */
 316                 slave_index = 0;
 317                 info = TAILQ_FIRST(&hmp->flusher.ready_list);
 318                 next_ip = TAILQ_FIRST(&flg->flush_list);
 319
 320                 while ((ip = next_ip) != NULL) {
 321                         next_ip = TAILQ_NEXT(ip, flush_entry);
 322
 323                         /*
 324                          * Add ip to the slave's work array.  The slave is
 325                          * not currently running.
 326                          */
 327                         info->work_array[info->count++] = ip;
 328                         if (info->count != HAMMER_FLUSH_GROUP_SIZE)
 329                                 continue;
 330
 331                         /*
 332                          * Get the slave running
 333                          */
 334                         TAILQ_REMOVE(&hmp->flusher.ready_list, info, entry);
 335                         TAILQ_INSERT_TAIL(&hmp->flusher.run_list, info, entry);
 336                         info->flg = flg;
 337                         info->runstate = 1;
 338                         wakeup(&info->runstate);
 339
 340                         /*
 341                          * Get a new slave.  We may have to wait for one to
 342                          * finish running.
 343                          */
 344                         while ((info = TAILQ_FIRST(&hmp->flusher.ready_list)) == NULL) {
 345                                 tsleep(&hmp->flusher.ready_list, 0, "hmrfcc", 0);
 346                         }
 347                 }
 348
 349                 /*
 350                  * Run the current slave if necessary
 351                  */
 352                 if (info->count) {
 353                         TAILQ_REMOVE(&hmp->flusher.ready_list, info, entry);
 354                         TAILQ_INSERT_TAIL(&hmp->flusher.run_list, info, entry);
 355                         info->flg = flg;
 356                         info->runstate = 1;
 357                         wakeup(&info->runstate);
 358                 }
 359
 360                 /*
 361                  * Wait for all slaves to finish running
 362                  */
 363                 while (TAILQ_FIRST(&hmp->flusher.run_list) != NULL)
 364                         tsleep(&hmp->flusher.ready_list, 0, "hmrfcc", 0);
 365
 366                 /*
 367                  * Do the final finalization, clean up
 368                  */
 369                 hammer_flusher_finalize(&hmp->flusher.trans, 1);
 370                 hmp->flusher.tid = hmp->flusher.trans.tid;
 371
 372                 hammer_done_transaction(&hmp->flusher.trans);
 373
 374                 /*
 375                  * Loop up on the same flg.  If the flg is done clean it up
 376                  * and break out.  We only flush one flg.
 377                  */
 378                 if (TAILQ_FIRST(&flg->flush_list) == NULL) {
 379                         KKASSERT(TAILQ_EMPTY(&flg->flush_list));
 380                         KKASSERT(flg->refs == 0);
 381                         TAILQ_REMOVE(&hmp->flush_group_list, flg, flush_entry);
 382                         kfree(flg, hmp->m_misc);
 383                         break;
 384                 }
 385         }
 386
 387         /*
 388          * We may have pure meta-data to flush, or we may have to finish
 389          * cycling the UNDO FIFO, even if there were no flush groups.
 390          */
 391         if (count == 0 && hammer_flusher_haswork(hmp)) {
 392                 hammer_start_transaction_fls(&hmp->flusher.trans, hmp);
 393                 hammer_flusher_finalize(&hmp->flusher.trans, 1);
 394                 hammer_done_transaction(&hmp->flusher.trans);
 395         }
 396
 397         /*
 398          * Clean up any freed big-blocks (typically zone-2).
 399          * resv->flush_group is typically set several flush groups ahead
 400          * of the free to ensure that the freed block is not reused until
 401          * it can no longer be reused.
 402          */
 403         while ((resv = TAILQ_FIRST(&hmp->delay_list)) != NULL) {
 404                 if (resv->flush_group != hmp->flusher.act)
 405                         break;
 406                 hammer_reserve_clrdelay(hmp, resv);
 407         }
 408 }
 409
 410
 411 /*
 412  * The slave flusher thread pulls work off the master flush_list until no
 413  * work is left.
 414  */
 415 static void
 416 hammer_flusher_slave_thread(void *arg)
 417 {
 418         hammer_flush_group_t flg;
 419         hammer_flusher_info_t info;
 420         hammer_mount_t hmp;
 421         hammer_inode_t ip;
 422         int i;
 423
 424         info = arg;
 425         hmp = info->hmp;
 426
 427         for (;;) {
 428                 while (info->runstate == 0)
 429                         tsleep(&info->runstate, 0, "hmrssw", 0);
 430                 if (info->runstate < 0)
 431                         break;
 432                 flg = info->flg;
 433
 434                 for (i = 0; i < info->count; ++i) {
 435                         ip = info->work_array[i];
 436                         hammer_flusher_flush_inode(ip, &hmp->flusher.trans);
 437                         ++hammer_stats_inode_flushes;
 438                 }
 439                 info->count = 0;
 440                 info->runstate = 0;
 441                 TAILQ_REMOVE(&hmp->flusher.run_list, info, entry);
 442                 TAILQ_INSERT_TAIL(&hmp->flusher.ready_list, info, entry);
 443                 wakeup(&hmp->flusher.ready_list);
 444         }
 445         info->td = NULL;
 446         wakeup(&info->td);
 447         lwkt_exit();
 448 }
 449
 450 void
 451 hammer_flusher_clean_loose_ios(hammer_mount_t hmp)
 452 {
 453         hammer_buffer_t buffer;
 454         hammer_io_t io;
 455
 456         /*
 457          * loose ends - buffers without bp's aren't tracked by the kernel
 458          * and can build up, so clean them out.  This can occur when an
 459          * IO completes on a buffer with no references left.
 460          */
 461         if ((io = TAILQ_FIRST(&hmp->lose_list)) != NULL) {
 462                 crit_enter();   /* biodone() race */
 463                 while ((io = TAILQ_FIRST(&hmp->lose_list)) != NULL) {
 464                         KKASSERT(io->mod_list == &hmp->lose_list);
 465                         TAILQ_REMOVE(&hmp->lose_list, io, mod_entry);
 466                         io->mod_list = NULL;
 467                         if (io->lock.refs == 0)
 468                                 ++hammer_count_refedbufs;
 469                         hammer_ref(&io->lock);
 470                         buffer = (void *)io;
 471                         hammer_rel_buffer(buffer, 0);
 472                 }
 473                 crit_exit();
 474         }
 475 }
 476
 477 /*
 478  * Flush a single inode that is part of a flush group.
 479  *
 480  * Flusher errors are extremely serious, even ENOSPC shouldn't occur because
 481  * the front-end should have reserved sufficient space on the media.  Any
 482  * error other then EWOULDBLOCK will force the mount to be read-only.
 483  */
 484 static
 485 void
 486 hammer_flusher_flush_inode(hammer_inode_t ip, hammer_transaction_t trans)
 487 {
 488         hammer_mount_t hmp = ip->hmp;
 489         int error;
 490
 491         hammer_flusher_clean_loose_ios(hmp);
 492         error = hammer_sync_inode(trans, ip);
 493
 494         /*
 495          * EWOULDBLOCK can happen under normal operation, all other errors
 496          * are considered extremely serious.  We must set WOULDBLOCK
 497          * mechanics to deal with the mess left over from the abort of the
 498          * previous flush.
 499          */
 500         if (error) {
 501                 ip->flags |= HAMMER_INODE_WOULDBLOCK;
 502                 if (error == EWOULDBLOCK)
 503                         error = 0;
 504         }
 505         hammer_flush_inode_done(ip, error);
 506         while (hmp->flusher.finalize_want)
 507                 tsleep(&hmp->flusher.finalize_want, 0, "hmrsxx", 0);
 508         if (hammer_flusher_undo_exhausted(trans, 1)) {
 509                 kprintf("HAMMER: Warning: UNDO area too small!\n");
 510                 hammer_flusher_finalize(trans, 1);
 511         } else if (hammer_flusher_meta_limit(trans->hmp)) {
 512                 hammer_flusher_finalize(trans, 0);
 513         }
 514 }
 515
 516 /*
 517  * Return non-zero if the UNDO area has less then (QUARTER / 4) of its
 518  * space left.
 519  *
 520  * 1/4 - Emergency free undo space level.  Below this point the flusher
 521  *       will finalize even if directory dependancies have not been resolved.
 522  *
 523  * 2/4 - Used by the pruning and reblocking code.  These functions may be
 524  *       running in parallel with a flush and cannot be allowed to drop
 525  *       available undo space to emergency levels.
 526  *
 527  * 3/4 - Used at the beginning of a flush to force-sync the volume header
 528  *       to give the flush plenty of runway to work in.
 529  */
 530 int
 531 hammer_flusher_undo_exhausted(hammer_transaction_t trans, int quarter)
 532 {
 533         if (hammer_undo_space(trans) <
 534             hammer_undo_max(trans->hmp) * quarter / 4) {
 535                 return(1);
 536         } else {
 537                 return(0);
 538         }
 539 }
 540
 541 /*
 542  * Flush all pending UNDOs, wait for write completion, update the volume
 543  * header with the new UNDO end position, and flush it.  Then
 544  * asynchronously flush the meta-data.
 545  *
 546  * If this is the last finalization in a flush group we also synchronize
 547  * our cached blockmap and set hmp->flusher_undo_start and our cached undo
 548  * fifo first_offset so the next flush resets the FIFO pointers.
 549  *
 550  * If this is not final it is being called because too many dirty meta-data
 551  * buffers have built up and must be flushed with UNDO synchronization to
 552  * avoid a buffer cache deadlock.
 553  */
 554 void
 555 hammer_flusher_finalize(hammer_transaction_t trans, int final)
 556 {
 557         hammer_volume_t root_volume;
 558         hammer_blockmap_t cundomap, dundomap;
 559         hammer_mount_t hmp;
 560         hammer_io_t io;
 561         int count;
 562         int i;
 563
 564         hmp = trans->hmp;
 565         root_volume = trans->rootvol;
 566
 567         /*
 568          * Exclusively lock the flusher.  This guarantees that all dirty
 569          * buffers will be idled (have a mod-count of 0).
 570          */
 571         ++hmp->flusher.finalize_want;
 572         hammer_lock_ex(&hmp->flusher.finalize_lock);
 573
 574         /*
 575          * If this isn't the final sync several threads may have hit the
 576          * meta-limit at the same time and raced.  Only sync if we really
 577          * have to, after acquiring the lock.
 578          */
 579         if (final == 0 && !hammer_flusher_meta_limit(hmp))
 580                 goto done;
 581
 582         if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
 583                 goto done;
 584
 585         /*
 586          * Flush data buffers.  This can occur asynchronously and at any
 587          * time.  We must interlock against the frontend direct-data write
 588          * but do not have to acquire the sync-lock yet.
 589          */
 590         count = 0;
 591         while ((io = TAILQ_FIRST(&hmp->data_list)) != NULL) {
 592                 if (io->ioerror)
 593                         break;
 594                 if (io->lock.refs == 0)
 595                         ++hammer_count_refedbufs;
 596                 hammer_ref(&io->lock);
 597                 hammer_io_write_interlock(io);
 598                 KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME);
 599                 hammer_io_flush(io);
 600                 hammer_io_done_interlock(io);
 601                 hammer_rel_buffer((hammer_buffer_t)io, 0);
 602                 ++count;
 603         }
 604
 605         /*
 606          * The sync-lock is required for the remaining sequence.  This lock
 607          * prevents meta-data from being modified.
 608          */
 609         hammer_sync_lock_ex(trans);
 610
 611         /*
 612          * If we have been asked to finalize the volume header sync the
 613          * cached blockmap to the on-disk blockmap.  Generate an UNDO
 614          * record for the update.
 615          */
 616         if (final) {
 617                 cundomap = &hmp->blockmap[0];
 618                 dundomap = &root_volume->ondisk->vol0_blockmap[0];
 619                 if (root_volume->io.modified) {
 620                         hammer_modify_volume(trans, root_volume,
 621                                              dundomap, sizeof(hmp->blockmap));
 622                         for (i = 0; i < HAMMER_MAX_ZONES; ++i)
 623                                 hammer_crc_set_blockmap(&cundomap[i]);
 624                         bcopy(cundomap, dundomap, sizeof(hmp->blockmap));
 625                         hammer_modify_volume_done(root_volume);
 626                 }
 627         }
 628
 629         /*
 630          * Flush UNDOs
 631          */
 632         count = 0;
 633         while ((io = TAILQ_FIRST(&hmp->undo_list)) != NULL) {
 634                 if (io->ioerror)
 635                         break;
 636                 KKASSERT(io->modify_refs == 0);
 637                 if (io->lock.refs == 0)
 638                         ++hammer_count_refedbufs;
 639                 hammer_ref(&io->lock);
 640                 KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME);
 641                 hammer_io_flush(io);
 642                 hammer_rel_buffer((hammer_buffer_t)io, 0);
 643                 ++count;
 644         }
 645
 646         /*
 647          * Wait for I/Os to complete
 648          */
 649         hammer_flusher_clean_loose_ios(hmp);
 650         hammer_io_wait_all(hmp, "hmrfl1");
 651
 652         if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
 653                 goto failed;
 654
 655         /*
 656          * Update the on-disk volume header with new UNDO FIFO end position
 657          * (do not generate new UNDO records for this change).  We have to
 658          * do this for the UNDO FIFO whether (final) is set or not.
 659          *
 660          * Also update the on-disk next_tid field.  This does not require
 661          * an UNDO.  However, because our TID is generated before we get
 662          * the sync lock another sync may have beat us to the punch.
 663          *
 664          * This also has the side effect of updating first_offset based on
 665          * a prior finalization when the first finalization of the next flush
 666          * cycle occurs, removing any undo info from the prior finalization
 667          * from consideration.
 668          *
 669          * The volume header will be flushed out synchronously.
 670          */
 671         dundomap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
 672         cundomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
 673
 674         if (dundomap->first_offset != cundomap->first_offset ||
 675                    dundomap->next_offset != cundomap->next_offset) {
 676                 hammer_modify_volume(NULL, root_volume, NULL, 0);
 677                 dundomap->first_offset = cundomap->first_offset;
 678                 dundomap->next_offset = cundomap->next_offset;
 679                 hammer_crc_set_blockmap(dundomap);
 680                 hammer_modify_volume_done(root_volume);
 681         }
 682
 683         /*
 684          * vol0_next_tid is used for TID selection and is updated without
 685          * an UNDO so we do not reuse a TID that may have been rolled-back.
 686          *
 687          * vol0_last_tid is the highest fully-synchronized TID.  It is
 688          * set-up when the UNDO fifo is fully synced, later on (not here).
 689          */
 690         if (root_volume->io.modified) {
 691                 hammer_modify_volume(NULL, root_volume, NULL, 0);
 692                 if (root_volume->ondisk->vol0_next_tid < trans->tid)
 693                         root_volume->ondisk->vol0_next_tid = trans->tid;
 694                 hammer_crc_set_volume(root_volume->ondisk);
 695                 hammer_modify_volume_done(root_volume);
 696                 hammer_io_flush(&root_volume->io);
 697         }
 698
 699         /*
 700          * Wait for I/Os to complete
 701          */
 702         hammer_flusher_clean_loose_ios(hmp);
 703         hammer_io_wait_all(hmp, "hmrfl2");
 704
 705         if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
 706                 goto failed;
 707
 708         /*
 709          * Flush meta-data.  The meta-data will be undone if we crash
 710          * so we can safely flush it asynchronously.
 711          *
 712          * Repeated catchups will wind up flushing this update's meta-data
 713          * and the UNDO buffers for the next update simultaniously.  This
 714          * is ok.
 715          */
 716         count = 0;
 717         while ((io = TAILQ_FIRST(&hmp->meta_list)) != NULL) {
 718                 if (io->ioerror)
 719                         break;
 720                 KKASSERT(io->modify_refs == 0);
 721                 if (io->lock.refs == 0)
 722                         ++hammer_count_refedbufs;
 723                 hammer_ref(&io->lock);
 724                 KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME);
 725                 hammer_io_flush(io);
 726                 hammer_rel_buffer((hammer_buffer_t)io, 0);
 727                 ++count;
 728         }
 729
 730         /*
 731          * If this is the final finalization for the flush group set
 732          * up for the next sequence by setting a new first_offset in
 733          * our cached blockmap and clearing the undo history.
 734          *
 735          * Even though we have updated our cached first_offset, the on-disk
 736          * first_offset still governs available-undo-space calculations.
 737          */
 738         if (final) {
 739                 cundomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
 740                 if (cundomap->first_offset == cundomap->next_offset) {
 741                         hmp->hflags &= ~HMNT_UNDO_DIRTY;
 742                 } else {
 743                         cundomap->first_offset = cundomap->next_offset;
 744                         hmp->hflags |= HMNT_UNDO_DIRTY;
 745                 }
 746                 hammer_clear_undo_history(hmp);
 747
 748                 /*
 749                  * Flush tid sequencing.  flush_tid1 is fully synchronized,
 750                  * meaning a crash will not roll it back.  flush_tid2 has
 751                  * been written out asynchronously and a crash will roll
 752                  * it back.  flush_tid1 is used for all mirroring masters.
 753                  */
 754                 if (hmp->flush_tid1 != hmp->flush_tid2) {
 755                         hmp->flush_tid1 = hmp->flush_tid2;
 756                         wakeup(&hmp->flush_tid1);
 757                 }
 758                 hmp->flush_tid2 = trans->tid;
 759         }
 760
 761         /*
 762          * Cleanup.  Report any critical errors.
 763          */
 764 failed:
 765         hammer_sync_unlock(trans);
 766
 767         if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR) {
 768                 kprintf("HAMMER(%s): Critical write error during flush, "
 769                         "refusing to sync UNDO FIFO\n",
 770                         root_volume->ondisk->vol_name);
 771         }
 772
 773 done:
 774         hammer_unlock(&hmp->flusher.finalize_lock);
 775
 776         if (--hmp->flusher.finalize_want == 0)
 777                 wakeup(&hmp->flusher.finalize_want);
 778         hammer_stats_commits += final;
 779 }
 780
 781 /*
 782  * Return non-zero if too many dirty meta-data buffers have built up.
 783  *
 784  * Since we cannot allow such buffers to flush until we have dealt with
 785  * the UNDOs, we risk deadlocking the kernel's buffer cache.
 786  */
 787 int
 788 hammer_flusher_meta_limit(hammer_mount_t hmp)
 789 {
 790         if (hmp->locked_dirty_space + hmp->io_running_space >
 791             hammer_limit_dirtybufspace) {
 792                 return(1);
 793         }
 794         return(0);
 795 }
 796
 797 /*
 798  * Return non-zero if too many dirty meta-data buffers have built up.
 799  *
 800  * This version is used by background operations (mirror, prune, reblock)
 801  * to leave room for foreground operations.
 802  */
 803 int
 804 hammer_flusher_meta_halflimit(hammer_mount_t hmp)
 805 {
 806         if (hmp->locked_dirty_space + hmp->io_running_space >
 807             hammer_limit_dirtybufspace / 2) {
 808                 return(1);
 809         }
 810         return(0);
 811 }
 812
 813 /*
 814  * Return non-zero if the flusher still has something to flush.
 815  */
 816 int
 817 hammer_flusher_haswork(hammer_mount_t hmp)
 818 {
 819         if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
 820                 return(0);
 821         if (TAILQ_FIRST(&hmp->flush_group_list) ||      /* dirty inodes */
 822             TAILQ_FIRST(&hmp->volu_list) ||             /* dirty bufffers */
 823             TAILQ_FIRST(&hmp->undo_list) ||
 824             TAILQ_FIRST(&hmp->data_list) ||
 825             TAILQ_FIRST(&hmp->meta_list) ||
 826             (hmp->hflags & HMNT_UNDO_DIRTY)             /* UNDO FIFO sync */
 827         ) {
 828                 return(1);
 829         }
 830         return(0);
 831 }
 832