/*
 * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.44 2008/07/19 04:49:39 dillon Exp $
 */
/*
 * HAMMER dependency flusher thread
 *
 * Meta-data updates create buffer dependencies which are arranged as a
 * hierarchy of lists.
 */

#include "hammer.h"
static void hammer_flusher_master_thread(void *arg);
static void hammer_flusher_slave_thread(void *arg);
static void hammer_flusher_flush(hammer_mount_t hmp);
static void hammer_flusher_flush_inode(hammer_inode_t ip,
					hammer_transaction_t trans);
/*
 * Support structures for the flusher threads.
 */
struct hammer_flusher_info {
	TAILQ_ENTRY(hammer_flusher_info) entry;
	struct hammer_mount *hmp;
	thread_t	td;
	int		runstate;
	int		count;
	hammer_flush_group_t flg;
	hammer_inode_t	work_array[HAMMER_FLUSH_GROUP_SIZE];
};

typedef struct hammer_flusher_info *hammer_flusher_info_t;
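
/*
 * Lifecycle sketch (summarizing the code below): each info starts on
 * flusher.ready_list with runstate 0.  The master fills work_array,
 * moves the info to flusher.run_list and sets runstate to 1; the slave
 * flushes its inodes, moves itself back to the ready_list and resets
 * runstate to 0.  A runstate of -1 tells the slave to exit.
 */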
/*
 * Sync all inodes pending on the flusher.
 *
 * All flush groups will be flushed.  This does not queue dirty inodes
 * to the flush groups, it just flushes out what has already been queued!
 */
void
hammer_flusher_sync(hammer_mount_t hmp)
{
	int seq;

	seq = hammer_flusher_async(hmp, NULL);
	hammer_flusher_wait(hmp, seq);
}
/*
 * Sync all inodes pending on the flusher - return immediately.
 *
 * All flush groups will be flushed.
 */
int
hammer_flusher_async(hammer_mount_t hmp, hammer_flush_group_t close_flg)
{
	hammer_flush_group_t flg;
	int seq = hmp->flusher.next;

	TAILQ_FOREACH(flg, &hmp->flush_group_list, flush_entry) {
		if (flg->running == 0)
			++seq;
		flg->closed = 1;
		if (flg == close_flg)
			break;
	}
	if (hmp->flusher.td) {
		if (hmp->flusher.signal++ == 0)
			wakeup(&hmp->flusher.signal);
	} else {
		seq = hmp->flusher.done;
	}
	return(seq);
}
int
hammer_flusher_async_one(hammer_mount_t hmp)
{
	int seq;

	if (hmp->flusher.td) {
		seq = hmp->flusher.next;
		if (hmp->flusher.signal++ == 0)
			wakeup(&hmp->flusher.signal);
	} else {
		seq = hmp->flusher.done;
	}
	return(seq);
}
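
/*
 * Illustrative usage sketch (not code from this file): a caller that
 * wants a synchronous flush pairs the async entry points with
 * hammer_flusher_wait().  hammer_flusher_sync() above is exactly this
 * pattern:
 *
 *	int seq;
 *
 *	seq = hammer_flusher_async_one(hmp);	-- kick off a flush cycle
 *	hammer_flusher_wait(hmp, seq);		-- block until it is done
 */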
/*
 * Wait for the flusher to get to the specified sequence number.
 * Signal the flusher as often as necessary to keep it going.
 */
void
hammer_flusher_wait(hammer_mount_t hmp, int seq)
{
	while ((int)(seq - hmp->flusher.done) > 0) {
		if (hmp->flusher.act != seq) {
			if (hmp->flusher.signal++ == 0)
				wakeup(&hmp->flusher.signal);
		}
		tsleep(&hmp->flusher.done, 0, "hmrfls", 0);
	}
}
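
/*
 * Note on the loop test above: sequence numbers are free-running ints,
 * so the comparison is written as (int)(seq - done) > 0 rather than
 * seq > done to stay correct across wraparound.  Worked example
 * (illustrative values):
 *
 *	seq = 0x80000001, done = 0x7fffffff
 *	(int)(seq - done) == 2 > 0	-- keep waiting
 *
 * A direct signed compare of the two values would give the wrong
 * answer at the wrap point.
 */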
void
hammer_flusher_create(hammer_mount_t hmp)
{
	hammer_flusher_info_t info;
	int i;

	hmp->flusher.signal = 0;
	hmp->flusher.act = 0;
	hmp->flusher.done = 0;
	hmp->flusher.next = 1;
	hammer_ref(&hmp->flusher.finalize_lock);
	TAILQ_INIT(&hmp->flusher.run_list);
	TAILQ_INIT(&hmp->flusher.ready_list);

	lwkt_create(hammer_flusher_master_thread, hmp,
		    &hmp->flusher.td, NULL, 0, -1, "hammer-M");
	for (i = 0; i < HAMMER_MAX_FLUSHERS; ++i) {
		info = kmalloc(sizeof(*info), M_HAMMER, M_WAITOK|M_ZERO);
		info->hmp = hmp;
		TAILQ_INSERT_TAIL(&hmp->flusher.ready_list, info, entry);
		lwkt_create(hammer_flusher_slave_thread, info,
			    &info->td, NULL, 0, -1, "hammer-S%d", i);
	}
}
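
/*
 * Design note: a single master ("hammer-M" in process listings) owns
 * the flush sequencing while HAMMER_MAX_FLUSHERS slaves ("hammer-S%d")
 * do the per-inode work, letting one flush group's inodes be flushed
 * in parallel.
 */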
void
hammer_flusher_destroy(hammer_mount_t hmp)
{
	hammer_flusher_info_t info;

	/*
	 * Kill the master
	 */
	hmp->flusher.exiting = 1;
	while (hmp->flusher.td) {
		++hmp->flusher.signal;
		wakeup(&hmp->flusher.signal);
		tsleep(&hmp->flusher.exiting, 0, "hmrwex", hz);
	}

	/*
	 * Kill the slaves
	 */
	while ((info = TAILQ_FIRST(&hmp->flusher.ready_list)) != NULL) {
		KKASSERT(info->runstate == 0);
		TAILQ_REMOVE(&hmp->flusher.ready_list, info, entry);
		info->runstate = -1;
		wakeup(&info->runstate);
		while (info->td)
			tsleep(&info->td, 0, "hmrwwc", 0);
		kfree(info, M_HAMMER);
	}
}
/*
 * The master flusher thread manages the flusher sequence id and
 * synchronization with the slave work threads.
 */
static void
hammer_flusher_master_thread(void *arg)
{
	hammer_flush_group_t flg;
	hammer_mount_t hmp;

	hmp = arg;

	for (;;) {
		/*
		 * Do at least one flush cycle.  We may have to update the
		 * UNDO FIFO even if no inodes are queued.
		 */
		for (;;) {
			while (hmp->flusher.group_lock)
				tsleep(&hmp->flusher.group_lock, 0, "hmrhld", 0);
			hmp->flusher.act = hmp->flusher.next;
			++hmp->flusher.next;
			hammer_flusher_clean_loose_ios(hmp);
			hammer_flusher_flush(hmp);
			hmp->flusher.done = hmp->flusher.act;
			wakeup(&hmp->flusher.done);
			flg = TAILQ_FIRST(&hmp->flush_group_list);
			if (flg == NULL || flg->closed == 0)
				break;
			if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
				break;
		}

		/*
		 * Wait for activity.
		 */
		if (hmp->flusher.exiting && TAILQ_EMPTY(&hmp->flush_group_list))
			break;
		while (hmp->flusher.signal == 0)
			tsleep(&hmp->flusher.signal, 0, "hmrwwa", 0);
		hmp->flusher.signal = 0;
	}

	/*
	 * And we are done.
	 */
	hmp->flusher.td = NULL;
	wakeup(&hmp->flusher.exiting);
	lwkt_exit();
}
/*
 * Flush all inodes in the current flush group.
 */
static void
hammer_flusher_flush(hammer_mount_t hmp)
{
	hammer_flusher_info_t info;
	hammer_flush_group_t flg;
	hammer_reserve_t resv;
	hammer_inode_t ip;
	hammer_inode_t next_ip;
	int count;

	/*
	 * Just in case there's a flush race on mount.
	 */
	if (TAILQ_FIRST(&hmp->flusher.ready_list) == NULL)
		return;

	/*
	 * We only do one flg but we may have to loop/retry.
	 */
	count = 0;
	while ((flg = TAILQ_FIRST(&hmp->flush_group_list)) != NULL) {
		++count;
		if (hammer_debug_general & 0x0001) {
			kprintf("hammer_flush %d ttl=%d recs=%d\n",
				hmp->flusher.act,
				flg->total_count, flg->refs);
		}
		if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
			break;
		hammer_start_transaction_fls(&hmp->flusher.trans, hmp);

		/*
		 * If the previous flush cycle just about exhausted our
		 * UNDO space we may have to do a dummy cycle to move the
		 * first_offset up before actually digging into a new cycle,
		 * or the new cycle will not have sufficient undo space.
		 */
		if (hammer_flusher_undo_exhausted(&hmp->flusher.trans, 3))
			hammer_flusher_finalize(&hmp->flusher.trans, 0);

		/*
		 * Ok, we are running this flush group now (this prevents new
		 * additions to it).
		 */
		flg->running = 1;
		if (hmp->next_flush_group == flg)
			hmp->next_flush_group = TAILQ_NEXT(flg, flush_entry);

		/*
		 * Iterate the inodes in the flg's flush_list and assign
		 * them to slaves.
		 */
		info = TAILQ_FIRST(&hmp->flusher.ready_list);
		next_ip = TAILQ_FIRST(&flg->flush_list);

		while ((ip = next_ip) != NULL) {
			next_ip = TAILQ_NEXT(ip, flush_entry);

			/*
			 * Add ip to the slave's work array.  The slave is
			 * not currently running.
			 */
			info->work_array[info->count++] = ip;
			if (info->count != HAMMER_FLUSH_GROUP_SIZE)
				continue;

			/*
			 * Get the slave running.
			 */
			TAILQ_REMOVE(&hmp->flusher.ready_list, info, entry);
			TAILQ_INSERT_TAIL(&hmp->flusher.run_list, info, entry);
			info->flg = flg;
			info->runstate = 1;
			wakeup(&info->runstate);

			/*
			 * Get a new slave.  We may have to wait for one to
			 * finish running.
			 */
			while ((info = TAILQ_FIRST(&hmp->flusher.ready_list)) == NULL) {
				tsleep(&hmp->flusher.ready_list, 0, "hmrfcc", 0);
			}
		}

		/*
		 * Run the current slave if necessary.
		 */
		if (info->count) {
			TAILQ_REMOVE(&hmp->flusher.ready_list, info, entry);
			TAILQ_INSERT_TAIL(&hmp->flusher.run_list, info, entry);
			info->flg = flg;
			info->runstate = 1;
			wakeup(&info->runstate);
		}

		/*
		 * Wait for all slaves to finish running.
		 */
		while (TAILQ_FIRST(&hmp->flusher.run_list) != NULL)
			tsleep(&hmp->flusher.ready_list, 0, "hmrfcc", 0);

		/*
		 * Do the final finalization, clean up.
		 */
		hammer_flusher_finalize(&hmp->flusher.trans, 1);
		hmp->flusher.tid = hmp->flusher.trans.tid;

		hammer_done_transaction(&hmp->flusher.trans);

		/*
		 * Loop up on the same flg.  If the flg is done clean it up
		 * and break out.  We only flush one flg.
		 */
		if (TAILQ_FIRST(&flg->flush_list) == NULL) {
			KKASSERT(flg->refs == 0);
			TAILQ_REMOVE(&hmp->flush_group_list, flg, flush_entry);
			kfree(flg, M_HAMMER);
			break;
		}
	}

	/*
	 * We may have pure meta-data to flush, or we may have to finish
	 * cycling the UNDO FIFO, even if there were no flush groups.
	 */
	if (count == 0 && hammer_flusher_haswork(hmp)) {
		hammer_start_transaction_fls(&hmp->flusher.trans, hmp);
		hammer_flusher_finalize(&hmp->flusher.trans, 1);
		hammer_done_transaction(&hmp->flusher.trans);
	}

	/*
	 * Clean up any freed big-blocks (typically zone-2).
	 * resv->flush_group is typically set several flush groups ahead
	 * of the free to ensure that the freed block is not reused until
	 * it can no longer be reused.
	 */
	while ((resv = TAILQ_FIRST(&hmp->delay_list)) != NULL) {
		if (resv->flush_group != hmp->flusher.act)
			break;
		hammer_reserve_clrdelay(hmp, resv);
	}
}
/*
 * The slave flusher thread pulls work off the master flush_list until no
 * work is left.
 */
static void
hammer_flusher_slave_thread(void *arg)
{
	hammer_flush_group_t flg;
	hammer_flusher_info_t info;
	hammer_mount_t hmp;
	hammer_inode_t ip;
	int i;

	info = arg;
	hmp = info->hmp;

	for (;;) {
		while (info->runstate == 0)
			tsleep(&info->runstate, 0, "hmrssw", 0);
		if (info->runstate < 0)
			break;
		flg = info->flg;

		for (i = 0; i < info->count; ++i) {
			ip = info->work_array[i];
			hammer_flusher_flush_inode(ip, &hmp->flusher.trans);
			++hammer_stats_inode_flushes;
		}
		info->count = 0;
		info->runstate = 0;
		TAILQ_REMOVE(&hmp->flusher.run_list, info, entry);
		TAILQ_INSERT_TAIL(&hmp->flusher.ready_list, info, entry);
		wakeup(&hmp->flusher.ready_list);
	}
	info->td = NULL;
	wakeup(&info->td);
	lwkt_exit();
}
void
hammer_flusher_clean_loose_ios(hammer_mount_t hmp)
{
	hammer_buffer_t buffer;
	hammer_io_t io;

	/*
	 * Loose ends - buffers without bp's aren't tracked by the kernel
	 * and can build up, so clean them out.  This can occur when an
	 * IO completes on a buffer with no references left.
	 */
	if ((io = TAILQ_FIRST(&hmp->lose_list)) != NULL) {
		crit_enter();	/* biodone() race */
		while ((io = TAILQ_FIRST(&hmp->lose_list)) != NULL) {
			KKASSERT(io->mod_list == &hmp->lose_list);
			TAILQ_REMOVE(&hmp->lose_list, io, mod_entry);
			io->mod_list = NULL;
			if (io->lock.refs == 0)
				++hammer_count_refedbufs;
			hammer_ref(&io->lock);
			buffer = (void *)io;
			hammer_rel_buffer(buffer, 0);
		}
		crit_exit();
	}
}
/*
 * Flush a single inode that is part of a flush group.
 *
 * Flusher errors are extremely serious, even ENOSPC shouldn't occur because
 * the front-end should have reserved sufficient space on the media.  Any
 * error other than EWOULDBLOCK will force the mount to be read-only.
 */
static
void
hammer_flusher_flush_inode(hammer_inode_t ip, hammer_transaction_t trans)
{
	hammer_mount_t hmp = ip->hmp;
	int error;

	hammer_flusher_clean_loose_ios(hmp);
	error = hammer_sync_inode(trans, ip);

	/*
	 * EWOULDBLOCK can happen under normal operation, all other errors
	 * are considered extremely serious.  We must set WOULDBLOCK
	 * mechanics to deal with the mess left over from the abort of the
	 * previous flush.
	 */
	if (error) {
		ip->flags |= HAMMER_INODE_WOULDBLOCK;
		if (error == EWOULDBLOCK)
			error = 0;
	}
	hammer_flush_inode_done(ip, error);
	while (hmp->flusher.finalize_want)
		tsleep(&hmp->flusher.finalize_want, 0, "hmrsxx", 0);
	if (hammer_flusher_undo_exhausted(trans, 1)) {
		kprintf("HAMMER: Warning: UNDO area too small!\n");
		hammer_flusher_finalize(trans, 1);
	} else if (hammer_flusher_meta_limit(trans->hmp)) {
		hammer_flusher_finalize(trans, 0);
	}
}
/*
 * Return non-zero if the UNDO area has less than (quarter / 4) of its
 * space left.
 *
 * 1/4 - Emergency free undo space level.  Below this point the flusher
 *	 will finalize even if directory dependencies have not been resolved.
 *
 * 2/4 - Used by the pruning and reblocking code.  These functions may be
 *	 running in parallel with a flush and cannot be allowed to drop
 *	 available undo space to emergency levels.
 *
 * 3/4 - Used at the beginning of a flush to force-sync the volume header
 *	 to give the flush plenty of runway to work in.
 */
int
hammer_flusher_undo_exhausted(hammer_transaction_t trans, int quarter)
{
	if (hammer_undo_space(trans) <
	    hammer_undo_max(trans->hmp) * quarter / 4) {
		return(1);
	} else {
		return(0);
	}
}
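
/*
 * Worked example (illustrative numbers only): with hammer_undo_max()
 * returning 1GB and quarter == 3, the test fires as soon as
 * hammer_undo_space() drops below 768MB, i.e. once more than a quarter
 * of the UNDO FIFO is in use.
 */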
/*
 * Flush all pending UNDOs, wait for write completion, update the volume
 * header with the new UNDO end position, and flush it.  Then
 * asynchronously flush the meta-data.
 *
 * If this is the last finalization in a flush group we also synchronize
 * our cached blockmap and set hmp->flusher_undo_start and our cached undo
 * fifo first_offset so the next flush resets the FIFO pointers.
 *
 * If this is not final it is being called because too many dirty meta-data
 * buffers have built up and must be flushed with UNDO synchronization to
 * avoid a buffer cache deadlock.
 */
void
hammer_flusher_finalize(hammer_transaction_t trans, int final)
{
	hammer_volume_t root_volume;
	hammer_blockmap_t cundomap, dundomap;
	hammer_mount_t hmp;
	hammer_io_t io;
	int count;
	int i;

	hmp = trans->hmp;
	root_volume = trans->rootvol;

	/*
	 * Exclusively lock the flusher.  This guarantees that all dirty
	 * buffers will be idled (have a mod-count of 0).
	 */
	++hmp->flusher.finalize_want;
	hammer_lock_ex(&hmp->flusher.finalize_lock);

	/*
	 * If this isn't the final sync several threads may have hit the
	 * meta-limit at the same time and raced.  Only sync if we really
	 * have to, after acquiring the lock.
	 */
	if (final == 0 && !hammer_flusher_meta_limit(hmp))
		goto done;

	if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
		goto done;

	/*
	 * Flush data buffers.  This can occur asynchronously and at any
	 * time.  We must interlock against the frontend direct-data write
	 * but do not have to acquire the sync-lock yet.
	 */
	count = 0;
	while ((io = TAILQ_FIRST(&hmp->data_list)) != NULL) {
		if (io->lock.refs == 0)
			++hammer_count_refedbufs;
		hammer_ref(&io->lock);
		hammer_io_write_interlock(io);
		KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME);
		hammer_io_flush(io);
		hammer_io_done_interlock(io);
		hammer_rel_buffer((hammer_buffer_t)io, 0);
		++count;
	}

	/*
	 * The sync-lock is required for the remaining sequence.  This lock
	 * prevents meta-data from being modified.
	 */
	hammer_sync_lock_ex(trans);

	/*
	 * If we have been asked to finalize the volume header sync the
	 * cached blockmap to the on-disk blockmap.  Generate an UNDO
	 * record for the update.
	 */
	if (final) {
		cundomap = &hmp->blockmap[0];
		dundomap = &root_volume->ondisk->vol0_blockmap[0];
		if (root_volume->io.modified) {
			hammer_modify_volume(trans, root_volume,
					     dundomap, sizeof(hmp->blockmap));
			for (i = 0; i < HAMMER_MAX_ZONES; ++i)
				hammer_crc_set_blockmap(&cundomap[i]);
			bcopy(cundomap, dundomap, sizeof(hmp->blockmap));
			hammer_modify_volume_done(root_volume);
		}
	}

	/*
	 * Flush UNDOs.
	 */
	count = 0;
	while ((io = TAILQ_FIRST(&hmp->undo_list)) != NULL) {
		KKASSERT(io->modify_refs == 0);
		if (io->lock.refs == 0)
			++hammer_count_refedbufs;
		hammer_ref(&io->lock);
		KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME);
		hammer_io_flush(io);
		hammer_rel_buffer((hammer_buffer_t)io, 0);
		++count;
	}

	/*
	 * Wait for I/Os to complete.
	 */
	hammer_flusher_clean_loose_ios(hmp);
	hammer_io_wait_all(hmp, "hmrfl1");

	if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
		goto failed;

	/*
	 * Update the on-disk volume header with new UNDO FIFO end position
	 * (do not generate new UNDO records for this change).  We have to
	 * do this for the UNDO FIFO whether (final) is set or not.
	 *
	 * Also update the on-disk next_tid field.  This does not require
	 * an UNDO.  However, because our TID is generated before we get
	 * the sync lock another sync may have beat us to the punch.
	 *
	 * This also has the side effect of updating first_offset based on
	 * a prior finalization when the first finalization of the next flush
	 * cycle occurs, removing any undo info from the prior finalization
	 * from consideration.
	 *
	 * The volume header will be flushed out synchronously.
	 */
	dundomap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
	cundomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];

	if (dundomap->first_offset != cundomap->first_offset ||
	    dundomap->next_offset != cundomap->next_offset) {
		hammer_modify_volume(NULL, root_volume, NULL, 0);
		dundomap->first_offset = cundomap->first_offset;
		dundomap->next_offset = cundomap->next_offset;
		hammer_crc_set_blockmap(dundomap);
		hammer_modify_volume_done(root_volume);
	}

	if (root_volume->io.modified) {
		hammer_modify_volume(NULL, root_volume, NULL, 0);
		if (root_volume->ondisk->vol0_next_tid < trans->tid)
			root_volume->ondisk->vol0_next_tid = trans->tid;
		hammer_crc_set_volume(root_volume->ondisk);
		hammer_modify_volume_done(root_volume);
		hammer_io_flush(&root_volume->io);
	}

	/*
	 * Wait for I/Os to complete.
	 */
	hammer_flusher_clean_loose_ios(hmp);
	hammer_io_wait_all(hmp, "hmrfl2");
	if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
		goto failed;

	/*
	 * Flush meta-data.  The meta-data will be undone if we crash
	 * so we can safely flush it asynchronously.
	 *
	 * Repeated catchups will wind up flushing this update's meta-data
	 * and the UNDO buffers for the next update simultaneously.  This
	 * is ok.
	 */
	count = 0;
	while ((io = TAILQ_FIRST(&hmp->meta_list)) != NULL) {
		KKASSERT(io->modify_refs == 0);
		if (io->lock.refs == 0)
			++hammer_count_refedbufs;
		hammer_ref(&io->lock);
		KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME);
		hammer_io_flush(io);
		hammer_rel_buffer((hammer_buffer_t)io, 0);
		++count;
	}

	/*
	 * If this is the final finalization for the flush group set
	 * up for the next sequence by setting a new first_offset in
	 * our cached blockmap and clearing the undo history.
	 *
	 * Even though we have updated our cached first_offset, the on-disk
	 * first_offset still governs available-undo-space calculations.
	 */
	if (final) {
		cundomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
		if (cundomap->first_offset == cundomap->next_offset) {
			hmp->hflags &= ~HMNT_UNDO_DIRTY;
		} else {
			cundomap->first_offset = cundomap->next_offset;
			hmp->hflags |= HMNT_UNDO_DIRTY;
		}
		hammer_clear_undo_history(hmp);
	}

	/*
	 * Cleanup.  Report any critical errors.
	 */
failed:
	hammer_sync_unlock(trans);

	if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR) {
		kprintf("HAMMER(%s): Critical write error during flush, "
			"refusing to sync UNDO FIFO\n",
			root_volume->ondisk->vol_name);
	}

done:
	hammer_unlock(&hmp->flusher.finalize_lock);
	if (--hmp->flusher.finalize_want == 0)
		wakeup(&hmp->flusher.finalize_want);
	hammer_stats_commits += final;
}
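
/*
 * Ordering recap for the function above: UNDO buffers and the volume
 * header are flushed and waited on before any meta-data buffers go out
 * asynchronously.  If the machine crashes mid meta-data flush, recovery
 * can replay the UNDO FIFO delimited by first_offset/next_offset in the
 * volume header and back the partial update out.
 */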
/*
 * Return non-zero if too many dirty meta-data buffers have built up.
 *
 * Since we cannot allow such buffers to flush until we have dealt with
 * the UNDOs, we risk deadlocking the kernel's buffer cache.
 */
int
hammer_flusher_meta_limit(hammer_mount_t hmp)
{
	if (hmp->locked_dirty_space + hmp->io_running_space >
	    hammer_limit_dirtybufspace) {
		return(1);
	}
	return(0);
}
/*
 * Return non-zero if too many dirty meta-data buffers have built up.
 *
 * This version is used by background operations (mirror, prune, reblock)
 * to leave room for foreground operations.
 */
int
hammer_flusher_meta_halflimit(hammer_mount_t hmp)
{
	if (hmp->locked_dirty_space + hmp->io_running_space >
	    hammer_limit_dirtybufspace / 2) {
		return(1);
	}
	return(0);
}
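
/*
 * Illustrative back-off sketch (assumed caller, not code from this
 * file): a background operation would poll the half-limit and kick the
 * flusher before dirtying more buffers, e.g.:
 *
 *	while (hammer_flusher_meta_halflimit(hmp))
 *		hammer_flusher_wait(hmp, hammer_flusher_async_one(hmp));
 */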
/*
 * Return non-zero if the flusher still has something to flush.
 */
int
hammer_flusher_haswork(hammer_mount_t hmp)
{
	if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
		return(0);
	if (TAILQ_FIRST(&hmp->flush_group_list) ||	/* dirty inodes */
	    TAILQ_FIRST(&hmp->volu_list) ||		/* dirty buffers */
	    TAILQ_FIRST(&hmp->undo_list) ||
	    TAILQ_FIRST(&hmp->data_list) ||
	    TAILQ_FIRST(&hmp->meta_list) ||
	    (hmp->hflags & HMNT_UNDO_DIRTY)		/* UNDO FIFO sync */
	) {
		return(1);
	}
	return(0);
}