struct hammer_flush_group {
TAILQ_ENTRY(hammer_flush_group) flush_entry;
struct hammer_fls_rb_tree flush_tree;
- int unused01;
+ int seq; /* our seq no */
int total_count; /* record load */
int running; /* group is running */
int closed;
int count;
};
-#define HAMMER_RECLAIM_WAIT 4000 /* default vfs.hammer.limit_reclaim */
-
/*
* Track who is creating the greatest burden on the
* inode cache.
int signal; /* flusher thread sequencer */
int act; /* currently active flush group */
int done; /* set to act when complete */
- int next; /* next flush group */
+ int next; /* next unallocated flg seqno */
int group_lock; /* lock sequencing of the next flush */
int exiting; /* request master exit */
thread_t td; /* master flusher thread */
TAILQ_HEAD(, hammer_undo) undo_lru_list;
TAILQ_HEAD(, hammer_reserve) delay_list;
struct hammer_flush_group_list flush_group_list;
+ hammer_flush_group_t fill_flush_group;
hammer_flush_group_t next_flush_group;
TAILQ_HEAD(, hammer_objid_cache) objid_cache_list;
TAILQ_HEAD(, hammer_dedup_cache) dedup_lru_list;
}
/*
- * Sync all inodes pending on the flusher - return immediately.
+ * Sync all flush groups through to close_flg - return immediately.
+ * If close_flg is NULL all flush groups are synced.
*
- * All flush groups will be flushed.
+ * Returns the sequence number of the last closed flush group,
+ * which may be close_flg. When syncing to the end if there
+ * are no flush groups pending we still cycle the flusher, so
+ * we return the next seq number not yet allocated.
*/
int
hammer_flusher_async(hammer_mount_t hmp, hammer_flush_group_t close_flg)
{
hammer_flush_group_t flg;
- int seq = hmp->flusher.next;
+ int seq;
+
+ /*
+ * Already closed
+ */
+ if (close_flg && close_flg->closed)
+ return(close_flg->seq);
- TAILQ_FOREACH(flg, &hmp->flush_group_list, flush_entry) {
- if (flg->running == 0)
- ++seq;
+ /*
+ * Close flush groups until we hit the end of the list
+ * or close_flg.
+ */
+ while ((flg = hmp->next_flush_group) != NULL) {
+ KKASSERT(flg->closed == 0 && flg->running == 0);
flg->closed = 1;
+ hmp->next_flush_group = TAILQ_NEXT(flg, flush_entry);
if (flg == close_flg)
break;
}
+
if (hmp->flusher.td) {
if (hmp->flusher.signal++ == 0)
wakeup(&hmp->flusher.signal);
+ seq = flg ? flg->seq : hmp->flusher.next;
} else {
seq = hmp->flusher.done;
}
return(seq);
}
+/*
+ * Flush the current/next flushable flg. This function is typically called
+ * in a loop along with hammer_flusher_wait(hmp, returned_seq) to iterate
+ * flush groups until specific conditions are met.
+ *
+ * If a flush is currently in progress its seq is returned.
+ *
+ * If no flush is currently in progress the next available flush group
+ * will be flushed and its seq returned.
+ *
+ * If no flush groups are present a dummy seq will be allocated and
+ * returned and the flusher will be activated (e.g. to flush the
+ * undo/redo and the volume header).
+ */
int
hammer_flusher_async_one(hammer_mount_t hmp)
{
+ hammer_flush_group_t flg;
int seq;
if (hmp->flusher.td) {
- seq = hmp->flusher.next;
- if (hmp->flusher.signal++ == 0)
- wakeup(&hmp->flusher.signal);
+ flg = TAILQ_FIRST(&hmp->flush_group_list);
+ seq = hammer_flusher_async(hmp, flg);
} else {
seq = hmp->flusher.done;
}
hammer_flusher_wait(hammer_mount_t hmp, int seq)
{
while ((int)(seq - hmp->flusher.done) > 0) {
- if (hmp->flusher.act != seq) {
+ if ((int)(seq - hmp->flusher.act) > 0) {
if (hmp->flusher.signal++ == 0)
wakeup(&hmp->flusher.signal);
}
for (;;) {
/*
- * Do at least one flush cycle. We may have to update the
- * UNDO FIFO even if no inodes are queued.
+ * Flush all closed flgs. If no flgs are closed we still
+ * do at least one flush cycle as we may have to update
+ * the UNDO FIFO even if no inodes are queued.
*/
for (;;) {
while (hmp->flusher.group_lock)
tsleep(&hmp->flusher.group_lock, 0, "hmrhld", 0);
- hmp->flusher.act = hmp->flusher.next;
- ++hmp->flusher.next;
hammer_flusher_clean_loose_ios(hmp);
hammer_flusher_flush(hmp);
hmp->flusher.done = hmp->flusher.act;
/*
* Just in-case there's a flush race on mount
*/
- if (TAILQ_FIRST(&hmp->flusher.ready_list) == NULL)
+ if (TAILQ_FIRST(&hmp->flusher.ready_list) == NULL) {
return;
+ }
+
+ /*
+ * Set the actively flushing sequence number. If no flushable
+ * groups are present allocate a dummy sequence number for the
+ * operation.
+ */
+ flg = TAILQ_FIRST(&hmp->flush_group_list);
+ if (flg == NULL) {
+ hmp->flusher.act = hmp->flusher.next;
+ ++hmp->flusher.next;
+ } else if (flg->closed) {
+ KKASSERT(flg->running == 0);
+ flg->running = 1;
+ hmp->flusher.act = flg->seq;
+ if (hmp->fill_flush_group == flg)
+ hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry);
+ }
/*
* We only do one flg but we may have to loop/retry.
+ *
+ * Due to various races it is possible to come across a flush
+ * group which has not yet been closed.
*/
count = 0;
- while ((flg = TAILQ_FIRST(&hmp->flush_group_list)) != NULL) {
+ while (flg && flg->running) {
++count;
if (hammer_debug_general & 0x0001) {
kprintf("hammer_flush %d ttl=%d recs=%d\n",
hmp->flusher.act,
- flg->total_count, flg->refs);
+ flg->total_count,
+ flg->refs);
}
if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
break;
if (hammer_flusher_undo_exhausted(&hmp->flusher.trans, 3))
hammer_flusher_finalize(&hmp->flusher.trans, 0);
- /*
- * Ok, we are running this flush group now (this prevents new
- * additions to it).
- */
- flg->running = 1;
- if (hmp->next_flush_group == flg)
- hmp->next_flush_group = TAILQ_NEXT(flg, flush_entry);
+ KKASSERT(hmp->next_flush_group != flg);
/*
* Iterate the inodes in the flg's flush_tree and assign
kfree(flg, hmp->m_misc);
break;
}
+ KKASSERT(TAILQ_FIRST(&hmp->flush_group_list) == flg);
}
/*
int good;
/*
- * next_flush_group is the first flush group we can place the inode
- * in. It may be NULL. If it becomes full we append a new flush
- * group and make that the next_flush_group.
+ * fill_flush_group is the first flush group we may be able to
+ * continue filling; it may be open or closed but it will always
+ * be past the currently flushing (running) flg.
+ *
+ * next_flush_group is the next open flush group.
*/
hmp = ip->hmp;
- while ((flg = hmp->next_flush_group) != NULL) {
+ while ((flg = hmp->fill_flush_group) != NULL) {
KKASSERT(flg->running == 0);
- if (flg->total_count + flg->refs <= ip->hmp->undo_rec_limit)
+ if (flg->total_count + flg->refs <= ip->hmp->undo_rec_limit &&
+ flg->total_count <= hammer_autoflush) {
break;
- hmp->next_flush_group = TAILQ_NEXT(flg, flush_entry);
+ }
+ hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry);
hammer_flusher_async(ip->hmp, flg);
}
if (flg == NULL) {
flg = kmalloc(sizeof(*flg), hmp->m_misc, M_WAITOK|M_ZERO);
- hmp->next_flush_group = flg;
+ flg->seq = hmp->flusher.next++;
+ if (hmp->next_flush_group == NULL)
+ hmp->next_flush_group = flg;
+ if (hmp->fill_flush_group == NULL)
+ hmp->fill_flush_group = flg;
RB_INIT(&flg->flush_tree);
TAILQ_INSERT_TAIL(&hmp->flush_group_list, flg, flush_entry);
}
static void
hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
{
+ hammer_mount_t hmp = ip->hmp;
int go_count;
/*
hammer_ref(&ip->lock);
ip->flush_state = HAMMER_FST_FLUSH;
ip->flush_group = flg;
- ++ip->hmp->flusher.group_lock;
- ++ip->hmp->count_iqueued;
+ ++hmp->flusher.group_lock;
+ ++hmp->count_iqueued;
++hammer_count_iqueued;
++flg->total_count;
hammer_redo_fifo_start_flush(ip);
- /*
- * If the flush group reaches the autoflush limit we want to signal
- * the flusher. This is particularly important for remove()s.
- *
- * If the default hammer_limit_reclaim is changed via sysctl
- * make sure we don't hit a degenerate case where we don't start
- * a flush but blocked on further inode ops.
- */
- if (flg->total_count == hammer_autoflush ||
- flg->total_count >= hammer_limit_reclaim / 4)
- flags |= HAMMER_FLUSH_SIGNAL;
-
#if 0
/*
* We need to be able to vfsync/truncate from the backend.
*/
if (go_count == 0) {
if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
- --ip->hmp->count_iqueued;
+ --hmp->count_iqueued;
--hammer_count_iqueued;
--flg->total_count;
ip->flush_state = HAMMER_FST_SETUP;
ip->flush_group = NULL;
+ if (flags & HAMMER_FLUSH_SIGNAL) {
+ ip->flags |= HAMMER_INODE_REFLUSH |
+ HAMMER_INODE_RESIGNAL;
+ } else {
+ ip->flags |= HAMMER_INODE_REFLUSH;
+ }
#if 0
if (ip->flags & HAMMER_INODE_VHELD) {
ip->flags &= ~HAMMER_INODE_VHELD;
* when an inode is in SETUP.
*/
ip->flags |= HAMMER_INODE_REFLUSH;
- if (flags & HAMMER_FLUSH_SIGNAL) {
- ip->flags |= HAMMER_INODE_RESIGNAL;
- hammer_flusher_async(ip->hmp, flg);
- }
- if (--ip->hmp->flusher.group_lock == 0)
- wakeup(&ip->hmp->flusher.group_lock);
+ if (--hmp->flusher.group_lock == 0)
+ wakeup(&hmp->flusher.group_lock);
return;
}
}
*/
KKASSERT(flg->running == 0);
RB_INSERT(hammer_fls_rb_tree, &flg->flush_tree, ip);
- if (--ip->hmp->flusher.group_lock == 0)
- wakeup(&ip->hmp->flusher.group_lock);
+ if (--hmp->flusher.group_lock == 0)
+ wakeup(&hmp->flusher.group_lock);
- if (flags & HAMMER_FLUSH_SIGNAL) {
- hammer_flusher_async(ip->hmp, flg);
+ /*
+ * Auto-flush the group if it grows too large. Make sure the
+ * inode reclaim wait pipeline continues to work.
+ */
+ if (flg->total_count >= hammer_autoflush ||
+ flg->total_count >= hammer_limit_reclaim / 4) {
+ if (hmp->fill_flush_group == flg)
+ hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry);
+ hammer_flusher_async(hmp, flg);
}
}
--hmp->inode_reclaims;
ip->flags &= ~HAMMER_INODE_RECLAIM;
- while ((reclaim = TAILQ_FIRST(&hmp->reclaim_list)) != NULL) {
- if (reclaim->count > 0 && --reclaim->count == 0) {
+ if ((reclaim = TAILQ_FIRST(&hmp->reclaim_list)) != NULL) {
+ KKASSERT(reclaim->count > 0);
+ if (--reclaim->count == 0) {
TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
wakeup(reclaim);
}
- if (hmp->inode_reclaims > hammer_limit_reclaim / 2)
- break;
}
}
*
* When we block we don't care *which* inode has finished reclaiming,
* as lone as one does.
+ *
+ * The reclaim pipeline is primarily governed by the auto-flush which is
+ * 1/4 hammer_limit_reclaim. We don't want to block if the count is
+ * less than 1/2 hammer_limit_reclaim. From 1/2 to full count is
+ * dynamically governed.
*/
void
hammer_inode_waitreclaims(hammer_transaction_t trans)
{
hammer_mount_t hmp = trans->hmp;
struct hammer_reclaim reclaim;
+ int lower_limit;
/*
- * Track inode load
+ * Track inode load, delay if the number of reclaiming inodes is
+ * between 2/4 and 4/4 hammer_limit_reclaim, depending on the caller's load.
*/
if (curthread->td_proc) {
struct hammer_inostats *stats;
- int lower_limit;
stats = hammer_inode_inostats(hmp, curthread->td_proc->p_pid);
++stats->count;
if (stats->count > hammer_limit_reclaim / 2)
stats->count = hammer_limit_reclaim / 2;
lower_limit = hammer_limit_reclaim - stats->count;
- if (hammer_debug_general & 0x10000)
- kprintf("pid %5d limit %d\n", (int)curthread->td_proc->p_pid, lower_limit);
-
- if (hmp->inode_reclaims < lower_limit)
- return;
+ if (hammer_debug_general & 0x10000) {
+ kprintf("pid %5d limit %d\n",
+ (int)curthread->td_proc->p_pid, lower_limit);
+ }
} else {
- /*
- * Default mode
- */
- if (hmp->inode_reclaims < hammer_limit_reclaim)
- return;
+ lower_limit = hammer_limit_reclaim * 3 / 4;
+ }
+ if (hmp->inode_reclaims >= lower_limit) {
+ reclaim.count = 1;
+ TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
+ tsleep(&reclaim, 0, "hmrrcm", hz);
+ if (reclaim.count > 0)
+ TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
}
- reclaim.count = 1;
- TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
- tsleep(&reclaim, 0, "hmrrcm", hz);
- if (reclaim.count > 0)
- TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
}
/*
int hammer_limit_running_io; /* per-mount */
int hammer_limit_recs; /* as a whole XXX */
int hammer_limit_inode_recs = 1024; /* per inode */
-int hammer_limit_reclaim = HAMMER_RECLAIM_WAIT;
+int hammer_limit_reclaim;
int hammer_live_dedup_cache_size = DEDUP_CACHE_SIZE;
int hammer_limit_redo = 4096 * 1024; /* per inode */
-int hammer_autoflush = 2000; /* auto flush */
+int hammer_autoflush = 500; /* auto flush (typ on reclaim) */
int hammer_bio_count;
int hammer_verify_zone;
int hammer_verify_data = 1;
hammer_limit_running_io = hammer_limit_dirtybufspace;
if (hammer_limit_running_io > 10 * 1024 * 1024)
hammer_limit_running_io = 10 * 1024 * 1024;
+
+ /*
+ * The hammer_inode structure detaches from the vnode on reclaim.
+ * This limits the number of inodes in this state to prevent a
+ * memory pool blowout.
+ */
+ if (hammer_limit_reclaim == 0)
+ hammer_limit_reclaim = desiredvnodes / 10;
+
return(0);
}