* Change the tailq of inodes in a flush group to a red-black tree.
The flusher now processes inodes in sorted order and breaks them up
into larger sets for concurrent flushing. The flusher threads are thus
more likely to concurrently process inodes which are fairly far apart
in the B-Tree.
This greatly reduces lock interference between flusher threads. However,
B-Tree deadlocks are still an issue between inodes undergoing flushes
and front-end access operations. This can be observed by noting periods
of low dev-write activity in 'hammer iostats 1' output during a blogbench
test. The hammer-S* kernel threads will likely be in a 'hmrdlk' state
at the same time.
* Add sysctl vfs.hammer.limit_reclaim to set the maximum
number of inodes with no vnode associations, default 4000.
NOTE: This sysctl is for debugging only. Setting the value too high will
blow out the kmalloc pool.
* Without this a 'sync' could end up flushing 50,000 inodes in a single
* transaction.
*/
+struct hammer_fls_rb_tree;
+RB_HEAD(hammer_fls_rb_tree, hammer_inode);
+RB_PROTOTYPE(hammer_fls_rb_tree, hammer_inode, rb_flsnode,
+ hammer_ino_rb_compare);
+
struct hammer_flush_group {
TAILQ_ENTRY(hammer_flush_group) flush_entry;
- TAILQ_HEAD(, hammer_inode) flush_list;
- int unused01; /* inode load */
+ struct hammer_fls_rb_tree flush_tree;
+ int unused01;
int total_count; /* record load */
int running; /* group is running */
int closed;
RB_ENTRY(hammer_inode) rb_node;
hammer_inode_state_t flush_state;
hammer_flush_group_t flush_group;
- TAILQ_ENTRY(hammer_inode) flush_entry;
+ RB_ENTRY(hammer_inode) rb_flsnode; /* when on flush list */
struct hammer_record_list target_list; /* target of dependant recs */
int64_t obj_id; /* (key) object identifier */
hammer_tid_t obj_asof; /* (key) snapshot or 0 */
#define HAMMER_INODE_MODMASK_NOXDIRTY \
(HAMMER_INODE_MODMASK & ~HAMMER_INODE_XDIRTY)
-#define HAMMER_FLUSH_GROUP_SIZE 64
-
#define HAMMER_FLUSH_SIGNAL 0x0001
#define HAMMER_FLUSH_RECURSION 0x0002
int count;
};
-#define HAMMER_RECLAIM_FLUSH 2000
-#define HAMMER_RECLAIM_WAIT 4000
+#define HAMMER_RECLAIM_WAIT 4000 /* default vfs.hammer.limit_reclaim */
/*
* Structure used to represent an unsynchronized record in-memory. These
extern int hammer_limit_dirtybufspace;
extern int hammer_limit_recs;
extern int hammer_limit_inode_recs;
+extern int hammer_limit_reclaim;
extern int hammer_bio_count;
extern int hammer_verify_zone;
extern int hammer_verify_data;
static void hammer_flusher_flush_inode(hammer_inode_t ip,
hammer_transaction_t trans);
+RB_GENERATE(hammer_fls_rb_tree, hammer_inode, rb_flsnode,
+ hammer_ino_rb_compare);
+
+/*
+ * Inodes are sorted and assigned to slave threads in groups of 128.
+ * We want a flush group size large enough such that the slave threads
+ * are not likely to interfere with each other when accessing the B-Tree,
+ * but not so large that we lose concurrency.
+ */
+#define HAMMER_FLUSH_GROUP_SIZE 128
+
/*
* Support structures for the flusher threads.
*/
hmp->next_flush_group = TAILQ_NEXT(flg, flush_entry);
/*
- * Iterate the inodes in the flg's flush_list and assign
+ * Iterate the inodes in the flg's flush_tree and assign
* them to slaves.
*/
slave_index = 0;
info = TAILQ_FIRST(&hmp->flusher.ready_list);
- next_ip = TAILQ_FIRST(&flg->flush_list);
+ next_ip = RB_FIRST(hammer_fls_rb_tree, &flg->flush_tree);
while ((ip = next_ip) != NULL) {
- next_ip = TAILQ_NEXT(ip, flush_entry);
+ next_ip = RB_NEXT(hammer_fls_rb_tree,
+ &flg->flush_tree, ip);
if (++hmp->check_yield > hammer_yield_check) {
hmp->check_yield = 0;
* Loop up on the same flg. If the flg is done clean it up
* and break out. We only flush one flg.
*/
- if (TAILQ_FIRST(&flg->flush_list) == NULL) {
- KKASSERT(TAILQ_EMPTY(&flg->flush_list));
+ if (RB_EMPTY(&flg->flush_tree)) {
KKASSERT(flg->refs == 0);
TAILQ_REMOVE(&hmp->flush_group_list, flg, flush_entry);
kfree(flg, hmp->m_misc);
/*
- * The slave flusher thread pulls work off the master flush_list until no
+ * The slave flusher thread pulls work off the master flush list until no
* work is left.
*/
static void
*/
switch(ip->flush_state) {
case HAMMER_FST_FLUSH:
- TAILQ_REMOVE(&ip->flush_group->flush_list, ip, flush_entry);
+ RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
--ip->flush_group->refs;
ip->flush_group = NULL;
/* fall through */
if (flg == NULL) {
flg = kmalloc(sizeof(*flg), hmp->m_misc, M_WAITOK|M_ZERO);
hmp->next_flush_group = flg;
- TAILQ_INIT(&flg->flush_list);
+ RB_INIT(&flg->flush_tree);
TAILQ_INSERT_TAIL(&hmp->flush_group_list, flg, flush_entry);
}
* The flusher list inherits our inode and reference.
*/
KKASSERT(flg->running == 0);
- TAILQ_INSERT_TAIL(&flg->flush_list, ip, flush_entry);
+ RB_INSERT(hammer_fls_rb_tree, &flg->flush_tree, ip);
if (--ip->hmp->flusher.group_lock == 0)
wakeup(&ip->hmp->flusher.group_lock);
/*
* Remove from the flush_group
*/
- TAILQ_REMOVE(&ip->flush_group->flush_list, ip, flush_entry);
+ RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
ip->flush_group = NULL;
/*
TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
wakeup(reclaim);
}
- if (hmp->inode_reclaims > HAMMER_RECLAIM_WAIT / 2)
+ if (hmp->inode_reclaims > hammer_limit_reclaim / 2)
break;
}
}
{
struct hammer_reclaim reclaim;
- if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT)
+ if (hmp->inode_reclaims < hammer_limit_reclaim)
return;
reclaim.count = 1;
TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
* Hysteresis.
*/
if (hmp->flags & HAMMER_MOUNT_FLUSH_RECOVERY) {
- if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT / 2 &&
+ if (hmp->inode_reclaims < hammer_limit_reclaim / 2 &&
hmp->count_iqueued < hmp->count_inodes / 20) {
hmp->flags &= ~HAMMER_MOUNT_FLUSH_RECOVERY;
return;
}
} else {
- if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT ||
+ if (hmp->inode_reclaims < hammer_limit_reclaim ||
hmp->count_iqueued < hmp->count_inodes / 10) {
return;
}
int hammer_limit_dirtybufspace; /* per-mount */
int hammer_limit_recs; /* as a whole XXX */
int hammer_limit_inode_recs = 1024; /* per inode */
+int hammer_limit_reclaim = HAMMER_RECLAIM_WAIT;
int hammer_autoflush = 2000; /* auto flush */
int hammer_bio_count;
int hammer_verify_zone;
&hammer_limit_recs, 0, "");
SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_inode_recs, CTLFLAG_RW,
&hammer_limit_inode_recs, 0, "");
+SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_reclaim, CTLFLAG_RW,
+ &hammer_limit_reclaim, 0, "");
SYSCTL_INT(_vfs_hammer, OID_AUTO, count_fsyncs, CTLFLAG_RD,
&hammer_count_fsyncs, 0, "");
kmalloc_create(&hmp->m_inodes, "HAMMER-inodes");
maxinodes = desiredvnodes + desiredvnodes / 5 +
- HAMMER_RECLAIM_WAIT;
+ hammer_limit_reclaim * 2;
kmalloc_raise_limit(hmp->m_inodes,
maxinodes * sizeof(struct hammer_inode));
KKASSERT(RB_EMPTY(&hmp->rb_inos_root));
while ((flg = TAILQ_FIRST(&hmp->flush_group_list)) != NULL) {
TAILQ_REMOVE(&hmp->flush_group_list, flg, flush_entry);
- KKASSERT(TAILQ_EMPTY(&flg->flush_list));
+ KKASSERT(RB_EMPTY(&flg->flush_tree));
if (flg->refs) {
kprintf("HAMMER: Warning, flush_group %p was "
"not empty on umount!\n", flg);