HAMMER VFS - Reduce stalls during bulk file operations
author Matthew Dillon <dillon@apollo.backplane.com>
Fri, 19 Feb 2010 18:41:22 +0000 (10:41 -0800)
committer Matthew Dillon <dillon@apollo.backplane.com>
Fri, 19 Feb 2010 18:41:22 +0000 (10:41 -0800)
* Track modifying inode operations on a per-PID basis (loosely) and
  call hammer_inode_waitreclaims() earlier for those pids.

  The algorithm selects a wait point based on the process's perceived
  contribution to the inode load.  The greater the contribution, the
  more readily we stall the process in order to wait for related reclaims
  to process.

  Processes with lower loads have higher reclaim points and do not stall
  as readily as they did before.

* Remove waitreclaims calls based on B-Tree scans.  I'm not sure why I had
  this in there but it was creating an excessive number of unnecessary
  stalls, so if any problems crop up I'll have to find another way to deal
  with them.

* These changes (particularly the first) should reduce unnecessary stalls
  for the programs not doing heavy inode operations.  Hopefully that means
  rm -rf and tar extractions will not have quite as detrimental an effect
  on other processes as they did before.

sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_object.c
sys/vfs/hammer/hammer_prune.c
sys/vfs/hammer/hammer_transaction.c
sys/vfs/hammer/hammer_vnops.c

index b41a16c..44afd33 100644 (file)
@@ -427,6 +427,19 @@ struct hammer_reclaim {
 #define HAMMER_RECLAIM_WAIT    4000    /* default vfs.hammer.limit_reclaim */
 
 /*
+ * Track who is creating the greatest burden on the
+ * inode cache.
+ */
+struct hammer_inostats {
+       pid_t           pid;    /* pid of the process being tracked */
+       int             ltick;  /* ticks value when count was last aged */
+       int             count;  /* inode load count (decays over time) */
+};
+
+#define HAMMER_INOSTATS_HSIZE  32      /* table slots; power of 2 for HMASK */
+#define HAMMER_INOSTATS_HMASK  (HAMMER_INOSTATS_HSIZE - 1)
+
+/*
  * Structure used to represent an unsynchronized record in-memory.  These
  * records typically represent directory entries.  Only non-historical
  * records are kept in-memory.
@@ -844,6 +857,8 @@ struct hammer_mount {
        TAILQ_HEAD(, hammer_objid_cache) objid_cache_list;
        TAILQ_HEAD(, hammer_reclaim) reclaim_list;
        TAILQ_HEAD(, hammer_io) iorun_list;
+
+       struct hammer_inostats  inostats[HAMMER_INOSTATS_HSIZE];
 };
 
 typedef struct hammer_mount    *hammer_mount_t;
@@ -968,8 +983,7 @@ void        hammer_scan_inode_snapshots(hammer_mount_t hmp,
                        void *data);
 void   hammer_put_inode(struct hammer_inode *ip);
 void   hammer_put_inode_ref(struct hammer_inode *ip);
-void   hammer_inode_waitreclaims(hammer_mount_t hmp);
-void   hammer_inode_waithard(hammer_mount_t hmp);
+void   hammer_inode_waitreclaims(hammer_transaction_t trans);
 
 int    hammer_unload_volume(hammer_volume_t volume, void *data __unused);
 int    hammer_adjust_volume_mode(hammer_volume_t volume, void *data __unused);
@@ -1211,7 +1225,7 @@ void hammer_start_transaction_fls(struct hammer_transaction *trans,
 void hammer_done_transaction(struct hammer_transaction *trans);
 hammer_tid_t hammer_alloc_tid(hammer_mount_t hmp, int count);
 
-void hammer_modify_inode(hammer_inode_t ip, int flags);
+void hammer_modify_inode(hammer_transaction_t trans, hammer_inode_t ip, int flags);
 void hammer_flush_inode(hammer_inode_t ip, int flags);
 void hammer_flush_inode_done(hammer_inode_t ip, int error);
 void hammer_wait_inode(hammer_inode_t ip);
index 5fd1a34..0537d11 100644 (file)
@@ -50,6 +50,8 @@ static int    hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
 static int     hammer_setup_parent_inodes_helper(hammer_record_t record,
                                        int depth, hammer_flush_group_t flg);
 static void    hammer_inode_wakereclaims(hammer_inode_t ip);
+static struct hammer_inostats *hammer_inode_inostats(hammer_mount_t hmp,
+                                       pid_t pid);
 
 #ifdef DEBUG_TRUNCATE
 extern struct hammer_inode *HammerTruncIp;
@@ -563,7 +565,13 @@ retry:
                ip = NULL;
        }
        hammer_done_cursor(&cursor);
-       trans->flags |= HAMMER_TRANSF_NEWINODE;
+
+       /*
+        * NEWINODE is only set if the inode becomes dirty later,
+        * setting it here just leads to unnecessary stalls.
+        *
+        * trans->flags |= HAMMER_TRANSF_NEWINODE;
+        */
        return (ip);
 }
 
@@ -1091,7 +1099,7 @@ hammer_mkroot_pseudofs(hammer_transaction_t trans, struct ucred *cred,
                                            pfsm, &ip);
                if (error == 0) {
                        ++ip->ino_data.nlinks;
-                       hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
+                       hammer_modify_inode(trans, ip, HAMMER_INODE_DDIRTY);
                }
        }
        if (ip)
@@ -1586,7 +1594,7 @@ hammer_reload_inode(hammer_inode_t ip, void *arg __unused)
  * HAMMER_INODE_ATIME/MTIME: mtime/atime has been updated
  */
 void
-hammer_modify_inode(hammer_inode_t ip, int flags)
+hammer_modify_inode(hammer_transaction_t trans, hammer_inode_t ip, int flags)
 {
        /* 
         * ronly of 0 or 2 does not trigger assertion.
@@ -1602,6 +1610,17 @@ hammer_modify_inode(hammer_inode_t ip, int flags)
                ++ip->hmp->rsv_inodes;
        }
 
+       /*
+        * Set the NEWINODE flag in the transaction if the inode
+        * transitions to a dirty state.  This is used to track
+        * the load on the inode cache.
+        */
+       if (trans &&
+           (ip->flags & HAMMER_INODE_MODMASK) == 0 &&
+           (flags & HAMMER_INODE_MODMASK)) {
+               trans->flags |= HAMMER_TRANSF_NEWINODE;
+       }
+
        ip->flags |= flags;
 }
 
@@ -3120,12 +3139,36 @@ hammer_inode_wakereclaims(hammer_inode_t ip)
  * as lone as one does.
  */
 void
-hammer_inode_waitreclaims(hammer_mount_t hmp)
+hammer_inode_waitreclaims(hammer_transaction_t trans)
 {
+       hammer_mount_t hmp = trans->hmp;
        struct hammer_reclaim reclaim;
 
-       if (hmp->inode_reclaims < hammer_limit_reclaim)
-               return;
+       /*
+        * Track inode load
+        */
+       if (curthread->td_proc) {
+               struct hammer_inostats *stats;
+               int lower_limit;
+
+               stats = hammer_inode_inostats(hmp, curthread->td_proc->p_pid);
+               ++stats->count;
+
+               if (stats->count > hammer_limit_reclaim / 2)
+                       stats->count = hammer_limit_reclaim / 2;
+               lower_limit = hammer_limit_reclaim - stats->count;
+               if (hammer_debug_general & 0x10000)
+                       kprintf("pid %5d limit %d\n", (int)curthread->td_proc->p_pid, lower_limit);
+
+               if (hmp->inode_reclaims < lower_limit)
+                       return;
+       } else {
+               /*
+                * Default mode
+                */
+               if (hmp->inode_reclaims < hammer_limit_reclaim)
+                       return;
+       }
        reclaim.count = 1;
        TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
        tsleep(&reclaim, 0, "hmrrcm", hz);
@@ -3133,6 +3176,57 @@ hammer_inode_waitreclaims(hammer_mount_t hmp)
                 TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
 }
 
+/*
+ * Return the inostats slot tracking pid, claiming one if necessary,
+ * after aging its count.
+ *
+ * Up to 4 consecutive slots starting at (pid & HAMMER_INOSTATS_HMASK)
+ * are probed.  On a miss one of those same 4 slots is recycled so that
+ * later lookups for this pid can find it again, and the slot is reset
+ * so the pid does not inherit the previous owner's count.
+ *
+ * A non-zero count is scaled by hz / (hz + delta), delta being the
+ * ticks elapsed since the slot was last aged.  It is zeroed outright
+ * when delta is not positive or exceeds hz * 60.
+ */
+static
+struct hammer_inostats *
+hammer_inode_inostats(hammer_mount_t hmp, pid_t pid)
+{
+       struct hammer_inostats *stats;
+       int delta;
+       int chain;
+
+       for (chain = 0; chain < 4; ++chain) {
+               stats = &hmp->inostats[(pid + chain) & HAMMER_INOSTATS_HMASK];
+               if (stats->pid == pid)
+                       break;
+       }
+       if (chain == 4) {
+               /*
+                * Miss: evict from within the probe window.  (ticks & 3)
+                * only varies which of the 4 slots is chosen.
+                */
+               stats = &hmp->inostats[(pid + (ticks & 3)) &
+                                      HAMMER_INOSTATS_HMASK];
+               stats->pid = pid;
+               stats->ltick = ticks;
+               stats->count = 0;
+       }
+
+       if (stats->count && stats->ltick != ticks) {
+               delta = ticks - stats->ltick;
+               stats->ltick = ticks;
+               if (delta <= 0 || delta > hz * 60)
+                       stats->count = 0;
+               else
+                       stats->count = stats->count * hz / (hz + delta);
+       }
+       if (hammer_debug_general & 0x10000)
+               kprintf("pid %5d stats %d\n", (int)pid, stats->count);
+       return (stats);
+}
+
 #if 0
 
 /*
index 6f578db..ac4d8e6 100644 (file)
@@ -679,7 +679,7 @@ hammer_ip_add_directory(struct hammer_transaction *trans,
 
        ++ip->ino_data.nlinks;
        ip->ino_data.ctime = trans->time;
-       hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
+       hammer_modify_inode(trans, ip, HAMMER_INODE_DDIRTY);
 
        /*
         * Find an unused namekey.  Both the in-memory record tree and
@@ -735,7 +735,7 @@ hammer_ip_add_directory(struct hammer_transaction *trans,
        error = hammer_mem_add(record);
        if (error == 0) {
                dip->ino_data.mtime = trans->time;
-               hammer_modify_inode(dip, HAMMER_INODE_MTIME);
+               hammer_modify_inode(trans, dip, HAMMER_INODE_MTIME);
        }
 failed:
        hammer_done_cursor(&cursor);
@@ -850,9 +850,9 @@ hammer_ip_del_directory(struct hammer_transaction *trans,
                        ip->ino_data.ctime = trans->time;
                }
                dip->ino_data.mtime = trans->time;
-               hammer_modify_inode(dip, HAMMER_INODE_MTIME);
+               hammer_modify_inode(trans, dip, HAMMER_INODE_MTIME);
                if (ip) {
-                       hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
+                       hammer_modify_inode(trans, ip, HAMMER_INODE_DDIRTY);
                        if (ip->ino_data.nlinks == 0 &&
                            (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
                                hammer_done_cursor(cursor);
@@ -1381,7 +1381,7 @@ hammer_mem_add(hammer_record_t record)
        ++record->ip->rsv_recs;
        record->ip->hmp->rsv_databytes += record->leaf.data_len;
        record->flags |= HAMMER_RECF_ONRBTREE;
-       hammer_modify_inode(record->ip, HAMMER_INODE_XDIRTY);
+       hammer_modify_inode(NULL, record->ip, HAMMER_INODE_XDIRTY);
        hammer_rel_mem_record(record);
        return(0);
 }
index c0dca40..aed0de9 100644 (file)
@@ -324,7 +324,7 @@ prune_check_nlinks(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm)
                                (long long)elm->base.obj_id);
                }
                hammer_rel_inode(ip, 0);
-               hammer_inode_waitreclaims(cursor->trans->hmp);
+               hammer_inode_waitreclaims(cursor->trans);
        } else {
                kprintf("unable to prune disconnected inode %016llx\n",
                        (long long)elm->base.obj_id);
index a3fb181..5d668be 100644 (file)
@@ -118,7 +118,6 @@ hammer_start_transaction_fls(struct hammer_transaction *trans,
 void
 hammer_done_transaction(struct hammer_transaction *trans)
 {
-       hammer_mount_t hmp = trans->hmp;
        int expected_lock_refs;
 
        hammer_rel_volume(trans->rootvol, 0);
@@ -128,9 +127,11 @@ hammer_done_transaction(struct hammer_transaction *trans)
        trans->sync_lock_refs = 0;
        if (trans->type != HAMMER_TRANS_FLS) {
                if (trans->flags & HAMMER_TRANSF_NEWINODE)
-                       hammer_inode_waitreclaims(hmp);
+                       hammer_inode_waitreclaims(trans);
+               /*
                else if (trans->flags & HAMMER_TRANSF_DIDIO)
-                       hammer_inode_waitreclaims(hmp);
+                       hammer_inode_waitreclaims(trans);
+               */
        }
 }
 
index f43c003..460a347 100644 (file)
@@ -441,7 +441,7 @@ skip:
                if ((ip->flags & HAMMER_INODE_RO) == 0 &&
                    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
                        ip->ino_data.atime = trans.time;
-                       hammer_modify_inode(ip, HAMMER_INODE_ATIME);
+                       hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
                }
                hammer_done_transaction(&trans);
                if (got_mplock > 0)
@@ -742,7 +742,7 @@ hammer_vop_write(struct vop_write_args *ap)
                }
                ip->ino_data.mtime = trans.time;
                flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
-               hammer_modify_inode(ip, flags);
+               hammer_modify_inode(&trans, ip, flags);
 
                /*
                 * Once we dirty the buffer any cached zone-X offset
@@ -1893,7 +1893,7 @@ hammer_vop_nrename(struct vop_nrename_args *ap)
                if (error == 0) {
                        ip->ino_data.parent_obj_id = tdip->obj_id;
                        ip->ino_data.ctime = trans.time;
-                       hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
+                       hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
                }
        }
        if (error)
@@ -2030,7 +2030,7 @@ hammer_vop_markatime(struct vop_markatime_args *ap)
        ++hammer_stats_file_iopsw;
 
        ip->ino_data.atime = trans.time;
-       hammer_modify_inode(ip, HAMMER_INODE_ATIME);
+       hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
        hammer_done_transaction(&trans);
        hammer_knote(ap->a_vp, NOTE_ATTRIB);
        return (0);
@@ -2265,7 +2265,7 @@ hammer_vop_setattr(struct vop_setattr_args *ap)
        }
 done:
        if (error == 0)
-               hammer_modify_inode(ip, modflags);
+               hammer_modify_inode(&trans, ip, modflags);
        hammer_done_transaction(&trans);
        hammer_knote(ap->a_vp, kflags);
        return (error);
@@ -2344,7 +2344,7 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
                 */
                if (error == 0) {
                        nip->ino_data.size = bytes;
-                       hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
+                       hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY);
                }
        }
        if (error == 0)