kernel - Performance improvements during heavy memory/IO use
author     Matthew Dillon <dillon@apollo.backplane.com>
Fri, 11 Nov 2011 00:32:10 +0000 (16:32 -0800)
committer  Matthew Dillon <dillon@apollo.backplane.com>
Fri, 11 Nov 2011 00:32:10 +0000 (16:32 -0800)
* Remove the vm.vm_load logic; it was breaking more than it was fixing.

* Fix a bug in the pageout algorithm that was causing the PQ_ACTIVE queue
  to drain excessively and disrupt the LRU/activity algorithm (see the
  first sketch after this list).

* Rip out hammer_limit_running_io and instead just call waitrunningbufspace().

* Change the waitrunningbufspace() logic to add a bit of hysteresis and to
  block all threads doing write I/O fairly; otherwise some threads could be
  blocked while others were allowed to proceed while the buf_daemon was
  trying to flush buffers out (see the second sketch after this list).
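
  A minimal userspace sketch (not the kernel code itself) of the corrected
  active-queue handling from the vm_pageout.c hunk further below: extra
  deactivation of active pages is only requested when the inactive scan
  fell short of its target, and the extra work is capped at 1/10 of the
  inactive target so PQ_ACTIVE is not drained excessively.  All names here
  are illustrative, not the kernel's.

    /*
     * Illustrative sketch only -- mirrors the logic added to
     * vm_pageout_thread() in the diff below, using hypothetical names.
     */
    #include <stdio.h>

    static int
    adjust_active_shortage(int active_shortage, int inactive_shortage,
                           int delta1, int inactive_target)
    {
            /*
             * delta1 is how many inactive pages the scan actually freed.
             * If it fell short, deactivate extra active pages, but never
             * more than 1/10 of the inactive target in one pass.
             */
            if (delta1 < inactive_shortage) {
                    int tmp = (inactive_shortage - delta1) * 2;

                    if (tmp > inactive_target / 10)
                            tmp = inactive_target / 10;
                    active_shortage += tmp;
            }
            return (active_shortage);
    }

    int
    main(void)
    {
            /* freed only 100 of a 500-page shortage, target 10000 pages */
            printf("%d\n", adjust_active_shortage(200, 500, 100, 10000));
            return (0);
    }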

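  A hedged userspace analogue of the new waitrunningbufspace() behaviour
  (pthreads instead of the kernel's bufcspin spinlock and ssleep(); the
  names are hypothetical, not the kernel API): writers block once the
  running write I/O total exceeds 4/6 of the high-water mark, or whenever
  another writer is already waiting, and the completion path only wakes
  everyone once the total has dropped to 3/6, giving both hysteresis and
  fair throttling.

    /* Userspace analogue only -- not the kernel implementation. */
    #include <pthread.h>

    static pthread_mutex_t iolock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  iocond = PTHREAD_COND_INITIALIZER;
    static long running_space;            /* write I/O in flight (bytes) */
    static int  waiting;                  /* set while any writer blocks */
    static const long hi_running_space = 1024 * 1024;

    static void
    wait_running_space(void)              /* writer-side throttle */
    {
            long limit = hi_running_space * 4 / 6;

            pthread_mutex_lock(&iolock);
            /* block above the high mark, or if someone else already waits */
            while (running_space > limit || waiting) {
                    waiting = 1;
                    pthread_cond_wait(&iocond, &iolock);
            }
            pthread_mutex_unlock(&iolock);
    }

    static void
    running_space_done(long bytes)        /* called as write I/O completes */
    {
            long limit = hi_running_space * 3 / 6;

            pthread_mutex_lock(&iolock);
            running_space -= bytes;
            /* hysteresis: only release waiters at the lower 3/6 mark */
            if (waiting && running_space <= limit) {
                    waiting = 0;
                    pthread_cond_broadcast(&iocond);
            }
            pthread_mutex_unlock(&iolock);
    }

    int
    main(void)
    {
            /* trivial single-threaded exercise of both paths */
            wait_running_space();
            running_space += 65536;
            running_space_done(65536);
            return (0);
    }
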
sys/kern/kern_clock.c
sys/kern/vfs_bio.c
sys/sys/thread.h
sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_flusher.c
sys/vfs/hammer/hammer_io.c
sys/vfs/hammer/hammer_vfsops.c
sys/vm/vm_extern.h
sys/vm/vm_fault.c
sys/vm/vm_pageout.c
sys/vm/vm_pageout.h

diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c
index 63e9951..dab7b6d 100644
@@ -533,11 +533,6 @@ hardclock(systimer_t info, int in_ipi __unused, struct intrframe *frame)
             */
            cpu_sfence();
            basetime_index = ni;
-
-           /*
-            * Figure out how badly the system is starved for memory
-            */
-           vm_fault_ratecheck();
        }
 
        /*
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 2f6552e..20598f3 100644
@@ -267,7 +267,7 @@ runningbufwakeup(struct buf *bp)
                /*
                 * see waitrunningbufspace() for limit test.
                 */
-               limit = hirunningspace * 4 / 6;
+               limit = hirunningspace * 3 / 6;
                if (runningbufreq && runningbufspace <= limit) {
                        runningbufreq = 0;
                        spin_unlock(&bufcspin);
@@ -305,38 +305,27 @@ bufcountwakeup(void)
 /*
  * waitrunningbufspace()
  *
- * Wait for the amount of running I/O to drop to hirunningspace * 4 / 6.
- * This is the point where write bursting stops so we don't want to wait
- * for the running amount to drop below it (at least if we still want bioq
- * to burst writes).
+ * If runningbufspace exceeds 4/6 hirunningspace we block until
+ * runningbufspace drops to 3/6 hirunningspace.  We also block if another
+ * thread blocked here in order to be fair, even if runningbufspace
+ * is now lower than the limit.
  *
  * The caller may be using this function to block in a tight loop, we
- * must block while runningbufspace is greater then or equal to
- * hirunningspace * 4 / 6.
- *
- * And even with that it may not be enough, due to the presence of
- * B_LOCKED dirty buffers, so also wait for at least one running buffer
- * to complete.
+ * must block while runningbufspace is greater than at least
+ * hirunningspace * 3 / 6.
  */
 void
 waitrunningbufspace(void)
 {
        int limit = hirunningspace * 4 / 6;
-       int dummy;
 
-       spin_lock(&bufcspin);
-       if (runningbufspace > limit) {
-               while (runningbufspace > limit) {
-                       ++runningbufreq;
+       if (runningbufspace > limit || runningbufreq) {
+               spin_lock(&bufcspin);
+               while (runningbufspace > limit || runningbufreq) {
+                       runningbufreq = 1;
                        ssleep(&runningbufreq, &bufcspin, 0, "wdrn1", 0);
                }
                spin_unlock(&bufcspin);
-       } else if (runningbufspace > limit / 2) {
-               ++runningbufreq;
-               spin_unlock(&bufcspin);
-               tsleep(&dummy, 0, "wdrn2", 1);
-       } else {
-               spin_unlock(&bufcspin);
        }
 }
 
diff --git a/sys/sys/thread.h b/sys/sys/thread.h
index 7d22ac4..d8555a6 100644
@@ -275,7 +275,7 @@ struct thread {
     int                td_fairq_load;          /* fairq */
     int                td_fairq_count;         /* fairq */
     struct globaldata *td_migrate_gd;  /* target gd for thread migration */
-    const void *td_mplock_stallpc;     /* last mplock stall address */
+    const void *unused01;
 #ifdef DEBUG_CRIT_SECTIONS
 #define CRIT_DEBUG_ARRAY_SIZE   32
 #define CRIT_DEBUG_ARRAY_MASK   (CRIT_DEBUG_ARRAY_SIZE - 1)
diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index ed43d8a..1bad1b1 100644
@@ -913,7 +913,7 @@ struct hammer_mount {
        struct hammer_mod_rb_tree lose_root;    /* loose buffers      */
        int     locked_dirty_space;             /* meta/volu count    */
        int     io_running_space;               /* io_token */
-       int     io_running_wakeup;              /* io_token */
+       int     unused01;
        int     objid_cache_count;
        int     dedup_cache_count;
        int     error;                          /* critical I/O error */
@@ -1045,7 +1045,6 @@ extern int hammer_count_io_running_read;
 extern int hammer_count_io_running_write;
 extern int hammer_count_io_locked;
 extern int hammer_limit_dirtybufspace;
-extern int hammer_limit_running_io;
 extern int hammer_limit_recs;
 extern int hammer_limit_inode_recs;
 extern int hammer_limit_reclaims;
diff --git a/sys/vfs/hammer/hammer_flusher.c b/sys/vfs/hammer/hammer_flusher.c
index 21c3101..9298b77 100644
@@ -207,13 +207,13 @@ hammer_flusher_create(hammer_mount_t hmp)
        TAILQ_INIT(&hmp->flusher.ready_list);
 
        lwkt_create(hammer_flusher_master_thread, hmp,
-                   &hmp->flusher.td, NULL, TDF_SYSTHREAD, -1, "hammer-M");
+                   &hmp->flusher.td, NULL, 0, -1, "hammer-M");
        for (i = 0; i < HAMMER_MAX_FLUSHERS; ++i) {
                info = kmalloc(sizeof(*info), hmp->m_misc, M_WAITOK|M_ZERO);
                info->hmp = hmp;
                TAILQ_INSERT_TAIL(&hmp->flusher.ready_list, info, entry);
                lwkt_create(hammer_flusher_slave_thread, info,
-                           &info->td, NULL, TDF_SYSTHREAD, -1, "hammer-S%d", i);
+                           &info->td, NULL, 0, -1, "hammer-S%d", i);
        }
 }
 
diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c
index de02111..be5f0c3 100644
@@ -1124,11 +1124,6 @@ hammer_io_complete(struct buf *bp)
                hammer_stats_disk_write += iou->io.bytes;
                atomic_add_int(&hammer_count_io_running_write, -iou->io.bytes);
                atomic_add_int(&hmp->io_running_space, -iou->io.bytes);
-               if (hmp->io_running_wakeup &&
-                   hmp->io_running_space < hammer_limit_running_io / 2) {
-                   hmp->io_running_wakeup = 0;
-                   wakeup(&hmp->io_running_wakeup);
-               }
                KKASSERT(hmp->io_running_space >= 0);
                iou->io.running = 0;
 
@@ -1999,8 +1994,5 @@ hammer_io_flush_sync(hammer_mount_t hmp)
 void
 hammer_io_limit_backlog(hammer_mount_t hmp)
 {
-        while (hmp->io_running_space > hammer_limit_running_io) {
-                hmp->io_running_wakeup = 1;
-                tsleep(&hmp->io_running_wakeup, 0, "hmiolm", hz / 10);
-        }
+       waitrunningbufspace();
 }
diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c
index 4dfb470..ccd0ff3 100644
@@ -101,7 +101,6 @@ int hammer_count_io_running_read;
 int hammer_count_io_running_write;
 int hammer_count_io_locked;
 int hammer_limit_dirtybufspace;                /* per-mount */
-int hammer_limit_running_io;           /* per-mount */
 int hammer_limit_recs;                 /* as a whole XXX */
 int hammer_limit_inode_recs = 2048;    /* per inode */
 int hammer_limit_reclaims;
@@ -166,8 +165,6 @@ SYSCTL_INT(_vfs_hammer, OID_AUTO, tdmux_ticks, CTLFLAG_RW,
 
 SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_dirtybufspace, CTLFLAG_RW,
           &hammer_limit_dirtybufspace, 0, "");
-SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_running_io, CTLFLAG_RW,
-          &hammer_limit_running_io, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_recs, CTLFLAG_RW,
           &hammer_limit_recs, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_inode_recs, CTLFLAG_RW,
@@ -359,20 +356,6 @@ hammer_vfs_init(struct vfsconf *conf)
        }
 
        /*
-        * Set reasonable limits to maintain an I/O pipeline.  This is
-        * used by the flush code which explicitly initiates I/O, and
-        * is per-mount.
-        *
-        * The system-driven buffer cache uses vfs.lorunningspace and
-        * vfs.hirunningspace globally.
-        */
-       if (hammer_limit_running_io == 0)
-               hammer_limit_running_io = hammer_limit_dirtybufspace;
-
-       if (hammer_limit_running_io > 10 * 1024 * 1024)
-               hammer_limit_running_io = 10 * 1024 * 1024;
-
-       /*
         * The hammer_inode structure detaches from the vnode on reclaim.
         * This limits the number of inodes in this state to prevent a
         * memory pool blowout.
diff --git a/sys/vm/vm_extern.h b/sys/vm/vm_extern.h
index a9b9b43..1664ea7 100644
@@ -97,7 +97,6 @@ void vm_fault_copy_entry (vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t);
 void vm_fault_unwire (vm_map_t, vm_map_entry_t);
 int vm_fault_wire (vm_map_t, vm_map_entry_t, boolean_t);
 void vm_fork (struct proc *, struct proc *, int);
-void vm_fault_ratecheck(void);
 int vm_test_nominal (void);
 void vm_wait_nominal (void);
 void vm_init_limits(struct proc *);
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index d591eff..dc618a2 100644
@@ -115,7 +115,6 @@ struct faultstate {
        vm_map_t map;
        vm_map_entry_t entry;
        int lookup_still_valid;
-       int didlimit;
        int hardfault;
        int fault_flags;
        int map_generation;
@@ -131,7 +130,6 @@ static int vm_fault_vpagetable(struct faultstate *, vm_pindex_t *, vpte_t, int);
 #if 0
 static int vm_fault_additional_pages (vm_page_t, int, int, vm_page_t *, int *);
 #endif
-static int vm_fault_ratelimit(struct vmspace *);
 static void vm_set_nosync(vm_page_t m, vm_map_entry_t entry);
 static void vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry,
                        int prot);
@@ -256,7 +254,6 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags)
 
        mycpu->gd_cnt.v_vm_faults++;
 
-       fs.didlimit = 0;
        fs.hardfault = 0;
        fs.fault_flags = fault_flags;
        growstack = 1;
@@ -528,7 +525,6 @@ vm_fault_page(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
 
        mycpu->gd_cnt.v_vm_faults++;
 
-       fs.didlimit = 0;
        fs.hardfault = 0;
        fs.fault_flags = fault_flags;
        KKASSERT((fault_flags & VM_FAULT_WIRE_MASK) == 0);
@@ -737,7 +733,6 @@ vm_fault_object_page(vm_object_t object, vm_ooffset_t offset,
        entry.maptype = VM_MAPTYPE_NORMAL;
        entry.protection = entry.max_protection = fault_type;
 
-       fs.didlimit = 0;
        fs.hardfault = 0;
        fs.fault_flags = fault_flags;
        fs.map = NULL;
@@ -1138,26 +1133,6 @@ vm_fault_object(struct faultstate *fs,
                        }
 
                        /*
-                        * Ratelimit.
-                        */
-                       if (fs->didlimit == 0 && curproc != NULL) {
-                               int limticks;
-
-                               limticks = vm_fault_ratelimit(curproc->p_vmspace);
-                               if (limticks) {
-                                       vm_object_pip_wakeup(fs->first_object);
-                                       vm_object_chain_release_all(
-                                               fs->first_object, fs->object);
-                                       if (fs->object != fs->first_object)
-                                               vm_object_drop(fs->object);
-                                       unlock_and_deallocate(fs);
-                                       tsleep(curproc, 0, "vmrate", limticks);
-                                       fs->didlimit = 1;
-                                       return (KERN_TRY_AGAIN);
-                               }
-                       }
-
-                       /*
                         * Allocate a new page for this object/offset pair.
                         *
                         * It is possible for the allocation to race, so
@@ -1798,40 +1773,6 @@ vm_fault_unwire(vm_map_t map, vm_map_entry_t entry)
 }
 
 /*
- * Reduce the rate at which memory is allocated to a process based
- * on the perceived load on the VM system. As the load increases
- * the allocation burst rate goes down and the delay increases. 
- *
- * Rate limiting does not apply when faulting active or inactive
- * pages.  When faulting 'cache' pages, rate limiting only applies
- * if the system currently has a severe page deficit.
- *
- * XXX vm_pagesupply should be increased when a page is freed.
- *
- * We sleep up to 1/10 of a second.
- */
-static int
-vm_fault_ratelimit(struct vmspace *vmspace)
-{
-       if (vm_load_enable == 0)
-               return(0);
-       if (vmspace->vm_pagesupply > 0) {
-               --vmspace->vm_pagesupply;       /* SMP race ok */
-               return(0);
-       }
-#ifdef INVARIANTS
-       if (vm_load_debug) {
-               kprintf("load %-4d give %d pgs, wait %d, pid %-5d (%s)\n",
-                       vm_load, 
-                       (1000 - vm_load ) / 10, vm_load * hz / 10000,
-                       curproc->p_pid, curproc->p_comm);
-       }
-#endif
-       vmspace->vm_pagesupply = (1000 - vm_load) / 10;
-       return(vm_load * hz / 10000);
-}
-
-/*
  * Copy all of the pages from a wired-down map entry to another.
  *
  * The source and destination maps must be locked for write.
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index 6178152..328fd2e 100644
@@ -183,18 +183,6 @@ static int pageout_lock_miss;
 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
        CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
 
-int vm_load;
-SYSCTL_INT(_vm, OID_AUTO, vm_load,
-       CTLFLAG_RD, &vm_load, 0, "load on the VM system");
-int vm_load_enable = 1;
-SYSCTL_INT(_vm, OID_AUTO, vm_load_enable,
-       CTLFLAG_RW, &vm_load_enable, 0, "enable vm_load rate limiting");
-#ifdef INVARIANTS
-int vm_load_debug;
-SYSCTL_INT(_vm, OID_AUTO, vm_load_debug,
-       CTLFLAG_RW, &vm_load_debug, 0, "debug vm_load");
-#endif
-
 #define VM_PAGEOUT_PAGE_COUNT 16
 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
 
@@ -209,24 +197,6 @@ static void vm_req_vmdaemon (void);
 static void vm_pageout_page_stats(int q);
 
 /*
- * Update vm_load to slow down faulting processes.
- *
- * SMP races ok.
- * No requirements.
- */
-void
-vm_fault_ratecheck(void)
-{
-       if (vm_pages_needed) {
-               if (vm_load < 1000)
-                       ++vm_load;
-       } else {
-               if (vm_load > 0)
-                       --vm_load;
-       }
-}
-
-/*
  * vm_pageout_clean:
  *
  * Clean the page and remove it from the laundry.  The page must not be
@@ -1914,12 +1884,23 @@ vm_pageout_thread(void)
                active_shortage = vmstats.v_inactive_target -
                                  vmstats.v_inactive_count;
 
-               tmp = inactive_shortage;
-               if (tmp < vmstats.v_inactive_target / 10)
-                       tmp = vmstats.v_inactive_target / 10;
-               inactive_shortage -= delta1;
-               if (inactive_shortage <= 0 && active_shortage > tmp * 2)
-                       active_shortage = tmp * 2;
+               /*
+                * If we were unable to free sufficient inactive pages to
+                * satisfy the free/cache queue requirements then simply
+                * reaching the inactive target may not be good enough.
+                * Try to deactivate pages in excess of the target based
+                * on the shortfall.
+                *
+                * However to prevent thrashing the VM system do not
+                * deactivate more than an additional 1/10 the inactive
+                * target's worth of active pages.
+                */
+               if (delta1 < inactive_shortage) {
+                       tmp = (inactive_shortage - delta1) * 2;
+                       if (tmp > vmstats.v_inactive_target / 10)
+                               tmp = vmstats.v_inactive_target / 10;
+                       active_shortage += tmp;
+               }
 
                delta2 = 0;
                for (q = 0; q < PQ_MAXL2_SIZE; ++q) {
diff --git a/sys/vm/vm_pageout.h b/sys/vm/vm_pageout.h
index cc662ef..1b878b9 100644
@@ -84,9 +84,6 @@ extern int vm_page_max_wired;
 extern int vm_pages_needed;    /* should be some "event" structure */
 extern int vm_pageout_pages_needed;
 extern int vm_pageout_deficit;
-extern int vm_load;
-extern int vm_load_enable;
-extern int vm_load_debug;
 
 #define VM_PAGEOUT_ASYNC 0
 #define VM_PAGEOUT_SYNC 1