From: Matthew Dillon
Date: Mon, 24 Sep 2012 20:32:11 +0000 (-0700)
Subject: kernel - usched_dfly revamp (4), improve tail
X-Git-Tag: v3.2.0~75
X-Git-Url: https://gitweb.dragonflybsd.org/~nant/dragonfly.git/commitdiff_plain/d992c3771baa6d7578bf112b8dbea1326e461f01

kernel - usched_dfly revamp (4), improve tail

* Improve tail performance (many more cpu-bound processes than available
  cpus).

* Experiment with removing the LWKT priority adjustments for kernel vs user.
  Instead give LWKT a hint about the user scheduler when scheduling a thread.
  LWKT's round-robin is left unhinted to hopefully round-robin starved LWKTs
  running in kernel mode.

* Implement a better calculation for the per-thread uload than the priority.
  Instead, use estcpu.

* Adjust default weightings for the new uload calculation scale.
---
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index cf69eb35b9..22cedcd8c5 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -673,7 +673,11 @@ lwp_fork(struct lwp *origlp, struct proc *destproc, int flags)
 	td->td_proc = destproc;
 	td->td_lwp = lp;
 	td->td_switch = cpu_heavy_switch;
+#ifdef LWKT_SPLIT_USERPRI
 	lwkt_setpri(td, TDPRI_KERN_USER);
+#else
+	lwkt_setpri(td, TDPRI_USER_NORM);
+#endif
 	lwkt_set_comm(td, "%s", destproc->p_comm);
 
 	/*
diff --git a/sys/kern/lwkt_thread.c b/sys/kern/lwkt_thread.c
index 3501185b51..beaf7a0810 100644
--- a/sys/kern/lwkt_thread.c
+++ b/sys/kern/lwkt_thread.c
@@ -180,8 +180,17 @@ _lwkt_dequeue(thread_t td)
 /*
  * Priority enqueue.
  *
- * NOTE: There are a limited number of lwkt threads runnable since user
- *	 processes only schedule one at a time per cpu.
+ * There are a limited number of lwkt threads runnable since user
+ * processes only schedule one at a time per cpu.  However, there can
+ * be many user processes in kernel mode exiting from a tsleep() which
+ * become runnable.  We do a secondary comparison using td_upri to try
+ * to order these in the situation where several wake up at the same time
+ * to avoid excessive switching.
+ *
+ * NOTE: lwkt_schedulerclock() will force a round-robin based on td_pri and
+ *	 will ignore user priority.  This is to ensure that user threads in
+ *	 kernel mode get cpu at some point regardless of what the user
+ *	 scheduler thinks.
  */
 static __inline
 void
@@ -198,8 +207,12 @@ _lwkt_enqueue(thread_t td)
 		TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
 		atomic_set_int(&gd->gd_reqflags, RQF_RUNNING);
 	} else {
-		while (xtd && xtd->td_pri >= td->td_pri)
+		while (xtd &&
+		       (xtd->td_pri > td->td_pri ||
+			(xtd->td_pri == td->td_pri &&
+			 xtd->td_upri >= td->td_pri))) {
 			xtd = TAILQ_NEXT(xtd, td_threadq);
+		}
 		if (xtd)
 			TAILQ_INSERT_BEFORE(xtd, td, td_threadq);
 		else
@@ -706,6 +719,7 @@ lwkt_switch(void)
 		goto skip;
 
 	while ((ntd = TAILQ_NEXT(ntd, td_threadq)) != NULL) {
+#ifdef LWKT_SPLIT_USERPRI
 		/*
 		 * Never schedule threads returning to userland or the
 		 * user thread scheduler helper thread when higher priority
@@ -717,6 +731,7 @@ lwkt_switch(void)
 			ntd = NULL;
 			break;
 		}
+#endif
 
 		/*
 		 * Try this one.
@@ -1129,8 +1144,11 @@ lwkt_passive_release(struct thread *td)
 {
 	struct lwp *lp = td->td_lwp;
 
+#ifdef LWKT_SPLIT_USERPRI
 	td->td_release = NULL;
 	lwkt_setpri_self(TDPRI_KERN_USER);
+#endif
+
 	lp->lwp_proc->p_usched->release_curproc(lp);
 }
 
@@ -1497,6 +1515,10 @@ lwkt_schedulerclock(thread_t td)
 	 * If the current thread is at the head of the runq shift it to the
 	 * end of any equal-priority threads and request a LWKT reschedule
 	 * if it moved.
+	 *
+	 * Ignore upri in this situation.
There will only be one user thread + * in user mode, all others will be user threads running in kernel + * mode and we have to make sure they get some cpu. */ xtd = TAILQ_NEXT(td, td_threadq); if (xtd && xtd->td_pri == td->td_pri) { diff --git a/sys/kern/usched_bsd4.c b/sys/kern/usched_bsd4.c index 629be2e695..043b264d2c 100644 --- a/sys/kern/usched_bsd4.c +++ b/sys/kern/usched_bsd4.c @@ -1173,7 +1173,11 @@ bsd4_resetpriority(struct lwp *lp) * The newpriority incorporates the queue type so do a simple masked * check to determine if the process has moved to another queue. If * it has, and it is currently on a run queue, then move it. + * + * td_upri has normal sense (higher values are more desireable), so + * negate it. */ + lp->lwp_thread->td_upri = -(newpriority & ~PPQMASK); if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) { lp->lwp_priority = newpriority; if (lp->lwp_mpflags & LWP_MP_ONRUNQ) { diff --git a/sys/kern/usched_dfly.c b/sys/kern/usched_dfly.c index 121af38792..5c64851c33 100644 --- a/sys/kern/usched_dfly.c +++ b/sys/kern/usched_dfly.c @@ -94,6 +94,7 @@ TAILQ_HEAD(rq, lwp); #define lwp_rqindex lwp_usdata.dfly.rqindex #define lwp_estcpu lwp_usdata.dfly.estcpu #define lwp_estfast lwp_usdata.dfly.estfast +#define lwp_uload lwp_usdata.dfly.uload #define lwp_rqtype lwp_usdata.dfly.rqtype #define lwp_qcpu lwp_usdata.dfly.qcpu @@ -253,11 +254,12 @@ SYSCTL_INT(_debug, OID_AUTO, dfly_chooser, CTLFLAG_RW, #ifdef SMP static int usched_dfly_smt = 0; static int usched_dfly_cache_coherent = 0; -static int usched_dfly_weight1 = 50; /* keep thread on current cpu */ -static int usched_dfly_weight2 = 30; /* synchronous peer's current cpu */ -static int usched_dfly_weight3 = 10; /* number of threads on queue */ -static int usched_dfly_weight4 = 40; /* availability of idle cores */ +static int usched_dfly_weight1 = 200; /* keep thread on current cpu */ +static int usched_dfly_weight2 = 1200; /* synchronous peer's current cpu */ +static int usched_dfly_weight3 = 40; /* number of threads on queue */ +static int usched_dfly_weight4 = 160; /* availability of idle cores */ static int usched_dfly_features = 0x8F; /* allow pulls */ +static int usched_dfly_swmask = ~PPQMASK; /* allow pulls */ #endif static int usched_dfly_rrinterval = (ESTCPUFREQ + 9) / 10; static int usched_dfly_decay = 8; @@ -380,10 +382,12 @@ dfly_acquire_curproc(struct lwp *lp) * * It is important to do a masked test to avoid the edge * case where two near-equal-priority threads are constantly - * interrupting each other. + * interrupting each other. Since our context is the one + * that is active NOW, we WANT to steal the uschedcp + * designation and not switch-flap. */ if (dd->uschedcp && - (dd->upri & ~PPQMASK) > + (dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK)) { dd->uschedcp = lp; dd->upri = lp->lwp_priority; @@ -641,8 +645,7 @@ dfly_changeqcpu_locked(struct lwp *lp, dfly_pcpu_t dd, dfly_pcpu_t rdd) if (lp->lwp_qcpu != rdd->cpuid) { if (lp->lwp_mpflags & LWP_MP_ULOAD) { atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ULOAD); - atomic_add_int(&dd->uload, - -((lp->lwp_priority & ~PPQMASK) & PRIMASK)); + atomic_add_int(&dd->uload, -lp->lwp_uload); atomic_add_int(&dd->ucount, -1); atomic_add_int(&dfly_ucount, -1); } @@ -764,26 +767,37 @@ dfly_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp) dfly_resetpriority(lp); /* - * Rebalance cpus on each scheduler tick. Each cpu in turn will - * calculate the worst queue and, if sufficiently loaded, will - * pull a process from that queue into our current queue. 
+ * Rebalance two cpus every 8 ticks, pulling the worst thread + * from the worst cpu's queue into a rotating cpu number. * - * To try to avoid always moving the same thread. XXX + * This mechanic is needed because the push algorithms can + * steady-state in an non-optimal configuration. We need to mix it + * up a little, even if it means breaking up a paired thread, so + * the push algorithms can rebalance the degenerate conditions. + * This portion of the algorithm exists to ensure stability at the + * selected weightings. + * + * Because we might be breaking up optimal conditions we do not want + * to execute this too quickly, hence we only rebalance approximately + * ~7-8 times per second. The push's, on the otherhand, are capable + * moving threads to other cpus at a much higher rate. + * + * We choose the most heavily loaded thread from the worst queue + * in order to ensure that multiple heavy-weight threads on the same + * queue get broken up, and also because these threads are the most + * likely to be able to remain in place. Hopefully then any pairings, + * if applicable, migrate to where these threads are. */ #ifdef SMP if ((usched_dfly_features & 0x04) && - ((uint16_t)sched_ticks % ncpus) == gd->gd_cpuid) { + ((u_int)sched_ticks & 7) == 0 && + (u_int)sched_ticks / 8 % ncpus == gd->gd_cpuid) { /* * Our cpu is up. */ struct lwp *nlp; dfly_pcpu_t rdd; - /* - * We have to choose the worst thread in the worst queue - * because it likely finished its batch on that cpu and is - * now waiting for cpu again. - */ rdd = dfly_choose_worst_queue(dd); if (rdd) { spin_lock(&dd->spin); @@ -972,6 +986,7 @@ dfly_resetpriority(struct lwp *lp) int rcpu; int checkpri; int estcpu; + int delta_uload; crit_enter(); @@ -1028,6 +1043,20 @@ dfly_resetpriority(struct lwp *lp) /* NOT REACHED */ } + /* + * The LWKT scheduler doesn't dive usched structures, give it a hint + * on the relative priority of user threads running in the kernel. + * The LWKT scheduler will always ensure that a user thread running + * in the kernel will get cpu some time, regardless of its upri, + * but can decide not to instantly switch from one kernel or user + * mode user thread to a kernel-mode user thread when it has a less + * desireable user priority. + * + * td_upri has normal sense (higher values are more desireable), so + * negate it. + */ + lp->lwp_thread->td_upri = -(newpriority & usched_dfly_swmask); + /* * The newpriority incorporates the queue type so do a simple masked * check to determine if the process has moved to another queue. If @@ -1037,21 +1066,6 @@ dfly_resetpriority(struct lwp *lp) * we end up in the same run queue. */ if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) { - int delta_uload; - - /* - * uload can change, calculate the adjustment to reduce - * edge cases since choosers scan the cpu topology without - * locks. 
- */ - if (lp->lwp_mpflags & LWP_MP_ULOAD) { - delta_uload = - -((lp->lwp_priority & ~PPQMASK) & PRIMASK) + - ((newpriority & ~PPQMASK) & PRIMASK); - atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].uload, - delta_uload); - /* no change in ucount */ - } if (lp->lwp_mpflags & LWP_MP_ONRUNQ) { dfly_remrunqueue_locked(rdd, lp); lp->lwp_priority = newpriority; @@ -1074,6 +1088,15 @@ dfly_resetpriority(struct lwp *lp) rcpu = -1; } + /* + * Adjust effective load + */ + delta_uload = lp->lwp_estcpu / NQS; /* 0-511, 0-100% cpu */ + delta_uload -= lp->lwp_uload; + lp->lwp_uload += delta_uload; + if (lp->lwp_mpflags & LWP_MP_ULOAD) + atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].uload, delta_uload); + /* * Determine if we need to reschedule the target cpu. This only * occurs if the LWP is already on a scheduler queue, which means @@ -1094,7 +1117,8 @@ dfly_resetpriority(struct lwp *lp) if (rcpu >= 0) { if ((dfly_rdyprocmask & CPUMASK(rcpu)) && (checkpri == 0 || - (rdd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK))) { + (rdd->upri & ~PRIMASK) > + (lp->lwp_priority & ~PRIMASK))) { #ifdef SMP if (rcpu == mycpu->gd_cpuid) { spin_unlock(&rdd->spin); @@ -1183,8 +1207,7 @@ dfly_exiting(struct lwp *lp, struct proc *child_proc) if (lp->lwp_mpflags & LWP_MP_ULOAD) { atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ULOAD); - atomic_add_int(&dd->uload, - -((lp->lwp_priority & ~PPQMASK) & PRIMASK)); + atomic_add_int(&dd->uload, -lp->lwp_uload); atomic_add_int(&dd->ucount, -1); atomic_add_int(&dfly_ucount, -1); } @@ -1208,8 +1231,7 @@ dfly_uload_update(struct lwp *lp) if ((lp->lwp_mpflags & LWP_MP_ULOAD) == 0) { atomic_set_int(&lp->lwp_mpflags, LWP_MP_ULOAD); - atomic_add_int(&dd->uload, - ((lp->lwp_priority & ~PPQMASK) & PRIMASK)); + atomic_add_int(&dd->uload, lp->lwp_uload); atomic_add_int(&dd->ucount, 1); atomic_add_int(&dfly_ucount, 1); } @@ -1221,8 +1243,7 @@ dfly_uload_update(struct lwp *lp) if (lp->lwp_mpflags & LWP_MP_ULOAD) { atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ULOAD); - atomic_add_int(&dd->uload, - -((lp->lwp_priority & ~PPQMASK) & PRIMASK)); + atomic_add_int(&dd->uload, -lp->lwp_uload); atomic_add_int(&dd->ucount, -1); atomic_add_int(&dfly_ucount, -1); } @@ -1338,14 +1359,12 @@ dfly_chooseproc_locked(dfly_pcpu_t rdd, dfly_pcpu_t dd, */ if (rdd != dd) { if (lp->lwp_mpflags & LWP_MP_ULOAD) { - atomic_add_int(&rdd->uload, - -((lp->lwp_priority & ~PPQMASK) & PRIMASK)); + atomic_add_int(&rdd->uload, -lp->lwp_uload); atomic_add_int(&rdd->ucount, -1); atomic_add_int(&dfly_ucount, -1); } lp->lwp_qcpu = dd->cpuid; - atomic_add_int(&dd->uload, - ((lp->lwp_priority & ~PPQMASK) & PRIMASK)); + atomic_add_int(&dd->uload, lp->lwp_uload); atomic_add_int(&dd->ucount, 1); atomic_add_int(&dfly_ucount, 1); atomic_set_int(&lp->lwp_mpflags, LWP_MP_ULOAD); @@ -1477,8 +1496,7 @@ dfly_choose_best_queue(struct lwp *lp) */ if ((lp->lwp_mpflags & LWP_MP_ULOAD) && (dd->cpumask & cpun->members)) { - load -= ((lp->lwp_priority & ~PPQMASK) & - PRIMASK); + load -= lp->lwp_uload; load -= usched_dfly_weight3; } @@ -1562,8 +1580,10 @@ dfly_choose_worst_queue(dfly_pcpu_t dd) int n; int count; int load; +#if 0 int pri; int hpri; +#endif int highest_load; /* @@ -1661,6 +1681,7 @@ dfly_choose_worst_queue(dfly_pcpu_t dd) if (rdd == dd) return(NULL); +#if 0 hpri = 0; if (rdd->rtqueuebits && hpri < (pri = bsrl(rdd->rtqueuebits))) hpri = pri; @@ -1671,6 +1692,7 @@ dfly_choose_worst_queue(dfly_pcpu_t dd) hpri *= PPQ; if (rdd->uload - hpri < dd->uload + hpri) return(NULL); +#endif return (rdd); } @@ -1838,8 +1860,7 @@ dfly_setrunqueue_locked(dfly_pcpu_t rdd, 
struct lwp *lp) if ((lp->lwp_mpflags & LWP_MP_ULOAD) == 0) { atomic_set_int(&lp->lwp_mpflags, LWP_MP_ULOAD); - atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].uload, - (lp->lwp_priority & ~PPQMASK) & PRIMASK); + atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].uload, lp->lwp_uload); atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].ucount, 1); atomic_add_int(&dfly_ucount, 1); } @@ -1958,6 +1979,8 @@ dfly_helper_thread(void *dummy) * from another cpu. Since we're stealing, might as well * load balance at the same time. * + * We choose the highest-loaded thread from the worst queue. + * * NOTE! This function only returns a non-NULL rdd when * another cpu's queue is obviously overloaded. We * do not want to perform the type of rebalancing @@ -1968,7 +1991,7 @@ dfly_helper_thread(void *dummy) */ rdd = dfly_choose_worst_queue(dd); if (rdd && spin_trylock(&rdd->spin)) { - nlp = dfly_chooseproc_locked(rdd, dd, NULL, 0); + nlp = dfly_chooseproc_locked(rdd, dd, NULL, 1); spin_unlock(&rdd->spin); } else { nlp = NULL; @@ -2208,6 +2231,12 @@ dfly_helper_thread_cpu_init(void) &usched_dfly_features, 15, "Allow pulls into empty queues"); + SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx, + SYSCTL_CHILDREN(usched_dfly_sysctl_tree), + OID_AUTO, "swmask", CTLFLAG_RW, + &usched_dfly_swmask, ~PPQMASK, + "Queue mask to force thread switch"); + #if 0 SYSCTL_ADD_PROC(&usched_dfly_sysctl_ctx, diff --git a/sys/kern/usched_dummy.c b/sys/kern/usched_dummy.c index 50293ef956..3d89756923 100644 --- a/sys/kern/usched_dummy.c +++ b/sys/kern/usched_dummy.c @@ -401,6 +401,12 @@ dummy_resetpriority(struct lwp *lp) lp->lwp_priority = PRIBASE_THREAD + lp->lwp_rtprio.prio; return; } + + /* + * td_upri has normal sense (higher numbers are more desireable), + * so negate it. + */ + lp->lwp_thread->td_upri = -lp->lwp_priority; /* XXX spinlock usually needed */ } diff --git a/sys/sys/thread.h b/sys/sys/thread.h index cc1f794921..d60e2628e3 100644 --- a/sys/sys/thread.h +++ b/sys/sys/thread.h @@ -285,7 +285,8 @@ struct thread { int td_cscount_unused; #endif int td_wakefromcpu; /* who woke me up? 
*/ - int td_unused02[3]; /* for future fields */ + int td_upri; /* user priority (sub-priority under td_pri) */ + int td_unused02[2]; /* for future fields */ int td_unused03[4]; /* for future fields */ struct iosched_data td_iosdata; /* Dynamic I/O scheduling data */ struct timeval td_start; /* start time for a thread/process */ @@ -373,6 +374,7 @@ struct thread { #define TDF_KERNELFP 0x01000000 /* kernel using fp coproc */ #define TDF_DELAYED_WAKEUP 0x02000000 #define TDF_CRYPTO 0x04000000 /* crypto thread */ +#define TDF_USERMODE 0x08000000 /* in or entering user mode */ #define TDF_MP_STOPREQ 0x00000001 /* suspend_kproc */ #define TDF_MP_WAKEREQ 0x00000002 /* resume_kproc */ diff --git a/sys/sys/thread2.h b/sys/sys/thread2.h index 13bbff050e..0d8db6e101 100644 --- a/sys/sys/thread2.h +++ b/sys/sys/thread2.h @@ -261,9 +261,11 @@ lwkt_getpri_self(void) static __inline void lwkt_passive_recover(thread_t td) { +#ifdef LWKT_SPLIT_USERPRI if (td->td_release == NULL) lwkt_setpri_self(TDPRI_USER_NORM); td->td_release = NULL; +#endif } /* diff --git a/sys/sys/usched.h b/sys/sys/usched.h index bb9e9cd43a..82e7c5446d 100644 --- a/sys/sys/usched.h +++ b/sys/sys/usched.h @@ -64,7 +64,7 @@ union usched_data { char forked; /* lock cpu during fork */ char rqindex; short estfast; /* fast estcpu collapse mode */ - short unused01; + short uload; /* for delta uload adjustments */ int estcpu; /* dynamic priority modification */ u_short rqtype; /* protected copy of rtprio type */ u_short qcpu; /* which cpu are we enqueued on? */
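
The core of the uload rework is the new arithmetic in dfly_resetpriority():
lwp_uload is now derived from lwp_estcpu (lwp_estcpu / NQS, roughly 0-511 for
0-100% of a cpu) instead of from the masked lwp_priority, and only the delta
is folded into the per-cpu uload so the cached value can be subtracted back
out on dequeue, exit, or migration.  A minimal standalone sketch of that
delta logic, assuming NQS is 32 and using made-up estcpu figures (not the
kernel's authoritative constants):

/*
 * Sketch of the estcpu-based uload accounting (userland demo only).
 * The kernel performs the per-cpu update with atomic_add_int().
 */
#include <stdio.h>

#define NQS	32			/* run queue count, assumed for the demo */

struct demo_lwp {
	int	estcpu;			/* dynamic cpu usage estimate */
	int	uload;			/* cached load contribution */
};

static int demo_cpu_uload;		/* stands in for dfly_pcpu[qcpu].uload */

/* Re-derive the thread's load and fold only the delta into the cpu total */
static void
demo_reset_uload(struct demo_lwp *lp)
{
	int delta_uload;

	delta_uload = lp->estcpu / NQS;	/* ~0-511, i.e. 0-100% of one cpu */
	delta_uload -= lp->uload;
	lp->uload += delta_uload;
	demo_cpu_uload += delta_uload;
}

int
main(void)
{
	struct demo_lwp lp = { 0, 0 };

	lp.estcpu = 8000;		/* thread ramps up */
	demo_reset_uload(&lp);
	printf("uload=%d cpu_uload=%d\n", lp.uload, demo_cpu_uload);

	lp.estcpu = 2000;		/* thread cools off */
	demo_reset_uload(&lp);
	printf("uload=%d cpu_uload=%d\n", lp.uload, demo_cpu_uload);
	return 0;
}

Because every add/remove path in the diff (dfly_exiting(), dfly_uload_update(),
dfly_chooseproc_locked(), dfly_setrunqueue_locked()) now uses the same cached
lwp_uload, the per-cpu totals cannot drift when estcpu and priority disagree.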
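
The td_upri hint has the opposite sense of the user scheduler's lwp_priority
(lwp_priority is smaller-is-better, td_upri is larger-is-better), which is why
all three schedulers in the diff store a negated priority in td_upri.
_lwkt_enqueue() then consults the hint only to break ties between threads at
the same LWKT td_pri.  One quirk worth flagging: the tie-break in the hunk
above compares xtd->td_upri against td->td_pri; the sketch below assumes the
intended comparison is against td->td_upri.  The struct and sample values are
fabricated for illustration:

/*
 * Sketch of the two-level enqueue ordering (userland demo only).
 * td_pri still dominates; td_upri only orders equal-td_pri threads.
 */
#include <stdio.h>

struct demo_td {
	const char *comm;
	int	td_pri;		/* LWKT priority, higher runs first */
	int	td_upri;	/* usched hint, higher is more desirable */
};

/* Nonzero if 'newtd' should be queued ahead of 'curtd' */
static int
demo_queue_before(const struct demo_td *newtd, const struct demo_td *curtd)
{
	if (newtd->td_pri != curtd->td_pri)
		return (newtd->td_pri > curtd->td_pri);
	return (newtd->td_upri > curtd->td_upri);
}

int
main(void)
{
	/* two user threads waking up in kernel mode at the same td_pri */
	struct demo_td a = { "niced", 10, -40 };
	struct demo_td b = { "interactive", 10, -4 };

	printf("queue %s ahead of %s: %d\n",
	       b.comm, a.comm, demo_queue_before(&b, &a));
	return 0;
}

As the lwkt_schedulerclock() comment notes, the round-robin path deliberately
ignores the hint, so a user thread stuck in kernel mode still gets cpu even if
the user scheduler considers it undesirable.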
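
The dfly_schedulerclock() hunk replaces the old per-tick rebalance with a much
slower rotating pull: globally one rebalance every 8 scheduler ticks (the
"~7-8 times per second" cited in the comment), performed by a rotating cpu, so
any individual cpu only pulls from the worst queue once every 8 * ncpus ticks.
The sketch below just exercises that gating expression; the tick range and cpu
count are made up for the demo:

/* Sketch of the rotating rebalance gate (userland demo only) */
#include <stdio.h>

int
main(void)
{
	unsigned int sched_ticks;
	unsigned int ncpus = 4;		/* assumed cpu count for the demo */

	for (sched_ticks = 0; sched_ticks < 64; ++sched_ticks) {
		if ((sched_ticks & 7) != 0)
			continue;	/* only every 8th global tick */
		printf("tick %2u: cpu %u pulls from the worst queue\n",
		       sched_ticks, sched_ticks / 8 % ncpus);
	}
	return 0;
}

Keeping this path slow relative to the push algorithms is intentional: it
exists to break up degenerate steady states, and dfly_chooseproc_locked() is
now asked for the most heavily loaded thread on the worst queue (the final
argument changed from 0 to 1), the thread most likely to be able to remain in
place after the move.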