/*
* lwkt thread scheduler fair queueing
*/
- lwkt_fairq_schedulerclock(curthread);
+ lwkt_schedulerclock(curthread);
/*
* softticks are handled for all cpus
hardclock_softtick(gd);
/*
- * The LWKT scheduler will generally allow the current process to
- * return to user mode even if there are other runnable LWKT threads
- * running in kernel mode on behalf of a user process. This will
- * ensure that those other threads have an opportunity to run in
- * fairly short order (but not instantly).
- */
- need_lwkt_resched();
-
- /*
* ITimer handling is per-tick, per-cpu.
*
* We must acquire the per-process token in order for ksignal()
return(0);
PHOLD(p);
- lwkt_gettoken(&p->p_token);
+ if (lwkt_trytoken(&p->p_token) == FALSE) {
+ PRELE(p);
+ return(0);
+ }
p->p_swtime++;
FOREACH_LWP_IN_PROC(lp, p) {
return(0);
PHOLD(p);
- lwkt_gettoken(&p->p_token);
+ if (lwkt_trytoken(&p->p_token) == FALSE) {
+ PRELE(p);
+ return(0);
+ }
if (p->p_stat == SZOMB || p->p_limit == NULL) {
lwkt_reltoken(&p->p_token);
* Schedule the target thread. If the message flags contains MSGF_NORESCHED
* we tell the scheduler not to reschedule if td is at a higher priority.
*
- * This routine is called even if the thread is already scheduled so messages
- * without NORESCHED will cause the target thread to be rescheduled even if
- * prior messages did not.
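+ * This routine is called even if the thread is already scheduled.
+ *
+ * NOTE: The flags argument is currently ignored.  MSGF_NORESCHED is no
+ *	 longer honored and the target is always scheduled via
+ *	 lwkt_schedule().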
*/
static __inline
void
_lwkt_schedule_msg(thread_t td, int flags)
{
- if (flags & MSGF_NORESCHED)
- lwkt_schedule_noresched(td);
- else
- lwkt_schedule(td);
+ lwkt_schedule(td);
}
/*
static __int64_t preempt_hit = 0;
static __int64_t preempt_miss = 0;
static __int64_t preempt_weird = 0;
-static __int64_t token_contention_count __debugvar = 0;
+static __int64_t token_contention_count[TDPRI_MAX+1] __debugvar;
static int lwkt_use_spin_port;
static struct objcache *thread_cache;
static void lwkt_schedule_remote(void *arg, int arg2, struct intrframe *frame);
static void lwkt_setcpu_remote(void *arg);
#endif
-static void lwkt_fairq_accumulate(globaldata_t gd, thread_t td);
-static int lwkt_fairq_tick(globaldata_t gd, thread_t td);
extern void cpu_heavy_restore(void);
extern void cpu_lwkt_restore(void);
SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0,
"Number of preempted threads.");
#ifdef INVARIANTS
-SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count, CTLFLAG_RW,
- &token_contention_count, 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_00, CTLFLAG_RW,
+ &token_contention_count[0], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_01, CTLFLAG_RW,
+ &token_contention_count[1], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_02, CTLFLAG_RW,
+ &token_contention_count[2], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_03, CTLFLAG_RW,
+ &token_contention_count[3], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_04, CTLFLAG_RW,
+ &token_contention_count[4], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_05, CTLFLAG_RW,
+ &token_contention_count[5], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_06, CTLFLAG_RW,
+ &token_contention_count[6], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_07, CTLFLAG_RW,
+ &token_contention_count[7], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_08, CTLFLAG_RW,
+ &token_contention_count[8], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_09, CTLFLAG_RW,
+ &token_contention_count[9], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_10, CTLFLAG_RW,
+ &token_contention_count[10], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_11, CTLFLAG_RW,
+ &token_contention_count[11], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_12, CTLFLAG_RW,
+ &token_contention_count[12], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_13, CTLFLAG_RW,
+ &token_contention_count[13], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_14, CTLFLAG_RW,
+ &token_contention_count[14], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_15, CTLFLAG_RW,
+ &token_contention_count[15], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_16, CTLFLAG_RW,
+ &token_contention_count[16], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_17, CTLFLAG_RW,
+ &token_contention_count[17], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_18, CTLFLAG_RW,
+ &token_contention_count[18], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_19, CTLFLAG_RW,
+ &token_contention_count[19], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_20, CTLFLAG_RW,
+ &token_contention_count[20], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_21, CTLFLAG_RW,
+ &token_contention_count[21], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_22, CTLFLAG_RW,
+ &token_contention_count[22], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_23, CTLFLAG_RW,
+ &token_contention_count[23], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_24, CTLFLAG_RW,
+ &token_contention_count[24], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_25, CTLFLAG_RW,
+ &token_contention_count[25], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_26, CTLFLAG_RW,
+ &token_contention_count[26], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_27, CTLFLAG_RW,
+ &token_contention_count[27], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_28, CTLFLAG_RW,
+ &token_contention_count[28], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_29, CTLFLAG_RW,
+ &token_contention_count[29], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_30, CTLFLAG_RW,
+ &token_contention_count[30], 0, "spinning due to token contention");
+SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_31, CTLFLAG_RW,
+ &token_contention_count[31], 0, "spinning due to token contention");
#endif
static int fairq_enable = 0;
SYSCTL_INT(_lwkt, OID_AUTO, fairq_enable, CTLFLAG_RW,
&fairq_enable, 0, "Turn on fairq priority accumulators");
-static int fairq_bypass = 1;
+static int fairq_bypass = -1;
SYSCTL_INT(_lwkt, OID_AUTO, fairq_bypass, CTLFLAG_RW,
&fairq_bypass, 0, "Allow fairq to bypass td on token failure");
extern int lwkt_sched_debug;
td->td_flags &= ~TDF_RUNQ;
TAILQ_REMOVE(&gd->gd_tdrunq, td, td_threadq);
-
- gd->gd_fairq_total_pri -= td->td_pri;
if (TAILQ_FIRST(&gd->gd_tdrunq) == NULL)
atomic_clear_int(&gd->gd_reqflags, RQF_RUNNING);
-
- /*td->td_fairq_lticks = ticks;*/
}
}
td->td_flags |= TDF_RUNQ;
xtd = TAILQ_FIRST(&gd->gd_tdrunq);
if (xtd == NULL) {
- TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
- atomic_set_int(&gd->gd_reqflags, RQF_RUNNING);
+ TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
+ atomic_set_int(&gd->gd_reqflags, RQF_RUNNING);
} else {
- while (xtd && xtd->td_pri >= td->td_pri)
- xtd = TAILQ_NEXT(xtd, td_threadq);
- if (xtd)
- TAILQ_INSERT_BEFORE(xtd, td, td_threadq);
- else
- TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
+ while (xtd && xtd->td_pri >= td->td_pri)
+ xtd = TAILQ_NEXT(xtd, td_threadq);
+ if (xtd)
+ TAILQ_INSERT_BEFORE(xtd, td, td_threadq);
+ else
+ TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
}
- gd->gd_fairq_total_pri += td->td_pri;
/*
- * The thread might have been dequeued for a while, bump it's
- * fairq.
+ * Request a LWKT reschedule if we are now at the head of the queue.
*/
- if (td->td_fairq_lticks != ticks) {
- td->td_fairq_lticks = ticks;
- lwkt_fairq_accumulate(gd, td);
- }
+ if (TAILQ_FIRST(&gd->gd_tdrunq) == td)
+ need_lwkt_resched();
}
}
}
/*
- * Passive release (used to transition from user to kernel mode
- * when we block or switch rather then when we enter the kernel).
- * This function is NOT called if we are switching into a preemption
- * or returning from a preemption. Typically this causes us to lose
- * our current process designation (if we have one) and become a true
- * LWKT thread, and may also hand the current process designation to
- * another process and schedule thread.
+ * Release our current user process designation if we are blocking
+ * or if a user reschedule was requested.
+ *
+ * NOTE: This function is NOT called if we are switching into or
+ * returning from a preemption.
+ *
+ * NOTE: Releasing our current user process designation may cause
+ * it to be assigned to another thread, which in turn will
+ * cause us to block in the usched acquire code when we attempt
+ * to return to userland.
+ *
+ * NOTE: On SMP systems this can be very nasty when heavy token
+ * contention is present so we want to be careful not to
+ * release the designation gratuitously.
*/
- if (td->td_release)
+ if (td->td_release &&
+ (user_resched_wanted() || (td->td_flags & TDF_RUNQ) == 0)) {
td->td_release(td);
+ }
+ /*
+ * Release all tokens
+ */
crit_enter_gd(gd);
if (TD_TOKS_HELD(td))
lwkt_relalltokens(td);
* The interrupt may have woken a thread up, we need to properly
* set the reschedule flag if the originally interrupted thread is
* at a lower priority.
+ *
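+ * The interrupt thread may also still be runnable (it may not have
+ * descheduled itself), so request a reschedule whenever ntd is not
+ * at the head of the run queue.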
*/
- if (TAILQ_FIRST(&gd->gd_tdrunq) &&
- TAILQ_FIRST(&gd->gd_tdrunq)->td_pri > ntd->td_pri) {
+ if (TAILQ_FIRST(&gd->gd_tdrunq) != ntd)
need_lwkt_resched();
- }
- /* YYY release mp lock on switchback if original doesn't need it */
goto havethread_preempted;
}
/*
- * Update the fairq accumulator if we are switching away in a
- * different tick.
- */
- lwkt_fairq_tick(gd, td);
-
- /*
- * Implement round-robin fairq with priority insertion. The priority
- * insertion is handled by _lwkt_enqueue()
- *
* If we cannot obtain ownership of the tokens we cannot immediately
* schedule the target thread.
*
* the current thread has been descheduled.
*/
for (;;) {
- /*
- * We have already docked the current thread. If we get stuck in a
- * scheduler switching loop we do not want to dock it over and over
- * again. Reset lticks.
- */
- if (td != &gd->gd_idlethread)
- td->td_fairq_lticks = ticks;
-
clear_lwkt_resched();
/*
* Hotpath - pull the head of the run queue and attempt to schedule
- * it. Fairq exhaustion moves the task to the end of the list. If
- * no threads are runnable we switch to the idle thread.
+ * it.
*/
for (;;) {
ntd = TAILQ_FIRST(&gd->gd_tdrunq);
goto haveidle;
}
break;
-
-#if 0
- if (ntd->td_fairq_accum >= 0)
- break;
-
- /*splz_check(); cannot do this here, see above */
- lwkt_fairq_accumulate(gd, ntd);
- TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq);
- TAILQ_INSERT_TAIL(&gd->gd_tdrunq, ntd, td_threadq);
-#endif
}
/*
* What we do now is try to find a thread that we can schedule
* in its stead.
*
- * The coldpath scan does NOT rearrange threads in the run list
- * and it also ignores the accumulator. We locate the thread with
- * the highest accumulator value (positive or negative), then the
- * next highest, and so forth. This isn't the most efficient but
- * will theoretically try to schedule one thread per pass which
- * is not horrible.
- *
- * If the accumulator for the selected thread happens to be negative
- * the timer interrupt will come along and ask for another reschedule
- * within 1 tick.
- *
- * NOTE: This scan will also include threads whos fairq's were
- * accumulated in the first loop.
+ * The coldpath scan does NOT rearrange threads in the run list.
+ * The lwkt_schedulerclock() will assert need_lwkt_resched() on
+ * the next tick whenever the current head is not the current thread.
*/
#ifdef INVARIANTS
- ++token_contention_count;
+ ++token_contention_count[ntd->td_pri];
+ ++ntd->td_contended;
#endif
- if (fairq_bypass)
+ if (fairq_bypass > 0)
goto skip;
- need_lwkt_resched();
xtd = NULL;
while ((ntd = TAILQ_NEXT(ntd, td_threadq)) != NULL) {
-#if 0
- if (ntd->td_fairq_accum < 0)
- continue;
- if (xtd == NULL || ntd->td_pri > xtd->td_pri)
- xtd = ntd;
-#endif
+ /*
+ * Never schedule threads returning to userland or the
+ * user thread scheduler helper thread when higher priority
+ * threads are present.
+ */
+ if (ntd->td_pri < TDPRI_KERN_LPSCHED) {
+ ntd = NULL;
+ break;
+ }
+
+ /*
+ * Try this one.
+ */
if (TD_TOKS_NOT_HELD(ntd) ||
lwkt_getalltokens(ntd, (spinning >= lwkt_spin_loops))) {
goto havethread;
}
- }
-#if 0
- if (xtd) {
- if (TD_TOKS_NOT_HELD(xtd) ||
- lwkt_getalltokens(xtd, (spinning >= lwkt_spin_loops)))
- {
- ntd = xtd;
- goto havethread;
- }
- }
+#ifdef INVARIANTS
+ ++token_contention_count[ntd->td_pri];
+ ++ntd->td_contended;
#endif
-
-#if 0
- if (fairq_bypass)
- goto skip;
-
- xtd = NULL;
- while ((ntd = TAILQ_NEXT(ntd, td_threadq)) != NULL) {
- /*
- * Try to switch to this thread. Kernel threads have priority
- * over user threads in this case.
- */
- if (ntd->td_pri < TDPRI_KERN_LPSCHED) {
- if (xtd == NULL)
- xtd = ntd;
- continue;
- }
-
- if (TD_TOKS_NOT_HELD(ntd) ||
- lwkt_getalltokens(ntd, (spinning >= lwkt_spin_loops)))
- {
- goto havethread;
- }
- /* thread contested, try another */
}
- /*
- * We exhausted the run list but we may have recorded a user
- * thread to try.
- */
- if (xtd) {
- ntd = xtd;
- if ((gd->gd_reqflags & RQF_AST_LWKT_RESCHED) == 0 &&
- (TD_TOKS_NOT_HELD(ntd) ||
- lwkt_getalltokens(ntd, (spinning >= lwkt_spin_loops)))
- ) {
- goto havethread;
- }
- }
-#endif
-
skip:
/*
* We exhausted the run list, meaning that all runnable threads
* ordering the tokens by address.
*/
if ((td->td_flags & TDF_RUNQ) == 0) {
- need_lwkt_resched();
+ need_lwkt_resched(); /* prevent hlt */
goto haveidle;
}
#if defined(INVARIANTS) && defined(__amd64__)
havethread:
/*
- * The thread may have been sitting in the runq for a while, be sure
- * to reset td_fairq_lticks to avoid an improper scheduling tick against
- * the thread if it gets dequeued again quickly.
- *
- * We must always decrement td_fairq_accum on non-idle threads just
- * in case a thread never gets a tick due to being in a continuous
- * critical section. The page-zeroing code does this, for example.
- */
- /* ntd->td_fairq_lticks = ticks; */
- --ntd->td_fairq_accum;
- if (ntd->td_fairq_accum < -TDFAIRQ_MAX(gd))
- ntd->td_fairq_accum = -TDFAIRQ_MAX(gd);
-
- /*
* If the thread we came up with is a higher or equal priority verses
* the thread at the head of the queue we move our thread to the
* front. This way we can always check the front of the queue.
*/
ntd->td_wmesg = NULL;
++gd->gd_cnt.v_swtch;
+#if 0
xtd = TAILQ_FIRST(&gd->gd_tdrunq);
if (ntd != xtd && ntd->td_pri >= xtd->td_pri) {
TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq);
TAILQ_INSERT_HEAD(&gd->gd_tdrunq, ntd, td_threadq);
}
+#endif
gd->gd_idle_repeat = 0;
havethread_preempted:
*/
splz_check();
#endif
-
/* NOTE: current cpu may have changed after switch */
crit_exit_quick(td);
}
*
* YYY The target thread must be in a critical section (else it must
* inherit our critical section? I dunno yet).
- *
- * Set need_lwkt_resched() unconditionally for now YYY.
*/
KASSERT(ntd->td_critcount, ("BADCRIT0 %d", ntd->td_pri));
td = gd->gd_curthread;
if (preempt_enable == 0) {
+#if 0
if (ntd->td_pri > td->td_pri)
need_lwkt_resched();
+#endif
++preempt_miss;
return;
}
}
if (td->td_critcount > critcount) {
++preempt_miss;
+#if 0
need_lwkt_resched();
+#endif
return;
}
#ifdef SMP
if (ntd->td_gd != gd) {
++preempt_miss;
+#if 0
need_lwkt_resched();
+#endif
return;
}
#endif
if (TD_TOKS_HELD(ntd)) {
++preempt_miss;
+#if 0
need_lwkt_resched();
+#endif
return;
}
if (td == ntd || ((td->td_flags | ntd->td_flags) & TDF_PREEMPT_LOCK)) {
++preempt_weird;
+#if 0
need_lwkt_resched();
+#endif
return;
}
if (ntd->td_preempted) {
++preempt_hit;
+#if 0
need_lwkt_resched();
+#endif
return;
}
KKASSERT(gd->gd_processing_ipiq == 0);
{
globaldata_t gd = mycpu;
thread_t td = gd->gd_curthread;
- thread_t xtd;
if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2)
splz();
- if (td->td_fairq_accum < 0) {
+ if (lwkt_resched_wanted()) {
lwkt_schedule_self(curthread);
lwkt_switch();
- } else {
- xtd = TAILQ_FIRST(&gd->gd_tdrunq);
- if (xtd && xtd->td_pri > td->td_pri) {
- lwkt_schedule_self(curthread);
- lwkt_switch();
- }
}
}
* quantum has run out.
*/
if (lwkt_resched_wanted() ||
- user_resched_wanted() ||
- td->td_fairq_accum < 0)
+ user_resched_wanted())
{
lwkt_switch();
}
* It is possible for this routine to be called after a failed _enqueue
* (due to the target thread migrating, sleeping, or otherwise blocked).
* We have to check that the thread is actually on the run queue!
- *
- * reschedok is an optimized constant propagated from lwkt_schedule() or
- * lwkt_schedule_noresched(). By default it is non-zero, causing a
- * reschedule to be requested if the target thread has a higher priority.
- * The port messaging code will set MSG_NORESCHED and cause reschedok to
- * be 0, prevented undesired reschedules.
*/
static __inline
void
-_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int ccount, int reschedok)
+_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int ccount)
{
- thread_t otd;
-
if (ntd->td_flags & TDF_RUNQ) {
- if (ntd->td_preemptable && reschedok) {
+ if (ntd->td_preemptable) {
ntd->td_preemptable(ntd, ccount); /* YYY +token */
- } else if (reschedok) {
- otd = curthread;
- if (ntd->td_pri > otd->td_pri)
- need_lwkt_resched();
}
-
- /*
- * If we are in a different tick give the thread a cycle advantage.
- * This is primarily to avoid a degenerate case for interrupt threads
- * where accumulator crosses into negative territory unnecessarily.
- */
- if (ntd->td_fairq_lticks != ticks)
- lwkt_fairq_accumulate(gd, ntd);
}
}
static __inline
void
-_lwkt_schedule(thread_t td, int reschedok)
+_lwkt_schedule(thread_t td)
{
globaldata_t mygd = mycpu;
#ifdef SMP
if (td->td_gd == mygd) {
_lwkt_enqueue(td);
- _lwkt_schedule_post(mygd, td, 1, reschedok);
+ _lwkt_schedule_post(mygd, td, 1);
} else {
lwkt_send_ipiq3(td->td_gd, lwkt_schedule_remote, td, 0);
}
#else
_lwkt_enqueue(td);
- _lwkt_schedule_post(mygd, td, 1, reschedok);
+ _lwkt_schedule_post(mygd, td, 1);
#endif
}
crit_exit_gd(mygd);
void
lwkt_schedule(thread_t td)
{
- _lwkt_schedule(td, 1);
+ _lwkt_schedule(td);
}
void
-lwkt_schedule_noresched(thread_t td)
+lwkt_schedule_noresched(thread_t td) /* XXX not impl */
{
- _lwkt_schedule(td, 0);
+ _lwkt_schedule(td);
}
#ifdef SMP
if (frame && ntd->td_preemptable) {
crit_exit_noyield(td);
- _lwkt_schedule(ntd, 1);
+ _lwkt_schedule(ntd);
crit_enter_quick(td);
} else {
- _lwkt_schedule(ntd, 1);
+ _lwkt_schedule(ntd);
}
}
}
/*
- * 1/hz tick (typically 10ms) x TDFAIRQ_SCALE (typ 8) = 80ms full cycle.
- *
- * Example: two competing threads, same priority N. decrement by (2*N)
- * increment by N*8, each thread will get 4 ticks.
+ * hz tick scheduler clock for LWKT threads.
+ *
+ * td is the thread being charged for the tick; the run queue examined is
+ * the one belonging to td->td_gd.
+ *
+ * - If td is at the head of the run queue and is followed by at least one
+ *   thread of equal priority, td is moved behind the last of those
+ *   equal-priority threads (round-robin) and a LWKT reschedule is
+ *   requested.
+ * - If td is at the head of the run queue with no equal-priority
+ *   successor, nothing is done.
+ * - If td is not at the head of the run queue a LWKT reschedule is
+ *   requested on every tick.
*/
void
-lwkt_fairq_schedulerclock(thread_t td)
+lwkt_schedulerclock(thread_t td)
{
- globaldata_t gd;
+ globaldata_t gd = td->td_gd;
+ thread_t xtd;
- if (fairq_enable) {
- while (td) {
- gd = td->td_gd;
- lwkt_fairq_tick(gd, td);
- if (td->td_fairq_accum < 0)
- need_lwkt_resched();
- td = td->td_preempted;
+ if (TAILQ_FIRST(&gd->gd_tdrunq) == td) {
+ /*
+ * If the current thread is at the head of the runq shift it to the
+ * end of any equal-priority threads and request a LWKT reschedule
+ * if it moved.
+ */
+ xtd = TAILQ_NEXT(td, td_threadq);
+ if (xtd && xtd->td_pri == td->td_pri) {
+ TAILQ_REMOVE(&gd->gd_tdrunq, td, td_threadq);
+ while (xtd && xtd->td_pri == td->td_pri)
+ xtd = TAILQ_NEXT(xtd, td_threadq);
+ if (xtd)
+ TAILQ_INSERT_BEFORE(xtd, td, td_threadq);
+ else
+ TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
+ need_lwkt_resched();
}
+ } else {
+ /*
+ * If we scheduled a thread other than the one at the head of the
+ * queue always request a reschedule every tick.
+ */
+ need_lwkt_resched();
}
}
-static void
-lwkt_fairq_accumulate(globaldata_t gd, thread_t td)
-{
- td->td_fairq_accum += td->td_pri * TDFAIRQ_SCALE;
- if (td->td_fairq_accum > TDFAIRQ_MAX(td->td_gd))
- td->td_fairq_accum = TDFAIRQ_MAX(td->td_gd);
-}
-
-static int
-lwkt_fairq_tick(globaldata_t gd, thread_t td)
-{
- if (td->td_fairq_lticks != ticks && td != &gd->gd_idlethread) {
- td->td_fairq_lticks = ticks;
- td->td_fairq_accum -= gd->gd_fairq_total_pri;
- if (td->td_fairq_accum < -TDFAIRQ_MAX(gd))
- td->td_fairq_accum = -TDFAIRQ_MAX(gd);
- return TRUE;
- }
- return FALSE;
-}
-
/*
* Migrate the current thread to the specified cpu.
*
ref->tr_owner = td;
}
+/*
+ * See kern/kern_spinlock.c for the discussion on cache-friendly contention
+ * resolution. We currently do not use cpu_lfence() (expensive!!) and, more
+ * importantly, we do a read-test of t_ref before attempting an atomic op,
+ * which greatly reduces hw cache bus contention.
+ */
static
int
_lwkt_trytoken_spin(lwkt_token_t tok, lwkt_tokref_t ref)
if (lwkt_token_delay) {
tsc_delay(lwkt_token_delay);
} else {
- cpu_lfence();
cpu_pause();
}
}
*/
if (_lwkt_trytoken_spin(tok, scan))
break;
- if (lwkt_sched_debug)
- kprintf("toka %p %s\n", tok, tok->t_desc);
+ if (lwkt_sched_debug > 0) {
+ --lwkt_sched_debug;
+ kprintf("toka %p %s %s\n",
+ tok, tok->t_desc, td->td_comm);
+ }
/*
* Otherwise we failed to acquire all the tokens.
*/
if (_lwkt_trytoken_spin(tok, scan))
break;
- if (lwkt_sched_debug)
- kprintf("tokb %p %s\n", tok, tok->t_desc);
+ if (lwkt_sched_debug > 0) {
+ --lwkt_sched_debug;
+ kprintf("tokb %p %s %s\n",
+ tok, tok->t_desc, td->td_comm);
+ }
/*
* Tokens are released in reverse order to reduce
{
globaldata_t gd;
bsd4_pcpu_t dd;
+#if 0
struct lwp *olp;
+#endif
crit_enter();
bsd4_recalculate_estcpu(lp);
/*
* Loop until we are the current user thread
*/
+ gd = mycpu;
+ dd = &bsd4_pcpu[gd->gd_cpuid];
+
do {
/*
- * Reload after a switch or setrunqueue/switch possibly
- * moved us to another cpu.
+ * Process any pending events and higher priority threads.
*/
- /*clear_lwkt_resched();*/
- gd = mycpu;
- dd = &bsd4_pcpu[gd->gd_cpuid];
- cpu_pause();
+ lwkt_yield();
/*
* Become the currently scheduled user thread for this cpu
dd->upri = lp->lwp_priority;
} else if (dd->upri > lp->lwp_priority) {
/*
- * We can steal the current lwp designation from the
- * olp that was previously assigned to this cpu.
+ * We can steal the current cpu's lwp designation
+ * away simply by replacing it. The other thread
+ * will stall when it tries to return to userland.
*/
- olp = dd->uschedcp;
dd->uschedcp = lp;
dd->upri = lp->lwp_priority;
+ /*
lwkt_deschedule(olp->lwp_thread);
bsd4_setrunqueue(olp);
+ */
} else {
/*
* We cannot become the current lwp, place the lp
* on the bsd4 run-queue and deschedule ourselves.
+ *
+ * When we are reactivated we will have another
+ * chance.
*/
lwkt_deschedule(lp->lwp_thread);
bsd4_setrunqueue(lp);
lwkt_switch();
+ /*
+ * Reload after a switch or setrunqueue/switch possibly
+ * moved us to another cpu.
+ */
+ gd = mycpu;
+ dd = &bsd4_pcpu[gd->gd_cpuid];
}
-
- /*
- * Because we are in a critical section interrupts may wind
- * up pending and prevent an interrupt thread from being
- * scheduled, we have to run splz() unconditionally to
- * ensure that these folks are properly scheduled so we can
- * then test the LWKT thread reschedule flag.
- *
- * Other threads at our current user priority have already
- * put in their bids, but we must run any kernel threads
- * at higher priorities, and we could lose our bid to
- * another thread trying to return to user mode in the
- * process.
- *
- * If we lose our bid we will be descheduled and put on
- * the run queue. When we are reactivated we will have
- * another chance.
- */
- splz();
- if (lwkt_resched_wanted())
- lwkt_switch();
} while (dd->uschedcp != lp);
crit_exit();
{
globaldata_t gd;
bsd4_pcpu_t dd;
+ bsd4_pcpu_t tmpdd;
struct lwp *nlp;
cpumask_t mask;
int cpuid;
smp_active_mask;
if (tmpmask) {
tmpid = BSFCPUMASK(tmpmask);
- gd = globaldata_find(cpuid);
- dd = &bsd4_pcpu[cpuid];
+ tmpdd = &bsd4_pcpu[tmpid];
atomic_clear_cpumask(&bsd4_rdyprocmask,
CPUMASK(tmpid));
spin_unlock(&bsd4_spin);
- lwkt_schedule(&dd->helper_thread);
+ lwkt_schedule(&tmpdd->helper_thread);
} else {
spin_unlock(&bsd4_spin);
}
*/
spin_unlock(&bsd4_spin);
}
+
+ /*
+ * We're descheduled unless someone scheduled us. Switch away.
+ * Exiting the critical section will cause splz() to be called
+ * for us if interrupts and such are pending.
+ */
crit_exit_gd(gd);
lwkt_switch();
}
}
/*
+ * Only system threads can use the interrupt reserve
+ */
+ if ((curthread->td_flags & TDF_SYSTHREAD) == 0) {
+ vm_wait(hz);
+ return(NULL);
+ }
+
+
+ /*
* Allocate and allow use of the interrupt reserve.
*
* If after all that we still can't allocate a VM page we are
struct lwkt_ipiq *gd_ipiq; /* array[ncpu] of ipiq's */
struct lwkt_ipiq gd_cpusyncq; /* ipiq for cpu synchro */
u_int gd_npoll; /* ipiq synchronization */
- int gd_fairq_total_pri;
+ int gd_unused01;
struct thread gd_unused02B;
struct thread gd_idlethread;
SLGlobalData gd_slab; /* slab allocator */
typedef struct globaldata *globaldata_t;
-#define RQB_IPIQ 0
-#define RQB_INTPEND 1
-#define RQB_AST_OWEUPC 2
-#define RQB_AST_SIGNAL 3
-#define RQB_AST_USER_RESCHED 4
-#define RQB_AST_LWKT_RESCHED 5
-#define RQB_AST_UPCALL 6
-#define RQB_TIMER 7
-#define RQB_RUNNING 8
-#define RQB_SPINNING 9
+#define RQB_IPIQ 0 /* 0001 */
+#define RQB_INTPEND 1 /* 0002 */
+#define RQB_AST_OWEUPC 2 /* 0004 */
+#define RQB_AST_SIGNAL 3 /* 0008 */
+#define RQB_AST_USER_RESCHED 4 /* 0010 */
+#define RQB_AST_LWKT_RESCHED 5 /* 0020 */
+#define RQB_AST_UPCALL 6 /* 0040 */
+#define RQB_TIMER 7 /* 0080 */
+#define RQB_RUNNING 8 /* 0100 */
+#define RQB_SPINNING 9 /* 0200 */
#define RQF_IPIQ (1 << RQB_IPIQ)
#define RQF_INTPEND (1 << RQB_INTPEND)
#define MSGF_QUEUED 0x0004 /* message has been queued sanitychk */
#define MSGF_SYNC 0x0008 /* synchronous message operation */
#define MSGF_INTRANSIT 0x0010 /* in-transit (IPI) */
-#define MSGF_NORESCHED 0x0020 /* do not reschedule target lwkt */
+#define MSGF_UNUSED0020 0x0020
#define MSGF_DROPABLE 0x0040 /* message supports drop */
#define MSGF_ABORTABLE 0x0080 /* message supports abort */
#define MSGF_PRIORITY 0x0100 /* priority message */
void *td_dsched_priv1; /* priv data for I/O schedulers */
int td_refs; /* hold position in gd_tdallq / hold free */
int td_nest_count; /* prevent splz nesting */
- int td_unused01[2]; /* for future fields */
+ int td_contended; /* token contention count */
+ int td_unused01[1]; /* for future fields */
#ifdef SMP
int td_cscount; /* cpu synchronization master */
#else
struct caps_kinfo *td_caps; /* list of client and server registrations */
lwkt_tokref_t td_toks_stop;
struct lwkt_tokref td_toks_array[LWKT_MAXTOKENS];
- int td_fairq_lticks; /* fairq wakeup accumulator reset */
- int td_fairq_accum; /* fairq priority accumulator */
+ int td_fairq_load; /* fairq */
+ int td_fairq_count; /* fairq */
struct globaldata *td_migrate_gd; /* target gd for thread migration */
const void *td_mplock_stallpc; /* last mplock stall address */
#ifdef DEBUG_CRIT_SECTIONS
#define TDF_KERNELFP 0x01000000 /* kernel using fp coproc */
#define TDF_UNUSED02000000 0x02000000
#define TDF_CRYPTO 0x04000000 /* crypto thread */
-#define TDF_MARKER 0x80000000 /* fairq marker thread */
/*
* Thread priorities. Typically only one thread from any given
#define TDPRI_INT_HIGH 29 /* high priority interrupt */
#define TDPRI_MAX 31
-/*
- * Scale is the approximate number of ticks for which we desire the
- * entire gd_tdrunq to get service. With hz = 100 a scale of 8 is 80ms.
- *
- * Setting this value too small will result in inefficient switching
- * rates.
- */
-#define TDFAIRQ_SCALE 8
-#define TDFAIRQ_MAX(gd) ((gd)->gd_fairq_total_pri * TDFAIRQ_SCALE)
-
#define LWKT_THREAD_STACK (UPAGES * PAGE_SIZE)
#define IN_CRITICAL_SECT(td) ((td)->td_critcount)
extern void lwkt_setpri(thread_t, int);
extern void lwkt_setpri_initial(thread_t, int);
extern void lwkt_setpri_self(int);
-extern void lwkt_fairq_schedulerclock(thread_t td);
-extern void lwkt_fairq_setpri_self(int pri);
-extern int lwkt_fairq_push(int pri);
-extern void lwkt_fairq_pop(int pri);
-extern void lwkt_fairq_yield(void);
+extern void lwkt_schedulerclock(thread_t td);
extern void lwkt_setcpu_self(struct globaldata *);
extern void lwkt_migratecpu(int);
struct vnode *vp;
};
-static int vm_fast_fault = 1;
-SYSCTL_INT(_vm, OID_AUTO, fast_fault, CTLFLAG_RW, &vm_fast_fault, 0,
- "Burst fault zero-fill regions");
static int debug_cluster = 0;
SYSCTL_INT(_vm, OID_AUTO, debug_cluster, CTLFLAG_RW, &debug_cluster, 0, "");
/*
* Allocate a new page for this object/offset pair.
+ *
+ * XXX for now don't use the VM_ALLOC_ZERO flag
+ * because this will continuously cycle pages
+ * through the cpu caches. Try to use a recently
+ * freed page.
*/
fs->m = NULL;
if (!vm_page_count_severe()) {
fs->m = vm_page_alloc(fs->object, pindex,
((fs->vp || fs->object->backing_object) ?
VM_ALLOC_NORMAL :
- VM_ALLOC_NORMAL | VM_ALLOC_ZERO));
+ VM_ALLOC_NORMAL /*| VM_ALLOC_ZERO*/));
}
if (fs->m == NULL) {
vm_object_pip_wakeup(fs->first_object);
* processes address space. It is a "cousin" of pmap_object_init_pt,
* except it runs at page fault time instead of mmap time.
*
+ * vm.fast_fault Enables pre-faulting zero-fill pages
+ *
+ * vm.prefault_pages Number of pages (1/2 negative, 1/2 positive) to
+ * prefault. Scan stops in either direction when
+ * a page is found to already exist.
+ *
* This code used to be per-platform pmap_prefault(). It is now
* machine-independent and enhanced to also pre-fault zero-fill pages
* (see vm.fast_fault) as well as make them writable, which greatly
*
* No other requirements.
*/
-#define PFBAK 4
-#define PFFOR 4
-#define PAGEORDER_SIZE (PFBAK+PFFOR)
-
-static int vm_prefault_pageorder[] = {
- -PAGE_SIZE, PAGE_SIZE,
- -2 * PAGE_SIZE, 2 * PAGE_SIZE,
- -3 * PAGE_SIZE, 3 * PAGE_SIZE,
- -4 * PAGE_SIZE, 4 * PAGE_SIZE
-};
+static int vm_prefault_pages = 8;
+SYSCTL_INT(_vm, OID_AUTO, prefault_pages, CTLFLAG_RW, &vm_prefault_pages, 0,
+ "Maximum number of pages to pre-fault");
+static int vm_fast_fault = 1;
+SYSCTL_INT(_vm, OID_AUTO, fast_fault, CTLFLAG_RW, &vm_fast_fault, 0,
+ "Burst fault zero-fill regions");
/*
* Set PG_NOSYNC if the map entry indicates so, but only if the page
{
struct lwp *lp;
vm_page_t m;
- vm_offset_t starta;
vm_offset_t addr;
vm_pindex_t index;
vm_pindex_t pindex;
vm_object_t object;
int pprot;
int i;
+ int noneg;
+ int nopos;
+ int maxpages;
+
+ /*
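+ * Snapshot the vm.prefault_pages sysctl once so the limit stays
+ * stable for the whole scan.  A value of zero or less disables
+ * pre-faulting entirely.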
+ */
+ maxpages = vm_prefault_pages;
+ cpu_ccfence();
+ if (maxpages <= 0)
+ return;
/*
* We do not currently prefault mappings that use virtual page
if (lp == NULL || (pmap != vmspace_pmap(lp->lwp_vmspace)))
return;
- starta = addra - PFBAK * PAGE_SIZE;
- if (starta < entry->start)
- starta = entry->start;
- else if (starta > addra)
- starta = 0;
+ /*
+ * Limit pre-fault count to 1024 pages.
+ */
+ if (maxpages > 1024)
+ maxpages = 1024;
object = entry->object.vm_object;
KKASSERT(object != NULL);
KKASSERT(object == entry->object.vm_object);
vm_object_chain_acquire(object);
- for (i = 0; i < PAGEORDER_SIZE; i++) {
+ noneg = 0;
+ nopos = 0;
+ for (i = 0; i < maxpages; ++i) {
vm_object_t lobject;
vm_object_t nobject;
int allocated = 0;
int error;
- addr = addra + vm_prefault_pageorder[i];
- if (addr > addra + (PFFOR * PAGE_SIZE))
- addr = 0;
-
- if (addr < starta || addr >= entry->end)
+ /*
+ * Calculate the page to pre-fault, stopping the scan in
+ * each direction separately if the limit is reached.
+ */
+ if (i & 1) {
+ if (noneg)
+ continue;
+ addr = addra - ((i + 1) >> 1) * PAGE_SIZE;
+ } else {
+ if (nopos)
+ continue;
+ addr = addra + ((i + 2) >> 1) * PAGE_SIZE;
+ }
+ if (addr < entry->start) {
+ noneg = 1;
+ if (noneg && nopos)
+ break;
+ continue;
+ }
+ if (addr >= entry->end) {
+ nopos = 1;
+ if (noneg && nopos)
+ break;
continue;
+ }
- if (pmap_prefault_ok(pmap, addr) == 0)
+ /*
+ * Skip pages already mapped, and stop scanning in that
+ * direction. When the scan terminates in both directions
+ * we are done.
+ */
+ if (pmap_prefault_ok(pmap, addr) == 0) {
+ if (i & 1)
+ noneg = 1;
+ else
+ nopos = 1;
+ if (noneg && nopos)
+ break;
continue;
+ }
/*
* Follow the VM object chain to obtain the page to be mapped
if (lobject->backing_object == NULL) {
if (vm_fast_fault == 0)
break;
- if (vm_prefault_pageorder[i] < 0 ||
- (prot & VM_PROT_WRITE) == 0 ||
+ if ((prot & VM_PROT_WRITE) == 0 ||
vm_page_count_min(0)) {
break;
}
/*
* NOTE: Allocated from base object
+ *
+ * XXX for now don't use the VM_ALLOC_ZERO
+ * flag because this will continuously
+ * cycle pages through the cpu caches.
+ * Try to use a recently freed page.
*/
m = vm_page_alloc(object, index,
- VM_ALLOC_NORMAL | VM_ALLOC_ZERO);
+ VM_ALLOC_NORMAL /*| VM_ALLOC_ZERO*/);
if ((m->flags & PG_ZERO) == 0) {
vm_page_zero_fill(m);
#endif
/*
- * The lock is a pool token, keep holding it across potential
- * wakeups to interlock the tsleep/wakeup.
+ * The lock is a pool token. No new holders should be possible once
+ * we drop hold_count 1->0, as there is no longer any way to reference
+ * the object.
*/
if (refcount_release(&obj->hold_count)) {
if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
zfree(obj_zone, obj);
- wakeup(obj);
}
vm_object_unlock(obj); /* uses pool token, ok to call on freed obj */
}
object->resident_page_count = 0;
object->agg_pv_list_count = 0;
object->shadow_count = 0;
+#ifdef SMP
+ /* cpu localization twist */
+ object->pg_color = (int)(intptr_t)curthread;
+#else
object->pg_color = next_index;
+#endif
if ( size > (PQ_L2_SIZE / 3 + PQ_PRIME1))
incr = PQ_L2_SIZE / 3 + PQ_PRIME1;
else
lwkt_gettoken(&vmobj_token);
TAILQ_REMOVE(&vm_object_list, object, object_list);
vm_object_count--;
- vm_object_dead_wakeup(object);
lwkt_reltoken(&vmobj_token);
+ vm_object_dead_wakeup(object);
if (object->ref_count != 0) {
panic("vm_object_terminate2: object with references, "
LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list);
source->shadow_count++;
source->generation++;
+#ifdef SMP
+ /* cpu localization twist */
+ result->pg_color = (int)(intptr_t)curthread;
+#else
result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) &
PQ_L2_MASK;
+#endif
}
/*
lwkt_gettoken(&vmobj_token);
TAILQ_REMOVE(&vm_object_list, backing_object, object_list);
vm_object_count--;
- vm_object_dead_wakeup(backing_object);
lwkt_reltoken(&vmobj_token);
+ vm_object_dead_wakeup(backing_object);
}
/*
static void vm_page_queue_init(void);
static void vm_page_free_wakeup(void);
-static vm_page_t vm_page_select_cache(vm_object_t, vm_pindex_t);
+static vm_page_t vm_page_select_cache(u_short pg_color);
static vm_page_t _vm_page_list_find2(int basequeue, int index);
static void _vm_page_deactivate_locked(vm_page_t m, int athead);
m->phys_addr = pa;
m->flags = 0;
m->pc = (pa >> PAGE_SHIFT) & PQ_L2_MASK;
+#ifdef SMP
+ /*
+ * Twist for cpu localization instead of page coloring.
+ */
+ m->pc ^= ((pa >> PAGE_SHIFT) / PQ_L2_SIZE) & PQ_L2_MASK;
+ m->pc ^= ((pa >> PAGE_SHIFT) / (PQ_L2_SIZE * PQ_L2_SIZE)) & PQ_L2_MASK;
+#endif
m->queue = m->pc + PQ_FREE;
KKASSERT(m->dirty == 0);
atomic_add_int(pq->cnt, -1);
pq->lcnt--;
m->queue = PQ_NONE;
+ vm_page_queues_spin_unlock(queue);
if ((queue - m->pc) == PQ_FREE && (m->flags & PG_ZERO))
atomic_subtract_int(&vm_page_zero_count, 1);
- vm_page_queues_spin_unlock(queue);
if ((queue - m->pc) == PQ_CACHE || (queue - m->pc) == PQ_FREE)
return (queue - m->pc);
}
m->object = object;
m->pindex = pindex;
vm_page_rb_tree_RB_INSERT(&object->rb_memq, m);
- atomic_add_int(&object->agg_pv_list_count, m->md.pv_list_count);
+ /* atomic_add_int(&object->agg_pv_list_count, m->md.pv_list_count); */
vm_page_spin_unlock(m);
/*
vm_page_spin_lock(m);
vm_page_rb_tree_RB_REMOVE(&object->rb_memq, m);
object->resident_page_count--;
- atomic_add_int(&object->agg_pv_list_count, -m->md.pv_list_count);
+ /* atomic_add_int(&object->agg_pv_list_count, -m->md.pv_list_count); */
m->object = NULL;
vm_page_spin_unlock(m);
* The page coloring optimization attempts to locate a page that does
* not overload other nearby pages in the object in the cpu's L1 or L2
* caches. We need this optimization because cpu caches tend to be
- * physical caches, while object spaces tend to be virtual. This optimization
- * also gives us multiple queues and spinlocks to worth with on SMP systems.
+ * physical caches, while object spaces tend to be virtual.
+ *
+ * On MP systems each PQ_FREE and PQ_CACHE color queue has its own spinlock
+ * and the algorithm is adjusted to localize allocations on a per-core basis.
+ * This is done by 'twisting' the colors.
*
* The page is returned spinlocked and removed from its queue (it will
* be on PQ_NONE), or NULL. The page is not PG_BUSY'd. The caller
*
*/
static vm_page_t
-vm_page_select_cache(vm_object_t object, vm_pindex_t pindex)
+vm_page_select_cache(u_short pg_color)
{
vm_page_t m;
for (;;) {
- m = _vm_page_list_find(PQ_CACHE,
- (pindex + object->pg_color) & PQ_L2_MASK,
- FALSE);
+ m = _vm_page_list_find(PQ_CACHE, pg_color & PQ_L2_MASK, FALSE);
if (m == NULL)
break;
/*
* This routine may not block.
*/
static __inline vm_page_t
-vm_page_select_free(vm_object_t object, vm_pindex_t pindex,
- boolean_t prefer_zero)
+vm_page_select_free(u_short pg_color, boolean_t prefer_zero)
{
vm_page_t m;
for (;;) {
- m = _vm_page_list_find(PQ_FREE,
- (pindex + object->pg_color) & PQ_L2_MASK,
+ m = _vm_page_list_find(PQ_FREE, pg_color & PQ_L2_MASK,
prefer_zero);
if (m == NULL)
break;
* vm_page_alloc()
*
* Allocate and return a memory cell associated with this VM object/offset
- * pair.
+ * pair. If object is NULL an unassociated page will be allocated.
*
* page_req classes:
*
* VM_ALLOC_INTERRUPT allow free list to be completely drained
* VM_ALLOC_ZERO advisory request for pre-zero'd page
*
- * The object must be locked.
- * This routine may not block.
+ * The object must be locked if it is not NULL.
+ * This routine may not block.
* The returned page will be marked PG_BUSY
*
* Additional special handling is required when called from an interrupt
vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req)
{
vm_page_t m = NULL;
+ u_short pg_color;
- KKASSERT(object != NULL);
- KASSERT(!vm_page_lookup(object, pindex),
- ("vm_page_alloc: page already allocated"));
+#ifdef SMP
+ /*
+ * Cpu twist - cpu localization algorithm
+ */
+ if (object) {
+ pg_color = mycpu->gd_cpuid + (pindex & ~ncpus_fit_mask) +
+ (object->pg_color & ~ncpus_fit_mask);
+ KASSERT(vm_page_lookup(object, pindex) == NULL,
+ ("vm_page_alloc: page already allocated"));
+ } else {
+ pg_color = mycpu->gd_cpuid + (pindex & ~ncpus_fit_mask);
+ }
+#else
+ /*
+ * Normal page coloring algorithm
+ */
+ if (object) {
+ pg_color = object->pg_color + pindex;
+ KASSERT(vm_page_lookup(object, pindex) == NULL,
+ ("vm_page_alloc: page already allocated"));
+ } else {
+ pg_color = pindex;
+ }
+#endif
KKASSERT(page_req &
(VM_ALLOC_NORMAL|VM_ALLOC_QUICK|
VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
* The free queue has sufficient free pages to take one out.
*/
if (page_req & VM_ALLOC_ZERO)
- m = vm_page_select_free(object, pindex, TRUE);
+ m = vm_page_select_free(pg_color, TRUE);
else
- m = vm_page_select_free(object, pindex, FALSE);
+ m = vm_page_select_free(pg_color, FALSE);
} else if (page_req & VM_ALLOC_NORMAL) {
/*
* Allocatable from the cache (non-interrupt only). On
" cache page from preempting interrupt\n");
m = NULL;
} else {
- m = vm_page_select_cache(object, pindex);
+ m = vm_page_select_cache(pg_color);
}
#else
- m = vm_page_select_cache(object, pindex);
+ m = vm_page_select_cache(pg_color);
#endif
/*
* On success move the page into the free queue and loop.
*
* NOTE: Inserting a page here does not insert it into any pmaps
* (which could cause us to block allocating memory).
+ *
+ * NOTE: If object is NULL an unassociated page is allocated and
+ * m->pindex can be used by the caller for any purpose.
*/
- vm_page_insert(m, object, pindex);
+ if (object)
+ vm_page_insert(m, object, pindex);
+ else
+ m->pindex = pindex;
/*
* Don't wakeup too often - wakeup the pageout daemon when
static __inline void
vm_page_free_zero(vm_page_t m)
{
-#ifdef __x86_64__
- /* JG DEBUG64 We check if the page is really zeroed. */
+#ifdef PMAP_DEBUG
char *p = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
int i;
void *
zalloc(vm_zone_t z)
{
+ globaldata_t gd = mycpu;
void *item;
#ifdef INVARIANTS
if (z == NULL)
zerror(ZONE_ERROR_INVALID);
#endif
+ /*
+ * Avoid spinlock contention by allocating from a per-cpu queue
+ */
+ if (z->zfreecnt_pcpu[gd->gd_cpuid] > 0) {
+ crit_enter_gd(gd);
+ if (z->zfreecnt_pcpu[gd->gd_cpuid] > 0) {
+ item = z->zitems_pcpu[gd->gd_cpuid];
+#ifdef INVARIANTS
+ KASSERT(item != NULL,
+ ("zitems_pcpu unexpectedly NULL"));
+ if (((void **)item)[1] != (void *)ZENTRY_FREE)
+ zerror(ZONE_ERROR_NOTFREE);
+ ((void **)item)[1] = 0;
+#endif
+ z->zitems_pcpu[gd->gd_cpuid] = ((void **) item)[0];
+ --z->zfreecnt_pcpu[gd->gd_cpuid];
+ z->znalloc++;
+ crit_exit_gd(gd);
+ return item;
+ }
+ crit_exit_gd(gd);
+ }
+
+ /*
+ * Per-zone spinlock for the remainder.
+ */
spin_lock(&z->zlock);
if (z->zfreecnt > z->zfreemin) {
item = z->zitems;
#ifdef INVARIANTS
KASSERT(item != NULL, ("zitems unexpectedly NULL"));
- if (((void **) item)[1] != (void *) ZENTRY_FREE)
+ if (((void **)item)[1] != (void *)ZENTRY_FREE)
zerror(ZONE_ERROR_NOTFREE);
- ((void **) item)[1] = 0;
+ ((void **)item)[1] = 0;
#endif
- z->zitems = ((void **) item)[0];
+ z->zitems = ((void **)item)[0];
z->zfreecnt--;
z->znalloc++;
spin_unlock(&z->zlock);
void
zfree(vm_zone_t z, void *item)
{
+ globaldata_t gd = mycpu;
+ int zmax;
+
+ /*
+ * Avoid spinlock contention by freeing into a per-cpu queue
+ */
+ if ((zmax = z->zmax) != 0)
+ zmax = zmax / ncpus / 16;
+ if (zmax < 64)
+ zmax = 64;
+
+ if (z->zfreecnt_pcpu[gd->gd_cpuid] < zmax) {
+ crit_enter_gd(gd);
+ ((void **)item)[0] = z->zitems_pcpu[gd->gd_cpuid];
+#ifdef INVARIANTS
+ if (((void **)item)[1] == (void *)ZENTRY_FREE)
+ zerror(ZONE_ERROR_ALREADYFREE);
+ ((void **)item)[1] = (void *)ZENTRY_FREE;
+#endif
+ z->zitems_pcpu[gd->gd_cpuid] = item;
+ ++z->zfreecnt_pcpu[gd->gd_cpuid];
+ crit_exit_gd(gd);
+ return;
+ }
+
+ /*
+ * Per-zone spinlock for the remainder.
+ */
spin_lock(&z->zlock);
- ((void **) item)[0] = z->zitems;
+ ((void **)item)[0] = z->zitems;
#ifdef INVARIANTS
- if (((void **) item)[1] == (void *) ZENTRY_FREE)
+ if (((void **)item)[1] == (void *)ZENTRY_FREE)
zerror(ZONE_ERROR_ALREADYFREE);
- ((void **) item)[1] = (void *) ZENTRY_FREE;
+ ((void **)item)[1] = (void *)ZENTRY_FREE;
#endif
z->zitems = item;
z->zfreecnt++;
lwkt_gettoken(&vm_token);
LIST_INSERT_HEAD(&zlist, z, zlink);
lwkt_reltoken(&vm_token);
+
+ bzero(z->zitems_pcpu, sizeof(z->zitems_pcpu));
+ bzero(z->zfreecnt_pcpu, sizeof(z->zfreecnt_pcpu));
}
z->zkmvec = NULL;
{
int i;
+ bzero(z->zitems_pcpu, sizeof(z->zitems_pcpu));
+ bzero(z->zfreecnt_pcpu, sizeof(z->zfreecnt_pcpu));
+
z->zname = name;
z->zsize = size;
z->zpagemax = 0;
bzero(item, nitems * z->zsize);
z->zitems = NULL;
for (i = 0; i < nitems; i++) {
- ((void **) item)[0] = z->zitems;
+ ((void **)item)[0] = z->zitems;
#ifdef INVARIANTS
- ((void **) item)[1] = (void *) ZENTRY_FREE;
+ ((void **)item)[1] = (void *)ZENTRY_FREE;
#endif
z->zitems = item;
item = (uint8_t *)item + z->zsize;
if (nitems != 0) {
nitems -= 1;
for (i = 0; i < nitems; i++) {
- ((void **) item)[0] = z->zitems;
+ ((void **)item)[0] = z->zitems;
#ifdef INVARIANTS
- ((void **) item)[1] = (void *) ZENTRY_FREE;
+ ((void **)item)[1] = (void *)ZENTRY_FREE;
#endif
z->zitems = item;
item = (uint8_t *)item + z->zsize;
z->znalloc++;
} else if (z->zfreecnt > 0) {
item = z->zitems;
- z->zitems = ((void **) item)[0];
+ z->zitems = ((void **)item)[0];
#ifdef INVARIANTS
- if (((void **) item)[1] != (void *) ZENTRY_FREE)
+ if (((void **)item)[1] != (void *)ZENTRY_FREE)
zerror(ZONE_ERROR_NOTFREE);
((void **) item)[1] = 0;
#endif
*/
typedef struct vm_zone {
struct spinlock zlock; /* lock for data structure */
+ void *zitems_pcpu[SMP_MAXCPU];
+ int zfreecnt_pcpu[SMP_MAXCPU];
void *zitems; /* linked list of items */
int zfreecnt; /* free entries */
int zfreemin; /* minimum number of free entries */