From c17a6852f80ce59ea0641ae01d0b3c52f0584101 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Fri, 11 Nov 2011 22:31:27 -0800
Subject: [PATCH] kernel - Fix IPI signaling issue, add a few assertions

* Change the low-level IPI code to physically disable interrupts when
  waiting for the ICR status bit to clear and issuing a new IPI.

  Circumstantial evidence suggests that on Intel cpus a LAPIC EOI can
  busy the command sequencer.  Thus, if interrupts are enabled inside a
  critical section, an interrupt which does nothing more than EOI the
  LAPIC and IRET can still prevent an IPI from being sent when it occurs
  at just the right moment during an IPI operation.

* Because IPIs are already limited to one per target cpu at any given
  moment via gd->gd_npoll, we can also do away with the ipiq polling
  code that was inside the ICR wait.

* Add a few assertions to try to catch other possible MP problems:
  assert that TDF_TSLEEPQ is not set when freeing a thread, that the
  cpusync ipiq does not overflow, and that the vm_object hold count
  does not go negative.

Reported-by: ftigeot
---
 sys/kern/lwkt_ipiq.c           | 10 ++++++
 sys/kern/lwkt_thread.c         |  3 +-
 sys/platform/pc64/apic/lapic.c | 64 ++++++++++++++++++----------------
 sys/vm/vm_object.c             |  1 +
 4 files changed, 46 insertions(+), 32 deletions(-)

diff --git a/sys/kern/lwkt_ipiq.c b/sys/kern/lwkt_ipiq.c
index 2235f5946f..a2580aa346 100644
--- a/sys/kern/lwkt_ipiq.c
+++ b/sys/kern/lwkt_ipiq.c
@@ -505,6 +505,12 @@ again:
         }
         mask &= ~CPUMASK(n);
     }
+
+    /*
+     * Process pending cpusyncs.  If the current thread has an active
+     * cpusync we only run the list once and do not re-flag it, as the
+     * thread itself is processing its interlock.
+     */
     if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, NULL)) {
         if (gd->gd_curthread->td_cscount == 0)
             goto again;
@@ -638,6 +644,9 @@ lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip,
      * to the loading of ip_rindex.  Even though stores might be
      * ordered, loads are probably not.  A memory fence is required
      * to prevent reordering of the loads after the ip_rindex update.
+     *
+     * NOTE: Single pass only.  Returns non-zero if the queue is not empty
+     *       on return.
      */
     while (wi - (ri = ip->ip_rindex) > 0) {
         ri &= MAXCPUFIFO_MASK;
@@ -888,6 +897,7 @@ lwkt_cpusync_remote2(lwkt_cpusync_t cs)
         ip->ip_info[wi].arg1 = cs;
         ip->ip_info[wi].arg2 = 0;
         cpu_sfence();
+        KKASSERT(ip->ip_windex - ip->ip_rindex < MAXCPUFIFO);
         ++ip->ip_windex;
         if (ipiq_debug && (ip->ip_windex & 0xFFFFFF) == 0) {
             kprintf("cpu %d cm=%016jx %016jx f=%p\n",
diff --git a/sys/kern/lwkt_thread.c b/sys/kern/lwkt_thread.c
index d91187664e..309457af64 100644
--- a/sys/kern/lwkt_thread.c
+++ b/sys/kern/lwkt_thread.c
@@ -537,7 +537,8 @@ void
 lwkt_free_thread(thread_t td)
 {
     KKASSERT(td->td_refs == 0);
-    KKASSERT((td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK|TDF_RUNQ)) == 0);
+    KKASSERT((td->td_flags & (TDF_RUNNING | TDF_PREEMPT_LOCK |
+                              TDF_RUNQ | TDF_TSLEEPQ)) == 0);
     if (td->td_flags & TDF_ALLOCATED_THREAD) {
         objcache_put(thread_cache, td);
     } else if (td->td_flags & TDF_ALLOCATED_STACK) {
diff --git a/sys/platform/pc64/apic/lapic.c b/sys/platform/pc64/apic/lapic.c
index 0df8ab44f6..93b89ddcb9 100644
--- a/sys/platform/pc64/apic/lapic.c
+++ b/sys/platform/pc64/apic/lapic.c
@@ -514,65 +514,67 @@ apic_dump(char* str)
  * vector is any valid SYSTEM INT vector
  * delivery_mode is 1 of: APIC_DELMODE_FIXED, APIC_DELMODE_LOWPRIO
  *
- * A backlog of requests can create a deadlock between cpus.  To avoid this
- * we have to be able to accept IPIs at the same time we are trying to send
- * them.  The critical section prevents us from attempting to send additional
- * IPIs reentrantly, but also prevents IPIQ processing so we have to call
- * lwkt_process_ipiq() manually.  It's rather messy and expensive for this
- * to occur but fortunately it does not happen too often.
+ * WARNINGS!
+ *
+ * We now implement a per-cpu interlock (gd->gd_npoll) to prevent more than
+ * one IPI from being sent to any given cpu at a time.  Thus we no longer
+ * have to process incoming IPIs while waiting for the status to clear.
+ * No deadlock should be possible.
+ *
+ * We now physically disable interrupts for the lapic ICR operation.  If
+ * we do not do this then it looks like an EOI sent to the lapic (which
+ * occurs even with a critical section) can interfere with the command
+ * register ready status and cause an IPI to be lost.
+ *
+ * e.g. an interrupt can occur, issue the EOI, IRET, and cause the command
+ * register to go busy just before we write to icr_lo, resulting in a lost
+ * issuance.  This only appears to occur on Intel cpus and is not
+ * documented.  It could simply be that cpus are so fast these days that
+ * it was always an issue, but is only now rearing its ugly head.  This
+ * is conjecture.
  */
 int
 apic_ipi(int dest_type, int vector, int delivery_mode)
 {
+    unsigned long rflags;
     u_long icr_lo;
 
-    crit_enter();
-    if ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
-        unsigned long rflags = read_rflags();
-        cpu_enable_intr();
-        DEBUG_PUSH_INFO("apic_ipi");
-        while ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
-            lwkt_process_ipiq();
-        }
-        DEBUG_POP_INFO();
-        write_rflags(rflags);
+    rflags = read_rflags();
+    cpu_disable_intr();
+    while ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
+        cpu_pause();
     }
-
     icr_lo = (lapic->icr_lo & APIC_ICRLO_RESV_MASK) | dest_type |
              delivery_mode | vector;
     lapic->icr_lo = icr_lo;
-    crit_exit();
+    write_rflags(rflags);
+
     return 0;
 }
 
 void
 single_apic_ipi(int cpu, int vector, int delivery_mode)
 {
+    unsigned long rflags;
     u_long icr_lo;
     u_long icr_hi;
 
-    crit_enter();
-    if ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
-        unsigned long rflags = read_rflags();
-        cpu_enable_intr();
-        DEBUG_PUSH_INFO("single_apic_ipi");
-        while ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
-            lwkt_process_ipiq();
-        }
-        DEBUG_POP_INFO();
-        write_rflags(rflags);
+    rflags = read_rflags();
+    cpu_disable_intr();
+    while ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
+        cpu_pause();
     }
     icr_hi = lapic->icr_hi & ~APIC_ID_MASK;
     icr_hi |= (CPUID_TO_APICID(cpu) << 24);
     lapic->icr_hi = icr_hi;
 
     /* build ICR_LOW */
-    icr_lo = (lapic->icr_lo & APIC_ICRLO_RESV_MASK)
-        | APIC_DEST_DESTFLD | delivery_mode | vector;
+    icr_lo = (lapic->icr_lo & APIC_ICRLO_RESV_MASK) |
+             APIC_DEST_DESTFLD | delivery_mode | vector;
 
     /* write APIC ICR */
     lapic->icr_lo = icr_lo;
-    crit_exit();
+    write_rflags(rflags);
 }
 
 #if 0
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index 85e2a33e47..e29b1e281b 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -255,6 +255,7 @@ vm_object_drop(vm_object_t obj)
      * we drop hold_count 1->0 as there is no longer any way to reference
      * the object.
      */
+    KKASSERT(obj->hold_count > 0);
     if (refcount_release(&obj->hold_count)) {
         if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
             zfree(obj_zone, obj);
-- 
2.41.0
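
Editor's addendum (not part of the patch): below is a minimal, compilable
sketch of the send discipline the patch introduces in apic_ipi() and
single_apic_ipi().  The LAPIC register and the read_rflags(),
cpu_disable_intr(), cpu_pause() and write_rflags() primitives are stubbed
so the pattern runs in userspace; in the kernel they are the real RFLAGS
save, CLI, PAUSE and RFLAGS-restore operations, and the ICR is a
memory-mapped LAPIC register.  What matters is the ordering: interrupts
stay hard-disabled across both the delivery-status busy-wait and the
icr_lo write, so no interrupt (and therefore no LAPIC EOI) can slip in
between the status check and the command write.

#include <stdint.h>
#include <stdio.h>

#define APIC_DELSTAT_MASK 0x00001000U   /* ICR "send pending" status bit */

static volatile uint32_t fake_icr_lo;   /* stands in for lapic->icr_lo */

static unsigned long read_rflags(void) { return 0; }            /* stub */
static void write_rflags(unsigned long rflags) { (void)rflags; } /* stub */
static void cpu_disable_intr(void) { }  /* stub: CLI in the kernel */
static void cpu_pause(void) { }         /* stub: PAUSE in the kernel */

static void
apic_ipi_sketch(uint32_t fields)
{
    unsigned long rflags;

    /*
     * Hard-disable interrupts for the whole wait+write sequence.  A
     * critical section alone is not enough: an interrupt that merely
     * EOIs the LAPIC and IRETs can re-busy the command register between
     * the status check and the icr_lo write, losing the IPI.
     */
    rflags = read_rflags();
    cpu_disable_intr();
    while (fake_icr_lo & APIC_DELSTAT_MASK)
        cpu_pause();
    fake_icr_lo = fields;               /* issue the IPI */
    write_rflags(rflags);               /* restore the interrupt state */
}

int
main(void)
{
    apic_ipi_sketch(0x000000efU);       /* hypothetical vector/field bits */
    printf("icr_lo now %#x\n", (unsigned)fake_icr_lo);
    return 0;
}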
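
Editor's addendum (not part of the patch): the new KKASSERT in
lwkt_cpusync_remote2() leans on the ipiq's free-running index scheme:
ip_windex and ip_rindex only ever increment, slots are addressed modulo
MAXCPUFIFO (a power of 2), and the unsigned difference ip_windex -
ip_rindex is the queue depth even after the 32-bit counters wrap.  The
stand-alone demo below (hypothetical harness; only the field names and
the asserted invariant come from the kernel) exercises the assertion
across a wrap.

#include <assert.h>
#include <stdio.h>

#define MAXCPUFIFO      32              /* must be a power of 2 */
#define MAXCPUFIFO_MASK (MAXCPUFIFO - 1)

struct ipiq {
    unsigned int ip_windex;             /* free-running write index */
    unsigned int ip_rindex;             /* free-running read index */
    int ip_slot[MAXCPUFIFO];
};

static void
ipiq_push(struct ipiq *ip, int v)
{
    /* The invariant the patch now asserts before bumping ip_windex. */
    assert(ip->ip_windex - ip->ip_rindex < MAXCPUFIFO);
    ip->ip_slot[ip->ip_windex & MAXCPUFIFO_MASK] = v;
    ++ip->ip_windex;
}

static int
ipiq_pop(struct ipiq *ip)
{
    int v;

    assert(ip->ip_windex - ip->ip_rindex > 0);      /* not empty */
    v = ip->ip_slot[ip->ip_rindex & MAXCPUFIFO_MASK];
    ++ip->ip_rindex;
    return v;
}

int
main(void)
{
    /* Start both indices near UINT_MAX to force a wrap mid-test. */
    struct ipiq ip = { 0xffffffe0U, 0xffffffe0U, { 0 } };
    int i;

    for (i = 0; i < 100; ++i) {
        ipiq_push(&ip, i);
        assert(ipiq_pop(&ip) == i);
    }
    printf("depth after wrap test: %u\n", ip.ip_windex - ip.ip_rindex);
    return 0;
}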