kernel - Fix serious issue w/ smp_invltlb(), plus other issues (2)
author Matthew Dillon <dillon@apollo.backplane.com>
Thu, 28 Oct 2010 19:36:32 +0000 (12:36 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Thu, 28 Oct 2010 21:22:29 +0000 (14:22 -0700)
It looks like there are a number of additional cases where kernel threads
can wind up running with interrupts physically disabled, which can create
serious problems for smp_invltlb().

It is not entirely clear how these deadlocks occur, since the IPI code
does a forced "STI" if it would otherwise loop, but there are several
other hard loops that do not re-enable interrupts: lwkt_switch()
looping on a vm_token, spin locks, and probably a few other places.

We want interrupts to be enabled in all cases where these sorts of
loops occur, both so that Xinvltlb (sent by smp_invltlb()) can be
serviced and so that the LAPIC interrupt queue does not fill up and
livelock something it shouldn't.
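
To make the failure mode concrete, here is a hedged sketch (an
illustrative fragment, not compilable kernel code; 'other_cpus' and
'lk' are invented names, the primitives are the real DragonFly ones):

	/* CPU A, in smp_invltlb(): Xinvltlb IPIs have been sent, now
	 * spin until every other cpu sets its bit in gd_invltlb_ret. */
	while ((md->gd_invltlb_ret & other_cpus) != other_cpus) {
		cpu_mfence();
		cpu_pause();	/* waits on acks that may never come */
	}

	/* CPU B, meanwhile, sits in a hard loop with interrupts
	 * physically disabled (cli), e.g. contending a spin lock.
	 * The pending Xinvltlb can never be delivered to it, so its
	 * ack bit is never set and CPU A above spins forever. */
	while (!spin_trylock(&lk))
		cpu_pause();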

* lwkt_preempt() must save, zero, and restore gd->gd_intr_nesting_level
  when issuing a preemption.  Otherwise it will improperly panic on
  an assertion if the preempting interrupt thread code tries to switch out.
  It is perfectly acceptable for the preempting thread to block (it just
  switches back to the thread that got preempted).

  Why the assertion was not occurring before is not known, but it is
  probably related to threads getting stuck in 'cli' mode.  The
  additional changes below appear to significantly increase the number
  of interrupt thread preemptions that succeed (see the
  lwkt.preempt_{hit,miss} counters).
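
For reference, the tripped assertion has roughly this shape
(paraphrased; the exact check and message live in lwkt_switch() in
sys/kern/lwkt_thread.c):

	/* Paraphrase: switching out while gd_intr_nesting_level is
	 * non-zero panics, which is why lwkt_preempt() now zeroes it
	 * across td_switch() and restores it afterward. */
	if (gd->gd_intr_nesting_level)
		panic("lwkt_switch: switch from within an interrupt");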

* STI prior to running ithread_fast_handler() from all IDTVECs related
  to device interrupts.

* STI in Xcpustop, Xipiq, and Xtimer.  These functions can call more
  complex C code, and doing so with interrupts disabled may prevent
  Xinvltlb (sent by smp_invltlb()) from being executed, deadlocking
  the system.
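
In C terms the new entry sequence is roughly the following (a
paraphrase of the assembly hunks below; crit_enter()/crit_exit() and
cpu_enable_intr() are the usual DragonFly primitives, and 'frame'
stands for the trapframe argument):

	crit_enter();		     /* incl TD_CRITCOUNT: nested       */
				     /* interrupts cannot reschedule us */
	cpu_enable_intr();	     /* sti: Xinvltlb etc. can now be   */
				     /* taken while the handler runs    */
	ithread_fast_handler(frame); /* handler runs with ints enabled  */
	crit_exit();		     /* decl TD_CRITCOUNT               */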

* Reorder an mfence() in smp_invltlb_intr().  Probably not needed,
  but do it anyway.
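
The reordered fence is in smp_invltlb_intr() (final hunk below); the
intent, sketched with comments:

	cpu_mfence();		/* fence first so the snapshot below   */
				/* cannot be satisfied ahead of it     */
	mask = smp_invltlb_req;	/* snapshot the set of requesting cpus */
	cpu_invltlb();		/* then invalidate our own TLB         */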

sys/kern/lwkt_thread.c
sys/platform/pc32/apic/apic_vector.s
sys/platform/pc32/i386/mp_machdep.c
sys/platform/pc32/icu/icu_vector.s
sys/platform/pc64/apic/apic_vector.s
sys/platform/pc64/icu/icu_vector.s
sys/platform/pc64/x86_64/mp_machdep.c

diff --git a/sys/kern/lwkt_thread.c b/sys/kern/lwkt_thread.c
index 38bdc75..a825189 100644
@@ -928,6 +928,7 @@ lwkt_preempt(thread_t ntd, int critcount)
     int mpheld;
     int savecnt;
 #endif
+    int save_gd_intr_nesting_level;
 
     /*
      * The caller has put us in a critical section.  We can only preempt
@@ -1012,12 +1013,19 @@ lwkt_preempt(thread_t ntd, int critcount)
     /*
      * Since we are able to preempt the current thread, there is no need to
      * call need_lwkt_resched().
+     *
+     * We must temporarily clear gd_intr_nesting_level around the switch
+     * since switchouts from the target thread are allowed (they will just
+     * return to our thread), and since the target thread has its own stack.
      */
     ++preempt_hit;
     ntd->td_preempted = td;
     td->td_flags |= TDF_PREEMPT_LOCK;
     KTR_LOG(ctxsw_pre, gd->gd_cpuid, ntd);
+    save_gd_intr_nesting_level = gd->gd_intr_nesting_level;
+    gd->gd_intr_nesting_level = 0;
     td->td_switch(ntd);
+    gd->gd_intr_nesting_level = save_gd_intr_nesting_level;
 
     KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE));
 #ifdef SMP
diff --git a/sys/platform/pc32/apic/apic_vector.s b/sys/platform/pc32/apic/apic_vector.s
index d7f9d0e..6412da9 100644
@@ -161,6 +161,7 @@ IDTVEC(vec_name) ;                                                  \
        pushl   $irq_num ;                                              \
        pushl   %esp ;                   /* pass frame by reference */  \
        incl    TD_CRITCOUNT(%ebx) ;                                    \
+       sti ;                                                           \
        call    ithread_fast_handler ;   /* returns 0 to unmask */      \
        decl    TD_CRITCOUNT(%ebx) ;                                    \
        addl    $8, %esp ;                                              \
@@ -190,6 +191,8 @@ Xspuriousint:
 
 /*
  * Handle TLB shootdowns.
+ *
+ * NOTE: Interrupts remain disabled.
  */
        .text
        SUPERALIGN_TEXT
@@ -249,9 +252,13 @@ Xcpustop:
         * Indicate that we have stopped and loop waiting for permission
         * to start again.  We must still process IPI events while in a
         * stopped state.
+        *
+        * Interrupts must remain enabled for non-IPI'd per-cpu interrupts
+        * (e.g. Xtimer, Xinvltlb).
         */
        MPLOCKED
        btsl    %eax, stopped_cpus      /* stopped_cpus |= (1<<id) */
+       sti
 1:
        andl    $~RQF_IPIQ,PCPU(reqflags)
        pushl   %eax
@@ -305,6 +312,7 @@ Xipiq:
        pushl   %esp                    /* pass frame by reference */
        incl    PCPU(intr_nesting_level)
        incl    TD_CRITCOUNT(%ebx)
+       sti
        call    lwkt_process_ipiq_frame
        decl    TD_CRITCOUNT(%ebx)
        decl    PCPU(intr_nesting_level)
@@ -335,6 +343,7 @@ Xtimer:
        pushl   %esp                    /* pass frame by reference */
        incl    PCPU(intr_nesting_level)
        incl    TD_CRITCOUNT(%ebx)
+       sti
        call    lapic_timer_process_frame
        decl    TD_CRITCOUNT(%ebx)
        decl    PCPU(intr_nesting_level)
diff --git a/sys/platform/pc32/i386/mp_machdep.c b/sys/platform/pc32/i386/mp_machdep.c
index 2fb3f89..eda9722 100644
@@ -2610,21 +2610,59 @@ smp_invltlb(void)
 {
 #ifdef SMP
        struct mdglobaldata *md = mdcpu;
+#ifdef SMP_INVLTLB_DEBUG
+       long count = 0;
+       long xcount = 0;
+#endif
 
        crit_enter_gd(&md->mi);
        md->gd_invltlb_ret = 0;
        ++md->mi.gd_cnt.v_smpinvltlb;
        atomic_set_int(&smp_invltlb_req, md->mi.gd_cpumask);
+#ifdef SMP_INVLTLB_DEBUG
+again:
+#endif
        if (smp_startup_mask == smp_active_mask) {
                all_but_self_ipi(XINVLTLB_OFFSET);
        } else {
                selected_apic_ipi(smp_active_mask & ~md->mi.gd_cpumask,
                                  XINVLTLB_OFFSET, APIC_DELMODE_FIXED);
        }
+
+#ifdef SMP_INVLTLB_DEBUG
+       if (xcount)
+               kprintf("smp_invltlb: ipi sent\n");
+#endif
        while ((md->gd_invltlb_ret & smp_active_mask & ~md->mi.gd_cpumask) !=
               (smp_active_mask & ~md->mi.gd_cpumask)) {
                cpu_mfence();
                cpu_pause();
+#ifdef SMP_INVLTLB_DEBUG
+               /* DEBUGGING */
+               if (++count == 400000000) {
+                       print_backtrace(-1);
+                       kprintf("smp_invltlb: endless loop %08lx %08lx, "
+                               "rflags %016lx retry",
+                               (long)md->gd_invltlb_ret,
+                               (long)smp_invltlb_req,
+                               (long)read_eflags());
+                       __asm __volatile ("sti");
+                       ++xcount;
+                       if (xcount > 2)
+                               lwkt_process_ipiq();
+                       if (xcount > 3) {
+                               int bcpu = bsfl(~md->gd_invltlb_ret & ~md->mi.gd_cpumask & smp_active_mask);
+                               globaldata_t xgd;
+                               kprintf("bcpu %d\n", bcpu);
+                               xgd = globaldata_find(bcpu);
+                               kprintf("thread %p %s\n", xgd->gd_curthread, xgd->gd_curthread->td_comm);
+                       }
+                       if (xcount > 5)
+                               panic("giving up");
+                       count = 0;
+                       goto again;
+               }
+#endif
        }
        atomic_clear_int(&smp_invltlb_req, md->mi.gd_cpumask);
        crit_exit_gd(&md->mi);
diff --git a/sys/platform/pc32/icu/icu_vector.s b/sys/platform/pc32/icu/icu_vector.s
index d196f9e..592f284 100644
@@ -160,6 +160,7 @@ IDTVEC(vec_name) ;                                                  \
        pushl   $irq_num ;                                              \
        pushl   %esp ;                  /* pass frame by reference */   \
        incl    TD_CRITCOUNT(%ebx) ;                                    \
+       sti ;                                                           \
        call    ithread_fast_handler ;  /* returns 0 to unmask int */   \
        decl    TD_CRITCOUNT(%ebx) ;                                    \
        addl    $8,%esp ;                                               \
diff --git a/sys/platform/pc64/apic/apic_vector.s b/sys/platform/pc64/apic/apic_vector.s
index 201e341..c8e3825 100644
@@ -144,6 +144,7 @@ IDTVEC(vec_name) ;                                                  \
        pushq   $irq_num ;              /* trapframe -> intrframe */    \
        movq    %rsp, %rdi ;            /* pass frame by reference */   \
        incl    TD_CRITCOUNT(%rbx) ;                                    \
+       sti ;                                                           \
        call    ithread_fast_handler ;  /* returns 0 to unmask */       \
        decl    TD_CRITCOUNT(%rbx) ;                                    \
        addq    $8, %rsp ;              /* intrframe -> trapframe */    \
@@ -173,6 +174,8 @@ Xspuriousint:
 
 /*
  * Handle TLB shootdowns.
+ *
+ * NOTE: interrupts are left disabled.
  */
        .text
        SUPERALIGN_TEXT
@@ -220,9 +223,13 @@ Xcpustop:
         * Indicate that we have stopped and loop waiting for permission
         * to start again.  We must still process IPI events while in a
         * stopped state.
+        *
+        * Interrupts must remain enabled for non-IPI'd per-cpu interrupts
+        * (e.g. Xtimer, Xinvltlb).
         */
        MPLOCKED
        btsl    %eax, stopped_cpus      /* stopped_cpus |= (1<<id) */
+       sti
 1:
        andl    $~RQF_IPIQ,PCPU(reqflags)
        pushq   %rax
@@ -273,6 +280,7 @@ Xipiq:
        movq    %rsp,%rdi               /* pass frame by reference */
        incl    PCPU(intr_nesting_level)
        incl    TD_CRITCOUNT(%rbx)
+       sti
        call    lwkt_process_ipiq_frame
        decl    TD_CRITCOUNT(%rbx)
        decl    PCPU(intr_nesting_level)
@@ -304,6 +312,7 @@ Xtimer:
        movq    %rsp,%rdi               /* pass frame by reference */
        incl    PCPU(intr_nesting_level)
        incl    TD_CRITCOUNT(%rbx)
+       sti
        call    lapic_timer_process_frame
        decl    TD_CRITCOUNT(%rbx)
        decl    PCPU(intr_nesting_level)
diff --git a/sys/platform/pc64/icu/icu_vector.s b/sys/platform/pc64/icu/icu_vector.s
index 68888a1..bfc392f 100644
@@ -155,6 +155,7 @@ IDTVEC(vec_name) ;                                                  \
        pushq   $irq_num ;                                              \
        movq    %rsp,%rdi ;             /* rdi = call argument */       \
        incl    TD_CRITCOUNT(%rbx) ;                                    \
+       sti ;                                                           \
        call    ithread_fast_handler ;  /* returns 0 to unmask int */   \
        decl    TD_CRITCOUNT(%rbx) ;                                    \
        addq    $8,%rsp ;               /* intr frame -> trap frame */  \
diff --git a/sys/platform/pc64/x86_64/mp_machdep.c b/sys/platform/pc64/x86_64/mp_machdep.c
index dd396c5..5a7dd68 100644
@@ -2795,30 +2795,58 @@ smp_invltlb(void)
 {
 #ifdef SMP
        struct mdglobaldata *md = mdcpu;
-#if 0
+#ifdef SMP_INVLTLB_DEBUG
        long count = 0;
+       long xcount = 0;
 #endif
 
        crit_enter_gd(&md->mi);
        md->gd_invltlb_ret = 0;
        ++md->mi.gd_cnt.v_smpinvltlb;
        atomic_set_int(&smp_invltlb_req, md->mi.gd_cpumask);
+#ifdef SMP_INVLTLB_DEBUG
+again:
+#endif
        if (smp_startup_mask == smp_active_mask) {
                all_but_self_ipi(XINVLTLB_OFFSET);
        } else {
                selected_apic_ipi(smp_active_mask & ~md->mi.gd_cpumask,
                                  XINVLTLB_OFFSET, APIC_DELMODE_FIXED);
        }
+
+#ifdef SMP_INVLTLB_DEBUG
+       if (xcount)
+               kprintf("smp_invltlb: ipi sent\n");
+#endif
        while ((md->gd_invltlb_ret & smp_active_mask & ~md->mi.gd_cpumask) !=
               (smp_active_mask & ~md->mi.gd_cpumask)) {
                cpu_mfence();
                cpu_pause();
-#if 0
+#ifdef SMP_INVLTLB_DEBUG
                /* DEBUGGING */
                if (++count == 400000000) {
-                       panic("smp_invltlb: endless loop %08lx %08lx",
+                       print_backtrace(-1);
+                       kprintf("smp_invltlb: endless loop %08lx %08lx, "
+                               "rflags %016jx retry",
                              (long)md->gd_invltlb_ret,
-                             (long)smp_invltlb_req);
+                             (long)smp_invltlb_req,
+                             (intmax_t)read_rflags());
+                       __asm __volatile ("sti");
+                       ++xcount;
+                       if (xcount > 2)
+                               lwkt_process_ipiq();
+                       if (xcount > 3) {
+                               int bcpu = bsfl(~md->gd_invltlb_ret & ~md->mi.gd_cpumask & smp_active_mask);
+                               globaldata_t xgd;
+
+                               kprintf("bcpu %d\n", bcpu);
+                               xgd = globaldata_find(bcpu);
+                               kprintf("thread %p %s\n", xgd->gd_curthread, xgd->gd_curthread->td_comm);
+                       }
+                       if (xcount > 5)
+                               Debugger("giving up");
+                       count = 0;
+                       goto again;
                }
 #endif
        }
@@ -2842,8 +2870,8 @@ smp_invltlb_intr(void)
        cpumask_t mask;
        int cpu;
 
-       mask = smp_invltlb_req;
        cpu_mfence();
+       mask = smp_invltlb_req;
        cpu_invltlb();
        while (mask) {
                cpu = bsfl(mask);