kernel - Enhance the sniff code, refactor interrupt disablement for IPIs
Author:     Matthew Dillon <dillon@apollo.backplane.com>
AuthorDate: Mon, 27 Mar 2017 06:24:09 +0000 (23:24 -0700)
Commit:     Matthew Dillon <dillon@apollo.backplane.com>
CommitDate: Mon, 27 Mar 2017 06:43:09 +0000 (23:43 -0700)
* Add kern.sniff_enable, defaulting to 1.  This allows the sysop to
  disable the feature if desired.

* Add kern.sniff_target, allowing sniff IPIs to be targeted at all cpus
  (-1) or at a particular cpu (0...N).  This feature allows the sysop
  to test IPI delivery to particular CPUs (typically monitoring with
  systat -pv 0.1) to verify that delivery is working properly.
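
  The resulting selection logic in sysctl_cputime() boils down to the
  following (condensed from the kern_clock.c hunk below):

    if (root_error == 0 && sniff_enable) {
        int n = sniff_target;

        if (n < 0)
            smp_sniff();        /* broadcast to all cpus */
        else if (n < ncpus)
            cpu_sniff(n);       /* direct the IPI at a single cpu */
    }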

* Bring in some additional AMD-specific setup from FreeBSD, the
  beginnings of support for the APIC Extended space.  For now, just
  make sure the extended entries are masked.
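
  The masking pass, condensed from the lapic_init() hunk below (the
  count of implemented extended-LVT entries comes from the EXT_FEAT
  register):

    count = (lapic->ext_feat & APIC_EXTFEAT_MASK) >> APIC_EXTFEAT_SHIFT;
    if (count > sizeof(lapic->ext_lvt) / sizeof(lapic->ext_lvt[0]))
        count = sizeof(lapic->ext_lvt) / sizeof(lapic->ext_lvt[0]);
    for (i = 0; i < count; ++i) {
        lvt = lapic->ext_lvt[i].lvt;
        lvt &= ~(APIC_LVT_POLARITY_MASK | APIC_LVT_TRIG_MASK |
                 APIC_LVT_DM_MASK | APIC_LVT_MASKED);
        lvt |= APIC_LVT_MASKED | APIC_LVT_DM_FIXED;
        lapic->ext_lvt[i].lvt = lvt;    /* leave every entry masked */
    }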

* Change interrupt disablement expectations.  Callers of apic_ipi(),
  selected_apic_ipi(), and related macros are now required to
  hard-disable interrupts rather than having those functions do so.
  This allows a caller to run certain operational sequences atomically.
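
  A caller is now expected to bracket the send itself, e.g. the pattern
  smp_sniff() uses in the patch:

    register_t rflags;

    rflags = read_rflags();
    cpu_disable_intr();
    all_but_self_ipi(XSNIFF_OFFSET);
    /* ...any work that must remain atomic with the send... */
    write_rflags(rflags);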

* Use the TSC to detect IPI send stalls instead of a hard-coded loop count.
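
  Condensed from the apic_ipi() hunk below, the wait for a previous
  send to drain now times out against the TSC (warning roughly once per
  second of TSC time) instead of a fixed iteration count:

    if ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
        tsc = rdtsc();
        while ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
            cpu_pause();
            if ((int64_t)(rdtsc() - (tsc + tsc_frequency)) > 0) {
                kprintf("apic_ipi stall cpu %d\n", mycpuid);
                tsc = rdtsc();
                if (++loops > 30)
                    panic("apic stall");
            }
        }
    }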

* Also set the APIC_LEVEL_ASSERT bit when issuing a directed IPI, though
  the spec says this is unnecessary.  Do it anyway.
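
  The directed-IPI ICR construction thus becomes (from the
  single_apic_ipi() hunk):

    icr_hi = (lapic->icr_hi & ~APIC_ID_MASK) |
             (CPUID_TO_APICID(cpu) << 24);
    icr_lo = (lapic->icr_lo & APIC_ICRLO_RESV_MASK) |
             APIC_LEVEL_ASSERT | APIC_DEST_DESTFLD |
             delivery_mode | vector;
    lapic->icr_hi = icr_hi;
    lapic->icr_lo = icr_lo;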

* Remove the unnecessary critical section in selected_apic_ipi().  We
  are in a hard-disablement and, in particular, we do not want to
  accidentally trigger a splz() via the crit_exit() while in the
  hard-disablement.
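
  The function reduces to a plain walk of the target mask, relying
  entirely on the caller's hard disablement:

    while (CPUMASK_TESTNZERO(target)) {
        int n = BSFCPUMASK(target);

        CPUMASK_NANDBIT(target, n);
        single_apic_ipi(n, vector, delivery_mode);
    }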

* Enhance the IPI stall detection and recovery code to provide more
  information.  Also enable the LOOPMASK_IN debugging tracker by default.
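
  The tracker sets the cpu's bit in smp_in_mask while smp_inval_intr()
  is running and clears it once no further Xinvltlb requests are
  pending (see the mp_machdep.c hunks below):

    ATOMIC_CPUMASK_ORBIT(smp_in_mask, md->mi.gd_cpuid);
    /* ...process invalidation requests, looping as needed... */
    ATOMIC_CPUMASK_NANDBIT(smp_in_mask, md->mi.gd_cpuid);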

* Add a testing feature to machdep.all_but_self_ipi_enable.  Setting it
  to 2 forces smp_invltlb() to always use the ALL_BUT_SELF IPI.  For
  testing only.
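
  The broadcast decision in smp_invltlb() and smp_invlpg() then reads:

    if (all_but_self_ipi_enable &&
        (all_but_self_ipi_enable >= 2 ||
         CPUMASK_CMPMASKEQ(smp_startup_mask, mask))) {
        all_but_self_ipi(XINVLTLB_OFFSET);      /* broadcast */
    } else {
        CPUMASK_NANDMASK(mask, md->mi.gd_cpumask);
        /* fall back to per-cpu targeted IPIs */
    }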

sys/cpu/x86_64/include/cpufunc.h
sys/kern/kern_clock.c
sys/kern/lwkt_ipiq.c
sys/platform/pc64/apic/apicreg.h
sys/platform/pc64/apic/lapic.c
sys/platform/pc64/x86_64/mp_machdep.c
sys/platform/pc64/x86_64/pmap_inval.c

diff --git a/sys/cpu/x86_64/include/cpufunc.h b/sys/cpu/x86_64/include/cpufunc.h
index 0d1e9a0..1d95c11 100644
@@ -687,6 +687,7 @@ cpu_invltlb(void)
 
 extern void smp_invltlb(void);
 extern void smp_sniff(void);
+extern void cpu_sniff(int dcpu);
 
 static __inline u_short
 rfs(void)
diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c
index 94d05c5..0ab5ba0 100644
@@ -129,6 +129,11 @@ struct kinfo_pcheader cputime_pcheader = { PCTRACK_SIZE, PCTRACK_ARYSIZE };
 struct kinfo_pctrack cputime_pctrack[MAXCPU][PCTRACK_SIZE];
 #endif
 
+static int sniff_enable = 1;
+static int sniff_target = -1;
+SYSCTL_INT(_kern, OID_AUTO, sniff_enable, CTLFLAG_RW, &sniff_enable, 0 , "");
+SYSCTL_INT(_kern, OID_AUTO, sniff_target, CTLFLAG_RW, &sniff_target, 0 , "");
+
 static int
 sysctl_cputime(SYSCTL_HANDLER_ARGS)
 {
@@ -154,8 +159,15 @@ sysctl_cputime(SYSCTL_HANDLER_ARGS)
                        break;
        }
 
-       if (root_error == 0)
-               smp_sniff();
+       if (root_error == 0) {
+               if (sniff_enable) {
+                       int n = sniff_target;
+                       if (n < 0)
+                               smp_sniff();
+                       else if (n < ncpus)
+                               cpu_sniff(n);
+               }
+       }
 
        return (error);
 }
diff --git a/sys/kern/lwkt_ipiq.c b/sys/kern/lwkt_ipiq.c
index 13104fc..b8274ef 100644
@@ -266,8 +266,10 @@ lwkt_send_ipiq3(globaldata_t target, ipifunc3_t func, void *arg1, int arg2)
                                gd->gd_cpuid, target->gd_cpuid, repeating,
                                target->gd_sample_pc, target->gd_sample_sp);
                        smp_sniff();
+                       cpu_disable_intr();
                        ATOMIC_CPUMASK_ORBIT(target->gd_ipimask, gd->gd_cpuid);
                        cpu_send_ipiq(target->gd_cpuid);
+                       cpu_enable_intr();
                } else {
                        kprintf("send_ipiq %d->%d tgt not draining (%d)\n",
                                gd->gd_cpuid, target->gd_cpuid, repeating);
diff --git a/sys/platform/pc64/apic/apicreg.h b/sys/platform/pc64/apic/apicreg.h
index 63dd02f..bf0d87c 100644
  *     triggered the APIC will send an EOI to all I/O APICs.  For the moment
  *     you can write garbage to the EOI register but for future compatibility
  *     0 should be written.
+ *
+ * 03F0 SELF_IPI
+ * 0400 EXT_FEAT
+ * 0410 EXT_CTRL
+ * 0420 EXT_SEOI
+ * 0430
+ * 0440
+ * 0450
+ * 0460
+ * 0470
+ * 0480 EXT_IER0
+ * 0490 EXT_IER1
+ * 04A0 EXT_IER2
+ * 04B0 EXT_IER3
+ * 04C0 EXT_IER4
+ * 04D0 EXT_IER5
+ * 04E0 EXT_IER6
+ * 04F0 EXT_IER7
+ * 0500 EXT_LVT0
+ * 0510 EXT_LVT1
+ * 0520 EXT_LVT2
+ * 0530 EXT_LVT3
  */
 
 #ifndef LOCORE
@@ -390,14 +412,33 @@ struct LAPIC {
        u_int32_t lvt_lint1;    PAD3;
        u_int32_t lvt_error;    PAD3;
        u_int32_t icr_timer;    PAD3;
-       u_int32_t ccr_timer;    PAD3;
+       u_int32_t ccr_timer;    PAD3;   /* e9 */
        /* reserved */          PAD4;
        /* reserved */          PAD4;
        /* reserved */          PAD4;
        /* reserved */          PAD4;
-       u_int32_t dcr_timer;    PAD3;
+       u_int32_t dcr_timer;    PAD3;   /* 3e */
+       u_int32_t self_ipi;     PAD3;   /* 3f - Only in x2APIC */
+       u_int32_t ext_feat;     PAD3;
+       u_int32_t ext_ctrl;     PAD3;
+       u_int32_t ext_seoi;     PAD3;
        /* reserved */          PAD4;
-};
+       /* reserved */          PAD4;
+       /* reserved */          PAD4;
+       /* reserved */          PAD4;
+       /* reserved */          PAD4;
+       u_int32_t ext_ier0;     PAD3;
+       u_int32_t ext_ier1;     PAD3;
+       u_int32_t ext_ier2;     PAD3;
+       u_int32_t ext_ier3;     PAD3;
+       u_int32_t ext_ier4;     PAD3;
+       u_int32_t ext_ier5;     PAD3;
+       u_int32_t ext_ier6;     PAD3;
+       u_int32_t ext_ier7;     PAD3;
+       struct {                        /* 50 */
+               u_int32_t lvt;  PAD3;
+       } ext_lvt[16];
+} __packed;
 
 typedef struct LAPIC lapic_t;
 
@@ -458,6 +499,8 @@ typedef struct IOAPIC ioapic_t;
 #define APIC_VER_VERSION       0x000000ff
 #define APIC_VER_MAXLVT                0x00ff0000
 #define MAXLVTSHIFT            16
+#define APIC_VER_EOI_SUPP      0x01000000
+#define APIC_VER_AMD_EXT_SPACE 0x80000000
 
 /*
  * lapic.ldr (rw)
@@ -493,6 +536,7 @@ typedef struct IOAPIC ioapic_t;
 #define APIC_SVR_VECTOR                0x000000ff
 #define APIC_SVR_ENABLE                0x00000100
 #define APIC_SVR_FOCUS_DISABLE 0x00000200
+#define APIC_SVR_EOI_SUPP      0x00001000
 
 /*
  * lapic.tpr
@@ -714,6 +758,31 @@ typedef struct IOAPIC ioapic_t;
 #define APIC_TDCR_128          0x0a
 #define APIC_TDCR_1            0x0b
 
+/*
+ * lapic.self_ipi (x2APIC only)
+ */
+/*
+ * lapic.ext_feat (AMD only)
+ */
+#define APIC_EXTFEAT_MASK      0x00ff0000
+#define APIC_EXTFEAT_SHIFT     16
+#define APIC_EXTFEAT_EXTID_CAP 0x00000004
+#define APIC_EXTFEAT_SEIO_CAP  0x00000002
+#define APIC_EXTFEAT_IER_CAP   0x00000001
+
+/*
+ * lapic.ext_ctrl
+ * lapic.ext_seoi
+ * lapic.ext_ier{0-7}
+ */
+/*
+ * lapic.ext_lvt[N].lvt
+ */
+#define APIC_EXTLVT_IBS                0       /* Instruction based sampling */
+#define APIC_EXTLVT_MCA                1       /* MCE thresholding */
+#define APIC_EXTLVT_DEI                2       /* Deferred error interrupt */
+#define APIC_EXTLVT_SBI                3       /* Sideband interface */
+
 /******************************************************************************
  * I/O APIC defines
  */
diff --git a/sys/platform/pc64/apic/lapic.c b/sys/platform/pc64/apic/lapic.c
index b7697a9..988288f 100644
@@ -31,6 +31,7 @@
 #include <sys/bus.h>
 #include <sys/machintr.h>
 #include <machine/globaldata.h>
+#include <machine/clock.h>
 #include <machine/smp.h>
 #include <machine/md_var.h>
 #include <machine/pmap.h>
@@ -42,6 +43,7 @@
 #include <machine_base/icu/icu_var.h>
 #include <machine/segments.h>
 #include <sys/thread2.h>
+#include <sys/spinlock2.h>
 
 #include <machine/cputypes.h>
 #include <machine/intr_machdep.h>
@@ -112,7 +114,8 @@ lapic_init(boolean_t bsp)
         */
        if (bsp) {
                if (cpu_vendor_id == CPU_VENDOR_AMD &&
-                   CPUID_TO_FAMILY(cpu_id) >= 0xf) {
+                   CPUID_TO_FAMILY(cpu_id) >= 0x0f &&
+                   CPUID_TO_FAMILY(cpu_id) < 0x17) {   /* XXX */
                        uint32_t tcr;
 
                        /*
@@ -237,6 +240,52 @@ lapic_init(boolean_t bsp)
        temp &= ~APIC_TPR_PRIO;         /* clear priority field */
        lapic->tpr = temp;
 
+       /*
+        * AMD specific setup
+        */
+       if (cpu_vendor_id == CPU_VENDOR_AMD &&
+           (lapic->version & APIC_VER_AMD_EXT_SPACE)) {
+               uint32_t ext_feat;
+               uint32_t count;
+               uint32_t max_count;
+               uint32_t lvt;
+               uint32_t i;
+
+               ext_feat = lapic->ext_feat;
+               count = (ext_feat & APIC_EXTFEAT_MASK) >> APIC_EXTFEAT_SHIFT;
+               max_count = sizeof(lapic->ext_lvt) / sizeof(lapic->ext_lvt[0]);
+               if (count > max_count)
+                       count = max_count;
+               for (i = 0; i < count; ++i) {
+                       lvt = lapic->ext_lvt[i].lvt;
+
+                       lvt &= ~(APIC_LVT_POLARITY_MASK | APIC_LVT_TRIG_MASK |
+                                APIC_LVT_DM_MASK | APIC_LVT_MASKED);
+                       lvt |= APIC_LVT_MASKED | APIC_LVT_DM_FIXED;
+
+                       switch(i) {
+                       case APIC_EXTLVT_IBS:
+                               break;
+                       case APIC_EXTLVT_MCA:
+                               break;
+                       case APIC_EXTLVT_DEI:
+                               break;
+                       case APIC_EXTLVT_SBI:
+                               break;
+                       default:
+                               break;
+                       }
+                       if (bsp) {
+                               kprintf("   LAPIC AMD elvt%d: 0x%08x",
+                                       i, lapic->ext_lvt[i].lvt);
+                               if (lapic->ext_lvt[i].lvt != lvt)
+                                       kprintf(" -> 0x%08x", lvt);
+                               kprintf("\n");
+                       }
+                       lapic->ext_lvt[i].lvt = lvt;
+               }
+       }
+
        /* 
         * Enable the LAPIC 
         */
@@ -244,6 +293,14 @@ lapic_init(boolean_t bsp)
        temp |= APIC_SVR_ENABLE;        /* enable the LAPIC */
        temp &= ~APIC_SVR_FOCUS_DISABLE; /* enable lopri focus processor */
 
+       if (lapic->version & APIC_VER_EOI_SUPP) {
+               if (temp & APIC_SVR_EOI_SUPP) {
+                       temp &= ~APIC_SVR_EOI_SUPP;
+                       if (bsp)
+                               kprintf("    LAPIC disabling EOI supp\n");
+               }
+       }
+
        /*
         * Set the spurious interrupt vector.  The low 4 bits of the vector
         * must be 1111.
@@ -488,51 +545,67 @@ apic_dump(char* str)
 int
 apic_ipi(int dest_type, int vector, int delivery_mode)
 {
-       unsigned long rflags;
-       u_long  icr_lo;
+       uint32_t icr_hi;
+       uint32_t icr_lo;
+       int64_t tsc;
        int loops = 1;
 
-       rflags = read_rflags();
-       cpu_disable_intr();
-       while ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
-               cpu_pause();
-               if (++loops == 10000000)
-                       kprintf("apic_ipi stall cpu %d\n", mycpuid);
+       if ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
+               tsc = rdtsc();
+               while ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
+                       cpu_pause();
+                       if ((int64_t)(rdtsc() - (tsc + tsc_frequency)) > 0) {
+                               kprintf("apic_ipi stall cpu %d (sing)\n",
+                                       mycpuid);
+                               tsc = rdtsc();
+                               if (++loops > 30)
+                                       panic("apic stall");
+                       }
+               }
        }
+       icr_hi = lapic->icr_hi & ~APIC_ID_MASK;
        icr_lo = (lapic->icr_lo & APIC_ICRLO_RESV_MASK) | dest_type | 
-               delivery_mode | vector;
+                APIC_LEVEL_ASSERT | delivery_mode | vector;
+       lapic->icr_hi = icr_hi;
        lapic->icr_lo = icr_lo;
-       write_rflags(rflags);
 
        return 0;
 }
 
+/*
+ * Interrupts must be hard-disabled by caller
+ */
 void
 single_apic_ipi(int cpu, int vector, int delivery_mode)
 {
-       unsigned long rflags;
-       u_long  icr_lo;
-       u_long  icr_hi;
+       uint32_t  icr_lo;
+       uint32_t  icr_hi;
+       int64_t tsc;
        int loops = 1;
 
-       rflags = read_rflags();
-       cpu_disable_intr();
-       while ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
-               cpu_pause();
-               if (++loops == 10000000)
-                       kprintf("apic_ipi stall cpu %d (sing)\n", mycpuid);
+       if ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
+               tsc = rdtsc();
+               while ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
+                       cpu_pause();
+                       if ((int64_t)(rdtsc() - (tsc + tsc_frequency)) > 0) {
+                               kprintf("single_apic_ipi stall cpu %d (sing)\n",
+                                       mycpuid);
+                               tsc = rdtsc();
+                               if (++loops > 30)
+                                       panic("apic stall");
+                       }
+               }
        }
        icr_hi = lapic->icr_hi & ~APIC_ID_MASK;
        icr_hi |= (CPUID_TO_APICID(cpu) << 24);
-       lapic->icr_hi = icr_hi;
 
        /* build ICR_LOW */
        icr_lo = (lapic->icr_lo & APIC_ICRLO_RESV_MASK) |
-                APIC_DEST_DESTFLD | delivery_mode | vector;
+                APIC_LEVEL_ASSERT | APIC_DEST_DESTFLD | delivery_mode | vector;
 
        /* write APIC ICR */
+       lapic->icr_hi = icr_hi;
        lapic->icr_lo = icr_lo;
-       write_rflags(rflags);
 }
 
 #if 0  
@@ -579,17 +652,17 @@ single_apic_ipi_passive(int cpu, int vector, int delivery_mode)
  * target is a bitmask of destination cpus.  Vector is any
  * valid system INT vector.  Delivery mode may be either
  * APIC_DELMODE_FIXED or APIC_DELMODE_LOWPRIO.
+ *
+ * Interrupts must be hard-disabled by caller
  */
 void
 selected_apic_ipi(cpumask_t target, int vector, int delivery_mode)
 {
-       crit_enter();
        while (CPUMASK_TESTNZERO(target)) {
                int n = BSFCPUMASK(target);
                CPUMASK_NANDBIT(target, n);
                single_apic_ipi(n, vector, delivery_mode);
        }
-       crit_exit();
 }
 
 /*
diff --git a/sys/platform/pc64/x86_64/mp_machdep.c b/sys/platform/pc64/x86_64/mp_machdep.c
index 552e610..f06ead1 100644
@@ -824,6 +824,7 @@ smitest(void)
 cpumask_t smp_smurf_mask;
 static cpumask_t smp_invltlb_mask;
 #define LOOPRECOVER
+#define LOOPMASK_IN
 #ifdef LOOPMASK_IN
 cpumask_t smp_in_mask;
 #endif
@@ -945,7 +946,8 @@ smp_invltlb(void)
         */
        CPUMASK_ORMASK(mask, md->mi.gd_cpumask);
        if (all_but_self_ipi_enable &&
-           CPUMASK_CMPMASKEQ(smp_startup_mask, mask)) {
+           (all_but_self_ipi_enable >= 2 ||
+            CPUMASK_CMPMASKEQ(smp_startup_mask, mask))) {
                all_but_self_ipi(XINVLTLB_OFFSET);
        } else {
                CPUMASK_NANDMASK(mask, md->mi.gd_cpumask);
@@ -972,10 +974,22 @@ smp_invltlb(void)
                cpu_pause();
 #ifdef LOOPRECOVER
                if (tsc_frequency && rdtsc() - tsc_base > tsc_frequency) {
-                       kprintf("smp_invltlb %d: waited too long %08jx "
-                               "dbg=%08jx %08jx\n",
+                       /*
+                        * cpuid        - cpu doing the waiting
+                        * invltlb_mask - IPI in progress
+                        */
+                       kprintf("smp_invltlb %d: waited too long inv=%08jx "
+                               "smurf=%08jx "
+#ifdef LOOPMASK_IN
+                               "in=%08jx "
+#endif
+                               "idle=%08jx/%08jx\n",
                                md->mi.gd_cpuid,
                                smp_invltlb_mask.ary[0],
+                               smp_smurf_mask.ary[0],
+#ifdef LOOPMASK_IN
+                               smp_in_mask.ary[0],
+#endif
                                smp_idleinvl_mask.ary[0],
                                smp_idleinvl_reqs.ary[0]);
                        mdcpu->gd_xinvaltlb = 0;
@@ -1035,7 +1049,8 @@ smp_invlpg(cpumask_t *cmdmask)
         * We do not include our own cpu when issuing the IPI.
         */
        if (all_but_self_ipi_enable &&
-           CPUMASK_CMPMASKEQ(smp_startup_mask, mask)) {
+           (all_but_self_ipi_enable >= 2 ||
+            CPUMASK_CMPMASKEQ(smp_startup_mask, mask))) {
                all_but_self_ipi(XINVLTLB_OFFSET);
        } else {
                CPUMASK_NANDMASK(mask, md->mi.gd_cpumask);
@@ -1057,13 +1072,35 @@ smp_sniff(void)
 {
        globaldata_t gd = mycpu;
        int dummy;
+       register_t rflags;
 
        /*
         * Ignore all_but_self_ipi_enable here and just use it.
         */
+       rflags = read_rflags();
+       cpu_disable_intr();
        all_but_self_ipi(XSNIFF_OFFSET);
        gd->gd_sample_pc = smp_sniff;
        gd->gd_sample_sp = &dummy;
+       write_rflags(rflags);
+}
+
+void
+cpu_sniff(int dcpu)
+{
+       globaldata_t rgd = globaldata_find(dcpu);
+       register_t rflags;
+       int dummy;
+
+       /*
+        * Ignore all_but_self_ipi_enable here and just use it.
+        */
+       rflags = read_rflags();
+       cpu_disable_intr();
+       single_apic_ipi(dcpu, XSNIFF_OFFSET, APIC_DELMODE_FIXED);
+       rgd->gd_sample_pc = cpu_sniff;
+       rgd->gd_sample_sp = &dummy;
+       write_rflags(rflags);
 }
 
 /*
@@ -1125,11 +1162,11 @@ smp_inval_intr(void)
         *          on the reentrancy detect (caused by another interrupt).
         */
        cpumask = smp_invmask;
-loop:
-       cpu_enable_intr();
 #ifdef LOOPMASK_IN
        ATOMIC_CPUMASK_ORBIT(smp_in_mask, md->mi.gd_cpuid);
 #endif
+loop:
+       cpu_enable_intr();
        ATOMIC_CPUMASK_NANDBIT(smp_smurf_mask, md->mi.gd_cpuid);
 
        /*
@@ -1152,11 +1189,24 @@ loop:
 
 #ifdef LOOPRECOVER
                if (tsc_frequency && rdtsc() - tsc_base > tsc_frequency) {
+                       /*
+                        * cpuid        - cpu doing the waiting
+                        * invmask      - IPI in progress
+                        * invltlb_mask - which ones are TLB invalidations?
+                        */
                        kprintf("smp_inval_intr %d inv=%08jx tlbm=%08jx "
+                               "smurf=%08jx "
+#ifdef LOOPMASK_IN
+                               "in=%08jx "
+#endif
                                "idle=%08jx/%08jx\n",
                                md->mi.gd_cpuid,
                                smp_invmask.ary[0],
                                smp_invltlb_mask.ary[0],
+                               smp_smurf_mask.ary[0],
+#ifdef LOOPMASK_IN
+                               smp_in_mask.ary[0],
+#endif
                                smp_idleinvl_mask.ary[0],
                                smp_idleinvl_reqs.ary[0]);
                        tsc_base = rdtsc();
@@ -1215,9 +1265,6 @@ loop:
                cpu_mfence();
        }
 
-#ifdef LOOPMASK_IN
-       ATOMIC_CPUMASK_NANDBIT(smp_in_mask, md->mi.gd_cpuid);
-#endif
        /*
         * Check to see if another Xinvltlb interrupt occurred and loop up
         * if it did.
@@ -1227,6 +1274,9 @@ loop:
                md->gd_xinvaltlb = 1;
                goto loop;
        }
+#ifdef LOOPMASK_IN
+       ATOMIC_CPUMASK_NANDBIT(smp_in_mask, md->mi.gd_cpuid);
+#endif
        md->gd_xinvaltlb = 0;
 }
 
@@ -1238,7 +1288,7 @@ cpu_wbinvd_on_all_cpus_callback(void *arg)
 
 /*
  * When called the executing CPU will send an IPI to all other CPUs
- *  requesting that they halt execution.
+ * requesting that they halt execution.
  *
  * Usually (but not necessarily) called with 'other_cpus' as its arg.
  *
@@ -1526,6 +1576,9 @@ ap_finish(void)
 
 SYSINIT(finishsmp, SI_BOOT2_FINISH_SMP, SI_ORDER_FIRST, ap_finish, NULL);
 
+/*
+ * Interrupts must be hard-disabled by caller
+ */
 void
 cpu_send_ipiq(int dcpu)
 {
diff --git a/sys/platform/pc64/x86_64/pmap_inval.c b/sys/platform/pc64/x86_64/pmap_inval.c
index 36e635d..a572da8 100644
@@ -773,17 +773,15 @@ pmap_inval_intr(cpumask_t *cpumaskp, int toolong)
                                        info->failed = 1;
                                        loopdebug("C", info);
                                        /* XXX recover from possible bug */
-                                       mdcpu->gd_xinvaltlb = 0;
+                                       cpu_disable_intr();
                                        ATOMIC_CPUMASK_NANDMASK(smp_smurf_mask,
                                                                info->mask);
-                                       cpu_disable_intr();
                                        smp_invlpg(&smp_active_mask);
 
                                        /*
                                         * Force outer-loop retest of Xinvltlb
                                         * requests (see mp_machdep.c).
                                         */
-                                       mdcpu->gd_xinvaltlb = 2;
                                        cpu_enable_intr();
                                }
 #endif