From 67534613c487a9b4308151907f5d0e4777f5c814 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Sun, 26 Mar 2017 23:24:09 -0700
Subject: [PATCH] kernel - Enhance the sniff code, refactor interrupt disablement for IPIs

* Add kern.sniff_enable, default to 1.  Allows the sysop to disable the
  feature if desired.

* Add kern.sniff_target, allows sniff IPIs to be targeted to all cpus
  (-1), or to a particular cpu (0...N).  This feature allows the sysop
  to test IPI delivery to particular CPUs (typically monitoring with
  systat -pv 0.1) to determine that delivery is working properly.

* Bring in some additional AMD-specific setup from FreeBSD, beginnings
  of support for the APIC Extended space.  For now just make sure the
  extended entries are masked.

* Change interrupt disablement expectations.  The caller of apic_ipi(),
  selected_apic_ipi(), and related macros is now required to
  hard-disable interrupts rather than these functions doing so.  This
  allows the caller to run certain operational sequences atomically.

* Use the TSC to detect IPI send stalls instead of a hard-coded loop
  count.

* Also set the APIC_LEVEL_ASSERT bit when issuing a directed IPI,
  though the spec says this is unnecessary.  Do it anyway.

* Remove unnecessary critical section in selected_apic_ipi().  We are
  in a hard-disablement and in particular we do not want to
  accidentally trigger a splz() due to the crit_exit() while in the
  hard-disablement.

* Enhance the IPI stall detection and recovery code.  Provide more
  information.  Also enable the LOOPMASK_IN debugging tracker by
  default.

* Add a testing feature to machdep.all_but_self_ipi_enable.  By setting
  this to 2, we force smp_invltlb() to always use the ALL_BUT_SELF IPI.
  For testing only.
---
 sys/cpu/x86_64/include/cpufunc.h      |   1 +
 sys/kern/kern_clock.c                 |  16 +++-
 sys/kern/lwkt_ipiq.c                  |   2 +
 sys/platform/pc64/apic/apicreg.h      |  75 +++++++++++++++-
 sys/platform/pc64/apic/lapic.c        | 123 ++++++++++++++++++++------
 sys/platform/pc64/x86_64/mp_machdep.c |  73 ++++++++++++---
 sys/platform/pc64/x86_64/pmap_inval.c |   4 +-
 7 files changed, 251 insertions(+), 43 deletions(-)

diff --git a/sys/cpu/x86_64/include/cpufunc.h b/sys/cpu/x86_64/include/cpufunc.h
index 0d1e9a01df..1d95c11190 100644
--- a/sys/cpu/x86_64/include/cpufunc.h
+++ b/sys/cpu/x86_64/include/cpufunc.h
@@ -687,6 +687,7 @@ cpu_invltlb(void)
 
 extern void smp_invltlb(void);
 extern void smp_sniff(void);
+extern void cpu_sniff(int dcpu);
 
 static __inline u_short
 rfs(void)
diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c
index 94d05c5bb1..0ab5ba0cda 100644
--- a/sys/kern/kern_clock.c
+++ b/sys/kern/kern_clock.c
@@ -129,6 +129,11 @@ struct kinfo_pcheader cputime_pcheader = { PCTRACK_SIZE, PCTRACK_ARYSIZE };
 struct kinfo_pctrack cputime_pctrack[MAXCPU][PCTRACK_SIZE];
 #endif
 
+static int sniff_enable = 1;
+static int sniff_target = -1;
+SYSCTL_INT(_kern, OID_AUTO, sniff_enable, CTLFLAG_RW, &sniff_enable, 0 , "");
+SYSCTL_INT(_kern, OID_AUTO, sniff_target, CTLFLAG_RW, &sniff_target, 0 , "");
+
 static int
 sysctl_cputime(SYSCTL_HANDLER_ARGS)
 {
@@ -154,8 +159,15 @@ sysctl_cputime(SYSCTL_HANDLER_ARGS)
 			break;
 	}
 
-	if (root_error == 0)
-		smp_sniff();
+	if (root_error == 0) {
+		if (sniff_enable) {
+			int n = sniff_target;
+			if (n < 0)
+				smp_sniff();
+			else if (n < ncpus)
+				cpu_sniff(n);
+		}
+	}
 
 	return (error);
 }
diff --git a/sys/kern/lwkt_ipiq.c b/sys/kern/lwkt_ipiq.c
index 13104fcbe4..b8274ef7b5 100644
--- a/sys/kern/lwkt_ipiq.c
+++ b/sys/kern/lwkt_ipiq.c
@@ -266,8 +266,10 @@ lwkt_send_ipiq3(globaldata_t target, ipifunc3_t func, void *arg1, int arg2)
				gd->gd_cpuid, target->gd_cpuid, repeating,
				target->gd_sample_pc, target->gd_sample_sp);
 			smp_sniff();
+			cpu_disable_intr();
 			ATOMIC_CPUMASK_ORBIT(target->gd_ipimask, gd->gd_cpuid);
 			cpu_send_ipiq(target->gd_cpuid);
+			cpu_enable_intr();
 		} else {
 			kprintf("send_ipiq %d->%d tgt not draining (%d)\n",
 				gd->gd_cpuid, target->gd_cpuid, repeating);
diff --git a/sys/platform/pc64/apic/apicreg.h b/sys/platform/pc64/apic/apicreg.h
index 63dd02ff69..bf0d87cea3 100644
--- a/sys/platform/pc64/apic/apicreg.h
+++ b/sys/platform/pc64/apic/apicreg.h
@@ -324,6 +324,28 @@
  * triggered the APIC will send an EOI to all I/O APICs.  For the moment
  * you can write garbage to the EOI register but for future compatibility
  * 0 should be written.
+ *
+ * 03F0	SELF_IPI
+ * 0400	EXT_FEAT
+ * 0410	EXT_CTRL
+ * 0420	EXT_SEOI
+ * 0430
+ * 0440
+ * 0450
+ * 0460
+ * 0470
+ * 0480	EXT_IER0
+ * 0490	EXT_IER1
+ * 04A0	EXT_IER2
+ * 04B0	EXT_IER3
+ * 04C0	EXT_IER4
+ * 04D0	EXT_IER5
+ * 04E0	EXT_IER6
+ * 04F0	EXT_IER7
+ * 0500	EXT_LVT0
+ * 0510	EXT_LVT1
+ * 0520	EXT_LVT2
+ * 0530	EXT_LVT3
  */
 
 #ifndef LOCORE
@@ -390,14 +412,33 @@ struct LAPIC {
 	u_int32_t lvt_lint1;	PAD3;
 	u_int32_t lvt_error;	PAD3;
 	u_int32_t icr_timer;	PAD3;
-	u_int32_t ccr_timer;	PAD3;
+	u_int32_t ccr_timer;	PAD3;		/* 39 */
 	/* reserved */		PAD4;
 	/* reserved */		PAD4;
 	/* reserved */		PAD4;
 	/* reserved */		PAD4;
-	u_int32_t dcr_timer;	PAD3;
+	u_int32_t dcr_timer;	PAD3;		/* 3e */
+	u_int32_t self_ipi;	PAD3;		/* 3f - Only in x2APIC */
+	u_int32_t ext_feat;	PAD3;
+	u_int32_t ext_ctrl;	PAD3;
+	u_int32_t ext_seoi;	PAD3;
 	/* reserved */		PAD4;
-};
+	/* reserved */		PAD4;
+	/* reserved */		PAD4;
+	/* reserved */		PAD4;
+	/* reserved */		PAD4;
+	u_int32_t ext_ier0;	PAD3;
+	u_int32_t ext_ier1;	PAD3;
+	u_int32_t ext_ier2;	PAD3;
+	u_int32_t ext_ier3;	PAD3;
+	u_int32_t ext_ier4;	PAD3;
+	u_int32_t ext_ier5;	PAD3;
+	u_int32_t ext_ier6;	PAD3;
+	u_int32_t ext_ier7;	PAD3;
+	struct {				/* 50 */
+		u_int32_t lvt;	PAD3;
+	} ext_lvt[16];
+} __packed;
 
 typedef struct LAPIC lapic_t;
 
@@ -458,6 +499,8 @@ typedef struct IOAPIC ioapic_t;
 
 #define APIC_VER_VERSION	0x000000ff
 #define APIC_VER_MAXLVT		0x00ff0000
 #define MAXLVTSHIFT		16
+#define APIC_VER_EOI_SUPP	0x01000000
+#define APIC_VER_AMD_EXT_SPACE	0x80000000
 
 /*
  * lapic.ldr (rw)
@@ -493,6 +536,7 @@ typedef struct IOAPIC ioapic_t;
 #define APIC_SVR_VECTOR		0x000000ff
 #define APIC_SVR_ENABLE		0x00000100
 #define APIC_SVR_FOCUS_DISABLE	0x00000200
+#define APIC_SVR_EOI_SUPP	0x00001000
 
 /*
  * lapic.tpr
@@ -714,6 +758,31 @@ typedef struct IOAPIC ioapic_t;
 #define APIC_TDCR_128		0x0a
 #define APIC_TDCR_1		0x0b
 
+/*
+ * lapic.self_ipi (x2APIC only)
+ */
+/*
+ * lapic.ext_feat (AMD only)
+ */
+#define APIC_EXTFEAT_MASK	0x00ff0000
+#define APIC_EXTFEAT_SHIFT	16
+#define APIC_EXTFEAT_EXTID_CAP	0x00000004
+#define APIC_EXTFEAT_SEIO_CAP	0x00000002
+#define APIC_EXTFEAT_IER_CAP	0x00000001
+
+/*
+ * lapic.ext_ctrl
+ * lapic.ext_seoi
+ * lapic.ext_ier{0-7}
+ */
+/*
+ * lapic.ext_lvt[N].lvt
+ */
+#define APIC_EXTLVT_IBS		0	/* Instruction based sampling */
+#define APIC_EXTLVT_MCA		1	/* MCE thresholding */
+#define APIC_EXTLVT_DEI		2	/* Deferred error interrupt */
+#define APIC_EXTLVT_SBI		3	/* Sideband interface */
+
 /******************************************************************************
  * I/O APIC defines
  */
diff --git a/sys/platform/pc64/apic/lapic.c b/sys/platform/pc64/apic/lapic.c
index b7697a9a49..988288f86e 100644
--- a/sys/platform/pc64/apic/lapic.c
+++ b/sys/platform/pc64/apic/lapic.c
@@ -31,6 +31,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -42,6 +43,7
@@ #include #include #include +#include #include #include @@ -112,7 +114,8 @@ lapic_init(boolean_t bsp) */ if (bsp) { if (cpu_vendor_id == CPU_VENDOR_AMD && - CPUID_TO_FAMILY(cpu_id) >= 0xf) { + CPUID_TO_FAMILY(cpu_id) >= 0x0f && + CPUID_TO_FAMILY(cpu_id) < 0x17) { /* XXX */ uint32_t tcr; /* @@ -237,6 +240,52 @@ lapic_init(boolean_t bsp) temp &= ~APIC_TPR_PRIO; /* clear priority field */ lapic->tpr = temp; + /* + * AMD specific setup + */ + if (cpu_vendor_id == CPU_VENDOR_AMD && + (lapic->version & APIC_VER_AMD_EXT_SPACE)) { + uint32_t ext_feat; + uint32_t count; + uint32_t max_count; + uint32_t lvt; + uint32_t i; + + ext_feat = lapic->ext_feat; + count = (ext_feat & APIC_EXTFEAT_MASK) >> APIC_EXTFEAT_SHIFT; + max_count = sizeof(lapic->ext_lvt) / sizeof(lapic->ext_lvt[0]); + if (count > max_count) + count = max_count; + for (i = 0; i < count; ++i) { + lvt = lapic->ext_lvt[i].lvt; + + lvt &= ~(APIC_LVT_POLARITY_MASK | APIC_LVT_TRIG_MASK | + APIC_LVT_DM_MASK | APIC_LVT_MASKED); + lvt |= APIC_LVT_MASKED | APIC_LVT_DM_FIXED; + + switch(i) { + case APIC_EXTLVT_IBS: + break; + case APIC_EXTLVT_MCA: + break; + case APIC_EXTLVT_DEI: + break; + case APIC_EXTLVT_SBI: + break; + default: + break; + } + if (bsp) { + kprintf(" LAPIC AMD elvt%d: 0x%08x", + i, lapic->ext_lvt[i].lvt); + if (lapic->ext_lvt[i].lvt != lvt) + kprintf(" -> 0x%08x", lvt); + kprintf("\n"); + } + lapic->ext_lvt[i].lvt = lvt; + } + } + /* * Enable the LAPIC */ @@ -244,6 +293,14 @@ lapic_init(boolean_t bsp) temp |= APIC_SVR_ENABLE; /* enable the LAPIC */ temp &= ~APIC_SVR_FOCUS_DISABLE; /* enable lopri focus processor */ + if (lapic->version & APIC_VER_EOI_SUPP) { + if (temp & APIC_SVR_EOI_SUPP) { + temp &= ~APIC_SVR_EOI_SUPP; + if (bsp) + kprintf(" LAPIC disabling EOI supp\n"); + } + } + /* * Set the spurious interrupt vector. The low 4 bits of the vector * must be 1111. 
@@ -488,51 +545,67 @@ apic_dump(char* str)
 int
 apic_ipi(int dest_type, int vector, int delivery_mode)
 {
-	unsigned long rflags;
-	u_long icr_lo;
+	uint32_t icr_hi;
+	uint32_t icr_lo;
+	int64_t tsc;
 	int loops = 1;
 
-	rflags = read_rflags();
-	cpu_disable_intr();
-	while ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
-		cpu_pause();
-		if (++loops == 10000000)
-			kprintf("apic_ipi stall cpu %d\n", mycpuid);
+	if ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
+		tsc = rdtsc();
+		while ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
+			cpu_pause();
+			if ((int64_t)(rdtsc() - (tsc + tsc_frequency)) > 0) {
+				kprintf("apic_ipi stall cpu %d\n",
+					mycpuid);
+				tsc = rdtsc();
+				if (++loops > 30)
+					panic("apic stall");
+			}
+		}
 	}
+	icr_hi = lapic->icr_hi & ~APIC_ID_MASK;
 	icr_lo = (lapic->icr_lo & APIC_ICRLO_RESV_MASK) | dest_type |
-		delivery_mode | vector;
+		APIC_LEVEL_ASSERT | delivery_mode | vector;
+	lapic->icr_hi = icr_hi;
 	lapic->icr_lo = icr_lo;
-	write_rflags(rflags);
 
 	return 0;
 }
 
+/*
+ * Interrupts must be hard-disabled by caller
+ */
 void
 single_apic_ipi(int cpu, int vector, int delivery_mode)
 {
-	unsigned long rflags;
-	u_long icr_lo;
-	u_long icr_hi;
+	uint32_t icr_lo;
+	uint32_t icr_hi;
+	int64_t tsc;
 	int loops = 1;
 
-	rflags = read_rflags();
-	cpu_disable_intr();
-	while ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
-		cpu_pause();
-		if (++loops == 10000000)
-			kprintf("apic_ipi stall cpu %d (sing)\n", mycpuid);
+	if ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
+		tsc = rdtsc();
+		while ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
+			cpu_pause();
+			if ((int64_t)(rdtsc() - (tsc + tsc_frequency)) > 0) {
+				kprintf("single_apic_ipi stall cpu %d (sing)\n",
+					mycpuid);
+				tsc = rdtsc();
+				if (++loops > 30)
+					panic("apic stall");
+			}
+		}
 	}
 	icr_hi = lapic->icr_hi & ~APIC_ID_MASK;
 	icr_hi |= (CPUID_TO_APICID(cpu) << 24);
-	lapic->icr_hi = icr_hi;
 
 	/* build ICR_LOW */
 	icr_lo = (lapic->icr_lo & APIC_ICRLO_RESV_MASK) |
-		APIC_DEST_DESTFLD | delivery_mode | vector;
+		APIC_LEVEL_ASSERT | APIC_DEST_DESTFLD | delivery_mode | vector;
 
 	/* write APIC ICR */
+	lapic->icr_hi = icr_hi;
 	lapic->icr_lo = icr_lo;
-	write_rflags(rflags);
 }
 
 #if 0
@@ -579,17 +652,17 @@ single_apic_ipi_passive(int cpu, int vector, int delivery_mode)
  * target is a bitmask of destination cpus.  Vector is any
  * valid system INT vector.  Delivery mode may be either
  * APIC_DELMODE_FIXED or APIC_DELMODE_LOWPRIO.
+ *
+ * Interrupts must be hard-disabled by caller
  */
 void
 selected_apic_ipi(cpumask_t target, int vector, int delivery_mode)
 {
-	crit_enter();
 	while (CPUMASK_TESTNZERO(target)) {
 		int n = BSFCPUMASK(target);
 		CPUMASK_NANDBIT(target, n);
 		single_apic_ipi(n, vector, delivery_mode);
 	}
-	crit_exit();
 }
 
 /*
diff --git a/sys/platform/pc64/x86_64/mp_machdep.c b/sys/platform/pc64/x86_64/mp_machdep.c
index 552e610d44..f06ead145e 100644
--- a/sys/platform/pc64/x86_64/mp_machdep.c
+++ b/sys/platform/pc64/x86_64/mp_machdep.c
@@ -824,6 +824,7 @@ smitest(void)
 cpumask_t smp_smurf_mask;
 static cpumask_t smp_invltlb_mask;
 #define LOOPRECOVER
+#define LOOPMASK_IN
 #ifdef LOOPMASK_IN
 cpumask_t smp_in_mask;
 #endif
@@ -945,7 +946,8 @@ smp_invltlb(void)
 	 */
 	CPUMASK_ORMASK(mask, md->mi.gd_cpumask);
 	if (all_but_self_ipi_enable &&
-	    CPUMASK_CMPMASKEQ(smp_startup_mask, mask)) {
+	    (all_but_self_ipi_enable >= 2 ||
+	     CPUMASK_CMPMASKEQ(smp_startup_mask, mask))) {
 		all_but_self_ipi(XINVLTLB_OFFSET);
 	} else {
 		CPUMASK_NANDMASK(mask, md->mi.gd_cpumask);
@@ -972,10 +974,22 @@ smp_invltlb(void)
 		cpu_pause();
 #ifdef LOOPRECOVER
 		if (tsc_frequency && rdtsc() - tsc_base > tsc_frequency) {
-			kprintf("smp_invltlb %d: waited too long %08jx "
-				"dbg=%08jx %08jx\n",
+			/*
+			 * cpuid - cpu doing the waiting
+			 * invltlb_mask - IPI in progress
+			 */
+			kprintf("smp_invltlb %d: waited too long inv=%08jx "
+				"smurf=%08jx "
+#ifdef LOOPMASK_IN
+				"in=%08jx "
+#endif
+				"idle=%08jx/%08jx\n",
 				md->mi.gd_cpuid,
 				smp_invltlb_mask.ary[0],
+				smp_smurf_mask.ary[0],
+#ifdef LOOPMASK_IN
+				smp_in_mask.ary[0],
+#endif
 				smp_idleinvl_mask.ary[0],
 				smp_idleinvl_reqs.ary[0]);
 			mdcpu->gd_xinvaltlb = 0;
@@ -1035,7 +1049,8 @@ smp_invlpg(cpumask_t *cmdmask)
 	 * We do not include our own cpu when issuing the IPI.
 	 */
 	if (all_but_self_ipi_enable &&
-	    CPUMASK_CMPMASKEQ(smp_startup_mask, mask)) {
+	    (all_but_self_ipi_enable >= 2 ||
+	     CPUMASK_CMPMASKEQ(smp_startup_mask, mask))) {
 		all_but_self_ipi(XINVLTLB_OFFSET);
 	} else {
 		CPUMASK_NANDMASK(mask, md->mi.gd_cpumask);
@@ -1057,13 +1072,35 @@ smp_sniff(void)
 {
 	globaldata_t gd = mycpu;
 	int dummy;
+	register_t rflags;
 
 	/*
 	 * Ignore all_but_self_ipi_enable here and just use it.
 	 */
+	rflags = read_rflags();
+	cpu_disable_intr();
 	all_but_self_ipi(XSNIFF_OFFSET);
 	gd->gd_sample_pc = smp_sniff;
 	gd->gd_sample_sp = &dummy;
+	write_rflags(rflags);
+}
+
+void
+cpu_sniff(int dcpu)
+{
+	globaldata_t rgd = globaldata_find(dcpu);
+	register_t rflags;
+	int dummy;
+
+	/*
+	 * Send the sniff IPI directly to the target cpu.
+	 */
+	rflags = read_rflags();
+	cpu_disable_intr();
+	single_apic_ipi(dcpu, XSNIFF_OFFSET, APIC_DELMODE_FIXED);
+	rgd->gd_sample_pc = cpu_sniff;
+	rgd->gd_sample_sp = &dummy;
+	write_rflags(rflags);
 }
 
 /*
@@ -1125,11 +1162,11 @@ smp_inval_intr(void)
 	 * on the reentrancy detect (caused by another interrupt).
 	 */
 	cpumask = smp_invmask;
-loop:
-	cpu_enable_intr();
 #ifdef LOOPMASK_IN
 	ATOMIC_CPUMASK_ORBIT(smp_in_mask, md->mi.gd_cpuid);
 #endif
+loop:
+	cpu_enable_intr();
 	ATOMIC_CPUMASK_NANDBIT(smp_smurf_mask, md->mi.gd_cpuid);
 
 	/*
@@ -1152,11 +1189,24 @@ loop:
 
 #ifdef LOOPRECOVER
 		if (tsc_frequency && rdtsc() - tsc_base > tsc_frequency) {
+			/*
+			 * cpuid - cpu doing the waiting
+			 * invmask - IPI in progress
+			 * invltlb_mask - which ones are TLB invalidations?
+ */ kprintf("smp_inval_intr %d inv=%08jx tlbm=%08jx " + "smurf=%08jx " +#ifdef LOOPMASK_IN + "in=%08jx " +#endif "idle=%08jx/%08jx\n", md->mi.gd_cpuid, smp_invmask.ary[0], smp_invltlb_mask.ary[0], + smp_smurf_mask.ary[0], +#ifdef LOOPMASK_IN + smp_in_mask.ary[0], +#endif smp_idleinvl_mask.ary[0], smp_idleinvl_reqs.ary[0]); tsc_base = rdtsc(); @@ -1215,9 +1265,6 @@ loop: cpu_mfence(); } -#ifdef LOOPMASK_IN - ATOMIC_CPUMASK_NANDBIT(smp_in_mask, md->mi.gd_cpuid); -#endif /* * Check to see if another Xinvltlb interrupt occurred and loop up * if it did. @@ -1227,6 +1274,9 @@ loop: md->gd_xinvaltlb = 1; goto loop; } +#ifdef LOOPMASK_IN + ATOMIC_CPUMASK_NANDBIT(smp_in_mask, md->mi.gd_cpuid); +#endif md->gd_xinvaltlb = 0; } @@ -1238,7 +1288,7 @@ cpu_wbinvd_on_all_cpus_callback(void *arg) /* * When called the executing CPU will send an IPI to all other CPUs - * requesting that they halt execution. + * requesting that they halt execution. * * Usually (but not necessarily) called with 'other_cpus' as its arg. * @@ -1526,6 +1576,9 @@ ap_finish(void) SYSINIT(finishsmp, SI_BOOT2_FINISH_SMP, SI_ORDER_FIRST, ap_finish, NULL); +/* + * Interrupts must be hard-disabled by caller + */ void cpu_send_ipiq(int dcpu) { diff --git a/sys/platform/pc64/x86_64/pmap_inval.c b/sys/platform/pc64/x86_64/pmap_inval.c index 36e635d60a..a572da847f 100644 --- a/sys/platform/pc64/x86_64/pmap_inval.c +++ b/sys/platform/pc64/x86_64/pmap_inval.c @@ -773,17 +773,15 @@ pmap_inval_intr(cpumask_t *cpumaskp, int toolong) info->failed = 1; loopdebug("C", info); /* XXX recover from possible bug */ - mdcpu->gd_xinvaltlb = 0; + cpu_disable_intr(); ATOMIC_CPUMASK_NANDMASK(smp_smurf_mask, info->mask); - cpu_disable_intr(); smp_invlpg(&smp_active_mask); /* * Force outer-loop retest of Xinvltlb * requests (see mp_machdep.c). */ - mdcpu->gd_xinvaltlb = 2; cpu_enable_intr(); } #endif -- 2.41.0
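
Note on the stall detection: the TSC-bounded wait introduced in apic_ipi()
and single_apic_ipi() replaces the old fixed iteration count (10000000
spins) with a measure of real time.  Reduced to its essentials, the
pattern looks like the fragment below.  This is an illustrative extract,
not additional committed code; it assumes tsc_frequency holds the TSC rate
in ticks per second and that the caller has already hard-disabled
interrupts:

	/*
	 * Wait for any previous IPI to finish delivery (ICR delivery
	 * status bit still set).  Complain once per second of real
	 * time and give up entirely after ~30 seconds.
	 */
	if ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
		int64_t tsc = rdtsc();
		int loops = 1;

		while ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) {
			cpu_pause();
			if ((int64_t)(rdtsc() - (tsc + tsc_frequency)) > 0) {
				kprintf("ipi stall cpu %d\n", mycpuid);
				tsc = rdtsc();	/* restart 1-second window */
				if (++loops > 30)
					panic("apic stall");
			}
		}
	}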
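
Note on the disablement refactor: apic_ipi(), single_apic_ipi(), and
selected_apic_ipi() no longer save, disable, and restore the interrupt
flag themselves; the caller owns that responsibility.  A minimal sketch of
the expected calling convention, modeled on cpu_sniff() above ('dcpu' here
is just an example parameter, not a new API):

	register_t rflags;

	rflags = read_rflags();
	cpu_disable_intr();
	/*
	 * Anything placed here runs atomically with respect to the IPI
	 * send, which is the point of moving the disablement out of the
	 * send functions.
	 */
	single_apic_ipi(dcpu, XSNIFF_OFFSET, APIC_DELMODE_FIXED);
	write_rflags(rflags);	/* restore the caller's interrupt state */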
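
Usage note for the new knobs (the cpu number below is illustrative): to
verify IPI delivery to cpu 2 only, a sysop can set kern.sniff_target=2
(with kern.sniff_enable=1, the default) and watch delivery with
"systat -pv 0.1"; setting kern.sniff_target=-1 restores the broadcast
sniff.  Similarly, setting machdep.all_but_self_ipi_enable=2 forces
smp_invltlb() to always use the ALL_BUT_SELF IPI, which is intended for
testing the IPI delivery paths only.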