8f2d005343eac77e380cfe6c3977fe53e430510f
[dragonfly.git] / sys / platform / pc64 / apic / lapic.c
1 /*
2  * Copyright (c) 1996, by Steve Passe
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. The name of the developer may NOT be used to endorse or promote products
11  *    derived from this software without specific prior written permission.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD: src/sys/i386/i386/mpapic.c,v 1.37.2.7 2003/01/25 02:31:47 peter Exp $
26  */
27
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/kernel.h>
31 #include <sys/ktr.h>
32 #include <sys/bus.h>
33 #include <sys/machintr.h>
34 #include <machine/globaldata.h>
35 #include <machine/clock.h>
36 #include <machine/limits.h>
37 #include <machine/smp.h>
38 #include <machine/md_var.h>
39 #include <machine/pmap.h>
40 #include <machine/specialreg.h>
41 #include <machine_base/apic/lapic.h>
42 #include <machine_base/apic/ioapic.h>
43 #include <machine_base/apic/ioapic_abi.h>
44 #include <machine_base/apic/apicvar.h>
45 #include <machine_base/icu/icu_var.h>
46 #include <machine/segments.h>
47 #include <sys/thread2.h>
48 #include <sys/spinlock2.h>
49
50 #include <machine/cputypes.h>
51 #include <machine/intr_machdep.h>
52
53 #if !defined(KTR_LAPIC)
54 #define KTR_LAPIC       KTR_ALL
55 #endif
56 KTR_INFO_MASTER(lapic);
57 KTR_INFO(KTR_LAPIC, lapic, mem_eoi, 0, "mem_eoi");
58 #define log_lapic(name)     KTR_LOG(lapic_ ## name)
59
60 extern int naps;
61
62 volatile lapic_t *lapic_mem;
63
64 static void     lapic_timer_calibrate(void);
65 static void     lapic_timer_set_divisor(int);
66 static void     lapic_timer_fixup_handler(void *);
67 static void     lapic_timer_restart_handler(void *);
68
69
70 static int      lapic_timer_enable = 1;
71 TUNABLE_INT("hw.lapic_timer_enable", &lapic_timer_enable);
72
73 static int      lapic_timer_tscdeadline = 1;
74 TUNABLE_INT("hw.lapic_timer_tscdeadline", &lapic_timer_tscdeadline);
75
76 static int      lapic_calibrate_test = 0;
77 TUNABLE_INT("hw.lapic_calibrate_test", &lapic_calibrate_test);
78
79 static int      lapic_calibrate_fast = 1;
80 TUNABLE_INT("hw.lapic_calibrate_fast", &lapic_calibrate_fast);
81
82 static void     lapic_timer_tscdlt_reload(struct cputimer_intr *, sysclock_t);
83 static void     lapic_mem_timer_intr_reload(struct cputimer_intr *, sysclock_t);
84 static void     lapic_timer_intr_enable(struct cputimer_intr *);
85 static void     lapic_timer_intr_restart(struct cputimer_intr *);
86 static void     lapic_timer_intr_pmfixup(struct cputimer_intr *);
87
/*
 * The LAPIC-timer-backed interrupt cputimer.  Registered and selected on
 * the BSP in lapic_init() when hw.lapic_timer_enable is set.  .freq is
 * filled in by lapic_timer_calibrate(), and .reload is switched to
 * lapic_timer_tscdlt_reload() when TSC Deadline mode is in use.
 */
static struct cputimer_intr lapic_cputimer_intr = {
        .freq = 0,
        .reload = lapic_mem_timer_intr_reload,
        .enable = lapic_timer_intr_enable,
        .config = cputimer_intr_default_config,
        .restart = lapic_timer_intr_restart,
        .pmfixup = lapic_timer_intr_pmfixup,
        .initclock = cputimer_intr_default_initclock,
        .pcpuhand = NULL,
        .next = SLIST_ENTRY_INITIALIZER,
        .name = "lapic",
        .type = CPUTIMER_INTR_LAPIC,
        .prio = CPUTIMER_INTR_PRIO_LAPIC,
        .caps = CPUTIMER_INTR_CAP_NONE,
        .priv = NULL
};
104
105 static int              lapic_timer_divisor_idx = -1;
106 static const uint32_t   lapic_timer_divisors[] = {
107         APIC_TDCR_2,    APIC_TDCR_4,    APIC_TDCR_8,    APIC_TDCR_16,
108         APIC_TDCR_32,   APIC_TDCR_64,   APIC_TDCR_128,  APIC_TDCR_1
109 };
110 #define APIC_TIMER_NDIVISORS (int)(NELEM(lapic_timer_divisors))
111
112 static int      lapic_use_tscdeadline = 0;
113 /* The raw TSC frequency might not fit into a sysclock_t value. */
114 static int      lapic_timer_tscfreq_shift;
115
116 /*
117  * APIC ID <-> CPU ID mapping structures.
118  */
119 int     cpu_id_to_apic_id[NAPICID];
120 int     apic_id_to_cpu_id[NAPICID];
121 int     lapic_enable = 1;
122 int     lapic_usable = 0;
123
124 /* Separate cachelines for each cpu's info. */
125 struct deadlines {
126         uint64_t timestamp;
127         uint64_t downcount_time;
128         uint64_t padding[6];
129 };
130 struct deadlines *tsc_deadlines = NULL;
131
132 static void     lapic_mem_eoi(void);
133 static int      lapic_mem_ipi(int dest_type, int vector, int delivery_mode);
134 static void     lapic_mem_single_ipi(int cpu, int vector, int delivery_mode);
135
136 void            (*lapic_eoi)(void);
137 int             (*apic_ipi)(int dest_type, int vector, int delivery_mode);
138 void            (*single_apic_ipi)(int cpu, int vector, int delivery_mode);
139
/*
 * Program the memory-mapped ICR to send a command to 'apic_id'.
 *
 * The destination (icr_hi) must be written before the low word, since
 * writing icr_lo is what actually issues the command.  Reserved bits in
 * both halves are preserved via read-modify-write.
 */
static __inline void
lapic_mem_icr_set(uint32_t apic_id, uint32_t icr_lo_val)
{
        uint32_t icr_lo, icr_hi;

        icr_hi = (LAPIC_MEM_READ(icr_hi) & ~APIC_ID_MASK) |
            (apic_id << APIC_ID_SHIFT);
        icr_lo = (LAPIC_MEM_READ(icr_lo) & APIC_ICRLO_RESV_MASK) | icr_lo_val;

        /* Order matters: set destination first, then trigger via icr_lo. */
        LAPIC_MEM_WRITE(icr_hi, icr_hi);
        LAPIC_MEM_WRITE(icr_lo, icr_lo);
}
152
/*
 * Enable LAPIC, configure interrupts.
 *
 * Called once per cpu; 'bsp' is TRUE only for the boot processor, which
 * additionally decides on TSC Deadline mode, installs the shared IDT
 * vectors, and registers the LAPIC cputimer.
 */
void
lapic_init(boolean_t bsp)
{
        uint32_t timer;
        u_int   temp;

        if (bsp) {
                /* Decide whether we want to use TSC Deadline mode. */
                if (lapic_timer_tscdeadline != 0 &&
                    (cpu_feature2 & CPUID2_TSCDLT) &&
                    tsc_invariant && tsc_frequency != 0) {
                        lapic_use_tscdeadline = 1;
                        /* One cache-aligned deadline slot per cpu (BSP + APs). */
                        tsc_deadlines = kmalloc_cachealign(
                            sizeof(struct deadlines) * (naps + 1),
                            M_DEVBUF, M_WAITOK | M_ZERO);
                }
        }

        /*
         * Install vectors
         *
         * Since IDT is shared between BSP and APs, these vectors
         * only need to be installed once; we do it on BSP.
         */
        if (bsp) {
                if (cpu_vendor_id == CPU_VENDOR_AMD &&
                    CPUID_TO_FAMILY(cpu_id) >= 0x0f &&
                    CPUID_TO_FAMILY(cpu_id) < 0x17) {   /* XXX */
                        uint32_t tcr;

                        /*
                         * Set the LINTEN bit in the HyperTransport
                         * Transaction Control Register.
                         *
                         * This will cause EXTINT and NMI interrupts
                         * routed over the hypertransport bus to be
                         * fed into the LAPIC LINT0/LINT1.  If the bit
                         * isn't set, the interrupts will go to the
                         * general cpu INTR/NMI pins.  On a dual-core
                         * cpu the interrupt winds up going to BOTH cpus.
                         * The first cpu that does the interrupt ack
                         * cycle will get the correct interrupt.  The
                         * second cpu that does it will get a spurious
                         * interrupt vector (typically IRQ 7).
                         */
                        /* PCI config mechanism #1: select dev 0x18 reg 0x68. */
                        outl(0x0cf8,
                            (1 << 31) | /* enable */
                            (0 << 16) | /* bus */
                            (0x18 << 11) | /* dev (cpu + 0x18) */
                            (0 << 8) |  /* func */
                            0x68        /* reg */
                            );
                        tcr = inl(0xcfc);
                        if ((tcr & 0x00010000) == 0) {
                                kprintf("LAPIC: AMD LINTEN on\n");
                                outl(0xcfc, tcr|0x00010000);
                        }
                        outl(0x0cf8, 0);
                }

                /* Install a 'Spurious INTerrupt' vector */
                setidt_global(XSPURIOUSINT_OFFSET, Xspuriousint,
                    SDT_SYSIGT, SEL_KPL, 0);

                /* Install a timer vector */
                setidt_global(XTIMER_OFFSET, Xtimer,
                    SDT_SYSIGT, SEL_KPL, 0);

                /* Install an inter-CPU IPI for TLB invalidation */
                setidt_global(XINVLTLB_OFFSET, Xinvltlb,
                    SDT_SYSIGT, SEL_KPL, 0);

                /* Install an inter-CPU IPI for IPIQ messaging */
                setidt_global(XIPIQ_OFFSET, Xipiq,
                    SDT_SYSIGT, SEL_KPL, 0);

                /* Install an inter-CPU IPI for CPU stop/restart */
                setidt_global(XCPUSTOP_OFFSET, Xcpustop,
                    SDT_SYSIGT, SEL_KPL, 0);

                /* Install an inter-CPU IPI for the sniffer (Xsniff) */
                setidt_global(XSNIFF_OFFSET, Xsniff,
                    SDT_SYSIGT, SEL_KPL, 0);
        }

        /*
         * Setup LINT0 as ExtINT on the BSP.  This is theoretically an
         * aggregate interrupt input from the 8259.  The INTA cycle
         * will be routed to the external controller (the 8259) which
         * is expected to supply the vector.
         *
         * Must be setup edge triggered, active high.
         *
         * Disable LINT0 on BSP, if I/O APIC is enabled.
         *
         * Disable LINT0 on the APs.  It doesn't matter what delivery
         * mode we use because we leave it masked.
         */
        temp = LAPIC_READ(lvt_lint0);
        temp &= ~(APIC_LVT_MASKED | APIC_LVT_TRIG_MASK | 
                  APIC_LVT_POLARITY_MASK | APIC_LVT_DM_MASK);
        if (bsp) {
                temp |= APIC_LVT_DM_EXTINT;
                if (ioapic_enable)
                        temp |= APIC_LVT_MASKED;
        } else {
                temp |= APIC_LVT_DM_FIXED | APIC_LVT_MASKED;
        }
        LAPIC_WRITE(lvt_lint0, temp);

        /*
         * Setup LINT1 as NMI.
         *
         * Must be setup edge trigger, active high.
         *
         * Enable LINT1 on BSP, if I/O APIC is enabled.
         *
         * Disable LINT1 on the APs.
         */
        temp = LAPIC_READ(lvt_lint1);
        temp &= ~(APIC_LVT_MASKED | APIC_LVT_TRIG_MASK | 
                  APIC_LVT_POLARITY_MASK | APIC_LVT_DM_MASK);
        temp |= APIC_LVT_MASKED | APIC_LVT_DM_NMI;
        if (bsp && ioapic_enable)
                temp &= ~APIC_LVT_MASKED;
        LAPIC_WRITE(lvt_lint1, temp);

        /*
         * Mask the LAPIC error interrupt, LAPIC performance counter
         * interrupt.
         */
        LAPIC_WRITE(lvt_error, LAPIC_READ(lvt_error) | APIC_LVT_MASKED);
        LAPIC_WRITE(lvt_pcint, LAPIC_READ(lvt_pcint) | APIC_LVT_MASKED);

        /*
         * Set LAPIC timer vector and mask the LAPIC timer interrupt.
         */
        timer = LAPIC_READ(lvt_timer);
        timer &= ~APIC_LVTT_VECTOR;
        timer |= XTIMER_OFFSET;
        timer |= APIC_LVTT_MASKED;
        LAPIC_WRITE(lvt_timer, timer);

        /*
         * Set the Task Priority Register as needed.   At the moment allow
         * interrupts on all cpus (the APs will remain CLId until they are
         * ready to deal).
         */
        temp = LAPIC_READ(tpr);
        temp &= ~APIC_TPR_PRIO;         /* clear priority field */
        LAPIC_WRITE(tpr, temp);

        /*
         * AMD specific setup: mask all extended (AMD-only) LVT entries,
         * if the extended APIC register space is advertised.
         */
        if (cpu_vendor_id == CPU_VENDOR_AMD && lapic_mem != NULL &&
            (LAPIC_MEM_READ(version) & APIC_VER_AMD_EXT_SPACE)) {
                uint32_t ext_feat;
                uint32_t count;
                uint32_t max_count;
                uint32_t lvt;
                uint32_t i;

                ext_feat = LAPIC_MEM_READ(ext_feat);
                count = (ext_feat & APIC_EXTFEAT_MASK) >> APIC_EXTFEAT_SHIFT;
                max_count = sizeof(lapic_mem->ext_lvt) /
                    sizeof(lapic_mem->ext_lvt[0]);
                if (count > max_count)
                        count = max_count;
                for (i = 0; i < count; ++i) {
                        lvt = LAPIC_MEM_READ(ext_lvt[i].lvt);

                        lvt &= ~(APIC_LVT_POLARITY_MASK | APIC_LVT_TRIG_MASK |
                                 APIC_LVT_DM_MASK | APIC_LVT_MASKED);
                        lvt |= APIC_LVT_MASKED | APIC_LVT_DM_FIXED;

                        /* Placeholder for per-entry tweaks; all masked today. */
                        switch(i) {
                        case APIC_EXTLVT_IBS:
                                break;
                        case APIC_EXTLVT_MCA:
                                break;
                        case APIC_EXTLVT_DEI:
                                break;
                        case APIC_EXTLVT_SBI:
                                break;
                        default:
                                break;
                        }
                        if (bsp) {
                                kprintf("   LAPIC AMD elvt%d: 0x%08x",
                                        i, LAPIC_MEM_READ(ext_lvt[i].lvt));
                                if (LAPIC_MEM_READ(ext_lvt[i].lvt) != lvt)
                                        kprintf(" -> 0x%08x", lvt);
                                kprintf("\n");
                        }
                        LAPIC_MEM_WRITE(ext_lvt[i].lvt, lvt);
                }
        }

        /* 
         * Enable the LAPIC 
         */
        temp = LAPIC_READ(svr);
        temp |= APIC_SVR_ENABLE;        /* enable the LAPIC */
        temp &= ~APIC_SVR_FOCUS_DISABLE; /* enable lopri focus processor */

        /* Turn off EOI broadcast suppression if the hardware supports it. */
        if (LAPIC_READ(version) & APIC_VER_EOI_SUPP) {
                if (temp & APIC_SVR_EOI_SUPP) {
                        temp &= ~APIC_SVR_EOI_SUPP;
                        if (bsp)
                                kprintf("    LAPIC disabling EOI supp\n");
                }
        }

        /*
         * Set the spurious interrupt vector.  The low 4 bits of the vector
         * must be 1111.
         */
        if ((XSPURIOUSINT_OFFSET & 0x0F) != 0x0F)
                panic("bad XSPURIOUSINT_OFFSET: 0x%08x", XSPURIOUSINT_OFFSET);
        temp &= ~APIC_SVR_VECTOR;
        temp |= XSPURIOUSINT_OFFSET;

        LAPIC_WRITE(svr, temp);

        /*
         * Pump out a few EOIs to clean out interrupts that got through
         * before we were able to set the TPR.
         */
        LAPIC_WRITE(eoi, 0);
        LAPIC_WRITE(eoi, 0);
        LAPIC_WRITE(eoi, 0);

        if (bsp) {
                lapic_timer_calibrate();
                if (lapic_timer_enable) {
                        if (cpu_thermal_feature & CPUID_THERMAL_ARAT) {
                                /*
                                 * Local APIC timer will not stop
                                 * in deep C-state.
                                 */
                                lapic_cputimer_intr.caps |=
                                    CPUTIMER_INTR_CAP_PS;
                        }
                        if (lapic_use_tscdeadline) {
                                lapic_cputimer_intr.reload =
                                    lapic_timer_tscdlt_reload;
                        }
                        cputimer_intr_register(&lapic_cputimer_intr);
                        cputimer_intr_select(&lapic_cputimer_intr, 0);
                }
        } else if (!lapic_use_tscdeadline) {
                /* APs reuse the divisor the BSP settled on during calibration. */
                lapic_timer_set_divisor(lapic_timer_divisor_idx);
        }

        if (bootverbose)
                apic_dump("apic_initialize()");
}
414
/*
 * Program the LAPIC timer divide configuration register from the
 * lapic_timer_divisors[] table entry at 'divisor_idx'.
 */
static void
lapic_timer_set_divisor(int divisor_idx)
{
        KKASSERT(divisor_idx >= 0 && divisor_idx < APIC_TIMER_NDIVISORS);
        LAPIC_WRITE(dcr_timer, lapic_timer_divisors[divisor_idx]);
}
421
/*
 * Put the LAPIC timer into one-shot mode (clearing periodic and
 * TSC-deadline mode bits in the timer LVT) and arm it with 'count'.
 */
static void
lapic_timer_oneshot(u_int count)
{
        uint32_t value;

        value = LAPIC_READ(lvt_timer);
        value &= ~(APIC_LVTT_PERIODIC | APIC_LVTT_TSCDLT);
        LAPIC_WRITE(lvt_timer, value);
        LAPIC_WRITE(icr_timer, count);
}
432
/*
 * Re-arm the LAPIC timer with 'count' without touching the LVT; the
 * timer mode must already be one-shot.
 */
static void
lapic_timer_oneshot_quick(u_int count)
{
        LAPIC_WRITE(icr_timer, count);
}
438
/*
 * Arm the TSC-deadline timer to fire 'diff' TSC ticks from now and
 * record the programmed deadline in this cpu's tsc_deadlines slot so
 * lapic_timer_tscdlt_reload() can avoid redundant MSR writes.
 */
static void
lapic_timer_tscdeadline_quick(uint64_t diff)
{
        uint64_t val = rdtsc() + diff;

        wrmsr(MSR_TSC_DEADLINE, val);
        tsc_deadlines[mycpuid].timestamp = val;
}
447
448 static uint64_t
449 lapic_scale_to_tsc(unsigned value, unsigned scale)
450 {
451         uint64_t val;
452
453         val = value;
454         val *= tsc_frequency;
455         val += (scale - 1);
456         val /= scale;
457         return val;
458 }
459
460 #define MAX_MEASURE_RETRIES     100
461
/*
 * Estimate the LAPIC timer frequency (Hz) using the TSC as the time
 * reference.
 *
 * us             - measurement window in microseconds (via DELAY()).
 * apic_delay_tsc - previously measured typical cost (in TSC ticks) of a
 *                  ccr_timer read; non-zero enables retrying samples
 *                  that took suspiciously long (e.g. vcpu preemption).
 *
 * Returns the estimated frequency, or 0 if the measurement is unusable
 * (timer wrapped, or it could not cover a 2 second span).
 */
static u_int64_t
do_tsc_calibration(u_int us, u_int64_t apic_delay_tsc)
{
        u_int64_t old_tsc1, old_tsc2, new_tsc1, new_tsc2;
        u_int64_t diff, count;
        u_int64_t a;
        u_int32_t start, end;
        int retries1 = 0, retries2 = 0;

retry1:
        lapic_timer_oneshot_quick(APIC_TIMER_MAX_COUNT);
        /* Bracket the ccr read with TSC reads to detect a slow sample. */
        old_tsc1 = rdtsc_ordered();
        start = LAPIC_READ(ccr_timer);
        old_tsc2 = rdtsc_ordered();
        if (apic_delay_tsc > 0 && retries1 < MAX_MEASURE_RETRIES &&
            old_tsc2 - old_tsc1 > 2 * apic_delay_tsc) {
                retries1++;
                goto retry1;
        }
        DELAY(us);
retry2:
        new_tsc1 = rdtsc_ordered();
        end = LAPIC_READ(ccr_timer);
        new_tsc2 = rdtsc_ordered();
        if (apic_delay_tsc > 0 && retries2 < MAX_MEASURE_RETRIES &&
            new_tsc2 - new_tsc1 > 2 * apic_delay_tsc) {
                retries2++;
                goto retry2;
        }
        /* end == 0 means the down-counter expired (wrapped) mid-window. */
        if (end == 0)
                return 0;

        count = start - end;

        /* Make sure the lapic can count for up to 2s */
        a = (unsigned)APIC_TIMER_MAX_COUNT;
        if (us < 2000000 && (u_int64_t)count * 2000000 >= a * us)
                return 0;

        if (lapic_calibrate_test > 0 && (retries1 > 0 || retries2 > 0)) {
                kprintf("%s: retries1=%d retries2=%d\n",
                    __func__, retries1, retries2);
        }

        /* Average the two TSC windows (start+end brackets), then scale. */
        diff = (new_tsc1 - old_tsc1) + (new_tsc2 - old_tsc2);
        /* XXX First estimate if the total TSC diff value makes sense */
        /* This will almost overflow, but only almost :) */
        count = (2 * count * tsc_frequency) / diff;

        return count;
}
513
/*
 * Estimate the LAPIC timer frequency (Hz) using the system cputimer as
 * the time reference over a DELAY(us) window.  Returns 0 if the LAPIC
 * timer expired during the window or the cputimer did not advance.
 */
static uint64_t
do_cputimer_calibration(u_int us)
{
        sysclock_t value;
        sysclock_t start, end, beginning, finish;

        lapic_timer_oneshot(APIC_TIMER_MAX_COUNT);
        beginning = LAPIC_READ(ccr_timer);
        start = sys_cputimer->count();
        DELAY(us);
        end = sys_cputimer->count();
        finish = LAPIC_READ(ccr_timer);
        if (finish == 0)
                return 0;
        /* value is the LAPIC timer difference. */
        value = beginning - finish;
        /* end is the sys_cputimer difference. */
        end -= start;
        if (end == 0)
                return 0;
        /* LAPIC ticks per cputimer tick, scaled to Hz. */
        value = ((uint64_t)value * sys_cputimer->freq) / end;
        return value;
}
537
/*
 * Determine the LAPIC timer frequency (filling in
 * lapic_cputimer_intr.freq) and pick a usable divisor
 * (lapic_timer_divisor_idx).  In TSC Deadline mode we instead derive
 * the cputimer frequency directly from tsc_frequency.
 */
static void
lapic_timer_calibrate(void)
{
        sysclock_t value;
        u_int64_t apic_delay_tsc = 0;
        int use_tsc_calibration = 0;

        /* No need to calibrate lapic_timer, if we will use TSC Deadline mode */
        if (lapic_use_tscdeadline) {
                /* Shift down so the frequency fits in a sysclock_t. */
                lapic_timer_tscfreq_shift = 0;
                while ((tsc_frequency >> lapic_timer_tscfreq_shift) > INT_MAX)
                        lapic_timer_tscfreq_shift++;
                lapic_cputimer_intr.freq =
                    tsc_frequency >> lapic_timer_tscfreq_shift;
                kprintf(
                    "lapic: TSC Deadline Mode: shift %d, frequency %u Hz\n",
                    lapic_timer_tscfreq_shift, lapic_cputimer_intr.freq);
                return;
        }

        /*
         * On real hardware, tsc_invariant == 0 wouldn't be an issue, but in
         * a virtual machine the frequency may get changed by the host.
         */
        if (tsc_frequency != 0 && tsc_invariant && lapic_calibrate_fast)
                use_tsc_calibration = 1;

        if (use_tsc_calibration) {
                u_int64_t min_apic_tsc = 0, max_apic_tsc = 0;
                u_int64_t old_tsc, new_tsc;
                sysclock_t val;
                int i;

                /* warm up */
                lapic_timer_oneshot(APIC_TIMER_MAX_COUNT);
                for (i = 0; i < 10; i++)
                        val = LAPIC_READ(ccr_timer);

                /* Measure typical latency of a ccr_timer read, in TSC ticks. */
                for (i = 0; i < 100; i++) {
                        old_tsc = rdtsc_ordered();
                        val = LAPIC_READ(ccr_timer);
                        new_tsc = rdtsc_ordered();
                        new_tsc -= old_tsc;
                        apic_delay_tsc += new_tsc;
                        if (min_apic_tsc == 0 ||
                            min_apic_tsc > new_tsc) {
                                min_apic_tsc = new_tsc;
                        }
                        if (max_apic_tsc < new_tsc)
                                max_apic_tsc = new_tsc;
                }
                apic_delay_tsc /= 100;
                kprintf(
                    "LAPIC latency (in TSC ticks): %lu min: %lu max: %lu\n",
                    apic_delay_tsc, min_apic_tsc, max_apic_tsc);
                apic_delay_tsc = min_apic_tsc;
        }

        if (!use_tsc_calibration) {
                int i;

                /*
                 * Do some exercising of the lapic timer access. This improves
                 * precision of the subsequent calibration run in at least some
                 * virtualization cases.
                 */
                lapic_timer_set_divisor(0);
                for (i = 0; i < 10; i++)
                        (void)do_cputimer_calibration(100);
        }
        /* Try to calibrate the local APIC timer. */
        for (lapic_timer_divisor_idx = 0;
             lapic_timer_divisor_idx < APIC_TIMER_NDIVISORS;
             lapic_timer_divisor_idx++) {
                lapic_timer_set_divisor(lapic_timer_divisor_idx);
                if (use_tsc_calibration) {
                        value = do_tsc_calibration(200*1000, apic_delay_tsc);
                } else {
                        value = do_cputimer_calibration(2*1000*1000);
                }
                /* 0 means the timer wrapped; try the next (larger) divisor. */
                if (value != 0)
                        break;
        }
        if (lapic_timer_divisor_idx >= APIC_TIMER_NDIVISORS)
                panic("lapic: no proper timer divisor?!");
        lapic_cputimer_intr.freq = value;

        kprintf("lapic: divisor index %d, frequency %u Hz\n",
                lapic_timer_divisor_idx, lapic_cputimer_intr.freq);

        /* Optional sanity runs at increasing windows (hw.lapic_calibrate_test). */
        if (lapic_calibrate_test > 0) {
                uint64_t freq;
                int i;

                for (i = 1; i <= 20; i++) {
                        if (use_tsc_calibration) {
                                freq = do_tsc_calibration(i*100*1000,
                                    apic_delay_tsc);
                        } else {
                                freq = do_cputimer_calibration(i*100*1000);
                        }
                        if (freq != 0)
                                kprintf("%ums: %lu\n", i * 100, freq);
                }
        }
}
644
/*
 * cputimer_intr reload handler for TSC Deadline mode: arm the deadline
 * MSR to fire 'reload' sysclock ticks from now.  Skips the MSR write if
 * an earlier, still-pending deadline is already programmed.
 */
static void
lapic_timer_tscdlt_reload(struct cputimer_intr *cti, sysclock_t reload)
{
        struct globaldata *gd = mycpu;
        uint64_t diff, now, val;

        /* Clamp the request; convert sysclock ticks to TSC ticks. */
        if (reload > 1000*1000*1000)
                reload = 1000*1000*1000;
        diff = (uint64_t)reload * tsc_frequency / sys_cputimer->freq;
        if (diff < 4)
                diff = 4;
        /* Fence so rdtsc below isn't executed speculatively early. */
        if (cpu_vendor_id == CPU_VENDOR_INTEL)
                cpu_lfence();
        else
                cpu_mfence();
        now = rdtsc();
        val = now + diff;
        if (gd->gd_timer_running) {
                uint64_t deadline = tsc_deadlines[mycpuid].timestamp;
                /* Only reprogram if expired, unarmed, or we want it sooner. */
                if (deadline == 0 || now > deadline || val < deadline) {
                        wrmsr(MSR_TSC_DEADLINE, val);
                        tsc_deadlines[mycpuid].timestamp = val;
                }
        } else {
                gd->gd_timer_running = 1;
                wrmsr(MSR_TSC_DEADLINE, val);
                tsc_deadlines[mycpuid].timestamp = val;
        }
}
674
/*
 * cputimer_intr reload handler for the memory-mapped LAPIC timer:
 * convert 'reload' from sysclock ticks to LAPIC timer ticks and arm the
 * one-shot counter.  If the timer is already running, only shorten it
 * (never push the pending expiry further out).
 */
static void
lapic_mem_timer_intr_reload(struct cputimer_intr *cti, sysclock_t reload)
{
        struct globaldata *gd = mycpu;

        reload = (int64_t)reload * cti->freq / sys_cputimer->freq;
        if (reload < 2)
                reload = 2;

        if (gd->gd_timer_running) {
                if (reload < LAPIC_MEM_READ(ccr_timer))
                        LAPIC_MEM_WRITE(icr_timer, reload);
        } else {
                gd->gd_timer_running = 1;
                LAPIC_MEM_WRITE(icr_timer, reload);
        }
}
692
/*
 * cputimer_intr enable handler: unmask the LAPIC timer interrupt and
 * select the timer mode (one-shot, or TSC-deadline when in use), then
 * apply the AMD C1E fixup if needed.
 */
static void
lapic_timer_intr_enable(struct cputimer_intr *cti __unused)
{
        uint32_t timer;

        timer = LAPIC_READ(lvt_timer);
        timer &= ~(APIC_LVTT_MASKED | APIC_LVTT_PERIODIC | APIC_LVTT_TSCDLT);
        if (lapic_use_tscdeadline)
                timer |= APIC_LVTT_TSCDLT;
        LAPIC_WRITE(lvt_timer, timer);
        /* Serialize the mode switch before any deadline MSR writes. */
        if (lapic_use_tscdeadline)
                cpu_mfence();

        lapic_timer_fixup_handler(NULL);
}
708
/*
 * Per-cpu fixup for the AMD C1E erratum, run locally or via IPI.
 *
 * If 'arg' is non-NULL it points to an int that is set to 1 when this
 * handler had to kick-start the timer (i.e. C1E was detected and
 * disabled), 0 otherwise.
 */
static void
lapic_timer_fixup_handler(void *arg)
{
        int *started = arg;

        if (started != NULL)
                *started = 0;

        if (cpu_vendor_id == CPU_VENDOR_AMD) {
                /*
                 * Detect the presence of C1E capability mostly on latest
                 * dual-cores (or future) k8 family.  This feature renders
                 * the local APIC timer dead, so we disable it by reading
                 * the Interrupt Pending Message register and clearing both
                 * C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
                 * 
                 * Reference:
                 *   "BIOS and Kernel Developer's Guide for AMD NPT
                 *    Family 0Fh Processors"
                 *   #32559 revision 3.00
                 */
                /* Family 0xf, model >= 0x40 (per cpu_id encoding). */
                if ((cpu_id & 0x00000f00) == 0x00000f00 &&
                    (cpu_id & 0x0fff0000) >= 0x00040000) {
                        uint64_t msr;

                        /* MSR 0xc0010055: Interrupt Pending Message register. */
                        msr = rdmsr(0xc0010055);
                        if (msr & 0x18000000) {
                                struct globaldata *gd = mycpu;

                                kprintf("cpu%d: AMD C1E detected\n",
                                        gd->gd_cpuid);
                                wrmsr(0xc0010055, msr & ~0x18000000ULL);

                                /*
                                 * We are kinda stalled;
                                 * kick start again.
                                 */
                                gd->gd_timer_running = 1;
                                if (lapic_use_tscdeadline) {
                                        /* Maybe reached in Virtual Machines? */
                                        lapic_timer_tscdeadline_quick(5000);
                                } else {
                                        lapic_timer_oneshot_quick(2);
                                }

                                if (started != NULL)
                                        *started = 1;
                        }
                }
        }
}
760
/*
 * Per-cpu restart handler (run via IPI): apply the C1E fixup and, if
 * that did not already kick the timer, restart it with a minimal
 * interval so the normal reload path takes over.
 */
static void
lapic_timer_restart_handler(void *dummy __unused)
{
        int started;

        lapic_timer_fixup_handler(&started);
        if (!started) {
                struct globaldata *gd = mycpu;

                gd->gd_timer_running = 1;
                if (lapic_use_tscdeadline) {
                        /* Maybe reached in Virtual Machines? */
                        lapic_timer_tscdeadline_quick(5000);
                } else {
                        lapic_timer_oneshot_quick(2);
                }
        }
}
779
/*
 * This function is called only by ACPICA code currently:
 * - AMD C1E fixup.  AMD C1E only seems to happen after ACPI
 *   module controls PM.  So once ACPICA is attached, we try
 *   to apply the fixup to prevent LAPIC timer from hanging.
 */
static void
lapic_timer_intr_pmfixup(struct cputimer_intr *cti __unused)
{
        /* Broadcast the fixup to every active cpu. */
        lwkt_send_ipiq_mask(smp_active_mask,
                            lapic_timer_fixup_handler, NULL);
}
792
/*
 * cputimer_intr restart handler: restart the LAPIC timer on every
 * active cpu via IPI.
 */
static void
lapic_timer_intr_restart(struct cputimer_intr *cti __unused)
{
        lwkt_send_ipiq_mask(smp_active_mask, lapic_timer_restart_handler, NULL);
}
798
799
/*
 * dump contents of local APIC registers
 *
 * 'str' is a caller-supplied tag included in the output for context.
 */
void
apic_dump(char* str)
{
        kprintf("SMP: CPU%d %s:\n", mycpu->gd_cpuid, str);
        kprintf("     lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n",
                LAPIC_READ(lvt_lint0), LAPIC_READ(lvt_lint1), LAPIC_READ(tpr),
                LAPIC_READ(svr));
}
811
812 /*
813  * Inter Processor Interrupt functions.
814  */
815
/*
 * Wait for a previously issued ICR command to be delivered (the
 * DELSTAT_PEND bit to clear).  Complains once per ~1 second of stall
 * (measured by TSC) and panics after 30 such warnings.  'func' is the
 * caller's name, used in the diagnostics.
 */
static __inline void
lapic_mem_icr_unpend(const char *func)
{
        if (LAPIC_MEM_READ(icr_lo) & APIC_DELSTAT_PEND) {
                int64_t tsc;
                int loops = 1;

                tsc = rdtsc();
                while (LAPIC_MEM_READ(icr_lo) & APIC_DELSTAT_PEND) {
                        cpu_pause();
                        /* Signed-wrap-safe test: more than 1s since 'tsc'? */
                        if ((tsc_sclock_t)(rdtsc() -
                                           (tsc + tsc_frequency)) > 0) {
                                tsc = rdtsc();
                                if (++loops > 30) {
                                        panic("%s: cpu%d apic stalled",
                                            func, mycpuid);
                                } else {
                                        kprintf("%s: cpu%d apic stalled\n",
                                            func, mycpuid);
                                }
                        }
                }
        }
}
840
841 /*
842  * Send APIC IPI 'vector' to 'destType' via 'deliveryMode'.
843  *
844  *  destType is 1 of: APIC_DEST_SELF, APIC_DEST_ALLISELF, APIC_DEST_ALLESELF
845  *  vector is any valid SYSTEM INT vector
846  *  delivery_mode is 1 of: APIC_DELMODE_FIXED, APIC_DELMODE_LOWPRIO
847  *
848  * WARNINGS!
849  *
850  * We now implement a per-cpu interlock (gd->gd_npoll) to prevent more than
851  * one IPI from being sent to any given cpu at a time.  Thus we no longer
852  * have to process incoming IPIs while waiting for the status to clear.
853  * No deadlock should be possible.
854  *
855  * We now physically disable interrupts for the lapic ICR operation.  If
856  * we do not do this then it looks like an EOI sent to the lapic (which
857  * occurs even with a critical section) can interfere with the command
858  * register ready status and cause an IPI to be lost.
859  *
860  * e.g. an interrupt can occur, issue the EOI, IRET, and cause the command
861  * register to busy just before we write to icr_lo, resulting in a lost
862  * issuance.  This only appears to occur on Intel cpus and is not
863  * documented.  It could simply be that cpus are so fast these days that
864  * it was always an issue, but is only now rearing its ugly head.  This
865  * is conjecture.
866  */
867 static int
868 lapic_mem_ipi(int dest_type, int vector, int delivery_mode)
869 {
870         lapic_mem_icr_unpend(__func__);
871         lapic_mem_icr_set(0,
872             dest_type | APIC_LEVEL_ASSERT | delivery_mode | vector);
873         return 0;
874 }
875
876 /*
877  * Interrupts must be hard-disabled by caller
878  */
879 static void
880 lapic_mem_single_ipi(int cpu, int vector, int delivery_mode)
881 {
882         lapic_mem_icr_unpend(__func__);
883         lapic_mem_icr_set(CPUID_TO_APICID(cpu),
884             APIC_DEST_DESTFLD | APIC_LEVEL_ASSERT | delivery_mode | vector);
885 }
886
887 /*
888  * Send APIC IPI 'vector' to 'target's via 'delivery_mode'.
889  *
890  * target is a bitmask of destination cpus.  Vector is any
891  * valid system INT vector.  Delivery mode may be either
892  * APIC_DELMODE_FIXED or APIC_DELMODE_LOWPRIO.
893  *
894  * Interrupts must be hard-disabled by caller
895  */
896 void
897 selected_apic_ipi(cpumask_t target, int vector, int delivery_mode)
898 {
899         while (CPUMASK_TESTNZERO(target)) {
900                 int n = BSFCPUMASK(target);
901                 CPUMASK_NANDBIT(target, n);
902                 single_apic_ipi(n, vector, delivery_mode);
903         }
904 }
905
906 /*
907  * Load a 'downcount time' in uSeconds.
908  */
909 void
910 set_apic_timer(int us)
911 {
912         u_int count;
913
914         if (lapic_use_tscdeadline) {
915                 uint64_t val;
916
917                 val = lapic_scale_to_tsc(us, 1000000);
918                 val += rdtsc();
919                 /* No need to arm the lapic here, just track the timeout. */
920                 tsc_deadlines[mycpuid].downcount_time = val;
921                 return;
922         }
923
924         /*
925          * When we reach here, lapic timer's frequency
926          * must have been calculated as well as the
927          * divisor (lapic->dcr_timer is setup during the
928          * divisor calculation).
929          */
930         KKASSERT(lapic_cputimer_intr.freq != 0 &&
931                  lapic_timer_divisor_idx >= 0);
932
933         count = ((us * (int64_t)lapic_cputimer_intr.freq) + 999999) / 1000000;
934         lapic_timer_oneshot(count);
935 }
936
937
938 /*
939  * Read remaining time in timer, in microseconds (rounded up).
940  */
941 int
942 read_apic_timer(void)
943 {
944         uint64_t val;
945
946         if (lapic_use_tscdeadline) {
947                 uint64_t now;
948
949                 val = tsc_deadlines[mycpuid].downcount_time;
950                 now = rdtsc();
951                 if (val == 0 || now > val) {
952                         return 0;
953                 } else {
954                         val -= now;
955                         val *= 1000000;
956                         val += (tsc_frequency - 1);
957                         val /= tsc_frequency;
958                         if (val > INT_MAX)
959                                 val = INT_MAX;
960                         return val;
961                 }
962         }
963
964         val = LAPIC_READ(ccr_timer);
965         if (val == 0)
966                 return 0;
967
968         KKASSERT(lapic_cputimer_intr.freq > 0);
969         val *= 1000000;
970         val += (lapic_cputimer_intr.freq - 1);
971         val /= lapic_cputimer_intr.freq;
972         if (val > INT_MAX)
973                 val = INT_MAX;
974         return val;
975 }
976
977
978 /*
979  * Spin-style delay, set delay time in uS, spin till it drains.
980  */
981 void
982 u_sleep(int count)
983 {
984         set_apic_timer(count);
985         while (read_apic_timer())
986                  /* spin */ ;
987 }
988
989 int
990 lapic_unused_apic_id(int start)
991 {
992         int i;
993
994         for (i = start; i < APICID_MAX; ++i) {
995                 if (APICID_TO_CPUID(i) == -1)
996                         return i;
997         }
998         return NAPICID;
999 }
1000
/*
 * Map the local APIC's MMIO register page at physical address
 * 'lapic_addr' into kernel virtual memory, uncacheable, and remember
 * the mapping in lapic_mem.
 */
void
lapic_map(vm_paddr_t lapic_addr)
{
	lapic_mem = pmap_mapdev_uncacheable(lapic_addr, sizeof(struct LAPIC));
}
1006
/*
 * Registered LAPIC enumerators, kept sorted by descending
 * lapic_prio (see lapic_enumerator_register()).
 */
static TAILQ_HEAD(, lapic_enumerator) lapic_enumerators =
	TAILQ_HEAD_INITIALIZER(lapic_enumerators);
1009
1010 int
1011 lapic_config(void)
1012 {
1013         struct lapic_enumerator *e;
1014         int error, i, ap_max;
1015
1016         KKASSERT(lapic_enable);
1017
1018         for (i = 0; i < NAPICID; ++i)
1019                 APICID_TO_CPUID(i) = -1;
1020
1021         TAILQ_FOREACH(e, &lapic_enumerators, lapic_link) {
1022                 error = e->lapic_probe(e);
1023                 if (!error)
1024                         break;
1025         }
1026         if (e == NULL) {
1027                 kprintf("LAPIC: Can't find LAPIC\n");
1028                 return ENXIO;
1029         }
1030
1031         error = e->lapic_enumerate(e);
1032         if (error) {
1033                 kprintf("LAPIC: enumeration failed\n");
1034                 return ENXIO;
1035         }
1036
1037         /* LAPIC is usable now. */
1038         lapic_usable = 1;
1039
1040         ap_max = MAXCPU - 1;
1041         TUNABLE_INT_FETCH("hw.ap_max", &ap_max);
1042         if (ap_max > MAXCPU - 1)
1043                 ap_max = MAXCPU - 1;
1044
1045         if (naps > ap_max) {
1046                 kprintf("LAPIC: Warning use only %d out of %d "
1047                         "available APs\n",
1048                         ap_max, naps);
1049                 naps = ap_max;
1050         }
1051
1052         return 0;
1053 }
1054
1055 void
1056 lapic_enumerator_register(struct lapic_enumerator *ne)
1057 {
1058         struct lapic_enumerator *e;
1059
1060         TAILQ_FOREACH(e, &lapic_enumerators, lapic_link) {
1061                 if (e->lapic_prio < ne->lapic_prio) {
1062                         TAILQ_INSERT_BEFORE(e, ne, lapic_link);
1063                         return;
1064                 }
1065         }
1066         TAILQ_INSERT_TAIL(&lapic_enumerators, ne, lapic_link);
1067 }
1068
/*
 * Record the cpu id <-> APIC id association in both lookup tables.
 */
void
lapic_set_cpuid(int cpu_id, int apic_id)
{
	CPUID_TO_APICID(cpu_id) = apic_id;
	APICID_TO_CPUID(apic_id) = cpu_id;
}
1075
/*
 * Adjust the BSP's LAPIC LINT pins for operation without an I/O APIC:
 * unmask LINT0 and mask LINT1.  (Presumably LINT0 carries the legacy
 * PIC output in this configuration -- verify against the icu/8259
 * setup code.)
 */
void
lapic_fixup_noioapic(void)
{
	u_int	temp;

	/* Only allowed on BSP */
	KKASSERT(mycpuid == 0);
	KKASSERT(!ioapic_enable);

	/* Clear the mask bit: enable delivery through LINT0. */
	temp = LAPIC_READ(lvt_lint0);
	temp &= ~APIC_LVT_MASKED;
	LAPIC_WRITE(lvt_lint0, temp);

	/* Set the mask bit: disable delivery through LINT1. */
	temp = LAPIC_READ(lvt_lint1);
	temp |= APIC_LVT_MASKED;
	LAPIC_WRITE(lvt_lint1, temp);
}
1093
/*
 * Acknowledge the current interrupt by writing 0 to the local APIC's
 * memory-mapped EOI register (after logging the event).
 */
static void
lapic_mem_eoi(void)
{
	log_lapic(mem_eoi);
	LAPIC_MEM_WRITE(eoi, 0);
}
1100
/*
 * Write an ICR command via memory-mapped access and busy-wait until
 * the local APIC no longer reports the delivery as pending.
 */
static void
lapic_mem_seticr_sync(uint32_t apic_id, uint32_t icr_lo_val)
{
	lapic_mem_icr_set(apic_id, icr_lo_val);
	while (LAPIC_MEM_READ(icr_lo) & APIC_DELSTAT_PEND)
		/* spin */;
}
1108
/*
 * Synchronously issue an ICR command to the given APIC id.  Currently
 * only the memory-mapped (xAPIC) access method is implemented.
 */
void
lapic_seticr_sync(uint32_t apic_id, uint32_t icr_lo_val)
{
	/* TODO: x2apic */
	lapic_mem_seticr_sync(apic_id, icr_lo_val);
}
1115
1116 static void
1117 lapic_sysinit(void *dummy __unused)
1118 {
1119         if (lapic_enable) {
1120                 int error;
1121
1122                 lapic_eoi = lapic_mem_eoi;
1123                 apic_ipi = lapic_mem_ipi;
1124                 single_apic_ipi = lapic_mem_single_ipi;
1125
1126                 error = lapic_config();
1127                 if (error)
1128                         lapic_enable = 0;
1129         }
1130
1131         if (lapic_enable) {
1132                 /* Initialize BSP's local APIC */
1133                 lapic_init(TRUE);
1134         } else if (ioapic_enable) {
1135                 ioapic_enable = 0;
1136                 icu_reinit_noioapic();
1137         }
1138 }
1139 SYSINIT(lapic, SI_BOOT2_LAPIC, SI_ORDER_FIRST, lapic_sysinit, NULL);