Commit | Line | Data |
---|---|---|
46d4e165 JG |
1 | /* |
2 | * Copyright (c) 1996, by Steve Passe | |
3 | * All rights reserved. | |
4 | * | |
5 | * Redistribution and use in source and binary forms, with or without | |
6 | * modification, are permitted provided that the following conditions | |
7 | * are met: | |
8 | * 1. Redistributions of source code must retain the above copyright | |
9 | * notice, this list of conditions and the following disclaimer. | |
10 | * 2. The name of the developer may NOT be used to endorse or promote products | |
11 | * derived from this software without specific prior written permission. | |
12 | * | |
13 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |
14 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
16 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | |
17 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
18 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
19 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
20 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
21 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
22 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
23 | * SUCH DAMAGE. | |
24 | * | |
25 | * $FreeBSD: src/sys/i386/i386/mp_machdep.c,v 1.115.2.15 2003/03/14 21:22:35 jhb Exp $ | |
46d4e165 JG |
26 | */ |
27 | ||
28 | #include "opt_cpu.h" | |
29 | ||
30 | #include <sys/param.h> | |
31 | #include <sys/systm.h> | |
32 | #include <sys/kernel.h> | |
33 | #include <sys/sysctl.h> | |
34 | #include <sys/malloc.h> | |
35 | #include <sys/memrange.h> | |
36 | #include <sys/cons.h> /* cngetc() */ | |
37 | #include <sys/machintr.h> | |
0e9325d3 | 38 | #include <sys/cpu_topology.h> |
46d4e165 | 39 | |
684a93c4 MD |
40 | #include <sys/mplock2.h> |
41 | ||
46d4e165 JG |
42 | #include <vm/vm.h> |
43 | #include <vm/vm_param.h> | |
44 | #include <vm/pmap.h> | |
45 | #include <vm/vm_kern.h> | |
46 | #include <vm/vm_extern.h> | |
47 | #include <sys/lock.h> | |
48 | #include <vm/vm_map.h> | |
46d4e165 JG |
49 | |
50 | #include <machine/smp.h> | |
51 | #include <machine_base/apic/apicreg.h> | |
52 | #include <machine/atomic.h> | |
53 | #include <machine/cpufunc.h> | |
f77c018a | 54 | #include <machine/cputypes.h> |
2b6cd37e | 55 | #include <machine_base/apic/lapic.h> |
61452645 | 56 | #include <machine_base/apic/ioapic.h> |
8005c0c8 | 57 | #include <machine_base/acpica/acpi_md_cpu.h> |
46d4e165 JG |
58 | #include <machine/psl.h> |
59 | #include <machine/segments.h> | |
60 | #include <machine/tss.h> | |
61 | #include <machine/specialreg.h> | |
62 | #include <machine/globaldata.h> | |
4117f2fd | 63 | #include <machine/pmap_inval.h> |
1a5c7e0f | 64 | #include <machine/clock.h> |
46d4e165 JG |
65 | |
66 | #include <machine/md_var.h> /* setidt() */ | |
57a9c56b | 67 | #include <machine_base/icu/icu.h> /* IPIs */ |
3566408b | 68 | #include <machine_base/icu/icu_var.h> |
e0918665 | 69 | #include <machine_base/apic/ioapic_abi.h> |
57a9c56b | 70 | #include <machine/intr_machdep.h> /* IPIs */ |
46d4e165 | 71 | |
46d4e165 JG |
/*
 * Location of the warm-start vector (offset word and segment word),
 * addressed through the KERNBASE mapping, and the offset value we
 * store there when pointing it at the AP boot code.
 */
#define WARMBOOT_TARGET		0
#define WARMBOOT_OFF		(KERNBASE + 0x0467)
#define WARMBOOT_SEG		(KERNBASE + 0x0469)

/* CMOS index/data ports, the reset register and its 'warm-start' value */
#define CMOS_REG		(0x70)
#define CMOS_DATA		(0x71)
#define BIOS_RESET		(0x0f)
#define BIOS_WARM		(0x0a)

/* invlpg_timeout: default value, and the value used when running as a guest */
#define INVLPG_TIMEOUT_DEFAULT	10
#define INVLPG_TIMEOUT_VM	60
83 | ||
46d4e165 JG |
/*
 * Optional AP boot checkpoints.
 *
 * This code MUST be enabled both here and in mpboot.s.  It follows the
 * very early stages of AP boot by placing values in CMOS ram.  It will
 * NORMALLY never be needed, hence the primitive method for enabling it.
 *
 * CHECK_INIT(D)  writes D into each of the CMOS slots 0x34-0x39.
 * CHECK_PRINT(S) prints the label S followed by the six slot values.
 *
 * Both are wrapped in do { } while (0) so that each use expands to exactly
 * one statement and is safe inside an unbraced if/else.
 */
#if defined(CHECK_POINTS)
#define CHECK_READ(A)	 (outb(CMOS_REG, (A)), inb(CMOS_DATA))
#define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D)))

#define CHECK_INIT(D)				\
do {						\
	CHECK_WRITE(0x34, (D));			\
	CHECK_WRITE(0x35, (D));			\
	CHECK_WRITE(0x36, (D));			\
	CHECK_WRITE(0x37, (D));			\
	CHECK_WRITE(0x38, (D));			\
	CHECK_WRITE(0x39, (D));			\
} while (0)

#define CHECK_PRINT(S)				\
do {						\
	kprintf("%s: %d, %d, %d, %d, %d, %d\n",	\
	    (S),				\
	    CHECK_READ(0x34),			\
	    CHECK_READ(0x35),			\
	    CHECK_READ(0x36),			\
	    CHECK_READ(0x37),			\
	    CHECK_READ(0x38),			\
	    CHECK_READ(0x39));			\
} while (0)

#else				/* CHECK_POINTS */

#define CHECK_INIT(D)
#define CHECK_PRINT(S)

#endif				/* CHECK_POINTS */
118 | ||
/*
 * Values to send to the POST hardware, one per SMP bring-up stage.
 */
#define MP_BOOTADDRESS_POST	0x10
#define MP_PROBE_POST		0x11
#define MPTABLE_PASS1_POST	0x12

#define MP_START_POST		0x13
#define MP_ENABLE_POST		0x14
#define MPTABLE_PASS2_POST	0x15

#define START_ALL_APS_POST	0x16
#define INSTALL_AP_TRAMP_POST	0x17
#define START_AP_POST		0x18

#define MP_ANNOUNCE_POST	0x19
135 | ||
46d4e165 JG |
136 | /** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */ |
137 | int current_postcode; | |
138 | ||
139 | /** XXX FIXME: what system files declare these??? */ | |
46d4e165 | 140 | |
637df2f6 | 141 | extern int naps; |
1a923442 | 142 | extern int _udatasel; |
46d4e165 | 143 | |
46d4e165 JG |
144 | int64_t tsc0_offset; |
145 | extern int64_t tsc_offsets[]; | |
146 | ||
46d4e165 JG |
147 | /* AP uses this during bootstrap. Do not staticize. */ |
148 | char *bootSTK; | |
149 | static int bootAP; | |
150 | ||
46d4e165 JG |
151 | struct pcb stoppcbs[MAXCPU]; |
152 | ||
153 | extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32); | |
154 | ||
46d4e165 JG |
155 | /* |
156 | * Local data and functions. | |
157 | */ | |
158 | ||
46d4e165 | 159 | static u_int boot_address; |
46d4e165 | 160 | static int mp_finish; |
c6b1591c | 161 | static int mp_finish_lapic; |
46d4e165 | 162 | |
46d4e165 | 163 | static int start_all_aps(u_int boot_addr); |
bfc09ba0 | 164 | #if 0 |
46d4e165 | 165 | static void install_ap_tramp(u_int boot_addr); |
bfc09ba0 | 166 | #endif |
bb467734 MD |
167 | static int start_ap(struct mdglobaldata *gd, u_int boot_addr, int smibest); |
168 | static int smitest(void); | |
3a69c113 | 169 | static void mp_bsp_simple_setup(void); |
46d4e165 | 170 | |
c07315c4 | 171 | /* which cpus have been started */ |
36bbfccb | 172 | __read_mostly static cpumask_t smp_startup_mask = CPUMASK_INITIALIZER_ONLYONE; |
c07315c4 | 173 | /* which cpus have lapic been inited */ |
36bbfccb | 174 | __read_mostly static cpumask_t smp_lapic_mask = CPUMASK_INITIALIZER_ONLYONE; |
c07315c4 | 175 | /* which cpus are ready for IPIs etc? */ |
36bbfccb MD |
176 | __read_mostly cpumask_t smp_active_mask = CPUMASK_INITIALIZER_ONLYONE; |
177 | __read_mostly cpumask_t smp_finalize_mask = CPUMASK_INITIALIZER_ONLYONE; | |
c07315c4 | 178 | |
95be233a SZ |
179 | SYSCTL_OPAQUE(_machdep, OID_AUTO, smp_active, CTLFLAG_RD, |
180 | &smp_active_mask, sizeof(smp_active_mask), "LU", ""); | |
46d4e165 | 181 | static u_int bootMP_size; |
36bbfccb | 182 | __read_mostly static u_int report_invlpg_src; |
79f2da03 MD |
183 | SYSCTL_INT(_machdep, OID_AUTO, report_invlpg_src, CTLFLAG_RW, |
184 | &report_invlpg_src, 0, ""); | |
36bbfccb | 185 | __read_mostly static u_int report_invltlb_src; |
79f2da03 MD |
186 | SYSCTL_INT(_machdep, OID_AUTO, report_invltlb_src, CTLFLAG_RW, |
187 | &report_invltlb_src, 0, ""); | |
36bbfccb | 188 | __read_mostly static int optimized_invltlb; |
1a5c7e0f MD |
189 | SYSCTL_INT(_machdep, OID_AUTO, optimized_invltlb, CTLFLAG_RW, |
190 | &optimized_invltlb, 0, ""); | |
36bbfccb | 191 | __read_mostly static int all_but_self_ipi_enable = 1; |
bba35d66 MD |
192 | SYSCTL_INT(_machdep, OID_AUTO, all_but_self_ipi_enable, CTLFLAG_RW, |
193 | &all_but_self_ipi_enable, 0, ""); | |
36bbfccb | 194 | __read_mostly static int invlpg_timeout = INVLPG_TIMEOUT_DEFAULT; |
2d20c15f MD |
195 | SYSCTL_INT(_machdep, OID_AUTO, invlpg_timeout, CTLFLAG_RW, |
196 | &invlpg_timeout, 0, ""); | |
46d4e165 | 197 | |
f77c018a MC |
198 | /* Local data for detecting CPU TOPOLOGY */ |
199 | static int core_bits = 0; | |
200 | static int logical_CPU_bits = 0; | |
201 | ||
202 | ||
46d4e165 JG |
203 | /* |
204 | * Calculate usable address in base memory for AP trampoline code. | |
205 | */ | |
206 | u_int | |
207 | mp_bootaddress(u_int basemem) | |
208 | { | |
209 | POSTCODE(MP_BOOTADDRESS_POST); | |
210 | ||
c855ebba JG |
211 | bootMP_size = mptramp_end - mptramp_start; |
212 | boot_address = trunc_page(basemem * 1024); /* round down to 4k boundary */ | |
213 | if (((basemem * 1024) - boot_address) < bootMP_size) | |
214 | boot_address -= PAGE_SIZE; /* not enough, lower by 4k */ | |
46d4e165 JG |
215 | /* 3 levels of page table pages */ |
216 | mptramp_pagetables = boot_address - (PAGE_SIZE * 3); | |
217 | ||
218 | return mptramp_pagetables; | |
219 | } | |
220 | ||
46d4e165 JG |
221 | /* |
222 | * Print various information about the SMP system hardware and setup. | |
223 | */ | |
224 | void | |
225 | mp_announce(void) | |
226 | { | |
227 | int x; | |
228 | ||
229 | POSTCODE(MP_ANNOUNCE_POST); | |
230 | ||
231 | kprintf("DragonFly/MP: Multiprocessor motherboard\n"); | |
fbac0dc4 | 232 | kprintf(" cpu0 (BSP): apic id: %2d\n", CPUID_TO_APICID(0)); |
637df2f6 | 233 | for (x = 1; x <= naps; ++x) |
fbac0dc4 | 234 | kprintf(" cpu%d (AP): apic id: %2d\n", x, CPUID_TO_APICID(x)); |
46d4e165 | 235 | |
f45bfca0 | 236 | if (!ioapic_enable) |
7a603b36 | 237 | kprintf(" Warning: APIC I/O disabled\n"); |
46d4e165 JG |
238 | } |
239 | ||
240 | /* | |
241 | * AP cpu's call this to sync up protected mode. | |
242 | * | |
ec073ddc | 243 | * WARNING! %gs is not set up on entry. This routine sets up %gs. |
46d4e165 JG |
244 | */ |
245 | void | |
246 | init_secondary(void) | |
247 | { | |
248 | int gsel_tss; | |
249 | int x, myid = bootAP; | |
250 | u_int64_t msr, cr0; | |
251 | struct mdglobaldata *md; | |
252 | struct privatespace *ps; | |
31815141 | 253 | struct user_segment_descriptor *gdt; |
46d4e165 | 254 | |
4864d541 | 255 | ps = CPU_prvspace[myid]; |
31815141 | 256 | gdt = ps->mdglobaldata.gd_gdt; |
46d4e165 | 257 | |
9e24b495 | 258 | gdt_segs[GPROC0_SEL].ssd_base = (long)&ps->common_tss; |
46d4e165 JG |
259 | ps->mdglobaldata.mi.gd_prvspace = ps; |
260 | ||
261 | /* We fill the 32-bit segment descriptors */ | |
262 | for (x = 0; x < NGDT; x++) { | |
263 | if (x != GPROC0_SEL && x != (GPROC0_SEL + 1)) | |
31815141 | 264 | ssdtosd(&gdt_segs[x], &gdt[x]); |
46d4e165 JG |
265 | } |
266 | /* And now a 64-bit one */ | |
267 | ssdtosyssd(&gdt_segs[GPROC0_SEL], | |
31815141 | 268 | (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); |
46d4e165 | 269 | |
31815141 MD |
270 | r_gdt.rd_limit = MAXGDT_LIMIT - 1; |
271 | r_gdt.rd_base = (long)(intptr_t)gdt; | |
46d4e165 JG |
272 | lgdt(&r_gdt); /* does magic intra-segment return */ |
273 | ||
ec073ddc JG |
274 | /* lgdt() destroys the GSBASE value, so we load GSBASE after lgdt() */ |
275 | wrmsr(MSR_FSBASE, 0); /* User value */ | |
276 | wrmsr(MSR_GSBASE, (u_int64_t)ps); | |
277 | wrmsr(MSR_KGSBASE, 0); /* XXX User value while we're in the kernel */ | |
278 | ||
8a06c6ee | 279 | lidt(&r_idt_arr[mdcpu->mi.gd_cpuid]); |
46d4e165 | 280 | |
1a923442 MD |
281 | load_ds(_udatasel); |
282 | load_es(_udatasel); | |
283 | load_fs(_udatasel); | |
284 | ||
46d4e165 JG |
285 | #if 0 |
286 | lldt(_default_ldt); | |
287 | mdcpu->gd_currentldt = _default_ldt; | |
288 | #endif | |
289 | ||
290 | gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); | |
31815141 | 291 | gdt[GPROC0_SEL].sd_type = SDT_SYSTSS; |
46d4e165 JG |
292 | |
293 | md = mdcpu; /* loaded through %gs:0 (mdglobaldata.mi.gd_prvspace)*/ | |
294 | ||
4611d87f | 295 | /* |
fc921477 MD |
296 | * TSS entry point for interrupts, traps, and exceptions |
297 | * (sans NMI). This will always go to near the top of the pcpu | |
298 | * trampoline area. Hardware-pushed data will be copied into | |
299 | * the trap-frame on entry, and (if necessary) returned to the | |
300 | * trampoline on exit. | |
301 | * | |
302 | * We store some pcb data for the trampoline code above the | |
303 | * stack the cpu hw pushes into, and arrange things so the | |
304 | * address of tr_pcb_rsp is the same as the desired top of | |
305 | * stack. | |
4611d87f | 306 | */ |
9e24b495 MD |
307 | ps->common_tss.tss_rsp0 = (register_t)&ps->trampoline.tr_pcb_rsp; |
308 | ps->trampoline.tr_pcb_rsp = ps->common_tss.tss_rsp0; | |
85b33048 MD |
309 | ps->trampoline.tr_pcb_gs_kernel = (register_t)md; |
310 | ps->trampoline.tr_pcb_cr3 = KPML4phys; /* adj to user cr3 live */ | |
311 | ps->dbltramp.tr_pcb_gs_kernel = (register_t)md; | |
312 | ps->dbltramp.tr_pcb_cr3 = KPML4phys; | |
313 | ps->dbgtramp.tr_pcb_gs_kernel = (register_t)md; | |
314 | ps->dbgtramp.tr_pcb_cr3 = KPML4phys; | |
fc921477 | 315 | |
46d4e165 | 316 | #if 0 /* JG XXX */ |
9e24b495 | 317 | ps->common_tss.tss_ioopt = (sizeof ps->common_tss) << 16; |
46d4e165 | 318 | #endif |
31815141 | 319 | md->gd_tss_gdt = &gdt[GPROC0_SEL]; |
46d4e165 | 320 | md->gd_common_tssd = *md->gd_tss_gdt; |
093565f2 MD |
321 | |
322 | /* double fault stack */ | |
85b33048 MD |
323 | ps->common_tss.tss_ist1 = (register_t)&ps->dbltramp.tr_pcb_rsp; |
324 | ps->common_tss.tss_ist2 = (register_t)&ps->dbgtramp.tr_pcb_rsp; | |
093565f2 | 325 | |
46d4e165 JG |
326 | ltr(gsel_tss); |
327 | ||
46d4e165 JG |
328 | /* |
329 | * Set to a known state: | |
330 | * Set by mpboot.s: CR0_PG, CR0_PE | |
331 | * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM | |
332 | */ | |
333 | cr0 = rcr0(); | |
334 | cr0 &= ~(CR0_CD | CR0_NW | CR0_EM); | |
335 | load_cr0(cr0); | |
336 | ||
337 | /* Set up the fast syscall stuff */ | |
338 | msr = rdmsr(MSR_EFER) | EFER_SCE; | |
339 | wrmsr(MSR_EFER, msr); | |
340 | wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall)); | |
341 | wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); | |
342 | msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | | |
343 | ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); | |
344 | wrmsr(MSR_STAR, msr); | |
921ef7b6 | 345 | wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL|PSL_AC); |
46d4e165 JG |
346 | |
347 | pmap_set_opt(); /* PSE/4MB pages, etc */ | |
b524ca76 | 348 | pmap_init_pat(); /* Page Attribute Table */ |
46d4e165 JG |
349 | |
350 | /* set up CPU registers and state */ | |
351 | cpu_setregs(); | |
352 | ||
353 | /* set up SSE/NX registers */ | |
20a6d9db | 354 | initializecpu(myid); |
46d4e165 JG |
355 | |
356 | /* set up FPU state on the AP */ | |
186c803f | 357 | npxinit(); |
ec073ddc | 358 | |
f89b4a45 SZ |
359 | /* If BSP is in the X2APIC mode, put the AP into the X2APIC mode. */ |
360 | if (x2apic_enable) | |
361 | lapic_x2apic_enter(FALSE); | |
362 | ||
ec073ddc | 363 | /* disable the APIC, just to be SURE */ |
8afc0c3d | 364 | LAPIC_WRITE(svr, (LAPIC_READ(svr) & ~APIC_SVR_ENABLE)); |
46d4e165 JG |
365 | } |
366 | ||
367 | /******************************************************************* | |
368 | * local functions and data | |
369 | */ | |
370 | ||
371 | /* | |
3a69c113 | 372 | * Start the SMP system |
46d4e165 JG |
373 | */ |
374 | static void | |
3a69c113 | 375 | mp_start_aps(void *dummy __unused) |
46d4e165 | 376 | { |
2e0ed166 SZ |
377 | if (lapic_enable) { |
378 | /* start each Application Processor */ | |
3a69c113 | 379 | start_all_aps(boot_address); |
2e0ed166 | 380 | } else { |
3a69c113 | 381 | mp_bsp_simple_setup(); |
944562df | 382 | } |
46d4e165 | 383 | } |
f3f3eadb | 384 | SYSINIT(startaps, SI_BOOT2_START_APS, SI_ORDER_FIRST, mp_start_aps, NULL); |
46d4e165 | 385 | |
46d4e165 JG |
386 | /* |
387 | * start each AP in our list | |
388 | */ | |
389 | static int | |
390 | start_all_aps(u_int boot_addr) | |
391 | { | |
392 | vm_offset_t va = boot_address + KERNBASE; | |
393 | u_int64_t *pt4, *pt3, *pt2; | |
4864d541 | 394 | int pssize; |
77a4bf30 | 395 | int x, i; |
46d4e165 | 396 | int shift; |
bb467734 MD |
397 | int smicount; |
398 | int smibest; | |
399 | int smilast; | |
46d4e165 JG |
400 | u_char mpbiosreason; |
401 | u_long mpbioswarmvec; | |
402 | struct mdglobaldata *gd; | |
403 | struct privatespace *ps; | |
1997b4c2 | 404 | size_t ipiq_size; |
46d4e165 JG |
405 | |
406 | POSTCODE(START_ALL_APS_POST); | |
407 | ||
46d4e165 JG |
408 | /* install the AP 1st level boot code */ |
409 | pmap_kenter(va, boot_address); | |
bfc09ba0 | 410 | cpu_invlpg((void *)va); /* JG XXX */ |
46d4e165 JG |
411 | bcopy(mptramp_start, (void *)va, bootMP_size); |
412 | ||
413 | /* Locate the page tables, they'll be below the trampoline */ | |
414 | pt4 = (u_int64_t *)(uintptr_t)(mptramp_pagetables + KERNBASE); | |
415 | pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t); | |
416 | pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t); | |
417 | ||
418 | /* Create the initial 1GB replicated page tables */ | |
419 | for (i = 0; i < 512; i++) { | |
420 | /* Each slot of the level 4 pages points to the same level 3 page */ | |
421 | pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE); | |
c713db65 AL |
422 | pt4[i] |= kernel_pmap->pmap_bits[PG_V_IDX] | |
423 | kernel_pmap->pmap_bits[PG_RW_IDX] | | |
424 | kernel_pmap->pmap_bits[PG_U_IDX]; | |
46d4e165 JG |
425 | |
426 | /* Each slot of the level 3 pages points to the same level 2 page */ | |
427 | pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE)); | |
c713db65 AL |
428 | pt3[i] |= kernel_pmap->pmap_bits[PG_V_IDX] | |
429 | kernel_pmap->pmap_bits[PG_RW_IDX] | | |
430 | kernel_pmap->pmap_bits[PG_U_IDX]; | |
46d4e165 JG |
431 | |
432 | /* The level 2 page slots are mapped with 2MB pages for 1GB. */ | |
433 | pt2[i] = i * (2 * 1024 * 1024); | |
c713db65 AL |
434 | pt2[i] |= kernel_pmap->pmap_bits[PG_V_IDX] | |
435 | kernel_pmap->pmap_bits[PG_RW_IDX] | | |
436 | kernel_pmap->pmap_bits[PG_PS_IDX] | | |
437 | kernel_pmap->pmap_bits[PG_U_IDX]; | |
46d4e165 JG |
438 | } |
439 | ||
440 | /* save the current value of the warm-start vector */ | |
441 | mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF); | |
442 | outb(CMOS_REG, BIOS_RESET); | |
443 | mpbiosreason = inb(CMOS_DATA); | |
444 | ||
445 | /* setup a vector to our boot code */ | |
446 | *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; | |
447 | *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4); | |
448 | outb(CMOS_REG, BIOS_RESET); | |
449 | outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ | |
450 | ||
bb467734 MD |
451 | /* |
452 | * If we have a TSC we can figure out the SMI interrupt rate. | |
453 | * The SMI does not necessarily use a constant rate. Spend | |
454 | * up to 250ms trying to figure it out. | |
455 | */ | |
456 | smibest = 0; | |
457 | if (cpu_feature & CPUID_TSC) { | |
458 | set_apic_timer(275000); | |
459 | smilast = read_apic_timer(); | |
460 | for (x = 0; x < 20 && read_apic_timer(); ++x) { | |
461 | smicount = smitest(); | |
462 | if (smibest == 0 || smilast - smicount < smibest) | |
463 | smibest = smilast - smicount; | |
464 | smilast = smicount; | |
465 | } | |
466 | if (smibest > 250000) | |
467 | smibest = 0; | |
bb467734 MD |
468 | } |
469 | if (smibest) | |
470 | kprintf("SMI Frequency (worst case): %d Hz (%d us)\n", | |
471 | 1000000 / smibest, smibest); | |
472 | ||
2d20c15f MD |
473 | /* |
474 | * This is nasty but if we are a guest in a virtual machine, | |
475 | * give the smpinvl synchronization code up to 60 seconds | |
476 | */ | |
477 | ||
478 | if (vmm_guest != VMM_GUEST_NONE) | |
479 | invlpg_timeout = INVLPG_TIMEOUT_VM; | |
480 | ||
46d4e165 | 481 | /* start each AP */ |
637df2f6 | 482 | for (x = 1; x <= naps; ++x) { |
46d4e165 JG |
483 | /* This is a bit verbose, it will go away soon. */ |
484 | ||
4864d541 | 485 | pssize = sizeof(struct privatespace); |
31815141 MD |
486 | ps = (void *) |
487 | kmem_alloc3(kernel_map, pssize, VM_SUBSYS_GD, | |
488 | KM_CPU(x)); | |
489 | bzero(ps, pssize); | |
4864d541 | 490 | CPU_prvspace[x] = ps; |
31815141 MD |
491 | gd = &ps->mdglobaldata; |
492 | gd->mi.gd_prvspace = ps; | |
493 | gd->gd_gdt = (void *) | |
494 | kmem_alloc3(kernel_map, MAXGDT_LIMIT, VM_SUBSYS_GD, | |
495 | KM_CPU(x)); | |
496 | bzero(gd->gd_gdt, MAXGDT_LIMIT); | |
497 | ||
1997b4c2 | 498 | #if 0 |
4864d541 | 499 | kprintf("ps %d %p %d\n", x, ps, pssize); |
1997b4c2 | 500 | #endif |
46d4e165 JG |
501 | |
502 | /* prime data page for it to use */ | |
503 | mi_gdinit(&gd->mi, x); | |
504 | cpu_gdinit(gd, x); | |
1997b4c2 | 505 | ipiq_size = sizeof(struct lwkt_ipiq) * (naps + 1); |
1eeaf6b2 | 506 | gd->mi.gd_ipiq = (void *)kmem_alloc3(kernel_map, ipiq_size, |
0ffdc451 | 507 | VM_SUBSYS_IPIQ, KM_CPU(x)); |
1997b4c2 | 508 | bzero(gd->mi.gd_ipiq, ipiq_size); |
46d4e165 | 509 | |
8005c0c8 SZ |
510 | gd->gd_acpi_id = CPUID_TO_ACPIID(gd->mi.gd_cpuid); |
511 | ||
cc3685b0 SZ |
512 | /* initialize arc4random. */ |
513 | arc4_init_pcpu(x); | |
514 | ||
46d4e165 JG |
515 | /* setup a vector to our boot code */ |
516 | *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; | |
517 | *((volatile u_short *) WARMBOOT_SEG) = (boot_addr >> 4); | |
518 | outb(CMOS_REG, BIOS_RESET); | |
519 | outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ | |
520 | ||
521 | /* | |
522 | * Setup the AP boot stack | |
523 | */ | |
1997b4c2 | 524 | bootSTK = &ps->idlestack[UPAGES * PAGE_SIZE - PAGE_SIZE]; |
46d4e165 JG |
525 | bootAP = x; |
526 | ||
527 | /* attempt to start the Application Processor */ | |
528 | CHECK_INIT(99); /* setup checkpoints */ | |
bb467734 | 529 | if (!start_ap(gd, boot_addr, smibest)) { |
ea96e50f | 530 | kprintf("\nAP #%d (PHY# %d) failed!\n", |
fbac0dc4 | 531 | x, CPUID_TO_APICID(x)); |
46d4e165 JG |
532 | CHECK_PRINT("trace"); /* show checkpoints */ |
533 | /* better panic as the AP may be running loose */ | |
534 | kprintf("panic y/n? [y] "); | |
ce7866b8 | 535 | cnpoll(TRUE); |
46d4e165 JG |
536 | if (cngetc() != 'n') |
537 | panic("bye-bye"); | |
ce7866b8 | 538 | cnpoll(FALSE); |
46d4e165 JG |
539 | } |
540 | CHECK_PRINT("trace"); /* show checkpoints */ | |
46d4e165 JG |
541 | } |
542 | ||
543 | /* set ncpus to 1 + highest logical cpu. Not all may have come up */ | |
544 | ncpus = x; | |
545 | ||
46d4e165 JG |
546 | for (shift = 0; (1 << shift) <= ncpus; ++shift) |
547 | ; | |
548 | --shift; | |
46d4e165 JG |
549 | |
550 | /* ncpus_fit -- ncpus rounded up to the nearest power of 2 */ | |
551 | if ((1 << shift) < ncpus) | |
552 | ++shift; | |
553 | ncpus_fit = 1 << shift; | |
554 | ncpus_fit_mask = ncpus_fit - 1; | |
555 | ||
556 | /* build our map of 'other' CPUs */ | |
c07315c4 MD |
557 | mycpu->gd_other_cpus = smp_startup_mask; |
558 | CPUMASK_NANDBIT(mycpu->gd_other_cpus, mycpu->gd_cpuid); | |
1997b4c2 | 559 | |
3ab3ae18 MD |
560 | malloc_reinit_ncpus(); |
561 | ||
8005c0c8 SZ |
562 | gd = (struct mdglobaldata *)mycpu; |
563 | gd->gd_acpi_id = CPUID_TO_ACPIID(mycpu->gd_cpuid); | |
564 | ||
1997b4c2 | 565 | ipiq_size = sizeof(struct lwkt_ipiq) * ncpus; |
1eeaf6b2 | 566 | mycpu->gd_ipiq = (void *)kmem_alloc3(kernel_map, ipiq_size, |
0ffdc451 | 567 | VM_SUBSYS_IPIQ, KM_CPU(0)); |
1997b4c2 | 568 | bzero(mycpu->gd_ipiq, ipiq_size); |
46d4e165 | 569 | |
cc3685b0 SZ |
570 | /* initialize arc4random. */ |
571 | arc4_init_pcpu(0); | |
572 | ||
46d4e165 JG |
573 | /* restore the warmstart vector */ |
574 | *(u_long *) WARMBOOT_OFF = mpbioswarmvec; | |
575 | outb(CMOS_REG, BIOS_RESET); | |
576 | outb(CMOS_DATA, mpbiosreason); | |
577 | ||
578 | /* | |
579 | * NOTE! The idlestack for the BSP was setup by locore. Finish | |
580 | * up, clean out the P==V mapping we did earlier. | |
581 | */ | |
46d4e165 JG |
582 | pmap_set_opt(); |
583 | ||
c6b1591c SZ |
584 | /* |
585 | * Wait all APs to finish initializing LAPIC | |
586 | */ | |
c6b1591c SZ |
587 | if (bootverbose) |
588 | kprintf("SMP: Waiting APs LAPIC initialization\n"); | |
589 | if (cpu_feature & CPUID_TSC) | |
590 | tsc0_offset = rdtsc(); | |
591 | tsc_offsets[0] = 0; | |
1997b4c2 | 592 | mp_finish_lapic = 1; |
c6b1591c | 593 | rel_mplock(); |
1997b4c2 | 594 | |
c07315c4 | 595 | while (CPUMASK_CMPMASKNEQ(smp_lapic_mask, smp_startup_mask)) { |
06c66eb2 | 596 | cpu_pause(); |
c6b1591c SZ |
597 | cpu_lfence(); |
598 | if (cpu_feature & CPUID_TSC) | |
599 | tsc0_offset = rdtsc(); | |
600 | } | |
06c66eb2 MD |
601 | while (try_mplock() == 0) { |
602 | cpu_pause(); | |
603 | cpu_lfence(); | |
604 | } | |
c6b1591c | 605 | |
46d4e165 JG |
606 | /* number of APs actually started */ |
607 | return ncpus - 1; | |
608 | } | |
609 | ||
610 | ||
611 | /* | |
612 | * load the 1st level AP boot code into base memory. | |
613 | */ | |
614 | ||
615 | /* targets for relocation */ | |
616 | extern void bigJump(void); | |
617 | extern void bootCodeSeg(void); | |
618 | extern void bootDataSeg(void); | |
619 | extern void MPentry(void); | |
620 | extern u_int MP_GDT; | |
621 | extern u_int mp_gdtbase; | |
622 | ||
bfc09ba0 MD |
623 | #if 0 |
624 | ||
46d4e165 JG |
625 | static void |
626 | install_ap_tramp(u_int boot_addr) | |
627 | { | |
628 | int x; | |
629 | int size = *(int *) ((u_long) & bootMP_size); | |
630 | u_char *src = (u_char *) ((u_long) bootMP); | |
631 | u_char *dst = (u_char *) boot_addr + KERNBASE; | |
632 | u_int boot_base = (u_int) bootMP; | |
633 | u_int8_t *dst8; | |
634 | u_int16_t *dst16; | |
635 | u_int32_t *dst32; | |
636 | ||
637 | POSTCODE(INSTALL_AP_TRAMP_POST); | |
638 | ||
639 | for (x = 0; x < size; ++x) | |
640 | *dst++ = *src++; | |
641 | ||
642 | /* | |
643 | * modify addresses in code we just moved to basemem. unfortunately we | |
644 | * need fairly detailed info about mpboot.s for this to work. changes | |
645 | * to mpboot.s might require changes here. | |
646 | */ | |
647 | ||
648 | /* boot code is located in KERNEL space */ | |
649 | dst = (u_char *) boot_addr + KERNBASE; | |
650 | ||
651 | /* modify the lgdt arg */ | |
652 | dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base)); | |
653 | *dst32 = boot_addr + ((u_int) & MP_GDT - boot_base); | |
654 | ||
655 | /* modify the ljmp target for MPentry() */ | |
656 | dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1); | |
657 | *dst32 = ((u_int) MPentry - KERNBASE); | |
658 | ||
659 | /* modify the target for boot code segment */ | |
660 | dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base)); | |
661 | dst8 = (u_int8_t *) (dst16 + 1); | |
662 | *dst16 = (u_int) boot_addr & 0xffff; | |
663 | *dst8 = ((u_int) boot_addr >> 16) & 0xff; | |
664 | ||
665 | /* modify the target for boot data segment */ | |
666 | dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base)); | |
667 | dst8 = (u_int8_t *) (dst16 + 1); | |
668 | *dst16 = (u_int) boot_addr & 0xffff; | |
669 | *dst8 = ((u_int) boot_addr >> 16) & 0xff; | |
670 | } | |
671 | ||
bfc09ba0 | 672 | #endif |
46d4e165 JG |
673 | |
674 | /* | |
bb467734 | 675 | * This function starts the AP (application processor) identified |
46d4e165 JG |
676 | * by the APIC ID 'physicalCpu'. It does quite a "song and dance" |
677 | * to accomplish this. This is necessary because of the nuances | |
678 | * of the different hardware we might encounter. It ain't pretty, | |
679 | * but it seems to work. | |
680 | * | |
681 | * NOTE: eventually an AP gets to ap_init(), which is called just | |
682 | * before the AP goes into the LWKT scheduler's idle loop. | |
683 | */ | |
684 | static int | |
bb467734 | 685 | start_ap(struct mdglobaldata *gd, u_int boot_addr, int smibest) |
46d4e165 JG |
686 | { |
687 | int physical_cpu; | |
688 | int vector; | |
46d4e165 JG |
689 | |
690 | POSTCODE(START_AP_POST); | |
691 | ||
692 | /* get the PHYSICAL APIC ID# */ | |
fbac0dc4 | 693 | physical_cpu = CPUID_TO_APICID(gd->mi.gd_cpuid); |
46d4e165 JG |
694 | |
695 | /* calculate the vector */ | |
696 | vector = (boot_addr >> 12) & 0xff; | |
697 | ||
bb467734 MD |
698 | /* We don't want anything interfering */ |
699 | cpu_disable_intr(); | |
700 | ||
46d4e165 JG |
701 | /* Make sure the target cpu sees everything */ |
702 | wbinvd(); | |
703 | ||
bb467734 MD |
704 | /* |
705 | * Try to detect when a SMI has occurred, wait up to 200ms. | |
706 | * | |
707 | * If a SMI occurs during an AP reset but before we issue | |
708 | * the STARTUP command, the AP may brick. To work around | |
709 | * this problem we hold off doing the AP startup until | |
710 | * after we have detected the SMI. Hopefully another SMI | |
711 | * will not occur before we finish the AP startup. | |
712 | * | |
713 | * Retries don't seem to help. SMIs have a window of opportunity | |
714 | * and if USB->legacy keyboard emulation is enabled in the BIOS | |
715 | * the interrupt rate can be quite high. | |
716 | * | |
717 | * NOTE: Don't worry about the L1 cache load, it might bloat | |
718 | * ldelta a little but ndelta will be so huge when the SMI | |
719 | * occurs the detection logic will still work fine. | |
720 | */ | |
721 | if (smibest) { | |
722 | set_apic_timer(200000); | |
723 | smitest(); | |
724 | } | |
725 | ||
46d4e165 JG |
726 | /* |
727 | * first we do an INIT/RESET IPI this INIT IPI might be run, reseting | |
728 | * and running the target CPU. OR this INIT IPI might be latched (P5 | |
729 | * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be | |
730 | * ignored. | |
bb467734 MD |
731 | * |
732 | * see apic/apicreg.h for icr bit definitions. | |
733 | * | |
734 | * TIME CRITICAL CODE, DO NOT DO ANY KPRINTFS IN THE HOT PATH. | |
46d4e165 JG |
735 | */ |
736 | ||
bb467734 MD |
737 | /* |
738 | * Do an INIT IPI: assert RESET | |
739 | * | |
740 | * Use edge triggered mode to assert INIT | |
741 | */ | |
8afc0c3d SZ |
742 | lapic_seticr_sync(physical_cpu, |
743 | APIC_DESTMODE_PHY | | |
744 | APIC_DEST_DESTFLD | | |
745 | APIC_TRIGMOD_EDGE | | |
746 | APIC_LEVEL_ASSERT | | |
747 | APIC_DELMODE_INIT); | |
46d4e165 | 748 | |
bb467734 MD |
749 | /* |
750 | * The spec calls for a 10ms delay but we may have to use a | |
751 | * MUCH lower delay to avoid bricking an AP due to a fast SMI | |
752 | * interrupt. We have other loops here too and dividing by 2 | |
753 | * doesn't seem to be enough even after subtracting 350us, | |
754 | * so we divide by 4. | |
755 | * | |
756 | * Our minimum delay is 150uS, maximum is 10ms. If no SMI | |
757 | * interrupt was detected we use the full 10ms. | |
758 | */ | |
759 | if (smibest == 0) | |
760 | u_sleep(10000); | |
761 | else if (smibest < 150 * 4 + 350) | |
762 | u_sleep(150); | |
763 | else if ((smibest - 350) / 4 < 10000) | |
764 | u_sleep((smibest - 350) / 4); | |
765 | else | |
766 | u_sleep(10000); | |
46d4e165 | 767 | |
bb467734 MD |
768 | /* |
769 | * Do an INIT IPI: deassert RESET | |
770 | * | |
771 | * Use level triggered mode to deassert. It is unclear | |
772 | * why we need to do this. | |
773 | */ | |
8afc0c3d SZ |
774 | lapic_seticr_sync(physical_cpu, |
775 | APIC_DESTMODE_PHY | | |
776 | APIC_DEST_DESTFLD | | |
777 | APIC_TRIGMOD_LEVEL | | |
778 | APIC_LEVEL_DEASSERT | | |
779 | APIC_DELMODE_INIT); | |
bb467734 | 780 | u_sleep(150); /* wait 150us */ |
46d4e165 JG |
781 | |
782 | /* | |
bb467734 | 783 | * Next we do a STARTUP IPI: the previous INIT IPI might still be |
46d4e165 JG |
784 | * latched, (P5 bug) this 1st STARTUP would then terminate |
785 | * immediately, and the previously started INIT IPI would continue. OR | |
786 | * the previous INIT IPI has already run. and this STARTUP IPI will | |
787 | * run. OR the previous INIT IPI was ignored. and this STARTUP IPI | |
788 | * will run. | |
8afc0c3d SZ |
789 | * |
790 | * XXX set APIC_LEVEL_ASSERT | |
46d4e165 | 791 | */ |
8afc0c3d SZ |
792 | lapic_seticr_sync(physical_cpu, |
793 | APIC_DESTMODE_PHY | | |
794 | APIC_DEST_DESTFLD | | |
795 | APIC_DELMODE_STARTUP | | |
796 | vector); | |
46d4e165 JG |
797 | u_sleep(200); /* wait ~200uS */ |
798 | ||
799 | /* | |
bb467734 | 800 | * Finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF |
46d4e165 JG |
801 | * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR |
802 | * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is | |
803 | * recognized after hardware RESET or INIT IPI. | |
8afc0c3d SZ |
804 | * |
805 | * XXX set APIC_LEVEL_ASSERT | |
46d4e165 | 806 | */ |
8afc0c3d SZ |
807 | lapic_seticr_sync(physical_cpu, |
808 | APIC_DESTMODE_PHY | | |
809 | APIC_DEST_DESTFLD | | |
810 | APIC_DELMODE_STARTUP | | |
811 | vector); | |
bb467734 MD |
812 | |
813 | /* Resume normal operation */ | |
814 | cpu_enable_intr(); | |
46d4e165 JG |
815 | |
816 | /* wait for it to start, see ap_init() */ | |
817 | set_apic_timer(5000000);/* == 5 seconds */ | |
818 | while (read_apic_timer()) { | |
c07315c4 | 819 | if (CPUMASK_TESTBIT(smp_startup_mask, gd->mi.gd_cpuid)) |
46d4e165 JG |
820 | return 1; /* return SUCCESS */ |
821 | } | |
bb467734 | 822 | |
46d4e165 JG |
823 | return 0; /* return FAILURE */ |
824 | } | |
825 | ||
bb467734 MD |
/*
 * Repeatedly time a burst of 100 rdtsc() reads while read_apic_timer()
 * is still non-zero, tracking the smallest burst time seen.  The loop
 * stops early as soon as one burst takes more than twice that minimum.
 *
 * Presumably used to detect SMI interference during AP startup (see the
 * smibest handling in the startup code) -- confirm against the caller.
 *
 * Returns the value of read_apic_timer() sampled on exit.
 */
static
int
smitest(void)
{
	int64_t start_tsc;
	int64_t end_tsc;
	int64_t best;
	int64_t sample;
	int i;

	best = 0;
	sample = 0;
	while (read_apic_timer()) {
		start_tsc = rdtsc();
		for (i = 0; i < 100; ++i)
			end_tsc = rdtsc();	/* force loop to occur */

		if (best == 0) {
			/* no baseline yet, this burst becomes the baseline */
			best = end_tsc - start_tsc;
			continue;
		}

		sample = end_tsc - start_tsc;
		if (best > sample)
			best = sample;
		if (sample > best * 2)
			break;
	}
	return(read_apic_timer());
}
46d4e165 JG |
854 | |
855 | /* | |
7d4d6fdb MD |
856 | * Synchronously flush the TLB on all other CPU's. The current cpu's |
857 | * TLB is not flushed. If the caller wishes to flush the current cpu's | |
858 | * TLB the caller must call cpu_invltlb() in addition to smp_invltlb(). | |
46d4e165 | 859 | * |
79f2da03 MD |
860 | * This routine may be called concurrently from multiple cpus. When this |
861 | * happens, smp_invltlb() can wind up sticking around in the confirmation | |
862 | * while() loop at the end as additional cpus are added to the global | |
863 | * cpumask, until they are acknowledged by another IPI. | |
864 | * | |
7d4d6fdb MD |
865 | * NOTE: If for some reason we were unable to start all cpus we cannot |
866 | * safely use broadcast IPIs. | |
46d4e165 | 867 | */ |
7d4d6fdb | 868 | |
79f2da03 MD |
/*
 * smp_smurf_mask: bits are ORed in by smp_smurf_fetchset() before an
 * Xinvltlb IPI is sent; each cpu clears its own bit in smp_inval_intr().
 */
869 | cpumask_t smp_smurf_mask; |
/*
 * smp_invltlb_mask: cpus that still have to run cpu_invltlb().  Set by
 * smp_invltlb(), cleared per-cpu in smp_inval_intr().
 */
870 | static cpumask_t smp_invltlb_mask; |
/* LOOPRECOVER compiles in the 1-second stall reporting/recovery code. */
bba35d66 | 871 | #define LOOPRECOVER |
/* LOOPMASK_IN compiles in smp_in_mask, reported by the stall messages. */
67534613 | 872 | #define LOOPMASK_IN |
79f2da03 MD |
873 | #ifdef LOOPMASK_IN |
/* A cpu's bit is set while that cpu is inside smp_inval_intr(). */
874 | cpumask_t smp_in_mask; |
875 | #endif |
/* Read by smp_inval_intr() to build the mask handed to pmap_inval_intr(). */
876 | cpumask_t smp_invmask; |
/* Defined elsewhere; used by smp_smurf_idleinvlclr() to skip idle cpus. */
877 | extern cpumask_t smp_idleinvl_mask; |
878 | extern cpumask_t smp_idleinvl_reqs; |
7d4d6fdb | 879 | |
79f2da03 | 880 | /* |
ccd67bf6 MD |
881 | * Atomically OR bits in *mask to smp_smurf_mask. Adjust *mask to remove |
882 | * bits that do not need to be IPId. These bits are still part of the command, | |
883 | * but the target cpus have already been signalled and do not need to be | |
884 | * signalled again. |
79f2da03 MD |
885 | */ |
886 | #include <sys/spinlock.h> | |
887 | #include <sys/spinlock2.h> | |
888 | ||
889 | static __noinline | |
890 | void | |
e47e3dba | 891 | smp_smurf_fetchset(cpumask_t *mask) |
79f2da03 | 892 | { |
ccd67bf6 MD |
893 | cpumask_t omask; |
894 | int i; | |
895 | __uint64_t obits; | |
896 | __uint64_t nbits; | |
897 | ||
898 | i = 0; | |
899 | while (i < CPUMASK_ELEMENTS) { | |
900 | obits = smp_smurf_mask.ary[i]; | |
901 | cpu_ccfence(); | |
902 | nbits = obits | mask->ary[i]; | |
903 | if (atomic_cmpset_long(&smp_smurf_mask.ary[i], obits, nbits)) { | |
904 | omask.ary[i] = obits; | |
905 | ++i; | |
79f2da03 | 906 | } |
79f2da03 | 907 | } |
ccd67bf6 | 908 | CPUMASK_NANDMASK(*mask, omask); |
79f2da03 MD |
909 | } |
910 | ||
911 | /* | |
ccd67bf6 MD |
912 | * This is a mechanism which guarantees that cpu_invltlb() will be executed |
913 | * on idle cpus without having to signal or wake them up. The invltlb will be | |
914 | * executed when they wake up, prior to any scheduling or interrupt thread. | |
79f2da03 | 915 | * |
ccd67bf6 MD |
916 | * (*mask) is modified to remove the cpus we successfully negotiate this |
917 | * function with. This function may only be used with semi-synchronous | |
918 | * commands (typically invltlb's or semi-synchronous invalidations which | |
919 | * are usually associated only with kernel memory). | |
79f2da03 | 920 | */ |
79f2da03 | 921 | void |
ccd67bf6 | 922 | smp_smurf_idleinvlclr(cpumask_t *mask) |
79f2da03 | 923 | { |
1a5c7e0f MD |
924 | if (optimized_invltlb) { |
925 | ATOMIC_CPUMASK_ORMASK(smp_idleinvl_reqs, *mask); | |
926 | /* cpu_lfence() not needed */ | |
927 | CPUMASK_NANDMASK(*mask, smp_idleinvl_mask); | |
928 | } | |
79f2da03 | 929 | } |
b4b1a37a | 930 | |
79f2da03 MD |
931 | /* |
932 | * Issue cpu_invltlb() across all cpus except the current cpu. | |
ccd67bf6 MD |
933 | * |
934 | * This function will arrange to avoid idle cpus, but still guarantee that |
935 | * invltlb is run on them when they wake up prior to any scheduling or | |
936 | * nominal interrupt. | |
79f2da03 | 937 | */ |
46d4e165 JG |
938 | void |
939 | smp_invltlb(void) |
940 | { |
7d4d6fdb | 941 | struct mdglobaldata *md = mdcpu; |
79f2da03 | 942 | cpumask_t mask; |
79f2da03 | 943 | unsigned long rflags; |
bba35d66 | 944 | #ifdef LOOPRECOVER |
5b49787b | 945 | tsc_uclock_t tsc_base = rdtsc(); |
1a5c7e0f | 946 | int repeats = 0; |
7d4d6fdb | 947 | #endif |
4117f2fd | 948 | |
79f2da03 MD |
/* Debug aid: print a backtrace when the report_invltlb_src countdown hits zero. */
949 | if (report_invltlb_src > 0) { |
950 | if (--report_invltlb_src <= 0) |
951 | print_backtrace(8); |
952 | } |
1a5c7e0f | 953 | |
79f2da03 MD |
954 | /* |
955 | * Count the request and enter a critical section.  Hard interrupts |
956 | * are disabled further below, just before smp_invltlb_mask is updated. |
957 | */ |
7d4d6fdb | 958 | ++md->mi.gd_cnt.v_smpinvltlb; |
79f2da03 MD |
959 | crit_enter_gd(&md->mi); |
960 | |
961 | /* |
962 | * Bits we want to set in smp_invltlb_mask. We do not want to signal |
963 | * our own cpu. Also try to remove bits associated with idle cpus |
964 | * that we can flag for auto-invltlb. |
965 | */ |
966 | mask = smp_active_mask; |
967 | CPUMASK_NANDBIT(mask, md->mi.gd_cpuid); |
ccd67bf6 | 968 | smp_smurf_idleinvlclr(&mask); |
79f2da03 MD |
969 | |
/* Save the interrupt state; it is restored by write_rflags() at the end. */
970 | rflags = read_rflags(); |
971 | cpu_disable_intr(); |
972 | ATOMIC_CPUMASK_ORMASK(smp_invltlb_mask, mask); |
973 | |
974 | /* |
975 | * IPI non-idle cpus represented by mask. The omask calculation |
976 | * removes cpus from the mask which already have a Xinvltlb IPI |
977 | * pending (avoid double-queueing the IPI). |
978 | * |
979 | * We must disable real interrupts when setting the smurf flags or |
980 | * we might race a XINVLTLB before we manage to send the ipi's for |
981 | * the bits we set. |
982 | * |
983 | * NOTE: We are not signalling ourselves, mask already does NOT |
984 | * include our own cpu. |
985 | */ |
e47e3dba | 986 | smp_smurf_fetchset(&mask); |
79f2da03 MD |
987 | |
988 | /* |
989 | * Issue the IPI. Note that the XINVLTLB IPI runs regardless of |
990 | * the critical section count on the target cpus. |
991 | * |
992 | * Our own bit is ORed in only so the mask can be compared against |
993 | * smp_startup_mask; it is removed again before a selective IPI. |
994 | */ |
992 | CPUMASK_ORMASK(mask, md->mi.gd_cpumask); |
bba35d66 | 993 | if (all_but_self_ipi_enable && |
67534613 MD |
994 | (all_but_self_ipi_enable >= 2 || |
995 | CPUMASK_CMPMASKEQ(smp_startup_mask, mask))) { |
46d4e165 JG |
996 | all_but_self_ipi(XINVLTLB_OFFSET); |
997 | } else { |
79f2da03 MD |
998 | CPUMASK_NANDMASK(mask, md->mi.gd_cpumask); |
999 | selected_apic_ipi(mask, XINVLTLB_OFFSET, APIC_DELMODE_FIXED); |
46d4e165 | 1000 | } |
2d910aaf | 1001 | |
79f2da03 MD |
1002 | /* |
1003 | * Wait for acknowledgement by all cpus. smp_inval_intr() will |
1004 | * temporarily enable interrupts to avoid deadlocking the lapic, |
1005 | * and will also handle running cpu_invltlb() and remote invlpg |
1006 | * commands on our cpu if some other cpu requests it of us. |
1007 | * |
1008 | * WARNING! I originally tried to implement this as a hard loop |
1009 | * checking only smp_invltlb_mask (and issuing a local |
1010 | * cpu_invltlb() if requested), with interrupts enabled |
1011 | * and without calling smp_inval_intr(). This DID NOT WORK. |
1012 | * It resulted in weird races where smurf bits would get |
1013 | * cleared without any action being taken. |
1014 | */ |
1015 | smp_inval_intr(); |
/* mask is reused as an all-zero mask to compare smp_invltlb_mask against. */
1016 | CPUMASK_ASSZERO(mask); |
1017 | while (CPUMASK_CMPMASKNEQ(smp_invltlb_mask, mask)) { |
1018 | smp_inval_intr(); |
7d4d6fdb | 1019 | cpu_pause(); |
bba35d66 | 1020 | #ifdef LOOPRECOVER |
1a5c7e0f | 1021 | if (tsc_frequency && rdtsc() - tsc_base > tsc_frequency) { |
67534613 MD |
1022 | /* |
1023 | * cpuid - cpu doing the waiting |
1024 | * invltlb_mask - IPI in progress |
1025 | */ |
2d20c15f MD |
1026 | kprintf("smp_invltlb %2d: WARNING blocked %d sec: " |
1027 | "inv=%08jx " |
67534613 MD |
1028 | "smurf=%08jx " |
1029 | #ifdef LOOPMASK_IN |
1030 | "in=%08jx " |
1031 | #endif |
1032 | "idle=%08jx/%08jx\n", |
1a5c7e0f | 1033 | md->mi.gd_cpuid, |
2d20c15f | 1034 | repeats + 1, |
1a5c7e0f | 1035 | smp_invltlb_mask.ary[0], |
67534613 MD |
1036 | smp_smurf_mask.ary[0], |
1037 | #ifdef LOOPMASK_IN |
1038 | smp_in_mask.ary[0], |
1039 | #endif |
1a5c7e0f MD |
1040 | smp_idleinvl_mask.ary[0], |
1041 | smp_idleinvl_reqs.ary[0]); |
/*
 * Recovery attempt: reset our reentrancy marker, clear the smurf bits
 * of the cpus that have not acknowledged yet, and send a fresh IPI via
 * smp_invlpg().
 */
79f2da03 | 1042 | mdcpu->gd_xinvaltlb = 0; |
bba35d66 MD |
1043 | ATOMIC_CPUMASK_NANDMASK(smp_smurf_mask, |
1044 | smp_invltlb_mask); |
79f2da03 | 1045 | smp_invlpg(&smp_active_mask); |
2d20c15f MD |
1046 | |
1047 | /* |
1048 | * Reload tsc_base for retry, give up after |
1049 | * 10 seconds (60 seconds if in VM). |
1050 | */ |
1a5c7e0f | 1051 | tsc_base = rdtsc(); |
2d20c15f | 1052 | if (++repeats > invlpg_timeout) { |
1a5c7e0f MD |
1053 | kprintf("smp_invltlb: giving up\n"); |
1054 | CPUMASK_ASSZERO(smp_invltlb_mask); |
1055 | } |
e32d3244 | 1056 | } |
46d4e165 | 1057 | #endif |
7d4d6fdb | 1058 | } |
/* Restore the saved interrupt state, then leave the critical section. */
79f2da03 | 1059 | write_rflags(rflags); |
7d4d6fdb | 1060 | crit_exit_gd(&md->mi); |
46d4e165 JG |
1061 | } |
1062 | ||
7d4d6fdb | 1063 | /* |
1a5c7e0f MD |
1064 | * Called from a critical section with interrupts hard-disabled. |
1065 | * This function issues an XINVLTLB IPI and then executes any pending | |
1066 | * command on the current cpu before returning. | |
7d4d6fdb MD |
1067 | */ |
1068 | void | |
79f2da03 | 1069 | smp_invlpg(cpumask_t *cmdmask) |
7d4d6fdb MD |
1070 | { |
1071 | struct mdglobaldata *md = mdcpu; | |
7d4d6fdb | 1072 | cpumask_t mask; |
7d4d6fdb | 1073 | |
79f2da03 MD |
1074 | if (report_invlpg_src > 0) { |
1075 | if (--report_invlpg_src <= 0) | |
1076 | print_backtrace(8); | |
1077 | } | |
1078 | ||
1079 | /* | |
1080 | * Disallow normal interrupts, set all active cpus in the pmap, | |
1081 | * plus our own for completion processing (it might or might not | |
1082 | * be part of the set). | |
1083 | */ | |
79f2da03 MD |
1084 | mask = smp_active_mask; |
1085 | CPUMASK_ANDMASK(mask, *cmdmask); | |
1086 | CPUMASK_ORMASK(mask, md->mi.gd_cpumask); | |
1087 | ||
1088 | /* | |
1089 | * Avoid double-queuing IPIs, which can deadlock us. We must disable | |
1090 | * real interrupts when setting the smurf flags or we might race a | |
1091 | * XINVLTLB before we manage to send the ipi's for the bits we set. | |
1092 | * | |
1093 | * NOTE: We might be including our own cpu in the smurf mask. | |
1094 | */ | |
e47e3dba | 1095 | smp_smurf_fetchset(&mask); |
79f2da03 MD |
1096 | |
1097 | /* | |
1098 | * Issue the IPI. Note that the XINVLTLB IPI runs regardless of | |
1099 | * the critical section count on the target cpus. | |
1100 | * | |
1101 | * We do not include our own cpu when issuing the IPI. | |
1102 | */ | |
bba35d66 | 1103 | if (all_but_self_ipi_enable && |
67534613 MD |
1104 | (all_but_self_ipi_enable >= 2 || |
1105 | CPUMASK_CMPMASKEQ(smp_startup_mask, mask))) { | |
79f2da03 MD |
1106 | all_but_self_ipi(XINVLTLB_OFFSET); |
1107 | } else { | |
1108 | CPUMASK_NANDMASK(mask, md->mi.gd_cpumask); | |
1109 | selected_apic_ipi(mask, XINVLTLB_OFFSET, APIC_DELMODE_FIXED); | |
7d4d6fdb | 1110 | } |
79f2da03 MD |
1111 | |
1112 | /* | |
1113 | * This will synchronously wait for our command to complete, | |
1114 | * as well as process commands from other cpus. It also handles | |
1115 | * reentrancy. | |
1a5c7e0f MD |
1116 | * |
1117 | * (interrupts are disabled and we are in a critical section here) | |
79f2da03 | 1118 | */ |
79f2da03 | 1119 | smp_inval_intr(); |
79f2da03 MD |
1120 | } |
1121 | ||
91dc43dd MD |
1122 | /* |
1123 | * Issue rip/rsp sniffs | |
1124 | */ | |
e32d3244 MD |
1125 | void |
1126 | smp_sniff(void) | |
1127 | { | |
1128 | globaldata_t gd = mycpu; | |
1129 | int dummy; | |
67534613 | 1130 | register_t rflags; |
e32d3244 | 1131 | |
bba35d66 MD |
1132 | /* |
1133 | * Ignore all_but_self_ipi_enable here and just use it. | |
1134 | */ | |
67534613 MD |
1135 | rflags = read_rflags(); |
1136 | cpu_disable_intr(); | |
e32d3244 MD |
1137 | all_but_self_ipi(XSNIFF_OFFSET); |
1138 | gd->gd_sample_pc = smp_sniff; | |
1139 | gd->gd_sample_sp = &dummy; | |
67534613 MD |
1140 | write_rflags(rflags); |
1141 | } | |
1142 | ||
1143 | void | |
1144 | cpu_sniff(int dcpu) | |
1145 | { | |
1146 | globaldata_t rgd = globaldata_find(dcpu); | |
1147 | register_t rflags; | |
1148 | int dummy; | |
1149 | ||
1150 | /* | |
1151 | * Ignore all_but_self_ipi_enable here and just use it. | |
1152 | */ | |
1153 | rflags = read_rflags(); | |
1154 | cpu_disable_intr(); | |
1155 | single_apic_ipi(dcpu, XSNIFF_OFFSET, APIC_DELMODE_FIXED); | |
1156 | rgd->gd_sample_pc = cpu_sniff; | |
1157 | rgd->gd_sample_sp = &dummy; | |
1158 | write_rflags(rflags); | |
e32d3244 MD |
1159 | } |
1160 | ||
79f2da03 | 1161 | /* |
1a5c7e0f MD |
1162 | * Called from Xinvltlb assembly with interrupts hard-disabled and in a |
1163 | * critical section. gd_intr_nesting_level may or may not be bumped | |
1164 | * depending on entry. | |
79f2da03 MD |
1165 | * |
1166 | * THIS CODE IS INTENDED TO EXPLICITLY IGNORE THE CRITICAL SECTION COUNT. | |
1167 | * THAT IS, THE INTERRUPT IS INTENDED TO FUNCTION EVEN WHEN MAINLINE CODE | |
1168 | * IS IN A CRITICAL SECTION. | |
1169 | */ | |
1170 | void |
1171 | smp_inval_intr(void) |
1172 | { |
1173 | struct mdglobaldata *md = mdcpu; |
1174 | cpumask_t cpumask; |
bba35d66 | 1175 | #ifdef LOOPRECOVER |
5b49787b | 1176 | tsc_uclock_t tsc_base = rdtsc(); |
1a5c7e0f MD |
1177 | #endif |
1178 | |
1179 | #if 0 |
1180 | /* |
1181 | * The idle code is in a critical section, but that doesn't stop |
1182 | * Xinvltlb from executing, so deal with the race which can occur |
1183 | * in that situation. Otherwise r-m-w operations by pmap_inval_intr() |
1184 | * may have problems. |
1185 | */ |
1186 | if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs, md->mi.gd_cpuid)) { |
1187 | ATOMIC_CPUMASK_NANDBIT(smp_invltlb_mask, md->mi.gd_cpuid); |
1188 | cpu_invltlb(); |
1189 | cpu_mfence(); |
1190 | } |
1191 | #endif |
79f2da03 MD |
1192 | |
1193 | /* |
1194 | * This is a real mess. I'd like to just leave interrupts disabled |
1195 | * but it can cause the lapic to deadlock if too many interrupts queue |
1196 | * to it, due to the idiotic design of the lapic. So instead we have |
1197 | * to enter a critical section so normal interrupts are made pending |
1198 | * and track whether this one was reentered. |
1199 | * |
1200 | * gd_xinvaltlb states as used below: 0 = not in this function, |
1201 | * 1 = in progress, 2 = in progress and reentered at least once. |
1199 | */ |
1200 | if (md->gd_xinvaltlb) { /* reentrant on cpu */ |
1201 | md->gd_xinvaltlb = 2; |
1202 | return; |
1203 | } |
1204 | md->gd_xinvaltlb = 1; |
1205 | |
1206 | /* |
1207 | * Check only those cpus with active Xinvl* commands pending. |
1208 | * |
1209 | * We are going to enable interrupts so make sure we are in a |
1210 | * critical section. This is necessary to avoid deadlocking |
ccd67bf6 MD |
1211 | * the lapic and to ensure that we execute our commands prior to |
1212 | * any nominal interrupt or preemption. |
e47e3dba MD |
1213 | * |
1214 | * WARNING! It is very important that we only clear our bit in |
1215 | * smp_smurf_mask once for each interrupt we take. In |
1216 | * this case, we clear it on initial entry and only loop |
1217 | * on the reentrancy detect (caused by another interrupt). |
79f2da03 MD |
1218 | */ |
1219 | cpumask = smp_invmask; |
79f2da03 MD |
1220 | #ifdef LOOPMASK_IN |
1221 | ATOMIC_CPUMASK_ORBIT(smp_in_mask, md->mi.gd_cpuid); |
1222 | #endif |
67534613 MD |
1223 | loop: |
1224 | cpu_enable_intr(); |
79f2da03 MD |
1225 | ATOMIC_CPUMASK_NANDBIT(smp_smurf_mask, md->mi.gd_cpuid); |
1226 | |
1227 | /* |
1228 | * Specific page request(s), and we can't return until all bits |
1229 | * are zero. |
79f2da03 MD |
1230 | */ |
1231 | for (;;) { |
1a5c7e0f MD |
1232 | int toolong; |
1233 | |
1234 | /* |
1235 | * Also execute any pending full invalidation request in |
1236 | * this loop. |
1237 | */ |
1238 | if (CPUMASK_TESTBIT(smp_invltlb_mask, md->mi.gd_cpuid)) { |
1239 | ATOMIC_CPUMASK_NANDBIT(smp_invltlb_mask, |
1240 | md->mi.gd_cpuid); |
1241 | cpu_invltlb(); |
1242 | cpu_mfence(); |
1243 | } |
1244 | |
/*
 * toolong is set to 1 for the pass on which the stall warning is
 * printed and 0 otherwise; it is handed to pmap_inval_intr() below.
 */
bba35d66 | 1245 | #ifdef LOOPRECOVER |
1a5c7e0f | 1246 | if (tsc_frequency && rdtsc() - tsc_base > tsc_frequency) { |
67534613 MD |
1247 | /* |
1248 | * cpuid - cpu doing the waiting |
1249 | * invmask - IPI in progress |
1250 | * invltlb_mask - which ones are TLB invalidations? |
1251 | */ |
ab4aa0bb MD |
1252 | kprintf("smp_inval_intr %2d, WARNING blocked >1 sec " |
1253 | "inv=%08jx tlbm=%08jx " |
67534613 MD |
1254 | "smurf=%08jx " |
1255 | #ifdef LOOPMASK_IN |
1256 | "in=%08jx " |
1257 | #endif |
1a5c7e0f MD |
1258 | "idle=%08jx/%08jx\n", |
1259 | md->mi.gd_cpuid, |
1260 | smp_invmask.ary[0], |
1261 | smp_invltlb_mask.ary[0], |
67534613 MD |
1262 | smp_smurf_mask.ary[0], |
1263 | #ifdef LOOPMASK_IN |
1264 | smp_in_mask.ary[0], |
1265 | #endif |
1a5c7e0f MD |
1266 | smp_idleinvl_mask.ary[0], |
1267 | smp_idleinvl_reqs.ary[0]); |
1268 | tsc_base = rdtsc(); |
1269 | toolong = 1; |
1270 | } else { |
1271 | toolong = 0; |
1272 | } |
1273 | #else |
1274 | toolong = 0; |
1275 | #endif |
1276 | |
79f2da03 MD |
1277 | /* |
1278 | * We can only add bits to the cpumask to test during the |
1279 | * loop because the smp_invmask bit is cleared once the |
1280 | * originator completes the command (the targets may still |
1281 | * be cycling their own completions in this loop, afterwards). |
1282 | * |
1283 | * lfence required prior to all tests as this Xinvltlb |
1284 | * interrupt could race the originator (already be in progress |
1285 | * when the originator decides to issue, due to an issue by |
1286 | * another cpu). |
1287 | */ |
1288 | cpu_lfence(); |
1289 | CPUMASK_ORMASK(cpumask, smp_invmask); |
1a5c7e0f | 1290 | /*cpumask = smp_active_mask;*/ /* XXX */ |
95270b7e | 1291 | cpu_lfence(); |
1a5c7e0f MD |
1292 | |
1293 | if (pmap_inval_intr(&cpumask, toolong) == 0) { |
79f2da03 MD |
1294 | /* |
1295 | * pmap_inval_intr() returned 0, stop polling. |
1296 | * (presumably 0 means nothing is left pending -- |
1297 | * confirm against pmap_inval_intr()). |
1297 | */ |
1298 | break; |
79f2da03 MD |
1299 | } |
1300 | |
1301 | /* |
1302 | * Test if someone sent us another invalidation IPI, break |
1303 | * out so we can take it to avoid deadlocking the lapic |
1304 | * interrupt queue (? stupid intel, amd). |
1305 | */ |
1306 | if (md->gd_xinvaltlb == 2) |
1307 | break; |
1308 | /* |
1309 | if (CPUMASK_TESTBIT(smp_smurf_mask, md->mi.gd_cpuid)) |
1310 | break; |
1311 | */ |
1312 | } |
1313 | |
1314 | /* |
1315 | * Full invalidation request |
1316 | */ |
1317 | if (CPUMASK_TESTBIT(smp_invltlb_mask, md->mi.gd_cpuid)) { |
1318 | ATOMIC_CPUMASK_NANDBIT(smp_invltlb_mask, |
1319 | md->mi.gd_cpuid); |
1320 | cpu_invltlb(); |
1321 | cpu_mfence(); |
1322 | } |
1a5c7e0f | 1323 | |
79f2da03 MD |
1324 | /* |
1325 | * Check to see if another Xinvltlb interrupt occurred and loop up |
1326 | * if it did.  Interrupts are disabled first so the test of the |
1327 | * reentrancy marker and the final reset below are not interrupted. |
1327 | */ |
1328 | cpu_disable_intr(); |
1329 | if (md->gd_xinvaltlb == 2) { |
1330 | md->gd_xinvaltlb = 1; |
1331 | goto loop; |
1332 | } |
67534613 MD |
1333 | #ifdef LOOPMASK_IN |
1334 | ATOMIC_CPUMASK_NANDBIT(smp_in_mask, md->mi.gd_cpuid); |
1335 | #endif |
79f2da03 | 1336 | md->gd_xinvaltlb = 0; |
7d4d6fdb MD |
1337 | } |
1338 | ||
3c8aa76f FT |
/*
 * Callback that executes wbinvd() on the cpu it is invoked on.
 * The argument is not used.
 */
void
cpu_wbinvd_on_all_cpus_callback(void *arg)
{
	(void)arg;

	wbinvd();
}
1344 | ||
46d4e165 JG |
1345 | /* |
1346 | * When called the executing CPU will send an IPI to all other CPUs | |
67534613 | 1347 | * requesting that they halt execution. |
46d4e165 JG |
1348 | * |
1349 | * Usually (but not necessarily) called with 'other_cpus' as its arg. | |
1350 | * | |
1351 | * - Signals all CPUs in map to stop. | |
1352 | * - Waits for each to stop. | |
1353 | * | |
1354 | * Returns: | |
1355 | * -1: error | |
1356 | * 0: NA | |
1357 | * 1: ok | |
1358 | * | |
1359 | * XXX FIXME: this is not MP-safe, needs a lock to prevent multiple CPUs | |
1360 | * from executing at same time. | |
1361 | */ | |
1362 | int | |
da23a592 | 1363 | stop_cpus(cpumask_t map) |
46d4e165 | 1364 | { |
c07315c4 MD |
1365 | cpumask_t mask; |
1366 | ||
1367 | CPUMASK_ANDMASK(map, smp_active_mask); | |
46d4e165 JG |
1368 | |
1369 | /* send the Xcpustop IPI to all CPUs in map */ | |
1370 | selected_apic_ipi(map, XCPUSTOP_OFFSET, APIC_DELMODE_FIXED); | |
c07315c4 MD |
1371 | |
1372 | do { | |
1373 | mask = stopped_cpus; | |
1374 | CPUMASK_ANDMASK(mask, map); | |
1375 | /* spin */ | |
1376 | } while (CPUMASK_CMPMASKNEQ(mask, map)); | |
46d4e165 JG |
1377 | |
1378 | return 1; | |
1379 | } | |
1380 | ||
1381 | ||
1382 | /* | |
1383 | * Called by a CPU to restart stopped CPUs. | |
1384 | * | |
1385 | * Usually (but not necessarily) called with 'stopped_cpus' as its arg. | |
1386 | * | |
1387 | * - Signals all CPUs in map to restart. | |
1388 | * - Waits for each to restart. | |
1389 | * | |
1390 | * Returns: | |
1391 | * -1: error | |
1392 | * 0: NA | |
1393 | * 1: ok | |
1394 | */ | |
1395 | int | |
da23a592 | 1396 | restart_cpus(cpumask_t map) |
46d4e165 | 1397 | { |
c07315c4 | 1398 | cpumask_t mask; |
46d4e165 | 1399 | |
c07315c4 MD |
1400 | /* signal other cpus to restart */ |
1401 | mask = map; | |
1402 | CPUMASK_ANDMASK(mask, smp_active_mask); | |
1403 | cpu_ccfence(); | |
1404 | started_cpus = mask; | |
1405 | cpu_ccfence(); | |
1406 | ||
1407 | /* wait for each to clear its bit */ | |
1408 | while (CPUMASK_CMPMASKNEQ(stopped_cpus, map)) | |
1409 | cpu_pause(); | |
46d4e165 JG |
1410 | |
1411 | return 1; | |
1412 | } | |
1413 | ||
1414 | /* | |
1415 | * This is called once the mpboot code has gotten us properly relocated | |
1416 | * and the MMU turned on, etc. ap_init() is actually the idle thread, | |
1417 | * and when it returns the scheduler will call the real cpu_idle() main | |
1418 | * loop for the idlethread. Interrupts are disabled on entry and should | |
1419 | * remain disabled at return. | |
1420 | */ | |
1421 | void |
1422 | ap_init(void) |
1423 | { |
fbac0dc4 | 1424 | int cpu_id; |
46d4e165 JG |
1425 | |
1426 | /* |
1427 | * Adjust smp_startup_mask to signal the BSP that we have started |
1428 | * up successfully. Note that we do not yet hold the BGL. The BSP |
1429 | * is waiting for our signal. |
1430 | * |
1431 | * We can't set our bit in smp_active_mask yet because we are holding |
1432 | * interrupts physically disabled and remote cpus could deadlock |
1433 | * trying to send us an IPI. |
1434 | */ |
c07315c4 | 1435 | ATOMIC_CPUMASK_ORBIT(smp_startup_mask, mycpu->gd_cpuid); |
46d4e165 JG |
1436 | cpu_mfence(); |
1437 | |
1438 | /* |
c6b1591c SZ |
1439 | * Interlock for LAPIC initialization. Wait until mp_finish_lapic is |
1440 | * non-zero. (The MP lock acquisition that follows is compiled out.) |
46d4e165 JG |
1441 | * |
1442 | * Note: We are in a critical section. |
1443 | * |
46d4e165 JG |
1444 | * Note: we are the idle thread, we can only spin. |
1445 | * |
1446 | * Note: The load fence is memory volatile and prevents the compiler |
c6b1591c | 1447 | * from improperly caching mp_finish_lapic, and the cpu from improperly |
46d4e165 JG |
1448 | * caching it. |
1449 | */ |
06c66eb2 MD |
1450 | while (mp_finish_lapic == 0) { |
1451 | cpu_pause(); |
b5d16701 | 1452 | cpu_lfence(); |
06c66eb2 | 1453 | } |
1997b4c2 | 1454 | #if 0 |
06c66eb2 MD |
1455 | while (try_mplock() == 0) { |
1456 | cpu_pause(); |
1457 | cpu_lfence(); |
1458 | } |
1997b4c2 | 1459 | #endif |
46d4e165 JG |
1460 | |
1461 | if (cpu_feature & CPUID_TSC) { |
b5d16701 MD |
1462 | /* |
1463 | * The BSP is constantly updating tsc0_offset, figure out |
1464 | * the relative difference to synchronize ktrdump. |
1465 | */ |
1466 | tsc_offsets[mycpu->gd_cpuid] = rdtsc() - tsc0_offset; |
46d4e165 JG |
1467 | } |
1468 | |
1469 | /* BSP may have changed PTD while we're waiting for the lock */ |
1470 | cpu_invltlb(); |
1471 | |
46d4e165 | 1472 | /* Build our map of 'other' CPUs. */ |
c07315c4 | 1473 | mycpu->gd_other_cpus = smp_startup_mask; |
1997b4c2 | 1474 | ATOMIC_CPUMASK_NANDBIT(mycpu->gd_other_cpus, mycpu->gd_cpuid); |
46d4e165 | 1475 | |
46d4e165 | 1476 | /* A quick check from sanity claus */ |
8afc0c3d | 1477 | cpu_id = APICID_TO_CPUID(LAPIC_READID); |
fbac0dc4 SZ |
1478 | if (mycpu->gd_cpuid != cpu_id) { |
1479 | kprintf("SMP: assigned cpuid = %d\n", mycpu->gd_cpuid); |
1480 | kprintf("SMP: actual cpuid = %d lapicid %d\n", |
8afc0c3d | 1481 | cpu_id, LAPIC_READID); |
32d3bd25 | 1482 | #if 0 /* JGXXX */ |
46d4e165 JG |
1483 | kprintf("PTD[MPPTDI] = %p\n", (void *)PTD[MPPTDI]); |
1484 | #endif |
1485 | panic("cpuid mismatch! boom!!"); |
1486 | } |
1487 | |
1488 | /* Initialize AP's local APIC for irq's */ |
5ddeabb9 | 1489 | lapic_init(FALSE); |
46d4e165 | 1490 | |
c6b1591c | 1491 | /* LAPIC initialization is done */ |
1997b4c2 | 1492 | ATOMIC_CPUMASK_ORBIT(smp_lapic_mask, mycpu->gd_cpuid); |
c6b1591c SZ |
1493 | cpu_mfence(); |
1494 | |
1997b4c2 | 1495 | #if 0 |
c6b1591c SZ |
1496 | /* Let BSP move onto the next initialization stage */ |
1497 | rel_mplock(); |
1997b4c2 | 1498 | #endif |
c6b1591c SZ |
1499 | |
1500 | /* |
1501 | * Interlock for finalization. Wait until mp_finish is non-zero |
1502 | * (ap_finish() sets it to 1). No MP lock is taken here. |
1503 | * |
1504 | * Note: We are in a critical section. |
1505 | * |
1506 | * Note: we are the idle thread, we can only spin. |
1507 | * |
1508 | * Note: The load fence is memory volatile and prevents the compiler |
1509 | * from improperly caching mp_finish, and the cpu from improperly |
1510 | * caching it. |
1511 | */ |
06c66eb2 MD |
1512 | while (mp_finish == 0) { |
1513 | cpu_pause(); |
c6b1591c | 1514 | cpu_lfence(); |
06c66eb2 | 1515 | } |
c6b1591c SZ |
1516 | |
1517 | /* BSP may have changed PTD while we're waiting for the lock */ |
1518 | cpu_invltlb(); |
1519 | |
46d4e165 JG |
1520 | /* Set memory range attributes for this CPU to match the BSP */ |
1521 | mem_range_AP_init(); |
1522 | |
1523 | /* |
1524 | * Once we go active we must process any IPIQ messages that may |
1525 | * have been queued, because no actual IPI will occur until we |
1526 | * set our bit in the smp_active_mask. If we don't the IPI |
1527 | * message interlock could be left set which would also prevent |
1528 | * further IPIs. |
1529 | * |
1530 | * The idle loop doesn't expect the BGL to be held and while |
1531 | * lwkt_switch() normally cleans things up this is a special case |
1532 | * because we are returning almost directly into the idle loop. |
1533 | * |
1534 | * The idle thread is never placed on the runq, make sure |
1535 | * nothing we've done put it there. |
1536 | */ |
46d4e165 JG |
1537 | |
1538 | /* |
1997b4c2 MD |
1539 | * Hold a critical section and allow real interrupts to occur. Zero |
1540 | * any spurious interrupts which have accumulated, then set our |
1541 | * smp_active_mask indicating that we are fully operational. |
46d4e165 | 1542 | */ |
1997b4c2 | 1543 | crit_enter(); |
46d4e165 | 1544 | __asm __volatile("sti; pause; pause"::); |
9611ff20 | 1545 | bzero(mdcpu->gd_ipending, sizeof(mdcpu->gd_ipending)); |
1997b4c2 | 1546 | ATOMIC_CPUMASK_ORBIT(smp_active_mask, mycpu->gd_cpuid); |
46d4e165 | 1547 | |
1997b4c2 MD |
1548 | /* |
1549 | * Wait until all cpus have set their smp_active_mask and have fully |
1550 | * operational interrupts before proceeding.  ap_finish() moves |
1551 | * mp_finish from 1 to 2 once smp_active_mask matches smp_startup_mask. |
1551 | * |
1552 | * We need a final cpu_invltlb() because we would not have received |
1553 | * any until we set our bit in smp_active_mask. |
1554 | */ |
1555 | while (mp_finish == 1) { |
1556 | cpu_pause(); |
1557 | cpu_lfence(); |
1558 | } |
1559 | cpu_invltlb(); |
1560 | |
1561 | /* |
1562 | * Initialize per-cpu clocks and do other per-cpu initialization. |
1563 | * At this point code is expected to be able to use the full kernel |
1564 | * API. |
1565 | */ |
46d4e165 | 1566 | initclocks_pcpu(); /* clock interrupts (via IPIs) */ |
46d4e165 JG |
1567 | |
1568 | /* |
1997b4c2 MD |
1569 | * Since we may have cleaned up the interrupt triggers, manually |
1570 | * process any pending IPIs before exiting our critical section. |
1571 | * Once the critical section has exited, normal interrupt processing |
1572 | * may occur. |
46d4e165 | 1573 | */ |
e47e3dba | 1574 | atomic_swap_int(&mycpu->gd_npoll, 0); |
1997b4c2 | 1575 | lwkt_process_ipiq(); |
e47e3dba | 1576 | crit_exit(); |
06c66eb2 | 1577 | |
06c66eb2 | 1578 | /* |
1997b4c2 MD |
1579 | * Final final, allow the waiting BSP to resume the boot process, |
1580 | * return 'into' the idle thread bootstrap. |
06c66eb2 | 1581 | */ |
1997b4c2 MD |
1582 | ATOMIC_CPUMASK_ORBIT(smp_finalize_mask, mycpu->gd_cpuid); |
1583 | KKASSERT((curthread->td_flags & TDF_RUNQ) == 0); |
46d4e165 JG |
1584 | } |
1585 | ||
1586 | /* | |
1587 | * Get SMP fully working before we start initializing devices. | |
1588 | */ | |
/*
 * Runs as a SYSINIT (see the registration after the function).  Drives
 * the mp_finish handshake that the APs spin on in ap_init():
 * mp_finish = 1, wait for smp_active_mask, mp_finish = 2, wait for
 * smp_finalize_mask.  The MP lock is released for the duration and
 * re-acquired before returning.
 */
1589 | static |
1590 | void |
1591 | ap_finish(void) |
1592 | { |
46d4e165 JG |
1593 | if (bootverbose) |
1594 | kprintf("Finish MP startup\n"); |
/* Drop the MP lock while waiting on the APs; it is re-taken below. */
46d4e165 | 1595 | rel_mplock(); |
06c66eb2 | 1596 | |
1997b4c2 MD |
1597 | /* |
1598 | * Wait for the active mask to complete, after which all cpus will |
1599 | * be accepting interrupts. |
1600 | */ |
/* mp_finish = 1 releases the APs spinning on (mp_finish == 0). */
1601 | mp_finish = 1; |
c07315c4 | 1602 | while (CPUMASK_CMPMASKNEQ(smp_active_mask, smp_startup_mask)) { |
06c66eb2 | 1603 | cpu_pause(); |
46d4e165 | 1604 | cpu_lfence(); |
06c66eb2 | 1605 | } |
1997b4c2 MD |
1606 | |
1607 | /* |
1608 | * Wait for the finalization mask to complete, after which all cpus |
1609 | * have completely finished initializing and are entering or are in |
1610 | * their idle thread. |
1611 | * |
1612 | * BSP should have received all required invltlbs but do another |
1613 | * one just in case. |
1614 | */ |
1615 | cpu_invltlb(); |
/* mp_finish = 2 releases the APs spinning on (mp_finish == 1). */
1616 | mp_finish = 2; |
1617 | while (CPUMASK_CMPMASKNEQ(smp_finalize_mask, smp_startup_mask)) { |
1618 | cpu_pause(); |
1619 | cpu_lfence(); |
1620 | } |
1621 | |
/* Spin until the MP lock released at the top is ours again. */
06c66eb2 | 1622 | while (try_mplock() == 0) { |
c07315c4 | 1623 | cpu_pause(); |
06c66eb2 | 1624 | cpu_lfence(); |
c07315c4 | 1625 | } |
06c66eb2 | 1626 | |
da23a592 MD |
1627 | if (bootverbose) { |
1628 | kprintf("Active CPU Mask: %016jx\n", |
c07315c4 | 1629 | (uintmax_t)CPUMASK_LOWMASK(smp_active_mask)); |
da23a592 | 1630 | } |
46d4e165 JG |
1631 | } |
1632 | |
f3f3eadb | 1633 | SYSINIT(finishsmp, SI_BOOT2_FINISH_SMP, SI_ORDER_FIRST, ap_finish, NULL); |
46d4e165 | 1634 | |
67534613 MD |
1635 | /* |
1636 | * Interrupts must be hard-disabled by caller | |
1637 | */ | |
46d4e165 JG |
1638 | void |
1639 | cpu_send_ipiq(int dcpu) | |
1640 | { | |
c07315c4 | 1641 | if (CPUMASK_TESTBIT(smp_active_mask, dcpu)) |
46d4e165 JG |
1642 | single_apic_ipi(dcpu, XIPIQ_OFFSET, APIC_DELMODE_FIXED); |
1643 | } | |
1644 | ||
#if 0	/* single_apic_ipi_passive() not working yet */
/*
 * Passive counterpart of cpu_send_ipiq().  A cpu that is not set in
 * smp_active_mask is reported as a failure without sending anything.
 *
 * Returns 0 on failure, 1 on success
 */
int
cpu_send_ipiq_passive(int dcpu)
{
	if (!CPUMASK_TESTBIT(smp_active_mask, dcpu))
		return (0);

	return (single_apic_ipi_passive(dcpu, XIPIQ_OFFSET,
	    APIC_DELMODE_FIXED));
}
#endif
3566408b SZ |
1660 | |
1661 | static void | |
3a69c113 | 1662 | mp_bsp_simple_setup(void) |
3566408b | 1663 | { |
8005c0c8 | 1664 | struct mdglobaldata *gd; |
1997b4c2 MD |
1665 | size_t ipiq_size; |
1666 | ||
3566408b | 1667 | /* build our map of 'other' CPUs */ |
c07315c4 MD |
1668 | mycpu->gd_other_cpus = smp_startup_mask; |
1669 | CPUMASK_NANDBIT(mycpu->gd_other_cpus, mycpu->gd_cpuid); | |
1997b4c2 | 1670 | |
8005c0c8 SZ |
1671 | gd = (struct mdglobaldata *)mycpu; |
1672 | gd->gd_acpi_id = CPUID_TO_ACPIID(mycpu->gd_cpuid); | |
1673 | ||
1997b4c2 | 1674 | ipiq_size = sizeof(struct lwkt_ipiq) * ncpus; |
1eeaf6b2 | 1675 | mycpu->gd_ipiq = (void *)kmem_alloc(kernel_map, ipiq_size, |
3091de50 | 1676 | VM_SUBSYS_IPIQ); |
1997b4c2 | 1677 | bzero(mycpu->gd_ipiq, ipiq_size); |
3566408b | 1678 | |
cc3685b0 SZ |
1679 | /* initialize arc4random. */ |
1680 | arc4_init_pcpu(0); | |
1681 | ||
3566408b SZ |
1682 | pmap_set_opt(); |
1683 | ||
1684 | if (cpu_feature & CPUID_TSC) | |
1685 | tsc0_offset = rdtsc(); | |
1686 | } | |
f77c018a MC |
1687 | |
1688 | ||
1689 | /* | |
1690 | * CPU TOPOLOGY DETECTION FUNCTIONS | |
1691 | */ | |
1692 | ||
1693 | /* Detect intel topology using CPUID | |
1694 | * Ref: http://www.intel.com/Assets/PDF/appnote/241618.pdf, pg 41 | |
1695 | */ | |
1696 | static void | |
1697 | detect_intel_topology(int count_htt_cores) | |
1698 | { | |
1699 | int shift = 0; | |
1700 | int ecx_index = 0; | |
1701 | int core_plus_logical_bits = 0; | |
1702 | int cores_per_package; | |
1703 | int logical_per_package; | |
1704 | int logical_per_core; | |
1705 | unsigned int p[4]; | |
1706 | ||
1707 | if (cpu_high >= 0xb) { | |
1708 | goto FUNC_B; | |
1709 | ||
1710 | } else if (cpu_high >= 0x4) { | |
1711 | goto FUNC_4; | |
1712 | ||
1713 | } else { | |
1714 | core_bits = 0; | |
1715 | for (shift = 0; (1 << shift) < count_htt_cores; ++shift) | |
1716 | ; | |
1717 | logical_CPU_bits = 1 << shift; | |
1718 | return; | |
1719 | } | |
1720 | ||
1721 | FUNC_B: | |
1722 | cpuid_count(0xb, FUNC_B_THREAD_LEVEL, p); | |
1723 | ||
1724 | /* if 0xb not supported - fallback to 0x4 */ | |
1725 | if (p[1] == 0 || (FUNC_B_TYPE(p[2]) != FUNC_B_THREAD_TYPE)) { | |
1726 | goto FUNC_4; | |
1727 | } | |
1728 | ||
1729 | logical_CPU_bits = FUNC_B_BITS_SHIFT_NEXT_LEVEL(p[0]); | |
1730 | ||
1731 | ecx_index = FUNC_B_THREAD_LEVEL + 1; | |
1732 | do { | |
1733 | cpuid_count(0xb, ecx_index, p); | |
1734 | ||
1735 | /* Check for the Core type in the implemented sub leaves. */ | |
1736 | if (FUNC_B_TYPE(p[2]) == FUNC_B_CORE_TYPE) { | |
1737 | core_plus_logical_bits = FUNC_B_BITS_SHIFT_NEXT_LEVEL(p[0]); | |
1738 | break; | |
1739 | } | |
1740 | ||
1741 | ecx_index++; | |
1742 | ||
1743 | } while (FUNC_B_TYPE(p[2]) != FUNC_B_INVALID_TYPE); | |
1744 | ||
1745 | core_bits = core_plus_logical_bits - logical_CPU_bits; | |
1746 | ||
1747 | return; | |
1748 | ||
1749 | FUNC_4: | |
1750 | cpuid_count(0x4, 0, p); | |
1751 | cores_per_package = FUNC_4_MAX_CORE_NO(p[0]) + 1; | |
1752 | ||
1753 | logical_per_package = count_htt_cores; | |
1754 | logical_per_core = logical_per_package / cores_per_package; | |
1755 | ||
1756 | for (shift = 0; (1 << shift) < logical_per_core; ++shift) | |
1757 | ; | |
1758 | logical_CPU_bits = shift; | |
1759 | ||
1760 | for (shift = 0; (1 << shift) < cores_per_package; ++shift) | |
1761 | ; | |
1762 | core_bits = shift; | |
1763 | ||
1764 | return; | |
1765 | } | |
1766 | ||
1767 | /* Detect AMD topology using CPUID | |
1768 | * Ref: http://support.amd.com/us/Embedded_TechDocs/25481.pdf, last page | |
1769 | */ | |
1770 | static void | |
1771 | detect_amd_topology(int count_htt_cores) | |
1772 | { | |
1773 | int shift = 0; | |
c91894e0 | 1774 | if ((cpu_feature & CPUID_HTT) && (amd_feature2 & AMDID2_CMP)) { |
f77c018a | 1775 | if (cpu_procinfo2 & AMDID_COREID_SIZE) { |
c91894e0 MD |
1776 | core_bits = (cpu_procinfo2 & AMDID_COREID_SIZE) >> |
1777 | AMDID_COREID_SIZE_SHIFT; | |
f77c018a MC |
1778 | } else { |
1779 | core_bits = (cpu_procinfo2 & AMDID_CMP_CORES) + 1; | |
1780 | for (shift = 0; (1 << shift) < core_bits; ++shift) | |
1781 | ; | |
1782 | core_bits = shift; | |
1783 | } | |
c70d4562 MD |
1784 | logical_CPU_bits = count_htt_cores >> core_bits; |
1785 | for (shift = 0; (1 << shift) < logical_CPU_bits; ++shift) | |
1786 | ; | |
1787 | logical_CPU_bits = shift; | |
1788 | ||
1789 | kprintf("core_bits %d logical_CPU_bits %d\n", | |
1790 | core_bits - logical_CPU_bits, logical_CPU_bits); | |
f77c018a | 1791 | |
c70d4562 MD |
1792 | if (amd_feature2 & AMDID2_TOPOEXT) { |
1793 | u_int p[4]; /* eax,ebx,ecx,edx */ | |
1794 | int nodes; | |
1795 | ||
1796 | cpuid_count(0x8000001e, 0, p); | |
1797 | ||
1798 | switch(((p[1] >> 8) & 3) + 1) { | |
1799 | case 1: | |
1800 | logical_CPU_bits = 0; | |
1801 | break; | |
1802 | case 2: | |
1803 | logical_CPU_bits = 1; | |
1804 | break; | |
1805 | case 3: | |
1806 | case 4: | |
1807 | logical_CPU_bits = 2; | |
1808 | break; | |
1809 | } | |
1810 | ||
1811 | /* | |
1812 | * Nodes are kind of a stand-in for packages*sockets, | |
1813 | * but can be thought of in terms of Numa domains. | |
1814 | */ | |
1815 | nodes = ((p[2] >> 8) & 7) + 1; | |
1816 | switch(nodes) { | |
1817 | case 8: | |
1818 | case 7: | |
1819 | case 6: | |
1820 | case 5: | |
1821 | --core_bits; | |
1822 | /* fallthrough */ | |
1823 | case 4: | |
1824 | case 3: | |
1825 | --core_bits; | |
1826 | /* fallthrough */ | |
1827 | case 2: | |
1828 | --core_bits; | |
1829 | /* fallthrough */ | |
1830 | case 1: | |
1831 | break; | |
1832 | } | |
1833 | core_bits -= logical_CPU_bits; | |
1834 | kprintf("%d-way htt, %d Nodes, %d cores/node\n", | |
1835 | (int)(((p[1] >> 8) & 3) + 1), | |
1836 | nodes, | |
1837 | 1 << core_bits); | |
1838 | ||
1839 | } | |
1840 | #if 0 | |
eb77e726 MD |
1841 | if (amd_feature2 & AMDID2_TOPOEXT) { |
1842 | u_int p[4]; | |
1843 | int i; | |
1844 | int type; | |
1845 | int level; | |
1846 | int share_count; | |
c70d4562 MD |
1847 | |
1848 | logical_CPU_bits = 0; | |
1849 | core_bits = 0; | |
1850 | ||
eb77e726 MD |
1851 | for (i = 0; i < 256; ++i) { |
1852 | cpuid_count(0x8000001d, i, p); | |
1853 | type = p[0] & 0x1f; | |
1854 | level = (p[0] >> 5) & 0x7; | |
1855 | share_count = 1 + ((p[0] >> 14) & 0xfff); | |
1856 | ||
1857 | if (type == 0) | |
1858 | break; | |
c70d4562 MD |
1859 | kprintf("Topology probe i=%2d type=%d " |
1860 | "level=%d share_count=%d\n", | |
1861 | i, type, level, share_count); | |
1862 | shift = 0; | |
1863 | while ((1 << shift) < share_count) | |
1864 | ++shift; | |
1865 | ||
1866 | switch(type) { | |
1867 | case 1: | |
1868 | /* | |
1869 | * CPUID_TYPE_SMT | |
1870 | * | |
1871 | * Logical CPU (SMT) | |
1872 | */ | |
1873 | logical_CPU_bits = shift; | |
1874 | break; | |
1875 | case 2: | |
1876 | /* | |
1877 | * CPUID_TYPE_CORE | |
1878 | * | |
1879 | * Physical subdivision of a package | |
1880 | */ | |
1881 | core_bits = logical_CPU_bits + | |
1882 | shift; | |
1883 | break; | |
1884 | case 3: | |
1885 | /* | |
1886 | * CPUID_TYPE_CACHE | |
1887 | * | |
1888 | * CPU L1/L2/L3 cache | |
1889 | */ | |
1890 | break; | |
1891 | case 4: | |
1892 | /* | |
1893 | * CPUID_TYPE_PKG | |
1894 | * | |
1895 | * Package aka chip, equivalent to | |
1896 | * socket | |
1897 | */ | |
eb77e726 MD |
1898 | break; |
1899 | } | |
1900 | } | |
1901 | } | |
c70d4562 | 1902 | #endif |
f77c018a MC |
1903 | } else { |
1904 | for (shift = 0; (1 << shift) < count_htt_cores; ++shift) | |
1905 | ; | |
1906 | core_bits = shift; | |
1907 | logical_CPU_bits = 0; | |
1908 | } | |
1909 | } | |
1910 | ||
0e9325d3 MC |
1911 | static void |
1912 | amd_get_compute_unit_id(void *arg) | |
1913 | { | |
1914 | u_int regs[4]; | |
1915 | ||
1916 | do_cpuid(0x8000001e, regs); | |
1917 | cpu_node_t * mynode = get_cpu_node_by_cpuid(mycpuid); | |
c91894e0 | 1918 | |
0e9325d3 MC |
1919 | /* |
1920 | * AMD - CPUID Specification September 2010 | |
1921 | * page 34 - //ComputeUnitID = ebx[0:7]// | |
1922 | */ | |
1923 | mynode->compute_unit_id = regs[1] & 0xff; | |
1924 | } | |
1925 | ||
1926 | int | |
1927 | fix_amd_topology(void) | |
1928 | { | |
c07315c4 MD |
1929 | cpumask_t mask; |
1930 | ||
0e9325d3 MC |
1931 | if (cpu_vendor_id != CPU_VENDOR_AMD) |
1932 | return -1; | |
1933 | if ((amd_feature2 & AMDID2_TOPOEXT) == 0) | |
1934 | return -1; | |
1935 | ||
c07315c4 MD |
1936 | CPUMASK_ASSALLONES(mask); |
1937 | lwkt_cpusync_simple(mask, amd_get_compute_unit_id, NULL); | |
0e9325d3 MC |
1938 | |
1939 | kprintf("Compute unit iDS:\n"); | |
1940 | int i; | |
c07315c4 MD |
1941 | for (i = 0; i < ncpus; i++) { |
1942 | kprintf("%d-%d; \n", | |
1943 | i, get_cpu_node_by_cpuid(i)->compute_unit_id); | |
1944 | } | |
0e9325d3 MC |
1945 | return 0; |
1946 | } | |
1947 | ||
c91894e0 MD |
1948 | /* |
1949 | * Calculate | |
f77c018a MC |
1950 | * - logical_CPU_bits |
1951 | * - core_bits | |
1952 | * With the values above (for AMD or INTEL) we are able to generally | |
1953 | * detect the CPU topology (number of cores for each level): | |
1954 | * Ref: http://wiki.osdev.org/Detecting_CPU_Topology_(80x86) | |
1955 | * Ref: http://www.multicoreinfo.com/research/papers/whitepapers/Intel-detect-topology.pdf | |
1956 | */ | |
1957 | void | |
1958 | detect_cpu_topology(void) | |
1959 | { | |
1960 | static int topology_detected = 0; | |
1961 | int count = 0; | |
1962 | ||
0c77d800 | 1963 | if (topology_detected) |
f77c018a | 1964 | goto OUT; |
f77c018a MC |
1965 | if ((cpu_feature & CPUID_HTT) == 0) { |
1966 | core_bits = 0; | |
1967 | logical_CPU_bits = 0; | |
1968 | goto OUT; | |
0c77d800 SW |
1969 | } |
1970 | count = (cpu_procinfo & CPUID_HTT_CORES) >> CPUID_HTT_CORE_SHIFT; | |
f77c018a | 1971 | |
0c77d800 | 1972 | if (cpu_vendor_id == CPU_VENDOR_INTEL) |
f77c018a | 1973 | detect_intel_topology(count); |
0c77d800 | 1974 | else if (cpu_vendor_id == CPU_VENDOR_AMD) |
f77c018a | 1975 | detect_amd_topology(count); |
0c77d800 | 1976 | topology_detected = 1; |
f77c018a MC |
1977 | |
1978 | OUT: | |
c91894e0 MD |
1979 | if (bootverbose) { |
1980 | kprintf("Bits within APICID: logical_CPU_bits: %d; " | |
1981 | "core_bits: %d\n", | |
1982 | logical_CPU_bits, core_bits); | |
1983 | } | |
f77c018a MC |
1984 | } |
1985 | ||
c91894e0 MD |
1986 | /* |
1987 | * Interface functions to calculate chip_ID, | |
f77c018a MC |
1988 | * core_number and logical_number |
1989 | * Ref: http://wiki.osdev.org/Detecting_CPU_Topology_(80x86) | |
1990 | */ | |
1991 | int | |
1992 | get_chip_ID(int cpuid) | |
1993 | { | |
1994 | return get_apicid_from_cpuid(cpuid) >> | |
1995 | (logical_CPU_bits + core_bits); | |
1996 | } | |
1997 | ||
c7f9edd8 MD |
1998 | int |
1999 | get_chip_ID_from_APICID(int apicid) | |
2000 | { | |
2001 | return apicid >> (logical_CPU_bits + core_bits); | |
2002 | } | |
2003 | ||
f77c018a MC |
2004 | int |
2005 | get_core_number_within_chip(int cpuid) | |
2006 | { | |
c91894e0 MD |
2007 | return ((get_apicid_from_cpuid(cpuid) >> logical_CPU_bits) & |
2008 | ((1 << core_bits) - 1)); | |
f77c018a MC |
2009 | } |
2010 | ||
2011 | int | |
2012 | get_logical_CPU_number_within_core(int cpuid) | |
2013 | { | |
c91894e0 MD |
2014 | return (get_apicid_from_cpuid(cpuid) & |
2015 | ((1 << logical_CPU_bits) - 1)); | |
f77c018a | 2016 | } |