| Commit | Line | Data |
|---|---|---|
| 46d4e165 JG |
1 | /* |
| 2 | * Copyright (c) 1996, by Steve Passe | |
| 3 | * All rights reserved. | |
| 4 | * | |
| 5 | * Redistribution and use in source and binary forms, with or without | |
| 6 | * modification, are permitted provided that the following conditions | |
| 7 | * are met: | |
| 8 | * 1. Redistributions of source code must retain the above copyright | |
| 9 | * notice, this list of conditions and the following disclaimer. | |
| 10 | * 2. The name of the developer may NOT be used to endorse or promote products | |
| 11 | * derived from this software without specific prior written permission. | |
| 12 | * | |
| 13 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |
| 14 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 16 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | |
| 17 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 18 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
| 19 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
| 20 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
| 21 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
| 22 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
| 23 | * SUCH DAMAGE. | |
| 24 | * | |
| 25 | * $FreeBSD: src/sys/i386/i386/mp_machdep.c,v 1.115.2.15 2003/03/14 21:22:35 jhb Exp $ | |
| 46d4e165 JG |
26 | */ |
| 27 | ||
| 28 | #include "opt_cpu.h" | |
| 29 | ||
| 30 | #include <sys/param.h> | |
| 31 | #include <sys/systm.h> | |
| 32 | #include <sys/kernel.h> | |
| 33 | #include <sys/sysctl.h> | |
| 34 | #include <sys/malloc.h> | |
| 35 | #include <sys/memrange.h> | |
| 36 | #include <sys/cons.h> /* cngetc() */ | |
| 37 | #include <sys/machintr.h> | |
| 38 | ||
| 684a93c4 MD |
39 | #include <sys/mplock2.h> |
| 40 | ||
| 46d4e165 JG |
41 | #include <vm/vm.h> |
| 42 | #include <vm/vm_param.h> | |
| 43 | #include <vm/pmap.h> | |
| 44 | #include <vm/vm_kern.h> | |
| 45 | #include <vm/vm_extern.h> | |
| 46 | #include <sys/lock.h> | |
| 47 | #include <vm/vm_map.h> | |
| 48 | #include <sys/user.h> | |
| 49 | #ifdef GPROF | |
| 50 | #include <sys/gmon.h> | |
| 51 | #endif | |
| 52 | ||
| 53 | #include <machine/smp.h> | |
| 54 | #include <machine_base/apic/apicreg.h> | |
| 55 | #include <machine/atomic.h> | |
| 56 | #include <machine/cpufunc.h> | |
| 2b6cd37e | 57 | #include <machine_base/apic/lapic.h> |
| 61452645 | 58 | #include <machine_base/apic/ioapic.h> |
| 46d4e165 JG |
59 | #include <machine/psl.h> |
| 60 | #include <machine/segments.h> | |
| 61 | #include <machine/tss.h> | |
| 62 | #include <machine/specialreg.h> | |
| 63 | #include <machine/globaldata.h> | |
| 4117f2fd | 64 | #include <machine/pmap_inval.h> |
| 46d4e165 JG |
65 | |
| 66 | #include <machine/md_var.h> /* setidt() */ | |
| 57a9c56b | 67 | #include <machine_base/icu/icu.h> /* IPIs */ |
| 3566408b | 68 | #include <machine_base/icu/icu_var.h> |
| e0918665 | 69 | #include <machine_base/apic/ioapic_abi.h> |
| 57a9c56b | 70 | #include <machine/intr_machdep.h> /* IPIs */ |
| 46d4e165 | 71 | |
| 46d4e165 JG |
72 | #define WARMBOOT_TARGET 0 |
| 73 | #define WARMBOOT_OFF (KERNBASE + 0x0467) | |
| 74 | #define WARMBOOT_SEG (KERNBASE + 0x0469) | |
| 75 | ||
| 46d4e165 JG |
76 | #define CMOS_REG (0x70) |
| 77 | #define CMOS_DATA (0x71) | |
| 78 | #define BIOS_RESET (0x0f) | |
| 79 | #define BIOS_WARM (0x0a) | |
| 80 | ||
| 46d4e165 JG |
81 | /* |
| 82 | * this code MUST be enabled here and in mpboot.s. | |
| 83 | * it follows the very early stages of AP boot by placing values in CMOS ram. | |
| 84 | * it NORMALLY will never be needed and thus the primitive method for enabling. | |
| 85 | * | |
| 86 | */ | |
#if defined(CHECK_POINTS)
/*
 * CMOS accessors: write the register number to the index port (CMOS_REG),
 * then read or write that register through the data port (CMOS_DATA).
 */
#define CHECK_READ(A)	 (outb(CMOS_REG, (A)), inb(CMOS_DATA))
#define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D)))

/*
 * Store D into each of the six checkpoint slots (CMOS 0x34 - 0x39).
 *
 * Wrapped in do/while(0) so that "CHECK_INIT(x);" is exactly one
 * statement and stays correct under an unbraced if/else.
 */
#define CHECK_INIT(D)						\
	do {							\
		CHECK_WRITE(0x34, (D));				\
		CHECK_WRITE(0x35, (D));				\
		CHECK_WRITE(0x36, (D));				\
		CHECK_WRITE(0x37, (D));				\
		CHECK_WRITE(0x38, (D));				\
		CHECK_WRITE(0x39, (D));				\
	} while (0)

/*
 * Print the six checkpoint slots, prefixed by the string S.
 *
 * The slots are read one after another into a local array before the
 * kprintf() call.  Each CHECK_READ() is an index-port write followed by
 * a data-port read; issuing them as function arguments would leave the
 * relative order of those port accesses unspecified.
 */
#define CHECK_PRINT(S)						\
	do {							\
		int check_vals_[6];				\
		int check_i_;					\
								\
		for (check_i_ = 0; check_i_ < 6; ++check_i_)	\
			check_vals_[check_i_] =			\
			    CHECK_READ(0x34 + check_i_);	\
		kprintf("%s: %d, %d, %d, %d, %d, %d\n",		\
		    (S),					\
		    check_vals_[0], check_vals_[1],		\
		    check_vals_[2], check_vals_[3],		\
		    check_vals_[4], check_vals_[5]);		\
	} while (0)

#else				/* CHECK_POINTS */

/* Checkpoints disabled: both macros are single no-op statements. */
#define CHECK_INIT(D)	do { } while (0)
#define CHECK_PRINT(S)	do { } while (0)

#endif				/* CHECK_POINTS */
| 115 | ||
| 116 | /* | |
| 117 | * Values to send to the POST hardware. | |
| 118 | */ | |
| 119 | #define MP_BOOTADDRESS_POST 0x10 | |
| 120 | #define MP_PROBE_POST 0x11 | |
| 121 | #define MPTABLE_PASS1_POST 0x12 | |
| 122 | ||
| 123 | #define MP_START_POST 0x13 | |
| 124 | #define MP_ENABLE_POST 0x14 | |
| 125 | #define MPTABLE_PASS2_POST 0x15 | |
| 126 | ||
| 127 | #define START_ALL_APS_POST 0x16 | |
| 128 | #define INSTALL_AP_TRAMP_POST 0x17 | |
| 129 | #define START_AP_POST 0x18 | |
| 130 | ||
| 131 | #define MP_ANNOUNCE_POST 0x19 | |
| 132 | ||
| 46d4e165 JG |
133 | /** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */ |
| 134 | int current_postcode; | |
| 135 | ||
| 136 | /** XXX FIXME: what system files declare these??? */ | |
| 137 | extern struct region_descriptor r_gdt, r_idt; | |
| 138 | ||
| 637df2f6 SZ |
139 | extern int nkpt; |
| 140 | extern int naps; | |
| 46d4e165 | 141 | |
| 46d4e165 JG |
142 | int64_t tsc0_offset; |
| 143 | extern int64_t tsc_offsets[]; | |
| 144 | ||
| 46d4e165 JG |
145 | /* AP uses this during bootstrap. Do not staticize. */ |
| 146 | char *bootSTK; | |
| 147 | static int bootAP; | |
| 148 | ||
| 46d4e165 JG |
149 | struct pcb stoppcbs[MAXCPU]; |
| 150 | ||
| 151 | extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32); | |
| 152 | ||
| 46d4e165 JG |
153 | /* |
| 154 | * Local data and functions. | |
| 155 | */ | |
| 156 | ||
| 46d4e165 | 157 | static u_int boot_address; |
| 46d4e165 | 158 | static int mp_finish; |
| c6b1591c | 159 | static int mp_finish_lapic; |
| 46d4e165 | 160 | |
| 46d4e165 | 161 | static int start_all_aps(u_int boot_addr); |
| bfc09ba0 | 162 | #if 0 |
| 46d4e165 | 163 | static void install_ap_tramp(u_int boot_addr); |
| bfc09ba0 | 164 | #endif |
| bb467734 MD |
165 | static int start_ap(struct mdglobaldata *gd, u_int boot_addr, int smibest); |
| 166 | static int smitest(void); | |
| 3a69c113 | 167 | static void mp_bsp_simple_setup(void); |
| 46d4e165 JG |
168 | |
| 169 | static cpumask_t smp_startup_mask = 1; /* which cpus have been started */ | |
| c6b1591c | 170 | static cpumask_t smp_lapic_mask = 1; /* which cpus have lapic been inited */ |
| 46d4e165 JG |
171 | cpumask_t smp_active_mask = 1; /* which cpus are ready for IPIs etc? */ |
| 172 | SYSCTL_INT(_machdep, OID_AUTO, smp_active, CTLFLAG_RD, &smp_active_mask, 0, ""); | |
| 173 | static u_int bootMP_size; | |
| 174 | ||
| 175 | /* | |
| 176 | * Calculate usable address in base memory for AP trampoline code. | |
| 177 | */ | |
| 178 | u_int | |
| 179 | mp_bootaddress(u_int basemem) | |
| 180 | { | |
| 181 | POSTCODE(MP_BOOTADDRESS_POST); | |
| 182 | ||
| c855ebba JG |
183 | bootMP_size = mptramp_end - mptramp_start; |
| 184 | boot_address = trunc_page(basemem * 1024); /* round down to 4k boundary */ | |
| 185 | if (((basemem * 1024) - boot_address) < bootMP_size) | |
| 186 | boot_address -= PAGE_SIZE; /* not enough, lower by 4k */ | |
| 46d4e165 JG |
187 | /* 3 levels of page table pages */ |
| 188 | mptramp_pagetables = boot_address - (PAGE_SIZE * 3); | |
| 189 | ||
| 190 | return mptramp_pagetables; | |
| 191 | } | |
| 192 | ||
| 46d4e165 | 193 | /* |
| 46d4e165 JG |
194 | * Print various information about the SMP system hardware and setup. |
| 195 | */ | |
| 196 | void | |
| 197 | mp_announce(void) | |
| 198 | { | |
| 199 | int x; | |
| 200 | ||
| 201 | POSTCODE(MP_ANNOUNCE_POST); | |
| 202 | ||
| 203 | kprintf("DragonFly/MP: Multiprocessor motherboard\n"); | |
| fbac0dc4 | 204 | kprintf(" cpu0 (BSP): apic id: %2d\n", CPUID_TO_APICID(0)); |
| 637df2f6 | 205 | for (x = 1; x <= naps; ++x) |
| fbac0dc4 | 206 | kprintf(" cpu%d (AP): apic id: %2d\n", x, CPUID_TO_APICID(x)); |
| 46d4e165 | 207 | |
| f45bfca0 | 208 | if (!ioapic_enable) |
| 7a603b36 | 209 | kprintf(" Warning: APIC I/O disabled\n"); |
| 46d4e165 JG |
210 | } |
| 211 | ||
| 212 | /* | |
| 213 | * AP cpu's call this to sync up protected mode. | |
| 214 | * | |
| ec073ddc | 215 | * WARNING! %gs is not set up on entry. This routine sets up %gs. |
| 46d4e165 JG |
216 | */ |
| 217 | void | |
| 218 | init_secondary(void) | |
| 219 | { | |
| 220 | int gsel_tss; | |
| 221 | int x, myid = bootAP; | |
| 222 | u_int64_t msr, cr0; | |
| 223 | struct mdglobaldata *md; | |
| 224 | struct privatespace *ps; | |
| 225 | ||
| 226 | ps = &CPU_prvspace[myid]; | |
| 227 | ||
| 228 | gdt_segs[GPROC0_SEL].ssd_base = | |
| 229 | (long) &ps->mdglobaldata.gd_common_tss; | |
| 230 | ps->mdglobaldata.mi.gd_prvspace = ps; | |
| 231 | ||
| 232 | /* We fill the 32-bit segment descriptors */ | |
| 233 | for (x = 0; x < NGDT; x++) { | |
| 234 | if (x != GPROC0_SEL && x != (GPROC0_SEL + 1)) | |
| 235 | ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x]); | |
| 236 | } | |
| 237 | /* And now a 64-bit one */ | |
| 238 | ssdtosyssd(&gdt_segs[GPROC0_SEL], | |
| 239 | (struct system_segment_descriptor *)&gdt[myid * NGDT + GPROC0_SEL]); | |
| 240 | ||
| 241 | r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; | |
| 242 | r_gdt.rd_base = (long) &gdt[myid * NGDT]; | |
| 243 | lgdt(&r_gdt); /* does magic intra-segment return */ | |
| 244 | ||
| ec073ddc JG |
245 | /* lgdt() destroys the GSBASE value, so we load GSBASE after lgdt() */ |
| 246 | wrmsr(MSR_FSBASE, 0); /* User value */ | |
| 247 | wrmsr(MSR_GSBASE, (u_int64_t)ps); | |
| 248 | wrmsr(MSR_KGSBASE, 0); /* XXX User value while we're in the kernel */ | |
| 249 | ||
| 46d4e165 JG |
250 | lidt(&r_idt); |
| 251 | ||
| 252 | #if 0 | |
| 253 | lldt(_default_ldt); | |
| 254 | mdcpu->gd_currentldt = _default_ldt; | |
| 255 | #endif | |
| 256 | ||
| 257 | gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); | |
| 258 | gdt[myid * NGDT + GPROC0_SEL].sd_type = SDT_SYSTSS; | |
| 259 | ||
| 260 | md = mdcpu; /* loaded through %gs:0 (mdglobaldata.mi.gd_prvspace)*/ | |
| 261 | ||
| 262 | md->gd_common_tss.tss_rsp0 = 0; /* not used until after switch */ | |
| 263 | #if 0 /* JG XXX */ | |
| 264 | md->gd_common_tss.tss_ioopt = (sizeof md->gd_common_tss) << 16; | |
| 265 | #endif | |
| 266 | md->gd_tss_gdt = &gdt[myid * NGDT + GPROC0_SEL]; | |
| 267 | md->gd_common_tssd = *md->gd_tss_gdt; | |
| 093565f2 MD |
268 | |
| 269 | /* double fault stack */ | |
| 270 | md->gd_common_tss.tss_ist1 = | |
| 271 | (long)&md->mi.gd_prvspace->idlestack[ | |
| 272 | sizeof(md->mi.gd_prvspace->idlestack)]; | |
| 273 | ||
| 46d4e165 JG |
274 | ltr(gsel_tss); |
| 275 | ||
| 46d4e165 JG |
276 | /* |
| 277 | * Set to a known state: | |
| 278 | * Set by mpboot.s: CR0_PG, CR0_PE | |
| 279 | * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM | |
| 280 | */ | |
| 281 | cr0 = rcr0(); | |
| 282 | cr0 &= ~(CR0_CD | CR0_NW | CR0_EM); | |
| 283 | load_cr0(cr0); | |
| 284 | ||
| 285 | /* Set up the fast syscall stuff */ | |
| 286 | msr = rdmsr(MSR_EFER) | EFER_SCE; | |
| 287 | wrmsr(MSR_EFER, msr); | |
| 288 | wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall)); | |
| 289 | wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); | |
| 290 | msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | | |
| 291 | ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); | |
| 292 | wrmsr(MSR_STAR, msr); | |
| 3338cc67 | 293 | wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL); |
| 46d4e165 JG |
294 | |
| 295 | pmap_set_opt(); /* PSE/4MB pages, etc */ | |
| 296 | #if JGXXX | |
| 297 | /* Initialize the PAT MSR. */ | |
| 298 | pmap_init_pat(); | |
| 299 | #endif | |
| 300 | ||
| 301 | /* set up CPU registers and state */ | |
| 302 | cpu_setregs(); | |
| 303 | ||
| 304 | /* set up SSE/NX registers */ | |
| 305 | initializecpu(); | |
| 306 | ||
| 307 | /* set up FPU state on the AP */ | |
| 308 | npxinit(__INITIAL_NPXCW__); | |
| ec073ddc JG |
309 | |
| 310 | /* disable the APIC, just to be SURE */ | |
| 311 | lapic->svr &= ~APIC_SVR_ENABLE; | |
| 46d4e165 JG |
312 | } |
| 313 | ||
| 314 | /******************************************************************* | |
| 315 | * local functions and data | |
| 316 | */ | |
| 317 | ||
| 318 | /* | |
| 3a69c113 | 319 | * Start the SMP system |
| 46d4e165 JG |
320 | */ |
| 321 | static void | |
| 3a69c113 | 322 | mp_start_aps(void *dummy __unused) |
| 46d4e165 | 323 | { |
| 2e0ed166 SZ |
324 | if (lapic_enable) { |
| 325 | /* start each Application Processor */ | |
| 3a69c113 | 326 | start_all_aps(boot_address); |
| 2e0ed166 | 327 | } else { |
| 3a69c113 | 328 | mp_bsp_simple_setup(); |
| 944562df | 329 | } |
| 46d4e165 | 330 | } |
| 3a69c113 | 331 | SYSINIT(startaps, SI_BOOT2_START_APS, SI_ORDER_FIRST, mp_start_aps, NULL) |
| 46d4e165 | 332 | |
| 46d4e165 | 333 | /* |
| 46d4e165 JG |
334 | * start each AP in our list |
| 335 | */ | |
| 336 | static int | |
| 337 | start_all_aps(u_int boot_addr) | |
| 338 | { | |
| 339 | vm_offset_t va = boot_address + KERNBASE; | |
| 340 | u_int64_t *pt4, *pt3, *pt2; | |
| 341 | int x, i, pg; | |
| 342 | int shift; | |
| bb467734 MD |
343 | int smicount; |
| 344 | int smibest; | |
| 345 | int smilast; | |
| 46d4e165 JG |
346 | u_char mpbiosreason; |
| 347 | u_long mpbioswarmvec; | |
| 348 | struct mdglobaldata *gd; | |
| 349 | struct privatespace *ps; | |
| 46d4e165 JG |
350 | |
| 351 | POSTCODE(START_ALL_APS_POST); | |
| 352 | ||
| 46d4e165 JG |
353 | /* install the AP 1st level boot code */ |
| 354 | pmap_kenter(va, boot_address); | |
| bfc09ba0 | 355 | cpu_invlpg((void *)va); /* JG XXX */ |
| 46d4e165 JG |
356 | bcopy(mptramp_start, (void *)va, bootMP_size); |
| 357 | ||
| 358 | /* Locate the page tables, they'll be below the trampoline */ | |
| 359 | pt4 = (u_int64_t *)(uintptr_t)(mptramp_pagetables + KERNBASE); | |
| 360 | pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t); | |
| 361 | pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t); | |
| 362 | ||
| 363 | /* Create the initial 1GB replicated page tables */ | |
| 364 | for (i = 0; i < 512; i++) { | |
| 365 | /* Each slot of the level 4 pages points to the same level 3 page */ | |
| 366 | pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE); | |
| 367 | pt4[i] |= PG_V | PG_RW | PG_U; | |
| 368 | ||
| 369 | /* Each slot of the level 3 pages points to the same level 2 page */ | |
| 370 | pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE)); | |
| 371 | pt3[i] |= PG_V | PG_RW | PG_U; | |
| 372 | ||
| 373 | /* The level 2 page slots are mapped with 2MB pages for 1GB. */ | |
| 374 | pt2[i] = i * (2 * 1024 * 1024); | |
| 375 | pt2[i] |= PG_V | PG_RW | PG_PS | PG_U; | |
| 376 | } | |
| 377 | ||
| 378 | /* save the current value of the warm-start vector */ | |
| 379 | mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF); | |
| 380 | outb(CMOS_REG, BIOS_RESET); | |
| 381 | mpbiosreason = inb(CMOS_DATA); | |
| 382 | ||
| 383 | /* setup a vector to our boot code */ | |
| 384 | *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; | |
| 385 | *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4); | |
| 386 | outb(CMOS_REG, BIOS_RESET); | |
| 387 | outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ | |
| 388 | ||
| bb467734 MD |
389 | /* |
| 390 | * If we have a TSC we can figure out the SMI interrupt rate. | |
| 391 | * The SMI does not necessarily use a constant rate. Spend | |
| 392 | * up to 250ms trying to figure it out. | |
| 393 | */ | |
| 394 | smibest = 0; | |
| 395 | if (cpu_feature & CPUID_TSC) { | |
| 396 | set_apic_timer(275000); | |
| 397 | smilast = read_apic_timer(); | |
| 398 | for (x = 0; x < 20 && read_apic_timer(); ++x) { | |
| 399 | smicount = smitest(); | |
| 400 | if (smibest == 0 || smilast - smicount < smibest) | |
| 401 | smibest = smilast - smicount; | |
| 402 | smilast = smicount; | |
| 403 | } | |
| 404 | if (smibest > 250000) | |
| 405 | smibest = 0; | |
| 406 | if (smibest) { | |
| 407 | smibest = smibest * (int64_t)1000000 / | |
| 408 | get_apic_timer_frequency(); | |
| 409 | } | |
| 410 | } | |
| 411 | if (smibest) | |
| 412 | kprintf("SMI Frequency (worst case): %d Hz (%d us)\n", | |
| 413 | 1000000 / smibest, smibest); | |
| 414 | ||
| 46d4e165 | 415 | /* start each AP */ |
| 637df2f6 | 416 | for (x = 1; x <= naps; ++x) { |
| 46d4e165 JG |
417 | |
| 418 | /* This is a bit verbose, it will go away soon. */ | |
| 419 | ||
| 420 | /* first page of AP's private space */ | |
| b2b3ffcd | 421 | pg = x * x86_64_btop(sizeof(struct privatespace)); |
| 46d4e165 JG |
422 | |
| 423 | /* allocate new private data page(s) */ | |
| 424 | gd = (struct mdglobaldata *)kmem_alloc(&kernel_map, | |
| 425 | MDGLOBALDATA_BASEALLOC_SIZE); | |
| 46d4e165 JG |
426 | |
| 427 | gd = &CPU_prvspace[x].mdglobaldata; /* official location */ | |
| 428 | bzero(gd, sizeof(*gd)); | |
| 429 | gd->mi.gd_prvspace = ps = &CPU_prvspace[x]; | |
| 430 | ||
| 431 | /* prime data page for it to use */ | |
| 432 | mi_gdinit(&gd->mi, x); | |
| 433 | cpu_gdinit(gd, x); | |
| 637df2f6 SZ |
434 | gd->mi.gd_ipiq = (void *)kmem_alloc(&kernel_map, sizeof(lwkt_ipiq) * (naps + 1)); |
| 435 | bzero(gd->mi.gd_ipiq, sizeof(lwkt_ipiq) * (naps + 1)); | |
| 46d4e165 JG |
436 | |
| 437 | /* setup a vector to our boot code */ | |
| 438 | *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; | |
| 439 | *((volatile u_short *) WARMBOOT_SEG) = (boot_addr >> 4); | |
| 440 | outb(CMOS_REG, BIOS_RESET); | |
| 441 | outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ | |
| 442 | ||
| 443 | /* | |
| 444 | * Setup the AP boot stack | |
| 445 | */ | |
| 446 | bootSTK = &ps->idlestack[UPAGES*PAGE_SIZE/2]; | |
| 447 | bootAP = x; | |
| 448 | ||
| 449 | /* attempt to start the Application Processor */ | |
| 450 | CHECK_INIT(99); /* setup checkpoints */ | |
| bb467734 | 451 | if (!start_ap(gd, boot_addr, smibest)) { |
| ea96e50f | 452 | kprintf("\nAP #%d (PHY# %d) failed!\n", |
| fbac0dc4 | 453 | x, CPUID_TO_APICID(x)); |
| 46d4e165 JG |
454 | CHECK_PRINT("trace"); /* show checkpoints */ |
| 455 | /* better panic as the AP may be running loose */ | |
| 456 | kprintf("panic y/n? [y] "); | |
| 457 | if (cngetc() != 'n') | |
| 458 | panic("bye-bye"); | |
| 459 | } | |
| 460 | CHECK_PRINT("trace"); /* show checkpoints */ | |
| 46d4e165 JG |
461 | } |
| 462 | ||
| 463 | /* set ncpus to 1 + highest logical cpu. Not all may have come up */ | |
| 464 | ncpus = x; | |
| 465 | ||
| 466 | /* ncpus2 -- ncpus rounded down to the nearest power of 2 */ | |
| 467 | for (shift = 0; (1 << shift) <= ncpus; ++shift) | |
| 468 | ; | |
| 469 | --shift; | |
| 470 | ncpus2_shift = shift; | |
| 471 | ncpus2 = 1 << shift; | |
| 472 | ncpus2_mask = ncpus2 - 1; | |
| 473 | ||
| 474 | /* ncpus_fit -- ncpus rounded up to the nearest power of 2 */ | |
| 475 | if ((1 << shift) < ncpus) | |
| 476 | ++shift; | |
| 477 | ncpus_fit = 1 << shift; | |
| 478 | ncpus_fit_mask = ncpus_fit - 1; | |
| 479 | ||
| 480 | /* build our map of 'other' CPUs */ | |
| da23a592 | 481 | mycpu->gd_other_cpus = smp_startup_mask & ~CPUMASK(mycpu->gd_cpuid); |
| 46d4e165 JG |
482 | mycpu->gd_ipiq = (void *)kmem_alloc(&kernel_map, sizeof(lwkt_ipiq) * ncpus); |
| 483 | bzero(mycpu->gd_ipiq, sizeof(lwkt_ipiq) * ncpus); | |
| 484 | ||
| 46d4e165 JG |
485 | /* restore the warmstart vector */ |
| 486 | *(u_long *) WARMBOOT_OFF = mpbioswarmvec; | |
| 487 | outb(CMOS_REG, BIOS_RESET); | |
| 488 | outb(CMOS_DATA, mpbiosreason); | |
| 489 | ||
| 490 | /* | |
| 491 | * NOTE! The idlestack for the BSP was setup by locore. Finish | |
| 492 | * up, clean out the P==V mapping we did earlier. | |
| 493 | */ | |
| 46d4e165 JG |
494 | pmap_set_opt(); |
| 495 | ||
| c6b1591c SZ |
496 | /* |
| 497 | * Wait all APs to finish initializing LAPIC | |
| 498 | */ | |
| 499 | mp_finish_lapic = 1; | |
| 500 | if (bootverbose) | |
| 501 | kprintf("SMP: Waiting APs LAPIC initialization\n"); | |
| 502 | if (cpu_feature & CPUID_TSC) | |
| 503 | tsc0_offset = rdtsc(); | |
| 504 | tsc_offsets[0] = 0; | |
| 505 | rel_mplock(); | |
| 506 | while (smp_lapic_mask != smp_startup_mask) { | |
| 507 | cpu_lfence(); | |
| 508 | if (cpu_feature & CPUID_TSC) | |
| 509 | tsc0_offset = rdtsc(); | |
| 510 | } | |
| 511 | while (try_mplock() == 0) | |
| 512 | ; | |
| 513 | ||
| 46d4e165 JG |
514 | /* number of APs actually started */ |
| 515 | return ncpus - 1; | |
| 516 | } | |
| 517 | ||
| 518 | ||
| 519 | /* | |
| 520 | * load the 1st level AP boot code into base memory. | |
| 521 | */ | |
| 522 | ||
| 523 | /* targets for relocation */ | |
| 524 | extern void bigJump(void); | |
| 525 | extern void bootCodeSeg(void); | |
| 526 | extern void bootDataSeg(void); | |
| 527 | extern void MPentry(void); | |
| 528 | extern u_int MP_GDT; | |
| 529 | extern u_int mp_gdtbase; | |
| 530 | ||
#if 0

/*
 * Compiled out.  Copies the boot code (bootMP, bootMP_size bytes) to
 * boot_addr in base memory and then patches the addresses embedded in
 * the copy.
 *
 * The patching needs fairly detailed knowledge of mpboot.s; changes
 * to mpboot.s might require changes here.
 */
static void
install_ap_tramp(u_int boot_addr)
{
	int n;
	int size = *(int *) ((u_long) & bootMP_size);
	u_char *src = (u_char *) ((u_long) bootMP);
	u_char *dst = (u_char *) boot_addr + KERNBASE;
	u_int boot_base = (u_int) bootMP;
	u_int8_t *patch8;
	u_int16_t *patch16;
	u_int32_t *patch32;

	POSTCODE(INSTALL_AP_TRAMP_POST);

	/* byte-wise copy of the boot code */
	n = 0;
	while (n < size) {
		*dst++ = *src++;
		++n;
	}

	/* boot code is located in KERNEL space; rewind to its start */
	dst = (u_char *) boot_addr + KERNBASE;

	/* modify the lgdt arg */
	patch32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base));
	*patch32 = boot_addr + ((u_int) & MP_GDT - boot_base);

	/* modify the ljmp target for MPentry() */
	patch32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1);
	*patch32 = ((u_int) MPentry - KERNBASE);

	/* modify the target for boot code segment: low 16 bits, then bits 16-23 */
	patch16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base));
	patch8 = (u_int8_t *) (patch16 + 1);
	*patch16 = (u_int) boot_addr & 0xffff;
	*patch8 = ((u_int) boot_addr >> 16) & 0xff;

	/* modify the target for boot data segment, same layout */
	patch16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base));
	patch8 = (u_int8_t *) (patch16 + 1);
	*patch16 = (u_int) boot_addr & 0xffff;
	*patch8 = ((u_int) boot_addr >> 16) & 0xff;
}

#endif
| 46d4e165 JG |
581 | |
| 582 | /* | |
| bb467734 | 583 | * This function starts the AP (application processor) identified |
| 46d4e165 JG |
584 | * by the APIC ID 'physicalCpu'. It does quite a "song and dance" |
| 585 | * to accomplish this. This is necessary because of the nuances | |
| 586 | * of the different hardware we might encounter. It ain't pretty, | |
| 587 | * but it seems to work. | |
| 588 | * | |
| 589 | * NOTE: eventually an AP gets to ap_init(), which is called just | |
| 590 | * before the AP goes into the LWKT scheduler's idle loop. | |
| 591 | */ | |
| 592 | static int | |
| bb467734 | 593 | start_ap(struct mdglobaldata *gd, u_int boot_addr, int smibest) |
| 46d4e165 JG |
594 | { |
| 595 | int physical_cpu; | |
| 596 | int vector; | |
| 597 | u_long icr_lo, icr_hi; | |
| 598 | ||
| 599 | POSTCODE(START_AP_POST); | |
| 600 | ||
| 601 | /* get the PHYSICAL APIC ID# */ | |
| fbac0dc4 | 602 | physical_cpu = CPUID_TO_APICID(gd->mi.gd_cpuid); |
| 46d4e165 JG |
603 | |
| 604 | /* calculate the vector */ | |
| 605 | vector = (boot_addr >> 12) & 0xff; | |
| 606 | ||
| bb467734 MD |
607 | /* We don't want anything interfering */ |
| 608 | cpu_disable_intr(); | |
| 609 | ||
| 46d4e165 JG |
610 | /* Make sure the target cpu sees everything */ |
| 611 | wbinvd(); | |
| 612 | ||
| 613 | /* | |
| bb467734 MD |
614 | * Try to detect when a SMI has occurred, wait up to 200ms. |
| 615 | * | |
| 616 | * If a SMI occurs during an AP reset but before we issue | |
| 617 | * the STARTUP command, the AP may brick. To work around | |
| 618 | * this problem we hold off doing the AP startup until | |
| 619 | * after we have detected the SMI. Hopefully another SMI | |
| 620 | * will not occur before we finish the AP startup. | |
| 621 | * | |
| 622 | * Retries don't seem to help. SMIs have a window of opportunity | |
| 623 | * and if USB->legacy keyboard emulation is enabled in the BIOS | |
| 624 | * the interrupt rate can be quite high. | |
| 625 | * | |
| 626 | * NOTE: Don't worry about the L1 cache load, it might bloat | |
| 627 | * ldelta a little but ndelta will be so huge when the SMI | |
| 628 | * occurs the detection logic will still work fine. | |
| 629 | */ | |
| 630 | if (smibest) { | |
| 631 | set_apic_timer(200000); | |
| 632 | smitest(); | |
| 633 | } | |
| 634 | ||
| 635 | /* | |
| 46d4e165 JG |
636 | * first we do an INIT/RESET IPI this INIT IPI might be run, reseting |
| 637 | * and running the target CPU. OR this INIT IPI might be latched (P5 | |
| 638 | * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be | |
| 639 | * ignored. | |
| bb467734 MD |
640 | * |
| 641 | * see apic/apicreg.h for icr bit definitions. | |
| 642 | * | |
| 643 | * TIME CRITICAL CODE, DO NOT DO ANY KPRINTFS IN THE HOT PATH. | |
| 46d4e165 JG |
644 | */ |
| 645 | ||
| bb467734 MD |
646 | /* |
| 647 | * Setup the address for the target AP. We can setup | |
| 648 | * icr_hi once and then just trigger operations with | |
| 649 | * icr_lo. | |
| 650 | */ | |
| 46d4e165 JG |
651 | icr_hi = lapic->icr_hi & ~APIC_ID_MASK; |
| 652 | icr_hi |= (physical_cpu << 24); | |
| 46d4e165 | 653 | icr_lo = lapic->icr_lo & 0xfff00000; |
| bb467734 | 654 | lapic->icr_hi = icr_hi; |
| 46d4e165 | 655 | |
| bb467734 MD |
656 | /* |
| 657 | * Do an INIT IPI: assert RESET | |
| 658 | * | |
| 659 | * Use edge triggered mode to assert INIT | |
| 660 | */ | |
| 661 | lapic->icr_lo = icr_lo | 0x00004500; | |
| 46d4e165 JG |
662 | while (lapic->icr_lo & APIC_DELSTAT_MASK) |
| 663 | /* spin */ ; | |
| 664 | ||
| bb467734 MD |
665 | /* |
| 666 | * The spec calls for a 10ms delay but we may have to use a | |
| 667 | * MUCH lower delay to avoid bricking an AP due to a fast SMI | |
| 668 | * interrupt. We have other loops here too and dividing by 2 | |
| 669 | * doesn't seem to be enough even after subtracting 350us, | |
| 670 | * so we divide by 4. | |
| 671 | * | |
| 672 | * Our minimum delay is 150uS, maximum is 10ms. If no SMI | |
| 673 | * interrupt was detected we use the full 10ms. | |
| 674 | */ | |
| 675 | if (smibest == 0) | |
| 676 | u_sleep(10000); | |
| 677 | else if (smibest < 150 * 4 + 350) | |
| 678 | u_sleep(150); | |
| 679 | else if ((smibest - 350) / 4 < 10000) | |
| 680 | u_sleep((smibest - 350) / 4); | |
| 681 | else | |
| 682 | u_sleep(10000); | |
| 46d4e165 | 683 | |
| bb467734 MD |
684 | /* |
| 685 | * Do an INIT IPI: deassert RESET | |
| 686 | * | |
| 687 | * Use level triggered mode to deassert. It is unclear | |
| 688 | * why we need to do this. | |
| 689 | */ | |
| 690 | lapic->icr_lo = icr_lo | 0x00008500; | |
| 46d4e165 JG |
691 | while (lapic->icr_lo & APIC_DELSTAT_MASK) |
| 692 | /* spin */ ; | |
| bb467734 | 693 | u_sleep(150); /* wait 150us */ |
| 46d4e165 JG |
694 | |
| 695 | /* | |
| bb467734 | 696 | * Next we do a STARTUP IPI: the previous INIT IPI might still be |
| 46d4e165 JG |
697 | * latched, (P5 bug) this 1st STARTUP would then terminate |
| 698 | * immediately, and the previously started INIT IPI would continue. OR | |
| 699 | * the previous INIT IPI has already run. and this STARTUP IPI will | |
| 700 | * run. OR the previous INIT IPI was ignored. and this STARTUP IPI | |
| 701 | * will run. | |
| 702 | */ | |
| 46d4e165 JG |
703 | lapic->icr_lo = icr_lo | 0x00000600 | vector; |
| 704 | while (lapic->icr_lo & APIC_DELSTAT_MASK) | |
| 705 | /* spin */ ; | |
| 706 | u_sleep(200); /* wait ~200uS */ | |
| 707 | ||
| 708 | /* | |
| bb467734 | 709 | * Finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF |
| 46d4e165 JG |
710 | * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR |
| 711 | * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is | |
| 712 | * recognized after hardware RESET or INIT IPI. | |
| 713 | */ | |
| 46d4e165 JG |
714 | lapic->icr_lo = icr_lo | 0x00000600 | vector; |
| 715 | while (lapic->icr_lo & APIC_DELSTAT_MASK) | |
| 716 | /* spin */ ; | |
| bb467734 MD |
717 | |
| 718 | /* Resume normal operation */ | |
| 719 | cpu_enable_intr(); | |
| 46d4e165 JG |
720 | |
| 721 | /* wait for it to start, see ap_init() */ | |
| 722 | set_apic_timer(5000000);/* == 5 seconds */ | |
| 723 | while (read_apic_timer()) { | |
| da23a592 | 724 | if (smp_startup_mask & CPUMASK(gd->mi.gd_cpuid)) |
| 46d4e165 JG |
725 | return 1; /* return SUCCESS */ |
| 726 | } | |
| bb467734 | 727 | |
| 46d4e165 JG |
728 | return 0; /* return FAILURE */ |
| 729 | } | |
| 730 | ||
| bb467734 MD |
731 | static |
| 732 | int | |
| 733 | smitest(void) | |
| 734 | { | |
| 735 | int64_t ltsc; | |
| 736 | int64_t ntsc; | |
| 737 | int64_t ldelta; | |
| 738 | int64_t ndelta; | |
| 739 | int count; | |
| 740 | ||
| 741 | ldelta = 0; | |
| 742 | ndelta = 0; | |
| 743 | while (read_apic_timer()) { | |
| 744 | ltsc = rdtsc(); | |
| 745 | for (count = 0; count < 100; ++count) | |
| 746 | ntsc = rdtsc(); /* force loop to occur */ | |
| 747 | if (ldelta) { | |
| 748 | ndelta = ntsc - ltsc; | |
| 749 | if (ldelta > ndelta) | |
| 750 | ldelta = ndelta; | |
| 751 | if (ndelta > ldelta * 2) | |
| 752 | break; | |
| 753 | } else { | |
| 754 | ldelta = ntsc - ltsc; | |
| 755 | } | |
| 756 | } | |
| 757 | return(read_apic_timer()); | |
| 758 | } | |
| 46d4e165 JG |
759 | |
| 760 | /* | |
| 7d4d6fdb MD |
761 | * Synchronously flush the TLB on all other CPU's. The current cpu's |
| 762 | * TLB is not flushed. If the caller wishes to flush the current cpu's | |
| 763 | * TLB the caller must call cpu_invltlb() in addition to smp_invltlb(). | |
| 46d4e165 | 764 | * |
| 7d4d6fdb MD |
765 | * NOTE: If for some reason we were unable to start all cpus we cannot |
| 766 | * safely use broadcast IPIs. | |
| 46d4e165 | 767 | */ |
| 7d4d6fdb MD |
768 | |
| 769 | static cpumask_t smp_invltlb_req; | |
| 770 | ||
| b4b1a37a MD |
771 | #define SMP_INVLTLB_DEBUG |
| 772 | ||
| 46d4e165 JG |
773 | void |
| 774 | smp_invltlb(void) | |
| 775 | { | |
| 7d4d6fdb | 776 | struct mdglobaldata *md = mdcpu; |
| 2d910aaf | 777 | #ifdef SMP_INVLTLB_DEBUG |
| 7d4d6fdb | 778 | long count = 0; |
| 2d910aaf | 779 | long xcount = 0; |
| 7d4d6fdb | 780 | #endif |
| 4117f2fd | 781 | |
| 7d4d6fdb MD |
782 | crit_enter_gd(&md->mi); |
| 783 | md->gd_invltlb_ret = 0; | |
| 784 | ++md->mi.gd_cnt.v_smpinvltlb; | |
| da23a592 | 785 | atomic_set_cpumask(&smp_invltlb_req, md->mi.gd_cpumask); |
| 2d910aaf MD |
786 | #ifdef SMP_INVLTLB_DEBUG |
| 787 | again: | |
| 788 | #endif | |
| 46d4e165 JG |
789 | if (smp_startup_mask == smp_active_mask) { |
| 790 | all_but_self_ipi(XINVLTLB_OFFSET); | |
| 791 | } else { | |
| 7d4d6fdb MD |
792 | selected_apic_ipi(smp_active_mask & ~md->mi.gd_cpumask, |
| 793 | XINVLTLB_OFFSET, APIC_DELMODE_FIXED); | |
| 46d4e165 | 794 | } |
| 2d910aaf MD |
795 | |
| 796 | #ifdef SMP_INVLTLB_DEBUG | |
| 797 | if (xcount) | |
| 798 | kprintf("smp_invltlb: ipi sent\n"); | |
| 799 | #endif | |
| 7d4d6fdb MD |
800 | while ((md->gd_invltlb_ret & smp_active_mask & ~md->mi.gd_cpumask) != |
| 801 | (smp_active_mask & ~md->mi.gd_cpumask)) { | |
| 802 | cpu_mfence(); | |
| 803 | cpu_pause(); | |
| 2d910aaf | 804 | #ifdef SMP_INVLTLB_DEBUG |
| 7d4d6fdb MD |
805 | /* DEBUGGING */ |
| 806 | if (++count == 400000000) { | |
| 2d910aaf MD |
807 | print_backtrace(-1); |
| 808 | kprintf("smp_invltlb: endless loop %08lx %08lx, " | |
| 809 | "rflags %016jx retry", | |
| 7d4d6fdb | 810 | (long)md->gd_invltlb_ret, |
| 2d910aaf MD |
811 | (long)smp_invltlb_req, |
| 812 | (intmax_t)read_rflags()); | |
| 813 | __asm __volatile ("sti"); | |
| 814 | ++xcount; | |
| 815 | if (xcount > 2) | |
| 816 | lwkt_process_ipiq(); | |
| 817 | if (xcount > 3) { | |
| da23a592 MD |
818 | int bcpu = BSFCPUMASK(~md->gd_invltlb_ret & |
| 819 | ~md->mi.gd_cpumask & | |
| 820 | smp_active_mask); | |
| 2d910aaf MD |
821 | globaldata_t xgd; |
| 822 | ||
| 823 | kprintf("bcpu %d\n", bcpu); | |
| 824 | xgd = globaldata_find(bcpu); | |
| 825 | kprintf("thread %p %s\n", xgd->gd_curthread, xgd->gd_curthread->td_comm); | |
| 826 | } | |
| 827 | if (xcount > 5) | |
| 828 | Debugger("giving up"); | |
| 829 | count = 0; | |
| 830 | goto again; | |
| 7d4d6fdb | 831 | } |
| 46d4e165 | 832 | #endif |
| 7d4d6fdb | 833 | } |
| da23a592 | 834 | atomic_clear_cpumask(&smp_invltlb_req, md->mi.gd_cpumask); |
| 7d4d6fdb | 835 | crit_exit_gd(&md->mi); |
| 46d4e165 JG |
836 | } |
| 837 | ||
| 7d4d6fdb MD |
838 | /* |
| 839 | * Called from Xinvltlb assembly with interrupts disabled. We didn't | |
| 840 | * bother to bump the critical section count or nested interrupt count | |
| 841 | * so only do very low level operations here. | |
| 842 | */ | |
| 843 | void | |
| 844 | smp_invltlb_intr(void) | |
| 845 | { | |
| 846 | struct mdglobaldata *md = mdcpu; | |
| 847 | struct mdglobaldata *omd; | |
| 848 | cpumask_t mask; | |
| 849 | int cpu; | |
| 850 | ||
| 7d4d6fdb | 851 | cpu_mfence(); |
| 2d910aaf | 852 | mask = smp_invltlb_req; |
| 7d4d6fdb MD |
853 | cpu_invltlb(); |
| 854 | while (mask) { | |
| da23a592 MD |
855 | cpu = BSFCPUMASK(mask); |
| 856 | mask &= ~CPUMASK(cpu); | |
| 7d4d6fdb | 857 | omd = (struct mdglobaldata *)globaldata_find(cpu); |
| da23a592 | 858 | atomic_set_cpumask(&omd->gd_invltlb_ret, md->mi.gd_cpumask); |
| 7d4d6fdb MD |
859 | } |
| 860 | } | |
| 861 | ||
| 46d4e165 JG |
862 | /* |
| 863 | * When called the executing CPU will send an IPI to all other CPUs | |
| 864 | * requesting that they halt execution. | |
| 865 | * | |
| 866 | * Usually (but not necessarily) called with 'other_cpus' as its arg. | |
| 867 | * | |
| 868 | * - Signals all CPUs in map to stop. | |
| 869 | * - Waits for each to stop. | |
| 870 | * | |
| 871 | * Returns: | |
| 872 | * -1: error | |
| 873 | * 0: NA | |
| 874 | * 1: ok | |
| 875 | * | |
| 876 | * XXX FIXME: this is not MP-safe, needs a lock to prevent multiple CPUs | |
| 877 | * from executing at same time. | |
| 878 | */ | |
| 879 | int | |
| da23a592 | 880 | stop_cpus(cpumask_t map) |
| 46d4e165 JG |
881 | { |
| 882 | map &= smp_active_mask; | |
| 883 | ||
| 884 | /* send the Xcpustop IPI to all CPUs in map */ | |
| 885 | selected_apic_ipi(map, XCPUSTOP_OFFSET, APIC_DELMODE_FIXED); | |
| 886 | ||
| 887 | while ((stopped_cpus & map) != map) | |
| 888 | /* spin */ ; | |
| 889 | ||
| 890 | return 1; | |
| 891 | } | |
| 892 | ||
| 893 | ||
| 894 | /* | |
| 895 | * Called by a CPU to restart stopped CPUs. | |
| 896 | * | |
| 897 | * Usually (but not necessarily) called with 'stopped_cpus' as its arg. | |
| 898 | * | |
| 899 | * - Signals all CPUs in map to restart. | |
| 900 | * - Waits for each to restart. | |
| 901 | * | |
| 902 | * Returns: | |
| 903 | * -1: error | |
| 904 | * 0: NA | |
| 905 | * 1: ok | |
| 906 | */ | |
| 907 | int | |
| da23a592 | 908 | restart_cpus(cpumask_t map) |
| 46d4e165 JG |
909 | { |
| 910 | /* signal other cpus to restart */ | |
| 911 | started_cpus = map & smp_active_mask; | |
| 912 | ||
| 913 | while ((stopped_cpus & map) != 0) /* wait for each to clear its bit */ | |
| 914 | /* spin */ ; | |
| 915 | ||
| 916 | return 1; | |
| 917 | } | |
| 918 | ||
/*
 * AP-side startup, run by each application processor.
 *
 * This is called once the mpboot code has gotten us properly relocated
 * and the MMU turned on, etc.  ap_init() is actually the idle thread,
 * and when it returns the scheduler will call the real cpu_idle() main
 * loop for the idlethread.  Interrupts are disabled on entry and should
 * remain disabled at return.
 *
 * The function proceeds in two stages, each gated by a flag and
 * serialized by the MP lock:
 *
 *   1. after mp_finish_lapic becomes non-zero: record the TSC offset,
 *      verify the cpu id against the local APIC id, initialize the
 *      local APIC and set our bit in smp_lapic_mask;
 *   2. after mp_finish becomes non-zero: set up memory range
 *      attributes, set our bit in smp_active_mask, enable interrupts,
 *      start the per-cpu clocks and drain the IPI queue.
 */
void
ap_init(void)
{
	int	cpu_id;

	/*
	 * Adjust smp_startup_mask to signal the BSP that we have started
	 * up successfully.  Note that we do not yet hold the BGL.  The BSP
	 * is waiting for our signal.
	 *
	 * We can't set our bit in smp_active_mask yet because we are holding
	 * interrupts physically disabled and remote cpus could deadlock
	 * trying to send us an IPI.
	 *
	 * NOTE(review): this is a plain read-modify-write performed without
	 * the MP lock; presumably safe because APs are brought up one at a
	 * time -- confirm against the BSP-side startup code.
	 */
	smp_startup_mask |= CPUMASK(mycpu->gd_cpuid);
	cpu_mfence();

	/*
	 * Interlock for LAPIC initialization.  Wait until mp_finish_lapic is
	 * non-zero, then get the MP lock.
	 *
	 * Note: We are in a critical section.
	 *
	 * Note: we are the idle thread, we can only spin.
	 *
	 * Note: The load fence is memory volatile and prevents the compiler
	 * from improperly caching mp_finish_lapic, and the cpu from improperly
	 * caching it.
	 */
	while (mp_finish_lapic == 0)
		cpu_lfence();
	while (try_mplock() == 0)
		;

	if (cpu_feature & CPUID_TSC) {
		/*
		 * The BSP is constantly updating tsc0_offset, figure out
		 * the relative difference to synchronize ktrdump.
		 */
		tsc_offsets[mycpu->gd_cpuid] = rdtsc() - tsc0_offset;
	}

	/* BSP may have changed PTD while we're waiting for the lock */
	cpu_invltlb();

	/* Build our map of 'other' CPUs: every started cpu except us. */
	mycpu->gd_other_cpus = smp_startup_mask & ~CPUMASK(mycpu->gd_cpuid);

	/*
	 * A quick check from sanity claus: the cpu id derived from the
	 * local APIC id (bits 31..24 of lapic->id) must match the cpu id
	 * we were assigned.
	 */
	cpu_id = APICID_TO_CPUID((lapic->id & 0xff000000) >> 24);
	if (mycpu->gd_cpuid != cpu_id) {
		kprintf("SMP: assigned cpuid = %d\n", mycpu->gd_cpuid);
		kprintf("SMP: actual cpuid = %d lapicid %d\n",
		    cpu_id, (lapic->id & 0xff000000) >> 24);
#if JGXXX
		kprintf("PTD[MPPTDI] = %p\n", (void *)PTD[MPPTDI]);
#endif
		panic("cpuid mismatch! boom!!");
	}

	/* Initialize AP's local APIC for irq's */
	lapic_init(FALSE);

	/* LAPIC initialization is done; publish it while holding the lock */
	smp_lapic_mask |= CPUMASK(mycpu->gd_cpuid);
	cpu_mfence();

	/* Let BSP move onto the next initialization stage */
	rel_mplock();

	/*
	 * Interlock for finalization.  Wait until mp_finish is non-zero,
	 * then get the MP lock.
	 *
	 * Note: We are in a critical section.
	 *
	 * Note: we are the idle thread, we can only spin.
	 *
	 * Note: The load fence is memory volatile and prevents the compiler
	 * from improperly caching mp_finish, and the cpu from improperly
	 * caching it.
	 */
	while (mp_finish == 0)
		cpu_lfence();
	while (try_mplock() == 0)
		;

	/* BSP may have changed PTD while we're waiting for the lock */
	cpu_invltlb();

	/* Set memory range attributes for this CPU to match the BSP */
	mem_range_AP_init();

	/*
	 * Once we go active we must process any IPIQ messages that may
	 * have been queued, because no actual IPI will occur until we
	 * set our bit in the smp_active_mask.  If we don't the IPI
	 * message interlock could be left set which would also prevent
	 * further IPIs.
	 *
	 * The idle loop doesn't expect the BGL to be held and while
	 * lwkt_switch() normally cleans things up this is a special case
	 * because we are returning almost directly into the idle loop.
	 *
	 * The idle thread is never placed on the runq, make sure
	 * nothing we've done put it there.
	 */
	KKASSERT(get_mplock_count(curthread) == 1);
	smp_active_mask |= CPUMASK(mycpu->gd_cpuid);

	/*
	 * Enable interrupts here.  idle_restore will also do it, but
	 * doing it here lets us clean up any strays that got posted to
	 * the CPU during the AP boot while we are still in a critical
	 * section.  Whatever was recorded in gd_ipending is then discarded.
	 */
	__asm __volatile("sti; pause; pause"::);
	bzero(mdcpu->gd_ipending, sizeof(mdcpu->gd_ipending));

	initclocks_pcpu();	/* clock interrupts (via IPIs) */
	lwkt_process_ipiq();

	/*
	 * Releasing the mp lock lets the BSP finish up the SMP init
	 */
	rel_mplock();
	KKASSERT((curthread->td_flags & TDF_RUNQ) == 0);
}
| 1054 | ||
| 1055 | /* | |
| 1056 | * Get SMP fully working before we start initializing devices. | |
| 1057 | */ | |
| 1058 | static | |
| 1059 | void | |
| 1060 | ap_finish(void) | |
| 1061 | { | |
| 1062 | mp_finish = 1; | |
| 1063 | if (bootverbose) | |
| 1064 | kprintf("Finish MP startup\n"); | |
| 46d4e165 | 1065 | rel_mplock(); |
| c6b1591c | 1066 | while (smp_active_mask != smp_startup_mask) |
| 46d4e165 | 1067 | cpu_lfence(); |
| 46d4e165 JG |
1068 | while (try_mplock() == 0) |
| 1069 | ; | |
| da23a592 MD |
1070 | if (bootverbose) { |
| 1071 | kprintf("Active CPU Mask: %016jx\n", | |
| 1072 | (uintmax_t)smp_active_mask); | |
| 1073 | } | |
| 46d4e165 JG |
1074 | } |
| 1075 | ||
| 1076 | SYSINIT(finishsmp, SI_BOOT2_FINISH_SMP, SI_ORDER_FIRST, ap_finish, NULL) | |
| 1077 | ||
| 1078 | void | |
| 1079 | cpu_send_ipiq(int dcpu) | |
| 1080 | { | |
| da23a592 | 1081 | if (CPUMASK(dcpu) & smp_active_mask) |
| 46d4e165 JG |
1082 | single_apic_ipi(dcpu, XIPIQ_OFFSET, APIC_DELMODE_FIXED); |
| 1083 | } | |
| 1084 | ||
#if 0	/* single_apic_ipi_passive() not working yet */
/*
 * Passive variant of cpu_send_ipiq().
 *
 * Returns 0 on failure, 1 on success.  A target cpu that is not in
 * smp_active_mask yields 0 without any IPI being attempted.
 */
int
cpu_send_ipiq_passive(int dcpu)
{
	if ((CPUMASK(dcpu) & smp_active_mask) == 0)
		return(0);

	return(single_apic_ipi_passive(dcpu, XIPIQ_OFFSET,
				       APIC_DELMODE_FIXED));
}
#endif
| 3566408b SZ |
1100 | |
| 1101 | static void | |
| 3a69c113 | 1102 | mp_bsp_simple_setup(void) |
| 3566408b SZ |
1103 | { |
| 1104 | /* build our map of 'other' CPUs */ | |
| 1105 | mycpu->gd_other_cpus = smp_startup_mask & ~CPUMASK(mycpu->gd_cpuid); | |
| 1106 | mycpu->gd_ipiq = (void *)kmem_alloc(&kernel_map, sizeof(lwkt_ipiq) * ncpus); | |
| 1107 | bzero(mycpu->gd_ipiq, sizeof(lwkt_ipiq) * ncpus); | |
| 1108 | ||
| 1109 | pmap_set_opt(); | |
| 1110 | ||
| 1111 | if (cpu_feature & CPUID_TSC) | |
| 1112 | tsc0_offset = rdtsc(); | |
| 1113 | } |