 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 2008-2017 The DragonFly Project.
 * This code is derived from software contributed to Berkeley by
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
#include "opt_maxmem.h"
#include "opt_msgbuf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmsg.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/malloc.h>
#include <sys/reboot.h>
#include <sys/msgbuf.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/usched.h>
#include <sys/ctype.h>
#include <sys/serialize.h>
#include <sys/systimer.h>

#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>

#include <machine/cpu.h>
#include <machine/clock.h>
#include <machine/specialreg.h>
#include <machine/bootinfo.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/pcb_ext.h>
#include <machine/globaldata.h>		/* CPU_prvspace */
#include <machine/smp.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <machine/framebuffer.h>

#include <bus/isa/isa_device.h>
#include <machine_base/isa/isa_intr.h>
#include <bus/isa/rtc.h>
#include <sys/random.h>
#include <sys/ptrace.h>
#include <machine/sigframe.h>

#include <sys/machintr.h>
#include <machine_base/icu/icu_abi.h>
#include <machine_base/icu/elcr_var.h>
#include <machine_base/apic/lapic.h>
#include <machine_base/apic/ioapic.h>
#include <machine_base/apic/ioapic_abi.h>
#include <machine/mptable.h>
#define PHYSMAP_ENTRIES		10
#define MAXBUFSTRUCTSIZE	((size_t)512 * 1024 * 1024)

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

extern void printcpuinfo(void);	/* XXX header file */
extern void identify_cpu(void);
extern void panicifcpuunsupported(void);

static void cpu_startup(void *);
static void pic_finish(void *);
static void cpu_finish(void *);

static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
static void init_locks(void);

extern void pcpu_timer_always(struct intrframe *);

SYSINIT(cpu, SI_BOOT2_START_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
SYSINIT(pic_finish, SI_BOOT2_FINISH_PIC, SI_ORDER_FIRST, pic_finish, NULL);
SYSINIT(cpu_finish, SI_BOOT2_FINISH_CPU, SI_ORDER_FIRST, cpu_finish, NULL);

extern vm_offset_t ksym_start, ksym_end;

struct privatespace CPU_prvspace_bsp __aligned(4096);
struct privatespace *CPU_prvspace[MAXCPU] = { &CPU_prvspace_bsp };

vm_paddr_t efi_systbl_phys;
int	_udatasel, _ucodesel, _ucode32sel;
int64_t tsc_offsets[MAXCPU];
cpumask_t smp_idleinvl_mask;
cpumask_t smp_idleinvl_reqs;

/* MWAIT hint (EAX) or CPU_MWAIT_HINT_ */
__read_mostly static int cpu_mwait_halt_global;
__read_mostly static int clock_debug1;
__read_mostly static int flame_poll_debug;

SYSCTL_INT(_debug, OID_AUTO, flame_poll_debug,
	CTLFLAG_RW, &flame_poll_debug, 0, "");
TUNABLE_INT("debug.flame_poll_debug", &flame_poll_debug);

#if defined(SWTCH_OPTIM_STATS)
extern int swtch_optim_stats;
SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
	CTLFLAG_RD, &swtch_optim_stats, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
	CTLFLAG_RD, &tlb_flush_count, 0, "");
SYSCTL_INT(_debug, OID_AUTO, clock_debug1,
	CTLFLAG_RW, &clock_debug1, 0, "");
SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_halt,
	CTLFLAG_RD, &cpu_mwait_halt_global, 0, "");
SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_spin,
	CTLFLAG_RD, &cpu_mwait_spin, 0, "monitor/mwait target state");

#define CPU_MWAIT_HAS_CX	\
	((cpu_feature2 & CPUID2_MON) && \
	 (cpu_mwait_feature & CPUID_MWAIT_EXT))

#define CPU_MWAIT_CX_NAMELEN	16

#define CPU_MWAIT_C1		1
#define CPU_MWAIT_C2		2
#define CPU_MWAIT_C3		3
#define CPU_MWAIT_CX_MAX	8

#define CPU_MWAIT_HINT_AUTO	-1	/* C1 and C2 */
#define CPU_MWAIT_HINT_AUTODEEP	-2	/* C3+ */
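
/*
 * Illustrative note (added commentary, not in the original source):
 * per Intel's documented MWAIT extension, the EAX hint encodes the
 * target C-state in bits 7:4 (value + 1 = C-state, so 0x00 = C1,
 * 0x10 = C2, 0x20 = C3) and the sub-state in bits 3:0.  The
 * MWAIT_EAX_HINT()/MWAIT_EAX_TO_CX() macros used below are assumed
 * to pack and unpack this layout.
 */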
SYSCTL_NODE(_machdep, OID_AUTO, mwait, CTLFLAG_RW, 0, "MWAIT features");
SYSCTL_NODE(_machdep_mwait, OID_AUTO, CX, CTLFLAG_RW, 0, "MWAIT Cx settings");

struct cpu_mwait_cx {
	struct sysctl_ctx_list	sysctl_ctx;
	struct sysctl_oid	*sysctl_tree;

static struct cpu_mwait_cx	cpu_mwait_cx_info[CPU_MWAIT_CX_MAX];
static char			cpu_mwait_cx_supported[256];

static int			cpu_mwait_c1_hints_cnt;
static int			cpu_mwait_hints_cnt;
static int			*cpu_mwait_hints;

static int			cpu_mwait_deep_hints_cnt;
static int			*cpu_mwait_deep_hints;

#define CPU_IDLE_REPEAT_DEFAULT	750

static u_int cpu_idle_repeat = CPU_IDLE_REPEAT_DEFAULT;
static u_long cpu_idle_repeat_max = CPU_IDLE_REPEAT_DEFAULT;
static u_int cpu_mwait_repeat_shift = 1;

#define CPU_MWAIT_C3_PREAMBLE_BM_ARB	0x1
#define CPU_MWAIT_C3_PREAMBLE_BM_STS	0x2

static int cpu_mwait_c3_preamble =
	CPU_MWAIT_C3_PREAMBLE_BM_ARB |
	CPU_MWAIT_C3_PREAMBLE_BM_STS;

SYSCTL_STRING(_machdep_mwait_CX, OID_AUTO, supported, CTLFLAG_RD,
	cpu_mwait_cx_supported, 0, "MWAIT supported C states");
SYSCTL_INT(_machdep_mwait_CX, OID_AUTO, c3_preamble, CTLFLAG_RD,
	&cpu_mwait_c3_preamble, 0, "C3+ preamble mask");

static int	cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS,
static int	cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int	cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int	cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS);

SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, idle, CTLTYPE_STRING|CTLFLAG_RW,
	NULL, 0, cpu_mwait_cx_idle_sysctl, "A", "");
SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, spin, CTLTYPE_STRING|CTLFLAG_RW,
	NULL, 0, cpu_mwait_cx_spin_sysctl, "A", "");
SYSCTL_UINT(_machdep_mwait_CX, OID_AUTO, repeat_shift, CTLFLAG_RW,
	&cpu_mwait_repeat_shift, 0, "");
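
/*
 * Usage sketch (assumed, added for illustration): the idle hint can be
 * tuned at runtime through the sysctls declared above, e.g.
 *
 *	sysctl machdep.mwait.CX.idle=AUTODEEP
 *	sysctl machdep.mwait.CX.idle=C1/0
 *
 * where the string forms are assumed to map onto the hint encodings
 * described earlier.
 */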
u_long ebda_addr = 0;

int imcr_present = 0;

int naps = 0;			/* # of Application processors */

sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
	u_long pmem = ctob(physmem);

	error = sysctl_handle_long(oidp, &pmem, 0, req);

SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG|CTLFLAG_RD,
	0, 0, sysctl_hw_physmem, "LU",
	"Total system memory in bytes (number of pages * page size)");

sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
	u_long usermem = ctob(physmem - vmstats.v_wire_count);

	error = sysctl_handle_long(oidp, &usermem, 0, req);

SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_ULONG|CTLFLAG_RD,
	0, 0, sysctl_hw_usermem, "LU", "");

sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)
	availpages = x86_64_btop(avail_end - avail_start);
	error = sysctl_handle_long(oidp, &availpages, 0, req);

SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_ULONG|CTLFLAG_RD,
	0, 0, sysctl_hw_availpages, "LU", "");

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 */
vm_phystable_t phys_avail[VM_PHYSSEG_MAX + 1];
vm_phystable_t dump_avail[VM_PHYSSEG_MAX + 1];

/* must be 1 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END	(NELEM(phys_avail) - 1)
#define DUMP_AVAIL_ARRAY_END	(NELEM(dump_avail) - 1)
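
/*
 * Note (added commentary): consumers walk these arrays until they hit
 * the 0/0 sentinel, e.g. the loop in cpu_startup() below terminates on
 * phys_avail[indx].phys_end == 0.
 */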
static vm_offset_t buffer_sva, buffer_eva;
vm_offset_t clean_sva, clean_eva;
static vm_offset_t pager_sva, pager_eva;
static struct trapframe proc0_tf;

static void cpu_implement_smap(void);

cpu_startup(void *dummy)
	vm_offset_t firstaddr;

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	kprintf("%s", version);
	panicifcpuunsupported();
	if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
		cpu_implement_smap();

	kprintf("real memory = %ju (%ju MB)\n",
		(intmax_t)Realmem / 1024 / 1024);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	kprintf("Physical memory chunk(s):\n");
	for (indx = 0; phys_avail[indx].phys_end != 0; ++indx) {
		size1 = phys_avail[indx].phys_end -
			phys_avail[indx].phys_beg;

		kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
			(intmax_t)phys_avail[indx].phys_beg,
			(intmax_t)phys_avail[indx].phys_end - 1,
			(intmax_t)(size1 / PAGE_SIZE));

	/*
	 * Allocate space for system data structures.
	 * The first available kernel virtual address is in "v".
	 * As pages of kernel virtual memory are allocated, "v" is incremented.
	 * As pages of memory are allocated and cleared,
	 * "firstaddr" is incremented.
	 * An index into the kernel page table corresponding to the
	 * virtual memory address maintained in "v" is kept in "mapaddr".
	 */

	/*
	 * Make two passes.  The first pass calculates how much memory is
	 * needed and allocates it.  The second pass assigns virtual
	 * addresses to the various data structures.
	 */
	v = (caddr_t)firstaddr;

#define	valloc(name, type, num) \
	    (name) = (type *)v; v = (caddr_t)((name)+(num))
#define	valloclim(name, type, num, lim) \
	    (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))

	/*
	 * Calculate nbuf such that maxbufspace uses approximately 1/20
	 * of physical memory by default, with a minimum of 50 buffers.
	 *
	 * The calculation is made after discounting 128MB.
	 *
	 * NOTE: maxbufspace is (nbuf * NBUFCALCSIZE) (NBUFCALCSIZE ~= 16KB).
	 *	 nbuf = (kbytes / factor) would cover all of memory.
	 */
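
	/*
	 * Worked example (illustrative only, not from the original
	 * source): with 16GB of RAM and NBUFCALCSIZE = 16KB we get
	 * factor = 16 and kbytes = 16M, so the add-on term below is
	 * (16M - 128K) / (16 * 20) ~= 52000 buffers, and maxbufspace
	 * ~= 52000 * 16KB ~= 830MB, i.e. roughly 1/20 of physical
	 * memory as the comment above promises.
	 */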
	long factor = NBUFCALCSIZE / 1024;		/* KB/nbuf */
	long kbytes = physmem * (PAGE_SIZE / 1024);	/* physmem */

	if (kbytes > 128 * 1024)
		nbuf += (kbytes - 128 * 1024) / (factor * 20);
	if (maxbcache && nbuf > maxbcache / NBUFCALCSIZE)
		nbuf = maxbcache / NBUFCALCSIZE;
	if ((size_t)nbuf * sizeof(struct buf) > MAXBUFSTRUCTSIZE) {
		kprintf("Warning: nbuf capped at %ld due to the "
			"reasonability limit\n", nbuf);
		nbuf = MAXBUFSTRUCTSIZE / sizeof(struct buf);

	/*
	 * Do not allow the buffer_map to be more than 1/2 the size of the
	 */
	if (nbuf > (virtual_end - virtual_start +
		    virtual2_end - virtual2_start) / (MAXBSIZE * 2)) {
		nbuf = (virtual_end - virtual_start +
			virtual2_end - virtual2_start) / (MAXBSIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to kvm\n", nbuf);

	/*
	 * Do not allow the buffer_map to use more than 50% of available
	 * physical-equivalent memory.  Since the VM pages which back
	 * individual buffers are typically wired, having too many bufs
	 * can prevent the system from paging properly.
	 */
	if (nbuf > physmem * PAGE_SIZE / (NBUFCALCSIZE * 2)) {
		nbuf = physmem * PAGE_SIZE / (NBUFCALCSIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to physmem\n", nbuf);

	/*
	 * Do not allow the sizeof(struct buf) * nbuf to exceed 1/4 of
	 * the valloc space which is just the virtual_end - virtual_start
	 * section.  This is typically ~2GB regardless of the amount of
	 * memory, so we use 500MB as a metric.
	 *
	 * This is because we use valloc() to allocate the buf header array.
	 *
	 * NOTE: buffer space in bytes is limited by vfs.*bufspace sysctls.
	 */
	if (nbuf > (virtual_end - virtual_start) / (sizeof(struct buf) * 4)) {
		nbuf = (virtual_end - virtual_start) /
		       (sizeof(struct buf) * 4);
		kprintf("Warning: nbufs capped at %ld due to "
			"valloc considerations\n",

	nswbuf_mem = lmax(lmin(nbuf / 32, 512), 8);
	if (nswbuf_mem < NSWBUF_MIN)
		nswbuf_mem = NSWBUF_MIN;
	nswbuf_kva = lmax(lmin(nbuf / 4, 512), 16);
	if (nswbuf_kva < NSWBUF_MIN)
		nswbuf_kva = NSWBUF_MIN;

	valloc(swbuf_mem, struct buf, nswbuf_mem);
	valloc(swbuf_kva, struct buf, nswbuf_kva);
	valloc(buf, struct buf, nbuf);

	/*
	 * End of first pass, size has been calculated so allocate memory
	 */
	if (firstaddr == 0) {
		size = (vm_size_t)(v - firstaddr);
		firstaddr = kmem_alloc(kernel_map, round_page(size),
			panic("startup: no room for tables");

	/*
	 * End of second pass, addresses have been assigned
	 *
	 * nbuf is an int, make sure we don't overflow the field.
	 *
	 * On 64-bit systems we always reserve maximal allocations for
	 * buffer cache buffers and there are no fragmentation issues,
	 * so the KVA segment does not have to be excessively oversized.
	 */
	if ((vm_size_t)(v - firstaddr) != size)
		panic("startup: table size inconsistency");

	kmem_suballoc(kernel_map, clean_map, &clean_sva, &clean_eva,
		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE) +
		      ((nswbuf_mem + nswbuf_kva) * MAXPHYS) + pager_map_size);
	kmem_suballoc(clean_map, buffer_map, &buffer_sva, &buffer_eva,
		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE));
	buffer_map->system_map = 1;
	kmem_suballoc(clean_map, pager_map, &pager_sva, &pager_eva,
		      ((vm_offset_t)(nswbuf_mem + nswbuf_kva) * MAXPHYS) +
	pager_map->system_map = 1;
	kprintf("avail memory = %ju (%ju MB)\n",
		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages),
		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages) /

struct cpu_idle_stat {
	u_long	mwait_cx[CPU_MWAIT_CX_MAX];

#define CPU_IDLE_STAT_HALT	-1
#define CPU_IDLE_STAT_SPIN	-2

static struct cpu_idle_stat	cpu_idle_stats[MAXCPU];

sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS)
	int idx = arg2, cpu, error;

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].halt;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].spin;
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].mwait_cx[idx];

	error = sysctl_handle_quad(oidp, &val, 0, req);
	if (error || req->newptr == NULL)

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].halt = 0;
		cpu_idle_stats[0].halt = val;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].spin = 0;
		cpu_idle_stats[0].spin = val;
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].mwait_cx[idx] = 0;
		cpu_idle_stats[0].mwait_cx[idx] = val;
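
	/*
	 * Note (added commentary): writing a value to one of these
	 * sysctls zeroes the per-cpu counters and parks the entire
	 * written value on cpu0, so the aggregate read back equals
	 * what was written.
	 */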
cpu_mwait_attach(void)
	if (!CPU_MWAIT_HAS_CX)

	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    (CPUID_TO_FAMILY(cpu_id) > 0xf ||
	     (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
	      CPUID_TO_MODEL(cpu_id) >= 0xf))) {
		/*
		 * Pentium dual-core, Core 2 and beyond do not need any
		 * additional activities to enter deep C-state, i.e. C3(+).
		 */
		cpu_mwait_cx_no_bmarb();

		TUNABLE_INT_FETCH("machdep.cpu.mwait.bm_sts", &bm_sts);
			cpu_mwait_cx_no_bmsts();

	sbuf_new(&sb, cpu_mwait_cx_supported,
	    sizeof(cpu_mwait_cx_supported), SBUF_FIXEDLEN);

	for (i = 0; i < CPU_MWAIT_CX_MAX; ++i) {
		struct cpu_mwait_cx *cx = &cpu_mwait_cx_info[i];

		ksnprintf(cx->name, sizeof(cx->name), "C%d", i);

		sysctl_ctx_init(&cx->sysctl_ctx);
		cx->sysctl_tree = SYSCTL_ADD_NODE(&cx->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait), OID_AUTO,
		    cx->name, CTLFLAG_RW, NULL, "Cx control/info");
		if (cx->sysctl_tree == NULL)

		cx->subcnt = CPUID_MWAIT_CX_SUBCNT(cpu_mwait_extemu, i);
		SYSCTL_ADD_INT(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "subcnt", CTLFLAG_RD, &cx->subcnt, 0,
		SYSCTL_ADD_PROC(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "entered", (CTLTYPE_QUAD | CTLFLAG_RW), 0,
		    i, sysctl_cpu_idle_cnt, "Q", "# of times entered");

		for (sub = 0; sub < cx->subcnt; ++sub)
			sbuf_printf(&sb, "C%d/%d ", i, sub);

	cpu_mwait_c1_hints_cnt = cpu_mwait_cx_info[CPU_MWAIT_C1].subcnt;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i)
		cpu_mwait_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_hints = kmalloc(sizeof(int) * cpu_mwait_hints_cnt,

	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i) {
		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_hints_cnt,
			    ("invalid mwait hint index %d", hint_idx));
			cpu_mwait_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
	KASSERT(hint_idx == cpu_mwait_hints_cnt,
	    ("mwait hint count %d != index %d",
	     cpu_mwait_hints_cnt, hint_idx));

		kprintf("MWAIT hints (%d C1 hints):\n", cpu_mwait_c1_hints_cnt);
		for (i = 0; i < cpu_mwait_hints_cnt; ++i) {
			int hint = cpu_mwait_hints[i];

			kprintf("  C%d/%d hint 0x%04x\n",
			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),

	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i)
		cpu_mwait_deep_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_deep_hints = kmalloc(sizeof(int) * cpu_mwait_deep_hints_cnt,

	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i) {
		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_deep_hints_cnt,
			    ("invalid mwait deep hint index %d", hint_idx));
			cpu_mwait_deep_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
	KASSERT(hint_idx == cpu_mwait_deep_hints_cnt,
	    ("mwait deep hint count %d != index %d",
	     cpu_mwait_deep_hints_cnt, hint_idx));

		kprintf("MWAIT deep hints:\n");
		for (i = 0; i < cpu_mwait_deep_hints_cnt; ++i) {
			int hint = cpu_mwait_deep_hints[i];

			kprintf("  C%d/%d hint 0x%04x\n",
			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),

	cpu_idle_repeat_max = 256 * cpu_mwait_deep_hints_cnt;

	for (i = 0; i < ncpus; ++i) {
		ksnprintf(name, sizeof(name), "idle%d", i);
		SYSCTL_ADD_PROC(NULL,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait_CX), OID_AUTO,
		    name, (CTLTYPE_STRING | CTLFLAG_RW), &cpu_idle_stats[i],
		    0, cpu_mwait_cx_pcpu_idle_sysctl, "A", "");

cpu_finish(void *dummy __unused)

pic_finish(void *dummy __unused)
	/* Log ELCR information */

	/* Log MPTABLE information */
	mptable_pci_int_dump();

	MachIntrABI.finalize();

/*
 * Send an interrupt to a process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 */
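
/*
 * Rough sketch (added for illustration, assumptions marked): the frame
 * built below lands on the user stack approximately as
 *
 *	sfp -> struct sigframe {
 *		sf_uc	ucontext: sigmask, sigstack, mcontext (trapframe
 *			copy plus FPU state pushed by npxpush())
 *		sf_si	siginfo, filled only for SA_SIGINFO handlers
 *		sf_ahu	handler function pointer
 *	}
 *
 * with %rsp pointing at sfp, %rip at the signal trampoline placed just
 * below PS_STRINGS, and the handler arguments passed in %rdi, %rsi,
 * %rdx and %rcx per the x86_64 calling convention (see the register
 * assignments in the code below).
 */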
sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = lp->lwp_proc;
	struct trapframe *regs;
	struct sigacts *psp = p->p_sigacts;
	struct sigframe sf, *sfp;

	regs = lp->lwp_md.md_regs;
	oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;

	/* Save user context */
	bzero(&sf, sizeof(struct sigframe));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = lp->lwp_sigstk;
	sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
	KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0);
	/* gcc errors out on optimized bcopy */
	_bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe));

	/* Make the size of the saved context visible to userland */
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);

	/* Allocate and validate space for the signal handler context. */
	if ((lp->lwp_flags & LWP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size -
		     sizeof(struct sigframe);
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
		/* We take the red zone into account */
		sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;

	/*
	 * XXX AVX needs 64-byte alignment but sigframe has other fields and
	 * the embedded ucontext is not at the front, so aligning this won't
	 * help us.  Fortunately we bcopy in/out of the sigframe, so the
	 *
	 * The problem though is if userland winds up trying to use the
	 */
	sfp = (struct sigframe *)((intptr_t)sp & ~(intptr_t)0xF);

	/* Translate the signal if appropriate */
	if (p->p_sysent->sv_sigtbl) {
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];

	/*
	 * Build the argument list for the signal handler.
	 *
	 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
	 */
	regs->tf_rdi = sig;				/* argument 1 */
	regs->tf_rdx = (register_t)&sfp->sf_uc;		/* argument 3 */

	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/*
		 * Signal handler installed with SA_SIGINFO.
		 *
		 * action(signo, siginfo, ucontext)
		 */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* fill siginfo structure */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_pid = psp->ps_frominfo[sig].pid;
		sf.sf_si.si_uid = psp->ps_frominfo[sig].uid;
		sf.sf_si.si_code = code;
		sf.sf_si.si_addr = (void *)regs->tf_addr;
		/*
		 * Old FreeBSD-style arguments.
		 *
		 * handler (signo, code, [uc], addr)
		 */
		regs->tf_rsi = (register_t)code;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_handler = catcher;

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 =
		    &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);

	/*
	 * Save the FPU state and reinit the FP unit
	 */
	npxpush(&sf.sf_uc.uc_mcontext);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
		/*
		 * Something is wrong with the stack pointer.
		 * ...Kill the process.
		 */

	regs->tf_rsp = (register_t)sfp;
	regs->tf_rip = trunc_page64(PS_STRINGS - *(p->p_sysent->sv_szsigcode));
	regs->tf_rip -= SZSIGCODE_EXTRA_BYTES;

	/*
	 * The x86 ABI specifies that the direction flag must be cleared
	 */
	regs->tf_rflags &= ~(PSL_T | PSL_D);

	/*
	 * 64 bit mode has a code and stack selector but
	 * no data or extra selector.  %fs and %gs are not
	 */
	regs->tf_cs = _ucodesel;
	regs->tf_ss = _udatasel;

/*
 * Sanitize the trapframe for a virtual kernel passing control to a custom
 * VM context.  Remove any items that would otherwise create a privilege
 *
 * XXX at the moment we allow userland to set the resume flag.  Is this a
 */
cpu_sanitize_frame(struct trapframe *frame)
	frame->tf_cs = _ucodesel;
	frame->tf_ss = _udatasel;
	/* XXX VM (8086) mode not supported? */
	frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP);
	frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I;

/*
 * Sanitize the tls so loading the descriptor does not blow up
 * on us.  For x86_64 we don't have to do anything.
 */
cpu_sanitize_tls(struct savetls *tls)

/*
 * sigreturn(ucontext_t *sigcntxp)
 *
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 */
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
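
/*
 * Illustrative note (added commentary): EFL_SECURE() accepts the new
 * rflags only if every bit outside PSL_USERCHANGE matches the old
 * value, so e.g. an attempt to smuggle privileged bits such as IOPL
 * in through a modified signal context is rejected.  CS_SECURE()
 * insists the requested %cs selector is ring-3.
 */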
sys_sigreturn(struct sysmsg *sysmsg, const struct sigreturn_args *uap)
	struct lwp *lp = curthread->td_lwp;
	struct trapframe *regs;

	/*
	 * We have to copy the information into kernel space so userland
	 * can't modify it while we are sniffing it.
	 */
	regs = lp->lwp_md.md_regs;
	error = copyin(uap->sigcntxp, &uc, sizeof(uc));

	rflags = ucp->uc_mcontext.mc_rflags;

	/* VM (8086) mode not supported */
	rflags &= ~PSL_VM_UNSUPP;

	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (lp->lwp_thread->td_pcb->pcb_ext == 0)
		vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)

		/* go back to user mode if both flags are set */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(lp, SIGBUS, 0);

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = tf->tf_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
		tf->tf_gs = _udatasel;

	/*
	 * Don't allow users to change privileged or reserved flags.
	 *
	 * XXX do allow users to change the privileged flag PSL_RF.
	 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
	 * should sometimes set it there too.  tf_eflags is kept in
	 * the signal context during signal handling and there is no
	 * other place to remember it, so the PSL_RF bit may be
	 * corrupted by the signal handler without us knowing.
	 * Corruption of the PSL_RF bit at worst causes one more or
	 * one less debugger trap, so allowing it is fairly harmless.
	 */
	if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
		kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags);

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
	cs = ucp->uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		kprintf("sigreturn: cs = 0x%x\n", cs);
		trapsignal(lp, SIGBUS, T_PROTFLT);
	/* gcc errors out on optimized bcopy */
	_bcopy(&ucp->uc_mcontext.mc_rdi, regs,
	       sizeof(struct trapframe));

	/*
	 * Restore the FPU state from the frame
	 */
	npxpop(&ucp->uc_mcontext);

	if (ucp->uc_mcontext.mc_onstack & 1)
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	else
		lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;

	lp->lwp_sigmask = ucp->uc_sigmask;
	SIG_CANTMASK(lp->lwp_sigmask);
	return(EJUSTRETURN);
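
/*
 * Note (added commentary): EJUSTRETURN tells the syscall return path
 * not to modify the register state we just installed, so the
 * trapframe restored above reaches userland exactly as reconstructed.
 */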
/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet.
 * Possibly some stuff might be grafted back here from boot()
 */

/*
 * Shutdown the CPU as much as possible
 */
	__asm__ __volatile("hlt");

/*
 * cpu_idle() represents the idle LWKT.  You cannot return from this function
 * (unless you want to blow things up!).  Instead we look for runnable threads
 * and loop or halt as appropriate.  Giant is not held on entry to the thread.
 *
 * The main loop is entered with a critical section held, we must release
 * the critical section before doing anything else.  lwkt_switch() will
 * check for pending interrupts due to entering and exiting its own
 *
 * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
 *	 However, there are cases where the idlethread will be entered with
 *	 the possibility that no IPI will occur and in such cases
 *	 lwkt_switch() sets TDF_IDLE_NOHLT.
 *
 * NOTE: cpu_idle_repeat determines how many entries into the idle thread
 *	 must occur before it starts using ACPI halt.
 *
 * NOTE: Value overridden in hammer_time().
 */
static int cpu_idle_hlt = 2;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_repeat, CTLFLAG_RW,
    &cpu_idle_repeat, 0, "Idle entries before acpi hlt");

SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_hltcnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_HALT, sysctl_cpu_idle_cnt, "Q", "Idle loop entry halts");
SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_spincnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_SPIN, sysctl_cpu_idle_cnt, "Q", "Idle loop entry spins");

cpu_idle_default_hook(void)
	/*
	 * We must guarantee that hlt is exactly the instruction
	 * following the sti.
	 */
	__asm __volatile("sti; hlt");

/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default_hook;

cpu_mwait_cx_hint(struct cpu_idle_stat *stat)
	idx = (stat->repeat + stat->repeat_last + stat->repeat_delta) >>
	      cpu_mwait_repeat_shift;
	if (idx >= cpu_mwait_c1_hints_cnt) {
		/* Step up faster, once we walked through all C1 states */
		stat->repeat_delta += 1 << (cpu_mwait_repeat_shift + 1);
	if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		if (idx >= cpu_mwait_deep_hints_cnt)
			idx = cpu_mwait_deep_hints_cnt - 1;
		hint = cpu_mwait_deep_hints[idx];
		if (idx >= cpu_mwait_hints_cnt)
			idx = cpu_mwait_hints_cnt - 1;
		hint = cpu_mwait_hints[idx];

	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= 0 && cx_idx < CPU_MWAIT_CX_MAX)
		stat->mwait_cx[cx_idx]++;

	globaldata_t gd = mycpu;
	struct cpu_idle_stat *stat = &cpu_idle_stats[gd->gd_cpuid];
	struct thread *td __debugvar = gd->gd_curthread;

	stat->repeat = stat->repeat_last = cpu_idle_repeat_max;

	KKASSERT(td->td_critcount == 0);

	/*
	 * See if there are any LWKTs ready to go.
	 *
	 * When halting inside a cli we must check for reqflags
	 * races, particularly [re]schedule requests.  Running
	 * splz() does the job.
	 *
	 * 0	Never halt, just spin
	 *
	 * 1	Always use MONITOR/MWAIT if avail, HLT
	 *
	 *	Better default for modern (Haswell+) Intel
	 *
	 * 2	Use HLT/MONITOR/MWAIT up to a point and then
	 *	use the ACPI halt (default).  This is a hybrid
	 *	approach.  See machdep.cpu_idle_repeat.
	 *
	 *	Better default for modern AMD cpus and older
	 *
	 * 3	Always use the ACPI halt.  This typically
	 *	eats the least amount of power but the cpu
	 *	will be slow waking up.  Slows down e.g.
	 *	compiles and other pipe/event oriented stuff.
	 *
	 *	Usually the best default for AMD cpus.
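	 *
	 *	(Sketch of assumed usage, added for illustration: the
	 *	mode can be changed at runtime, e.g.
	 *	"sysctl machdep.cpu_idle_hlt=3"; the matching halt/spin
	 *	entry counters are machdep.cpu_idle_hltcnt and
	 *	machdep.cpu_idle_spincnt, declared above.)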
	 *
	 * NOTE: Interrupts are enabled and we are not in a critical
	 *
	 * NOTE: Preemptions do not reset gd_idle_repeat.  Also we
	 *	 don't bother capping gd_idle_repeat, it is ok if
	 *	 it overflows (we do make it unsigned, however).
	 *
	 *	 Implement optimized invltlb operations when halted
	 *	 in idle.  By setting the bit in smp_idleinvl_mask
	 *	 we inform other cpus that they can set _reqs to
	 *	 request an invltlb.  Currently the code to do that
	 *	 sets the bits in _reqs anyway, but then checks _mask
	 *	 to determine if they can assume the invltlb will execute.
	 *
	 *	 A critical section is required to ensure that interrupts
	 *	 do not fully run until after we've had a chance to execute
	 */
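
	/*
	 * Illustrative protocol sketch (added commentary): a remote cpu
	 * that wants a TLB invalidation sets our bit in smp_idleinvl_reqs
	 * and then tests smp_idleinvl_mask; if our bit is set there, we
	 * are known to be idling and will perform the invltlb ourselves
	 * when we wake, via the ATOMIC_CPUMASK_TESTANDCLR() calls below,
	 * so the remote cpu can skip sending an IPI.
	 */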
	if (gd->gd_idle_repeat == 0) {
		stat->repeat = (stat->repeat + stat->repeat_last) >> 1;
		if (stat->repeat > cpu_idle_repeat_max)
			stat->repeat = cpu_idle_repeat_max;
		stat->repeat_last = 0;
		stat->repeat_delta = 0;
	++stat->repeat_last;

	/*
	 * General idle thread halt code
	 *
	 * IBRS NOTES - IBRS is a SPECTRE mitigation.  When going
	 *		idle, disable IBRS to reduce hyperthread
	 */
	++gd->gd_idle_repeat;

	switch(cpu_idle_hlt) {
		__asm __volatile("sti");

		/*
		 * Use MONITOR/MWAIT (or HLT) for a few cycles,
		 * then start using the ACPI halt code if we
		 * continue to be idle.
		 */
		if (gd->gd_idle_repeat >= cpu_idle_repeat)

		/*
		 * Always use MONITOR/MWAIT (will use HLT if
		 * MONITOR/MWAIT not available).
		 */
		if (cpu_mi_feature & CPU_MI_MONITOR) {
			reqflags = gd->gd_reqflags;
			if (reqflags & RQF_IDLECHECK_WK_MASK)

			ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask, gd->gd_cpuid);
			if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
			    SPEC_CTRL_DUMMY_ENABLE) {
				wrmsr(MSR_SPEC_CTRL,
				      pscpu->trampoline.tr_pcb_spec_ctrl[1] &
				      (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP));
			cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
					  cpu_mwait_cx_hint(stat), 0);
			if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
			    SPEC_CTRL_DUMMY_ENABLE) {
				wrmsr(MSR_SPEC_CTRL,
				      pscpu->trampoline.tr_pcb_spec_ctrl[0] &
				      (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP));
			ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask, gd->gd_cpuid);
			if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,

		__asm __volatile("cli");
		if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
			ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask,
			if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
			    SPEC_CTRL_DUMMY_ENABLE) {
				wrmsr(MSR_SPEC_CTRL,
				      pscpu->trampoline.tr_pcb_spec_ctrl[1] &
				      (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP));
			cpu_idle_default_hook();
			if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
			    SPEC_CTRL_DUMMY_ENABLE) {
				wrmsr(MSR_SPEC_CTRL,
				      pscpu->trampoline.tr_pcb_spec_ctrl[0] &
				      (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP));
			ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
			if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
		__asm __volatile("sti");

		__asm __volatile("cli");
		if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
			ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask,
			if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
			    SPEC_CTRL_DUMMY_ENABLE) {
				wrmsr(MSR_SPEC_CTRL,
				      pscpu->trampoline.tr_pcb_spec_ctrl[1] &
				      (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP));
			if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
			    SPEC_CTRL_DUMMY_ENABLE) {
				wrmsr(MSR_SPEC_CTRL,
				      pscpu->trampoline.tr_pcb_spec_ctrl[0] &
				      (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP));
			ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
			if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
		__asm __volatile("sti");

/*
 * Called from deep ACPI via cpu_idle_hook() (see above) to actually halt
 * the cpu in C1.  ACPI might use other halt methods for deeper states
 * and not reach here.
 *
 * For now we always use HLT as we are not sure what ACPI may have actually
 * done.  MONITOR/MWAIT might not be appropriate.
 *
 * NOTE: MONITOR/MWAIT does not appear to throttle AMD cpus, while HLT
 *	 does.  On Intel, MONITOR/MWAIT does appear to throttle the cpu.
 */
	/* DISABLED FOR NOW */
	struct cpu_idle_stat *stat;

	if ((cpu_idle_hlt == 1 || cpu_idle_hlt == 2) &&
	    (cpu_mi_feature & CPU_MI_MONITOR) &&
	    cpu_vendor_id != CPU_VENDOR_AMD) {
		/*
		 * (NOTE: On Ryzen, MWAIT does not throttle clocks, so we
		 */
		stat = &cpu_idle_stats[gd->gd_cpuid];
		reqflags = gd->gd_reqflags;
		if ((reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
			__asm __volatile("sti");
			cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
					  cpu_mwait_cx_hint(stat), 0);
			__asm __volatile("sti; pause");

	if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0)
		__asm __volatile("sti; hlt");
	else
		__asm __volatile("sti; pause");

/*
 * Called in a loop indirectly via Xcpustop
 */
cpu_smp_stopped(void)
	globaldata_t gd = mycpu;
	volatile __uint64_t *ptr;

	ptr = CPUMASK_ADDR(started_cpus, gd->gd_cpuid);
	if ((ovalue & CPUMASK_SIMPLE(gd->gd_cpuid & 63)) == 0) {
		if (cpu_mi_feature & CPU_MI_MONITOR) {
			if (cpu_mwait_hints) {
				cpu_mmw_pause_long(__DEVOLATILE(void *, ptr),
				    cpu_mwait_hints_cnt - 1], 0);
				cpu_mmw_pause_long(__DEVOLATILE(void *, ptr),
			cpu_halt();	/* depend on lapic timer */

/*
 * This routine is called if a spinlock has been held through the
 * exponential backoff period and is seriously contested.  On a real cpu
 */
cpu_spinlock_contested(void)

/*
 * Clear registers on exec
 */
exec_setregs(u_long entry, u_long stack, u_long ps_strings)
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct pcb *pcb = td->td_pcb;
	struct trapframe *regs = lp->lwp_md.md_regs;

	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = entry;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */
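
	/*
	 * Note (added commentary): ((stack - 8) & ~0xF) + 8 makes %rsp
	 * congruent to 8 mod 16, i.e. the alignment the x86_64 ABI
	 * produces immediately after a 'call' pushes a return address
	 * onto a 16-byte aligned stack.
	 */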
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_rbx = ps_strings;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr7 = 0;	/* JG set bit 10? */
		if (pcb == td->td_pcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
		pcb->pcb_flags &= ~PCB_DBREGS;

	/*
	 * Initialize the math emulator (if any) for the current process.
	 * Actually, just clear the bit that says that the emulator has
	 * been initialized.  Initialization is delayed until the process
	 * traps to the emulator (if it is done at all) mainly because
	 * emulators don't provide an entry point for initialization.
	 */
	pcb->pcb_flags &= ~FP_SOFTFP;

	/*
	 * NOTE: do not set CR0_TS here.  npxinit() must do it after clearing
	 *	 gd_npxthread.  Otherwise a preemptive interrupt thread
	 *	 may panic in npxdna().
	 */
	load_cr0(rcr0() | CR0_MP);

	/*
	 * NOTE: The MSR values must be correct so we can return to
	 *	 userland.  gd_user_fs/gs must be correct so the switch
	 *	 code knows what the current MSR values are.
	 */
	pcb->pcb_fsbase = 0;	/* Values loaded from PCB on switch */
	pcb->pcb_gsbase = 0;
	mdcpu->gd_user_fs = 0;	/* Cache of current MSR values */
	mdcpu->gd_user_gs = 0;
	wrmsr(MSR_FSBASE, 0);	/* Set MSR values for return to userland */
	wrmsr(MSR_KGSBASE, 0);

	/* Initialize the npx (if any) for the current process. */

	pcb->pcb_ds = _udatasel;
	pcb->pcb_es = _udatasel;
	pcb->pcb_fs = _udatasel;
	pcb->pcb_gs = _udatasel;

	cr0 |= CR0_NE;			/* Done by npxinit() */
	cr0 |= CR0_MP | CR0_TS;		/* Done at every execve() too. */
	cr0 |= CR0_WP | CR0_AM;

sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
	if (!error && req->newptr)

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
	&adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
	CTLFLAG_RW, &disable_rtc_set, 0, "");

SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
	CTLFLAG_RD, &bootinfo, bootinfo, "");

SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
	CTLFLAG_RW, &wall_cmos_clock, 0, "");

efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
	struct efi_map_header *efihdr;

	kmdp = preload_search_by_type("elf kernel");
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));

SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
/*
 * Initialize x86 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */
struct user_segment_descriptor gdt_cpu0[MAXGDT_COUNT];
struct gate_descriptor idt_arr[MAXCPU][NIDT];
union descriptor ldt[NLDT];		/* local descriptor table */

/* table descriptors - used to load tables by cpu */
struct region_descriptor r_gdt;
struct region_descriptor r_idt_arr[MAXCPU];

/* JG proc0paddr is a virtual address */
char proc0paddr_buff[LWKT_THREAD_STACK];

/* software prototypes -- in more palatable form */
struct soft_segment_descriptor gdt_segs[] = {
	/* GNULL_SEL	0 Null Descriptor */
	{	0x0,		/* segment base address */
		0,		/* segment type */
		0,		/* segment descriptor priority level */
		0,		/* segment descriptor present */
		0,		/* default 32 vs 16 bit size */
		0		/* limit granularity (byte/page units)*/ },
	/* GCODE_SEL	1 Code Descriptor for kernel */
	{	0x0,		/* segment base address */
		0xfffff,	/* length - all address space */
		SDT_MEMERA,	/* segment type */
		SEL_KPL,	/* segment descriptor priority level */
		1,		/* segment descriptor present */
		0,		/* default 32 vs 16 bit size */
		1		/* limit granularity (byte/page units)*/ },
	/* GDATA_SEL	2 Data Descriptor for kernel */
	{	0x0,		/* segment base address */
		0xfffff,	/* length - all address space */
		SDT_MEMRWA,	/* segment type */
		SEL_KPL,	/* segment descriptor priority level */
		1,		/* segment descriptor present */
		0,		/* default 32 vs 16 bit size */
		1		/* limit granularity (byte/page units)*/ },
	/* GUCODE32_SEL	3 32 bit Code Descriptor for user */
	{	0x0,		/* segment base address */
		0xfffff,	/* length - all address space */
		SDT_MEMERA,	/* segment type */
		SEL_UPL,	/* segment descriptor priority level */
		1,		/* segment descriptor present */
		1,		/* default 32 vs 16 bit size */
		1		/* limit granularity (byte/page units)*/ },
	/* GUDATA_SEL	4 32/64 bit Data Descriptor for user */
	{	0x0,		/* segment base address */
		0xfffff,	/* length - all address space */
		SDT_MEMRWA,	/* segment type */
		SEL_UPL,	/* segment descriptor priority level */
		1,		/* segment descriptor present */
		1,		/* default 32 vs 16 bit size */
		1		/* limit granularity (byte/page units)*/ },
	/* GUCODE_SEL	5 64 bit Code Descriptor for user */
	{	0x0,		/* segment base address */
		0xfffff,	/* length - all address space */
		SDT_MEMERA,	/* segment type */
		SEL_UPL,	/* segment descriptor priority level */
		1,		/* segment descriptor present */
		0,		/* default 32 vs 16 bit size */
		1		/* limit granularity (byte/page units)*/ },
	/* GPROC0_SEL	6 Proc 0 Tss Descriptor */
	{
		0x0,			/* segment base address */
		sizeof(struct x86_64tss)-1,/* length - all address space */
		SDT_SYSTSS,	/* segment type */
		SEL_KPL,	/* segment descriptor priority level */
		1,		/* segment descriptor present */
		0,		/* unused - default 32 vs 16 bit size */
		0		/* limit granularity (byte/page units)*/ },
	/* Actually, the TSS is a system descriptor which is double size */
	{	0x0,		/* segment base address */
		0,		/* segment type */
		0,		/* segment descriptor priority level */
		0,		/* segment descriptor present */
		0,		/* default 32 vs 16 bit size */
		0		/* limit granularity (byte/page units)*/ },
	/* GUGS32_SEL	8 32 bit GS Descriptor for user */
	{	0x0,		/* segment base address */
		0xfffff,	/* length - all address space */
		SDT_MEMRWA,	/* segment type */
		SEL_UPL,	/* segment descriptor priority level */
		1,		/* segment descriptor present */
		1,		/* default 32 vs 16 bit size */
		1		/* limit granularity (byte/page units)*/ },
setidt_global(int idx, inthand_t *func, int typ, int dpl, int ist)
	for (cpu = 0; cpu < MAXCPU; ++cpu) {
		struct gate_descriptor *ip = &idt_arr[cpu][idx];

		ip->gd_looffset = (uintptr_t)func;
		ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
		ip->gd_hioffset = ((uintptr_t)func)>>16 ;

setidt(int idx, inthand_t *func, int typ, int dpl, int ist, int cpu)
	struct gate_descriptor *ip;

	KASSERT(cpu >= 0 && cpu < ncpus, ("invalid cpu %d", cpu));

	ip = &idt_arr[cpu][idx];
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_hioffset = ((uintptr_t)func)>>16 ;

#define	IDTVEC(name)	__CONCAT(X,name)

	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);

	IDTVEC(rsvd00), IDTVEC(rsvd01), IDTVEC(rsvd02), IDTVEC(rsvd03),
	IDTVEC(rsvd04), IDTVEC(rsvd05), IDTVEC(rsvd06), IDTVEC(rsvd07),
	IDTVEC(rsvd08), IDTVEC(rsvd09), IDTVEC(rsvd0a), IDTVEC(rsvd0b),
	IDTVEC(rsvd0c), IDTVEC(rsvd0d), IDTVEC(rsvd0e), IDTVEC(rsvd0f),
	IDTVEC(rsvd10), IDTVEC(rsvd11), IDTVEC(rsvd12), IDTVEC(rsvd13),
	IDTVEC(rsvd14), IDTVEC(rsvd15), IDTVEC(rsvd16), IDTVEC(rsvd17),
	IDTVEC(rsvd18), IDTVEC(rsvd19), IDTVEC(rsvd1a), IDTVEC(rsvd1b),
	IDTVEC(rsvd1c), IDTVEC(rsvd1d), IDTVEC(rsvd1e), IDTVEC(rsvd1f),
	IDTVEC(rsvd20), IDTVEC(rsvd21), IDTVEC(rsvd22), IDTVEC(rsvd23),
	IDTVEC(rsvd24), IDTVEC(rsvd25), IDTVEC(rsvd26), IDTVEC(rsvd27),
	IDTVEC(rsvd28), IDTVEC(rsvd29), IDTVEC(rsvd2a), IDTVEC(rsvd2b),
	IDTVEC(rsvd2c), IDTVEC(rsvd2d), IDTVEC(rsvd2e), IDTVEC(rsvd2f),
	IDTVEC(rsvd30), IDTVEC(rsvd31), IDTVEC(rsvd32), IDTVEC(rsvd33),
	IDTVEC(rsvd34), IDTVEC(rsvd35), IDTVEC(rsvd36), IDTVEC(rsvd37),
	IDTVEC(rsvd38), IDTVEC(rsvd39), IDTVEC(rsvd3a), IDTVEC(rsvd3b),
	IDTVEC(rsvd3c), IDTVEC(rsvd3d), IDTVEC(rsvd3e), IDTVEC(rsvd3f),
	IDTVEC(rsvd40), IDTVEC(rsvd41), IDTVEC(rsvd42), IDTVEC(rsvd43),
	IDTVEC(rsvd44), IDTVEC(rsvd45), IDTVEC(rsvd46), IDTVEC(rsvd47),
	IDTVEC(rsvd48), IDTVEC(rsvd49), IDTVEC(rsvd4a), IDTVEC(rsvd4b),
	IDTVEC(rsvd4c), IDTVEC(rsvd4d), IDTVEC(rsvd4e), IDTVEC(rsvd4f),
	IDTVEC(rsvd50), IDTVEC(rsvd51), IDTVEC(rsvd52), IDTVEC(rsvd53),
	IDTVEC(rsvd54), IDTVEC(rsvd55), IDTVEC(rsvd56), IDTVEC(rsvd57),
	IDTVEC(rsvd58), IDTVEC(rsvd59), IDTVEC(rsvd5a), IDTVEC(rsvd5b),
	IDTVEC(rsvd5c), IDTVEC(rsvd5d), IDTVEC(rsvd5e), IDTVEC(rsvd5f),
	IDTVEC(rsvd60), IDTVEC(rsvd61), IDTVEC(rsvd62), IDTVEC(rsvd63),
	IDTVEC(rsvd64), IDTVEC(rsvd65), IDTVEC(rsvd66), IDTVEC(rsvd67),
	IDTVEC(rsvd68), IDTVEC(rsvd69), IDTVEC(rsvd6a), IDTVEC(rsvd6b),
	IDTVEC(rsvd6c), IDTVEC(rsvd6d), IDTVEC(rsvd6e), IDTVEC(rsvd6f),
	IDTVEC(rsvd70), IDTVEC(rsvd71), IDTVEC(rsvd72), IDTVEC(rsvd73),
	IDTVEC(rsvd74), IDTVEC(rsvd75), IDTVEC(rsvd76), IDTVEC(rsvd77),
	IDTVEC(rsvd78), IDTVEC(rsvd79), IDTVEC(rsvd7a), IDTVEC(rsvd7b),
	IDTVEC(rsvd7c), IDTVEC(rsvd7d), IDTVEC(rsvd7e), IDTVEC(rsvd7f),
	IDTVEC(rsvd80), IDTVEC(rsvd81), IDTVEC(rsvd82), IDTVEC(rsvd83),
	IDTVEC(rsvd84), IDTVEC(rsvd85), IDTVEC(rsvd86), IDTVEC(rsvd87),
	IDTVEC(rsvd88), IDTVEC(rsvd89), IDTVEC(rsvd8a), IDTVEC(rsvd8b),
	IDTVEC(rsvd8c), IDTVEC(rsvd8d), IDTVEC(rsvd8e), IDTVEC(rsvd8f),
	IDTVEC(rsvd90), IDTVEC(rsvd91), IDTVEC(rsvd92), IDTVEC(rsvd93),
	IDTVEC(rsvd94), IDTVEC(rsvd95), IDTVEC(rsvd96), IDTVEC(rsvd97),
	IDTVEC(rsvd98), IDTVEC(rsvd99), IDTVEC(rsvd9a), IDTVEC(rsvd9b),
	IDTVEC(rsvd9c), IDTVEC(rsvd9d), IDTVEC(rsvd9e), IDTVEC(rsvd9f),
	IDTVEC(rsvda0), IDTVEC(rsvda1), IDTVEC(rsvda2), IDTVEC(rsvda3),
	IDTVEC(rsvda4), IDTVEC(rsvda5), IDTVEC(rsvda6), IDTVEC(rsvda7),
	IDTVEC(rsvda8), IDTVEC(rsvda9), IDTVEC(rsvdaa), IDTVEC(rsvdab),
	IDTVEC(rsvdac), IDTVEC(rsvdad), IDTVEC(rsvdae), IDTVEC(rsvdaf),
	IDTVEC(rsvdb0), IDTVEC(rsvdb1), IDTVEC(rsvdb2), IDTVEC(rsvdb3),
	IDTVEC(rsvdb4), IDTVEC(rsvdb5), IDTVEC(rsvdb6), IDTVEC(rsvdb7),
	IDTVEC(rsvdb8), IDTVEC(rsvdb9), IDTVEC(rsvdba), IDTVEC(rsvdbb),
	IDTVEC(rsvdbc), IDTVEC(rsvdbd), IDTVEC(rsvdbe), IDTVEC(rsvdbf),
	IDTVEC(rsvdc0), IDTVEC(rsvdc1), IDTVEC(rsvdc2), IDTVEC(rsvdc3),
	IDTVEC(rsvdc4), IDTVEC(rsvdc5), IDTVEC(rsvdc6), IDTVEC(rsvdc7),
	IDTVEC(rsvdc8), IDTVEC(rsvdc9), IDTVEC(rsvdca), IDTVEC(rsvdcb),
	IDTVEC(rsvdcc), IDTVEC(rsvdcd), IDTVEC(rsvdce), IDTVEC(rsvdcf),
	IDTVEC(rsvdd0), IDTVEC(rsvdd1), IDTVEC(rsvdd2), IDTVEC(rsvdd3),
	IDTVEC(rsvdd4), IDTVEC(rsvdd5), IDTVEC(rsvdd6), IDTVEC(rsvdd7),
	IDTVEC(rsvdd8), IDTVEC(rsvdd9), IDTVEC(rsvdda), IDTVEC(rsvddb),
	IDTVEC(rsvddc), IDTVEC(rsvddd), IDTVEC(rsvdde), IDTVEC(rsvddf),
	IDTVEC(rsvde0), IDTVEC(rsvde1), IDTVEC(rsvde2), IDTVEC(rsvde3),
	IDTVEC(rsvde4), IDTVEC(rsvde5), IDTVEC(rsvde6), IDTVEC(rsvde7),
	IDTVEC(rsvde8), IDTVEC(rsvde9), IDTVEC(rsvdea), IDTVEC(rsvdeb),
	IDTVEC(rsvdec), IDTVEC(rsvded), IDTVEC(rsvdee), IDTVEC(rsvdef),
	IDTVEC(rsvdf0), IDTVEC(rsvdf1), IDTVEC(rsvdf2), IDTVEC(rsvdf3),
	IDTVEC(rsvdf4), IDTVEC(rsvdf5), IDTVEC(rsvdf6), IDTVEC(rsvdf7),
	IDTVEC(rsvdf8), IDTVEC(rsvdf9), IDTVEC(rsvdfa), IDTVEC(rsvdfb),
	IDTVEC(rsvdfc), IDTVEC(rsvdfd), IDTVEC(rsvdfe), IDTVEC(rsvdff);
inthand_t *rsvdary[NIDT] = {
	&IDTVEC(rsvd00), &IDTVEC(rsvd01), &IDTVEC(rsvd02), &IDTVEC(rsvd03),
	&IDTVEC(rsvd04), &IDTVEC(rsvd05), &IDTVEC(rsvd06), &IDTVEC(rsvd07),
	&IDTVEC(rsvd08), &IDTVEC(rsvd09), &IDTVEC(rsvd0a), &IDTVEC(rsvd0b),
	&IDTVEC(rsvd0c), &IDTVEC(rsvd0d), &IDTVEC(rsvd0e), &IDTVEC(rsvd0f),
	&IDTVEC(rsvd10), &IDTVEC(rsvd11), &IDTVEC(rsvd12), &IDTVEC(rsvd13),
	&IDTVEC(rsvd14), &IDTVEC(rsvd15), &IDTVEC(rsvd16), &IDTVEC(rsvd17),
	&IDTVEC(rsvd18), &IDTVEC(rsvd19), &IDTVEC(rsvd1a), &IDTVEC(rsvd1b),
	&IDTVEC(rsvd1c), &IDTVEC(rsvd1d), &IDTVEC(rsvd1e), &IDTVEC(rsvd1f),
	&IDTVEC(rsvd20), &IDTVEC(rsvd21), &IDTVEC(rsvd22), &IDTVEC(rsvd23),
	&IDTVEC(rsvd24), &IDTVEC(rsvd25), &IDTVEC(rsvd26), &IDTVEC(rsvd27),
	&IDTVEC(rsvd28), &IDTVEC(rsvd29), &IDTVEC(rsvd2a), &IDTVEC(rsvd2b),
	&IDTVEC(rsvd2c), &IDTVEC(rsvd2d), &IDTVEC(rsvd2e), &IDTVEC(rsvd2f),
	&IDTVEC(rsvd30), &IDTVEC(rsvd31), &IDTVEC(rsvd32), &IDTVEC(rsvd33),
	&IDTVEC(rsvd34), &IDTVEC(rsvd35), &IDTVEC(rsvd36), &IDTVEC(rsvd37),
	&IDTVEC(rsvd38), &IDTVEC(rsvd39), &IDTVEC(rsvd3a), &IDTVEC(rsvd3b),
	&IDTVEC(rsvd3c), &IDTVEC(rsvd3d), &IDTVEC(rsvd3e), &IDTVEC(rsvd3f),
	&IDTVEC(rsvd40), &IDTVEC(rsvd41), &IDTVEC(rsvd42), &IDTVEC(rsvd43),
	&IDTVEC(rsvd44), &IDTVEC(rsvd45), &IDTVEC(rsvd46), &IDTVEC(rsvd47),
	&IDTVEC(rsvd48), &IDTVEC(rsvd49), &IDTVEC(rsvd4a), &IDTVEC(rsvd4b),
	&IDTVEC(rsvd4c), &IDTVEC(rsvd4d), &IDTVEC(rsvd4e), &IDTVEC(rsvd4f),
	&IDTVEC(rsvd50), &IDTVEC(rsvd51), &IDTVEC(rsvd52), &IDTVEC(rsvd53),
	&IDTVEC(rsvd54), &IDTVEC(rsvd55), &IDTVEC(rsvd56), &IDTVEC(rsvd57),
	&IDTVEC(rsvd58), &IDTVEC(rsvd59), &IDTVEC(rsvd5a), &IDTVEC(rsvd5b),
	&IDTVEC(rsvd5c), &IDTVEC(rsvd5d), &IDTVEC(rsvd5e), &IDTVEC(rsvd5f),
	&IDTVEC(rsvd60), &IDTVEC(rsvd61), &IDTVEC(rsvd62), &IDTVEC(rsvd63),
	&IDTVEC(rsvd64), &IDTVEC(rsvd65), &IDTVEC(rsvd66), &IDTVEC(rsvd67),
	&IDTVEC(rsvd68), &IDTVEC(rsvd69), &IDTVEC(rsvd6a), &IDTVEC(rsvd6b),
	&IDTVEC(rsvd6c), &IDTVEC(rsvd6d), &IDTVEC(rsvd6e), &IDTVEC(rsvd6f),
	&IDTVEC(rsvd70), &IDTVEC(rsvd71), &IDTVEC(rsvd72), &IDTVEC(rsvd73),
	&IDTVEC(rsvd74), &IDTVEC(rsvd75), &IDTVEC(rsvd76), &IDTVEC(rsvd77),
	&IDTVEC(rsvd78), &IDTVEC(rsvd79), &IDTVEC(rsvd7a), &IDTVEC(rsvd7b),
	&IDTVEC(rsvd7c), &IDTVEC(rsvd7d), &IDTVEC(rsvd7e), &IDTVEC(rsvd7f),
	&IDTVEC(rsvd80), &IDTVEC(rsvd81), &IDTVEC(rsvd82), &IDTVEC(rsvd83),
	&IDTVEC(rsvd84), &IDTVEC(rsvd85), &IDTVEC(rsvd86), &IDTVEC(rsvd87),
	&IDTVEC(rsvd88), &IDTVEC(rsvd89), &IDTVEC(rsvd8a), &IDTVEC(rsvd8b),
	&IDTVEC(rsvd8c), &IDTVEC(rsvd8d), &IDTVEC(rsvd8e), &IDTVEC(rsvd8f),
	&IDTVEC(rsvd90), &IDTVEC(rsvd91), &IDTVEC(rsvd92), &IDTVEC(rsvd93),
	&IDTVEC(rsvd94), &IDTVEC(rsvd95), &IDTVEC(rsvd96), &IDTVEC(rsvd97),
	&IDTVEC(rsvd98), &IDTVEC(rsvd99), &IDTVEC(rsvd9a), &IDTVEC(rsvd9b),
	&IDTVEC(rsvd9c), &IDTVEC(rsvd9d), &IDTVEC(rsvd9e), &IDTVEC(rsvd9f),
	&IDTVEC(rsvda0), &IDTVEC(rsvda1), &IDTVEC(rsvda2), &IDTVEC(rsvda3),
	&IDTVEC(rsvda4), &IDTVEC(rsvda5), &IDTVEC(rsvda6), &IDTVEC(rsvda7),
	&IDTVEC(rsvda8), &IDTVEC(rsvda9), &IDTVEC(rsvdaa), &IDTVEC(rsvdab),
	&IDTVEC(rsvdac), &IDTVEC(rsvdad), &IDTVEC(rsvdae), &IDTVEC(rsvdaf),
	&IDTVEC(rsvdb0), &IDTVEC(rsvdb1), &IDTVEC(rsvdb2), &IDTVEC(rsvdb3),
	&IDTVEC(rsvdb4), &IDTVEC(rsvdb5), &IDTVEC(rsvdb6), &IDTVEC(rsvdb7),
	&IDTVEC(rsvdb8), &IDTVEC(rsvdb9), &IDTVEC(rsvdba), &IDTVEC(rsvdbb),
	&IDTVEC(rsvdbc), &IDTVEC(rsvdbd), &IDTVEC(rsvdbe), &IDTVEC(rsvdbf),
	&IDTVEC(rsvdc0), &IDTVEC(rsvdc1), &IDTVEC(rsvdc2), &IDTVEC(rsvdc3),
	&IDTVEC(rsvdc4), &IDTVEC(rsvdc5), &IDTVEC(rsvdc6), &IDTVEC(rsvdc7),
	&IDTVEC(rsvdc8), &IDTVEC(rsvdc9), &IDTVEC(rsvdca), &IDTVEC(rsvdcb),
	&IDTVEC(rsvdcc), &IDTVEC(rsvdcd), &IDTVEC(rsvdce), &IDTVEC(rsvdcf),
	&IDTVEC(rsvdd0), &IDTVEC(rsvdd1), &IDTVEC(rsvdd2), &IDTVEC(rsvdd3),
	&IDTVEC(rsvdd4), &IDTVEC(rsvdd5), &IDTVEC(rsvdd6), &IDTVEC(rsvdd7),
	&IDTVEC(rsvdd8), &IDTVEC(rsvdd9), &IDTVEC(rsvdda), &IDTVEC(rsvddb),
	&IDTVEC(rsvddc), &IDTVEC(rsvddd), &IDTVEC(rsvdde), &IDTVEC(rsvddf),
	&IDTVEC(rsvde0), &IDTVEC(rsvde1), &IDTVEC(rsvde2), &IDTVEC(rsvde3),
	&IDTVEC(rsvde4), &IDTVEC(rsvde5), &IDTVEC(rsvde6), &IDTVEC(rsvde7),
	&IDTVEC(rsvde8), &IDTVEC(rsvde9), &IDTVEC(rsvdea), &IDTVEC(rsvdeb),
	&IDTVEC(rsvdec), &IDTVEC(rsvded), &IDTVEC(rsvdee), &IDTVEC(rsvdef),
	&IDTVEC(rsvdf0), &IDTVEC(rsvdf1), &IDTVEC(rsvdf2), &IDTVEC(rsvdf3),
	&IDTVEC(rsvdf4), &IDTVEC(rsvdf5), &IDTVEC(rsvdf6), &IDTVEC(rsvdf7),
	&IDTVEC(rsvdf8), &IDTVEC(rsvdf9), &IDTVEC(rsvdfa), &IDTVEC(rsvdfb),
	&IDTVEC(rsvdfc), &IDTVEC(rsvdfd), &IDTVEC(rsvdfe), &IDTVEC(rsvdff)
sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;

ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
	sd->sd_lobase  = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase  = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type    = ssd->ssd_type;
	sd->sd_dpl     = ssd->ssd_dpl;
	sd->sd_p       = ssd->ssd_p;
	sd->sd_long    = ssd->ssd_long;
	sd->sd_def32   = ssd->ssd_def32;
	sd->sd_gran    = ssd->ssd_gran;

ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
	sd->sd_lobase  = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase  = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type    = ssd->ssd_type;
	sd->sd_dpl     = ssd->ssd_dpl;
	sd->sd_p       = ssd->ssd_p;
	sd->sd_gran    = ssd->ssd_gran;

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * If we cannot accurately determine the physical memory map, then use
 * the value from the 0xE801 call, and failing that, the RTC.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * Memory is aligned to PHYSMAP_ALIGN which must be a multiple
 * of PAGE_SIZE.  This also greatly reduces the memory test time
 * which would otherwise be excessive on machines with > 8G of ram.
 *
 * XXX first should be vm_paddr_t.
 */
1961 #define PHYSMAP_ALIGN (vm_paddr_t)(128 * 1024)
1962 #define PHYSMAP_ALIGN_MASK (vm_paddr_t)(PHYSMAP_ALIGN - 1)
1963 #define PHYSMAP_SIZE VM_PHYSSEG_MAX
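
/*
 * Illustrative sketch (not part of the original source): how the
 * PHYSMAP_ALIGN mask rounds a segment inward in getmemsize().  The
 * base rounds up and the end rounds down, so a segment smaller than
 * PHYSMAP_ALIGN collapses and is dropped by the compaction loop.  The
 * sample values below are hypothetical.
 */
#if 0
	vm_paddr_t base = 0x1234567;
	vm_paddr_t end  = 0x1ffffff;

	base = (base + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;
						/* 0x1240000, rounded up */
	end &= ~PHYSMAP_ALIGN_MASK;		/* 0x1fe0000, rounded down */
	/* the segment survives only if base < end after rounding */
#endif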
vm_paddr_t physmap[PHYSMAP_SIZE];
struct bios_smap *smapbase, *smap, *smapend;
struct efi_map_header *efihdrbase;

#define PHYSMAP_HANDWAVE	(vm_paddr_t)(2 * 1024 * 1024)
#define PHYSMAP_HANDWAVE_MASK	(PHYSMAP_HANDWAVE - 1)
add_smap_entries(int *physmap_idx)
	smapsize = *((u_int32_t *)smapbase - 1);
	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			kprintf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;
		if (smap->length == 0)
			continue;

		for (i = 0; i <= *physmap_idx; i += 2) {
			if (smap->base < physmap[i + 1]) {
				if (boothowto & RB_VERBOSE) {
					kprintf("Overlapping or non-monotonic "
						"memory region, ignoring "

		if (i <= *physmap_idx)
			continue;

		Realmem += smap->length;

		/*
		 * NOTE: This little bit of code initially expands
		 * physmap[1] as well as later entries.
		 */
		if (smap->base == physmap[*physmap_idx + 1]) {
			physmap[*physmap_idx + 1] += smap->length;

		if (*physmap_idx == PHYSMAP_SIZE) {
			kprintf("Too many segments in the physical "
				"address map, giving up\n");

		physmap[*physmap_idx] = smap->base;
		physmap[*physmap_idx + 1] = smap->base + smap->length;
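
/*
 * Illustrative note (not part of the original source): the loader
 * stores the BIOS e820 map as a u_int32_t byte count immediately
 * followed by the packed bios_smap entries ("size value precedes
 * data"), which is why add_smap_entries() can read the size via
 * ((u_int32_t *)smapbase - 1).  Roughly:
 *
 *	+-----------+------------------+------------------+----
 *	| u_int32_t | struct bios_smap | struct bios_smap | ...
 *	| smapsize  | base,length,type | base,length,type |
 *	+-----------+------------------+------------------+----
 *	            ^ smapbase
 *	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize)
 */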
add_efi_map_entries(int *physmap_idx)
	struct efi_md *map, *p;
	static const char *types[] = {
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"ACPIReclaimMemory",
		"MemoryMappedIOPortSpace",

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdrbase + efisz);

	if (efihdrbase->descriptor_size == 0)
		return;
	ndesc = efihdrbase->memory_size / efihdrbase->descriptor_size;

	if (boothowto & RB_VERBOSE)
		kprintf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdrbase->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type <= EFI_MD_TYPE_PALCODE)
				type = types[p->md_type];
			kprintf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
			if (p->md_attr & EFI_MD_ATTR_WC)
			if (p->md_attr & EFI_MD_ATTR_WT)
			if (p->md_attr & EFI_MD_ATTR_WB)
			if (p->md_attr & EFI_MD_ATTR_UCE)
			if (p->md_attr & EFI_MD_ATTR_WP)
			if (p->md_attr & EFI_MD_ATTR_RP)
			if (p->md_attr & EFI_MD_ATTR_XP)
			if (p->md_attr & EFI_MD_ATTR_RT)

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */

		Realmem += p->md_pages * PAGE_SIZE;

		/*
		 * NOTE: This little bit of code initially expands
		 * physmap[1] as well as later entries.
		 */
		if (p->md_phys == physmap[*physmap_idx + 1]) {
			physmap[*physmap_idx + 1] += p->md_pages * PAGE_SIZE;

		if (*physmap_idx == PHYSMAP_SIZE) {
			kprintf("Too many segments in the physical "
				"address map, giving up\n");

		physmap[*physmap_idx] = p->md_phys;
		physmap[*physmap_idx + 1] = p->md_phys + p->md_pages * PAGE_SIZE;
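
/*
 * Illustrative sketch (not part of the original source): UEFI memory
 * descriptors must be walked using the firmware-reported
 * descriptor_size rather than sizeof(struct efi_md), since firmware
 * may append vendor data to each record.  An efi_next_descriptor()
 * style step is essentially byte-pointer arithmetic; efi_md_step() is
 * a hypothetical helper.
 */
#if 0
static struct efi_md *
efi_md_step(struct efi_md *p, size_t dsize)
{
	return ((struct efi_md *)((uint8_t *)p + dsize));
}
#endif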
struct fb_info efi_fb_info;
static int have_efi_framebuffer = 0;

efi_fb_init_vaddr(int direct_map)
	vm_offset_t addr, v;

	v = efi_fb_info.vaddr;
	sz = efi_fb_info.stride * efi_fb_info.height;

		addr = PHYS_TO_DMAP(efi_fb_info.paddr);
		if (addr >= DMAP_MIN_ADDRESS && addr + sz <= DMapMaxAddress)
			efi_fb_info.vaddr = addr;

			(vm_offset_t)pmap_mapdev_attr(efi_fb_info.paddr,
			    PAT_WRITE_COMBINING);
efifb_color_depth(struct efi_fb *efifb)
	mask = efifb->fb_mask_red | efifb->fb_mask_green |
	       efifb->fb_mask_blue | efifb->fb_mask_reserved;

	for (depth = 1; mask != 1; depth++)
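
/*
 * Illustrative sketch (not part of the original source): the depth is
 * the position of the highest set bit in the combined channel masks,
 * plus one; the elided loop body presumably shifts the mask right one
 * bit per pass.  For a typical 8:8:8:8 framebuffer:
 */
#if 0
	uint32_t mask = 0xff000000 | 0x00ff0000 | 0x0000ff00 | 0x000000ff;
	int depth;

	for (depth = 1; mask != 1; depth++)
		mask >>= 1;
	/* mask == 0xffffffff yields depth == 32 */
#endif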
probe_efi_fb(int early)
	struct efi_fb *efifb;

	if (have_efi_framebuffer) {
		    (efi_fb_info.vaddr == 0 ||
		     efi_fb_info.vaddr == PHYS_TO_DMAP(efi_fb_info.paddr)))
			efi_fb_init_vaddr(0);

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efifb = (struct efi_fb *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_FB);

	depth = efifb_color_depth(efifb);
	/*
	 * Our bootloader should already have noticed if we are unable to
	 * use the UEFI framebuffer.
	 */
	if (depth != 24 && depth != 32)
	have_efi_framebuffer = 1;

	efi_fb_info.is_vga_boot_display = 1;
	efi_fb_info.width = efifb->fb_width;
	efi_fb_info.height = efifb->fb_height;
	efi_fb_info.depth = depth;
	efi_fb_info.stride = efifb->fb_stride * (depth / 8);
	efi_fb_info.paddr = efifb->fb_addr;
	efi_fb_info.vaddr = 0;
	efi_fb_init_vaddr(0);
	efi_fb_info.fbops.fb_set_par = NULL;
	efi_fb_info.fbops.fb_blank = NULL;
	efi_fb_info.fbops.fb_debug_enter = NULL;
	efi_fb_info.device = NULL;

efifb_startup(void *arg)

SYSINIT(efi_fb_info, SI_BOOT1_POST, SI_ORDER_FIRST, efifb_startup, NULL);
getmemsize(caddr_t kmdp, u_int64_t first)
	int off, physmap_idx, pa_indx, da_indx;
	vm_paddr_t msgbuf_size;
	u_long physmem_tunable;
	quad_t dcons_addr, dcons_size;

	bzero(physmap, sizeof(physmap));

	/*
	 * get memory map from INT 15:E820, kindly supplied by the loader.
	 *
	 * subr_module.c says:
	 *	"Consumer may safely assume that size value precedes data."
	 * i.e., an int32_t immediately precedes smap.
	 */
	efihdrbase = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL && efihdrbase == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdrbase == NULL)
		add_smap_entries(&physmap_idx);
	else
		add_efi_map_entries(&physmap_idx);
	base_memory = physmap[1] / 1024;
	/* make hole for AP bootstrap code */
	physmap[1] = mp_bootaddress(base_memory);

	/* Save EBDA address, if any */
	ebda_addr = (u_long)(*(u_short *)(KERNBASE + 0x40e));

	/*
	 * Maxmem isn't the "maximum memory"; it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	/*
	 * Blowing out the DMAP will blow up the system.
	 */
	if (Maxmem > atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS)) {
		kprintf("Limiting Maxmem due to DMAP size\n");
		Maxmem = atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS);
	}

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE)) {
		kprintf("Physical memory use set to %ldK\n", Maxmem * 4);
	}

	/*
	 * Call pmap initialization to make new kernel address space
	 */
	pmap_bootstrap(&first);
	physmap[0] = PAGE_SIZE;
	/*
	 * Align the physmap to PHYSMAP_ALIGN and cut out anything
	 */
	for (i = j = 0; i <= physmap_idx; i += 2) {
		if (physmap[i+1] > ptoa(Maxmem))
			physmap[i+1] = ptoa(Maxmem);
		physmap[i] = (physmap[i] + PHYSMAP_ALIGN_MASK) &
			     ~PHYSMAP_ALIGN_MASK;
		physmap[i+1] = physmap[i+1] & ~PHYSMAP_ALIGN_MASK;

		physmap[j] = physmap[i];
		physmap[j+1] = physmap[i+1];

		if (physmap[i] < physmap[i+1])
			j += 2;
	}
	physmap_idx = j - 2;

	/*
	 * Align anything else used in the validation loop.
	 *
	 * Also make sure that our 2MB kernel text+data+bss mappings
	 * do not overlap potentially allocatable space.
	 */
	first = (first + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;
	/*
	 * Size up each available chunk of physical memory.
	 */
	phys_avail[pa_indx].phys_beg = physmap[0];
	phys_avail[pa_indx].phys_end = physmap[0];
	dump_avail[da_indx].phys_beg = 0;
	dump_avail[da_indx].phys_end = physmap[0];

	/*
	 * Get dcons buffer address
	 */
	if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    kgetenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * Validate the physical memory.  The physical memory segments
	 * have already been aligned to PHYSMAP_ALIGN, which is a multiple
	 * of PAGE_SIZE.
	 *
	 * We no longer perform an exhaustive memory test.  Instead we
	 * simply test the first and last word in each physmap[]
	 * segment.
	 */
	for (i = 0; i <= physmap_idx; i += 2) {
		end = physmap[i + 1];

		for (pa = physmap[i]; pa < end; pa += incr) {
			volatile uint64_t *ptr = (uint64_t *)CADDR1;

			/*
			 * Calculate incr.  Just test the first and
			 * last page in each physmap[] segment.
			 */
			if (pa == end - PAGE_SIZE)
				incr = PAGE_SIZE;
			else
				incr = end - pa - PAGE_SIZE;

			/*
			 * Make sure we don't skip blacked out areas.
			 */
			if (pa < 0x200000 && 0x200000 < end) {
				incr = 0x200000 - pa;
			if (dcons_addr > 0 &&
				incr = dcons_addr - pa;

			/*
			 * Block out kernel memory as not available.
			 */
			if (pa >= 0x200000 && pa < first) {
				if (pa + incr > end)

			/*
			 * Block out the dcons buffer if it exists.
			 */
			if (dcons_addr > 0 &&
			    pa >= trunc_page(dcons_addr) &&
			    pa < dcons_addr + dcons_size) {
				incr = dcons_addr + dcons_size - pa;
				incr = (incr + PAGE_MASK) &
				       ~(vm_paddr_t)PAGE_MASK;
				if (pa + incr > end)
			/*
			 * Map the page non-cacheable for the memory
			 * test.
			 */
				kernel_pmap->pmap_bits[PG_V_IDX] |
				kernel_pmap->pmap_bits[PG_RW_IDX] |
				kernel_pmap->pmap_bits[PG_N_IDX];
			cpu_invlpg(__DEVOLATILE(void *, ptr));

			/*
			 * Save original value for restoration later.
			 */

			/*
			 * Test for alternating 1's and 0's
			 */
			*ptr = 0xaaaaaaaaaaaaaaaaLLU;
			if (*ptr != 0xaaaaaaaaaaaaaaaaLLU)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*ptr = 0x5555555555555555LLU;
			if (*ptr != 0x5555555555555555LLU)
				page_bad = TRUE;
			*ptr = 0xffffffffffffffffLLU;
			if (*ptr != 0xffffffffffffffffLLU)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */

			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE) {

			/*
			 * Collapse page address into phys_avail[].  Do a
			 * continuation of the current phys_avail[] index
			 */
			if (phys_avail[pa_indx].phys_end == pa) {
				phys_avail[pa_indx].phys_end += incr;
			} else if (phys_avail[pa_indx].phys_beg ==
				   phys_avail[pa_indx].phys_end) {
				/*
				 * Current phys_avail is completely empty,
				 */
				phys_avail[pa_indx].phys_beg = pa;
				phys_avail[pa_indx].phys_end = pa + incr;

				/*
				 * Allocate next phys_avail index.
				 */
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
		    "Too many holes in the physical address space, giving up\n");

				phys_avail[pa_indx].phys_beg = pa;
				phys_avail[pa_indx].phys_end = pa + incr;

			physmem += incr / PAGE_SIZE;

			/*
			 * pa available for dumping
			 */
			if (dump_avail[da_indx].phys_end == pa) {
				dump_avail[da_indx].phys_end += incr;

				if (da_indx == DUMP_AVAIL_ARRAY_END) {

				dump_avail[da_indx].phys_beg = pa;
				dump_avail[da_indx].phys_end = pa + incr;
	/*
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	msgbuf_size = (MSGBUF_SIZE + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;

	while (phys_avail[pa_indx].phys_beg + PHYSMAP_ALIGN + msgbuf_size >=
	       phys_avail[pa_indx].phys_end) {
		physmem -= atop(phys_avail[pa_indx].phys_end -
				phys_avail[pa_indx].phys_beg);
		phys_avail[pa_indx].phys_beg = 0;
		phys_avail[pa_indx].phys_end = 0;

	Maxmem = atop(phys_avail[pa_indx].phys_end);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx].phys_end -= msgbuf_size;

	avail_end = phys_avail[pa_indx].phys_end;

	/* Map the message buffer. */
	for (off = 0; off < msgbuf_size; off += PAGE_SIZE) {
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
	/*
	 * Try to get the EFI framebuffer working as early as possible.
	 *
	 * WARN: Some BIOSes do not list the EFI framebuffer memory, causing
	 * the pmap probe code to create a DMAP that does not cover its
	 * physical address space.  In that case efi_fb_init_vaddr(1) might
	 * not return an initialized framebuffer base pointer, and the
	 * later efi_fb_init_vaddr(0) call will deal with it.
	 */
	if (have_efi_framebuffer)
		efi_fb_init_vaddr(1);
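
/*
 * Illustrative sketch (not part of the original source): the spot
 * memory test used by getmemsize() above, reduced to its essentials.
 * test_word() is a hypothetical helper; the real loop also remaps the
 * probed page non-cacheable and records good pages in phys_avail[].
 */
#if 0
static boolean_t
test_word(volatile uint64_t *ptr)
{
	static const uint64_t patterns[] = {
		0xaaaaaaaaaaaaaaaaLLU,		/* alternating 1's and 0's */
		0x5555555555555555LLU,		/* alternating 0's and 1's */
		0xffffffffffffffffLLU		/* all 1's */
	};
	uint64_t saved = *ptr;
	boolean_t ok = TRUE;
	int i;

	for (i = 0; i < 3; ++i) {
		*ptr = patterns[i];
		cpu_mfence();			/* force the write out */
		if (*ptr != patterns[i])
			ok = FALSE;
	}
	*ptr = saved;				/* restore original value */
	return (ok);
}
#endif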
struct machintr_abi MachIntrABI;

/*
 *  7	Device Not Available (x87)
 *  9	Coprocessor Segment overrun (unsupported, reserved)
 * 11	Segment not present
 * 13	General Protection
 * 16	x87 FP Exception pending
 * 17	Alignment Check
 * 19	SIMD floating point
 * 32-255	INTn/external sources
 */
hammer_time(u_int64_t modulep, u_int64_t physfree)
	int gsel_tss, x, cpu;
	int metadata_missing, off;
	struct mdglobaldata *gd;
	struct privatespace *ps;

	/*
	 * Prevent lowering of the ipl if we call tsleep() early.
	 */
	gd = &CPU_prvspace[0]->mdglobaldata;
	ps = (struct privatespace *)gd;
	bzero(gd, sizeof(*gd));
	bzero(&ps->common_tss, sizeof(ps->common_tss));

	/*
	 * Note: on both UP and SMP curthread must be set non-NULL
	 * early in the boot sequence because the system assumes
	 * that 'curthread' is never NULL.
	 */
	gd->mi.gd_curthread = &thread0;
	thread0.td_gd = &gd->mi;
	atdevbase = ISA_HOLE_START + PTOV_OFFSET;

	metadata_missing = 0;
	if (bootinfo.bi_modulep) {
		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
		preload_bootstrap_relocate(KERNBASE);
	} else {
		metadata_missing = 1;
	}
	if (bootinfo.bi_envp)
		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;

	preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET);
	preload_bootstrap_relocate(PTOV_OFFSET);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET;

	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);

	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	if (boothowto & RB_VERBOSE)
		bootverbose++;
	/*
	 * Default MachIntrABI to ICU
	 */
	MachIntrABI = MachIntrABI_ICU;

	/*
	 * Start with one cpu.  Note: with one cpu, ncpus_fit_mask remains 0.
	 */

	/* Init basic tunables, hz etc */
	/*
	 * make gdt memory segments
	 */
	gdt_segs[GPROC0_SEL].ssd_base =
	    (uintptr_t) &CPU_prvspace[0]->common_tss;

	gd->mi.gd_prvspace = CPU_prvspace[0];

	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt_cpu0[x]);
	}
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt_cpu0[GPROC0_SEL]);

	/*
	 * WARNING!  Due to an Intel quirk, VMX exits set the gdt[] table
	 * limit to 0xFFFF.  To avoid having to do a heavy-weight
	 * reload, we just make ours maximally sized.
	 */
	r_gdt.rd_limit = MAXGDT_LIMIT - 1;
	r_gdt.rd_base = (long)gdt_cpu0;

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	mi_gdinit(&gd->mi, 0);

	proc0paddr = proc0paddr_buff;
	mi_proc0init(&gd->mi, proc0paddr);
	safepri = TDPRI_MAX;
	/* spinlocks and the BGL */

	for (x = 0; x < NIDT; x++)
		setidt_global(x, rsvdary[x], SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 2);
	setidt_global(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1);
	setidt_global(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
	setidt_global(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt_global(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);

	for (cpu = 0; cpu < MAXCPU; ++cpu) {
		r_idt_arr[cpu].rd_limit = sizeof(idt_arr[cpu]) - 1;
		r_idt_arr[cpu].rd_base = (long) &idt_arr[cpu][0];
	}

	lidt(&r_idt_arr[0]);
	/*
	 * Initialize the console before we print anything out.
	 */

	if (metadata_missing)
		kprintf("WARNING: loader(8) metadata is missing!\n");

	/*
	 * Initialize IRQ mapping
	 *
	 * SHOULD be after elcr_probe()
	 */
	MachIntrABI_ICU.initmap();
	MachIntrABI_IOAPIC.initmap();

	if (boothowto & RB_KDB)
		Debugger("Boot flags requested debugger");

	identify_cpu();		/* Final stage of CPU initialization */
	initializecpu(0);	/* Initialize CPU registers */
	/*
	 * On modern Intel cpus, Haswell or later, cpu_idle_hlt=1 is better
	 * because the cpu does significant power management in MWAIT
	 * (also suggested is to set sysctl machdep.mwait.CX.idle=AUTODEEP).
	 *
	 * On many AMD cpus cpu_idle_hlt=3 is better, because the cpu does
	 * significant power management only when using ACPI halt mode.
	 * (However, on Ryzen, mode 4 (HLT) also does power management).
	 *
	 * On older AMD or Intel cpus, cpu_idle_hlt=2 is better because ACPI
	 * is needed to reduce power consumption, but wakeup times are often
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    CPUID_TO_MODEL(cpu_id) >= 0x3C) {	/* Haswell or later */
	if (cpu_vendor_id == CPU_VENDOR_AMD) {
		if (CPUID_TO_FAMILY(cpu_id) >= 0x17) {
			/* Ryzen or later */
		} else if (CPUID_TO_FAMILY(cpu_id) >= 0x14) {
			/* Bobcat or later */

	TUNABLE_INT_FETCH("hw.apic_io_enable", &ioapic_enable); /* for compat */
	TUNABLE_INT_FETCH("hw.ioapic_enable", &ioapic_enable);
	TUNABLE_INT_FETCH("hw.lapic_enable", &lapic_enable);
	TUNABLE_INT_FETCH("machdep.cpu_idle_hlt", &cpu_idle_hlt);
	/*
	 * By default always enable the ioapic.  Certain virtual machines
	 * may not work with the I/O apic enabled and can be specified in
	 * the case statement below.  On the other hand, if the ioapic is
	 * disabled for virtual machines which DO work with the I/O apic,
	 * the virtual machine can implode if we disable the I/O apic.
	 *
	 * For now enable the ioapic for all guests.
	 *
	 * NOTE: This must be done after identify_cpu(), which sets
	 *	 vmm_guest.
	 */
	if (ioapic_enable < 0) {
		switch (vmm_guest) {
		case VMM_GUEST_NONE:	/* should be enabled on real HW */
		case VMM_GUEST_KVM:	/* must be enabled or VM implodes */

		default:		/* enable by default for other VMs */
	/*
	 * TSS entry point for interrupts, traps, and exceptions
	 * (sans NMI).  This will always go to near the top of the pcpu
	 * trampoline area.  Hardware-pushed data will be copied into
	 * the trap-frame on entry, and (if necessary) returned to the
	 * trampoline on exit.
	 *
	 * We store some pcb data for the trampoline code above the
	 * stack the cpu hw pushes into, and arrange things so the
	 * address of tr_pcb_rsp is the same as the desired top of
	 * the stack.
	 */
	ps->common_tss.tss_rsp0 = (register_t)&ps->trampoline.tr_pcb_rsp;
	ps->trampoline.tr_pcb_rsp = ps->common_tss.tss_rsp0;
	ps->trampoline.tr_pcb_gs_kernel = (register_t)gd;
	ps->trampoline.tr_pcb_cr3 = KPML4phys;	/* adj to user cr3 live */
	ps->dbltramp.tr_pcb_gs_kernel = (register_t)gd;
	ps->dbltramp.tr_pcb_cr3 = KPML4phys;
	ps->dbgtramp.tr_pcb_gs_kernel = (register_t)gd;
	ps->dbgtramp.tr_pcb_cr3 = KPML4phys;

	/* double fault stack */
	ps->common_tss.tss_ist1 = (register_t)&ps->dbltramp.tr_pcb_rsp;
	/* #DB debugger needs its own stack */
	ps->common_tss.tss_ist2 = (register_t)&ps->dbgtramp.tr_pcb_rsp;

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	ps->common_tss.tss_iobase = sizeof(struct x86_64tss);
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	gd->gd_gdt = &gdt_cpu0[0];
	gd->gd_tss_gdt = &gd->gd_gdt[GPROC0_SEL];
	gd->gd_common_tssd = *gd->gd_tss_gdt;

	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL|PSL_AC);
	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

	/* Map the message buffer. */
	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);

	msgbufinit(msgbufp, MSGBUF_SIZE);

	/* transfer to user mode */
	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_pcb->pcb_cr3 = KPML4phys;
	thread0.td_pcb->pcb_cr3_iso = 0;
	thread0.td_pcb->pcb_ext = NULL;
	lwp0.lwp_md.md_regs = &proc0_tf;	/* XXX needed? */

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
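
/*
 * Illustrative note (not part of the original source): the MSR_STAR
 * value assembled above packs two selector bases into the high half
 * of the register.  SYSCALL loads the kernel CS/SS from bits 47:32
 * and SYSRET rebuilds the user selectors from bits 63:48; the low 32
 * bits are a legacy-mode EIP and are unused in long mode.
 *
 *	63            48 47            32 31                     0
 *	+---------------+----------------+-----------------------+
 *	| GUCODE32/UPL  |   GCODE/KPL    |  legacy EIP (unused)  |
 *	| SYSRET base   |  SYSCALL base  |                       |
 *	+---------------+----------------+-----------------------+
 */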
/*
 * Initialize machine-dependent portions of the global data structure.
 * Note that the global data area and cpu0's idlestack in the private
 * data space were allocated in locore.
 *
 * Note: the idlethread's cpl is 0
 *
 * WARNING!  Called from early boot, 'mycpu' may not work yet.
 */
cpu_gdinit(struct mdglobaldata *gd, int cpu)
	gd->mi.gd_curthread = &gd->mi.gd_idlethread;

	lwkt_init_thread(&gd->mi.gd_idlethread,
			 gd->mi.gd_prvspace->idlestack,
			 sizeof(gd->mi.gd_prvspace->idlestack),
	lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
	gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
	gd->mi.gd_idlethread.td_sp -= sizeof(void *);
	*(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
/*
 * We only have to check for DMAP bounds; the globaldata space is
 * actually part of the kernel_map so we don't have to waste time
 * checking CPU_prvspace[*].
 */
is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
	if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
	    eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {

	if (saddr >= DMAP_MIN_ADDRESS && eaddr <= DMAP_MAX_ADDRESS)
struct globaldata *
globaldata_find(int cpu)
{
	KKASSERT(cpu >= 0 && cpu < ncpus);
	return(&CPU_prvspace[cpu]->mdglobaldata.mi);
}
/*
 * This path should be safe from the SYSRET issue because only stopped threads
 * can have their %rip adjusted this way (and all heavy weight thread switches
 * clear QUICKREF and thus do not use SYSRET).  However, the code path is
 * convoluted so add a safety check by forcing %rip to be canonical.
 */
int
ptrace_set_pc(struct lwp *lp, unsigned long addr)
{
	if (addr & 0x0000800000000000LLU)
		lp->lwp_md.md_regs->tf_rip = addr | 0xFFFF000000000000LLU;
	else
		lp->lwp_md.md_regs->tf_rip = addr & 0x0000FFFFFFFFFFFFLLU;
	return (0);
}
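
/*
 * Illustrative sketch (not part of the original source): x86_64
 * canonical form requires bits 63:48 to be copies of bit 47, which is
 * what the OR/AND above produce.  The same result can be had with a
 * sign-extending shift; make_canonical() is a hypothetical helper.
 */
#if 0
static unsigned long
make_canonical(unsigned long addr)
{
	/* shift bit 47 into the sign position, then arithmetic-shift back */
	return ((unsigned long)(((long)addr << 16) >> 16));
}
#endif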
int
ptrace_single_step(struct lwp *lp)
{
	lp->lwp_md.md_regs->tf_rflags |= PSL_T;
	return (0);
}
fill_regs(struct lwp *lp, struct reg *regs)
	struct trapframe *tp;

	if ((tp = lp->lwp_md.md_regs) == NULL)
		return (EINVAL);
	bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));

set_regs(struct lwp *lp, struct reg *regs)
	struct trapframe *tp;

	tp = lp->lwp_md.md_regs;
	if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;

	/* FPU control/status */
	penv_87->en_cw = penv_xmm->en_cw;
	penv_87->en_sw = penv_xmm->en_sw;
	penv_87->en_tw = penv_xmm->en_tw;
	penv_87->en_fip = penv_xmm->en_fip;
	penv_87->en_fcs = penv_xmm->en_fcs;
	penv_87->en_opcode = penv_xmm->en_opcode;
	penv_87->en_foo = penv_xmm->en_foo;
	penv_87->en_fos = penv_xmm->en_fos;

	for (i = 0; i < 8; ++i)
		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;

set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;

	/* FPU control/status */
	penv_xmm->en_cw = penv_87->en_cw;
	penv_xmm->en_sw = penv_87->en_sw;
	penv_xmm->en_tw = penv_87->en_tw;
	penv_xmm->en_fip = penv_87->en_fip;
	penv_xmm->en_fcs = penv_87->en_fcs;
	penv_xmm->en_opcode = penv_87->en_opcode;
	penv_xmm->en_foo = penv_87->en_foo;
	penv_xmm->en_fos = penv_87->en_fos;

	for (i = 0; i < 8; ++i)
		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
	if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL)
		return (EINVAL);
	fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
			(struct save87 *)fpregs);
	bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);

set_fpregs(struct lwp *lp, struct fpreg *fpregs)
	set_fpregs_xmm((struct save87 *)fpregs,
		       &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
	bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[4] = rdr4();
		dbregs->dr[5] = rdr5();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();

		if (lp->lwp_thread == NULL ||
		    (pcb = lp->lwp_thread->td_pcb) == NULL)
			return (EINVAL);
		dbregs->dr[0] = pcb->pcb_dr0;
		dbregs->dr[1] = pcb->pcb_dr1;
		dbregs->dr[2] = pcb->pcb_dr2;
		dbregs->dr[3] = pcb->pcb_dr3;

		dbregs->dr[6] = pcb->pcb_dr6;
		dbregs->dr[7] = pcb->pcb_dr7;
set_dbregs(struct lwp *lp, struct dbreg *dbregs)
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr4(dbregs->dr[4]);
		load_dr5(dbregs->dr[5]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
		struct ucred *ucred;
		uint64_t mask1, mask2;

		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an unexpected
		 */
		/* JG this loop looks unreadable */
		/*
		 * Check 4 2-bit fields for invalid patterns.
		 * These fields are R/Wi, for i = 0..3
		 */
		/* Is 10 in LENi allowed when running in compatibility mode? */
		/*
		 * Pattern 10 in R/Wi might be used to indicate
		 * breakpoint on I/O.  Further analysis should be
		 * carried out to decide if it is safe and useful to
		 * provide access to that capability
		 */
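		/*
		 * The loop below scans the four 2-bit R/W fields of %dr7
		 * (bits 16-17, 20-21, 24-25 and 28-29): mask1 selects one
		 * field and mask2 is the invalid '10' pattern within it,
		 * both advancing by 4 bits per breakpoint.
		 */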
		for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4;
		     i++, mask1 <<= 4, mask2 <<= 4)
			if ((dbregs->dr[7] & mask1) == mask2)
				return (EINVAL);

		pcb = lp->lwp_thread->td_pcb;
		ucred = lp->lwp_proc->p_ucred;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * process's address space, unless, perhaps, we were called by
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */
		if (caps_priv_check(ucred, SYSCAP_RESTRICTEDROOT) != 0) {
			if (dbregs->dr[7] & 0x3) {
				/* dr0 is enabled */
				if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
			if (dbregs->dr[7] & (0x3<<2)) {
				/* dr1 is enabled */
				if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
			if (dbregs->dr[7] & (0x3<<4)) {
				/* dr2 is enabled */
				if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
			if (dbregs->dr[7] & (0x3<<6)) {
				/* dr3 is enabled */
				if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		pcb->pcb_flags |= PCB_DBREGS;
/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0 otherwise.
 */
user_dbreg_trap(void)
	u_int64_t dr7, dr6;	/* debug registers dr6 and dr7 */
	u_int64_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */

	if ((dr7 & 0xff) == 0) {
		/*
		 * all GE and LE bits in the dr7 register are zero,
		 * thus the trap couldn't have been caused by the
		 * hardware debug registers
		 */

	/*
	 * None of the breakpoint bits are set meaning this
	 * trap was not caused by any of the debug registers
	 */

	/*
	 * at least one of the breakpoints was hit, check to see
	 * which ones and if any of them are user space addresses
	 */

		addr[nbp++] = (caddr_t)rdr0();

		addr[nbp++] = (caddr_t)rdr1();

		addr[nbp++] = (caddr_t)rdr2();

		addr[nbp++] = (caddr_t)rdr3();

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) {
			/*
			 * addr[i] is in user space
			 */

	/*
	 * None of the breakpoints are in user space.
	 */
Debugger(const char *msg)
	kprintf("Debugger(\"%s\") called.\n", msg);

/*
 * Provide inb() and outb() as functions.  They are normally only
 * available as macros calling inlined functions, thus cannot be
 * called inside DDB.
 *
 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
 */
/* silence compiler warnings */
void outb(u_int, u_char);

	/*
	 * We use %%dx and not %1 here because i/o is done at %dx and not at
	 * %edx, while gcc generates inferior code (movw instead of movl)
	 * if we tell it to load (u_short) port.
	 */
	__asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));

outb(u_int port, u_char data)
	/*
	 * Use an unnecessary assignment to help gcc's register allocator.
	 * This makes a large difference for gcc-1.40 and a tiny difference
	 * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
	 * best results.  gcc-2.6.0 can't handle this.
	 */
	__asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
/*
 * initialize all the SMP locks
 */

/* critical region when masking or unmasking interrupts */
struct spinlock_deprecated imen_spinlock;

/* locks com (tty) data/hardware accesses: a FASTINTR() */
struct spinlock_deprecated com_spinlock;

/* lock regions around the clock hardware */
struct spinlock_deprecated clock_spinlock;

	/*
	 * Get the initial mplock with a count of 1 for the BSP.
	 * This uses a LOGICAL cpu ID, i.e. BSP == 0.
	 */
	cpu_get_initial_mplock();

	spin_init_deprecated(&imen_spinlock);
	spin_init_deprecated(&com_spinlock);
	spin_init_deprecated(&clock_spinlock);

	/* our token pool needs to work early */
	lwkt_token_pool_init();
cpu_mwait_hint_valid(uint32_t hint)
	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= CPU_MWAIT_CX_MAX)
		return FALSE;

	sub = MWAIT_EAX_TO_CX_SUB(hint);
	if (sub >= cpu_mwait_cx_info[cx_idx].subcnt)
		return FALSE;

cpu_mwait_cx_no_bmsts(void)
	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_STS);

cpu_mwait_cx_no_bmarb(void)
	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_ARB);
cpu_mwait_cx_hint2name(int hint, char *name, int namelen, boolean_t allow_auto)
	int old_cx_idx, sub = 0;

	if (hint >= 0) {
		old_cx_idx = MWAIT_EAX_TO_CX(hint);
		sub = MWAIT_EAX_TO_CX_SUB(hint);
	} else if (hint == CPU_MWAIT_HINT_AUTO) {
		old_cx_idx = allow_auto ? CPU_MWAIT_C2 : CPU_MWAIT_CX_MAX;
	} else if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		old_cx_idx = allow_auto ? CPU_MWAIT_C3 : CPU_MWAIT_CX_MAX;
	} else {
		old_cx_idx = CPU_MWAIT_CX_MAX;
	}

	if (!CPU_MWAIT_HAS_CX)
		strlcpy(name, "NONE", namelen);
	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTO)
		strlcpy(name, "AUTO", namelen);
	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTODEEP)
		strlcpy(name, "AUTODEEP", namelen);
	else if (old_cx_idx >= CPU_MWAIT_CX_MAX ||
	    sub >= cpu_mwait_cx_info[old_cx_idx].subcnt)
		strlcpy(name, "INVALID", namelen);
	else
		ksnprintf(name, namelen, "C%d/%d", old_cx_idx, sub);
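
/*
 * Illustrative sketch (not part of the original source): an MWAIT
 * hint packs the C-state into the high nibble of %eax and the
 * sub-state into the low nibble (assuming the usual encoding behind
 * the MWAIT_EAX_* macros).  Since the macros are inverses of each
 * other, a composed hint decodes back to its parts:
 */
#if 0
	uint32_t hint = MWAIT_EAX_HINT(CPU_MWAIT_C2, 1);

	KKASSERT(MWAIT_EAX_TO_CX(hint) == CPU_MWAIT_C2);
	KKASSERT(MWAIT_EAX_TO_CX_SUB(hint) == 1);
#endif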
cpu_mwait_cx_name2hint(char *name, int *hint0, boolean_t allow_auto)
	int cx_idx, sub, hint;

	if (allow_auto && strcmp(name, "AUTO") == 0) {
		hint = CPU_MWAIT_HINT_AUTO;
		cx_idx = CPU_MWAIT_C2;

	if (allow_auto && strcmp(name, "AUTODEEP") == 0) {
		hint = CPU_MWAIT_HINT_AUTODEEP;
		cx_idx = CPU_MWAIT_C3;

	if (strlen(name) < 4 || toupper(name[0]) != 'C')
		return -1;

	cx_idx = strtol(start, &ptr, 10);
	if (ptr == start || *ptr != '/')
		return -1;
	if (cx_idx < 0 || cx_idx >= CPU_MWAIT_CX_MAX)
		return -1;

	sub = strtol(start, &ptr, 10);
	if (sub < 0 || sub >= cpu_mwait_cx_info[cx_idx].subcnt)
		return -1;

	hint = MWAIT_EAX_HINT(cx_idx, sub);
cpu_mwait_cx_transit(int old_cx_idx, int cx_idx)
	if (cx_idx >= CPU_MWAIT_C3 && cpu_mwait_c3_preamble)

	if (old_cx_idx < CPU_MWAIT_C3 && cx_idx >= CPU_MWAIT_C3) {
		error = cputimer_intr_powersave_addreq();

	} else if (old_cx_idx >= CPU_MWAIT_C3 && cx_idx < CPU_MWAIT_C3) {
		cputimer_intr_powersave_remreq();
cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, int *hint0,
    boolean_t allow_auto)
	int error, cx_idx, old_cx_idx, hint;
	char name[CPU_MWAIT_CX_NAMELEN];

	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name),
	    allow_auto);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, allow_auto);

	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
cpu_mwait_cx_setname(struct cpu_idle_stat *stat, const char *cx_name)
	int error, cx_idx, old_cx_idx, hint;
	char name[CPU_MWAIT_CX_NAMELEN];

	KASSERT(CPU_MWAIT_HAS_CX, ("cpu does not support mwait CX extension"));

	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

	strlcpy(name, cx_name, sizeof(name));
	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);

	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS)
	int hint = cpu_mwait_halt_global;
	int error, cx_idx, cpu;
	char name[CPU_MWAIT_CX_NAMELEN], cx_name[CPU_MWAIT_CX_NAMELEN];

	cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	/* Save name for later per-cpu CX configuration */
	strlcpy(cx_name, name, sizeof(cx_name));

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);

	/* Change per-cpu CX configuration */
	for (cpu = 0; cpu < ncpus; ++cpu) {
		error = cpu_mwait_cx_setname(&cpu_idle_stats[cpu], cx_name);

	cpu_mwait_halt_global = hint;
cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
	struct cpu_idle_stat *stat = arg1;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,

cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS)
	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
	    &cpu_mwait_spin, FALSE);
/*
 * This manual debugging code is called unconditionally from Xtimer
 * (the per-cpu timer interrupt), whether the current thread is in a
 * critical section or not, and can be useful in tracking down lockups.
 *
 * NOTE: MANUAL DEBUG CODE
 */
static int saveticks[SMP_MAXCPU];
static int savecounts[SMP_MAXCPU];

static tsc_uclock_t last_tsc[SMP_MAXCPU];
pcpu_timer_always(struct intrframe *frame)
	if (flame_poll_debug == 0)
		return;

	tsc = rdtsc() - last_tsc[gd->gd_cpuid];
	if (tsc_frequency == 0 || tsc < tsc_frequency)
		return;
	last_tsc[gd->gd_cpuid] = rdtsc();

	td = gd->gd_curthread;

	bot = (char *)td->td_kstack + PAGE_SIZE;	/* skip guard */
	top = (char *)td->td_kstack + td->td_kstack_size;

	rip = (char *)(intptr_t)frame->if_rip;
	kprintf("POLL%02d %016lx", gd->gd_cpuid, (intptr_t)rip);
	rbp = (char *)(intptr_t)frame->if_rbp;

	for (n = 1; n < 8; ++n) {
		if (rbp < bot || rbp > top - 8 || ((intptr_t)rbp & 7))
			break;
		kprintf("<-%016lx", (intptr_t)*(char **)(rbp + 8));
		if (*(char **)rbp <= rbp)
			break;
		rbp = *(char **)rbp;
	}
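
/*
 * Illustrative note (not part of the original source): the backtrace
 * loop above assumes the standard frame layout, where each saved
 * frame pointer slot holds the caller's %rbp and the word above it
 * the return %rip:
 *
 *	rbp   -> +----------------+
 *	         | caller's %rbp  |  must point higher up the stack
 *	rbp+8 -> | return %rip    |  printed as "<-%016lx"
 *	         +----------------+
 *
 * The bounds and alignment checks stop the walk if the chain leaves
 * the kernel stack or becomes misaligned.
 */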
SET_DECLARE(smap_open, char);
SET_DECLARE(smap_close, char);

cpu_implement_smap(void)
	for (scan = SET_BEGIN(smap_open);		/* nop -> stac */
	     scan < SET_LIMIT(smap_open); ++scan) {

	for (scan = SET_BEGIN(smap_close);		/* nop -> clac */
	     scan < SET_LIMIT(smap_close); ++scan) {
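
/*
 * Illustrative sketch (not part of the original source): the
 * smap_open/smap_close linker sets collect the addresses of 3-byte
 * NOPs planted at the points that need SMAP overrides; when SMAP is
 * present each NOP is patched to STAC (0x0f 0x01 0xcb) or CLAC
 * (0x0f 0x01 0xca).  The elided loop bodies presumably amount to
 * byte stores such as:
 */
#if 0
	(*scan)[0] = 0x0f;	/* in the smap_open loop: NOP -> STAC */
	(*scan)[1] = 0x01;
	(*scan)[2] = 0xcb;	/* 0xca (CLAC) in the smap_close loop */
#endif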
/*
 * From a hard interrupt
 */
cpu_interrupt_running(struct thread *td)
	struct mdglobaldata *gd = mdcpu;

	if (clock_debug1 > 0) {
		kprintf("%d %016lx %016lx %016lx\n",
			((td->td_flags & TDF_INTTHREAD) != 0),
			gd->gd_ipending[2]);
		if (td->td_flags & TDF_CLKTHREAD) {
			kprintf("CLKTD %s PREEMPT %s\n",
				td->td_preempted->td_comm : ""));
			kprintf("NORTD %s\n", td->td_comm);

	if ((td->td_flags & TDF_INTTHREAD) ||
	    gd->gd_ipending[0] ||
	    gd->gd_ipending[1] ||
	    gd->gd_ipending[2]) {