1/*-
2 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
3 * Copyright (c) 1992 Terrence R. Lambert.
4 * Copyright (c) 2003 Peter Wemm.
5 * Copyright (c) 2008 The DragonFly Project.
6 * All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * William Jolitz.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by the University of
22 * California, Berkeley and its contributors.
23 * 4. Neither the name of the University nor the names of its contributors
24 * may be used to endorse or promote products derived from this software
25 * without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
38 *
39 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
40 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
41 */
42
43//#include "use_npx.h"
44#include "use_isa.h"
45#include "opt_compat.h"
46#include "opt_cpu.h"
47#include "opt_ddb.h"
48#include "opt_directio.h"
49#include "opt_inet.h"
50#include "opt_msgbuf.h"
51#include "opt_swap.h"
52
53#include <sys/param.h>
54#include <sys/systm.h>
55#include <sys/sysproto.h>
56#include <sys/signalvar.h>
57#include <sys/kernel.h>
58#include <sys/linker.h>
59#include <sys/malloc.h>
60#include <sys/proc.h>
895c1f85 61#include <sys/priv.h>
62#include <sys/buf.h>
63#include <sys/reboot.h>
64#include <sys/mbuf.h>
65#include <sys/msgbuf.h>
66#include <sys/sysent.h>
67#include <sys/sysctl.h>
68#include <sys/vmmeter.h>
69#include <sys/bus.h>
70#include <sys/usched.h>
71#include <sys/reg.h>
fcab1000 72#include <sys/sbuf.h>
29b33800 73#include <sys/ctype.h>
74#include <sys/serialize.h>
75#include <sys/systimer.h>
76
77#include <vm/vm.h>
78#include <vm/vm_param.h>
79#include <sys/lock.h>
80#include <vm/vm_kern.h>
81#include <vm/vm_object.h>
82#include <vm/vm_page.h>
83#include <vm/vm_map.h>
84#include <vm/vm_pager.h>
85#include <vm/vm_extern.h>
86
87#include <sys/thread2.h>
684a93c4 88#include <sys/mplock2.h>
320c681e 89#include <sys/mutex2.h>
90
91#include <sys/user.h>
92#include <sys/exec.h>
93#include <sys/cons.h>
94
95#include <ddb/ddb.h>
96
97#include <machine/cpu.h>
98#include <machine/clock.h>
99#include <machine/specialreg.h>
32d3bd25 100#if 0 /* JG */
101#include <machine/bootinfo.h>
102#endif
103#include <machine/md_var.h>
104#include <machine/metadata.h>
105#include <machine/pc/bios.h>
106#include <machine/pcb_ext.h> /* pcb.h included via sys/user.h */
107#include <machine/globaldata.h> /* CPU_prvspace */
108#include <machine/smp.h>
109#ifdef PERFMON
110#include <machine/perfmon.h>
111#endif
112#include <machine/cputypes.h>
57a9c56b 113#include <machine/intr_machdep.h>
114
115#ifdef OLD_BUS_ARCH
46d4e165 116#include <bus/isa/isa_device.h>
c8fe38ae 117#endif
57a9c56b 118#include <machine_base/isa/isa_intr.h>
119#include <bus/isa/rtc.h>
120#include <sys/random.h>
121#include <sys/ptrace.h>
122#include <machine/sigframe.h>
123
faaf4131 124#include <sys/machintr.h>
9284cddf 125#include <machine_base/icu/icu_abi.h>
7265a4fe 126#include <machine_base/icu/elcr_var.h>
2e0ed166 127#include <machine_base/apic/lapic.h>
ed4d621d 128#include <machine_base/apic/ioapic.h>
a3dd9120 129#include <machine_base/apic/ioapic_abi.h>
8cc9a8d1 130#include <machine/mptable.h>
faaf4131 131
132#define PHYSMAP_ENTRIES 10
133
134extern u_int64_t hammer_time(u_int64_t, u_int64_t);
135
136extern void printcpuinfo(void); /* XXX header file */
137extern void identify_cpu(void);
32d3bd25 138#if 0 /* JG */
139extern void finishidentcpu(void);
140#endif
141extern void panicifcpuunsupported(void);
142
143static void cpu_startup(void *);
144static void pic_finish(void *);
145static void cpu_finish(void *);
146
147static void set_fpregs_xmm(struct save87 *, struct savexmm *);
148static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
149#ifdef DIRECTIO
150extern void ffs_rawread_setup(void);
151#endif /* DIRECTIO */
152static void init_locks(void);
153
154SYSINIT(cpu, SI_BOOT2_START_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
155SYSINIT(pic_finish, SI_BOOT2_FINISH_PIC, SI_ORDER_FIRST, pic_finish, NULL);
156SYSINIT(cpu_finish, SI_BOOT2_FINISH_CPU, SI_ORDER_FIRST, cpu_finish, NULL);
157
158#ifdef DDB
159extern vm_offset_t ksym_start, ksym_end;
160#endif
161
162struct privatespace CPU_prvspace_bsp __aligned(4096);
163struct privatespace *CPU_prvspace[MAXCPU] = { &CPU_prvspace_bsp };
48ffc236 164
165int _udatasel, _ucodesel, _ucode32sel;
166u_long atdevbase;
c8fe38ae 167int64_t tsc_offsets[MAXCPU];
f23178a0 168
ffa2dd72 169static int cpu_mwait_halt_global; /* MWAIT hint (EAX) or CPU_MWAIT_HINT_ */
170
171#if defined(SWTCH_OPTIM_STATS)
172extern int swtch_optim_stats;
173SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
174 CTLFLAG_RD, &swtch_optim_stats, 0, "");
175SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
176 CTLFLAG_RD, &tlb_flush_count, 0, "");
177#endif
a46b4a23 178SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_halt,
ffa2dd72 179 CTLFLAG_RD, &cpu_mwait_halt_global, 0, "");
180SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_spin, CTLFLAG_RD, &cpu_mwait_spin, 0,
181 "monitor/mwait target state");
c8fe38ae 182
183#define CPU_MWAIT_HAS_CX \
184 ((cpu_feature2 & CPUID2_MON) && \
185 (cpu_mwait_feature & CPUID_MWAIT_EXT))
186
187#define CPU_MWAIT_CX_NAMELEN 16
188
9b96ccc8 189#define CPU_MWAIT_C1 1
1786faf9 190#define CPU_MWAIT_C2 2
271a3286 191#define CPU_MWAIT_C3 3
192#define CPU_MWAIT_CX_MAX 8
193
194#define CPU_MWAIT_HINT_AUTO -1 /* C1 and C2 */
195#define CPU_MWAIT_HINT_AUTODEEP -2 /* C3+ */
196
197SYSCTL_NODE(_machdep, OID_AUTO, mwait, CTLFLAG_RW, 0, "MWAIT features");
198SYSCTL_NODE(_machdep_mwait, OID_AUTO, CX, CTLFLAG_RW, 0, "MWAIT Cx settings");
199
200struct cpu_mwait_cx {
201 int subcnt;
202 char name[4];
203 struct sysctl_ctx_list sysctl_ctx;
204 struct sysctl_oid *sysctl_tree;
205};
206static struct cpu_mwait_cx cpu_mwait_cx_info[CPU_MWAIT_CX_MAX];
207static char cpu_mwait_cx_supported[256];
208
6fb37fea 209static int cpu_mwait_c1_hints_cnt;
210static int cpu_mwait_hints_cnt;
211static int *cpu_mwait_hints;
212
213static int cpu_mwait_deep_hints_cnt;
214static int *cpu_mwait_deep_hints;
215
216#define CPU_IDLE_REPEAT_DEFAULT 750
217
218static u_int cpu_idle_repeat = CPU_IDLE_REPEAT_DEFAULT;
219static u_long cpu_idle_repeat_max = CPU_IDLE_REPEAT_DEFAULT;
6fb37fea 220static u_int cpu_mwait_repeat_shift = 1;
2cb4dff2 221
222#define CPU_MWAIT_C3_PREAMBLE_BM_ARB 0x1
223#define CPU_MWAIT_C3_PREAMBLE_BM_STS 0x2
224
225static int cpu_mwait_c3_preamble =
226 CPU_MWAIT_C3_PREAMBLE_BM_ARB |
227 CPU_MWAIT_C3_PREAMBLE_BM_STS;
228
229SYSCTL_STRING(_machdep_mwait_CX, OID_AUTO, supported, CTLFLAG_RD,
230 cpu_mwait_cx_supported, 0, "MWAIT supported C states");
231SYSCTL_INT(_machdep_mwait_CX, OID_AUTO, c3_preamble, CTLFLAG_RD,
232 &cpu_mwait_c3_preamble, 0, "C3+ preamble mask");
1f9bc256 233
234static int cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS,
235 int *, boolean_t);
29b33800 236static int cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS);
ffa2dd72 237static int cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS);
238static int cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS);
239
240SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, idle, CTLTYPE_STRING|CTLFLAG_RW,
241 NULL, 0, cpu_mwait_cx_idle_sysctl, "A", "");
242SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, spin, CTLTYPE_STRING|CTLFLAG_RW,
243 NULL, 0, cpu_mwait_cx_spin_sysctl, "A", "");
244SYSCTL_UINT(_machdep_mwait_CX, OID_AUTO, repeat_shift, CTLFLAG_RW,
245 &cpu_mwait_repeat_shift, 0, "");
29b33800 246
39d69dae 247long physmem = 0;
c8fe38ae 248
249u_long ebda_addr = 0;
250
251int imcr_present = 0;
252
 253int naps = 0; /* # of application processors */
254
8936cd9b 255u_int base_memory;
320c681e 256struct mtx dt_lock; /* lock for GDT and LDT */
8936cd9b 257
258static int
259sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
260{
261 u_long pmem = ctob(physmem);
262
263 int error = sysctl_handle_long(oidp, &pmem, 0, req);
264 return (error);
265}
266
39d69dae 267SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG|CTLFLAG_RD,
9b9532a0 268 0, 0, sysctl_hw_physmem, "LU", "Total system memory in bytes (number of pages * page size)");
269
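/*
 * sysctl handler: report user-available memory, i.e. physical memory
 * minus wired pages, in bytes.
 */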
270static int
271sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
272{
273 int error = sysctl_handle_int(oidp, 0,
274 ctob(physmem - vmstats.v_wire_count), req);
275 return (error);
276}
277
278SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD,
279 0, 0, sysctl_hw_usermem, "IU", "");
280
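/*
 * sysctl handler: report the number of physical pages available between
 * avail_start and avail_end.
 */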
281static int
282sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)
283{
c8fe38ae 284 int error = sysctl_handle_int(oidp, 0,
b2b3ffcd 285 x86_64_btop(avail_end - avail_start), req);
c8fe38ae 286 return (error);
287}
288
289SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD,
290 0, 0, sysctl_hw_availpages, "I", "");
291
292vm_paddr_t Maxmem;
293vm_paddr_t Realmem;
294
295/*
296 * The number of PHYSMAP entries must be one less than the number of
297 * PHYSSEG entries because the PHYSMAP entry that spans the largest
298 * physical address that is accessible by ISA DMA is split into two
299 * PHYSSEG entries.
300 */
301#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
302
303vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
304vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
305
306/* must be 2 less so 0 0 can signal end of chunks */
307#define PHYS_AVAIL_ARRAY_END (NELEM(phys_avail) - 2)
308#define DUMP_AVAIL_ARRAY_END (NELEM(dump_avail) - 2)
309
310static vm_offset_t buffer_sva, buffer_eva;
311vm_offset_t clean_sva, clean_eva;
312static vm_offset_t pager_sva, pager_eva;
313static struct trapframe proc0_tf;
314
315static void
316cpu_startup(void *dummy)
317{
318 caddr_t v;
319 vm_size_t size = 0;
320 vm_offset_t firstaddr;
321
322 /*
323 * Good {morning,afternoon,evening,night}.
324 */
325 kprintf("%s", version);
326 startrtclock();
327 printcpuinfo();
328 panicifcpuunsupported();
329#ifdef PERFMON
330 perfmon_init();
331#endif
15dc6550 332 kprintf("real memory = %ju (%ju MB)\n",
333 (intmax_t)Realmem,
334 (intmax_t)Realmem / 1024 / 1024);
335 /*
336 * Display any holes after the first chunk of extended memory.
337 */
338 if (bootverbose) {
339 int indx;
340
341 kprintf("Physical memory chunk(s):\n");
342 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
343 vm_paddr_t size1 = phys_avail[indx + 1] - phys_avail[indx];
344
345 kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
346 (intmax_t)phys_avail[indx],
347 (intmax_t)phys_avail[indx + 1] - 1,
348 (intmax_t)size1,
349 (intmax_t)(size1 / PAGE_SIZE));
350 }
351 }
352
353 /*
354 * Allocate space for system data structures.
355 * The first available kernel virtual address is in "v".
356 * As pages of kernel virtual memory are allocated, "v" is incremented.
357 * As pages of memory are allocated and cleared,
358 * "firstaddr" is incremented.
359 * An index into the kernel page table corresponding to the
360 * virtual memory address maintained in "v" is kept in "mapaddr".
361 */
362
363 /*
364 * Make two passes. The first pass calculates how much memory is
365 * needed and allocates it. The second pass assigns virtual
366 * addresses to the various data structures.
367 */
368 firstaddr = 0;
369again:
370 v = (caddr_t)firstaddr;
371
372#define valloc(name, type, num) \
373 (name) = (type *)v; v = (caddr_t)((name)+(num))
374#define valloclim(name, type, num, lim) \
375 (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
376
377 /*
378 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
379 * For the first 64MB of ram nominally allocate sufficient buffers to
380 * cover 1/4 of our ram. Beyond the first 64MB allocate additional
381 * buffers to cover 1/20 of our ram over 64MB. When auto-sizing
382 * the buffer cache we limit the eventual kva reservation to
383 * maxbcache bytes.
384 *
385 * factor represents the 1/4 x ram conversion.
386 */
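	/*
	 * Worked example of the sizing below (assuming BKVASIZE is 16KB,
	 * so factor == 64): with 1GB of ram, kbytes == 1048576, giving
	 * nbuf = 50 + (65536 / 64) + ((1048576 - 65536) * 2) / (64 * 5)
	 *      = 50 + 1024 + 6144, roughly 7200 buffers before the caps
	 * applied further down.
	 */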
387 if (nbuf == 0) {
388 long factor = 4 * BKVASIZE / 1024;
389 long kbytes = physmem * (PAGE_SIZE / 1024);
390
391 nbuf = 50;
392 if (kbytes > 4096)
393 nbuf += min((kbytes - 4096) / factor, 65536 / factor);
394 if (kbytes > 65536)
395 nbuf += (kbytes - 65536) * 2 / (factor * 5);
396 if (maxbcache && nbuf > maxbcache / BKVASIZE)
397 nbuf = maxbcache / BKVASIZE;
398 }
399
400 /*
 401 * Do not allow the buffer_map to be more than 1/2 the size of the
402 * kernel_map.
403 */
404 if (nbuf > (virtual_end - virtual_start +
405 virtual2_end - virtual2_start) / (BKVASIZE * 2)) {
406 nbuf = (virtual_end - virtual_start +
407 virtual2_end - virtual2_start) / (BKVASIZE * 2);
408 kprintf("Warning: nbufs capped at %ld due to kvm\n", nbuf);
409 }
410
411 /*
412 * Do not allow the buffer_map to use more than 50% of available
413 * physical-equivalent memory. Since the VM pages which back
414 * individual buffers are typically wired, having too many bufs
415 * can prevent the system from paging properly.
416 */
417 if (nbuf > physmem * PAGE_SIZE / (BKVASIZE * 2)) {
418 nbuf = physmem * PAGE_SIZE / (BKVASIZE * 2);
419 kprintf("Warning: nbufs capped at %ld due to physmem\n", nbuf);
420 }
421
422 /*
423 * Do not allow the sizeof(struct buf) * nbuf to exceed half of
424 * the valloc space which is just the virtual_end - virtual_start
425 * section. We use valloc() to allocate the buf header array.
426 */
427 if (nbuf > (virtual_end - virtual_start) / sizeof(struct buf) / 2) {
428 nbuf = (virtual_end - virtual_start) /
429 sizeof(struct buf) / 2;
430 kprintf("Warning: nbufs capped at %ld due to valloc "
 431 "considerations\n", nbuf);
432 }
433
434 nswbuf = lmax(lmin(nbuf / 4, 256), 16);
435#ifdef NSWBUF_MIN
436 if (nswbuf < NSWBUF_MIN)
437 nswbuf = NSWBUF_MIN;
438#endif
439#ifdef DIRECTIO
440 ffs_rawread_setup();
441#endif
442
443 valloc(swbuf, struct buf, nswbuf);
444 valloc(buf, struct buf, nbuf);
445
446 /*
447 * End of first pass, size has been calculated so allocate memory
448 */
449 if (firstaddr == 0) {
450 size = (vm_size_t)(v - firstaddr);
451 firstaddr = kmem_alloc(&kernel_map, round_page(size));
452 if (firstaddr == 0)
453 panic("startup: no room for tables");
454 goto again;
455 }
456
457 /*
458 * End of second pass, addresses have been assigned
459 *
460 * nbuf is an int, make sure we don't overflow the field.
923b8527 461 *
462 * On 64-bit systems we always reserve maximal allocations for
463 * buffer cache buffers and there are no fragmentation issues,
464 * so the KVA segment does not have to be excessively oversized.
465 */
466 if ((vm_size_t)(v - firstaddr) != size)
467 panic("startup: table size inconsistency");
468
469 kmem_suballoc(&kernel_map, &clean_map, &clean_sva, &clean_eva,
80e89abc 470 ((vm_offset_t)(nbuf + 16) * BKVASIZE) +
74d62460 471 (nswbuf * MAXPHYS) + pager_map_size);
c8fe38ae 472 kmem_suballoc(&clean_map, &buffer_map, &buffer_sva, &buffer_eva,
80e89abc 473 ((vm_offset_t)(nbuf + 16) * BKVASIZE));
474 buffer_map.system_map = 1;
475 kmem_suballoc(&clean_map, &pager_map, &pager_sva, &pager_eva,
74d62460 476 ((vm_offset_t)nswbuf * MAXPHYS) + pager_map_size);
c8fe38ae 477 pager_map.system_map = 1;
361c5f22 478 kprintf("avail memory = %ju (%ju MB)\n",
479 (uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages),
480 (uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages) /
481 1024 / 1024);
482}
483
992cc13d 484struct cpu_idle_stat {
485 int hint;
486 int reserved;
487 u_long halt;
488 u_long spin;
489 u_long repeat;
490 u_long repeat_last;
6fb37fea 491 u_long repeat_delta;
492 u_long mwait_cx[CPU_MWAIT_CX_MAX];
493} __cachealign;
494
495#define CPU_IDLE_STAT_HALT -1
496#define CPU_IDLE_STAT_SPIN -2
497
498static struct cpu_idle_stat cpu_idle_stats[MAXCPU];
499
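/*
 * sysctl handler for the idle statistics: sum the requested per-cpu
 * counter (halt, spin, or a specific MWAIT C-state) across all cpus.
 * Writing a value resets the per-cpu counters.
 */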
500static int
501sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS)
502{
503 int idx = arg2, cpu, error;
504 u_long val = 0;
505
506 if (idx == CPU_IDLE_STAT_HALT) {
507 for (cpu = 0; cpu < ncpus; ++cpu)
508 val += cpu_idle_stats[cpu].halt;
509 } else if (idx == CPU_IDLE_STAT_SPIN) {
510 for (cpu = 0; cpu < ncpus; ++cpu)
511 val += cpu_idle_stats[cpu].spin;
512 } else {
513 KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
514 ("invalid index %d", idx));
515 for (cpu = 0; cpu < ncpus; ++cpu)
516 val += cpu_idle_stats[cpu].mwait_cx[idx];
517 }
518
519 error = sysctl_handle_quad(oidp, &val, 0, req);
520 if (error || req->newptr == NULL)
521 return error;
522
523 if (idx == CPU_IDLE_STAT_HALT) {
524 for (cpu = 0; cpu < ncpus; ++cpu)
525 cpu_idle_stats[cpu].halt = 0;
526 cpu_idle_stats[0].halt = val;
527 } else if (idx == CPU_IDLE_STAT_SPIN) {
528 for (cpu = 0; cpu < ncpus; ++cpu)
529 cpu_idle_stats[cpu].spin = 0;
530 cpu_idle_stats[0].spin = val;
531 } else {
532 KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
533 ("invalid index %d", idx));
534 for (cpu = 0; cpu < ncpus; ++cpu)
535 cpu_idle_stats[cpu].mwait_cx[idx] = 0;
536 cpu_idle_stats[0].mwait_cx[idx] = val;
537 }
538 return 0;
539}
540
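/*
 * Probe MONITOR/MWAIT C-state support and build the hint tables used by
 * the idle loop: cpu_mwait_hints covers the shallow (C1/C2) states and
 * cpu_mwait_deep_hints covers everything up to the deepest supported
 * state.  Also exports the per-Cx sysctl nodes.
 */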
1ebcbb29 541static void
90658001 542cpu_mwait_attach(void)
1ebcbb29 543{
544 struct sbuf sb;
545 int hint_idx, i;
546
ffa2dd72 547 if (!CPU_MWAIT_HAS_CX)
548 return;
549
550 if (cpu_vendor_id == CPU_VENDOR_INTEL &&
551 (CPUID_TO_FAMILY(cpu_id) > 0xf ||
552 (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
553 CPUID_TO_MODEL(cpu_id) >= 0xf))) {
554 int bm_sts = 1;
555
556 /*
557 * Pentium dual-core, Core 2 and beyond do not need any
558 * additional activities to enter deep C-state, i.e. C3(+).
559 */
560 cpu_mwait_cx_no_bmarb();
561
562 TUNABLE_INT_FETCH("machdep.cpu.mwait.bm_sts", &bm_sts);
563 if (!bm_sts)
564 cpu_mwait_cx_no_bmsts();
90658001 565 }
1f9bc256 566
567 sbuf_new(&sb, cpu_mwait_cx_supported,
568 sizeof(cpu_mwait_cx_supported), SBUF_FIXEDLEN);
569
570 for (i = 0; i < CPU_MWAIT_CX_MAX; ++i) {
571 struct cpu_mwait_cx *cx = &cpu_mwait_cx_info[i];
572 int sub;
573
574 ksnprintf(cx->name, sizeof(cx->name), "C%d", i);
575
576 sysctl_ctx_init(&cx->sysctl_ctx);
577 cx->sysctl_tree = SYSCTL_ADD_NODE(&cx->sysctl_ctx,
578 SYSCTL_STATIC_CHILDREN(_machdep_mwait), OID_AUTO,
579 cx->name, CTLFLAG_RW, NULL, "Cx control/info");
580 if (cx->sysctl_tree == NULL)
581 continue;
582
583 cx->subcnt = CPUID_MWAIT_CX_SUBCNT(cpu_mwait_extemu, i);
584 SYSCTL_ADD_INT(&cx->sysctl_ctx,
585 SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
586 "subcnt", CTLFLAG_RD, &cx->subcnt, 0,
587 "sub-state count");
588 SYSCTL_ADD_PROC(&cx->sysctl_ctx,
589 SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
590 "entered", (CTLTYPE_QUAD | CTLFLAG_RW), 0,
591 i, sysctl_cpu_idle_cnt, "Q", "# of times entered");
592
593 for (sub = 0; sub < cx->subcnt; ++sub)
594 sbuf_printf(&sb, "C%d/%d ", i, sub);
595 }
596 sbuf_trim(&sb);
597 sbuf_finish(&sb);
1f9bc256 598
599 /*
600 * Non-deep C-states
601 */
6fb37fea 602 cpu_mwait_c1_hints_cnt = cpu_mwait_cx_info[CPU_MWAIT_C1].subcnt;
603 for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i)
604 cpu_mwait_hints_cnt += cpu_mwait_cx_info[i].subcnt;
605 cpu_mwait_hints = kmalloc(sizeof(int) * cpu_mwait_hints_cnt,
606 M_DEVBUF, M_WAITOK);
607
608 hint_idx = 0;
609 for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i) {
610 int j, subcnt;
611
612 subcnt = cpu_mwait_cx_info[i].subcnt;
613 for (j = 0; j < subcnt; ++j) {
614 KASSERT(hint_idx < cpu_mwait_hints_cnt,
615 ("invalid mwait hint index %d", hint_idx));
616 cpu_mwait_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
617 ++hint_idx;
22d2370f 618 }
619 }
620 KASSERT(hint_idx == cpu_mwait_hints_cnt,
621 ("mwait hint count %d != index %d",
622 cpu_mwait_hints_cnt, hint_idx));
22d2370f 623
90658001 624 if (bootverbose) {
6fb37fea 625 kprintf("MWAIT hints (%d C1 hints):\n", cpu_mwait_c1_hints_cnt);
626 for (i = 0; i < cpu_mwait_hints_cnt; ++i) {
627 int hint = cpu_mwait_hints[i];
628
629 kprintf(" C%d/%d hint 0x%04x\n",
630 MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
631 hint);
f23178a0 632 }
90658001 633 }
f23178a0 634
635 /*
636 * Deep C-states
637 */
638 for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i)
639 cpu_mwait_deep_hints_cnt += cpu_mwait_cx_info[i].subcnt;
640 cpu_mwait_deep_hints = kmalloc(sizeof(int) * cpu_mwait_deep_hints_cnt,
641 M_DEVBUF, M_WAITOK);
642
643 hint_idx = 0;
644 for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i) {
645 int j, subcnt;
646
647 subcnt = cpu_mwait_cx_info[i].subcnt;
648 for (j = 0; j < subcnt; ++j) {
649 KASSERT(hint_idx < cpu_mwait_deep_hints_cnt,
650 ("invalid mwait deep hint index %d", hint_idx));
651 cpu_mwait_deep_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
652 ++hint_idx;
f23178a0 653 }
654 }
655 KASSERT(hint_idx == cpu_mwait_deep_hints_cnt,
656 ("mwait deep hint count %d != index %d",
657 cpu_mwait_deep_hints_cnt, hint_idx));
658
659 if (bootverbose) {
660 kprintf("MWAIT deep hints:\n");
661 for (i = 0; i < cpu_mwait_deep_hints_cnt; ++i) {
662 int hint = cpu_mwait_deep_hints[i];
663
664 kprintf(" C%d/%d hint 0x%04x\n",
665 MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
666 hint);
f23178a0 667 }
1f9bc256 668 }
6fb37fea 669 cpu_idle_repeat_max = 256 * cpu_mwait_deep_hints_cnt;
670
671 for (i = 0; i < ncpus; ++i) {
672 char name[16];
673
674 ksnprintf(name, sizeof(name), "idle%d", i);
675 SYSCTL_ADD_PROC(NULL,
676 SYSCTL_STATIC_CHILDREN(_machdep_mwait_CX), OID_AUTO,
677 name, (CTLTYPE_STRING | CTLFLAG_RW), &cpu_idle_stats[i],
678 0, cpu_mwait_cx_pcpu_idle_sysctl, "A", "");
679 }
680}
681
682static void
683cpu_finish(void *dummy __unused)
684{
685 cpu_setregs();
686 cpu_mwait_attach();
687}
688
689static void
690pic_finish(void *dummy __unused)
691{
692 /* Log ELCR information */
693 elcr_dump();
8dc88f05 694
695 /* Log MPTABLE information */
696 mptable_pci_int_dump();
697
698 /* Finalize PCI */
699 MachIntrABI.finalize();
700}
701
702/*
703 * Send an interrupt to process.
704 *
705 * Stack is set up to allow sigcode stored
706 * at top to call routine, followed by kcall
707 * to sigreturn routine below. After sigreturn
708 * resets the signal mask, the stack, and the
709 * frame pointer, it returns to the user
710 * specified pc, psl.
711 */
712void
713sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
714{
715 struct lwp *lp = curthread->td_lwp;
716 struct proc *p = lp->lwp_proc;
717 struct trapframe *regs;
718 struct sigacts *psp = p->p_sigacts;
719 struct sigframe sf, *sfp;
720 int oonstack;
a6a09809 721 char *sp;
722
723 regs = lp->lwp_md.md_regs;
724 oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;
725
a6a09809 726 /* Save user context */
727 bzero(&sf, sizeof(struct sigframe));
728 sf.sf_uc.uc_sigmask = *mask;
729 sf.sf_uc.uc_stack = lp->lwp_sigstk;
730 sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
731 KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0);
732 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe));
c8fe38ae 733
a6a09809 734 /* Make the size of the saved context visible to userland */
735 sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);
736
c8fe38ae 737 /* Allocate and validate space for the signal handler context. */
4643740a 738 if ((lp->lwp_flags & LWP_ALTSTACK) != 0 && !oonstack &&
c8fe38ae 739 SIGISMEMBER(psp->ps_sigonstack, sig)) {
740 sp = (char *)(lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size -
741 sizeof(struct sigframe));
742 lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
743 } else {
 744 /* We take the 128-byte red zone into account */
745 sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
746 }
747
748 /*
749 * XXX AVX needs 64-byte alignment but sigframe has other fields and
750 * the embedded ucontext is not at the front, so aligning this won't
751 * help us. Fortunately we bcopy in/out of the sigframe, so the
752 * kernel is ok.
753 *
754 * The problem though is if userland winds up trying to use the
755 * context directly.
756 */
4117f2fd 757 sfp = (struct sigframe *)((intptr_t)sp & ~(intptr_t)0xF);
a6a09809 758
 759 /* Translate the signal if appropriate */
760 if (p->p_sysent->sv_sigtbl) {
761 if (sig <= p->p_sysent->sv_sigsize)
762 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
763 }
764
765 /*
766 * Build the argument list for the signal handler.
767 *
768 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
769 */
770 regs->tf_rdi = sig; /* argument 1 */
771 regs->tf_rdx = (register_t)&sfp->sf_uc; /* argument 3 */
772
c8fe38ae 773 if (SIGISMEMBER(psp->ps_siginfo, sig)) {
774 /*
775 * Signal handler installed with SA_SIGINFO.
776 *
777 * action(signo, siginfo, ucontext)
778 */
779 regs->tf_rsi = (register_t)&sfp->sf_si; /* argument 2 */
630d9ab4 780 regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
781 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
782
783 /* fill siginfo structure */
784 sf.sf_si.si_signo = sig;
785 sf.sf_si.si_code = code;
630d9ab4 786 sf.sf_si.si_addr = (void *)regs->tf_addr;
787 } else {
788 /*
789 * Old FreeBSD-style arguments.
790 *
791 * handler (signo, code, [uc], addr)
792 */
793 regs->tf_rsi = (register_t)code; /* argument 2 */
630d9ab4 794 regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
795 sf.sf_ahu.sf_handler = catcher;
796 }
797
798 /*
799 * If we're a vm86 process, we want to save the segment registers.
800 * We also change eflags to be our emulated eflags, not the actual
801 * eflags.
802 */
32d3bd25 803#if 0 /* JG */
804 if (regs->tf_eflags & PSL_VM) {
805 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
806 struct vm86_kernel *vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
807
808 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
809 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
810 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
811 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
812
813 if (vm86->vm86_has_vme == 0)
814 sf.sf_uc.uc_mcontext.mc_eflags =
815 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
816 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
817
818 /*
819 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
820 * syscalls made by the signal handler. This just avoids
821 * wasting time for our lazy fixup of such faults. PSL_NT
822 * does nothing in vm86 mode, but vm86 programs can set it
823 * almost legitimately in probes for old cpu types.
824 */
825 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
826 }
827#endif
828
829 /*
830 * Save the FPU state and reinit the FP unit
831 */
c8fe38ae 832 npxpush(&sf.sf_uc.uc_mcontext);
833
834 /*
835 * Copy the sigframe out to the user's stack.
836 */
837 if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
838 /*
839 * Something is wrong with the stack pointer.
840 * ...Kill the process.
841 */
842 sigexit(lp, SIGILL);
843 }
844
845 regs->tf_rsp = (register_t)sfp;
846 regs->tf_rip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
847
848 /*
849 * i386 abi specifies that the direction flag must be cleared
850 * on function entry
851 */
5b9f6cc4 852 regs->tf_rflags &= ~(PSL_T|PSL_D);
c8fe38ae 853
c8fe38ae 854 /*
855 * 64 bit mode has a code and stack selector but
856 * no data or extra selector. %fs and %gs are not
857 * stored in-context.
c8fe38ae 858 */
a6a09809 859 regs->tf_cs = _ucodesel;
c8fe38ae 860 regs->tf_ss = _udatasel;
f2081646 861 clear_quickret();
862}
863
864/*
865 * Sanitize the trapframe for a virtual kernel passing control to a custom
 866 * VM context. Remove any items that would otherwise create a privilege
867 * issue.
868 *
869 * XXX at the moment we allow userland to set the resume flag. Is this a
870 * bad idea?
871 */
872int
873cpu_sanitize_frame(struct trapframe *frame)
874{
c8fe38ae 875 frame->tf_cs = _ucodesel;
c8fe38ae 876 frame->tf_ss = _udatasel;
877 /* XXX VM (8086) mode not supported? */
878 frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP);
879 frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I;
880
881 return(0);
882}
883
884/*
885 * Sanitize the tls so loading the descriptor does not blow up
b2b3ffcd 886 * on us. For x86_64 we don't have to do anything.
887 */
888int
889cpu_sanitize_tls(struct savetls *tls)
890{
891 return(0);
892}
893
894/*
895 * sigreturn(ucontext_t *sigcntxp)
896 *
897 * System call to cleanup state after a signal
898 * has been taken. Reset signal mask and
899 * stack state from context left by sendsig (above).
900 * Return to previous pc and psl as specified by
901 * context left by sendsig. Check carefully to
902 * make sure that the user has not modified the
903 * state to gain improper privileges.
904 *
905 * MPSAFE
906 */
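/*
 * EFL_SECURE: the new rflags may differ from the old only in bits that
 * userland is allowed to change (PSL_USERCHANGE).
 * CS_SECURE: %cs must select a user-privilege (ring 3) segment.
 */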
907#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
908#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
909
910int
911sys_sigreturn(struct sigreturn_args *uap)
912{
913 struct lwp *lp = curthread->td_lwp;
914 struct trapframe *regs;
915 ucontext_t uc;
916 ucontext_t *ucp;
5b9f6cc4 917 register_t rflags;
c8fe38ae 918 int cs;
919 int error;
920
921 /*
922 * We have to copy the information into kernel space so userland
923 * can't modify it while we are sniffing it.
924 */
925 regs = lp->lwp_md.md_regs;
926 error = copyin(uap->sigcntxp, &uc, sizeof(uc));
927 if (error)
928 return (error);
929 ucp = &uc;
930 rflags = ucp->uc_mcontext.mc_rflags;
931
932 /* VM (8086) mode not supported */
933 rflags &= ~PSL_VM_UNSUPP;
c8fe38ae 934
32d3bd25 935#if 0 /* JG */
936 if (eflags & PSL_VM) {
937 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
938 struct vm86_kernel *vm86;
939
940 /*
941 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
942 * set up the vm86 area, and we can't enter vm86 mode.
943 */
944 if (lp->lwp_thread->td_pcb->pcb_ext == 0)
945 return (EINVAL);
946 vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
947 if (vm86->vm86_inited == 0)
948 return (EINVAL);
949
950 /* go back to user mode if both flags are set */
951 if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
952 trapsignal(lp, SIGBUS, 0);
953
954 if (vm86->vm86_has_vme) {
955 eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
956 (eflags & VME_USERCHANGE) | PSL_VM;
c8fe38ae 957 } else {
958 vm86->vm86_eflags = eflags; /* save VIF, VIP */
959 eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
960 (eflags & VM_USERCHANGE) | PSL_VM;
c8fe38ae 961 }
962 bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
963 tf->tf_eflags = eflags;
964 tf->tf_vm86_ds = tf->tf_ds;
965 tf->tf_vm86_es = tf->tf_es;
966 tf->tf_vm86_fs = tf->tf_fs;
967 tf->tf_vm86_gs = tf->tf_gs;
968 tf->tf_ds = _udatasel;
969 tf->tf_es = _udatasel;
970 tf->tf_fs = _udatasel;
971 tf->tf_gs = _udatasel;
5b9f6cc4 972 } else
c8fe38ae 973#endif
5b9f6cc4 974 {
975 /*
976 * Don't allow users to change privileged or reserved flags.
977 */
978 /*
979 * XXX do allow users to change the privileged flag PSL_RF.
980 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers
981 * should sometimes set it there too. tf_eflags is kept in
982 * the signal context during signal handling and there is no
983 * other place to remember it, so the PSL_RF bit may be
984 * corrupted by the signal handler without us knowing.
985 * Corruption of the PSL_RF bit at worst causes one more or
986 * one less debugger trap, so allowing it is fairly harmless.
987 */
988 if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
989 kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags);
990 return(EINVAL);
991 }
992
993 /*
994 * Don't allow users to load a valid privileged %cs. Let the
995 * hardware check for invalid selectors, excess privilege in
996 * other selectors, invalid %eip's and invalid %esp's.
997 */
998 cs = ucp->uc_mcontext.mc_cs;
999 if (!CS_SECURE(cs)) {
1000 kprintf("sigreturn: cs = 0x%x\n", cs);
1001 trapsignal(lp, SIGBUS, T_PROTFLT);
1002 return(EINVAL);
1003 }
5b9f6cc4 1004 bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(struct trapframe));
c8fe38ae 1005 }
1006
1007 /*
1008 * Restore the FPU state from the frame
1009 */
3919ced0 1010 crit_enter();
c8fe38ae 1011 npxpop(&ucp->uc_mcontext);
c8fe38ae 1012
1013 if (ucp->uc_mcontext.mc_onstack & 1)
1014 lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
1015 else
1016 lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;
1017
1018 lp->lwp_sigmask = ucp->uc_sigmask;
1019 SIG_CANTMASK(lp->lwp_sigmask);
f2081646 1020 clear_quickret();
3919ced0 1021 crit_exit();
1022 return(EJUSTRETURN);
1023}
1024
1025/*
1026 * Machine dependent boot() routine
1027 *
1028 * I haven't seen anything to put here yet
1029 * Possibly some stuff might be grafted back here from boot()
1030 */
1031void
1032cpu_boot(int howto)
1033{
1034}
1035
1036/*
1037 * Shutdown the CPU as much as possible
1038 */
1039void
1040cpu_halt(void)
1041{
1042 for (;;)
1043 __asm__ __volatile("hlt");
1044}
1045
1046/*
1047 * cpu_idle() represents the idle LWKT. You cannot return from this function
1048 * (unless you want to blow things up!). Instead we look for runnable threads
1049 * and loop or halt as appropriate. Giant is not held on entry to the thread.
1050 *
1051 * The main loop is entered with a critical section held, we must release
1052 * the critical section before doing anything else. lwkt_switch() will
1053 * check for pending interrupts due to entering and exiting its own
1054 * critical section.
1055 *
1056 * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
1057 * However, there are cases where the idlethread will be entered with
1058 * the possibility that no IPI will occur and in such cases
1059 * lwkt_switch() sets TDF_IDLE_NOHLT.
1060 *
1061 * NOTE: cpu_idle_repeat determines how many entries into the idle thread
1062 * must occur before it starts using ACPI halt.
1063 *
1064 * NOTE: Value overridden in hammer_time().
c8fe38ae 1065 */
46e562ce 1066static int cpu_idle_hlt = 2;
1067SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
1068 &cpu_idle_hlt, 0, "Idle loop HLT enable");
1069SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_repeat, CTLFLAG_RW,
1070 &cpu_idle_repeat, 0, "Idle entries before acpi hlt");
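/* Both knobs are runtime tunable, e.g. "sysctl machdep.cpu_idle_hlt=3". */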
c8fe38ae 1071
1072SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_hltcnt, (CTLTYPE_QUAD | CTLFLAG_RW),
1073 0, CPU_IDLE_STAT_HALT, sysctl_cpu_idle_cnt, "Q", "Idle loop entry halts");
1074SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_spincnt, (CTLTYPE_QUAD | CTLFLAG_RW),
1075 0, CPU_IDLE_STAT_SPIN, sysctl_cpu_idle_cnt, "Q", "Idle loop entry spins");
1076
1077static void
1078cpu_idle_default_hook(void)
1079{
1080 /*
 1081 * We must guarantee that hlt is exactly the instruction
1082 * following the sti.
1083 */
1084 __asm __volatile("sti; hlt");
1085}
1086
1087/* Other subsystems (e.g., ACPI) can hook this later. */
1088void (*cpu_idle_hook)(void) = cpu_idle_default_hook;
1089
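/*
 * Pick the MWAIT hint (EAX value) for this idle entry.  A fixed per-cpu
 * hint is used as-is; in AUTO/AUTODEEP mode the hint is taken from the
 * shallow or deep hint table, indexed by how busy the idle loop has been
 * recently (stat->repeat).
 */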
f23178a0 1090static __inline int
7164ca98 1091cpu_mwait_cx_hint(struct cpu_idle_stat *stat)
f23178a0 1092{
992cc13d 1093 int hint, cx_idx;
1094 u_int idx;
1095
1096 hint = stat->hint;
1097 if (hint >= 0)
992cc13d 1098 goto done;
1786faf9 1099
1100 idx = (stat->repeat + stat->repeat_last + stat->repeat_delta) >>
1101 cpu_mwait_repeat_shift;
1102 if (idx >= cpu_mwait_c1_hints_cnt) {
1103 /* Step up faster, once we walked through all C1 states */
1104 stat->repeat_delta += 1 << (cpu_mwait_repeat_shift + 1);
1105 }
ffa2dd72 1106 if (hint == CPU_MWAIT_HINT_AUTODEEP) {
1107 if (idx >= cpu_mwait_deep_hints_cnt)
1108 idx = cpu_mwait_deep_hints_cnt - 1;
992cc13d 1109 hint = cpu_mwait_deep_hints[idx];
1110 } else {
1111 if (idx >= cpu_mwait_hints_cnt)
1112 idx = cpu_mwait_hints_cnt - 1;
992cc13d 1113 hint = cpu_mwait_hints[idx];
1786faf9 1114 }
1115done:
1116 cx_idx = MWAIT_EAX_TO_CX(hint);
1117 if (cx_idx >= 0 && cx_idx < CPU_MWAIT_CX_MAX)
7164ca98 1118 stat->mwait_cx[cx_idx]++;
992cc13d 1119 return hint;
1120}
1121
1122void
1123cpu_idle(void)
1124{
0f0466c0 1125 globaldata_t gd = mycpu;
7164ca98 1126 struct cpu_idle_stat *stat = &cpu_idle_stats[gd->gd_cpuid];
86232a57 1127 struct thread *td __debugvar = gd->gd_curthread;
0f0466c0 1128 int reqflags;
be71787b 1129 int quick;
c8fe38ae 1130
1131 stat->repeat = stat->repeat_last = cpu_idle_repeat_max;
1132
c8fe38ae 1133 crit_exit();
f9235b6d 1134 KKASSERT(td->td_critcount == 0);
06c66eb2 1135
1136 for (;;) {
1137 /*
1138 * See if there are any LWKTs ready to go.
1139 */
1140 lwkt_switch();
1141
1142 /*
1143 * When halting inside a cli we must check for reqflags
1144 * races, particularly [re]schedule requests. Running
1145 * splz() does the job.
1146 *
1147 * cpu_idle_hlt:
1148 * 0 Never halt, just spin
1149 *
1150 * 1 Always use HLT (or MONITOR/MWAIT if avail).
1151 *
1152 * Better default for modern (Haswell+) Intel
1153 * cpus.
1154 *
1155 * 2 Use HLT/MONITOR/MWAIT up to a point and then
1156 * use the ACPI halt (default). This is a hybrid
1157 * approach. See machdep.cpu_idle_repeat.
1158 *
1159 * Better default for modern AMD cpus and older
1160 * Intel cpus.
1161 *
1162 * 3 Always use the ACPI halt. This typically
1163 * eats the least amount of power but the cpu
1164 * will be slow waking up. Slows down e.g.
1165 * compiles and other pipe/event oriented stuff.
1166 *
1167 * 4 Always use HLT.
1168 *
1169 * NOTE: Interrupts are enabled and we are not in a critical
1170 * section.
1171 *
1172 * NOTE: Preemptions do not reset gd_idle_repeat. Also we
1173 * don't bother capping gd_idle_repeat, it is ok if
1174 * it overflows.
c8fe38ae 1175 */
1176 if (gd->gd_idle_repeat == 0) {
1177 stat->repeat = (stat->repeat + stat->repeat_last) >> 1;
1178 if (stat->repeat > cpu_idle_repeat_max)
1179 stat->repeat = cpu_idle_repeat_max;
7164ca98 1180 stat->repeat_last = 0;
6fb37fea 1181 stat->repeat_delta = 0;
1182 }
1183 ++stat->repeat_last;
2cb4dff2 1184
be71787b 1185 ++gd->gd_idle_repeat;
0f0466c0 1186 reqflags = gd->gd_reqflags;
1187 quick = (cpu_idle_hlt == 1) ||
1188 (cpu_idle_hlt < 3 &&
1189 gd->gd_idle_repeat < cpu_idle_repeat);
1190
1191 if (quick && (cpu_mi_feature & CPU_MI_MONITOR) &&
0f0466c0 1192 (reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
701c977e 1193 splz(); /* XXX */
a46b4a23 1194 cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
1195 cpu_mwait_cx_hint(stat), 0);
1196 stat->halt++;
0f0466c0 1197 } else if (cpu_idle_hlt) {
1198 __asm __volatile("cli");
1199 splz();
0f0466c0 1200 if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
be71787b 1201 if (quick)
1202 cpu_idle_default_hook();
1203 else
1204 cpu_idle_hook();
1205 }
7d4d6fdb 1206 __asm __volatile("sti");
7164ca98 1207 stat->halt++;
c8fe38ae 1208 } else {
c8fe38ae 1209 splz();
c5724852 1210 __asm __volatile("sti");
7164ca98 1211 stat->spin++;
c8fe38ae
MD
1212 }
1213 }
1214}
1215
c8fe38ae
MD
1216/*
1217 * This routine is called if a spinlock has been held through the
1218 * exponential backoff period and is seriously contested. On a real cpu
1219 * we let it spin.
1220 */
1221void
1222cpu_spinlock_contested(void)
1223{
1224 cpu_pause();
1225}
1226
1227/*
1228 * Clear registers on exec
1229 */
1230void
1231exec_setregs(u_long entry, u_long stack, u_long ps_strings)
1232{
1233 struct thread *td = curthread;
1234 struct lwp *lp = td->td_lwp;
1235 struct pcb *pcb = td->td_pcb;
1236 struct trapframe *regs = lp->lwp_md.md_regs;
1237
c8fe38ae
MD
1238 /* was i386_user_cleanup() in NetBSD */
1239 user_ldt_free(pcb);
1240
f2081646 1241 clear_quickret();
c8fe38ae
MD
1242 bzero((char *)regs, sizeof(struct trapframe));
1243 regs->tf_rip = entry;
1244 regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */
1245 regs->tf_rdi = stack; /* argv */
1246 regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
1247 regs->tf_ss = _udatasel;
1248 regs->tf_cs = _ucodesel;
1249 regs->tf_rbx = ps_strings;
1250
1251 /*
1252 * Reset the hardware debug registers if they were in use.
1253 * They won't have any meaning for the newly exec'd process.
1254 */
1255 if (pcb->pcb_flags & PCB_DBREGS) {
1256 pcb->pcb_dr0 = 0;
1257 pcb->pcb_dr1 = 0;
1258 pcb->pcb_dr2 = 0;
1259 pcb->pcb_dr3 = 0;
1260 pcb->pcb_dr6 = 0;
0855a2af 1261 pcb->pcb_dr7 = 0; /* JG set bit 10? */
1262 if (pcb == td->td_pcb) {
1263 /*
1264 * Clear the debug registers on the running
1265 * CPU, otherwise they will end up affecting
1266 * the next process we switch to.
1267 */
1268 reset_dbregs();
1269 }
1270 pcb->pcb_flags &= ~PCB_DBREGS;
1271 }
1272
1273 /*
1274 * Initialize the math emulator (if any) for the current process.
1275 * Actually, just clear the bit that says that the emulator has
1276 * been initialized. Initialization is delayed until the process
1277 * traps to the emulator (if it is done at all) mainly because
1278 * emulators don't provide an entry point for initialization.
1279 */
c8fe38ae 1280 pcb->pcb_flags &= ~FP_SOFTFP;
1281
1282 /*
1283 * NOTE: do not set CR0_TS here. npxinit() must do it after clearing
1284 * gd_npxthread. Otherwise a preemptive interrupt thread
1285 * may panic in npxdna().
1286 */
1287 crit_enter();
1288 load_cr0(rcr0() | CR0_MP);
1289
1290 /*
1291 * NOTE: The MSR values must be correct so we can return to
1292 * userland. gd_user_fs/gs must be correct so the switch
1293 * code knows what the current MSR values are.
1294 */
1295 pcb->pcb_fsbase = 0; /* Values loaded from PCB on switch */
c8fe38ae 1296 pcb->pcb_gsbase = 0;
1297 mdcpu->gd_user_fs = 0; /* Cache of current MSR values */
1298 mdcpu->gd_user_gs = 0;
1299 wrmsr(MSR_FSBASE, 0); /* Set MSR values for return to userland */
1300 wrmsr(MSR_KGSBASE, 0);
c8fe38ae 1301
c8fe38ae 1302 /* Initialize the npx (if any) for the current process. */
186c803f 1303 npxinit();
1304 crit_exit();
1305
1306 pcb->pcb_ds = _udatasel;
1307 pcb->pcb_es = _udatasel;
1308 pcb->pcb_fs = _udatasel;
1309 pcb->pcb_gs = _udatasel;
1310}
1311
1312void
1313cpu_setregs(void)
1314{
1315 register_t cr0;
1316
1317 cr0 = rcr0();
1318 cr0 |= CR0_NE; /* Done by npxinit() */
1319 cr0 |= CR0_MP | CR0_TS; /* Done at every execve() too. */
1320 cr0 |= CR0_WP | CR0_AM;
1321 load_cr0(cr0);
1322 load_gs(_udatasel);
1323}
1324
1325static int
1326sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
1327{
1328 int error;
1329 error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
1330 req);
1331 if (!error && req->newptr)
1332 resettodr();
1333 return (error);
1334}
1335
1336SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
1337 &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");
1338
1339SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
1340 CTLFLAG_RW, &disable_rtc_set, 0, "");
c8fe38ae 1341
32d3bd25 1342#if 0 /* JG */
1343SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
1344 CTLFLAG_RD, &bootinfo, bootinfo, "");
1345#endif
1346
1347SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
1348 CTLFLAG_RW, &wall_cmos_clock, 0, "");
1349
1350extern u_long bootdev; /* not a cdev_t - encoding is different */
1351SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
1352 CTLFLAG_RD, &bootdev, 0, "Boot device (not in cdev_t format)");
1353
1354/*
1355 * Initialize 386 and configure to run kernel
1356 */
1357
1358/*
1359 * Initialize segments & interrupt table
1360 */
1361
1362int _default_ldt;
1363struct user_segment_descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */
8a06c6ee 1364struct gate_descriptor idt_arr[MAXCPU][NIDT];
32d3bd25 1365#if 0 /* JG */
1366union descriptor ldt[NLDT]; /* local descriptor table */
1367#endif
1368
1369/* table descriptors - used to load tables by cpu */
1370struct region_descriptor r_gdt;
1371struct region_descriptor r_idt_arr[MAXCPU];
c8fe38ae 1372
1373/* JG proc0paddr is a virtual address */
1374void *proc0paddr;
1375/* JG alignment? */
1376char proc0paddr_buff[LWKT_THREAD_STACK];
1377
1378
1379/* software prototypes -- in more palatable form */
1380struct soft_segment_descriptor gdt_segs[] = {
1381/* GNULL_SEL 0 Null Descriptor */
1382{ 0x0, /* segment base address */
1383 0x0, /* length */
1384 0, /* segment type */
1385 0, /* segment descriptor priority level */
1386 0, /* segment descriptor present */
1387 0, /* long */
1388 0, /* default 32 vs 16 bit size */
1389 0 /* limit granularity (byte/page units)*/ },
1390/* GCODE_SEL 1 Code Descriptor for kernel */
1391{ 0x0, /* segment base address */
1392 0xfffff, /* length - all address space */
1393 SDT_MEMERA, /* segment type */
1394 SEL_KPL, /* segment descriptor priority level */
1395 1, /* segment descriptor present */
1396 1, /* long */
1397 0, /* default 32 vs 16 bit size */
1398 1 /* limit granularity (byte/page units)*/ },
1399/* GDATA_SEL 2 Data Descriptor for kernel */
1400{ 0x0, /* segment base address */
1401 0xfffff, /* length - all address space */
1402 SDT_MEMRWA, /* segment type */
1403 SEL_KPL, /* segment descriptor priority level */
1404 1, /* segment descriptor present */
1405 1, /* long */
1406 0, /* default 32 vs 16 bit size */
1407 1 /* limit granularity (byte/page units)*/ },
1408/* GUCODE32_SEL 3 32 bit Code Descriptor for user */
1409{ 0x0, /* segment base address */
1410 0xfffff, /* length - all address space */
1411 SDT_MEMERA, /* segment type */
1412 SEL_UPL, /* segment descriptor priority level */
1413 1, /* segment descriptor present */
1414 0, /* long */
1415 1, /* default 32 vs 16 bit size */
1416 1 /* limit granularity (byte/page units)*/ },
1417/* GUDATA_SEL 4 32/64 bit Data Descriptor for user */
1418{ 0x0, /* segment base address */
1419 0xfffff, /* length - all address space */
1420 SDT_MEMRWA, /* segment type */
1421 SEL_UPL, /* segment descriptor priority level */
1422 1, /* segment descriptor present */
1423 0, /* long */
1424 1, /* default 32 vs 16 bit size */
1425 1 /* limit granularity (byte/page units)*/ },
1426/* GUCODE_SEL 5 64 bit Code Descriptor for user */
1427{ 0x0, /* segment base address */
1428 0xfffff, /* length - all address space */
1429 SDT_MEMERA, /* segment type */
1430 SEL_UPL, /* segment descriptor priority level */
1431 1, /* segment descriptor present */
1432 1, /* long */
1433 0, /* default 32 vs 16 bit size */
1434 1 /* limit granularity (byte/page units)*/ },
1435/* GPROC0_SEL 6 Proc 0 Tss Descriptor */
1436{
1437 0x0, /* segment base address */
b2b3ffcd 1438 sizeof(struct x86_64tss)-1,/* length - all address space */
1439 SDT_SYSTSS, /* segment type */
1440 SEL_KPL, /* segment descriptor priority level */
1441 1, /* segment descriptor present */
1442 0, /* long */
1443 0, /* unused - default 32 vs 16 bit size */
1444 0 /* limit granularity (byte/page units)*/ },
1445/* Actually, the TSS is a system descriptor which is double size */
1446{ 0x0, /* segment base address */
1447 0x0, /* length */
1448 0, /* segment type */
1449 0, /* segment descriptor priority level */
1450 0, /* segment descriptor present */
1451 0, /* long */
1452 0, /* default 32 vs 16 bit size */
1453 0 /* limit granularity (byte/page units)*/ },
1454/* GUGS32_SEL 8 32 bit GS Descriptor for user */
1455{ 0x0, /* segment base address */
1456 0xfffff, /* length - all address space */
1457 SDT_MEMRWA, /* segment type */
1458 SEL_UPL, /* segment descriptor priority level */
1459 1, /* segment descriptor present */
1460 0, /* long */
1461 1, /* default 32 vs 16 bit size */
1462 1 /* limit granularity (byte/page units)*/ },
1463};
1464
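/*
 * Install an interrupt gate for vector idx in every cpu's IDT; setidt()
 * below does the same for a single cpu.
 */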
1465void
8a06c6ee 1466setidt_global(int idx, inthand_t *func, int typ, int dpl, int ist)
c8fe38ae 1467{
1468 int cpu;
1469
1470 for (cpu = 0; cpu < MAXCPU; ++cpu) {
1471 struct gate_descriptor *ip = &idt_arr[cpu][idx];
1472
1473 ip->gd_looffset = (uintptr_t)func;
1474 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
1475 ip->gd_ist = ist;
1476 ip->gd_xx = 0;
1477 ip->gd_type = typ;
1478 ip->gd_dpl = dpl;
1479 ip->gd_p = 1;
1480 ip->gd_hioffset = ((uintptr_t)func)>>16 ;
1481 }
1482}
1483
1484void
1485setidt(int idx, inthand_t *func, int typ, int dpl, int ist, int cpu)
1486{
1487 struct gate_descriptor *ip;
1488
ed20d0e3 1489 KASSERT(cpu >= 0 && cpu < ncpus, ("invalid cpu %d", cpu));
1490
1491 ip = &idt_arr[cpu][idx];
1492 ip->gd_looffset = (uintptr_t)func;
1493 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
1494 ip->gd_ist = ist;
1495 ip->gd_xx = 0;
1496 ip->gd_type = typ;
1497 ip->gd_dpl = dpl;
1498 ip->gd_p = 1;
1499 ip->gd_hioffset = ((uintptr_t)func)>>16 ;
1500}
1501
1502#define IDTVEC(name) __CONCAT(X,name)
1503
1504extern inthand_t
1505 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
1506 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
1507 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
1508 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
1509 IDTVEC(xmm), IDTVEC(dblfault),
1510 IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
1511
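/*
 * Conversion helpers between hardware segment descriptors and the
 * software soft_segment_descriptor form used to describe them.
 */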
1512void
1513sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
1514{
1515 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
1516 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
1517 ssd->ssd_type = sd->sd_type;
1518 ssd->ssd_dpl = sd->sd_dpl;
1519 ssd->ssd_p = sd->sd_p;
1520 ssd->ssd_def32 = sd->sd_def32;
1521 ssd->ssd_gran = sd->sd_gran;
1522}
1523
1524void
1525ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
1526{
1527
1528 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
1529 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
1530 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
1531 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
1532 sd->sd_type = ssd->ssd_type;
1533 sd->sd_dpl = ssd->ssd_dpl;
1534 sd->sd_p = ssd->ssd_p;
1535 sd->sd_long = ssd->ssd_long;
1536 sd->sd_def32 = ssd->ssd_def32;
1537 sd->sd_gran = ssd->ssd_gran;
1538}
1539
1540void
1541ssdtosyssd(struct soft_segment_descriptor *ssd,
1542 struct system_segment_descriptor *sd)
1543{
1544
1545 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
1546 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
1547 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
1548 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
1549 sd->sd_type = ssd->ssd_type;
1550 sd->sd_dpl = ssd->ssd_dpl;
1551 sd->sd_p = ssd->ssd_p;
1552 sd->sd_gran = ssd->ssd_gran;
1553}
1554
1555/*
1556 * Populate the (physmap) array with base/bound pairs describing the
1557 * available physical memory in the system, then test this memory and
1558 * build the phys_avail array describing the actually-available memory.
1559 *
1560 * If we cannot accurately determine the physical memory map, then use
 1561 * the value from the 0xE801 call, and failing that, the RTC.
1562 *
1563 * Total memory size may be set by the kernel environment variable
1564 * hw.physmem or the compile-time define MAXMEM.
1565 *
1566 * Memory is aligned to PHYSMAP_ALIGN which must be a multiple
1567 * of PAGE_SIZE. This also greatly reduces the memory test time
1568 * which would otherwise be excessive on machines with > 8G of ram.
1569 *
1570 * XXX first should be vm_paddr_t.
1571 */
1572
1573#define PHYSMAP_ALIGN (vm_paddr_t)(128 * 1024)
1574#define PHYSMAP_ALIGN_MASK (vm_paddr_t)(PHYSMAP_ALIGN - 1)
1575 vm_paddr_t physmap[PHYSMAP_SIZE];
1576 struct bios_smap *smapbase, *smap, *smapend;
1577 u_int32_t smapsize;
b4d9abe2 1578
1579static void
1580getmemsize(caddr_t kmdp, u_int64_t first)
1581{
1582 int off, physmap_idx, pa_indx, da_indx;
1583 int i, j;
1584 vm_paddr_t pa;
1585 vm_paddr_t msgbuf_size;
1586 u_long physmem_tunable;
1587 pt_entry_t *pte;
1588 quad_t dcons_addr, dcons_size;
1589
1590 bzero(physmap, sizeof(physmap));
1591 physmap_idx = 0;
1592
1593 /*
1594 * get memory map from INT 15:E820, kindly supplied by the loader.
1595 *
1596 * subr_module.c says:
1597 * "Consumer may safely assume that size value precedes data."
1598 * ie: an int32_t immediately precedes smap.
1599 */
1600 smapbase = (struct bios_smap *)preload_search_info(kmdp,
1601 MODINFO_METADATA | MODINFOMD_SMAP);
1602 if (smapbase == NULL)
1603 panic("No BIOS smap info from loader!");
1604
1605 smapsize = *((u_int32_t *)smapbase - 1);
1606 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
1607
1608 for (smap = smapbase; smap < smapend; smap++) {
1609 if (boothowto & RB_VERBOSE)
1610 kprintf("SMAP type=%02x base=%016lx len=%016lx\n",
1611 smap->type, smap->base, smap->length);
1612
1613 if (smap->type != SMAP_TYPE_MEMORY)
1614 continue;
1615
1616 if (smap->length == 0)
1617 continue;
1618
1619 for (i = 0; i <= physmap_idx; i += 2) {
1620 if (smap->base < physmap[i + 1]) {
1621 if (boothowto & RB_VERBOSE) {
1622 kprintf("Overlapping or non-monotonic "
1623 "memory region, ignoring "
1624 "second region\n");
1625 }
2eddd927 1626 break;
1627 }
1628 }
1629 if (i <= physmap_idx)
1630 continue;
1631
1bda0d3d 1632 Realmem += smap->length;
1633
1634 if (smap->base == physmap[physmap_idx + 1]) {
1635 physmap[physmap_idx + 1] += smap->length;
1636 continue;
1637 }
1638
1639 physmap_idx += 2;
1640 if (physmap_idx == PHYSMAP_SIZE) {
1641 kprintf("Too many segments in the physical "
1642 "address map, giving up\n");
1643 break;
1644 }
1645 physmap[physmap_idx] = smap->base;
1646 physmap[physmap_idx + 1] = smap->base + smap->length;
1647 }
1648
8936cd9b 1649 base_memory = physmap[1] / 1024;
c8fe38ae 1650 /* make hole for AP bootstrap code */
8936cd9b 1651 physmap[1] = mp_bootaddress(base_memory);
2331304b 1652
1653 /* Save EBDA address, if any */
1654 ebda_addr = (u_long)(*(u_short *)(KERNBASE + 0x40e));
1655 ebda_addr <<= 4;
1656
1657 /*
1658 * Maxmem isn't the "maximum memory", it's one larger than the
1659 * highest page of the physical address space. It should be
1660 * called something like "Maxphyspage". We may adjust this
1661 * based on ``hw.physmem'' and the results of the memory test.
1662 */
1663 Maxmem = atop(physmap[physmap_idx + 1]);
1664
1665#ifdef MAXMEM
1666 Maxmem = MAXMEM / 4;
1667#endif
1668
1669 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
1670 Maxmem = atop(physmem_tunable);
1671
1672 /*
1673 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
1674 * in the system.
1675 */
1676 if (Maxmem > atop(physmap[physmap_idx + 1]))
1677 Maxmem = atop(physmap[physmap_idx + 1]);
1678
8e5ea5f7 1679 /*
b4d9abe2 1680 * Blowing out the DMAP will blow up the system.
8e5ea5f7
MD
1681 */
1682 if (Maxmem > atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS)) {
1683 kprintf("Limiting Maxmem due to DMAP size\n");
1684 Maxmem = atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS);
1685 }
1686
c8fe38ae 1687 if (atop(physmap[physmap_idx + 1]) != Maxmem &&
b4d9abe2 1688 (boothowto & RB_VERBOSE)) {
c8fe38ae 1689 kprintf("Physical memory use set to %ldK\n", Maxmem * 4);
b4d9abe2 1690 }
c8fe38ae 1691
b4d9abe2
MD
1692 /*
1693 * Call pmap initialization to make new kernel address space
1694 *
1695 * Mask off page 0.
1696 */
48ffc236 1697 pmap_bootstrap(&first);
b4d9abe2
MD
1698 physmap[0] = PAGE_SIZE;
1699
1700 /*
1701 * Align the physmap to PHYSMAP_ALIGN and cut out anything
1702 * exceeding Maxmem.
1703 */
1704 for (i = j = 0; i <= physmap_idx; i += 2) {
06bb314f
SW
1705 if (physmap[i+1] > ptoa(Maxmem))
1706 physmap[i+1] = ptoa(Maxmem);
b4d9abe2
MD
1707 physmap[i] = (physmap[i] + PHYSMAP_ALIGN_MASK) &
1708 ~PHYSMAP_ALIGN_MASK;
1709 physmap[i+1] = physmap[i+1] & ~PHYSMAP_ALIGN_MASK;
1710
1711 physmap[j] = physmap[i];
1712 physmap[j+1] = physmap[i+1];
1713
1714 if (physmap[i] < physmap[i+1])
1715 j += 2;
1716 }
1717 physmap_idx = j - 2;
1718
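	/*
	 * The loop above is a compaction pass as well as an alignment
	 * pass: i walks the original pairs, j rewrites them in place, and
	 * any segment whose aligned start is no longer below its aligned
	 * end is dropped.  E.g. a 64KB segment disappears entirely once
	 * both edges are rounded to the 128KB PHYSMAP_ALIGN.
	 */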
1719 /*
1720 * Align anything else used in the validation loop.
1721 */
1722 first = (first + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;
c8fe38ae
MD
1723
1724 /*
1725 * Size up each available chunk of physical memory.
1726 */
c8fe38ae
MD
1727 pa_indx = 0;
1728 da_indx = 1;
1729 phys_avail[pa_indx++] = physmap[0];
1730 phys_avail[pa_indx] = physmap[0];
1731 dump_avail[da_indx] = physmap[0];
1732 pte = CMAP1;
1733
1734 /*
1735 * Get dcons buffer address
1736 */
1737 if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 ||
1738 kgetenv_quad("dcons.size", &dcons_size) == 0)
1739 dcons_addr = 0;
1740
1741 /*
b4d9abe2
MD
1742 * Validate the physical memory. The physical memory segments
1743 * have already been aligned to PHYSMAP_ALIGN which is a multiple
1744 * of PAGE_SIZE.
c8fe38ae
MD
1745 */
1746 for (i = 0; i <= physmap_idx; i += 2) {
1747 vm_paddr_t end;
1748
b4d9abe2
MD
1749 end = physmap[i + 1];
1750
1751 for (pa = physmap[i]; pa < end; pa += PHYSMAP_ALIGN) {
c8fe38ae
MD
1752 int tmp, page_bad, full;
1753 int *ptr = (int *)CADDR1;
1754
1755 full = FALSE;
1756 /*
1757 * block out kernel memory as not available.
1758 */
0aefa526 1759 if (pa >= 0x200000 && pa < first)
c8fe38ae
MD
1760 goto do_dump_avail;
1761
1762 /*
1763 * block out dcons buffer
1764 */
1765 if (dcons_addr > 0
1766 && pa >= trunc_page(dcons_addr)
b4d9abe2 1767 && pa < dcons_addr + dcons_size) {
c8fe38ae 1768 goto do_dump_avail;
b4d9abe2 1769 }
c8fe38ae
MD
1770
1771 page_bad = FALSE;
1772
1773 /*
 1774			 * map page into kernel: valid, read/write, non-cacheable
1775 */
a86ce0cd
MD
1776 *pte = pa |
1777 kernel_pmap.pmap_bits[PG_V_IDX] |
1778 kernel_pmap.pmap_bits[PG_RW_IDX] |
1779 kernel_pmap.pmap_bits[PG_N_IDX];
c8fe38ae
MD
1780 cpu_invltlb();
1781
06bb314f 1782 tmp = *ptr;
c8fe38ae
MD
1783 /*
1784 * Test for alternating 1's and 0's
1785 */
1786 *(volatile int *)ptr = 0xaaaaaaaa;
b4d9abe2 1787 cpu_mfence();
c8fe38ae
MD
1788 if (*(volatile int *)ptr != 0xaaaaaaaa)
1789 page_bad = TRUE;
1790 /*
1791 * Test for alternating 0's and 1's
1792 */
1793 *(volatile int *)ptr = 0x55555555;
b4d9abe2 1794 cpu_mfence();
c8fe38ae
MD
1795 if (*(volatile int *)ptr != 0x55555555)
1796 page_bad = TRUE;
1797 /*
1798 * Test for all 1's
1799 */
1800 *(volatile int *)ptr = 0xffffffff;
b4d9abe2 1801 cpu_mfence();
c8fe38ae
MD
1802 if (*(volatile int *)ptr != 0xffffffff)
1803 page_bad = TRUE;
1804 /*
1805 * Test for all 0's
1806 */
1807 *(volatile int *)ptr = 0x0;
b4d9abe2 1808 cpu_mfence();
c8fe38ae
MD
1809 if (*(volatile int *)ptr != 0x0)
1810 page_bad = TRUE;
1811 /*
1812 * Restore original value.
1813 */
06bb314f 1814 *ptr = tmp;
c8fe38ae
MD
1815
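			/*
			 * The four patterns above (0xaaaaaaaa, 0x55555555,
			 * all ones, all zeroes) drive every bit to both
			 * states, catching stuck and cross-coupled data
			 * lines.  The page is mapped with PG_N
			 * (non-cacheable) and a cpu_mfence() separates each
			 * write from its read-back, so each check is
			 * intended to exercise the memory cells themselves
			 * rather than a cached copy.
			 */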
1816 /*
1817 * Adjust array of valid/good pages.
1818 */
1819 if (page_bad == TRUE)
1820 continue;
1821 /*
1822 * If this good page is a continuation of the
1823 * previous set of good pages, then just increase
1824 * the end pointer. Otherwise start a new chunk.
 1825			 * Note that "end" points one past the last valid
 1826			 * address, making the range >= start and < end.
 1827			 * If we're also doing a speculative memory
 1828			 * test and we are at or past the end, bump up Maxmem
1829 * so that we keep going. The first bad page
1830 * will terminate the loop.
1831 */
1832 if (phys_avail[pa_indx] == pa) {
b4d9abe2 1833 phys_avail[pa_indx] += PHYSMAP_ALIGN;
c8fe38ae
MD
1834 } else {
1835 pa_indx++;
1836 if (pa_indx == PHYS_AVAIL_ARRAY_END) {
1837 kprintf(
1838 "Too many holes in the physical address space, giving up\n");
1839 pa_indx--;
1840 full = TRUE;
1841 goto do_dump_avail;
1842 }
b4d9abe2
MD
1843 phys_avail[pa_indx++] = pa;
1844 phys_avail[pa_indx] = pa + PHYSMAP_ALIGN;
c8fe38ae 1845 }
7a3eee88 1846 physmem += PHYSMAP_ALIGN / PAGE_SIZE;
c8fe38ae
MD
1847do_dump_avail:
1848 if (dump_avail[da_indx] == pa) {
b4d9abe2 1849 dump_avail[da_indx] += PHYSMAP_ALIGN;
c8fe38ae
MD
1850 } else {
1851 da_indx++;
1852 if (da_indx == DUMP_AVAIL_ARRAY_END) {
1853 da_indx--;
1854 goto do_next;
1855 }
b4d9abe2
MD
1856 dump_avail[da_indx++] = pa;
1857 dump_avail[da_indx] = pa + PHYSMAP_ALIGN;
c8fe38ae
MD
1858 }
1859do_next:
1860 if (full)
1861 break;
1862 }
1863 }
1864 *pte = 0;
1865 cpu_invltlb();
1866
1867 /*
c8fe38ae
MD
1868 * The last chunk must contain at least one page plus the message
1869 * buffer to avoid complicating other code (message buffer address
1870 * calculation, etc.).
1871 */
b4d9abe2
MD
1872 msgbuf_size = (MSGBUF_SIZE + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;
1873
1874 while (phys_avail[pa_indx - 1] + PHYSMAP_ALIGN +
1875 msgbuf_size >= phys_avail[pa_indx]) {
c8fe38ae
MD
1876 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1877 phys_avail[pa_indx--] = 0;
1878 phys_avail[pa_indx--] = 0;
1879 }
1880
1881 Maxmem = atop(phys_avail[pa_indx]);
1882
1883 /* Trim off space for the message buffer. */
b4d9abe2 1884 phys_avail[pa_indx] -= msgbuf_size;
c8fe38ae 1885
1185babf
JG
1886 avail_end = phys_avail[pa_indx];
1887
c8fe38ae 1888 /* Map the message buffer. */
b4d9abe2
MD
1889 for (off = 0; off < msgbuf_size; off += PAGE_SIZE) {
1890 pmap_kenter((vm_offset_t)msgbufp + off,
1891 phys_avail[pa_indx] + off);
1892 }
c8fe38ae
MD
1893}
1894
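/*
 * On return phys_avail[] and dump_avail[] contain start/end pairs of
 * usable physical memory (exclusive end, PHYSMAP_ALIGN granularity),
 * with unused slots left zeroed; avail_end records the top of usable
 * memory just below the space reserved for the message buffer.
 */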
faaf4131
MN
1895struct machintr_abi MachIntrABI;
1896
c8fe38ae
MD
1897/*
1898 * IDT VECTORS:
1899 * 0 Divide by zero
1900 * 1 Debug
1901 * 2 NMI
1902 * 3 BreakPoint
1903 * 4 OverFlow
1904 * 5 Bound-Range
1905 * 6 Invalid OpCode
1906 * 7 Device Not Available (x87)
1907 * 8 Double-Fault
1908 * 9 Coprocessor Segment overrun (unsupported, reserved)
1909 * 10 Invalid-TSS
1910 * 11 Segment not present
1911 * 12 Stack
1912 * 13 General Protection
1913 * 14 Page Fault
1914 * 15 Reserved
1915 * 16 x87 FP Exception pending
1916 * 17 Alignment Check
1917 * 18 Machine Check
1918 * 19 SIMD floating point
1919 * 20-31 reserved
1920 * 32-255 INTn/external sources
1921 */
1922u_int64_t
1923hammer_time(u_int64_t modulep, u_int64_t physfree)
1924{
1925 caddr_t kmdp;
8a06c6ee 1926 int gsel_tss, x, cpu;
32d3bd25 1927#if 0 /* JG */
5b9f6cc4
MD
1928 int metadata_missing, off;
1929#endif
c8fe38ae
MD
1930 struct mdglobaldata *gd;
1931 u_int64_t msr;
c8fe38ae 1932
c8fe38ae
MD
1933 /*
1934 * Prevent lowering of the ipl if we call tsleep() early.
1935 */
4864d541 1936 gd = &CPU_prvspace[0]->mdglobaldata;
c8fe38ae
MD
1937 bzero(gd, sizeof(*gd));
1938
1939 /*
1940 * Note: on both UP and SMP curthread must be set non-NULL
1941 * early in the boot sequence because the system assumes
1942 * that 'curthread' is never NULL.
1943 */
1944
1945 gd->mi.gd_curthread = &thread0;
1946 thread0.td_gd = &gd->mi;
1947
1948 atdevbase = ISA_HOLE_START + PTOV_OFFSET;
1949
32d3bd25 1950#if 0 /* JG */
c8fe38ae
MD
1951 metadata_missing = 0;
1952 if (bootinfo.bi_modulep) {
1953 preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
1954 preload_bootstrap_relocate(KERNBASE);
1955 } else {
1956 metadata_missing = 1;
1957 }
1958 if (bootinfo.bi_envp)
1959 kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
1960#endif
1961
1962 preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET);
1963 preload_bootstrap_relocate(PTOV_OFFSET);
1964 kmdp = preload_search_by_type("elf kernel");
1965 if (kmdp == NULL)
1966 kmdp = preload_search_by_type("elf64 kernel");
1967 boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
1968 kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET;
1969#ifdef DDB
1970 ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
1971 ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
1972#endif
1973
27af435a
SZ
1974 if (boothowto & RB_VERBOSE)
1975 bootverbose++;
1976
faaf4131 1977 /*
10db3cc6 1978 * Default MachIntrABI to ICU
faaf4131
MN
1979 */
1980 MachIntrABI = MachIntrABI_ICU;
9a4bd8f3 1981
c8fe38ae
MD
1982 /*
1983 * start with one cpu. Note: with one cpu, ncpus2_shift, ncpus2_mask,
1984 * and ncpus_fit_mask remain 0.
1985 */
1986 ncpus = 1;
1987 ncpus2 = 1;
1988 ncpus_fit = 1;
1989 /* Init basic tunables, hz etc */
1990 init_param1();
1991
1992 /*
1993 * make gdt memory segments
1994 */
1995 gdt_segs[GPROC0_SEL].ssd_base =
4864d541 1996 (uintptr_t) &CPU_prvspace[0]->mdglobaldata.gd_common_tss;
c8fe38ae 1997
4864d541 1998 gd->mi.gd_prvspace = CPU_prvspace[0];
c8fe38ae
MD
1999
2000 for (x = 0; x < NGDT; x++) {
2001 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
2002 ssdtosd(&gdt_segs[x], &gdt[x]);
2003 }
2004 ssdtosyssd(&gdt_segs[GPROC0_SEL],
2005 (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
48ffc236 2006
c8fe38ae
MD
2007 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
2008 r_gdt.rd_base = (long) gdt;
2009 lgdt(&r_gdt);
2010
2011 wrmsr(MSR_FSBASE, 0); /* User value */
2012 wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi);
2013 wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */
2014
2015 mi_gdinit(&gd->mi, 0);
2016 cpu_gdinit(gd, 0);
2017 proc0paddr = proc0paddr_buff;
2018 mi_proc0init(&gd->mi, proc0paddr);
2019 safepri = TDPRI_MAX;
2020
2021 /* spinlocks and the BGL */
2022 init_locks();
2023
2024 /* exceptions */
2025 for (x = 0; x < NIDT; x++)
8a06c6ee
SZ
2026 setidt_global(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
2027 setidt_global(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
2028 setidt_global(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
2029 setidt_global(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1);
2030 setidt_global(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
2031 setidt_global(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
2032 setidt_global(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
2033 setidt_global(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0);
2034 setidt_global(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0);
2035 setidt_global(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
2036 setidt_global(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0);
2037 setidt_global(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0);
2038 setidt_global(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0);
2039 setidt_global(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0);
2040 setidt_global(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0);
2041 setidt_global(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
2042 setidt_global(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0);
2043 setidt_global(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
2044 setidt_global(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
2045 setidt_global(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
2046
2047 for (cpu = 0; cpu < MAXCPU; ++cpu) {
2048 r_idt_arr[cpu].rd_limit = sizeof(idt_arr[cpu]) - 1;
2049 r_idt_arr[cpu].rd_base = (long) &idt_arr[cpu][0];
2050 }
2051
2052 lidt(&r_idt_arr[0]);
c8fe38ae
MD
2053
2054 /*
2055 * Initialize the console before we print anything out.
2056 */
2057 cninit();
2058
32d3bd25 2059#if 0 /* JG */
c8fe38ae
MD
2060 if (metadata_missing)
2061 kprintf("WARNING: loader(8) metadata is missing!\n");
2062#endif
2063
 2064#if NISA > 0
e24dd6e0 2065 elcr_probe();
c8fe38ae
MD
2066 isa_defaultirq();
2067#endif
2068 rand_initialize();
2069
a3dd9120
SZ
2070 /*
2071 * Initialize IRQ mapping
2072 *
2073 * NOTE:
2074 * SHOULD be after elcr_probe()
2075 */
2076 MachIntrABI_ICU.initmap();
a3dd9120 2077 MachIntrABI_IOAPIC.initmap();
a3dd9120 2078
c8fe38ae
MD
2079#ifdef DDB
2080 kdb_init();
2081 if (boothowto & RB_KDB)
2082 Debugger("Boot flags requested debugger");
2083#endif
2084
32d3bd25 2085#if 0 /* JG */
c8fe38ae 2086 finishidentcpu(); /* Final stage of CPU initialization */
2883d2d8
MD
2087 setidt(6, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
2088 setidt(13, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
c8fe38ae
MD
2089#endif
2090 identify_cpu(); /* Final stage of CPU initialization */
20a6d9db 2091 initializecpu(0); /* Initialize CPU registers */
c8fe38ae 2092
c7bf93fb
MD
2093 /*
 2094	 * On modern Intel CPUs, Haswell or later, cpu_idle_hlt=1 is better
 2095	 * because the CPU does significant power management in HLT
2096 * (also suggested is to set sysctl machdep.mwait.CX.idle=AUTODEEP).
2097 *
 2098	 * On modern AMD CPUs, or on any older AMD or Intel CPU,
2099 * cpu_idle_hlt=2 is better because ACPI is needed to reduce power
2100 * consumption.
2101 */
2102 if (cpu_vendor_id == CPU_VENDOR_INTEL &&
2103 CPUID_TO_MODEL(cpu_id) >= 0x3C) { /* Haswell or later */
2104 cpu_idle_hlt = 1;
2105 }
2106
e6dee928
SZ
2107 TUNABLE_INT_FETCH("hw.apic_io_enable", &ioapic_enable); /* for compat */
2108 TUNABLE_INT_FETCH("hw.ioapic_enable", &ioapic_enable);
2109 TUNABLE_INT_FETCH("hw.lapic_enable", &lapic_enable);
b8d5441d 2110 TUNABLE_INT_FETCH("machdep.cpu_idle_hlt", &cpu_idle_hlt);
e6dee928
SZ
2111
2112 /*
08771751 2113 * Some of the virtual machines do not work w/ I/O APIC
e6dee928
SZ
2114 * enabled. If the user does not explicitly enable or
2115 * disable the I/O APIC (ioapic_enable < 0), then we
2116 * disable I/O APIC on all virtual machines.
2117 *
2118 * NOTE:
2119 * This must be done after identify_cpu(), which sets
2120 * 'cpu_feature2'
2121 */
2122 if (ioapic_enable < 0) {
2123 if (cpu_feature2 & CPUID2_VMM)
2124 ioapic_enable = 0;
2125 else
2126 ioapic_enable = 1;
2127 }
2128
c8fe38ae 2129 /* make an initial tss so cpu can get interrupt stack on syscall! */
5b9f6cc4
MD
2130 gd->gd_common_tss.tss_rsp0 =
2131 (register_t)(thread0.td_kstack +
2132 KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb));
c8fe38ae 2133 /* Ensure the stack is aligned to 16 bytes */
2883d2d8 2134 gd->gd_common_tss.tss_rsp0 &= ~(register_t)0xF;
c8fe38ae 2135
093565f2
MD
2136 /* double fault stack */
2137 gd->gd_common_tss.tss_ist1 =
2138 (long)&gd->mi.gd_prvspace->idlestack[
2139 sizeof(gd->mi.gd_prvspace->idlestack)];
c8fe38ae
MD
2140
2141 /* Set the IO permission bitmap (empty due to tss seg limit) */
b2b3ffcd 2142 gd->gd_common_tss.tss_iobase = sizeof(struct x86_64tss);
c8fe38ae
MD
2143
2144 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
2145 gd->gd_tss_gdt = &gdt[GPROC0_SEL];
2146 gd->gd_common_tssd = *gd->gd_tss_gdt;
2147 ltr(gsel_tss);
2148
2149 /* Set up the fast syscall stuff */
2150 msr = rdmsr(MSR_EFER) | EFER_SCE;
2151 wrmsr(MSR_EFER, msr);
2152 wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
2153 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
2154 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
2155 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
2156 wrmsr(MSR_STAR, msr);
3338cc67 2157 wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL);
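	/*
	 * SYSCALL/SYSRET setup in a nutshell: EFER.SCE enables the
	 * instructions, LSTAR/CSTAR hold the 64-bit and 32-bit entry
	 * points, STAR supplies the kernel selectors (bits 32-47) used on
	 * entry and the user selectors (bits 48-63) used by SYSRET, and
	 * SF_MASK lists the RFLAGS bits cleared on entry - notably PSL_I,
	 * so the kernel starts each syscall with interrupts disabled.
	 */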
c8fe38ae
MD
2158
2159 getmemsize(kmdp, physfree);
2160 init_param2(physmem);
2161
 2162	/* now running on new page tables, configured, and u/iom is accessible */
2163
2164 /* Map the message buffer. */
32d3bd25 2165#if 0 /* JG */
c8fe38ae
MD
2166 for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
2167 pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
2168#endif
2169
2170 msgbufinit(msgbufp, MSGBUF_SIZE);
2171
2172
2173 /* transfer to user mode */
2174
2175 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
2176 _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
2177 _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
2178
2179 load_ds(_udatasel);
2180 load_es(_udatasel);
2181 load_fs(_udatasel);
2182
2183 /* setup proc 0's pcb */
2184 thread0.td_pcb->pcb_flags = 0;
c8fe38ae 2185 thread0.td_pcb->pcb_cr3 = KPML4phys;
d8061892 2186 thread0.td_pcb->pcb_ext = NULL;
d1368d1a 2187 lwp0.lwp_md.md_regs = &proc0_tf; /* XXX needed? */
c8fe38ae
MD
2188
2189 /* Location of kernel stack for locore */
2190 return ((u_int64_t)thread0.td_pcb);
2191}
2192
2193/*
 2194 * Initialize machine-dependent portions of the global data structure.
2195 * Note that the global data area and cpu0's idlestack in the private
2196 * data space were allocated in locore.
2197 *
2198 * Note: the idlethread's cpl is 0
2199 *
2200 * WARNING! Called from early boot, 'mycpu' may not work yet.
2201 */
2202void
2203cpu_gdinit(struct mdglobaldata *gd, int cpu)
2204{
2205 if (cpu)
2206 gd->mi.gd_curthread = &gd->mi.gd_idlethread;
2207
2208 lwkt_init_thread(&gd->mi.gd_idlethread,
2209 gd->mi.gd_prvspace->idlestack,
2210 sizeof(gd->mi.gd_prvspace->idlestack),
fdce8919 2211 0, &gd->mi);
c8fe38ae
MD
2212 lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
2213 gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
2214 gd->mi.gd_idlethread.td_sp -= sizeof(void *);
2215 *(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
2216}
2217
4864d541
MD
2218/*
2219 * We only have to check for DMAP bounds, the globaldata space is
2220 * actually part of the kernel_map so we don't have to waste time
2221 * checking CPU_prvspace[*].
2222 */
c8fe38ae
MD
2223int
2224is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
2225{
4864d541 2226#if 0
c8fe38ae
MD
2227 if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
2228 eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
2229 return (TRUE);
2230 }
4864d541 2231#endif
616516c8
MD
2232 if (saddr >= DMAP_MIN_ADDRESS && eaddr <= DMAP_MAX_ADDRESS)
2233 return (TRUE);
c8fe38ae
MD
2234 return (FALSE);
2235}
2236
2237struct globaldata *
2238globaldata_find(int cpu)
2239{
2240 KKASSERT(cpu >= 0 && cpu < ncpus);
4864d541 2241 return(&CPU_prvspace[cpu]->mdglobaldata.mi);
c8fe38ae
MD
2242}
2243
db2ac896
MD
2244/*
2245 * This path should be safe from the SYSRET issue because only stopped threads
2246 * can have their %rip adjusted this way (and all heavy weight thread switches
2247 * clear QUICKREF and thus do not use SYSRET). However, the code path is
 2248 * convoluted, so add a safety check by forcing %rip to be canonical.
2249 */
c8fe38ae
MD
2250int
2251ptrace_set_pc(struct lwp *lp, unsigned long addr)
2252{
db2ac896
MD
2253 if (addr & 0x0000800000000000LLU)
2254 lp->lwp_md.md_regs->tf_rip = addr | 0xFFFF000000000000LLU;
2255 else
2256 lp->lwp_md.md_regs->tf_rip = addr & 0x0000FFFFFFFFFFFFLLU;
c8fe38ae
MD
2257 return (0);
2258}
2259
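/*
 * On x86_64 a "canonical" address has bits 48-63 equal to bit 47.  The
 * masking above sign-extends user-supplied values, e.g.
 * 0xffff800000000000 stays kernel-canonical and 0x00007fffffffffff
 * stays user-canonical; anything in between is non-canonical and would
 * fault if loaded into %rip (the SYSRET issue mentioned above).
 */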
2260int
2261ptrace_single_step(struct lwp *lp)
2262{
5b9f6cc4 2263 lp->lwp_md.md_regs->tf_rflags |= PSL_T;
c8fe38ae
MD
2264 return (0);
2265}
2266
2267int
2268fill_regs(struct lwp *lp, struct reg *regs)
2269{
c8fe38ae
MD
2270 struct trapframe *tp;
2271
d64d3805
MD
2272 if ((tp = lp->lwp_md.md_regs) == NULL)
2273 return EINVAL;
5b9f6cc4 2274 bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));
c8fe38ae
MD
2275 return (0);
2276}
2277
2278int
2279set_regs(struct lwp *lp, struct reg *regs)
2280{
c8fe38ae
MD
2281 struct trapframe *tp;
2282
2283 tp = lp->lwp_md.md_regs;
5b9f6cc4 2284 if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
c8fe38ae
MD
2285 !CS_SECURE(regs->r_cs))
2286 return (EINVAL);
5b9f6cc4 2287 bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
f2081646 2288 clear_quickret();
c8fe38ae
MD
2289 return (0);
2290}
2291
c8fe38ae
MD
2292static void
2293fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
2294{
2295 struct env87 *penv_87 = &sv_87->sv_env;
2296 struct envxmm *penv_xmm = &sv_xmm->sv_env;
2297 int i;
2298
2299 /* FPU control/status */
2300 penv_87->en_cw = penv_xmm->en_cw;
2301 penv_87->en_sw = penv_xmm->en_sw;
2302 penv_87->en_tw = penv_xmm->en_tw;
2303 penv_87->en_fip = penv_xmm->en_fip;
2304 penv_87->en_fcs = penv_xmm->en_fcs;
2305 penv_87->en_opcode = penv_xmm->en_opcode;
2306 penv_87->en_foo = penv_xmm->en_foo;
2307 penv_87->en_fos = penv_xmm->en_fos;
2308
2309 /* FPU registers */
2310 for (i = 0; i < 8; ++i)
2311 sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
c8fe38ae
MD
2312}
2313
2314static void
2315set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
2316{
2317 struct env87 *penv_87 = &sv_87->sv_env;
2318 struct envxmm *penv_xmm = &sv_xmm->sv_env;
2319 int i;
2320
2321 /* FPU control/status */
2322 penv_xmm->en_cw = penv_87->en_cw;
2323 penv_xmm->en_sw = penv_87->en_sw;
2324 penv_xmm->en_tw = penv_87->en_tw;
2325 penv_xmm->en_fip = penv_87->en_fip;
2326 penv_xmm->en_fcs = penv_87->en_fcs;
2327 penv_xmm->en_opcode = penv_87->en_opcode;
2328 penv_xmm->en_foo = penv_87->en_foo;
2329 penv_xmm->en_fos = penv_87->en_fos;
2330
2331 /* FPU registers */
2332 for (i = 0; i < 8; ++i)
2333 sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
c8fe38ae 2334}
c8fe38ae
MD
2335
2336int
2337fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
2338{
d64d3805
MD
2339 if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL)
2340 return EINVAL;
c8fe38ae
MD
2341 if (cpu_fxsr) {
2342 fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
2343 (struct save87 *)fpregs);
2344 return (0);
2345 }
c8fe38ae
MD
2346 bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
2347 return (0);
2348}
2349
2350int
2351set_fpregs(struct lwp *lp, struct fpreg *fpregs)
2352{
c8fe38ae
MD
2353 if (cpu_fxsr) {
2354 set_fpregs_xmm((struct save87 *)fpregs,
2355 &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
2356 return (0);
2357 }
c8fe38ae
MD
2358 bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
2359 return (0);
2360}
2361
2362int
2363fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
2364{
d64d3805
MD
2365 struct pcb *pcb;
2366
c8fe38ae 2367 if (lp == NULL) {
0855a2af
JG
2368 dbregs->dr[0] = rdr0();
2369 dbregs->dr[1] = rdr1();
2370 dbregs->dr[2] = rdr2();
2371 dbregs->dr[3] = rdr3();
2372 dbregs->dr[4] = rdr4();
2373 dbregs->dr[5] = rdr5();
2374 dbregs->dr[6] = rdr6();
2375 dbregs->dr[7] = rdr7();
d64d3805 2376 return (0);
c8fe38ae 2377 }
d64d3805
MD
2378 if (lp->lwp_thread == NULL || (pcb = lp->lwp_thread->td_pcb) == NULL)
2379 return EINVAL;
2380 dbregs->dr[0] = pcb->pcb_dr0;
2381 dbregs->dr[1] = pcb->pcb_dr1;
2382 dbregs->dr[2] = pcb->pcb_dr2;
2383 dbregs->dr[3] = pcb->pcb_dr3;
2384 dbregs->dr[4] = 0;
2385 dbregs->dr[5] = 0;
2386 dbregs->dr[6] = pcb->pcb_dr6;
2387 dbregs->dr[7] = pcb->pcb_dr7;
c8fe38ae
MD
2388 return (0);
2389}
2390
2391int
2392set_dbregs(struct lwp *lp, struct dbreg *dbregs)
2393{
2394 if (lp == NULL) {
0855a2af
JG
2395 load_dr0(dbregs->dr[0]);
2396 load_dr1(dbregs->dr[1]);
2397 load_dr2(dbregs->dr[2]);
2398 load_dr3(dbregs->dr[3]);
2399 load_dr4(dbregs->dr[4]);
2400 load_dr5(dbregs->dr[5]);
2401 load_dr6(dbregs->dr[6]);
2402 load_dr7(dbregs->dr[7]);
c8fe38ae
MD
2403 } else {
2404 struct pcb *pcb;
2405 struct ucred *ucred;
2406 int i;
0855a2af 2407 uint64_t mask1, mask2;
c8fe38ae
MD
2408
2409 /*
2410 * Don't let an illegal value for dr7 get set. Specifically,
2411 * check for undefined settings. Setting these bit patterns
 2412		 * results in undefined behaviour and can lead to an unexpected
2413 * TRCTRAP.
2414 */
0855a2af
JG
2415 /* JG this loop looks unreadable */
2416 /* Check 4 2-bit fields for invalid patterns.
2417 * These fields are R/Wi, for i = 0..3
2418 */
2419 /* Is 10 in LENi allowed when running in compatibility mode? */
 2420		/* Pattern 10 in R/Wi might be used to indicate
 2421		 * a breakpoint on I/O. Further analysis should be
 2422		 * carried out to decide if it is safe and useful to
 2423		 * provide access to that capability.
2424 */
2425 for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4;
2426 i++, mask1 <<= 4, mask2 <<= 4)
2427 if ((dbregs->dr[7] & mask1) == mask2)
c8fe38ae 2428 return (EINVAL);
c8fe38ae
MD
2429
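		/*
		 * In dr7 the R/Wi fields live at bits 16-17, 20-21, 24-25
		 * and 28-29.  mask1 isolates one two-bit field per
		 * iteration and mask2 is the binary pattern 10, i.e.
		 * "break on I/O reads or writes", which is only defined
		 * with CR4.DE set and is therefore rejected here.
		 */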
2430 pcb = lp->lwp_thread->td_pcb;
2431 ucred = lp->lwp_proc->p_ucred;
2432
2433 /*
2434 * Don't let a process set a breakpoint that is not within the
2435 * process's address space. If a process could do this, it
2436 * could halt the system by setting a breakpoint in the kernel
2437 * (if ddb was enabled). Thus, we need to check to make sure
2438 * that no breakpoints are being enabled for addresses outside
2439 * process's address space, unless, perhaps, we were called by
2440 * uid 0.
2441 *
2442 * XXX - what about when the watched area of the user's
2443 * address space is written into from within the kernel
2444 * ... wouldn't that still cause a breakpoint to be generated
2445 * from within kernel mode?
2446 */
2447
895c1f85 2448 if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) {
0855a2af 2449 if (dbregs->dr[7] & 0x3) {
c8fe38ae 2450 /* dr0 is enabled */
0855a2af 2451 if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
c8fe38ae
MD
2452 return (EINVAL);
2453 }
2454
0855a2af 2455 if (dbregs->dr[7] & (0x3<<2)) {
c8fe38ae 2456 /* dr1 is enabled */
0855a2af 2457 if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
c8fe38ae
MD
2458 return (EINVAL);
2459 }
2460
0855a2af 2461 if (dbregs->dr[7] & (0x3<<4)) {
c8fe38ae 2462 /* dr2 is enabled */
0855a2af 2463 if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
c8fe38ae
MD
2464 return (EINVAL);
2465 }
2466
0855a2af 2467 if (dbregs->dr[7] & (0x3<<6)) {
c8fe38ae 2468 /* dr3 is enabled */
0855a2af 2469 if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
c8fe38ae
MD
2470 return (EINVAL);
2471 }
c8fe38ae
MD
2472 }
2473
0855a2af
JG
2474 pcb->pcb_dr0 = dbregs->dr[0];
2475 pcb->pcb_dr1 = dbregs->dr[1];
2476 pcb->pcb_dr2 = dbregs->dr[2];
2477 pcb->pcb_dr3 = dbregs->dr[3];
2478 pcb->pcb_dr6 = dbregs->dr[6];
2479 pcb->pcb_dr7 = dbregs->dr[7];
c8fe38ae
MD
2480
2481 pcb->pcb_flags |= PCB_DBREGS;
2482 }
2483
2484 return (0);
2485}
2486
2487/*
2488 * Return > 0 if a hardware breakpoint has been hit, and the
2489 * breakpoint was in user space. Return 0, otherwise.
2490 */
2491int
2492user_dbreg_trap(void)
2493{
0855a2af
JG
2494 u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */
2495 u_int64_t bp; /* breakpoint bits extracted from dr6 */
c8fe38ae
MD
2496 int nbp; /* number of breakpoints that triggered */
2497 caddr_t addr[4]; /* breakpoint addresses */
2498 int i;
2499
2500 dr7 = rdr7();
0855a2af 2501 if ((dr7 & 0xff) == 0) {
c8fe38ae
MD
2502 /*
2503 * all GE and LE bits in the dr7 register are zero,
2504 * thus the trap couldn't have been caused by the
2505 * hardware debug registers
2506 */
2507 return 0;
2508 }
2509
2510 nbp = 0;
2511 dr6 = rdr6();
0855a2af 2512 bp = dr6 & 0xf;
c8fe38ae 2513
0855a2af 2514 if (bp == 0) {
c8fe38ae
MD
2515 /*
 2516		 * None of the breakpoint bits are set, meaning this
2517 * trap was not caused by any of the debug registers
2518 */
2519 return 0;
2520 }
2521
2522 /*
 2523	 * At least one of the breakpoints was hit; check to see
2524 * which ones and if any of them are user space addresses
2525 */
2526
2527 if (bp & 0x01) {
2528 addr[nbp++] = (caddr_t)rdr0();
2529 }
2530 if (bp & 0x02) {
2531 addr[nbp++] = (caddr_t)rdr1();
2532 }
2533 if (bp & 0x04) {
2534 addr[nbp++] = (caddr_t)rdr2();
2535 }
2536 if (bp & 0x08) {
2537 addr[nbp++] = (caddr_t)rdr3();
2538 }
2539
2540 for (i=0; i<nbp; i++) {
2541 if (addr[i] <
2542 (caddr_t)VM_MAX_USER_ADDRESS) {
2543 /*
2544 * addr[i] is in user space
2545 */
2546 return nbp;
2547 }
2548 }
2549
2550 /*
2551 * None of the breakpoints are in user space.
2552 */
2553 return 0;
2554}
2555
2556
2557#ifndef DDB
2558void
2559Debugger(const char *msg)
2560{
2561 kprintf("Debugger(\"%s\") called.\n", msg);
2562}
2563#endif /* no DDB */
2564
2565#ifdef DDB
2566
2567/*
2568 * Provide inb() and outb() as functions. They are normally only
2569 * available as macros calling inlined functions, thus cannot be
2570 * called inside DDB.
2571 *
2572 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
2573 */
2574
2575#undef inb
2576#undef outb
2577
2578/* silence compiler warnings */
2579u_char inb(u_int);
2580void outb(u_int, u_char);
2581
2582u_char
2583inb(u_int port)
2584{
2585 u_char data;
2586 /*
2587 * We use %%dx and not %1 here because i/o is done at %dx and not at
2588 * %edx, while gcc generates inferior code (movw instead of movl)
2589 * if we tell it to load (u_short) port.
2590 */
2591 __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
2592 return (data);
2593}
2594
2595void
2596outb(u_int port, u_char data)
2597{
2598 u_char al;
2599 /*
2600 * Use an unnecessary assignment to help gcc's register allocator.
 2601	 * This makes a large difference for gcc-1.40 and a tiny difference
2602 * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for
2603 * best results. gcc-2.6.0 can't handle this.
2604 */
2605 al = data;
2606 __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
2607}
2608
2609#endif /* DDB */
2610
2611
2612
c8fe38ae
MD
2613/*
2614 * initialize all the SMP locks
2615 */
2616
 2617/* critical region when masking or unmasking interrupts */
2618struct spinlock_deprecated imen_spinlock;
2619
c8fe38ae
MD
2620/* critical region for old style disable_intr/enable_intr */
2621struct spinlock_deprecated mpintr_spinlock;
2622
2623/* critical region around INTR() routines */
2624struct spinlock_deprecated intr_spinlock;
2625
2626/* lock region used by kernel profiling */
2627struct spinlock_deprecated mcount_spinlock;
2628
2629/* locks com (tty) data/hardware accesses: a FASTINTR() */
2630struct spinlock_deprecated com_spinlock;
2631
c8fe38ae
MD
2632/* lock regions around the clock hardware */
2633struct spinlock_deprecated clock_spinlock;
2634
c8fe38ae
MD
2635static void
2636init_locks(void)
2637{
2638 /*
b5d16701 2639 * Get the initial mplock with a count of 1 for the BSP.
c8fe38ae
MD
2640 * This uses a LOGICAL cpu ID, ie BSP == 0.
2641 */
c8fe38ae 2642 cpu_get_initial_mplock();
c8fe38ae
MD
2643 /* DEPRECATED */
2644 spin_lock_init(&mcount_spinlock);
c8fe38ae
MD
2645 spin_lock_init(&intr_spinlock);
2646 spin_lock_init(&mpintr_spinlock);
2647 spin_lock_init(&imen_spinlock);
c8fe38ae
MD
2648 spin_lock_init(&com_spinlock);
2649 spin_lock_init(&clock_spinlock);
c8fe38ae
MD
2650
2651 /* our token pool needs to work early */
2652 lwkt_token_pool_init();
2653}
2654
5764e125
SZ
2655boolean_t
2656cpu_mwait_hint_valid(uint32_t hint)
2657{
5764e125
SZ
2658 int cx_idx, sub;
2659
2660 cx_idx = MWAIT_EAX_TO_CX(hint);
5764e125
SZ
2661 if (cx_idx >= CPU_MWAIT_CX_MAX)
2662 return FALSE;
2663
29b33800
SZ
2664 sub = MWAIT_EAX_TO_CX_SUB(hint);
2665 if (sub >= cpu_mwait_cx_info[cx_idx].subcnt)
5764e125
SZ
2666 return FALSE;
2667
2668 return TRUE;
2669}
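/*
 * A rough sketch of the hint layout assumed here (see the MWAIT_EAX_*
 * macros for the authoritative encoding): the upper nibble of the EAX
 * hint selects the target C-state and the lower nibble the sub-state,
 * so a hint accepted by this function always names a sub-state that
 * CPUID reported as implemented.
 */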
29b33800 2670
22d2370f
SZ
2671void
2672cpu_mwait_cx_no_bmsts(void)
2673{
2674 atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_STS);
2675}
2676
e1e344f5
SZ
2677void
2678cpu_mwait_cx_no_bmarb(void)
2679{
2680 atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_ARB);
2681}
2682
29b33800 2683static int
ffa2dd72 2684cpu_mwait_cx_hint2name(int hint, char *name, int namelen, boolean_t allow_auto)
29b33800 2685{
ffa2dd72 2686 int old_cx_idx, sub = 0;
29b33800 2687
1786faf9
SZ
2688 if (hint >= 0) {
2689 old_cx_idx = MWAIT_EAX_TO_CX(hint);
2690 sub = MWAIT_EAX_TO_CX_SUB(hint);
2691 } else if (hint == CPU_MWAIT_HINT_AUTO) {
2692 old_cx_idx = allow_auto ? CPU_MWAIT_C2 : CPU_MWAIT_CX_MAX;
2693 } else if (hint == CPU_MWAIT_HINT_AUTODEEP) {
2694 old_cx_idx = allow_auto ? CPU_MWAIT_C3 : CPU_MWAIT_CX_MAX;
2695 } else {
2696 old_cx_idx = CPU_MWAIT_CX_MAX;
2697 }
29b33800 2698
ffa2dd72
SZ
2699 if (!CPU_MWAIT_HAS_CX)
2700 strlcpy(name, "NONE", namelen);
1786faf9 2701 else if (allow_auto && hint == CPU_MWAIT_HINT_AUTO)
ffa2dd72 2702 strlcpy(name, "AUTO", namelen);
1786faf9 2703 else if (allow_auto && hint == CPU_MWAIT_HINT_AUTODEEP)
ffa2dd72 2704 strlcpy(name, "AUTODEEP", namelen);
1786faf9
SZ
2705 else if (old_cx_idx >= CPU_MWAIT_CX_MAX ||
2706 sub >= cpu_mwait_cx_info[old_cx_idx].subcnt)
ffa2dd72 2707 strlcpy(name, "INVALID", namelen);
29b33800 2708 else
ffa2dd72 2709 ksnprintf(name, namelen, "C%d/%d", old_cx_idx, sub);
29b33800 2710
ffa2dd72
SZ
2711 return old_cx_idx;
2712}
29b33800 2713
ffa2dd72
SZ
2714static int
2715cpu_mwait_cx_name2hint(char *name, int *hint0, boolean_t allow_auto)
2716{
2717 int cx_idx, sub, hint;
2718 char *ptr, *start;
271a3286 2719
1786faf9
SZ
2720 if (allow_auto && strcmp(name, "AUTO") == 0) {
2721 hint = CPU_MWAIT_HINT_AUTO;
2722 cx_idx = CPU_MWAIT_C2;
2723 goto done;
2724 }
2725 if (allow_auto && strcmp(name, "AUTODEEP") == 0) {
2726 hint = CPU_MWAIT_HINT_AUTODEEP;
2727 cx_idx = CPU_MWAIT_C3;
1786faf9
SZ
2728 goto done;
2729 }
2730
29b33800 2731 if (strlen(name) < 4 || toupper(name[0]) != 'C')
ffa2dd72 2732 return -1;
29b33800
SZ
2733 start = &name[1];
2734 ptr = NULL;
2735
2736 cx_idx = strtol(start, &ptr, 10);
2737 if (ptr == start || *ptr != '/')
ffa2dd72 2738 return -1;
29b33800 2739 if (cx_idx < 0 || cx_idx >= CPU_MWAIT_CX_MAX)
ffa2dd72 2740 return -1;
29b33800
SZ
2741
2742 start = ptr + 1;
2743 ptr = NULL;
2744
2745 sub = strtol(start, &ptr, 10);
2746 if (*ptr != '\0')
ffa2dd72 2747 return -1;
29b33800 2748 if (sub < 0 || sub >= cpu_mwait_cx_info[cx_idx].subcnt)
ffa2dd72 2749 return -1;
29b33800 2750
1786faf9
SZ
2751 hint = MWAIT_EAX_HINT(cx_idx, sub);
2752done:
ffa2dd72
SZ
2753 *hint0 = hint;
2754 return cx_idx;
2755}
2756
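/*
 * The accepted spellings are therefore "AUTO" and "AUTODEEP" (when
 * allow_auto is TRUE) and "C<idx>/<sub>", e.g. "C1/0" or "C3/0"; the
 * numeric form is converted back into an MWAIT hint with
 * MWAIT_EAX_HINT().
 */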
2757static int
2758cpu_mwait_cx_transit(int old_cx_idx, int cx_idx)
2759{
22d2370f
SZ
2760 if (cx_idx >= CPU_MWAIT_C3 && cpu_mwait_c3_preamble)
2761 return EOPNOTSUPP;
1786faf9 2762 if (old_cx_idx < CPU_MWAIT_C3 && cx_idx >= CPU_MWAIT_C3) {
ffa2dd72
SZ
2763 int error;
2764
271a3286
SZ
2765 error = cputimer_intr_powersave_addreq();
2766 if (error)
2767 return error;
1786faf9 2768 } else if (old_cx_idx >= CPU_MWAIT_C3 && cx_idx < CPU_MWAIT_C3) {
271a3286
SZ
2769 cputimer_intr_powersave_remreq();
2770 }
ffa2dd72
SZ
2771 return 0;
2772}
2773
2774static int
2775cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, int *hint0,
2776 boolean_t allow_auto)
2777{
2778 int error, cx_idx, old_cx_idx, hint;
2779 char name[CPU_MWAIT_CX_NAMELEN];
2780
2781 hint = *hint0;
2782 old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name),
2783 allow_auto);
2784
2785 error = sysctl_handle_string(oidp, name, sizeof(name), req);
2786 if (error != 0 || req->newptr == NULL)
2787 return error;
2788
2789 if (!CPU_MWAIT_HAS_CX)
2790 return EOPNOTSUPP;
2791
2792 cx_idx = cpu_mwait_cx_name2hint(name, &hint, allow_auto);
2793 if (cx_idx < 0)
2794 return EINVAL;
2795
2796 error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
2797 if (error)
2798 return error;
271a3286 2799
1786faf9 2800 *hint0 = hint;
29b33800
SZ
2801 return 0;
2802}
2803
ffa2dd72
SZ
2804static int
2805cpu_mwait_cx_setname(struct cpu_idle_stat *stat, const char *cx_name)
2806{
2807 int error, cx_idx, old_cx_idx, hint;
2808 char name[CPU_MWAIT_CX_NAMELEN];
2809
2810 KASSERT(CPU_MWAIT_HAS_CX, ("cpu does not support mwait CX extension"));
2811
2812 hint = stat->hint;
2813 old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);
2814
2815 strlcpy(name, cx_name, sizeof(name));
2816 cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
2817 if (cx_idx < 0)
2818 return EINVAL;
2819
2820 error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
2821 if (error)
2822 return error;
2823
2824 stat->hint = hint;
2825 return 0;
2826}
2827
29b33800
SZ
2828static int
2829cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS)
2830{
ffa2dd72
SZ
2831 int hint = cpu_mwait_halt_global;
2832 int error, cx_idx, cpu;
2833 char name[CPU_MWAIT_CX_NAMELEN], cx_name[CPU_MWAIT_CX_NAMELEN];
2834
2835 cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);
2836
2837 error = sysctl_handle_string(oidp, name, sizeof(name), req);
2838 if (error != 0 || req->newptr == NULL)
2839 return error;
2840
2841 if (!CPU_MWAIT_HAS_CX)
2842 return EOPNOTSUPP;
2843
2844 /* Save name for later per-cpu CX configuration */
2845 strlcpy(cx_name, name, sizeof(cx_name));
2846
2847 cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
2848 if (cx_idx < 0)
2849 return EINVAL;
2850
2851 /* Change per-cpu CX configuration */
2852 for (cpu = 0; cpu < ncpus; ++cpu) {
2853 error = cpu_mwait_cx_setname(&cpu_idle_stats[cpu], cx_name);
2854 if (error)
2855 return error;
2856 }
2857
2858 cpu_mwait_halt_global = hint;
2859 return 0;
2860}
2861
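/*
 * This appears to be the handler behind the machdep.mwait.CX.idle
 * sysctl mentioned in the cpu_idle_hlt comment earlier (the SYSCTL
 * declaration itself is outside this excerpt), so a request such as
 *
 *	sysctl machdep.mwait.CX.idle=AUTODEEP
 *
 * re-validates the C-state name for every cpu_idle_stats[] entry and
 * only then updates the global hint.
 */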
2862static int
2863cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
2864{
2865 struct cpu_idle_stat *stat = arg1;
271a3286
SZ
2866 int error;
2867
271a3286 2868 error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
ffa2dd72 2869 &stat->hint, TRUE);
271a3286 2870 return error;
29b33800
SZ
2871}
2872
2873static int
2874cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS)
2875{
271a3286
SZ
2876 int error;
2877
271a3286 2878 error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
1786faf9 2879 &cpu_mwait_spin, FALSE);
271a3286 2880 return error;
29b33800 2881}