gitweb.dragonflybsd.org Git - dragonfly.git/blame

Commit	Line	Data
8c10bfcf MD	1	/*
8c10bfcf MD	2	* Copyright (c) 2003,2004 The DragonFly Project. All rights reserved.
bbf175be	3	*
8c10bfcf MD	4	* This code is derived from software contributed to The DragonFly Project
8c10bfcf MD	5	* by Matthew Dillon <dillon@backplane.com>
bbf175be	6	*
8c10bfcf MD	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
bbf175be	10	*
8c10bfcf MD	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
bbf175be	20	*
8c10bfcf MD	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
bbf175be	33	*
984263bc MD	34	* Copyright (c) 1997, 1998 Poul-Henning Kamp <phk@FreeBSD.org>
	35	* Copyright (c) 1982, 1986, 1991, 1993
	36	* The Regents of the University of California. All rights reserved.
	37	* (c) UNIX System Laboratories, Inc.
	38	* All or some portions of this file are derived from material licensed
	39	* to the University of California by American Telephone and Telegraph
	40	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
	41	* the permission of UNIX System Laboratories, Inc.
	42	*
	43	* Redistribution and use in source and binary forms, with or without
	44	* modification, are permitted provided that the following conditions
	45	* are met:
	46	* 1. Redistributions of source code must retain the above copyright
	47	* notice, this list of conditions and the following disclaimer.
	48	* 2. Redistributions in binary form must reproduce the above copyright
	49	* notice, this list of conditions and the following disclaimer in the
	50	* documentation and/or other materials provided with the distribution.
dc71b7ab	51	* 3. Neither the name of the University nor the names of its contributors
984263bc MD	52	* may be used to endorse or promote products derived from this software
	53	* without specific prior written permission.
	54	*
	55	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	56	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	57	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	58	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	59	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	60	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	61	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	62	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	63	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	64	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	65	* SUCH DAMAGE.
	66	*
	67	* @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
	68	* $FreeBSD: src/sys/kern/kern_clock.c,v 1.105.2.10 2002/10/17 13:19:40 maxim Exp $
	69	*/
	70
	71	#include "opt_ntp.h"
07522099	72	#include "opt_pctrack.h"
984263bc MD	73
	74	#include <sys/param.h>
	75	#include <sys/systm.h>
984263bc MD	76	#include <sys/callout.h>
984263bc MD	77	#include <sys/kernel.h>
f5d21610	78	#include <sys/kinfo.h>
984263bc MD	79	#include <sys/proc.h>
984263bc MD	80	#include <sys/malloc.h>
d70eef28	81	#include <sys/resource.h>
984263bc MD	82	#include <sys/resourcevar.h>
984263bc MD	83	#include <sys/signalvar.h>
2b3f93ea	84	#include <sys/caps.h>
984263bc MD	85	#include <sys/timex.h>
984263bc MD	86	#include <sys/timepps.h>
0adbcbd6	87	#include <sys/upmap.h>
984263bc	88	#include <sys/lock.h>
77bc82e1 MD	89	#include <sys/sysctl.h>
77bc82e1 MD	90	#include <sys/kcollect.h>
8f7f5bd5	91	#include <sys/exislock.h>
fac0eb3c	92	#include <sys/exislock2.h>
77bc82e1 MD	93
77bc82e1 MD	94	#include <vm/vm.h>
984263bc MD	95	#include <vm/pmap.h>
984263bc MD	96	#include <vm/vm_map.h>
5ffd1608	97	#include <vm/vm_extern.h>
684a93c4	98
2689779e	99	#include <sys/thread2.h>
a55bb12d	100	#include <sys/spinlock2.h>
984263bc MD	101
	102	#include <machine/cpu.h>
	103	#include <machine/limits.h>
	104	#include <machine/smp.h>
d2412a2e MD	105	#include <machine/cpufunc.h>
	106	#include <machine/specialreg.h>
	107	#include <machine/clock.h>
984263bc	108
#ifdef DEBUG_PCTRACK
/* PC-tracking helper; only declared when DEBUG_PCTRACK is configured. */
static void do_pctrack(struct intrframe *frame, int which);
#endif

/*
 * initclocks() is registered as a SYSINIT at SI_BOOT2_CLOCKS,
 * SI_ORDER_FIRST, with a NULL argument.
 */
static void initclocks (void *dummy);
SYSINIT(clocks, SI_BOOT2_CLOCKS, SI_ORDER_FIRST, initclocks, NULL);
984263bc	115
6ad39cae MD	116	/*
6ad39cae MD	117	* Some of these don't belong here, but it's easiest to concentrate them.
9eea7f0c	118	* Note that cpu_time counts in microseconds, but most userland programs
6ad39cae MD	119	* just compare relative times against the total by delta.
6ad39cae MD	120	*/
/* Per-cpu time accounting, exported by the kern.cputime/cp_time sysctls. */
struct kinfo_cputime cputime_percpu[MAXCPU];
#ifdef DEBUG_PCTRACK
struct kinfo_pcheader cputime_pcheader = { PCTRACK_SIZE, PCTRACK_ARYSIZE };
struct kinfo_pctrack cputime_pctrack[MAXCPU][PCTRACK_SIZE];
#endif

/*
 * sniff_enable: when non-zero, sysctl_cputime() issues a sniff request
 *		 after reporting (privileged callers only).
 * sniff_target: negative selects smp_sniff(); otherwise it is the cpu
 *		 index handed to cpu_sniff() when it is below ncpus.
 * clock_debug2: debug knob; not referenced in this part of the file.
 */
__read_mostly static int sniff_enable = 1;
__read_mostly static int sniff_target = -1;
__read_mostly static int clock_debug2 = 0;
SYSCTL_INT(_kern, OID_AUTO, sniff_enable, CTLFLAG_RW, &sniff_enable, 0 , "");
SYSCTL_INT(_kern, OID_AUTO, sniff_target, CTLFLAG_RW, &sniff_target, 0 , "");
SYSCTL_INT(_debug, OID_AUTO, clock_debug2, CTLFLAG_RW, &clock_debug2, 0 , "");

/*
 * Bumped by hardclock() on cpu #0 each time every cpu is found with
 * gd_exisarmed set.
 */
__read_mostly long pseudo_ticks = 1; /* existential timed locks */
8f7f5bd5 MD	135
9eea7f0c HP	136	static int
	137	sysctl_cputime(SYSCTL_HANDLER_ARGS)
	138	{
	139	int cpu, error = 0;
82f8b550	140	int root_error;
9eea7f0c	141	size_t size = sizeof(struct kinfo_cputime);
e32d3244	142	struct kinfo_cputime tmp;
9eea7f0c	143
82f8b550 MD	144	/*
	145	* NOTE: For security reasons, only root can sniff %rip
	146	*/
2b3f93ea	147	root_error = caps_priv_check_self(SYSCAP_RESTRICTEDROOT);
82f8b550	148
9eea7f0c	149	for (cpu = 0; cpu < ncpus; ++cpu) {
e32d3244	150	tmp = cputime_percpu[cpu];
82f8b550 MD	151	if (root_error == 0) {
	152	tmp.cp_sample_pc =
	153	(int64_t)globaldata_find(cpu)->gd_sample_pc;
	154	tmp.cp_sample_sp =
	155	(int64_t)globaldata_find(cpu)->gd_sample_sp;
	156	}
e32d3244	157	if ((error = SYSCTL_OUT(req, &tmp, size)) != 0)
9eea7f0c HP	158	break;
9eea7f0c HP	159	}
82f8b550	160
67534613 MD	161	if (root_error == 0) {
	162	if (sniff_enable) {
	163	int n = sniff_target;
	164	if (n < 0)
	165	smp_sniff();
	166	else if (n < ncpus)
	167	cpu_sniff(n);
	168	}
	169	}
984263bc	170
9eea7f0c HP	171	return (error);
	172	}
	173	SYSCTL_PROC(_kern, OID_AUTO, cputime, (CTLTYPE_OPAQUE\|CTLFLAG_RD), 0, 0,
	174	sysctl_cputime, "S,kinfo_cputime", "CPU time statistics");
984263bc	175
06636a8e AHJ	176	static int
	177	sysctl_cp_time(SYSCTL_HANDLER_ARGS)
	178	{
5c13d0f3	179	long cpu_states[CPUSTATES] = {0};
06636a8e AHJ	180	int cpu, error = 0;
	181	size_t size = sizeof(cpu_states);
	182
	183	for (cpu = 0; cpu < ncpus; ++cpu) {
d70eef28 SG	184	cpu_states[CP_USER] += cputime_percpu[cpu].cp_user;
	185	cpu_states[CP_NICE] += cputime_percpu[cpu].cp_nice;
	186	cpu_states[CP_SYS] += cputime_percpu[cpu].cp_sys;
	187	cpu_states[CP_INTR] += cputime_percpu[cpu].cp_intr;
	188	cpu_states[CP_IDLE] += cputime_percpu[cpu].cp_idle;
06636a8e AHJ	189	}
	190
	191	error = SYSCTL_OUT(req, cpu_states, size);
	192
	193	return (error);
	194	}
	195
	196	SYSCTL_PROC(_kern, OID_AUTO, cp_time, (CTLTYPE_LONG\|CTLFLAG_RD), 0, 0,
4276b194 SW	197	sysctl_cp_time, "LU", "CPU time statistics");
	198
	199	static int
	200	sysctl_cp_times(SYSCTL_HANDLER_ARGS)
	201	{
	202	long cpu_states[CPUSTATES] = {0};
	203	int cpu, error;
	204	size_t size = sizeof(cpu_states);
	205
	206	for (error = 0, cpu = 0; error == 0 && cpu < ncpus; ++cpu) {
	207	cpu_states[CP_USER] = cputime_percpu[cpu].cp_user;
	208	cpu_states[CP_NICE] = cputime_percpu[cpu].cp_nice;
	209	cpu_states[CP_SYS] = cputime_percpu[cpu].cp_sys;
	210	cpu_states[CP_INTR] = cputime_percpu[cpu].cp_intr;
	211	cpu_states[CP_IDLE] = cputime_percpu[cpu].cp_idle;
	212	error = SYSCTL_OUT(req, cpu_states, size);
	213	}
	214
	215	return (error);
	216	}
	217
	218	SYSCTL_PROC(_kern, OID_AUTO, cp_times, (CTLTYPE_LONG\|CTLFLAG_RD), 0, 0,
	219	sysctl_cp_times, "LU", "per-CPU time statistics");
06636a8e	220
/*
 * boottime is used to calculate the 'real' uptime.  Do not confuse this with
 * microuptime().  microtime() is not drift compensated.  The real uptime
 * with compensation is nanotime() - boottime.  boottime is recalculated
 * whenever the real time is set based on the compensated elapsed time
 * in seconds (gd->gd_time_seconds).
 *
 * The gd_time_seconds and gd_cpuclock_base fields remain fairly monotonic.
 * Slight adjustments to gd_cpuclock_base are made to phase-lock it to
 * the real time.
 *
 * WARNING! time_second can backstep on time corrections.  Also, unlike
 *	    time_second, time_uptime is not a "real" time_t (seconds
 *	    since the Epoch) but seconds since booting.
 *
 * ticktime0/ticktime2 hold the nanotime() value captured by hardclock()
 * on cpu #0.  ticktime_update is incremented once before and once after
 * each store; the store goes to ticktime2 when bit 1 of the counter is
 * set after the first increment, otherwise to ticktime0.
 */
__read_mostly struct timespec boottime; /* boot time (realtime) for ref only */
__read_mostly struct timespec ticktime0;/* updated every tick */
__read_mostly struct timespec ticktime2;/* updated every tick */
__read_mostly int ticktime_update;
__read_mostly time_t time_second; /* read-only 'passive' rt in seconds */
__read_mostly time_t time_uptime; /* read-only 'passive' ut in seconds */
984263bc	242
5eb5a6bc MD	243	/*
5eb5a6bc MD	244	* basetime is used to calculate the compensated real time of day. The
bbf175be	245	* basetime can be modified on a per-tick basis by the adjtime(),
5eb5a6bc MD	246	* ntp_adjtime(), and sysctl-based time correction APIs.
	247	*
	248	* Note that frequency corrections can also be made by adjusting
	249	* gd_cpuclock_base.
	250	*
	251	* basetime is a tail-chasing FIFO, updated only by cpu #0. The FIFO is
	252	* used on both SMP and UP systems to avoid MP races between cpu's and
	253	* interrupt races on UP systems.
	254	*/
/*
 * Snapshot of cpu #0's time base, published through the same FIFO index
 * as basetime[] so other cpus can copy a consistent pair.
 */
struct hardtime {
	__uint32_t time_second;		/* copy of cpu #0 gd_time_seconds */
	sysclock_t cpuclock_base;	/* copy of cpu #0 gd_cpuclock_base */
};

/*
 * The FIFO index is wrapped with BASETIME_ARYMASK, so BASETIME_ARYSIZE
 * has to remain a power of 2.
 */
#define BASETIME_ARYSIZE 16
#define BASETIME_ARYMASK (BASETIME_ARYSIZE - 1)
static struct timespec basetime[BASETIME_ARYSIZE];
static struct hardtime hardtime[BASETIME_ARYSIZE];
static volatile int basetime_index;	/* slot currently live for readers */
	265
	266	static int
	267	sysctl_get_basetime(SYSCTL_HANDLER_ARGS)
	268	{
	269	struct timespec *bt;
	270	int error;
35238fa5	271	int index;
5eb5a6bc	272
35238fa5 MD	273	/*
	274	* Because basetime data and index may be updated by another cpu,
	275	* a load fence is required to ensure that the data we read has
	276	* not been speculatively read relative to a possibly updated index.
	277	*/
	278	index = basetime_index;
	279	cpu_lfence();
	280	bt = &basetime[index];
08f95c49	281	error = SYSCTL_OUT(req, bt, sizeof(*bt));
5eb5a6bc MD	282	return (error);
	283	}
	284
984263bc	285	SYSCTL_STRUCT(_kern, KERN_BOOTTIME, boottime, CTLFLAG_RD,
08f95c49	286	&boottime, timespec, "System boottime");
5eb5a6bc	287	SYSCTL_PROC(_kern, OID_AUTO, basetime, CTLTYPE_STRUCT\|CTLFLAG_RD, 0, 0,
08f95c49	288	sysctl_get_basetime, "S,timespec", "System basetime");
984263bc	289
96d52ac8 SZ	290	static void hardclock(systimer_t info, int, struct intrframe *frame);
	291	static void statclock(systimer_t info, int, struct intrframe *frame);
	292	static void schedclock(systimer_t info, int, struct intrframe *frame);
5eb5a6bc	293	static void getnanotime_nbt(struct timespec nbt, struct timespec tsp);
88c4d2f6	294
2ff21866 MD	295	/*
	296	* Use __read_mostly for ticks and sched_ticks because these variables are
	297	* used all over the kernel and only updated once per tick.
	298	*/
/* ticks and sbticks are incremented together by hardclock() on cpu #0 */
__read_mostly sbintime_t sbticks; /* system master ticks at hz (64bit) */
__read_mostly int ticks; /* system master ticks at hz */
__read_mostly int sched_ticks; /* global schedule clock ticks */
__read_mostly int clocks_running; /* tsleep/timeout clocks operational */
int64_t nsec_adj; /* ntpd per-tick adjustment in nsec << 32 */
int64_t nsec_acc; /* accumulator */

/* NTPD time correction fields */
int64_t ntp_tick_permanent; /* per-tick adjustment in nsec << 32 */
int64_t ntp_tick_acc; /* accumulator for per-tick adjustment */
int64_t ntp_delta; /* one-time correction in nsec */
/* NOTE(review): presumably a threshold in nsec (1 second) -- confirm at users */
int64_t ntp_big_delta = 1000000000;
int32_t ntp_tick_delta; /* current adjustment rate */
int32_t ntp_default_tick_delta; /* adjustment rate for ntp_delta */
time_t ntp_leap_second; /* time of next leap second */
int ntp_leap_insert; /* whether to insert or remove a second */
/* held by hardclock() and set_timeofday() around the ntp_* updates */
struct spinlock ntp_spin;
4026c000	316
984263bc	317	/*
88c4d2f6	318	* Finish initializing clock frequencies and start all clocks running.
984263bc	319	*/
88c4d2f6 MD	320	/* ARGSUSED*/
	321	static void
	322	initclocks(void *dummy)
984263bc	323	{
88c4d2f6	324	/psratio = profhz / stathz;/
a55bb12d	325	spin_init(&ntp_spin, "ntp");
88c4d2f6	326	initclocks_pcpu();
da3639ef	327	clocks_running = 1;
0adbcbd6	328	if (kpmap) {
5b49787b	329	kpmap->tsc_freq = tsc_frequency;
0adbcbd6 MD	330	kpmap->tick_freq = hz;
0adbcbd6 MD	331	}
984263bc MD	332	}
984263bc MD	333
88c4d2f6	334	/*
1997b4c2 MD	335	* Called on a per-cpu basis from the idle thread bootstrap on each cpu
	336	* during SMP initialization.
	337	*
	338	* This routine is called concurrently during low-level SMP initialization
	339	* and may not block in any way. Meaning, among other things, we can't
	340	* acquire any tokens.
88c4d2f6 MD	341	*/
	342	void
	343	initclocks_pcpu(void)
	344	{
	345	struct globaldata *gd = mycpu;
984263bc	346
88c4d2f6 MD	347	crit_enter();
	348	if (gd->gd_cpuid == 0) {
	349	gd->gd_time_seconds = 1;
044ee7c4	350	gd->gd_cpuclock_base = sys_cputimer->count();
2ed58723 MD	351	hardtime[0].time_second = gd->gd_time_seconds;
2ed58723 MD	352	hardtime[0].cpuclock_base = gd->gd_cpuclock_base;
88c4d2f6	353	} else {
88c4d2f6 MD	354	gd->gd_time_seconds = globaldata_find(0)->gd_time_seconds;
	355	gd->gd_cpuclock_base = globaldata_find(0)->gd_cpuclock_base;
	356	}
0d1dffdf	357
43adde98 SZ	358	systimer_intr_enable();
43adde98 SZ	359
1997b4c2 MD	360	crit_exit();
	361	}
	362
77bc82e1 MD	363	/*
	364	* Called on a 10-second interval after the system is operational.
	365	* Return the collection data for USERPCT and install the data for
	366	* SYSTPCT and IDLEPCT.
	367	*/
	368	static
	369	uint64_t
	370	collect_cputime_callback(int n)
	371	{
	372	static long cpu_base[CPUSTATES];
	373	long cpu_states[CPUSTATES];
	374	long total;
	375	long acc;
	376	long lsb;
	377
	378	bzero(cpu_states, sizeof(cpu_states));
	379	for (n = 0; n < ncpus; ++n) {
	380	cpu_states[CP_USER] += cputime_percpu[n].cp_user;
	381	cpu_states[CP_NICE] += cputime_percpu[n].cp_nice;
	382	cpu_states[CP_SYS] += cputime_percpu[n].cp_sys;
	383	cpu_states[CP_INTR] += cputime_percpu[n].cp_intr;
	384	cpu_states[CP_IDLE] += cputime_percpu[n].cp_idle;
	385	}
	386
	387	acc = 0;
	388	for (n = 0; n < CPUSTATES; ++n) {
	389	total = cpu_states[n] - cpu_base[n];
	390	cpu_base[n] = cpu_states[n];
	391	cpu_states[n] = total;
	392	acc += total;
	393	}
	394	if (acc == 0) /* prevent degenerate divide by 0 */
	395	acc = 1;
	396	lsb = acc / (10000 * 2);
	397	kcollect_setvalue(KCOLLECT_SYSTPCT,
	398	(cpu_states[CP_SYS] + lsb) * 10000 / acc);
	399	kcollect_setvalue(KCOLLECT_IDLEPCT,
	400	(cpu_states[CP_IDLE] + lsb) * 10000 / acc);
	401	kcollect_setvalue(KCOLLECT_INTRPCT,
	402	(cpu_states[CP_INTR] + lsb) * 10000 / acc);
	403	return((cpu_states[CP_USER] + cpu_states[CP_NICE] + lsb) * 10000 / acc);
	404	}
	405
1997b4c2 MD	406	/*
	407	* This routine is called on just the BSP, just after SMP initialization
	408	* completes to * finish initializing any clocks that might contend/block
	409	* (e.g. like on a token). We can't do this in initclocks_pcpu() because
	410	* that function is called from the idle thread bootstrap for each cpu and
	411	* not allowed to block at all.
	412	*/
	413	static
	414	void
	415	initclocks_other(void *dummy)
	416	{
	417	struct globaldata *ogd = mycpu;
	418	struct globaldata *gd;
	419	int n;
	420
	421	for (n = 0; n < ncpus; ++n) {
	422	lwkt_setcpu_self(globaldata_find(n));
	423	gd = mycpu;
	424
	425	/*
	426	* Use a non-queued periodic systimer to prevent multiple
	427	* ticks from building up if the sysclock jumps forward
	428	* (8254 gets reset). The sysclock will never jump backwards.
	429	* Our time sync is based on the actual sysclock, not the
	430	* ticks count.
c91894e0 MD	431	*
	432	* Install statclock before hardclock to prevent statclock
	433	* from misinterpreting gd_flags for tick assignment when
880fb308 MD	434	* they overlap. Also offset the statclock by half of
	435	* its interval to try to avoid being coincident with
	436	* callouts.
1997b4c2	437	*/
c6a766f4 MD	438	systimer_init_periodic_flags(&gd->gd_statclock, statclock,
c6a766f4 MD	439	NULL, stathz,
880fb308	440	SYSTF_MSSYNC \| SYSTF_FIRST \|
91dc43dd	441	SYSTF_OFFSET50 \| SYSTF_OFFSETCPU);
c6a766f4	442	systimer_init_periodic_flags(&gd->gd_hardclock, hardclock,
91dc43dd MD	443	NULL, hz,
91dc43dd MD	444	SYSTF_MSSYNC \| SYSTF_OFFSETCPU);
1997b4c2 MD	445	}
1997b4c2 MD	446	lwkt_setcpu_self(ogd);
77bc82e1 MD	447
	448	/*
	449	* Regular data collection
	450	*/
	451	kcollect_register(KCOLLECT_USERPCT, "user", collect_cputime_callback,
	452	KCOLLECT_SCALE(KCOLLECT_USERPCT_FORMAT, 0));
	453	kcollect_register(KCOLLECT_SYSTPCT, "syst", NULL,
	454	KCOLLECT_SCALE(KCOLLECT_SYSTPCT_FORMAT, 0));
	455	kcollect_register(KCOLLECT_IDLEPCT, "idle", NULL,
	456	KCOLLECT_SCALE(KCOLLECT_IDLEPCT_FORMAT, 0));
88c4d2f6	457	}
f3f3eadb	458	SYSINIT(clocks2, SI_BOOT2_POST_SMP, SI_ORDER_ANY, initclocks_other, NULL);
984263bc	459
39799749 IV	460	/*
	461	* This method is called on just the BSP, after all the usched implementations
	462	* are initialized. This avoids races between usched initialization functions
	463	* and usched_schedulerclock().
	464	*/
	465	static
	466	void
	467	initclocks_usched(void *dummy)
	468	{
	469	struct globaldata *ogd = mycpu;
	470	struct globaldata *gd;
	471	int n;
	472
	473	for (n = 0; n < ncpus; ++n) {
	474	lwkt_setcpu_self(globaldata_find(n));
	475	gd = mycpu;
	476
	477	/* XXX correct the frequency for scheduler / estcpu tests */
	478	systimer_init_periodic_flags(&gd->gd_schedclock, schedclock,
4341238d MD	479	NULL, ESTCPUFREQ,
4341238d MD	480	SYSTF_MSSYNC \| SYSTF_OFFSETCPU);
39799749 IV	481	}
	482	lwkt_setcpu_self(ogd);
	483	}
	484	SYSINIT(clocks3, SI_BOOT2_USCHED, SI_ORDER_ANY, initclocks_usched, NULL);
	485
984263bc	486	/*
88c4d2f6 MD	487	* This sets the current real time of day. Timespecs are in seconds and
	488	* nanoseconds. We do not mess with gd_time_seconds and gd_cpuclock_base,
	489	* instead we adjust basetime so basetime + gd_* results in the current
317c3bd2	490	* time of day. This way the gd_* fields are guaranteed to represent
88c4d2f6	491	* a monotonically increasing 'uptime' value.
5eb5a6bc MD	492	*
	493	* When set_timeofday() is called from userland, the system call forces it
	494	* onto cpu #0 since only cpu #0 can update basetime_index.
984263bc	495	*/
88c4d2f6 MD	496	void
	497	set_timeofday(struct timespec *ts)
	498	{
5eb5a6bc MD	499	struct timespec *nbt;
5eb5a6bc MD	500	int ni;
984263bc	501
88c4d2f6 MD	502	/*
	503	* XXX SMP / non-atomic basetime updates
	504	*/
	505	crit_enter();
5eb5a6bc	506	ni = (basetime_index + 1) & BASETIME_ARYMASK;
2ed58723	507	cpu_lfence();
5eb5a6bc MD	508	nbt = &basetime[ni];
	509	nanouptime(nbt);
	510	nbt->tv_sec = ts->tv_sec - nbt->tv_sec;
	511	nbt->tv_nsec = ts->tv_nsec - nbt->tv_nsec;
	512	if (nbt->tv_nsec < 0) {
	513	nbt->tv_nsec += 1000000000;
	514	--nbt->tv_sec;
88c4d2f6	515	}
a81931cc MD	516
	517	/*
	518	* Note that basetime diverges from boottime as the clock drift is
	519	* compensated for, so we cannot do away with boottime. When setting
	520	* the absolute time of day the drift is 0 (for an instant) and we
bbf175be	521	* can simply assign boottime to basetime.
a81931cc MD	522	*
a81931cc MD	523	* Note that nanouptime() is based on gd_time_seconds which is drift
317c3bd2	524	* compensated up to a point (it is guaranteed to remain monotonically
a81931cc MD	525	* increasing). gd_time_seconds is thus our best uptime guess and
	526	* suitable for use in the boottime calculation. It is already taken
	527	* into account in the basetime calculation above.
	528	*/
a55bb12d	529	spin_lock(&ntp_spin);
5eb5a6bc	530	boottime.tv_sec = nbt->tv_sec;
4026c000	531	ntp_delta = 0;
5eb5a6bc MD	532
5eb5a6bc MD	533	/*
35238fa5 MD	534	* We now have a new basetime, make sure all other cpus have it,
35238fa5 MD	535	* then update the index.
5eb5a6bc	536	*/
35238fa5	537	cpu_sfence();
5eb5a6bc	538	basetime_index = ni;
a55bb12d	539	spin_unlock(&ntp_spin);
5eb5a6bc	540
88c4d2f6 MD	541	crit_exit();
88c4d2f6 MD	542	}
bbf175be	543
984263bc	544	/*
4871f0f4	545	* Each cpu has its own hardclock, but we only increment ticks and softticks
88c4d2f6 MD	546	* on cpu #0.
	547	*
	548	* NOTE! systimer! the MP lock might not be held here. We can only safely
	549	* manipulate objects owned by the current cpu.
984263bc	550	*/
984263bc	551	static void
e76d2ad3	552	hardclock(systimer_t info, int in_ipi, struct intrframe *frame)
984263bc	553	{
88c4d2f6 MD	554	sysclock_t cputicks;
88c4d2f6 MD	555	struct proc *p;
88c4d2f6	556	struct globaldata *gd = mycpu;
984263bc	557
e76d2ad3 SZ	558	if ((gd->gd_reqflags & RQF_IPIQ) == 0 && lwkt_need_ipiq_process(gd)) {
	559	/* Defer to doreti on passive IPIQ processing */
	560	need_ipiq();
	561	}
	562
984263bc	563	/*
2ed58723 MD	564	* We update the compensation base to calculate fine-grained time
	565	* from the sys_cputimer on a per-cpu basis in order to avoid
	566	* having to mess around with locks. sys_cputimer is assumed to
	567	* be consistent across all cpus. CPU N copies the base state from
	568	* CPU 0 using the same FIFO trick that we use for basetime (so we
	569	* don't catch a CPU 0 update in the middle).
88c4d2f6 MD	570	*
88c4d2f6 MD	571	* Note that we never allow info->time (aka gd->gd_hardclock.time)
fad57d0e MD	572	* to reverse index gd_cpuclock_base, but that it is possible for
	573	* it to temporarily get behind in the seconds if something in the
	574	* system locks interrupts for a long period of time. Since periodic
	575	* timers count events, though everything should resynch again
	576	* immediately.
984263bc	577	*/
2ed58723 MD	578	if (gd->gd_cpuid == 0) {
	579	int ni;
	580
	581	cputicks = info->time - gd->gd_cpuclock_base;
	582	if (cputicks >= sys_cputimer->freq) {
	583	cputicks /= sys_cputimer->freq;
	584	if (cputicks != 0 && cputicks != 1)
	585	kprintf("Warning: hardclock missed > 1 sec\n");
	586	gd->gd_time_seconds += cputicks;
	587	gd->gd_cpuclock_base += sys_cputimer->freq * cputicks;
	588	/* uncorrected monotonic 1-sec gran */
	589	time_uptime += cputicks;
	590	}
	591	ni = (basetime_index + 1) & BASETIME_ARYMASK;
	592	hardtime[ni].time_second = gd->gd_time_seconds;
	593	hardtime[ni].cpuclock_base = gd->gd_cpuclock_base;
	594	} else {
	595	int ni;
	596
	597	ni = basetime_index;
	598	cpu_lfence();
	599	gd->gd_time_seconds = hardtime[ni].time_second;
	600	gd->gd_cpuclock_base = hardtime[ni].cpuclock_base;
88c4d2f6	601	}
984263bc MD	602
984263bc MD	603	/*
92b561b7 MD	604	* The system-wide ticks counter and NTP related timedelta/tickdelta
	605	* adjustments only occur on cpu #0. NTP adjustments are accomplished
	606	* by updating basetime.
984263bc	607	*/
88c4d2f6	608	if (gd->gd_cpuid == 0) {
5eb5a6bc	609	struct timespec *nbt;
88c4d2f6 MD	610	struct timespec nts;
88c4d2f6 MD	611	int leap;
5eb5a6bc	612	int ni;
984263bc	613
63823918 MD	614	/*
	615	* Update system-wide ticks
	616	*/
88c4d2f6	617	++ticks;
6d3dff5f	618	++sbticks;
984263bc	619
63823918 MD	620	/*
	621	* Update system-wide ticktime for getnanotime() and getmicrotime()
	622	*/
	623	nanotime(&nts);
	624	atomic_add_int_nonlocked(&ticktime_update, 1);
	625	cpu_sfence();
	626	if (ticktime_update & 2)
	627	ticktime2 = nts;
	628	else
	629	ticktime0 = nts;
	630	cpu_sfence();
	631	atomic_add_int_nonlocked(&ticktime_update, 1);
	632
88c4d2f6	633	#if 0
bbf175be	634	if (tco->tc_poll_pps)
88c4d2f6 MD	635	tco->tc_poll_pps(tco);
88c4d2f6 MD	636	#endif
5eb5a6bc	637
88c4d2f6	638	/*
5eb5a6bc MD	639	* Calculate the new basetime index. We are in a critical section
	640	* on cpu #0 and can safely play with basetime_index. Start
	641	* with the current basetime and then make adjustments.
	642	*/
	643	ni = (basetime_index + 1) & BASETIME_ARYMASK;
	644	nbt = &basetime[ni];
	645	*nbt = basetime[basetime_index];
	646
a55bb12d MD	647	/*
	648	* ntp adjustments only occur on cpu 0 and are protected by
	649	* ntp_spin. This spinlock virtually never conflicts.
	650	*/
	651	spin_lock(&ntp_spin);
	652
5eb5a6bc MD	653	/*
	654	* Apply adjtime corrections. (adjtime() API)
	655	*
	656	* adjtime() only runs on cpu #0 so our critical section is
	657	* sufficient to access these variables.
88c4d2f6	658	*/
4026c000	659	if (ntp_delta != 0) {
5eb5a6bc	660	nbt->tv_nsec += ntp_tick_delta;
4026c000 JS	661	ntp_delta -= ntp_tick_delta;
	662	if ((ntp_delta > 0 && ntp_delta < ntp_tick_delta) \|\|
	663	(ntp_delta < 0 && ntp_delta > ntp_tick_delta)) {
5eb5a6bc	664	ntp_tick_delta = ntp_delta;
4026c000 JS	665	}
	666	}
	667
5eb5a6bc MD	668	/*
	669	* Apply permanent frequency corrections. (sysctl API)
	670	*/
4026c000 JS	671	if (ntp_tick_permanent != 0) {
	672	ntp_tick_acc += ntp_tick_permanent;
	673	if (ntp_tick_acc >= (1LL << 32)) {
5eb5a6bc	674	nbt->tv_nsec += ntp_tick_acc >> 32;
331bc6f8	675	ntp_tick_acc -= (ntp_tick_acc >> 32) << 32;
4026c000	676	} else if (ntp_tick_acc <= -(1LL << 32)) {
331bc6f8	677	/* Negate ntp_tick_acc to avoid shifting the sign bit. */
5eb5a6bc	678	nbt->tv_nsec -= (-ntp_tick_acc) >> 32;
331bc6f8	679	ntp_tick_acc += ((-ntp_tick_acc) >> 32) << 32;
4026c000 JS	680	}
	681	}
	682
5eb5a6bc MD	683	if (nbt->tv_nsec >= 1000000000) {
	684	nbt->tv_sec++;
	685	nbt->tv_nsec -= 1000000000;
	686	} else if (nbt->tv_nsec < 0) {
	687	nbt->tv_sec--;
	688	nbt->tv_nsec += 1000000000;
88c4d2f6 MD	689	}
	690
	691	/*
5eb5a6bc	692	* Another per-tick compensation. (for ntp_adjtime() API)
88c4d2f6	693	*/
5eb5a6bc	694	if (nsec_adj != 0) {
88c4d2f6 MD	695	nsec_acc += nsec_adj;
88c4d2f6 MD	696	if (nsec_acc >= 0x100000000LL) {
5eb5a6bc	697	nbt->tv_nsec += nsec_acc >> 32;
88c4d2f6 MD	698	nsec_acc = (nsec_acc & 0xFFFFFFFFLL);
88c4d2f6 MD	699	} else if (nsec_acc <= -0x100000000LL) {
5eb5a6bc	700	nbt->tv_nsec -= -nsec_acc >> 32;
88c4d2f6 MD	701	nsec_acc = -(-nsec_acc & 0xFFFFFFFFLL);
88c4d2f6 MD	702	}
5eb5a6bc MD	703	if (nbt->tv_nsec >= 1000000000) {
	704	nbt->tv_nsec -= 1000000000;
	705	++nbt->tv_sec;
	706	} else if (nbt->tv_nsec < 0) {
	707	nbt->tv_nsec += 1000000000;
	708	--nbt->tv_sec;
	709	}
	710	}
a55bb12d	711	spin_unlock(&ntp_spin);
5eb5a6bc MD	712
	713	/************************************************************
	714	* LEAP SECOND CORRECTION *
	715	************************************************************
	716	*
	717	* Taking into account all the corrections made above, figure
	718	* out the new real time. If the seconds field has changed
	719	* then apply any pending leap-second corrections.
	720	*/
	721	getnanotime_nbt(nbt, &nts);
	722
32040d57 MD	723	if (time_second != nts.tv_sec) {
	724	/*
	725	* Apply leap second (sysctl API). Adjust nts for changes
	726	* so we do not have to call getnanotime_nbt again.
	727	*/
	728	if (ntp_leap_second) {
	729	if (ntp_leap_second == nts.tv_sec) {
	730	if (ntp_leap_insert) {
	731	nbt->tv_sec++;
	732	nts.tv_sec++;
	733	} else {
	734	nbt->tv_sec--;
	735	nts.tv_sec--;
	736	}
5eb5a6bc	737	ntp_leap_second--;
32040d57	738	}
88c4d2f6	739	}
88c4d2f6	740
32040d57 MD	741	/*
	742	* Apply leap second (ntp_adjtime() API), calculate a new
	743	* nsec_adj field. ntp_update_second() returns nsec_adj
	744	* as a per-second value but we need it as a per-tick value.
	745	*/
88c4d2f6	746	leap = ntp_update_second(time_second, &nsec_adj);
88c4d2f6	747	nsec_adj /= hz;
32040d57 MD	748	nbt->tv_sec += leap;
	749	nts.tv_sec += leap;
	750
	751	/*
	752	* Update the time_second 'approximate time' global.
	753	*/
	754	time_second = nts.tv_sec;
4871f0f4 MD	755
	756	/*
	757	* Clear the IPC hint for the currently running thread once
	758	* per second, allowing us to disconnect the hint from a
	759	* thread which may no longer care.
	760	*/
	761	curthread->td_wakefromcpu = -1;
88c4d2f6	762	}
5eb5a6bc MD	763
	764	/*
	765	* Finally, our new basetime is ready to go live!
	766	*/
35238fa5	767	cpu_sfence();
5eb5a6bc	768	basetime_index = ni;
0adbcbd6 MD	769
0adbcbd6 MD	770	/*
12081e87 MD	771	* Update kpmap on each tick. TS updates are integrated with
	772	* fences and upticks allowing userland to read the data
	773	* deterministically.
0adbcbd6 MD	774	*/
0adbcbd6 MD	775	if (kpmap) {
12081e87 MD	776	int w;
	777
	778	w = (kpmap->upticks + 1) & 1;
	779	getnanouptime(&kpmap->ts_uptime[w]);
	780	getnanotime(&kpmap->ts_realtime[w]);
	781	cpu_sfence();
	782	++kpmap->upticks;
	783	cpu_sfence();
0adbcbd6	784	}
8f7f5bd5 MD	785
	786	/*
	787	* Handle exislock pseudo_ticks. We make things as simple as
	788	* possible for the critical path arming code by adding a little
	789	* complication here.
	790	*
	791	* When we find that all cores have been armed, we increment
	792	* pseudo_ticks and disarm all the cores.
	793	*/
	794	{
	795	globaldata_t gd;
	796	int n;
	797
	798	for (n = 0; n < ncpus; ++n) {
	799	gd = globaldata_find(n);
	800	if (gd->gd_exisarmed == 0)
	801	break;
	802	}
	803
	804	if (n == ncpus) {
	805	for (n = 0; n < ncpus; ++n) {
	806	gd = globaldata_find(n);
	807	gd->gd_exisarmed = 0;
	808	}
	809	++pseudo_ticks;
	810	}
	811	}
88c4d2f6 MD	812	}
88c4d2f6 MD	813
f9235b6d MD	814	/*
	815	* lwkt thread scheduler fair queueing
	816	*/
85946b6c	817	lwkt_schedulerclock(curthread);
f9235b6d	818
8f7f5bd5 MD	819	/*
	820	* Cycle the existential lock system on odd ticks in order to re-arm
	821	* our cpu (in case the cpu is idle or nobody is using any exis locks).
	822	*/
	823	if (ticks & 1) {
	824	exis_hold_gd(gd);
	825	exis_drop_gd(gd);
	826	}
	827
92b561b7 MD	828	/*
	829	* softticks are handled for all cpus
	830	*/
	831	hardclock_softtick(gd);
	832
5ba14d44	833	/*
75979118	834	* Rollup accumulated vmstats, copy-back for critical path checks.
5ba14d44 MD	835	*/
5ba14d44 MD	836	vmstats_rollup_cpu(gd);
bf3f67a7	837	vfscache_rollup_cpu(gd);
75979118	838	mycpu->gd_vmstats = vmstats;
5ba14d44	839
88c4d2f6	840	/*
8582ec21 MD	841	* ITimer handling is per-tick, per-cpu.
	842	*
	843	* We must acquire the per-process token in order for ksignal()
898e34b3 MD	844	* to be non-blocking. For the moment this requires an AST fault,
	845	* the ksignal() cannot be safely issued from this hard interrupt.
	846	*
	847	* XXX Even the trytoken here isn't right, and itimer operation in
	848	* a multi threaded environment is going to be weird at the
	849	* very least.
88c4d2f6	850	*/
8582ec21	851	if ((p = curproc) != NULL && lwkt_trytoken(&p->p_token)) {
3dbbd6dd	852	crit_enter_hard();
0adbcbd6 MD	853	if (p->p_upmap)
	854	++p->p_upmap->runticks;
	855
88c4d2f6	856	if (frame && CLKF_USERMODE(frame) &&
93328593	857	timevalisset(&p->p_timer[ITIMER_VIRTUAL].it_value) &&
898e34b3	858	itimerdecr(&p->p_timer[ITIMER_VIRTUAL], ustick) == 0) {
4643740a	859	p->p_flags \|= P_SIGVTALRM;
898e34b3 MD	860	need_user_resched();
898e34b3 MD	861	}
93328593	862	if (timevalisset(&p->p_timer[ITIMER_PROF].it_value) &&
898e34b3	863	itimerdecr(&p->p_timer[ITIMER_PROF], ustick) == 0) {
4643740a	864	p->p_flags \|= P_SIGPROF;
898e34b3 MD	865	need_user_resched();
898e34b3 MD	866	}
3dbbd6dd	867	crit_exit_hard();
8582ec21	868	lwkt_reltoken(&p->p_token);
984263bc	869	}
604e1e09	870	setdelayed();
88c4d2f6	871	}
984263bc	872
88c4d2f6 MD	873	/*
	874	* The statistics clock typically runs at a 125Hz rate, and is intended
	875	* to be frequency offset from the hardclock (typ 100Hz). It is per-cpu.
	876	*
	877	* NOTE! systimer! the MP lock might not be held here. We can only safely
	878	* manipulate objects owned by the current cpu.
	879	*
	880	* The stats clock is responsible for grabbing a profiling sample.
	881	* Most of the statistics are only used by user-level statistics programs.
	882	* The main exceptions are p->p_uticks, p->p_sticks, p->p_iticks, and
	883	* p->p_estcpu.
	884	*
	885	* Like the other clocks, the stat clock is called from what is effectively
	886	* a fast interrupt, so the context should be the thread/process that got
	887	* interrupted.
	888	*/
	889	static void
96d52ac8	890	statclock(systimer_t info, int in_ipi, struct intrframe *frame)
88c4d2f6	891	{
c91894e0	892	globaldata_t gd = mycpu;
88c4d2f6 MD	893	thread_t td;
	894	struct proc *p;
	895	int bump;
1997b4c2 MD	896	sysclock_t cv;
1997b4c2 MD	897	sysclock_t scv;
984263bc MD	898
984263bc MD	899	/*
1997b4c2 MD	900	* How big was our timeslice relative to the last time? Calculate
	901	* in microseconds.
	902	*
	903	* NOTE: Use of microuptime() is typically MPSAFE, but usually not
	904	* during early boot. Just use the systimer count to be nice
	905	* to e.g. qemu. The systimer has a better chance of being
	906	* MPSAFE at early boot.
984263bc	907	*/
1997b4c2	908	cv = sys_cputimer->count();
c91894e0	909	scv = gd->statint.gd_statcv;
1997b4c2 MD	910	if (scv == 0) {
	911	bump = 1;
	912	} else {
8fbc264d MD	913	bump = muldivu64(sys_cputimer->freq64_usec,
8fbc264d MD	914	(cv - scv), 1L << 32);
1997b4c2 MD	915	if (bump < 0)
	916	bump = 0;
	917	if (bump > 1000000)
	918	bump = 1000000;
	919	}
c91894e0	920	gd->statint.gd_statcv = cv;
1997b4c2 MD	921
1997b4c2 MD	922	#if 0
c91894e0	923	stv = &gd->gd_stattv;
88c4d2f6 MD	924	if (stv->tv_sec == 0) {
	925	bump = 1;
	926	} else {
	927	bump = tv.tv_usec - stv->tv_usec +
	928	(tv.tv_sec - stv->tv_sec) * 1000000;
	929	if (bump < 0)
	930	bump = 0;
	931	if (bump > 1000000)
	932	bump = 1000000;
	933	}
	934	*stv = tv;
1997b4c2	935	#endif
984263bc	936
88c4d2f6 MD	937	td = curthread;
88c4d2f6 MD	938	p = td->td_proc;
984263bc	939
63823918 MD	940	/*
	941	* If this is an interrupt thread used for the clock interrupt, adjust
	942	* td to the thread it is preempting. If a frame is available, it will
	943	* be related to the thread being preempted.
	944	*/
	945	if ((td->td_flags & TDF_CLKTHREAD) && td->td_preempted)
	946	td = td->td_preempted;
	947
88c4d2f6 MD	948	if (frame && CLKF_USERMODE(frame)) {
	949	/*
	950	* Came from userland, handle user time and deal with
	951	* possible process.
	952	*/
4643740a	953	if (p && (p->p_flags & P_PROFIL))
88c4d2f6 MD	954	addupc_intr(p, CLKF_PC(frame), 1);
88c4d2f6 MD	955	td->td_uticks += bump;
984263bc	956
88c4d2f6 MD	957	/*
	958	* Charge the time as appropriate
	959	*/
	960	if (p && p->p_nice > NZERO)
9eea7f0c	961	cpu_time.cp_nice += bump;
88c4d2f6	962	else
9eea7f0c	963	cpu_time.cp_user += bump;
88c4d2f6	964	} else {
c91894e0	965	int intr_nest = gd->gd_intr_nesting_level;
96d52ac8 SZ	966
	967	if (in_ipi) {
	968	/*
	969	* IPI processing code will bump gd_intr_nesting_level
	970	* up by one, which breaks following CLKF_INTR testing,
317c3bd2	971	* so we subtract it by one here.
96d52ac8 SZ	972	*/
	973	--intr_nest;
	974	}
6026c54d	975
88c4d2f6 MD	976	/*
	977	* Came from kernel mode, so we were:
	978	* - handling an interrupt,
	979	* - doing syscall or trap work on behalf of the current
	980	* user process, or
	981	* - spinning in the idle loop.
	982	* Whichever it is, charge the time as appropriate.
	983	* Note that we charge interrupts to the current process,
	984	* regardless of whether they are ``for'' that process,
	985	* so that we know how much of its real time was spent
	986	* in ``non-process'' (i.e., interrupt) work.
	987	*
bbf175be	988	* XXX assume system if frame is NULL. A NULL frame
e43a034f	989	* can occur if ipi processing is done from a crit_exit().
88c4d2f6	990	*/
63823918 MD	991	if ((frame && CLKF_INTR(intr_nest)) \|\|
63823918 MD	992	cpu_interrupt_running(td)) {
e2b92533 MD	993	/*
	994	* If we interrupted an interrupt thread, well,
	995	* count it as interrupt time.
	996	*/
c91894e0	997	td->td_iticks += bump;
07522099	998	#ifdef DEBUG_PCTRACK
6026c54d SZ	999	if (frame)
6026c54d SZ	1000	do_pctrack(frame, PCTRACK_INT);
07522099	1001	#endif
9eea7f0c	1002	cpu_time.cp_intr += bump;
c91894e0 MD	1003	} else if (gd->gd_flags & GDF_VIRTUSER) {
	1004	/*
	1005	* The vkernel doesn't do a good job providing trap
	1006	* frames that we can test. If the GDF_VIRTUSER
	1007	* flag is set we probably interrupted user mode.
	1008	*/
	1009	td->td_uticks += bump;
	1010
	1011	/*
	1012	* Charge the time as appropriate
	1013	*/
	1014	if (p && p->p_nice > NZERO)
	1015	cpu_time.cp_nice += bump;
	1016	else
	1017	cpu_time.cp_user += bump;
88c4d2f6	1018	} else {
63823918 MD	1019	if (clock_debug2 > 0) {
	1020	--clock_debug2;
	1021	kprintf("statclock preempt %s (%p %p)\n", td->td_comm, td, &gd->gd_idlethread);
	1022	}
c91894e0 MD	1023	td->td_sticks += bump;
c91894e0 MD	1024	if (td == &gd->gd_idlethread) {
e2b92533	1025	/*
c6a766f4 MD	1026	* We want to count token contention as
	1027	* system time. When token contention occurs
	1028	* the cpu may only be outside its critical
	1029	* section while switching through the idle
	1030	* thread. In this situation, various flags
	1031	* will be set in gd_reqflags.
63823918 MD	1032	*
	1033	* INTPEND is not necessarily useful because
	1034	* it will be set if the clock interrupt
	1035	* happens to be on an interrupt thread, the
	1036	* cpu_interrupt_running() call does a better
	1037	* job so we've already handled it.
e2b92533	1038	*/
63823918 MD	1039	if (gd->gd_reqflags &
63823918 MD	1040	(RQF_IDLECHECK_WK_MASK & ~RQF_INTPEND)) {
76f1911e	1041	cpu_time.cp_sys += bump;
63823918	1042	} else {
76f1911e	1043	cpu_time.cp_idle += bump;
63823918	1044	}
07522099	1045	} else {
e2b92533 MD	1046	/*
	1047	* System thread was running.
	1048	*/
07522099 MD	1049	#ifdef DEBUG_PCTRACK
	1050	if (frame)
	1051	do_pctrack(frame, PCTRACK_SYS);
	1052	#endif
9eea7f0c	1053	cpu_time.cp_sys += bump;
07522099	1054	}
88c4d2f6 MD	1055	}
	1056	}
	1057	}
	1058
07522099 MD	1059	#ifdef DEBUG_PCTRACK
	1060	/*
	1061	* Sample the PC when in the kernel or in an interrupt. User code can
	1062	* retrieve the information and generate a histogram or other output.
	1063	*/
	1064
	1065	static void
	1066	do_pctrack(struct intrframe *frame, int which)
	1067	{
	1068	struct kinfo_pctrack *pctrack;
	1069
	1070	pctrack = &cputime_pctrack[mycpu->gd_cpuid][which];
bbf175be	1071	pctrack->pc_array[pctrack->pc_index & PCTRACK_ARYMASK] =
07522099 MD	1072	(void *)CLKF_PC(frame);
	1073	++pctrack->pc_index;
	1074	}
	1075
	1076	static int
	1077	sysctl_pctrack(SYSCTL_HANDLER_ARGS)
	1078	{
	1079	struct kinfo_pcheader head;
	1080	int error;
	1081	int cpu;
	1082	int ntrack;
	1083
	1084	head.pc_ntrack = PCTRACK_SIZE;
	1085	head.pc_arysize = PCTRACK_ARYSIZE;
	1086
	1087	if ((error = SYSCTL_OUT(req, &head, sizeof(head))) != 0)
	1088	return (error);
	1089
	1090	for (cpu = 0; cpu < ncpus; ++cpu) {
	1091	for (ntrack = 0; ntrack < PCTRACK_SIZE; ++ntrack) {
	1092	error = SYSCTL_OUT(req, &cputime_pctrack[cpu][ntrack],
	1093	sizeof(struct kinfo_pctrack));
	1094	if (error)
	1095	break;
	1096	}
	1097	if (error)
	1098	break;
	1099	}
	1100	return (error);
	1101	}
	1102	SYSCTL_PROC(_kern, OID_AUTO, pctrack, (CTLTYPE_OPAQUE\|CTLFLAG_RD), 0, 0,
	1103	sysctl_pctrack, "S,kinfo_pcheader", "CPU PC tracking");
	1104
	1105	#endif
	1106
88c4d2f6	1107	/*
dcc99b62	1108	* The scheduler clock typically runs at a 50Hz rate. NOTE! systimer,
88c4d2f6 MD	1109	* the MP lock might not be held. We can safely manipulate parts of curproc
88c4d2f6 MD	1110	* but that's about it.
dcc99b62 MD	1111	*
dcc99b62 MD	1112	* Each cpu has its own scheduler clock.
88c4d2f6 MD	1113	*/
88c4d2f6 MD	1114	static void
96d52ac8	1115	schedclock(systimer_t info, int in_ipi __unused, struct intrframe *frame)
88c4d2f6	1116	{
553ea3c8	1117	struct lwp *lp;
88c4d2f6 MD	1118	struct rusage *ru;
	1119	struct vmspace *vm;
	1120	long rss;
	1121
553ea3c8	1122	if ((lp = lwkt_preempted_proc()) != NULL) {
dcc99b62 MD	1123	/*
	1124	* Account for cpu time used and hit the scheduler. Note
	1125	* that this call MUST BE MP SAFE, and the BGL IS NOT HELD
	1126	* HERE.
	1127	*/
553ea3c8	1128	++lp->lwp_cpticks;
de4d4cb0 MD	1129	usched_schedulerclock(lp, info->periodic, info->time);
	1130	} else {
	1131	usched_schedulerclock(NULL, info->periodic, info->time);
dcc99b62	1132	}
553ea3c8	1133	if ((lp = curthread->td_lwp) != NULL) {
dcc99b62 MD	1134	/*
	1135	* Update resource usage integrals and maximums.
	1136	*/
fde7ac71	1137	if ((ru = &lp->lwp_proc->p_ru) &&
553ea3c8	1138	(vm = lp->lwp_proc->p_vmspace) != NULL) {
4b566556 MD	1139	ru->ru_ixrss += pgtok(btoc(vm->vm_tsize));
	1140	ru->ru_idrss += pgtok(btoc(vm->vm_dsize));
	1141	ru->ru_isrss += pgtok(btoc(vm->vm_ssize));
b12defdc MD	1142	if (lwkt_trytoken(&vm->vm_map.token)) {
	1143	rss = pgtok(vmspace_resident_count(vm));
	1144	if (ru->ru_maxrss < rss)
	1145	ru->ru_maxrss = rss;
	1146	lwkt_reltoken(&vm->vm_map.token);
	1147	}
88c4d2f6	1148	}
b68b7282	1149	}
d6d39bc7 MC	1150	/* Increment the global sched_ticks */
	1151	if (mycpu->gd_cpuid == 0)
	1152	++sched_ticks;
984263bc MD	1153	}
	1154
	1155	/*
bbf175be	1156	* Compute number of ticks for the specified amount of time. The
a94976ad	1157	* return value is intended to be used in a clock interrupt timed
317c3bd2	1158	* operation and guaranteed to meet or exceed the requested time.
a94976ad MD	1159	* If the representation overflows, return INT_MAX. The minimum return
	1160	* value is 1 ticks and the function will average the calculation up.
	1161	* If any value greater then 0 microseconds is supplied, a value
	1162	* of at least 2 will be returned to ensure that a near-term clock
	1163	* interrupt does not cause the timeout to occur (degenerately) early.
	1164	*
	1165	* Note that limit checks must take into account microseconds, which is
	1166	* done simply by using the smaller signed long maximum instead of
	1167	* the unsigned long maximum.
	1168	*
	1169	* If ints have 32 bits, then the maximum value for any timeout in
	1170	* 10ms ticks is 248 days.
984263bc MD	1171	*/
984263bc MD	1172	int
a94976ad	1173	tvtohz_high(struct timeval *tv)
984263bc	1174	{
a94976ad	1175	int ticks;
1fd87d54	1176	long sec, usec;
984263bc	1177
984263bc MD	1178	sec = tv->tv_sec;
	1179	usec = tv->tv_usec;
	1180	if (usec < 0) {
	1181	sec--;
	1182	usec += 1000000;
	1183	}
	1184	if (sec < 0) {
	1185	#ifdef DIAGNOSTIC
	1186	if (usec > 0) {
	1187	sec++;
	1188	usec -= 1000000;
	1189	}
a591f597 MD	1190	kprintf("tvtohz_high: negative time difference "
	1191	"%ld sec %ld usec\n",
	1192	sec, usec);
984263bc MD	1193	#endif
984263bc MD	1194	ticks = 1;
a94976ad	1195	} else if (sec <= INT_MAX / hz) {
6e875644	1196	ticks = (int)(sec * hz + howmany((u_long)usec, ustick)) + 1;
a94976ad MD	1197	} else {
	1198	ticks = INT_MAX;
	1199	}
	1200	return (ticks);
	1201	}
	1202
a591f597 MD	1203	int
	1204	tstohz_high(struct timespec *ts)
	1205	{
	1206	int ticks;
	1207	long sec, nsec;
	1208
	1209	sec = ts->tv_sec;
	1210	nsec = ts->tv_nsec;
	1211	if (nsec < 0) {
	1212	sec--;
	1213	nsec += 1000000000;
	1214	}
	1215	if (sec < 0) {
	1216	#ifdef DIAGNOSTIC
	1217	if (nsec > 0) {
	1218	sec++;
	1219	nsec -= 1000000000;
	1220	}
	1221	kprintf("tstohz_high: negative time difference "
	1222	"%ld sec %ld nsec\n",
	1223	sec, nsec);
	1224	#endif
	1225	ticks = 1;
	1226	} else if (sec <= INT_MAX / hz) {
6e875644	1227	ticks = (int)(sec * hz + howmany((u_long)nsec, nstick)) + 1;
a591f597 MD	1228	} else {
	1229	ticks = INT_MAX;
	1230	}
	1231	return (ticks);
	1232	}
	1233
	1234
a94976ad MD	1235	/*
	1236	* Compute number of ticks for the specified amount of time, erroring on
	1237	* the side of it being too low to ensure that sleeping the returned number
	1238	* of ticks will not result in a late return.
	1239	*
	1240	* The supplied timeval may not be negative and should be normalized. A
	1241	* return value of 0 is possible if the timeval converts to less then
	1242	* 1 tick.
	1243	*
	1244	* If ints have 32 bits, then the maximum value for any timeout in
	1245	* 10ms ticks is 248 days.
	1246	*/
	1247	int
	1248	tvtohz_low(struct timeval *tv)
	1249	{
	1250	int ticks;
	1251	long sec;
	1252
	1253	sec = tv->tv_sec;
	1254	if (sec <= INT_MAX / hz)
a591f597	1255	ticks = (int)(sec * hz + (u_long)tv->tv_usec / ustick);
984263bc	1256	else
984263bc	1257	ticks = INT_MAX;
a94976ad	1258	return (ticks);
984263bc MD	1259	}
984263bc MD	1260
a591f597 MD	1261	int
	1262	tstohz_low(struct timespec *ts)
	1263	{
	1264	int ticks;
	1265	long sec;
	1266
	1267	sec = ts->tv_sec;
	1268	if (sec <= INT_MAX / hz)
	1269	ticks = (int)(sec * hz + (u_long)ts->tv_nsec / nstick);
	1270	else
	1271	ticks = INT_MAX;
	1272	return (ticks);
	1273	}
a94976ad	1274
984263bc MD	1275	/*
	1276	* Start profiling on a process.
	1277	*
282f3194 MD	1278	* Caller must hold p->p_token();
282f3194 MD	1279	*
984263bc MD	1280	* Kernel profiling passes proc0 which never exits and hence
	1281	* keeps the profile clock running constantly.
	1282	*/
	1283	void
88c4d2f6	1284	startprofclock(struct proc *p)
984263bc	1285	{
4643740a MD	1286	if ((p->p_flags & P_PROFIL) == 0) {
4643740a MD	1287	p->p_flags \|= P_PROFIL;
88c4d2f6	1288	#if 0 /* XXX */
984263bc	1289	if (++profprocs == 1 && stathz != 0) {
e43a034f	1290	crit_enter();
6ad39cae	1291	psdiv = psratio;
984263bc	1292	setstatclockrate(profhz);
e43a034f	1293	crit_exit();
984263bc	1294	}
88c4d2f6	1295	#endif
984263bc MD	1296	}
	1297	}
	1298
	1299	/*
	1300	* Stop profiling on a process.
616516c8 MD	1301	*
616516c8 MD	1302	* caller must hold p->p_token
984263bc MD	1303	*/
984263bc MD	1304	void
88c4d2f6	1305	stopprofclock(struct proc *p)
984263bc	1306	{
4643740a MD	1307	if (p->p_flags & P_PROFIL) {
4643740a MD	1308	p->p_flags &= ~P_PROFIL;
88c4d2f6	1309	#if 0 /* XXX */
984263bc	1310	if (--profprocs == 0 && stathz != 0) {
e43a034f	1311	crit_enter();
6ad39cae	1312	psdiv = 1;
984263bc	1313	setstatclockrate(stathz);
e43a034f	1314	crit_exit();
984263bc	1315	}
984263bc	1316	#endif
984263bc MD	1317	}
	1318	}
	1319
	1320	/*
	1321	* Return information about system clocks.
	1322	*/
	1323	static int
	1324	sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS)
	1325	{
f5d21610	1326	struct kinfo_clockinfo clkinfo;
984263bc MD	1327	/*
	1328	* Construct clockinfo structure.
	1329	*/
f5d21610	1330	clkinfo.ci_hz = hz;
a591f597	1331	clkinfo.ci_tick = ustick;
4026c000	1332	clkinfo.ci_tickadj = ntp_default_tick_delta / 1000;
f5d21610 JS	1333	clkinfo.ci_profhz = profhz;
f5d21610 JS	1334	clkinfo.ci_stathz = stathz ? stathz : hz;
984263bc MD	1335	return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
	1336	}
	1337
	1338	SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT\|CTLFLAG_RD,
	1339	0, 0, sysctl_kern_clockrate, "S,clockinfo","");
	1340
984263bc MD	1341	/*
	1342	* We have eight functions for looking at the clock, four for
	1343	* microseconds and four for nanoseconds. For each there is fast
	1344	* but less precise version "get{nano\|micro}[up]time" which will
	1345	* return a time which is up to 1/HZ previous to the call, whereas
	1346	* the raw version "{nano\|micro}[up]time" will return a timestamp
	1347	* which is as precise as possible. The "up" variants return the
	1348	* time relative to system boot, these are well suited for time
	1349	* interval measurements.
88c4d2f6	1350	*
317c3bd2	1351	* Each cpu independently maintains the current time of day, so all
88c4d2f6 MD	1352	* we need to do to protect ourselves from changes is to do a loop
88c4d2f6 MD	1353	* check on the seconds field changing out from under us.
fad57d0e MD	1354	*
fad57d0e MD	1355	* The system timer maintains a 32 bit count and due to various issues
317c3bd2	1356	* it is possible for the calculated delta to occasionally exceed
044ee7c4 MD	1357	* sys_cputimer->freq. If this occurs the sys_cputimer->freq64_nsec
	1358	* multiplication can easily overflow, so we deal with the case. For
	1359	* uniformity we deal with the case in the usec case too.
627531fa MD	1360	*
627531fa MD	1361	* All the [get][micro,nano][time,uptime]() routines are MPSAFE.
63823918 MD	1362	*
	1363	* NEW CODE (!)
	1364	*
	1365	* cpu 0 now maintains global ticktimes and an update counter. The
	1366	* getnanotime() and getmicrotime() routines use these globals.
984263bc	1367	*/
984263bc MD	1368	void
	1369	getmicrouptime(struct timeval *tvp)
	1370	{
88c4d2f6 MD	1371	struct globaldata *gd = mycpu;
	1372	sysclock_t delta;
	1373
	1374	do {
	1375	tvp->tv_sec = gd->gd_time_seconds;
	1376	delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
	1377	} while (tvp->tv_sec != gd->gd_time_seconds);
fad57d0e	1378
044ee7c4 MD	1379	if (delta >= sys_cputimer->freq) {
	1380	tvp->tv_sec += delta / sys_cputimer->freq;
	1381	delta %= sys_cputimer->freq;
fad57d0e	1382	}
8fbc264d	1383	tvp->tv_usec = muldivu64(sys_cputimer->freq64_usec, delta, 1L << 32);
88c4d2f6 MD	1384	if (tvp->tv_usec >= 1000000) {
	1385	tvp->tv_usec -= 1000000;
	1386	++tvp->tv_sec;
984263bc MD	1387	}
	1388	}
	1389
	1390	void
	1391	getnanouptime(struct timespec *tsp)
	1392	{
88c4d2f6 MD	1393	struct globaldata *gd = mycpu;
	1394	sysclock_t delta;
	1395
	1396	do {
	1397	tsp->tv_sec = gd->gd_time_seconds;
	1398	delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
	1399	} while (tsp->tv_sec != gd->gd_time_seconds);
fad57d0e	1400
044ee7c4 MD	1401	if (delta >= sys_cputimer->freq) {
	1402	tsp->tv_sec += delta / sys_cputimer->freq;
	1403	delta %= sys_cputimer->freq;
984263bc	1404	}
8fbc264d	1405	tsp->tv_nsec = muldivu64(sys_cputimer->freq64_nsec, delta, 1L << 32);
984263bc MD	1406	}
	1407
	1408	void
88c4d2f6	1409	microuptime(struct timeval *tvp)
984263bc	1410	{
88c4d2f6 MD	1411	struct globaldata *gd = mycpu;
	1412	sysclock_t delta;
	1413
	1414	do {
	1415	tvp->tv_sec = gd->gd_time_seconds;
044ee7c4	1416	delta = sys_cputimer->count() - gd->gd_cpuclock_base;
88c4d2f6	1417	} while (tvp->tv_sec != gd->gd_time_seconds);
fad57d0e	1418
044ee7c4 MD	1419	if (delta >= sys_cputimer->freq) {
	1420	tvp->tv_sec += delta / sys_cputimer->freq;
	1421	delta %= sys_cputimer->freq;
984263bc	1422	}
8fbc264d	1423	tvp->tv_usec = muldivu64(sys_cputimer->freq64_usec, delta, 1L << 32);
984263bc MD	1424	}
	1425
	1426	void
88c4d2f6	1427	nanouptime(struct timespec *tsp)
984263bc	1428	{
88c4d2f6 MD	1429	struct globaldata *gd = mycpu;
	1430	sysclock_t delta;
	1431
	1432	do {
	1433	tsp->tv_sec = gd->gd_time_seconds;
044ee7c4	1434	delta = sys_cputimer->count() - gd->gd_cpuclock_base;
88c4d2f6	1435	} while (tsp->tv_sec != gd->gd_time_seconds);
fad57d0e	1436
044ee7c4 MD	1437	if (delta >= sys_cputimer->freq) {
	1438	tsp->tv_sec += delta / sys_cputimer->freq;
	1439	delta %= sys_cputimer->freq;
984263bc	1440	}
8fbc264d	1441	tsp->tv_nsec = muldivu64(sys_cputimer->freq64_nsec, delta, 1L << 32);
984263bc MD	1442	}
984263bc MD	1443
88c4d2f6 MD	1444	/*
	1445	* realtime routines
	1446	*/
984263bc	1447	void
88c4d2f6	1448	getmicrotime(struct timeval *tvp)
984263bc	1449	{
63823918 MD	1450	struct timespec ts;
63823918 MD	1451	int counter;
984263bc	1452
88c4d2f6	1453	do {
63823918 MD	1454	counter = (volatile int )&ticktime_update;
	1455	cpu_lfence();
	1456	switch(counter & 3) {
	1457	case 0: /* ticktime2 completed update */
	1458	ts = ticktime2;
	1459	break;
	1460	case 1: /* ticktime0 update in progress */
	1461	ts = ticktime2;
	1462	break;
	1463	case 2: /* ticktime0 completed update */
	1464	ts = ticktime0;
	1465	break;
	1466	case 3: /* ticktime2 update in progress */
	1467	ts = ticktime0;
	1468	break;
	1469	}
	1470	cpu_lfence();
	1471	} while (counter != (volatile int )&ticktime_update);
	1472	tvp->tv_sec = ts.tv_sec;
	1473	tvp->tv_usec = ts.tv_nsec / 1000;
984263bc MD	1474	}
	1475
	1476	void
88c4d2f6	1477	getnanotime(struct timespec *tsp)
984263bc	1478	{
63823918 MD	1479	struct timespec ts;
63823918 MD	1480	int counter;
984263bc	1481
88c4d2f6	1482	do {
63823918 MD	1483	counter = (volatile int )&ticktime_update;
	1484	cpu_lfence();
	1485	switch(counter & 3) {
	1486	case 0: /* ticktime2 completed update */
	1487	ts = ticktime2;
	1488	break;
	1489	case 1: /* ticktime0 update in progress */
	1490	ts = ticktime2;
	1491	break;
	1492	case 2: /* ticktime0 completed update */
	1493	ts = ticktime0;
	1494	break;
	1495	case 3: /* ticktime2 update in progress */
	1496	ts = ticktime0;
	1497	break;
	1498	}
	1499	cpu_lfence();
	1500	} while (counter != (volatile int )&ticktime_update);
	1501	*tsp = ts;
984263bc MD	1502	}
984263bc MD	1503
5eb5a6bc MD	1504	static void
	1505	getnanotime_nbt(struct timespec nbt, struct timespec tsp)
	1506	{
	1507	struct globaldata *gd = mycpu;
	1508	sysclock_t delta;
	1509
	1510	do {
	1511	tsp->tv_sec = gd->gd_time_seconds;
	1512	delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
	1513	} while (tsp->tv_sec != gd->gd_time_seconds);
	1514
044ee7c4 MD	1515	if (delta >= sys_cputimer->freq) {
	1516	tsp->tv_sec += delta / sys_cputimer->freq;
	1517	delta %= sys_cputimer->freq;
5eb5a6bc	1518	}
8fbc264d	1519	tsp->tv_nsec = muldivu64(sys_cputimer->freq64_nsec, delta, 1L << 32);
5eb5a6bc MD	1520
	1521	tsp->tv_sec += nbt->tv_sec;
	1522	tsp->tv_nsec += nbt->tv_nsec;
	1523	while (tsp->tv_nsec >= 1000000000) {
	1524	tsp->tv_nsec -= 1000000000;
	1525	++tsp->tv_sec;
	1526	}
	1527	}
	1528
	1529
88c4d2f6 MD	1530	void
88c4d2f6 MD	1531	microtime(struct timeval *tvp)
984263bc	1532	{
88c4d2f6	1533	struct globaldata *gd = mycpu;
5eb5a6bc	1534	struct timespec *bt;
88c4d2f6	1535	sysclock_t delta;
984263bc	1536
88c4d2f6 MD	1537	do {
88c4d2f6 MD	1538	tvp->tv_sec = gd->gd_time_seconds;
044ee7c4	1539	delta = sys_cputimer->count() - gd->gd_cpuclock_base;
88c4d2f6	1540	} while (tvp->tv_sec != gd->gd_time_seconds);
fad57d0e	1541
044ee7c4 MD	1542	if (delta >= sys_cputimer->freq) {
	1543	tvp->tv_sec += delta / sys_cputimer->freq;
	1544	delta %= sys_cputimer->freq;
fad57d0e	1545	}
8fbc264d	1546	tvp->tv_usec = muldivu64(sys_cputimer->freq64_usec, delta, 1L << 32);
984263bc	1547
5eb5a6bc	1548	bt = &basetime[basetime_index];
2ed58723	1549	cpu_lfence();
5eb5a6bc MD	1550	tvp->tv_sec += bt->tv_sec;
5eb5a6bc MD	1551	tvp->tv_usec += bt->tv_nsec / 1000;
88c4d2f6 MD	1552	while (tvp->tv_usec >= 1000000) {
	1553	tvp->tv_usec -= 1000000;
	1554	++tvp->tv_sec;
984263bc	1555	}
984263bc MD	1556	}
984263bc MD	1557
88c4d2f6 MD	1558	void
	1559	nanotime(struct timespec *tsp)
	1560	{
	1561	struct globaldata *gd = mycpu;
5eb5a6bc	1562	struct timespec *bt;
88c4d2f6	1563	sysclock_t delta;
984263bc	1564
88c4d2f6 MD	1565	do {
88c4d2f6 MD	1566	tsp->tv_sec = gd->gd_time_seconds;
044ee7c4	1567	delta = sys_cputimer->count() - gd->gd_cpuclock_base;
88c4d2f6	1568	} while (tsp->tv_sec != gd->gd_time_seconds);
fad57d0e	1569
044ee7c4 MD	1570	if (delta >= sys_cputimer->freq) {
	1571	tsp->tv_sec += delta / sys_cputimer->freq;
	1572	delta %= sys_cputimer->freq;
fad57d0e	1573	}
8fbc264d	1574	tsp->tv_nsec = muldivu64(sys_cputimer->freq64_nsec, delta, 1L << 32);
984263bc	1575
5eb5a6bc	1576	bt = &basetime[basetime_index];
2ed58723	1577	cpu_lfence();
5eb5a6bc MD	1578	tsp->tv_sec += bt->tv_sec;
5eb5a6bc MD	1579	tsp->tv_nsec += bt->tv_nsec;
88c4d2f6 MD	1580	while (tsp->tv_nsec >= 1000000000) {
	1581	tsp->tv_nsec -= 1000000000;
	1582	++tsp->tv_sec;
984263bc	1583	}
984263bc MD	1584	}
984263bc MD	1585
25b804e7	1586	/*
2ed58723 MD	1587	* Get an approximate time_t. It does not have to be accurate. This
	1588	* function is called only from KTR and can be called with the system in
	1589	* any state so do not use a critical section or other complex operation
	1590	* here.
	1591	*
	1592	* NOTE: This is not exactly synchronized with real time. To do that we
	1593	* would have to do what microtime does and check for a nanoseconds
	1594	* overflow.
25b804e7 MD	1595	*/
	1596	time_t
	1597	get_approximate_time_t(void)
	1598	{
	1599	struct globaldata *gd = mycpu;
5eb5a6bc MD	1600	struct timespec *bt;
	1601
	1602	bt = &basetime[basetime_index];
	1603	return(gd->gd_time_seconds + bt->tv_sec);
25b804e7 MD	1604	}
25b804e7 MD	1605
0c4dbac1 DC	1606	static int
	1607	pps_fetch_timeout(struct timespec timeout, struct pps_state pps)
	1608	{
	1609	int to, err;
	1610	pps_seq_t ap, cp;
	1611	pps_seq_t a, c;
	1612
	1613	to = INT_MAX;
	1614	if (timeout->tv_sec > -1)
	1615	to = tstohz_low(timeout);
	1616
	1617	ap = &pps->ppsinfo.assert_sequence;
	1618	cp = &pps->ppsinfo.clear_sequence;
	1619	a = atomic_load_acq_int(ap);
	1620	c = atomic_load_acq_int(cp);
	1621
	1622	while (a == atomic_load_acq_int(ap) && c == atomic_load_acq_int(cp)) {
	1623	err = tsleep(pps, PCATCH, "ppsfch", to);
	1624	if (err == EWOULDBLOCK) {
	1625	if (timeout->tv_sec < 0)
	1626	continue;
	1627	return (ETIMEDOUT);
	1628	}
	1629	if (err != 0)
	1630	return (err);
	1631	}
	1632
	1633	return (0);
	1634	}
	1635
984263bc MD	1636	int
	1637	pps_ioctl(u_long cmd, caddr_t data, struct pps_state *pps)
	1638	{
	1639	pps_params_t *app;
	1640	struct pps_fetch_args *fapi;
	1641	#ifdef PPS_SYNC
	1642	struct pps_kcbind_args *kapi;
	1643	#endif
0c4dbac1	1644	int err;
984263bc MD	1645
	1646	switch (cmd) {
	1647	case PPS_IOC_CREATE:
	1648	return (0);
	1649	case PPS_IOC_DESTROY:
	1650	return (0);
	1651	case PPS_IOC_SETPARAMS:
	1652	app = (pps_params_t *)data;
	1653	if (app->mode & ~pps->ppscap)
	1654	return (EINVAL);
bbf175be	1655	pps->ppsparam = *app;
984263bc MD	1656	return (0);
	1657	case PPS_IOC_GETPARAMS:
	1658	app = (pps_params_t *)data;
	1659	*app = pps->ppsparam;
	1660	app->api_version = PPS_API_VERS_1;
	1661	return (0);
	1662	case PPS_IOC_GETCAP:
	1663	(int)data = pps->ppscap;
	1664	return (0);
	1665	case PPS_IOC_FETCH:
	1666	fapi = (struct pps_fetch_args *)data;
	1667	if (fapi->tsformat && fapi->tsformat != PPS_TSFMT_TSPEC)
	1668	return (EINVAL);
0c4dbac1 DC	1669	if (fapi->timeout.tv_sec != 0 \|\| fapi->timeout.tv_nsec != 0) {
	1670	err = pps_fetch_timeout(&fapi->timeout, pps);
	1671	if (err != 0)
	1672	return (err);
	1673	}
bbf175be	1674	pps->ppsinfo.current_mode = pps->ppsparam.mode;
984263bc MD	1675	fapi->pps_info_buf = pps->ppsinfo;
	1676	return (0);
	1677	case PPS_IOC_KCBIND:
	1678	#ifdef PPS_SYNC
	1679	kapi = (struct pps_kcbind_args *)data;
	1680	/* XXX Only root should be able to do this */
	1681	if (kapi->tsformat && kapi->tsformat != PPS_TSFMT_TSPEC)
	1682	return (EINVAL);
	1683	if (kapi->kernel_consumer != PPS_KC_HARDPPS)
	1684	return (EINVAL);
	1685	if (kapi->edge & ~pps->ppscap)
	1686	return (EINVAL);
	1687	pps->kcmode = kapi->edge;
	1688	return (0);
	1689	#else
	1690	return (EOPNOTSUPP);
	1691	#endif
	1692	default:
	1693	return (ENOTTY);
	1694	}
	1695	}
	1696
	1697	void
	1698	pps_init(struct pps_state *pps)
	1699	{
0c4dbac1	1700	pps->ppscap \|= PPS_TSFMT_TSPEC \| PPS_CANWAIT;
984263bc MD	1701	if (pps->ppscap & PPS_CAPTUREASSERT)
	1702	pps->ppscap \|= PPS_OFFSETASSERT;
	1703	if (pps->ppscap & PPS_CAPTURECLEAR)
	1704	pps->ppscap \|= PPS_OFFSETCLEAR;
	1705	}
	1706
	1707	void
88c4d2f6	1708	pps_event(struct pps_state *pps, sysclock_t count, int event)
984263bc	1709	{
88c4d2f6 MD	1710	struct globaldata *gd;
	1711	struct timespec *tsp;
	1712	struct timespec *osp;
5eb5a6bc	1713	struct timespec *bt;
88c4d2f6 MD	1714	struct timespec ts;
	1715	sysclock_t *pcount;
	1716	#ifdef PPS_SYNC
	1717	sysclock_t tcount;
	1718	#endif
	1719	sysclock_t delta;
	1720	pps_seq_t *pseq;
	1721	int foff;
aa85218e	1722	#ifdef PPS_SYNC
88c4d2f6	1723	int fhard;
aa85218e	1724	#endif
2ed58723	1725	int ni;
88c4d2f6 MD	1726
88c4d2f6 MD	1727	gd = mycpu;
984263bc MD	1728
	1729	/* Things would be easier with arrays... */
	1730	if (event == PPS_CAPTUREASSERT) {
	1731	tsp = &pps->ppsinfo.assert_timestamp;
	1732	osp = &pps->ppsparam.assert_offset;
	1733	foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
c246e343	1734	#ifdef PPS_SYNC
984263bc	1735	fhard = pps->kcmode & PPS_CAPTUREASSERT;
c246e343	1736	#endif
984263bc MD	1737	pcount = &pps->ppscount[0];
	1738	pseq = &pps->ppsinfo.assert_sequence;
	1739	} else {
	1740	tsp = &pps->ppsinfo.clear_timestamp;
	1741	osp = &pps->ppsparam.clear_offset;
	1742	foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
c246e343	1743	#ifdef PPS_SYNC
984263bc	1744	fhard = pps->kcmode & PPS_CAPTURECLEAR;
c246e343	1745	#endif
984263bc MD	1746	pcount = &pps->ppscount[1];
	1747	pseq = &pps->ppsinfo.clear_sequence;
	1748	}
	1749
984263bc MD	1750	/* Nothing really happened */
	1751	if (*pcount == count)
	1752	return;
	1753
	1754	*pcount = count;
	1755
88c4d2f6 MD	1756	do {
	1757	ts.tv_sec = gd->gd_time_seconds;
	1758	delta = count - gd->gd_cpuclock_base;
	1759	} while (ts.tv_sec != gd->gd_time_seconds);
fad57d0e	1760
044ee7c4 MD	1761	if (delta >= sys_cputimer->freq) {
	1762	ts.tv_sec += delta / sys_cputimer->freq;
	1763	delta %= sys_cputimer->freq;
88c4d2f6	1764	}
8fbc264d	1765	ts.tv_nsec = muldivu64(sys_cputimer->freq64_nsec, delta, 1L << 32);
2ed58723 MD	1766	ni = basetime_index;
	1767	cpu_lfence();
	1768	bt = &basetime[ni];
5eb5a6bc MD	1769	ts.tv_sec += bt->tv_sec;
5eb5a6bc MD	1770	ts.tv_nsec += bt->tv_nsec;
88c4d2f6 MD	1771	while (ts.tv_nsec >= 1000000000) {
	1772	ts.tv_nsec -= 1000000000;
	1773	++ts.tv_sec;
984263bc	1774	}
984263bc	1775
0c4dbac1	1776	atomic_add_rel_int(pseq, 1);
984263bc MD	1777	*tsp = ts;
	1778
	1779	if (foff) {
944cd60c	1780	timespecadd(tsp, osp, tsp);
984263bc MD	1781	if (tsp->tv_nsec < 0) {
	1782	tsp->tv_nsec += 1000000000;
	1783	tsp->tv_sec -= 1;
	1784	}
	1785	}
	1786	#ifdef PPS_SYNC
	1787	if (fhard) {
	1788	/* magic, at its best... */
	1789	tcount = count - pps->ppscount[2];
	1790	pps->ppscount[2] = count;
044ee7c4 MD	1791	if (tcount >= sys_cputimer->freq) {
044ee7c4 MD	1792	delta = (1000000000 * (tcount / sys_cputimer->freq) +
bbf175be	1793	sys_cputimer->freq64_nsec *
044ee7c4	1794	(tcount % sys_cputimer->freq)) >> 32;
fad57d0e	1795	} else {
8fbc264d MD	1796	delta = muldivu64(sys_cputimer->freq64_nsec,
8fbc264d MD	1797	tcount, 1L << 32);
fad57d0e	1798	}
984263bc MD	1799	hardpps(tsp, delta);
	1800	}
	1801	#endif
0c4dbac1	1802	wakeup(pps);
984263bc	1803	}
88c4d2f6	1804
d2412a2e MD	1805	/*
	1806	* Return the tsc target value for a delay of (ns).
	1807	*
	1808	* Returns -1 if the TSC is not supported.
	1809	*/
5b49787b	1810	tsc_uclock_t
d2412a2e MD	1811	tsc_get_target(int ns)
	1812	{
	1813	#if defined(_RDTSC_SUPPORTED_)
	1814	if (cpu_feature & CPUID_TSC) {
	1815	return (rdtsc() + tsc_frequency * ns / (int64_t)1000000000);
	1816	}
	1817	#endif
	1818	return(-1);
	1819	}
	1820
	1821	/*
	1822	* Compare the tsc against the passed target
	1823	*
	1824	* Returns +1 if the target has been reached
	1825	* Returns 0 if the target has not yet been reached
	1826	* Returns -1 if the TSC is not supported.
	1827	*
	1828	* Typical use: while (tsc_test_target(target) == 0) { ...poll... }
	1829	*/
	1830	int
	1831	tsc_test_target(int64_t target)
	1832	{
	1833	#if defined(_RDTSC_SUPPORTED_)
	1834	if (cpu_feature & CPUID_TSC) {
	1835	if ((int64_t)(target - rdtsc()) <= 0)
	1836	return(1);
	1837	return(0);
	1838	}
d2412a2e	1839	#endif
2e537993	1840	return(-1);
d2412a2e	1841	}
b12defdc MD	1842
	1843	/*
	1844	* Delay the specified number of nanoseconds using the tsc. This function
	1845	* returns immediately if the TSC is not supported. At least one cpu_pause()
	1846	* will be issued.
	1847	*/
	1848	void
	1849	tsc_delay(int ns)
	1850	{
	1851	int64_t clk;
	1852
	1853	clk = tsc_get_target(ns);
	1854	cpu_pause();
f5955a53 MD	1855	cpu_pause();
	1856	while (tsc_test_target(clk) == 0) {
	1857	cpu_pause();
	1858	cpu_pause();
b12defdc	1859	cpu_pause();
f5955a53 MD	1860	cpu_pause();
f5955a53 MD	1861	}
b12defdc	1862	}