sys/kern/kern_clock.c
1/*
2 * Copyright (c) 2003,2004 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * Copyright (c) 1997, 1998 Poul-Henning Kamp <phk@FreeBSD.org>
35 * Copyright (c) 1982, 1986, 1991, 1993
36 * The Regents of the University of California. All rights reserved.
37 * (c) UNIX System Laboratories, Inc.
38 * All or some portions of this file are derived from material licensed
39 * to the University of California by American Telephone and Telegraph
40 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
41 * the permission of UNIX System Laboratories, Inc.
42 *
43 * Redistribution and use in source and binary forms, with or without
44 * modification, are permitted provided that the following conditions
45 * are met:
46 * 1. Redistributions of source code must retain the above copyright
47 * notice, this list of conditions and the following disclaimer.
48 * 2. Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution.
51 * 3. Neither the name of the University nor the names of its contributors
52 * may be used to endorse or promote products derived from this software
53 * without specific prior written permission.
54 *
55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65 * SUCH DAMAGE.
66 *
67 * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
68 * $FreeBSD: src/sys/kern/kern_clock.c,v 1.105.2.10 2002/10/17 13:19:40 maxim Exp $
69 */
70
71#include "opt_ntp.h"
72#include "opt_pctrack.h"
73
74#include <sys/param.h>
75#include <sys/systm.h>
76#include <sys/callout.h>
77#include <sys/kernel.h>
78#include <sys/kinfo.h>
79#include <sys/proc.h>
80#include <sys/malloc.h>
81#include <sys/resource.h>
82#include <sys/resourcevar.h>
83#include <sys/signalvar.h>
84#include <sys/caps.h>
85#include <sys/timex.h>
86#include <sys/timepps.h>
87#include <sys/upmap.h>
88#include <sys/lock.h>
89#include <sys/sysctl.h>
90#include <sys/kcollect.h>
91#include <sys/exislock.h>
92#include <sys/exislock2.h>
93
94#include <vm/vm.h>
95#include <vm/pmap.h>
96#include <vm/vm_map.h>
97#include <vm/vm_extern.h>
98
99#include <sys/thread2.h>
100#include <sys/spinlock2.h>
101
102#include <machine/cpu.h>
103#include <machine/limits.h>
104#include <machine/smp.h>
105#include <machine/cpufunc.h>
106#include <machine/specialreg.h>
107#include <machine/clock.h>
108
109#ifdef DEBUG_PCTRACK
110static void do_pctrack(struct intrframe *frame, int which);
111#endif
112
113static void initclocks (void *dummy);
114SYSINIT(clocks, SI_BOOT2_CLOCKS, SI_ORDER_FIRST, initclocks, NULL);
115
116/*
117 * Some of these don't belong here, but it's easiest to concentrate them.
118 * Note that cpu_time counts in microseconds, but most userland programs
119 * just compare relative times against the total by delta.
120 */
121struct kinfo_cputime cputime_percpu[MAXCPU];
122#ifdef DEBUG_PCTRACK
123struct kinfo_pcheader cputime_pcheader = { PCTRACK_SIZE, PCTRACK_ARYSIZE };
124struct kinfo_pctrack cputime_pctrack[MAXCPU][PCTRACK_SIZE];
125#endif
126
127__read_mostly static int sniff_enable = 1;
128__read_mostly static int sniff_target = -1;
129__read_mostly static int clock_debug2 = 0;
130SYSCTL_INT(_kern, OID_AUTO, sniff_enable, CTLFLAG_RW, &sniff_enable, 0 , "");
131SYSCTL_INT(_kern, OID_AUTO, sniff_target, CTLFLAG_RW, &sniff_target, 0 , "");
132SYSCTL_INT(_debug, OID_AUTO, clock_debug2, CTLFLAG_RW, &clock_debug2, 0 , "");
133
134__read_mostly long pseudo_ticks = 1; /* existential timed locks */
135
136static int
137sysctl_cputime(SYSCTL_HANDLER_ARGS)
138{
139 int cpu, error = 0;
140 int root_error;
141 size_t size = sizeof(struct kinfo_cputime);
142 struct kinfo_cputime tmp;
143
144 /*
145 * NOTE: For security reasons, only root can sniff %rip
146 */
147 root_error = caps_priv_check_self(SYSCAP_RESTRICTEDROOT);
148
149 for (cpu = 0; cpu < ncpus; ++cpu) {
150 tmp = cputime_percpu[cpu];
151 if (root_error == 0) {
152 tmp.cp_sample_pc =
153 (int64_t)globaldata_find(cpu)->gd_sample_pc;
154 tmp.cp_sample_sp =
155 (int64_t)globaldata_find(cpu)->gd_sample_sp;
156 }
157 if ((error = SYSCTL_OUT(req, &tmp, size)) != 0)
158 break;
159 }
160
161 if (root_error == 0) {
162 if (sniff_enable) {
163 int n = sniff_target;
164 if (n < 0)
165 smp_sniff();
166 else if (n < ncpus)
167 cpu_sniff(n);
168 }
169 }
170
171 return (error);
172}
173SYSCTL_PROC(_kern, OID_AUTO, cputime, (CTLTYPE_OPAQUE|CTLFLAG_RD), 0, 0,
174 sysctl_cputime, "S,kinfo_cputime", "CPU time statistics");
175
176static int
177sysctl_cp_time(SYSCTL_HANDLER_ARGS)
178{
179 long cpu_states[CPUSTATES] = {0};
180 int cpu, error = 0;
181 size_t size = sizeof(cpu_states);
182
183 for (cpu = 0; cpu < ncpus; ++cpu) {
184 cpu_states[CP_USER] += cputime_percpu[cpu].cp_user;
185 cpu_states[CP_NICE] += cputime_percpu[cpu].cp_nice;
186 cpu_states[CP_SYS] += cputime_percpu[cpu].cp_sys;
187 cpu_states[CP_INTR] += cputime_percpu[cpu].cp_intr;
188 cpu_states[CP_IDLE] += cputime_percpu[cpu].cp_idle;
189 }
190
191 error = SYSCTL_OUT(req, cpu_states, size);
192
193 return (error);
194}
195
196SYSCTL_PROC(_kern, OID_AUTO, cp_time, (CTLTYPE_LONG|CTLFLAG_RD), 0, 0,
197 sysctl_cp_time, "LU", "CPU time statistics");
198
199static int
200sysctl_cp_times(SYSCTL_HANDLER_ARGS)
201{
202 long cpu_states[CPUSTATES] = {0};
203 int cpu, error;
204 size_t size = sizeof(cpu_states);
205
206 for (error = 0, cpu = 0; error == 0 && cpu < ncpus; ++cpu) {
207 cpu_states[CP_USER] = cputime_percpu[cpu].cp_user;
208 cpu_states[CP_NICE] = cputime_percpu[cpu].cp_nice;
209 cpu_states[CP_SYS] = cputime_percpu[cpu].cp_sys;
210 cpu_states[CP_INTR] = cputime_percpu[cpu].cp_intr;
211 cpu_states[CP_IDLE] = cputime_percpu[cpu].cp_idle;
212 error = SYSCTL_OUT(req, cpu_states, size);
213 }
214
215 return (error);
216}
217
218SYSCTL_PROC(_kern, OID_AUTO, cp_times, (CTLTYPE_LONG|CTLFLAG_RD), 0, 0,
219 sysctl_cp_times, "LU", "per-CPU time statistics");
220
221/*
222 * boottime is used to calculate the 'real' uptime. Do not confuse this with
223 * microuptime(). microtime() is not drift compensated. The real uptime
224 * with compensation is nanotime() - boottime.  boottime is recalculated
225 * whenever the real time is set based on the compensated elapsed time
226 * in seconds (gd->gd_time_seconds).
227 *
228 * The gd_time_seconds and gd_cpuclock_base fields remain fairly monotonic.
229 * Slight adjustments to gd_cpuclock_base are made to phase-lock it to
230 * the real time.
231 *
232 * WARNING! time_second can backstep on time corrections. Also, unlike
233 * time_second, time_uptime is not a "real" time_t (seconds
234 * since the Epoch) but seconds since booting.
235 */
236__read_mostly struct timespec boottime; /* boot time (realtime) for ref only */
237__read_mostly struct timespec ticktime0;/* updated every tick */
238__read_mostly struct timespec ticktime2;/* updated every tick */
239__read_mostly int ticktime_update;
240__read_mostly time_t time_second; /* read-only 'passive' rt in seconds */
241__read_mostly time_t time_uptime; /* read-only 'passive' ut in seconds */
242
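/*
 * Rough illustration (an approximation only, ignoring the drift and NTP
 * corrections applied since boot): the current wall-clock time can be
 * reconstructed as boottime + nanouptime().
 */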
243/*
244 * basetime is used to calculate the compensated real time of day. The
245 * basetime can be modified on a per-tick basis by the adjtime(),
246 * ntp_adjtime(), and sysctl-based time correction APIs.
247 *
248 * Note that frequency corrections can also be made by adjusting
249 * gd_cpuclock_base.
250 *
251 * basetime is a tail-chasing FIFO, updated only by cpu #0. The FIFO is
252 * used on both SMP and UP systems to avoid MP races between cpu's and
253 * interrupt races on UP systems.
254 */
255struct hardtime {
256 __uint32_t time_second;
257 sysclock_t cpuclock_base;
258};
259
260#define BASETIME_ARYSIZE 16
261#define BASETIME_ARYMASK (BASETIME_ARYSIZE - 1)
262static struct timespec basetime[BASETIME_ARYSIZE];
263static struct hardtime hardtime[BASETIME_ARYSIZE];
264static volatile int basetime_index;
265
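/*
 * Illustrative sketch (not compiled): the reader/writer discipline used
 * for the basetime[] and hardtime[] FIFOs.  Only cpu #0 ever writes; any
 * cpu may read.  The fences pair the completed slot with the index update
 * so a reader never observes a half-written slot.  "new_value" below is
 * just a stand-in for the computed timespec.
 */
#if 0
	/* writer, cpu #0 only (see set_timeofday() and hardclock()) */
	int ni = (basetime_index + 1) & BASETIME_ARYMASK;
	basetime[ni] = new_value;	/* fill the next slot */
	cpu_sfence();			/* slot must be visible before index */
	basetime_index = ni;

	/* reader, any cpu (see sysctl_get_basetime() and nanotime()) */
	int ri = basetime_index;
	cpu_lfence();			/* index must be read before slot */
	struct timespec snap = basetime[ri];
#endif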
266static int
267sysctl_get_basetime(SYSCTL_HANDLER_ARGS)
268{
269 struct timespec *bt;
270 int error;
271 int index;
272
273 /*
274 * Because basetime data and index may be updated by another cpu,
275 * a load fence is required to ensure that the data we read has
276 * not been speculatively read relative to a possibly updated index.
277 */
278 index = basetime_index;
279 cpu_lfence();
280 bt = &basetime[index];
281 error = SYSCTL_OUT(req, bt, sizeof(*bt));
282 return (error);
283}
284
285SYSCTL_STRUCT(_kern, KERN_BOOTTIME, boottime, CTLFLAG_RD,
286 &boottime, timespec, "System boottime");
287SYSCTL_PROC(_kern, OID_AUTO, basetime, CTLTYPE_STRUCT|CTLFLAG_RD, 0, 0,
288 sysctl_get_basetime, "S,timespec", "System basetime");
289
290static void hardclock(systimer_t info, int, struct intrframe *frame);
291static void statclock(systimer_t info, int, struct intrframe *frame);
292static void schedclock(systimer_t info, int, struct intrframe *frame);
293static void getnanotime_nbt(struct timespec *nbt, struct timespec *tsp);
294
295/*
296 * Use __read_mostly for ticks and sched_ticks because these variables are
297 * used all over the kernel and only updated once per tick.
298 */
299__read_mostly sbintime_t sbticks; /* system master ticks at hz (64bit) */
300__read_mostly int ticks; /* system master ticks at hz */
301__read_mostly int sched_ticks; /* global schedule clock ticks */
302__read_mostly int clocks_running; /* tsleep/timeout clocks operational */
303int64_t nsec_adj; /* ntpd per-tick adjustment in nsec << 32 */
304int64_t nsec_acc; /* accumulator */
305
306/* NTPD time correction fields */
307int64_t ntp_tick_permanent; /* per-tick adjustment in nsec << 32 */
308int64_t ntp_tick_acc; /* accumulator for per-tick adjustment */
309int64_t ntp_delta; /* one-time correction in nsec */
310int64_t ntp_big_delta = 1000000000;
311int32_t ntp_tick_delta; /* current adjustment rate */
312int32_t ntp_default_tick_delta; /* adjustment rate for ntp_delta */
313time_t ntp_leap_second; /* time of next leap second */
314int ntp_leap_insert; /* whether to insert or remove a second */
315struct spinlock ntp_spin;
316
317/*
318 * Finish initializing clock frequencies and start all clocks running.
319 */
320/* ARGSUSED*/
321static void
322initclocks(void *dummy)
323{
324 /*psratio = profhz / stathz;*/
325 spin_init(&ntp_spin, "ntp");
326 initclocks_pcpu();
327 clocks_running = 1;
328 if (kpmap) {
329 kpmap->tsc_freq = tsc_frequency;
330 kpmap->tick_freq = hz;
331 }
332}
333
334/*
335 * Called on a per-cpu basis from the idle thread bootstrap on each cpu
336 * during SMP initialization.
337 *
338 * This routine is called concurrently during low-level SMP initialization
339 * and may not block in any way. Meaning, among other things, we can't
340 * acquire any tokens.
341 */
342void
343initclocks_pcpu(void)
344{
345 struct globaldata *gd = mycpu;
346
347 crit_enter();
348 if (gd->gd_cpuid == 0) {
349 gd->gd_time_seconds = 1;
350 gd->gd_cpuclock_base = sys_cputimer->count();
351 hardtime[0].time_second = gd->gd_time_seconds;
352 hardtime[0].cpuclock_base = gd->gd_cpuclock_base;
353 } else {
354 gd->gd_time_seconds = globaldata_find(0)->gd_time_seconds;
355 gd->gd_cpuclock_base = globaldata_find(0)->gd_cpuclock_base;
356 }
357
358 systimer_intr_enable();
359
360 crit_exit();
361}
362
363/*
364 * Called on a 10-second interval after the system is operational.
365 * Return the collection data for USERPCT and install the data for
366 * SYSTPCT and IDLEPCT.
367 */
368static
369uint64_t
370collect_cputime_callback(int n)
371{
372 static long cpu_base[CPUSTATES];
373 long cpu_states[CPUSTATES];
374 long total;
375 long acc;
376 long lsb;
377
378 bzero(cpu_states, sizeof(cpu_states));
379 for (n = 0; n < ncpus; ++n) {
380 cpu_states[CP_USER] += cputime_percpu[n].cp_user;
381 cpu_states[CP_NICE] += cputime_percpu[n].cp_nice;
382 cpu_states[CP_SYS] += cputime_percpu[n].cp_sys;
383 cpu_states[CP_INTR] += cputime_percpu[n].cp_intr;
384 cpu_states[CP_IDLE] += cputime_percpu[n].cp_idle;
385 }
386
387 acc = 0;
388 for (n = 0; n < CPUSTATES; ++n) {
389 total = cpu_states[n] - cpu_base[n];
390 cpu_base[n] = cpu_states[n];
391 cpu_states[n] = total;
392 acc += total;
393 }
394 if (acc == 0) /* prevent degenerate divide by 0 */
395 acc = 1;
396 lsb = acc / (10000 * 2);
397 kcollect_setvalue(KCOLLECT_SYSTPCT,
398 (cpu_states[CP_SYS] + lsb) * 10000 / acc);
399 kcollect_setvalue(KCOLLECT_IDLEPCT,
400 (cpu_states[CP_IDLE] + lsb) * 10000 / acc);
401 kcollect_setvalue(KCOLLECT_INTRPCT,
402 (cpu_states[CP_INTR] + lsb) * 10000 / acc);
403 return((cpu_states[CP_USER] + cpu_states[CP_NICE] + lsb) * 10000 / acc);
404}
405
406/*
407 * This routine is called on just the BSP, just after SMP initialization
408 * completes, to finish initializing any clocks that might contend/block
409 * (e.g. like on a token). We can't do this in initclocks_pcpu() because
410 * that function is called from the idle thread bootstrap for each cpu and
411 * not allowed to block at all.
412 */
413static
414void
415initclocks_other(void *dummy)
416{
417 struct globaldata *ogd = mycpu;
418 struct globaldata *gd;
419 int n;
420
421 for (n = 0; n < ncpus; ++n) {
422 lwkt_setcpu_self(globaldata_find(n));
423 gd = mycpu;
424
425 /*
426 * Use a non-queued periodic systimer to prevent multiple
427 * ticks from building up if the sysclock jumps forward
428 * (8254 gets reset). The sysclock will never jump backwards.
429 * Our time sync is based on the actual sysclock, not the
430 * ticks count.
431 *
432 * Install statclock before hardclock to prevent statclock
433 * from misinterpreting gd_flags for tick assignment when
434 * they overlap. Also offset the statclock by half of
435 * its interval to try to avoid being coincident with
436 * callouts.
437 */
438 systimer_init_periodic_flags(&gd->gd_statclock, statclock,
439 NULL, stathz,
440 SYSTF_MSSYNC | SYSTF_FIRST |
441 SYSTF_OFFSET50 | SYSTF_OFFSETCPU);
442 systimer_init_periodic_flags(&gd->gd_hardclock, hardclock,
443 NULL, hz,
444 SYSTF_MSSYNC | SYSTF_OFFSETCPU);
445 }
446 lwkt_setcpu_self(ogd);
447
448 /*
449 * Regular data collection
450 */
451 kcollect_register(KCOLLECT_USERPCT, "user", collect_cputime_callback,
452 KCOLLECT_SCALE(KCOLLECT_USERPCT_FORMAT, 0));
453 kcollect_register(KCOLLECT_SYSTPCT, "syst", NULL,
454 KCOLLECT_SCALE(KCOLLECT_SYSTPCT_FORMAT, 0));
455 kcollect_register(KCOLLECT_IDLEPCT, "idle", NULL,
456 KCOLLECT_SCALE(KCOLLECT_IDLEPCT_FORMAT, 0));
457}
458SYSINIT(clocks2, SI_BOOT2_POST_SMP, SI_ORDER_ANY, initclocks_other, NULL);
459
460/*
461 * This method is called on just the BSP, after all the usched implementations
462 * are initialized. This avoids races between usched initialization functions
463 * and usched_schedulerclock().
464 */
465static
466void
467initclocks_usched(void *dummy)
468{
469 struct globaldata *ogd = mycpu;
470 struct globaldata *gd;
471 int n;
472
473 for (n = 0; n < ncpus; ++n) {
474 lwkt_setcpu_self(globaldata_find(n));
475 gd = mycpu;
476
477 /* XXX correct the frequency for scheduler / estcpu tests */
478 systimer_init_periodic_flags(&gd->gd_schedclock, schedclock,
479 NULL, ESTCPUFREQ,
480 SYSTF_MSSYNC | SYSTF_OFFSETCPU);
481 }
482 lwkt_setcpu_self(ogd);
483}
484SYSINIT(clocks3, SI_BOOT2_USCHED, SI_ORDER_ANY, initclocks_usched, NULL);
485
486/*
487 * This sets the current real time of day. Timespecs are in seconds and
488 * nanoseconds. We do not mess with gd_time_seconds and gd_cpuclock_base,
489 * instead we adjust basetime so basetime + gd_* results in the current
490 * time of day. This way the gd_* fields are guaranteed to represent
491 * a monotonically increasing 'uptime' value.
492 *
493 * When set_timeofday() is called from userland, the system call forces it
494 * onto cpu #0 since only cpu #0 can update basetime_index.
495 */
496void
497set_timeofday(struct timespec *ts)
498{
499 struct timespec *nbt;
500 int ni;
501
502 /*
503 * XXX SMP / non-atomic basetime updates
504 */
505 crit_enter();
506 ni = (basetime_index + 1) & BASETIME_ARYMASK;
507 cpu_lfence();
508 nbt = &basetime[ni];
509 nanouptime(nbt);
510 nbt->tv_sec = ts->tv_sec - nbt->tv_sec;
511 nbt->tv_nsec = ts->tv_nsec - nbt->tv_nsec;
512 if (nbt->tv_nsec < 0) {
513 nbt->tv_nsec += 1000000000;
514 --nbt->tv_sec;
515 }
516
517 /*
518 * Note that basetime diverges from boottime as the clock drift is
519 * compensated for, so we cannot do away with boottime. When setting
520 * the absolute time of day the drift is 0 (for an instant) and we
521 * can simply assign boottime to basetime.
522 *
523 * Note that nanouptime() is based on gd_time_seconds which is drift
524 * compensated up to a point (it is guaranteed to remain monotonically
525 * increasing). gd_time_seconds is thus our best uptime guess and
526 * suitable for use in the boottime calculation. It is already taken
527 * into account in the basetime calculation above.
528 */
529 spin_lock(&ntp_spin);
530 boottime.tv_sec = nbt->tv_sec;
531 ntp_delta = 0;
532
533 /*
534 * We now have a new basetime, make sure all other cpus have it,
535 * then update the index.
536 */
537 cpu_sfence();
538 basetime_index = ni;
539 spin_unlock(&ntp_spin);
540
541 crit_exit();
542}
543
544/*
545 * Each cpu has its own hardclock, but we only increment ticks and softticks
546 * on cpu #0.
547 *
548 * NOTE! systimer! the MP lock might not be held here. We can only safely
549 * manipulate objects owned by the current cpu.
550 */
551static void
552hardclock(systimer_t info, int in_ipi, struct intrframe *frame)
553{
554 sysclock_t cputicks;
555 struct proc *p;
556 struct globaldata *gd = mycpu;
557
558 if ((gd->gd_reqflags & RQF_IPIQ) == 0 && lwkt_need_ipiq_process(gd)) {
559 /* Defer to doreti on passive IPIQ processing */
560 need_ipiq();
561 }
562
563 /*
564 * We update the compensation base to calculate fine-grained time
565 * from the sys_cputimer on a per-cpu basis in order to avoid
566 * having to mess around with locks. sys_cputimer is assumed to
567 * be consistent across all cpus. CPU N copies the base state from
568 * CPU 0 using the same FIFO trick that we use for basetime (so we
569 * don't catch a CPU 0 update in the middle).
570 *
571 * Note that we never allow info->time (aka gd->gd_hardclock.time)
572 * to reverse index gd_cpuclock_base, but that it is possible for
573 * it to temporarily get behind in the seconds if something in the
574 * system locks interrupts for a long period of time. Since periodic
575 * timers count events, though, everything should resynch again
576 * immediately.
577 */
578 if (gd->gd_cpuid == 0) {
579 int ni;
580
581 cputicks = info->time - gd->gd_cpuclock_base;
582 if (cputicks >= sys_cputimer->freq) {
583 cputicks /= sys_cputimer->freq;
584 if (cputicks != 0 && cputicks != 1)
585 kprintf("Warning: hardclock missed > 1 sec\n");
586 gd->gd_time_seconds += cputicks;
587 gd->gd_cpuclock_base += sys_cputimer->freq * cputicks;
588 /* uncorrected monotonic 1-sec gran */
589 time_uptime += cputicks;
590 }
591 ni = (basetime_index + 1) & BASETIME_ARYMASK;
592 hardtime[ni].time_second = gd->gd_time_seconds;
593 hardtime[ni].cpuclock_base = gd->gd_cpuclock_base;
594 } else {
595 int ni;
596
597 ni = basetime_index;
598 cpu_lfence();
599 gd->gd_time_seconds = hardtime[ni].time_second;
600 gd->gd_cpuclock_base = hardtime[ni].cpuclock_base;
601 }
602
603 /*
604 * The system-wide ticks counter and NTP related timedelta/tickdelta
605 * adjustments only occur on cpu #0. NTP adjustments are accomplished
606 * by updating basetime.
607 */
608 if (gd->gd_cpuid == 0) {
609 struct timespec *nbt;
610 struct timespec nts;
611 int leap;
612 int ni;
613
614 /*
615 * Update system-wide ticks
616 */
617 ++ticks;
618 ++sbticks;
619
620 /*
621 * Update system-wide ticktime for getnanotime() and getmicrotime()
622 */
623 nanotime(&nts);
624 atomic_add_int_nonlocked(&ticktime_update, 1);
625 cpu_sfence();
626 if (ticktime_update & 2)
627 ticktime2 = nts;
628 else
629 ticktime0 = nts;
630 cpu_sfence();
631 atomic_add_int_nonlocked(&ticktime_update, 1);
632
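		/*
		 * The counter above implements a small double-buffer scheme
		 * for the getnanotime()/getmicrotime() readers: an odd value
		 * means an update is in progress, and for even values
		 * (counter & 3) == 0 means ticktime2 holds the newest stamp
		 * while (counter & 3) == 2 means ticktime0 does.  Readers
		 * snapshot the counter, pick the stable buffer, and retry if
		 * the counter changed while they were copying.
		 */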
633#if 0
634 if (tco->tc_poll_pps)
635 tco->tc_poll_pps(tco);
636#endif
637
638 /*
639 * Calculate the new basetime index. We are in a critical section
640 * on cpu #0 and can safely play with basetime_index. Start
641 * with the current basetime and then make adjustments.
642 */
643 ni = (basetime_index + 1) & BASETIME_ARYMASK;
644 nbt = &basetime[ni];
645 *nbt = basetime[basetime_index];
646
647 /*
648 * ntp adjustments only occur on cpu 0 and are protected by
649 * ntp_spin. This spinlock virtually never conflicts.
650 */
651 spin_lock(&ntp_spin);
652
653 /*
654 * Apply adjtime corrections. (adjtime() API)
655 *
656 * adjtime() only runs on cpu #0 so our critical section is
657 * sufficient to access these variables.
658 */
659 if (ntp_delta != 0) {
660 nbt->tv_nsec += ntp_tick_delta;
661 ntp_delta -= ntp_tick_delta;
662 if ((ntp_delta > 0 && ntp_delta < ntp_tick_delta) ||
663 (ntp_delta < 0 && ntp_delta > ntp_tick_delta)) {
664 ntp_tick_delta = ntp_delta;
665 }
666 }
667
668 /*
669 * Apply permanent frequency corrections. (sysctl API)
670 */
671 if (ntp_tick_permanent != 0) {
672 ntp_tick_acc += ntp_tick_permanent;
673 if (ntp_tick_acc >= (1LL << 32)) {
674 nbt->tv_nsec += ntp_tick_acc >> 32;
675 ntp_tick_acc -= (ntp_tick_acc >> 32) << 32;
676 } else if (ntp_tick_acc <= -(1LL << 32)) {
677 /* Negate ntp_tick_acc to avoid shifting the sign bit. */
678 nbt->tv_nsec -= (-ntp_tick_acc) >> 32;
679 ntp_tick_acc += ((-ntp_tick_acc) >> 32) << 32;
680 }
681 }
682
683 if (nbt->tv_nsec >= 1000000000) {
684 nbt->tv_sec++;
685 nbt->tv_nsec -= 1000000000;
686 } else if (nbt->tv_nsec < 0) {
687 nbt->tv_sec--;
688 nbt->tv_nsec += 1000000000;
689 }
690
691 /*
692 * Another per-tick compensation. (for ntp_adjtime() API)
693 */
694 if (nsec_adj != 0) {
695 nsec_acc += nsec_adj;
696 if (nsec_acc >= 0x100000000LL) {
697 nbt->tv_nsec += nsec_acc >> 32;
698 nsec_acc = (nsec_acc & 0xFFFFFFFFLL);
699 } else if (nsec_acc <= -0x100000000LL) {
700 nbt->tv_nsec -= -nsec_acc >> 32;
701 nsec_acc = -(-nsec_acc & 0xFFFFFFFFLL);
702 }
703 if (nbt->tv_nsec >= 1000000000) {
704 nbt->tv_nsec -= 1000000000;
705 ++nbt->tv_sec;
706 } else if (nbt->tv_nsec < 0) {
707 nbt->tv_nsec += 1000000000;
708 --nbt->tv_sec;
709 }
710 }
711 spin_unlock(&ntp_spin);
712
713 /************************************************************
714 * LEAP SECOND CORRECTION *
715 ************************************************************
716 *
717 * Taking into account all the corrections made above, figure
718 * out the new real time. If the seconds field has changed
719 * then apply any pending leap-second corrections.
720 */
721 getnanotime_nbt(nbt, &nts);
722
723 if (time_second != nts.tv_sec) {
724 /*
725 * Apply leap second (sysctl API). Adjust nts for changes
726 * so we do not have to call getnanotime_nbt again.
727 */
728 if (ntp_leap_second) {
729 if (ntp_leap_second == nts.tv_sec) {
730 if (ntp_leap_insert) {
731 nbt->tv_sec++;
732 nts.tv_sec++;
733 } else {
734 nbt->tv_sec--;
735 nts.tv_sec--;
736 }
737 ntp_leap_second--;
738 }
739 }
740
741 /*
742 * Apply leap second (ntp_adjtime() API), calculate a new
743 * nsec_adj field. ntp_update_second() returns nsec_adj
744 * as a per-second value but we need it as a per-tick value.
745 */
746 leap = ntp_update_second(time_second, &nsec_adj);
747 nsec_adj /= hz;
748 nbt->tv_sec += leap;
749 nts.tv_sec += leap;
750
751 /*
752 * Update the time_second 'approximate time' global.
753 */
754 time_second = nts.tv_sec;
755
756 /*
757 * Clear the IPC hint for the currently running thread once
758 * per second, allowing us to disconnect the hint from a
759 * thread which may no longer care.
760 */
761 curthread->td_wakefromcpu = -1;
762 }
763
764 /*
765 * Finally, our new basetime is ready to go live!
766 */
767 cpu_sfence();
768 basetime_index = ni;
769
770 /*
771 * Update kpmap on each tick. TS updates are integrated with
772 * fences and upticks allowing userland to read the data
773 * deterministically.
774 */
775 if (kpmap) {
776 int w;
777
778 w = (kpmap->upticks + 1) & 1;
779 getnanouptime(&kpmap->ts_uptime[w]);
780 getnanotime(&kpmap->ts_realtime[w]);
781 cpu_sfence();
782 ++kpmap->upticks;
783 cpu_sfence();
784 }
785
786 /*
787 * Handle exislock pseudo_ticks. We make things as simple as
788 * possible for the critical path arming code by adding a little
789 * complication here.
790 *
791 * When we find that all cores have been armed, we increment
792 * pseudo_ticks and disarm all the cores.
793 */
794 {
795 globaldata_t gd;
796 int n;
797
798 for (n = 0; n < ncpus; ++n) {
799 gd = globaldata_find(n);
800 if (gd->gd_exisarmed == 0)
801 break;
802 }
803
804 if (n == ncpus) {
805 for (n = 0; n < ncpus; ++n) {
806 gd = globaldata_find(n);
807 gd->gd_exisarmed = 0;
808 }
809 ++pseudo_ticks;
810 }
811 }
812 }
813
814 /*
815 * lwkt thread scheduler fair queueing
816 */
817 lwkt_schedulerclock(curthread);
818
819 /*
820 * Cycle the existential lock system on odd ticks in order to re-arm
821 * our cpu (in case the cpu is idle or nobody is using any exis locks).
822 */
823 if (ticks & 1) {
824 exis_hold_gd(gd);
825 exis_drop_gd(gd);
826 }
827
828 /*
829 * softticks are handled for all cpus
830 */
831 hardclock_softtick(gd);
832
833 /*
834 * Rollup accumulated vmstats, copy-back for critical path checks.
835 */
836 vmstats_rollup_cpu(gd);
837 vfscache_rollup_cpu(gd);
838 mycpu->gd_vmstats = vmstats;
839
840 /*
841 * ITimer handling is per-tick, per-cpu.
842 *
843 * We must acquire the per-process token in order for ksignal()
844 * to be non-blocking.  For the moment this requires an AST fault;
845 * the ksignal() cannot be safely issued from this hard interrupt.
846 *
847 * XXX Even the trytoken here isn't right, and itimer operation in
848 * a multi threaded environment is going to be weird at the
849 * very least.
850 */
851 if ((p = curproc) != NULL && lwkt_trytoken(&p->p_token)) {
852 crit_enter_hard();
853 if (p->p_upmap)
854 ++p->p_upmap->runticks;
855
856 if (frame && CLKF_USERMODE(frame) &&
857 timevalisset(&p->p_timer[ITIMER_VIRTUAL].it_value) &&
858 itimerdecr(&p->p_timer[ITIMER_VIRTUAL], ustick) == 0) {
859 p->p_flags |= P_SIGVTALRM;
860 need_user_resched();
861 }
862 if (timevalisset(&p->p_timer[ITIMER_PROF].it_value) &&
863 itimerdecr(&p->p_timer[ITIMER_PROF], ustick) == 0) {
864 p->p_flags |= P_SIGPROF;
865 need_user_resched();
866 }
867 crit_exit_hard();
868 lwkt_reltoken(&p->p_token);
869 }
870 setdelayed();
871}
872
873/*
874 * The statistics clock typically runs at a 125Hz rate, and is intended
875 * to be frequency offset from the hardclock (typ 100Hz). It is per-cpu.
876 *
877 * NOTE! systimer! the MP lock might not be held here. We can only safely
878 * manipulate objects owned by the current cpu.
879 *
880 * The stats clock is responsible for grabbing a profiling sample.
881 * Most of the statistics are only used by user-level statistics programs.
882 * The main exceptions are p->p_uticks, p->p_sticks, p->p_iticks, and
883 * p->p_estcpu.
884 *
885 * Like the other clocks, the stat clock is called from what is effectively
886 * a fast interrupt, so the context should be the thread/process that got
887 * interrupted.
888 */
889static void
890statclock(systimer_t info, int in_ipi, struct intrframe *frame)
891{
892 globaldata_t gd = mycpu;
893 thread_t td;
894 struct proc *p;
895 int bump;
896 sysclock_t cv;
897 sysclock_t scv;
898
899 /*
900 * How big was our timeslice relative to the last time? Calculate
901 * in microseconds.
902 *
903 * NOTE: Use of microuptime() is typically MPSAFE, but usually not
904 * during early boot. Just use the systimer count to be nice
905 * to e.g. qemu. The systimer has a better chance of being
906 * MPSAFE at early boot.
907 */
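	/*
	 * Since freq64_usec is effectively (1000000 << 32) / freq, the
	 * muldivu64() below works out to approximately:
	 *
	 *	bump = (cv - scv) * 1000000 / sys_cputimer->freq
	 *
	 * i.e. the elapsed sysclock count converted to microseconds.
	 */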
908 cv = sys_cputimer->count();
909 scv = gd->statint.gd_statcv;
910 if (scv == 0) {
911 bump = 1;
912 } else {
913 bump = muldivu64(sys_cputimer->freq64_usec,
914 (cv - scv), 1L << 32);
915 if (bump < 0)
916 bump = 0;
917 if (bump > 1000000)
918 bump = 1000000;
919 }
920 gd->statint.gd_statcv = cv;
921
922#if 0
923 stv = &gd->gd_stattv;
924 if (stv->tv_sec == 0) {
925 bump = 1;
926 } else {
927 bump = tv.tv_usec - stv->tv_usec +
928 (tv.tv_sec - stv->tv_sec) * 1000000;
929 if (bump < 0)
930 bump = 0;
931 if (bump > 1000000)
932 bump = 1000000;
933 }
934 *stv = tv;
935#endif
936
937 td = curthread;
938 p = td->td_proc;
939
940 /*
941 * If this is an interrupt thread used for the clock interrupt, adjust
942 * td to the thread it is preempting. If a frame is available, it will
943 * be related to the thread being preempted.
944 */
945 if ((td->td_flags & TDF_CLKTHREAD) && td->td_preempted)
946 td = td->td_preempted;
947
948 if (frame && CLKF_USERMODE(frame)) {
949 /*
950 * Came from userland, handle user time and deal with
951 * possible process.
952 */
953 if (p && (p->p_flags & P_PROFIL))
954 addupc_intr(p, CLKF_PC(frame), 1);
955 td->td_uticks += bump;
956
957 /*
958 * Charge the time as appropriate
959 */
960 if (p && p->p_nice > NZERO)
961 cpu_time.cp_nice += bump;
962 else
963 cpu_time.cp_user += bump;
964 } else {
965 int intr_nest = gd->gd_intr_nesting_level;
966
967 if (in_ipi) {
968 /*
969 * IPI processing code will bump gd_intr_nesting_level
970 * up by one, which breaks following CLKF_INTR testing,
971 * so we subtract one here.
972 */
973 --intr_nest;
974 }
975
976 /*
977 * Came from kernel mode, so we were:
978 * - handling an interrupt,
979 * - doing syscall or trap work on behalf of the current
980 * user process, or
981 * - spinning in the idle loop.
982 * Whichever it is, charge the time as appropriate.
983 * Note that we charge interrupts to the current process,
984 * regardless of whether they are ``for'' that process,
985 * so that we know how much of its real time was spent
986 * in ``non-process'' (i.e., interrupt) work.
987 *
988 * XXX assume system if frame is NULL. A NULL frame
989 * can occur if ipi processing is done from a crit_exit().
990 */
991 if ((frame && CLKF_INTR(intr_nest)) ||
992 cpu_interrupt_running(td)) {
993 /*
994 * If we interrupted an interrupt thread, well,
995 * count it as interrupt time.
996 */
997 td->td_iticks += bump;
998#ifdef DEBUG_PCTRACK
999 if (frame)
1000 do_pctrack(frame, PCTRACK_INT);
1001#endif
1002 cpu_time.cp_intr += bump;
1003 } else if (gd->gd_flags & GDF_VIRTUSER) {
1004 /*
1005 * The vkernel doesn't do a good job providing trap
1006 * frames that we can test. If the GDF_VIRTUSER
1007 * flag is set we probably interrupted user mode.
1008 */
1009 td->td_uticks += bump;
1010
1011 /*
1012 * Charge the time as appropriate
1013 */
1014 if (p && p->p_nice > NZERO)
1015 cpu_time.cp_nice += bump;
1016 else
1017 cpu_time.cp_user += bump;
1018 } else {
1019 if (clock_debug2 > 0) {
1020 --clock_debug2;
1021 kprintf("statclock preempt %s (%p %p)\n", td->td_comm, td, &gd->gd_idlethread);
1022 }
1023 td->td_sticks += bump;
1024 if (td == &gd->gd_idlethread) {
1025 /*
1026 * We want to count token contention as
1027 * system time. When token contention occurs
1028 * the cpu may only be outside its critical
1029 * section while switching through the idle
1030 * thread. In this situation, various flags
1031 * will be set in gd_reqflags.
1032 *
1033 * INTPEND is not necessarily useful because
1034 * it will be set if the clock interrupt
1035 * happens to be on an interrupt thread, the
1036 * cpu_interrupt_running() call does a better
1037 * job so we've already handled it.
1038 */
1039 if (gd->gd_reqflags &
1040 (RQF_IDLECHECK_WK_MASK & ~RQF_INTPEND)) {
1041 cpu_time.cp_sys += bump;
1042 } else {
1043 cpu_time.cp_idle += bump;
1044 }
1045 } else {
1046 /*
1047 * System thread was running.
1048 */
1049#ifdef DEBUG_PCTRACK
1050 if (frame)
1051 do_pctrack(frame, PCTRACK_SYS);
1052#endif
1053 cpu_time.cp_sys += bump;
1054 }
1055 }
1056 }
1057}
1058
1059#ifdef DEBUG_PCTRACK
1060/*
1061 * Sample the PC when in the kernel or in an interrupt. User code can
1062 * retrieve the information and generate a histogram or other output.
1063 */
1064
1065static void
1066do_pctrack(struct intrframe *frame, int which)
1067{
1068 struct kinfo_pctrack *pctrack;
1069
1070 pctrack = &cputime_pctrack[mycpu->gd_cpuid][which];
1071 pctrack->pc_array[pctrack->pc_index & PCTRACK_ARYMASK] =
1072 (void *)CLKF_PC(frame);
1073 ++pctrack->pc_index;
1074}
1075
1076static int
1077sysctl_pctrack(SYSCTL_HANDLER_ARGS)
1078{
1079 struct kinfo_pcheader head;
1080 int error;
1081 int cpu;
1082 int ntrack;
1083
1084 head.pc_ntrack = PCTRACK_SIZE;
1085 head.pc_arysize = PCTRACK_ARYSIZE;
1086
1087 if ((error = SYSCTL_OUT(req, &head, sizeof(head))) != 0)
1088 return (error);
1089
1090 for (cpu = 0; cpu < ncpus; ++cpu) {
1091 for (ntrack = 0; ntrack < PCTRACK_SIZE; ++ntrack) {
1092 error = SYSCTL_OUT(req, &cputime_pctrack[cpu][ntrack],
1093 sizeof(struct kinfo_pctrack));
1094 if (error)
1095 break;
1096 }
1097 if (error)
1098 break;
1099 }
1100 return (error);
1101}
1102SYSCTL_PROC(_kern, OID_AUTO, pctrack, (CTLTYPE_OPAQUE|CTLFLAG_RD), 0, 0,
1103 sysctl_pctrack, "S,kinfo_pcheader", "CPU PC tracking");
1104
1105#endif
1106
1107/*
1108 * The scheduler clock typically runs at a 50Hz rate. NOTE! systimer,
1109 * the MP lock might not be held. We can safely manipulate parts of curproc
1110 * but that's about it.
1111 *
1112 * Each cpu has its own scheduler clock.
1113 */
1114static void
1115schedclock(systimer_t info, int in_ipi __unused, struct intrframe *frame)
1116{
1117 struct lwp *lp;
1118 struct rusage *ru;
1119 struct vmspace *vm;
1120 long rss;
1121
1122 if ((lp = lwkt_preempted_proc()) != NULL) {
1123 /*
1124 * Account for cpu time used and hit the scheduler. Note
1125 * that this call MUST BE MP SAFE, and the BGL IS NOT HELD
1126 * HERE.
1127 */
1128 ++lp->lwp_cpticks;
1129 usched_schedulerclock(lp, info->periodic, info->time);
1130 } else {
1131 usched_schedulerclock(NULL, info->periodic, info->time);
1132 }
1133 if ((lp = curthread->td_lwp) != NULL) {
1134 /*
1135 * Update resource usage integrals and maximums.
1136 */
1137 if ((ru = &lp->lwp_proc->p_ru) &&
1138 (vm = lp->lwp_proc->p_vmspace) != NULL) {
1139 ru->ru_ixrss += pgtok(btoc(vm->vm_tsize));
1140 ru->ru_idrss += pgtok(btoc(vm->vm_dsize));
1141 ru->ru_isrss += pgtok(btoc(vm->vm_ssize));
1142 if (lwkt_trytoken(&vm->vm_map.token)) {
1143 rss = pgtok(vmspace_resident_count(vm));
1144 if (ru->ru_maxrss < rss)
1145 ru->ru_maxrss = rss;
1146 lwkt_reltoken(&vm->vm_map.token);
1147 }
1148 }
1149 }
1150 /* Increment the global sched_ticks */
1151 if (mycpu->gd_cpuid == 0)
1152 ++sched_ticks;
1153}
1154
1155/*
1156 * Compute number of ticks for the specified amount of time. The
1157 * return value is intended to be used in a clock interrupt timed
1158 * operation and guaranteed to meet or exceed the requested time.
1159 * If the representation overflows, return INT_MAX. The minimum return
1160 * value is 1 tick and the function rounds the calculation up.
1161 * If any value greater than 0 microseconds is supplied, a value
1162 * of at least 2 will be returned to ensure that a near-term clock
1163 * interrupt does not cause the timeout to occur (degenerately) early.
1164 *
1165 * Note that limit checks must take into account microseconds, which is
1166 * done simply by using the smaller signed long maximum instead of
1167 * the unsigned long maximum.
1168 *
1169 * If ints have 32 bits, then the maximum value for any timeout in
1170 * 10ms ticks is 248 days.
1171 */
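/*
 * Worked example (assuming hz = 100, so ustick = 10000us): a timeval of
 * 0 sec / 15000 usec yields 0 * hz + howmany(15000, 10000) + 1 = 3 ticks,
 * i.e. 30ms, which meets or exceeds the requested 15ms.  Any non-zero
 * microsecond value yields at least 2 ticks, as described above.
 */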
1172int
1173tvtohz_high(struct timeval *tv)
1174{
1175 int ticks;
1176 long sec, usec;
1177
1178 sec = tv->tv_sec;
1179 usec = tv->tv_usec;
1180 if (usec < 0) {
1181 sec--;
1182 usec += 1000000;
1183 }
1184 if (sec < 0) {
1185#ifdef DIAGNOSTIC
1186 if (usec > 0) {
1187 sec++;
1188 usec -= 1000000;
1189 }
1190 kprintf("tvtohz_high: negative time difference "
1191 "%ld sec %ld usec\n",
1192 sec, usec);
1193#endif
1194 ticks = 1;
1195 } else if (sec <= INT_MAX / hz) {
1196 ticks = (int)(sec * hz + howmany((u_long)usec, ustick)) + 1;
1197 } else {
1198 ticks = INT_MAX;
1199 }
1200 return (ticks);
1201}
1202
1203int
1204tstohz_high(struct timespec *ts)
1205{
1206 int ticks;
1207 long sec, nsec;
1208
1209 sec = ts->tv_sec;
1210 nsec = ts->tv_nsec;
1211 if (nsec < 0) {
1212 sec--;
1213 nsec += 1000000000;
1214 }
1215 if (sec < 0) {
1216#ifdef DIAGNOSTIC
1217 if (nsec > 0) {
1218 sec++;
1219 nsec -= 1000000000;
1220 }
1221 kprintf("tstohz_high: negative time difference "
1222 "%ld sec %ld nsec\n",
1223 sec, nsec);
1224#endif
1225 ticks = 1;
1226 } else if (sec <= INT_MAX / hz) {
1227 ticks = (int)(sec * hz + howmany((u_long)nsec, nstick)) + 1;
1228 } else {
1229 ticks = INT_MAX;
1230 }
1231 return (ticks);
1232}
1233
1234
1235/*
1236 * Compute number of ticks for the specified amount of time, erring on
1237 * the side of it being too low to ensure that sleeping the returned number
1238 * of ticks will not result in a late return.
1239 *
1240 * The supplied timeval may not be negative and should be normalized. A
1241 * return value of 0 is possible if the timeval converts to less than
1242 * 1 tick.
1243 *
1244 * If ints have 32 bits, then the maximum value for any timeout in
1245 * 10ms ticks is 248 days.
1246 */
1247int
1248tvtohz_low(struct timeval *tv)
1249{
1250 int ticks;
1251 long sec;
1252
1253 sec = tv->tv_sec;
1254 if (sec <= INT_MAX / hz)
1255 ticks = (int)(sec * hz + (u_long)tv->tv_usec / ustick);
1256 else
1257 ticks = INT_MAX;
1258 return (ticks);
1259}
1260
1261int
1262tstohz_low(struct timespec *ts)
1263{
1264 int ticks;
1265 long sec;
1266
1267 sec = ts->tv_sec;
1268 if (sec <= INT_MAX / hz)
1269 ticks = (int)(sec * hz + (u_long)ts->tv_nsec / nstick);
1270 else
1271 ticks = INT_MAX;
1272 return (ticks);
1273}
1274
1275/*
1276 * Start profiling on a process.
1277 *
1278 * Caller must hold p->p_token.
1279 *
1280 * Kernel profiling passes proc0 which never exits and hence
1281 * keeps the profile clock running constantly.
1282 */
1283void
1284startprofclock(struct proc *p)
1285{
1286 if ((p->p_flags & P_PROFIL) == 0) {
1287 p->p_flags |= P_PROFIL;
1288#if 0 /* XXX */
1289 if (++profprocs == 1 && stathz != 0) {
1290 crit_enter();
1291 psdiv = psratio;
1292 setstatclockrate(profhz);
1293 crit_exit();
1294 }
1295#endif
1296 }
1297}
1298
1299/*
1300 * Stop profiling on a process.
1301 *
1302 * caller must hold p->p_token
1303 */
1304void
1305stopprofclock(struct proc *p)
1306{
1307 if (p->p_flags & P_PROFIL) {
1308 p->p_flags &= ~P_PROFIL;
1309#if 0 /* XXX */
1310 if (--profprocs == 0 && stathz != 0) {
1311 crit_enter();
1312 psdiv = 1;
1313 setstatclockrate(stathz);
1314 crit_exit();
1315 }
1316#endif
1317 }
1318}
1319
1320/*
1321 * Return information about system clocks.
1322 */
1323static int
1324sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS)
1325{
1326 struct kinfo_clockinfo clkinfo;
1327 /*
1328 * Construct clockinfo structure.
1329 */
1330 clkinfo.ci_hz = hz;
1331 clkinfo.ci_tick = ustick;
1332 clkinfo.ci_tickadj = ntp_default_tick_delta / 1000;
1333 clkinfo.ci_profhz = profhz;
1334 clkinfo.ci_stathz = stathz ? stathz : hz;
1335 return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
1336}
1337
1338SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD,
1339 0, 0, sysctl_kern_clockrate, "S,clockinfo","");
1340
1341/*
1342 * We have eight functions for looking at the clock, four for
1343 * microseconds and four for nanoseconds. For each there is fast
1344 * but less precise version "get{nano|micro}[up]time" which will
1345 * return a time which is up to 1/HZ previous to the call, whereas
1346 * the raw version "{nano|micro}[up]time" will return a timestamp
1347 * which is as precise as possible. The "up" variants return the
1348 * time relative to system boot, these are well suited for time
1349 * interval measurements.
1350 *
1351 * Each cpu independently maintains the current time of day, so all
1352 * we need to do to protect ourselves from changes is to do a loop
1353 * check on the seconds field changing out from under us.
1354 *
1355 * The system timer maintains a 32 bit count and due to various issues
1356 * it is possible for the calculated delta to occasionally exceed
1357 * sys_cputimer->freq. If this occurs the sys_cputimer->freq64_nsec
1358 * multiplication can easily overflow, so we handle that case explicitly.
1359 * For uniformity we handle it in the usec path too.
1360 *
1361 * All the [get][micro,nano][time,uptime]() routines are MPSAFE.
1362 *
1363 * NEW CODE (!)
1364 *
1365 * cpu 0 now maintains global ticktimes and an update counter. The
1366 * getnanotime() and getmicrotime() routines use these globals.
1367 */
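/*
 * Illustrative sketch (not compiled) of picking a routine.  The "get"
 * variants read the per-tick snapshots and are cheap but only good to
 * roughly 1/hz; the raw variants read sys_cputimer->count() and are as
 * precise as the timer allows.
 */
#if 0
	struct timespec t1, t2;

	getnanouptime(&t1);	/* coarse interval stamp (uptime) */
	nanouptime(&t2);	/* precise interval stamp (uptime) */
	getnanotime(&t1);	/* coarse wall clock (per-tick ticktime) */
	nanotime(&t2);		/* precise wall clock (uptime + basetime) */
#endif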
1368void
1369getmicrouptime(struct timeval *tvp)
1370{
1371 struct globaldata *gd = mycpu;
1372 sysclock_t delta;
1373
1374 do {
1375 tvp->tv_sec = gd->gd_time_seconds;
1376 delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
1377 } while (tvp->tv_sec != gd->gd_time_seconds);
1378
1379 if (delta >= sys_cputimer->freq) {
1380 tvp->tv_sec += delta / sys_cputimer->freq;
1381 delta %= sys_cputimer->freq;
1382 }
1383 tvp->tv_usec = muldivu64(sys_cputimer->freq64_usec, delta, 1L << 32);
1384 if (tvp->tv_usec >= 1000000) {
1385 tvp->tv_usec -= 1000000;
1386 ++tvp->tv_sec;
1387 }
1388}
1389
1390void
1391getnanouptime(struct timespec *tsp)
1392{
1393 struct globaldata *gd = mycpu;
1394 sysclock_t delta;
1395
1396 do {
1397 tsp->tv_sec = gd->gd_time_seconds;
1398 delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
1399 } while (tsp->tv_sec != gd->gd_time_seconds);
1400
1401 if (delta >= sys_cputimer->freq) {
1402 tsp->tv_sec += delta / sys_cputimer->freq;
1403 delta %= sys_cputimer->freq;
1404 }
1405 tsp->tv_nsec = muldivu64(sys_cputimer->freq64_nsec, delta, 1L << 32);
1406}
1407
1408void
1409microuptime(struct timeval *tvp)
1410{
1411 struct globaldata *gd = mycpu;
1412 sysclock_t delta;
1413
1414 do {
1415 tvp->tv_sec = gd->gd_time_seconds;
1416 delta = sys_cputimer->count() - gd->gd_cpuclock_base;
1417 } while (tvp->tv_sec != gd->gd_time_seconds);
1418
1419 if (delta >= sys_cputimer->freq) {
1420 tvp->tv_sec += delta / sys_cputimer->freq;
1421 delta %= sys_cputimer->freq;
1422 }
1423 tvp->tv_usec = muldivu64(sys_cputimer->freq64_usec, delta, 1L << 32);
1424}
1425
1426void
1427nanouptime(struct timespec *tsp)
1428{
1429 struct globaldata *gd = mycpu;
1430 sysclock_t delta;
1431
1432 do {
1433 tsp->tv_sec = gd->gd_time_seconds;
1434 delta = sys_cputimer->count() - gd->gd_cpuclock_base;
1435 } while (tsp->tv_sec != gd->gd_time_seconds);
1436
1437 if (delta >= sys_cputimer->freq) {
1438 tsp->tv_sec += delta / sys_cputimer->freq;
1439 delta %= sys_cputimer->freq;
1440 }
1441 tsp->tv_nsec = muldivu64(sys_cputimer->freq64_nsec, delta, 1L << 32);
1442}
1443
1444/*
1445 * realtime routines
1446 */
1447void
1448getmicrotime(struct timeval *tvp)
1449{
1450 struct timespec ts;
1451 int counter;
1452
1453 do {
1454 counter = *(volatile int *)&ticktime_update;
1455 cpu_lfence();
1456 switch(counter & 3) {
1457 case 0: /* ticktime2 completed update */
1458 ts = ticktime2;
1459 break;
1460 case 1: /* ticktime0 update in progress */
1461 ts = ticktime2;
1462 break;
1463 case 2: /* ticktime0 completed update */
1464 ts = ticktime0;
1465 break;
1466 case 3: /* ticktime2 update in progress */
1467 ts = ticktime0;
1468 break;
1469 }
1470 cpu_lfence();
1471 } while (counter != *(volatile int *)&ticktime_update);
1472 tvp->tv_sec = ts.tv_sec;
1473 tvp->tv_usec = ts.tv_nsec / 1000;
1474}
1475
1476void
1477getnanotime(struct timespec *tsp)
1478{
1479 struct timespec ts;
1480 int counter;
1481
1482 do {
1483 counter = *(volatile int *)&ticktime_update;
1484 cpu_lfence();
1485 switch(counter & 3) {
1486 case 0: /* ticktime2 completed update */
1487 ts = ticktime2;
1488 break;
1489 case 1: /* ticktime0 update in progress */
1490 ts = ticktime2;
1491 break;
1492 case 2: /* ticktime0 completed update */
1493 ts = ticktime0;
1494 break;
1495 case 3: /* ticktime2 update in progress */
1496 ts = ticktime0;
1497 break;
1498 }
1499 cpu_lfence();
1500 } while (counter != *(volatile int *)&ticktime_update);
1501 *tsp = ts;
1502}
1503
1504static void
1505getnanotime_nbt(struct timespec *nbt, struct timespec *tsp)
1506{
1507 struct globaldata *gd = mycpu;
1508 sysclock_t delta;
1509
1510 do {
1511 tsp->tv_sec = gd->gd_time_seconds;
1512 delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
1513 } while (tsp->tv_sec != gd->gd_time_seconds);
1514
1515 if (delta >= sys_cputimer->freq) {
1516 tsp->tv_sec += delta / sys_cputimer->freq;
1517 delta %= sys_cputimer->freq;
1518 }
1519 tsp->tv_nsec = muldivu64(sys_cputimer->freq64_nsec, delta, 1L << 32);
1520
1521 tsp->tv_sec += nbt->tv_sec;
1522 tsp->tv_nsec += nbt->tv_nsec;
1523 while (tsp->tv_nsec >= 1000000000) {
1524 tsp->tv_nsec -= 1000000000;
1525 ++tsp->tv_sec;
1526 }
1527}
1528
1529
1530void
1531microtime(struct timeval *tvp)
1532{
1533 struct globaldata *gd = mycpu;
1534 struct timespec *bt;
1535 sysclock_t delta;
1536
1537 do {
1538 tvp->tv_sec = gd->gd_time_seconds;
1539 delta = sys_cputimer->count() - gd->gd_cpuclock_base;
1540 } while (tvp->tv_sec != gd->gd_time_seconds);
1541
1542 if (delta >= sys_cputimer->freq) {
1543 tvp->tv_sec += delta / sys_cputimer->freq;
1544 delta %= sys_cputimer->freq;
1545 }
1546 tvp->tv_usec = muldivu64(sys_cputimer->freq64_usec, delta, 1L << 32);
1547
1548 bt = &basetime[basetime_index];
1549 cpu_lfence();
1550 tvp->tv_sec += bt->tv_sec;
1551 tvp->tv_usec += bt->tv_nsec / 1000;
1552 while (tvp->tv_usec >= 1000000) {
1553 tvp->tv_usec -= 1000000;
1554 ++tvp->tv_sec;
1555 }
1556}
1557
1558void
1559nanotime(struct timespec *tsp)
1560{
1561 struct globaldata *gd = mycpu;
1562 struct timespec *bt;
1563 sysclock_t delta;
1564
1565 do {
1566 tsp->tv_sec = gd->gd_time_seconds;
1567 delta = sys_cputimer->count() - gd->gd_cpuclock_base;
1568 } while (tsp->tv_sec != gd->gd_time_seconds);
1569
1570 if (delta >= sys_cputimer->freq) {
1571 tsp->tv_sec += delta / sys_cputimer->freq;
1572 delta %= sys_cputimer->freq;
1573 }
1574 tsp->tv_nsec = muldivu64(sys_cputimer->freq64_nsec, delta, 1L << 32);
1575
1576 bt = &basetime[basetime_index];
1577 cpu_lfence();
1578 tsp->tv_sec += bt->tv_sec;
1579 tsp->tv_nsec += bt->tv_nsec;
1580 while (tsp->tv_nsec >= 1000000000) {
1581 tsp->tv_nsec -= 1000000000;
1582 ++tsp->tv_sec;
1583 }
1584}
1585
1586/*
1587 * Get an approximate time_t. It does not have to be accurate. This
1588 * function is called only from KTR and can be called with the system in
1589 * any state so do not use a critical section or other complex operation
1590 * here.
1591 *
1592 * NOTE: This is not exactly synchronized with real time. To do that we
1593 * would have to do what microtime does and check for a nanoseconds
1594 * overflow.
1595 */
1596time_t
1597get_approximate_time_t(void)
1598{
1599 struct globaldata *gd = mycpu;
1600 struct timespec *bt;
1601
1602 bt = &basetime[basetime_index];
1603 return(gd->gd_time_seconds + bt->tv_sec);
1604}
1605
1606static int
1607pps_fetch_timeout(struct timespec *timeout, struct pps_state *pps)
1608{
1609 int to, err;
1610 pps_seq_t *ap, *cp;
1611 pps_seq_t a, c;
1612
1613 to = INT_MAX;
1614 if (timeout->tv_sec > -1)
1615 to = tstohz_low(timeout);
1616
1617 ap = &pps->ppsinfo.assert_sequence;
1618 cp = &pps->ppsinfo.clear_sequence;
1619 a = atomic_load_acq_int(ap);
1620 c = atomic_load_acq_int(cp);
1621
1622 while (a == atomic_load_acq_int(ap) && c == atomic_load_acq_int(cp)) {
1623 err = tsleep(pps, PCATCH, "ppsfch", to);
1624 if (err == EWOULDBLOCK) {
1625 if (timeout->tv_sec < 0)
1626 continue;
1627 return (ETIMEDOUT);
1628 }
1629 if (err != 0)
1630 return (err);
1631 }
1632
1633 return (0);
1634}
1635
1636int
1637pps_ioctl(u_long cmd, caddr_t data, struct pps_state *pps)
1638{
1639 pps_params_t *app;
1640 struct pps_fetch_args *fapi;
1641#ifdef PPS_SYNC
1642 struct pps_kcbind_args *kapi;
1643#endif
1644 int err;
1645
1646 switch (cmd) {
1647 case PPS_IOC_CREATE:
1648 return (0);
1649 case PPS_IOC_DESTROY:
1650 return (0);
1651 case PPS_IOC_SETPARAMS:
1652 app = (pps_params_t *)data;
1653 if (app->mode & ~pps->ppscap)
1654 return (EINVAL);
1655 pps->ppsparam = *app;
1656 return (0);
1657 case PPS_IOC_GETPARAMS:
1658 app = (pps_params_t *)data;
1659 *app = pps->ppsparam;
1660 app->api_version = PPS_API_VERS_1;
1661 return (0);
1662 case PPS_IOC_GETCAP:
1663 *(int*)data = pps->ppscap;
1664 return (0);
1665 case PPS_IOC_FETCH:
1666 fapi = (struct pps_fetch_args *)data;
1667 if (fapi->tsformat && fapi->tsformat != PPS_TSFMT_TSPEC)
1668 return (EINVAL);
1669 if (fapi->timeout.tv_sec != 0 || fapi->timeout.tv_nsec != 0) {
1670 err = pps_fetch_timeout(&fapi->timeout, pps);
1671 if (err != 0)
1672 return (err);
1673 }
1674 pps->ppsinfo.current_mode = pps->ppsparam.mode;
1675 fapi->pps_info_buf = pps->ppsinfo;
1676 return (0);
1677 case PPS_IOC_KCBIND:
1678#ifdef PPS_SYNC
1679 kapi = (struct pps_kcbind_args *)data;
1680 /* XXX Only root should be able to do this */
1681 if (kapi->tsformat && kapi->tsformat != PPS_TSFMT_TSPEC)
1682 return (EINVAL);
1683 if (kapi->kernel_consumer != PPS_KC_HARDPPS)
1684 return (EINVAL);
1685 if (kapi->edge & ~pps->ppscap)
1686 return (EINVAL);
1687 pps->kcmode = kapi->edge;
1688 return (0);
1689#else
1690 return (EOPNOTSUPP);
1691#endif
1692 default:
1693 return (ENOTTY);
1694 }
1695}
1696
1697void
1698pps_init(struct pps_state *pps)
1699{
1700 pps->ppscap |= PPS_TSFMT_TSPEC | PPS_CANWAIT;
1701 if (pps->ppscap & PPS_CAPTUREASSERT)
1702 pps->ppscap |= PPS_OFFSETASSERT;
1703 if (pps->ppscap & PPS_CAPTURECLEAR)
1704 pps->ppscap |= PPS_OFFSETCLEAR;
1705}
1706
1707void
1708pps_event(struct pps_state *pps, sysclock_t count, int event)
1709{
1710 struct globaldata *gd;
1711 struct timespec *tsp;
1712 struct timespec *osp;
1713 struct timespec *bt;
1714 struct timespec ts;
1715 sysclock_t *pcount;
1716#ifdef PPS_SYNC
1717 sysclock_t tcount;
1718#endif
1719 sysclock_t delta;
1720 pps_seq_t *pseq;
1721 int foff;
1722#ifdef PPS_SYNC
1723 int fhard;
1724#endif
1725 int ni;
1726
1727 gd = mycpu;
1728
1729 /* Things would be easier with arrays... */
1730 if (event == PPS_CAPTUREASSERT) {
1731 tsp = &pps->ppsinfo.assert_timestamp;
1732 osp = &pps->ppsparam.assert_offset;
1733 foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
1734#ifdef PPS_SYNC
1735 fhard = pps->kcmode & PPS_CAPTUREASSERT;
1736#endif
1737 pcount = &pps->ppscount[0];
1738 pseq = &pps->ppsinfo.assert_sequence;
1739 } else {
1740 tsp = &pps->ppsinfo.clear_timestamp;
1741 osp = &pps->ppsparam.clear_offset;
1742 foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
1743#ifdef PPS_SYNC
1744 fhard = pps->kcmode & PPS_CAPTURECLEAR;
1745#endif
1746 pcount = &pps->ppscount[1];
1747 pseq = &pps->ppsinfo.clear_sequence;
1748 }
1749
1750 /* Nothing really happened */
1751 if (*pcount == count)
1752 return;
1753
1754 *pcount = count;
1755
1756 do {
1757 ts.tv_sec = gd->gd_time_seconds;
1758 delta = count - gd->gd_cpuclock_base;
1759 } while (ts.tv_sec != gd->gd_time_seconds);
1760
1761 if (delta >= sys_cputimer->freq) {
1762 ts.tv_sec += delta / sys_cputimer->freq;
1763 delta %= sys_cputimer->freq;
1764 }
1765 ts.tv_nsec = muldivu64(sys_cputimer->freq64_nsec, delta, 1L << 32);
1766 ni = basetime_index;
1767 cpu_lfence();
1768 bt = &basetime[ni];
1769 ts.tv_sec += bt->tv_sec;
1770 ts.tv_nsec += bt->tv_nsec;
1771 while (ts.tv_nsec >= 1000000000) {
1772 ts.tv_nsec -= 1000000000;
1773 ++ts.tv_sec;
1774 }
1775
1776 atomic_add_rel_int(pseq, 1);
1777 *tsp = ts;
1778
1779 if (foff) {
1780 timespecadd(tsp, osp, tsp);
1781 if (tsp->tv_nsec < 0) {
1782 tsp->tv_nsec += 1000000000;
1783 tsp->tv_sec -= 1;
1784 }
1785 }
1786#ifdef PPS_SYNC
1787 if (fhard) {
1788 /* magic, at its best... */
1789 tcount = count - pps->ppscount[2];
1790 pps->ppscount[2] = count;
1791 if (tcount >= sys_cputimer->freq) {
1792 delta = (1000000000 * (tcount / sys_cputimer->freq) +
1793 sys_cputimer->freq64_nsec *
1794 (tcount % sys_cputimer->freq)) >> 32;
1795 } else {
1796 delta = muldivu64(sys_cputimer->freq64_nsec,
1797 tcount, 1L << 32);
1798 }
1799 hardpps(tsp, delta);
1800 }
1801#endif
1802 wakeup(pps);
1803}
1804
1805/*
1806 * Return the tsc target value for a delay of (ns).
1807 *
1808 * Returns -1 if the TSC is not supported.
1809 */
1810tsc_uclock_t
1811tsc_get_target(int ns)
1812{
1813#if defined(_RDTSC_SUPPORTED_)
1814 if (cpu_feature & CPUID_TSC) {
1815 return (rdtsc() + tsc_frequency * ns / (int64_t)1000000000);
1816 }
1817#endif
1818 return(-1);
1819}
1820
1821/*
1822 * Compare the tsc against the passed target
1823 *
1824 * Returns +1 if the target has been reached
1825 * Returns 0 if the target has not yet been reached
1826 * Returns -1 if the TSC is not supported.
1827 *
1828 * Typical use: while (tsc_test_target(target) == 0) { ...poll... }
1829 */
1830int
1831tsc_test_target(int64_t target)
1832{
1833#if defined(_RDTSC_SUPPORTED_)
1834 if (cpu_feature & CPUID_TSC) {
1835 if ((int64_t)(target - rdtsc()) <= 0)
1836 return(1);
1837 return(0);
1838 }
1839#endif
1840 return(-1);
1841}
1842
1843/*
1844 * Delay the specified number of nanoseconds using the tsc. This function
1845 * returns immediately if the TSC is not supported. At least one cpu_pause()
1846 * will be issued.
1847 */
1848void
1849tsc_delay(int ns)
1850{
1851 int64_t clk;
1852
1853 clk = tsc_get_target(ns);
1854 cpu_pause();
1855 cpu_pause();
1856 while (tsc_test_target(clk) == 0) {
1857 cpu_pause();
1858 cpu_pause();
1859 cpu_pause();
1860 cpu_pause();
1861 }
1862}