gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2003,2004 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*
	34	* Copyright (c) 1997, 1998 Poul-Henning Kamp <phk@FreeBSD.org>
	35	* Copyright (c) 1982, 1986, 1991, 1993
	36	* The Regents of the University of California. All rights reserved.
	37	* (c) UNIX System Laboratories, Inc.
	38	* All or some portions of this file are derived from material licensed
	39	* to the University of California by American Telephone and Telegraph
	40	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
	41	* the permission of UNIX System Laboratories, Inc.
	42	*
	43	* Redistribution and use in source and binary forms, with or without
	44	* modification, are permitted provided that the following conditions
	45	* are met:
	46	* 1. Redistributions of source code must retain the above copyright
	47	* notice, this list of conditions and the following disclaimer.
	48	* 2. Redistributions in binary form must reproduce the above copyright
	49	* notice, this list of conditions and the following disclaimer in the
	50	* documentation and/or other materials provided with the distribution.
	51	* 3. All advertising materials mentioning features or use of this software
	52	* must display the following acknowledgement:
	53	* This product includes software developed by the University of
	54	* California, Berkeley and its contributors.
	55	* 4. Neither the name of the University nor the names of its contributors
	56	* may be used to endorse or promote products derived from this software
	57	* without specific prior written permission.
	58	*
	59	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	60	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	61	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	62	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	63	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	64	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	65	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	66	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	67	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	68	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	69	* SUCH DAMAGE.
	70	*
	71	* @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
	72	* $FreeBSD: src/sys/kern/kern_clock.c,v 1.105.2.10 2002/10/17 13:19:40 maxim Exp $
	73	* $DragonFly: src/sys/kern/kern_clock.c,v 1.41 2005/06/01 17:43:42 dillon Exp $
	74	*/
	75
	76	#include "opt_ntp.h"
	77
	78	#include <sys/param.h>
	79	#include <sys/systm.h>
	80	#include <sys/callout.h>
	81	#include <sys/kernel.h>
	82	#include <sys/kinfo.h>
	83	#include <sys/proc.h>
	84	#include <sys/malloc.h>
	85	#include <sys/resourcevar.h>
	86	#include <sys/signalvar.h>
	87	#include <sys/timex.h>
	88	#include <sys/timepps.h>
	89	#include <vm/vm.h>
	90	#include <sys/lock.h>
	91	#include <vm/pmap.h>
	92	#include <vm/vm_map.h>
	93	#include <sys/sysctl.h>
	94	#include <sys/thread2.h>
	95
	96	#include <machine/cpu.h>
	97	#include <machine/limits.h>
	98	#include <machine/smp.h>
	99
	100	#ifdef GPROF
	101	#include <sys/gmon.h>
	102	#endif
	103
	104	#ifdef DEVICE_POLLING
	105	extern void init_device_poll(void);
	106	extern void hardclock_device_poll(void);
	107	#endif /* DEVICE_POLLING */
	108
	109	static void initclocks (void *dummy);
	110	SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL)
	111
	112	/*
	113	* Some of these don't belong here, but it's easiest to concentrate them.
	114	* Note that cpu_time counts in microseconds, but most userland programs
	115	* just compare relative times against the total by delta.
	116	*/
	117	struct kinfo_cputime cputime_percpu[MAXCPU];
	118	#ifdef SMP
	119	static int
	120	sysctl_cputime(SYSCTL_HANDLER_ARGS)
	121	{
	122	int cpu, error = 0;
	123	size_t size = sizeof(struct kinfo_cputime);
	124
	125	for (cpu = 0; cpu < ncpus; ++cpu) {
	126	if ((error = SYSCTL_OUT(req, &cputime_percpu[cpu], size)))
	127	break;
	128	}
	129
	130	return (error);
	131	}
	132	SYSCTL_PROC(_kern, OID_AUTO, cputime, (CTLTYPE_OPAQUE\|CTLFLAG_RD), 0, 0,
	133	sysctl_cputime, "S,kinfo_cputime", "CPU time statistics");
	134	#else
	135	SYSCTL_STRUCT(_kern, OID_AUTO, cputime, CTLFLAG_RD, &cpu_time, kinfo_cputime,
	136	"CPU time statistics");
	137	#endif
	138
	139	/*
	140	* boottime is used to calculate the 'real' uptime. Do not confuse this with
	141	* microuptime(). microtime() is not drift compensated. The real uptime
	142	* with compensation is nanotime() - bootime. boottime is recalculated
	143	* whenever the real time is set based on the compensated elapsed time
	144	* in seconds (gd->gd_time_seconds).
	145	*
	146	* The gd_time_seconds and gd_cpuclock_base fields remain fairly monotonic.
	147	* Slight adjustments to gd_cpuclock_base are made to phase-lock it to
	148	* the real time.
	149	*/
	150	struct timespec boottime; /* boot time (realtime) for reference only */
	151	time_t time_second; /* read-only 'passive' uptime in seconds */
	152
	153	/*
	154	* basetime is used to calculate the compensated real time of day. The
	155	* basetime can be modified on a per-tick basis by the adjtime(),
	156	* ntp_adjtime(), and sysctl-based time correction APIs.
	157	*
	158	* Note that frequency corrections can also be made by adjusting
	159	* gd_cpuclock_base.
	160	*
	161	* basetime is a tail-chasing FIFO, updated only by cpu #0. The FIFO is
	162	* used on both SMP and UP systems to avoid MP races between cpu's and
	163	* interrupt races on UP systems.
	164	*/
	165	#define BASETIME_ARYSIZE 16
	166	#define BASETIME_ARYMASK (BASETIME_ARYSIZE - 1)
	167	static struct timespec basetime[BASETIME_ARYSIZE];
	168	static volatile int basetime_index;
	169
	170	static int
	171	sysctl_get_basetime(SYSCTL_HANDLER_ARGS)
	172	{
	173	struct timespec *bt;
	174	int error;
	175
	176	bt = &basetime[basetime_index];
	177	error = SYSCTL_OUT(req, bt, sizeof(*bt));
	178	return (error);
	179	}
	180
	181	SYSCTL_STRUCT(_kern, KERN_BOOTTIME, boottime, CTLFLAG_RD,
	182	&boottime, timespec, "System boottime");
	183	SYSCTL_PROC(_kern, OID_AUTO, basetime, CTLTYPE_STRUCT\|CTLFLAG_RD, 0, 0,
	184	sysctl_get_basetime, "S,timespec", "System basetime");
	185
	186	static void hardclock(systimer_t info, struct intrframe *frame);
	187	static void statclock(systimer_t info, struct intrframe *frame);
	188	static void schedclock(systimer_t info, struct intrframe *frame);
	189	static void getnanotime_nbt(struct timespec nbt, struct timespec tsp);
	190
	191	int ticks; /* system master ticks at hz */
	192	int clocks_running; /* tsleep/timeout clocks operational */
	193	int64_t nsec_adj; /* ntpd per-tick adjustment in nsec << 32 */
	194	int64_t nsec_acc; /* accumulator */
	195
	196	/* NTPD time correction fields */
	197	int64_t ntp_tick_permanent; /* per-tick adjustment in nsec << 32 */
	198	int64_t ntp_tick_acc; /* accumulator for per-tick adjustment */
	199	int64_t ntp_delta; /* one-time correction in nsec */
	200	int64_t ntp_big_delta = 1000000000;
	201	int32_t ntp_tick_delta; /* current adjustment rate */
	202	int32_t ntp_default_tick_delta; /* adjustment rate for ntp_delta */
	203	time_t ntp_leap_second; /* time of next leap second */
	204	int ntp_leap_insert; /* whether to insert or remove a second */
	205
	206	/*
	207	* Finish initializing clock frequencies and start all clocks running.
	208	*/
	209	/* ARGSUSED*/
	210	static void
	211	initclocks(void *dummy)
	212	{
	213	cpu_initclocks();
	214	#ifdef DEVICE_POLLING
	215	init_device_poll();
	216	#endif
	217	/psratio = profhz / stathz;/
	218	initclocks_pcpu();
	219	clocks_running = 1;
	220	}
	221
	222	/*
	223	* Called on a per-cpu basis
	224	*/
	225	void
	226	initclocks_pcpu(void)
	227	{
	228	struct globaldata *gd = mycpu;
	229
	230	crit_enter();
	231	if (gd->gd_cpuid == 0) {
	232	gd->gd_time_seconds = 1;
	233	gd->gd_cpuclock_base = sys_cputimer->count();
	234	} else {
	235	/* XXX */
	236	gd->gd_time_seconds = globaldata_find(0)->gd_time_seconds;
	237	gd->gd_cpuclock_base = globaldata_find(0)->gd_cpuclock_base;
	238	}
	239
	240	/*
	241	* Use a non-queued periodic systimer to prevent multiple ticks from
	242	* building up if the sysclock jumps forward (8254 gets reset). The
	243	* sysclock will never jump backwards. Our time sync is based on
	244	* the actual sysclock, not the ticks count.
	245	*/
	246	systimer_init_periodic_nq(&gd->gd_hardclock, hardclock, NULL, hz);
	247	systimer_init_periodic_nq(&gd->gd_statclock, statclock, NULL, stathz);
	248	/* XXX correct the frequency for scheduler / estcpu tests */
	249	systimer_init_periodic_nq(&gd->gd_schedclock, schedclock,
	250	NULL, ESTCPUFREQ);
	251	crit_exit();
	252	}
	253
	254	/*
	255	* This sets the current real time of day. Timespecs are in seconds and
	256	* nanoseconds. We do not mess with gd_time_seconds and gd_cpuclock_base,
	257	* instead we adjust basetime so basetime + gd_* results in the current
	258	* time of day. This way the gd_* fields are guarenteed to represent
	259	* a monotonically increasing 'uptime' value.
	260	*
	261	* When set_timeofday() is called from userland, the system call forces it
	262	* onto cpu #0 since only cpu #0 can update basetime_index.
	263	*/
	264	void
	265	set_timeofday(struct timespec *ts)
	266	{
	267	struct timespec *nbt;
	268	int ni;
	269
	270	/*
	271	* XXX SMP / non-atomic basetime updates
	272	*/
	273	crit_enter();
	274	ni = (basetime_index + 1) & BASETIME_ARYMASK;
	275	nbt = &basetime[ni];
	276	nanouptime(nbt);
	277	nbt->tv_sec = ts->tv_sec - nbt->tv_sec;
	278	nbt->tv_nsec = ts->tv_nsec - nbt->tv_nsec;
	279	if (nbt->tv_nsec < 0) {
	280	nbt->tv_nsec += 1000000000;
	281	--nbt->tv_sec;
	282	}
	283
	284	/*
	285	* Note that basetime diverges from boottime as the clock drift is
	286	* compensated for, so we cannot do away with boottime. When setting
	287	* the absolute time of day the drift is 0 (for an instant) and we
	288	* can simply assign boottime to basetime.
	289	*
	290	* Note that nanouptime() is based on gd_time_seconds which is drift
	291	* compensated up to a point (it is guarenteed to remain monotonically
	292	* increasing). gd_time_seconds is thus our best uptime guess and
	293	* suitable for use in the boottime calculation. It is already taken
	294	* into account in the basetime calculation above.
	295	*/
	296	boottime.tv_sec = nbt->tv_sec;
	297	ntp_delta = 0;
	298
	299	/*
	300	* We now have a new basetime, update the index.
	301	*/
	302	cpu_mb1();
	303	basetime_index = ni;
	304
	305	crit_exit();
	306	}
	307
	308	/*
	309	* Each cpu has its own hardclock, but we only increments ticks and softticks
	310	* on cpu #0.
	311	*
	312	* NOTE! systimer! the MP lock might not be held here. We can only safely
	313	* manipulate objects owned by the current cpu.
	314	*/
	315	static void
	316	hardclock(systimer_t info, struct intrframe *frame)
	317	{
	318	sysclock_t cputicks;
	319	struct proc *p;
	320	struct pstats *pstats;
	321	struct globaldata *gd = mycpu;
	322
	323	/*
	324	* Realtime updates are per-cpu. Note that timer corrections as
	325	* returned by microtime() and friends make an additional adjustment
	326	* using a system-wise 'basetime', but the running time is always
	327	* taken from the per-cpu globaldata area. Since the same clock
	328	* is distributing (XXX SMP) to all cpus, the per-cpu timebases
	329	* stay in synch.
	330	*
	331	* Note that we never allow info->time (aka gd->gd_hardclock.time)
	332	* to reverse index gd_cpuclock_base, but that it is possible for
	333	* it to temporarily get behind in the seconds if something in the
	334	* system locks interrupts for a long period of time. Since periodic
	335	* timers count events, though everything should resynch again
	336	* immediately.
	337	*/
	338	cputicks = info->time - gd->gd_cpuclock_base;
	339	if (cputicks >= sys_cputimer->freq) {
	340	++gd->gd_time_seconds;
	341	gd->gd_cpuclock_base += sys_cputimer->freq;
	342	}
	343
	344	/*
	345	* The system-wide ticks counter and NTP related timedelta/tickdelta
	346	* adjustments only occur on cpu #0. NTP adjustments are accomplished
	347	* by updating basetime.
	348	*/
	349	if (gd->gd_cpuid == 0) {
	350	struct timespec *nbt;
	351	struct timespec nts;
	352	int leap;
	353	int ni;
	354
	355	++ticks;
	356
	357	#ifdef DEVICE_POLLING
	358	hardclock_device_poll(); /* mpsafe, short and quick */
	359	#endif /* DEVICE_POLLING */
	360
	361	#if 0
	362	if (tco->tc_poll_pps)
	363	tco->tc_poll_pps(tco);
	364	#endif
	365
	366	/*
	367	* Calculate the new basetime index. We are in a critical section
	368	* on cpu #0 and can safely play with basetime_index. Start
	369	* with the current basetime and then make adjustments.
	370	*/
	371	ni = (basetime_index + 1) & BASETIME_ARYMASK;
	372	nbt = &basetime[ni];
	373	*nbt = basetime[basetime_index];
	374
	375	/*
	376	* Apply adjtime corrections. (adjtime() API)
	377	*
	378	* adjtime() only runs on cpu #0 so our critical section is
	379	* sufficient to access these variables.
	380	*/
	381	if (ntp_delta != 0) {
	382	nbt->tv_nsec += ntp_tick_delta;
	383	ntp_delta -= ntp_tick_delta;
	384	if ((ntp_delta > 0 && ntp_delta < ntp_tick_delta) \|\|
	385	(ntp_delta < 0 && ntp_delta > ntp_tick_delta)) {
	386	ntp_tick_delta = ntp_delta;
	387	}
	388	}
	389
	390	/*
	391	* Apply permanent frequency corrections. (sysctl API)
	392	*/
	393	if (ntp_tick_permanent != 0) {
	394	ntp_tick_acc += ntp_tick_permanent;
	395	if (ntp_tick_acc >= (1LL << 32)) {
	396	nbt->tv_nsec += ntp_tick_acc >> 32;
	397	ntp_tick_acc -= (ntp_tick_acc >> 32) << 32;
	398	} else if (ntp_tick_acc <= -(1LL << 32)) {
	399	/* Negate ntp_tick_acc to avoid shifting the sign bit. */
	400	nbt->tv_nsec -= (-ntp_tick_acc) >> 32;
	401	ntp_tick_acc += ((-ntp_tick_acc) >> 32) << 32;
	402	}
	403	}
	404
	405	if (nbt->tv_nsec >= 1000000000) {
	406	nbt->tv_sec++;
	407	nbt->tv_nsec -= 1000000000;
	408	} else if (nbt->tv_nsec < 0) {
	409	nbt->tv_sec--;
	410	nbt->tv_nsec += 1000000000;
	411	}
	412
	413	/*
	414	* Another per-tick compensation. (for ntp_adjtime() API)
	415	*/
	416	if (nsec_adj != 0) {
	417	nsec_acc += nsec_adj;
	418	if (nsec_acc >= 0x100000000LL) {
	419	nbt->tv_nsec += nsec_acc >> 32;
	420	nsec_acc = (nsec_acc & 0xFFFFFFFFLL);
	421	} else if (nsec_acc <= -0x100000000LL) {
	422	nbt->tv_nsec -= -nsec_acc >> 32;
	423	nsec_acc = -(-nsec_acc & 0xFFFFFFFFLL);
	424	}
	425	if (nbt->tv_nsec >= 1000000000) {
	426	nbt->tv_nsec -= 1000000000;
	427	++nbt->tv_sec;
	428	} else if (nbt->tv_nsec < 0) {
	429	nbt->tv_nsec += 1000000000;
	430	--nbt->tv_sec;
	431	}
	432	}
	433
	434	/************************************************************
	435	* LEAP SECOND CORRECTION *
	436	************************************************************
	437	*
	438	* Taking into account all the corrections made above, figure
	439	* out the new real time. If the seconds field has changed
	440	* then apply any pending leap-second corrections.
	441	*/
	442	getnanotime_nbt(nbt, &nts);
	443
	444	if (time_second != nts.tv_sec) {
	445	/*
	446	* Apply leap second (sysctl API). Adjust nts for changes
	447	* so we do not have to call getnanotime_nbt again.
	448	*/
	449	if (ntp_leap_second) {
	450	if (ntp_leap_second == nts.tv_sec) {
	451	if (ntp_leap_insert) {
	452	nbt->tv_sec++;
	453	nts.tv_sec++;
	454	} else {
	455	nbt->tv_sec--;
	456	nts.tv_sec--;
	457	}
	458	ntp_leap_second--;
	459	}
	460	}
	461
	462	/*
	463	* Apply leap second (ntp_adjtime() API), calculate a new
	464	* nsec_adj field. ntp_update_second() returns nsec_adj
	465	* as a per-second value but we need it as a per-tick value.
	466	*/
	467	leap = ntp_update_second(time_second, &nsec_adj);
	468	nsec_adj /= hz;
	469	nbt->tv_sec += leap;
	470	nts.tv_sec += leap;
	471
	472	/*
	473	* Update the time_second 'approximate time' global.
	474	*/
	475	time_second = nts.tv_sec;
	476	}
	477
	478	/*
	479	* Finally, our new basetime is ready to go live!
	480	*/
	481	cpu_mb1();
	482	basetime_index = ni;
	483	}
	484
	485	/*
	486	* softticks are handled for all cpus
	487	*/
	488	hardclock_softtick(gd);
	489
	490	/*
	491	* ITimer handling is per-tick, per-cpu. I don't think psignal()
	492	* is mpsafe on curproc, so XXX get the mplock.
	493	*/
	494	if ((p = curproc) != NULL && try_mplock()) {
	495	pstats = p->p_stats;
	496	if (frame && CLKF_USERMODE(frame) &&
	497	timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
	498	itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
	499	psignal(p, SIGVTALRM);
	500	if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
	501	itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
	502	psignal(p, SIGPROF);
	503	rel_mplock();
	504	}
	505	setdelayed();
	506	}
	507
	508	/*
	509	* The statistics clock typically runs at a 125Hz rate, and is intended
	510	* to be frequency offset from the hardclock (typ 100Hz). It is per-cpu.
	511	*
	512	* NOTE! systimer! the MP lock might not be held here. We can only safely
	513	* manipulate objects owned by the current cpu.
	514	*
	515	* The stats clock is responsible for grabbing a profiling sample.
	516	* Most of the statistics are only used by user-level statistics programs.
	517	* The main exceptions are p->p_uticks, p->p_sticks, p->p_iticks, and
	518	* p->p_estcpu.
	519	*
	520	* Like the other clocks, the stat clock is called from what is effectively
	521	* a fast interrupt, so the context should be the thread/process that got
	522	* interrupted.
	523	*/
	524	static void
	525	statclock(systimer_t info, struct intrframe *frame)
	526	{
	527	#ifdef GPROF
	528	struct gmonparam *g;
	529	int i;
	530	#endif
	531	thread_t td;
	532	struct proc *p;
	533	int bump;
	534	struct timeval tv;
	535	struct timeval *stv;
	536
	537	/*
	538	* How big was our timeslice relative to the last time?
	539	*/
	540	microuptime(&tv); /* mpsafe */
	541	stv = &mycpu->gd_stattv;
	542	if (stv->tv_sec == 0) {
	543	bump = 1;
	544	} else {
	545	bump = tv.tv_usec - stv->tv_usec +
	546	(tv.tv_sec - stv->tv_sec) * 1000000;
	547	if (bump < 0)
	548	bump = 0;
	549	if (bump > 1000000)
	550	bump = 1000000;
	551	}
	552	*stv = tv;
	553
	554	td = curthread;
	555	p = td->td_proc;
	556
	557	if (frame && CLKF_USERMODE(frame)) {
	558	/*
	559	* Came from userland, handle user time and deal with
	560	* possible process.
	561	*/
	562	if (p && (p->p_flag & P_PROFIL))
	563	addupc_intr(p, CLKF_PC(frame), 1);
	564	td->td_uticks += bump;
	565
	566	/*
	567	* Charge the time as appropriate
	568	*/
	569	if (p && p->p_nice > NZERO)
	570	cpu_time.cp_nice += bump;
	571	else
	572	cpu_time.cp_user += bump;
	573	} else {
	574	#ifdef GPROF
	575	/*
	576	* Kernel statistics are just like addupc_intr, only easier.
	577	*/
	578	g = &_gmonparam;
	579	if (g->state == GMON_PROF_ON && frame) {
	580	i = CLKF_PC(frame) - g->lowpc;
	581	if (i < g->textsize) {
	582	i /= HISTFRACTION * sizeof(*g->kcount);
	583	g->kcount[i]++;
	584	}
	585	}
	586	#endif
	587	/*
	588	* Came from kernel mode, so we were:
	589	* - handling an interrupt,
	590	* - doing syscall or trap work on behalf of the current
	591	* user process, or
	592	* - spinning in the idle loop.
	593	* Whichever it is, charge the time as appropriate.
	594	* Note that we charge interrupts to the current process,
	595	* regardless of whether they are ``for'' that process,
	596	* so that we know how much of its real time was spent
	597	* in ``non-process'' (i.e., interrupt) work.
	598	*
	599	* XXX assume system if frame is NULL. A NULL frame
	600	* can occur if ipi processing is done from an splx().
	601	*/
	602	if (frame && CLKF_INTR(frame))
	603	td->td_iticks += bump;
	604	else
	605	td->td_sticks += bump;
	606
	607	if (frame && CLKF_INTR(frame)) {
	608	cpu_time.cp_intr += bump;
	609	} else {
	610	if (td == &mycpu->gd_idlethread)
	611	cpu_time.cp_idle += bump;
	612	else
	613	cpu_time.cp_sys += bump;
	614	}
	615	}
	616	}
	617
	618	/*
	619	* The scheduler clock typically runs at a 20Hz rate. NOTE! systimer,
	620	* the MP lock might not be held. We can safely manipulate parts of curproc
	621	* but that's about it.
	622	*/
	623	static void
	624	schedclock(systimer_t info, struct intrframe *frame)
	625	{
	626	struct proc *p;
	627	struct pstats *pstats;
	628	struct rusage *ru;
	629	struct vmspace *vm;
	630	long rss;
	631
	632	schedulerclock(NULL); /* mpsafe */
	633	if ((p = curproc) != NULL) {
	634	/* Update resource usage integrals and maximums. */
	635	if ((pstats = p->p_stats) != NULL &&
	636	(ru = &pstats->p_ru) != NULL &&
	637	(vm = p->p_vmspace) != NULL) {
	638	ru->ru_ixrss += pgtok(vm->vm_tsize);
	639	ru->ru_idrss += pgtok(vm->vm_dsize);
	640	ru->ru_isrss += pgtok(vm->vm_ssize);
	641	rss = pgtok(vmspace_resident_count(vm));
	642	if (ru->ru_maxrss < rss)
	643	ru->ru_maxrss = rss;
	644	}
	645	}
	646	}
	647
	648	/*
	649	* Compute number of ticks for the specified amount of time. The
	650	* return value is intended to be used in a clock interrupt timed
	651	* operation and guarenteed to meet or exceed the requested time.
	652	* If the representation overflows, return INT_MAX. The minimum return
	653	* value is 1 ticks and the function will average the calculation up.
	654	* If any value greater then 0 microseconds is supplied, a value
	655	* of at least 2 will be returned to ensure that a near-term clock
	656	* interrupt does not cause the timeout to occur (degenerately) early.
	657	*
	658	* Note that limit checks must take into account microseconds, which is
	659	* done simply by using the smaller signed long maximum instead of
	660	* the unsigned long maximum.
	661	*
	662	* If ints have 32 bits, then the maximum value for any timeout in
	663	* 10ms ticks is 248 days.
	664	*/
	665	int
	666	tvtohz_high(struct timeval *tv)
	667	{
	668	int ticks;
	669	long sec, usec;
	670
	671	sec = tv->tv_sec;
	672	usec = tv->tv_usec;
	673	if (usec < 0) {
	674	sec--;
	675	usec += 1000000;
	676	}
	677	if (sec < 0) {
	678	#ifdef DIAGNOSTIC
	679	if (usec > 0) {
	680	sec++;
	681	usec -= 1000000;
	682	}
	683	printf("tvotohz: negative time difference %ld sec %ld usec\n",
	684	sec, usec);
	685	#endif
	686	ticks = 1;
	687	} else if (sec <= INT_MAX / hz) {
	688	ticks = (int)(sec * hz +
	689	((u_long)usec + (tick - 1)) / tick) + 1;
	690	} else {
	691	ticks = INT_MAX;
	692	}
	693	return (ticks);
	694	}
	695
	696	/*
	697	* Compute number of ticks for the specified amount of time, erroring on
	698	* the side of it being too low to ensure that sleeping the returned number
	699	* of ticks will not result in a late return.
	700	*
	701	* The supplied timeval may not be negative and should be normalized. A
	702	* return value of 0 is possible if the timeval converts to less then
	703	* 1 tick.
	704	*
	705	* If ints have 32 bits, then the maximum value for any timeout in
	706	* 10ms ticks is 248 days.
	707	*/
	708	int
	709	tvtohz_low(struct timeval *tv)
	710	{
	711	int ticks;
	712	long sec;
	713
	714	sec = tv->tv_sec;
	715	if (sec <= INT_MAX / hz)
	716	ticks = (int)(sec * hz + (u_long)tv->tv_usec / tick);
	717	else
	718	ticks = INT_MAX;
	719	return (ticks);
	720	}
	721
	722
	723	/*
	724	* Start profiling on a process.
	725	*
	726	* Kernel profiling passes proc0 which never exits and hence
	727	* keeps the profile clock running constantly.
	728	*/
	729	void
	730	startprofclock(struct proc *p)
	731	{
	732	if ((p->p_flag & P_PROFIL) == 0) {
	733	p->p_flag \|= P_PROFIL;
	734	#if 0 /* XXX */
	735	if (++profprocs == 1 && stathz != 0) {
	736	s = splstatclock();
	737	psdiv = psratio;
	738	setstatclockrate(profhz);
	739	splx(s);
	740	}
	741	#endif
	742	}
	743	}
	744
	745	/*
	746	* Stop profiling on a process.
	747	*/
	748	void
	749	stopprofclock(struct proc *p)
	750	{
	751	if (p->p_flag & P_PROFIL) {
	752	p->p_flag &= ~P_PROFIL;
	753	#if 0 /* XXX */
	754	if (--profprocs == 0 && stathz != 0) {
	755	s = splstatclock();
	756	psdiv = 1;
	757	setstatclockrate(stathz);
	758	splx(s);
	759	}
	760	#endif
	761	}
	762	}
	763
	764	/*
	765	* Return information about system clocks.
	766	*/
	767	static int
	768	sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS)
	769	{
	770	struct kinfo_clockinfo clkinfo;
	771	/*
	772	* Construct clockinfo structure.
	773	*/
	774	clkinfo.ci_hz = hz;
	775	clkinfo.ci_tick = tick;
	776	clkinfo.ci_tickadj = ntp_default_tick_delta / 1000;
	777	clkinfo.ci_profhz = profhz;
	778	clkinfo.ci_stathz = stathz ? stathz : hz;
	779	return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
	780	}
	781
	782	SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT\|CTLFLAG_RD,
	783	0, 0, sysctl_kern_clockrate, "S,clockinfo","");
	784
/*
 * We have eight functions for looking at the clock, four for
 * microseconds and four for nanoseconds.  For each there is a fast
 * but less precise version "get{nano|micro}[up]time" which will
 * return a time which is up to 1/HZ previous to the call, whereas
 * the raw version "{nano|micro}[up]time" will return a timestamp
 * which is as precise as possible.  The "up" variants return the
 * time relative to system boot; these are well suited for time
 * interval measurements.
 *
 * Each cpu independently maintains the current time of day, so all
 * we need to do to protect ourselves from changes is to do a loop
 * check on the seconds field changing out from under us.
 *
 * The system timer maintains a 32 bit count and due to various issues
 * it is possible for the calculated delta to occasionally exceed
 * sys_cputimer->freq.  If this occurs the sys_cputimer->freq64_nsec
 * multiplication can easily overflow, so we deal with the case.  For
 * uniformity we deal with the case in the usec case too.
 */
	805	void
	806	getmicrouptime(struct timeval *tvp)
	807	{
	808	struct globaldata *gd = mycpu;
	809	sysclock_t delta;
	810
	811	do {
	812	tvp->tv_sec = gd->gd_time_seconds;
	813	delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
	814	} while (tvp->tv_sec != gd->gd_time_seconds);
	815
	816	if (delta >= sys_cputimer->freq) {
	817	tvp->tv_sec += delta / sys_cputimer->freq;
	818	delta %= sys_cputimer->freq;
	819	}
	820	tvp->tv_usec = (sys_cputimer->freq64_usec * delta) >> 32;
	821	if (tvp->tv_usec >= 1000000) {
	822	tvp->tv_usec -= 1000000;
	823	++tvp->tv_sec;
	824	}
	825	}
	826
	827	void
	828	getnanouptime(struct timespec *tsp)
	829	{
	830	struct globaldata *gd = mycpu;
	831	sysclock_t delta;
	832
	833	do {
	834	tsp->tv_sec = gd->gd_time_seconds;
	835	delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
	836	} while (tsp->tv_sec != gd->gd_time_seconds);
	837
	838	if (delta >= sys_cputimer->freq) {
	839	tsp->tv_sec += delta / sys_cputimer->freq;
	840	delta %= sys_cputimer->freq;
	841	}
	842	tsp->tv_nsec = (sys_cputimer->freq64_nsec * delta) >> 32;
	843	}
	844
	845	void
	846	microuptime(struct timeval *tvp)
	847	{
	848	struct globaldata *gd = mycpu;
	849	sysclock_t delta;
	850
	851	do {
	852	tvp->tv_sec = gd->gd_time_seconds;
	853	delta = sys_cputimer->count() - gd->gd_cpuclock_base;
	854	} while (tvp->tv_sec != gd->gd_time_seconds);
	855
	856	if (delta >= sys_cputimer->freq) {
	857	tvp->tv_sec += delta / sys_cputimer->freq;
	858	delta %= sys_cputimer->freq;
	859	}
	860	tvp->tv_usec = (sys_cputimer->freq64_usec * delta) >> 32;
	861	}
	862
	863	void
	864	nanouptime(struct timespec *tsp)
	865	{
	866	struct globaldata *gd = mycpu;
	867	sysclock_t delta;
	868
	869	do {
	870	tsp->tv_sec = gd->gd_time_seconds;
	871	delta = sys_cputimer->count() - gd->gd_cpuclock_base;
	872	} while (tsp->tv_sec != gd->gd_time_seconds);
	873
	874	if (delta >= sys_cputimer->freq) {
	875	tsp->tv_sec += delta / sys_cputimer->freq;
	876	delta %= sys_cputimer->freq;
	877	}
	878	tsp->tv_nsec = (sys_cputimer->freq64_nsec * delta) >> 32;
	879	}
	880
	881	/*
	882	* realtime routines
	883	*/
	884
	885	void
	886	getmicrotime(struct timeval *tvp)
	887	{
	888	struct globaldata *gd = mycpu;
	889	struct timespec *bt;
	890	sysclock_t delta;
	891
	892	do {
	893	tvp->tv_sec = gd->gd_time_seconds;
	894	delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
	895	} while (tvp->tv_sec != gd->gd_time_seconds);
	896
	897	if (delta >= sys_cputimer->freq) {
	898	tvp->tv_sec += delta / sys_cputimer->freq;
	899	delta %= sys_cputimer->freq;
	900	}
	901	tvp->tv_usec = (sys_cputimer->freq64_usec * delta) >> 32;
	902
	903	bt = &basetime[basetime_index];
	904	tvp->tv_sec += bt->tv_sec;
	905	tvp->tv_usec += bt->tv_nsec / 1000;
	906	while (tvp->tv_usec >= 1000000) {
	907	tvp->tv_usec -= 1000000;
	908	++tvp->tv_sec;
	909	}
	910	}
	911
	912	void
	913	getnanotime(struct timespec *tsp)
	914	{
	915	struct globaldata *gd = mycpu;
	916	struct timespec *bt;
	917	sysclock_t delta;
	918
	919	do {
	920	tsp->tv_sec = gd->gd_time_seconds;
	921	delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
	922	} while (tsp->tv_sec != gd->gd_time_seconds);
	923
	924	if (delta >= sys_cputimer->freq) {
	925	tsp->tv_sec += delta / sys_cputimer->freq;
	926	delta %= sys_cputimer->freq;
	927	}
	928	tsp->tv_nsec = (sys_cputimer->freq64_nsec * delta) >> 32;
	929
	930	bt = &basetime[basetime_index];
	931	tsp->tv_sec += bt->tv_sec;
	932	tsp->tv_nsec += bt->tv_nsec;
	933	while (tsp->tv_nsec >= 1000000000) {
	934	tsp->tv_nsec -= 1000000000;
	935	++tsp->tv_sec;
	936	}
	937	}
	938
	939	static void
	940	getnanotime_nbt(struct timespec nbt, struct timespec tsp)
	941	{
	942	struct globaldata *gd = mycpu;
	943	sysclock_t delta;
	944
	945	do {
	946	tsp->tv_sec = gd->gd_time_seconds;
	947	delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
	948	} while (tsp->tv_sec != gd->gd_time_seconds);
	949
	950	if (delta >= sys_cputimer->freq) {
	951	tsp->tv_sec += delta / sys_cputimer->freq;
	952	delta %= sys_cputimer->freq;
	953	}
	954	tsp->tv_nsec = (sys_cputimer->freq64_nsec * delta) >> 32;
	955
	956	tsp->tv_sec += nbt->tv_sec;
	957	tsp->tv_nsec += nbt->tv_nsec;
	958	while (tsp->tv_nsec >= 1000000000) {
	959	tsp->tv_nsec -= 1000000000;
	960	++tsp->tv_sec;
	961	}
	962	}
	963
	964
	965	void
	966	microtime(struct timeval *tvp)
	967	{
	968	struct globaldata *gd = mycpu;
	969	struct timespec *bt;
	970	sysclock_t delta;
	971
	972	do {
	973	tvp->tv_sec = gd->gd_time_seconds;
	974	delta = sys_cputimer->count() - gd->gd_cpuclock_base;
	975	} while (tvp->tv_sec != gd->gd_time_seconds);
	976
	977	if (delta >= sys_cputimer->freq) {
	978	tvp->tv_sec += delta / sys_cputimer->freq;
	979	delta %= sys_cputimer->freq;
	980	}
	981	tvp->tv_usec = (sys_cputimer->freq64_usec * delta) >> 32;
	982
	983	bt = &basetime[basetime_index];
	984	tvp->tv_sec += bt->tv_sec;
	985	tvp->tv_usec += bt->tv_nsec / 1000;
	986	while (tvp->tv_usec >= 1000000) {
	987	tvp->tv_usec -= 1000000;
	988	++tvp->tv_sec;
	989	}
	990	}
	991
	992	void
	993	nanotime(struct timespec *tsp)
	994	{
	995	struct globaldata *gd = mycpu;
	996	struct timespec *bt;
	997	sysclock_t delta;
	998
	999	do {
	1000	tsp->tv_sec = gd->gd_time_seconds;
	1001	delta = sys_cputimer->count() - gd->gd_cpuclock_base;
	1002	} while (tsp->tv_sec != gd->gd_time_seconds);
	1003
	1004	if (delta >= sys_cputimer->freq) {
	1005	tsp->tv_sec += delta / sys_cputimer->freq;
	1006	delta %= sys_cputimer->freq;
	1007	}
	1008	tsp->tv_nsec = (sys_cputimer->freq64_nsec * delta) >> 32;
	1009
	1010	bt = &basetime[basetime_index];
	1011	tsp->tv_sec += bt->tv_sec;
	1012	tsp->tv_nsec += bt->tv_nsec;
	1013	while (tsp->tv_nsec >= 1000000000) {
	1014	tsp->tv_nsec -= 1000000000;
	1015	++tsp->tv_sec;
	1016	}
	1017	}
	1018
	1019	/*
	1020	* note: this is not exactly synchronized with real time. To do that we
	1021	* would have to do what microtime does and check for a nanoseconds overflow.
	1022	*/
	1023	time_t
	1024	get_approximate_time_t(void)
	1025	{
	1026	struct globaldata *gd = mycpu;
	1027	struct timespec *bt;
	1028
	1029	bt = &basetime[basetime_index];
	1030	return(gd->gd_time_seconds + bt->tv_sec);
	1031	}
	1032
	1033	int
	1034	pps_ioctl(u_long cmd, caddr_t data, struct pps_state *pps)
	1035	{
	1036	pps_params_t *app;
	1037	struct pps_fetch_args *fapi;
	1038	#ifdef PPS_SYNC
	1039	struct pps_kcbind_args *kapi;
	1040	#endif
	1041
	1042	switch (cmd) {
	1043	case PPS_IOC_CREATE:
	1044	return (0);
	1045	case PPS_IOC_DESTROY:
	1046	return (0);
	1047	case PPS_IOC_SETPARAMS:
	1048	app = (pps_params_t *)data;
	1049	if (app->mode & ~pps->ppscap)
	1050	return (EINVAL);
	1051	pps->ppsparam = *app;
	1052	return (0);
	1053	case PPS_IOC_GETPARAMS:
	1054	app = (pps_params_t *)data;
	1055	*app = pps->ppsparam;
	1056	app->api_version = PPS_API_VERS_1;
	1057	return (0);
	1058	case PPS_IOC_GETCAP:
	1059	(int)data = pps->ppscap;
	1060	return (0);
	1061	case PPS_IOC_FETCH:
	1062	fapi = (struct pps_fetch_args *)data;
	1063	if (fapi->tsformat && fapi->tsformat != PPS_TSFMT_TSPEC)
	1064	return (EINVAL);
	1065	if (fapi->timeout.tv_sec \|\| fapi->timeout.tv_nsec)
	1066	return (EOPNOTSUPP);
	1067	pps->ppsinfo.current_mode = pps->ppsparam.mode;
	1068	fapi->pps_info_buf = pps->ppsinfo;
	1069	return (0);
	1070	case PPS_IOC_KCBIND:
	1071	#ifdef PPS_SYNC
	1072	kapi = (struct pps_kcbind_args *)data;
	1073	/* XXX Only root should be able to do this */
	1074	if (kapi->tsformat && kapi->tsformat != PPS_TSFMT_TSPEC)
	1075	return (EINVAL);
	1076	if (kapi->kernel_consumer != PPS_KC_HARDPPS)
	1077	return (EINVAL);
	1078	if (kapi->edge & ~pps->ppscap)
	1079	return (EINVAL);
	1080	pps->kcmode = kapi->edge;
	1081	return (0);
	1082	#else
	1083	return (EOPNOTSUPP);
	1084	#endif
	1085	default:
	1086	return (ENOTTY);
	1087	}
	1088	}
	1089
	1090	void
	1091	pps_init(struct pps_state *pps)
	1092	{
	1093	pps->ppscap \|= PPS_TSFMT_TSPEC;
	1094	if (pps->ppscap & PPS_CAPTUREASSERT)
	1095	pps->ppscap \|= PPS_OFFSETASSERT;
	1096	if (pps->ppscap & PPS_CAPTURECLEAR)
	1097	pps->ppscap \|= PPS_OFFSETCLEAR;
	1098	}
	1099
	1100	void
	1101	pps_event(struct pps_state *pps, sysclock_t count, int event)
	1102	{
	1103	struct globaldata *gd;
	1104	struct timespec *tsp;
	1105	struct timespec *osp;
	1106	struct timespec *bt;
	1107	struct timespec ts;
	1108	sysclock_t *pcount;
	1109	#ifdef PPS_SYNC
	1110	sysclock_t tcount;
	1111	#endif
	1112	sysclock_t delta;
	1113	pps_seq_t *pseq;
	1114	int foff;
	1115	int fhard;
	1116
	1117	gd = mycpu;
	1118
	1119	/* Things would be easier with arrays... */
	1120	if (event == PPS_CAPTUREASSERT) {
	1121	tsp = &pps->ppsinfo.assert_timestamp;
	1122	osp = &pps->ppsparam.assert_offset;
	1123	foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
	1124	fhard = pps->kcmode & PPS_CAPTUREASSERT;
	1125	pcount = &pps->ppscount[0];
	1126	pseq = &pps->ppsinfo.assert_sequence;
	1127	} else {
	1128	tsp = &pps->ppsinfo.clear_timestamp;
	1129	osp = &pps->ppsparam.clear_offset;
	1130	foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
	1131	fhard = pps->kcmode & PPS_CAPTURECLEAR;
	1132	pcount = &pps->ppscount[1];
	1133	pseq = &pps->ppsinfo.clear_sequence;
	1134	}
	1135
	1136	/* Nothing really happened */
	1137	if (*pcount == count)
	1138	return;
	1139
	1140	*pcount = count;
	1141
	1142	do {
	1143	ts.tv_sec = gd->gd_time_seconds;
	1144	delta = count - gd->gd_cpuclock_base;
	1145	} while (ts.tv_sec != gd->gd_time_seconds);
	1146
	1147	if (delta >= sys_cputimer->freq) {
	1148	ts.tv_sec += delta / sys_cputimer->freq;
	1149	delta %= sys_cputimer->freq;
	1150	}
	1151	ts.tv_nsec = (sys_cputimer->freq64_nsec * delta) >> 32;
	1152	bt = &basetime[basetime_index];
	1153	ts.tv_sec += bt->tv_sec;
	1154	ts.tv_nsec += bt->tv_nsec;
	1155	while (ts.tv_nsec >= 1000000000) {
	1156	ts.tv_nsec -= 1000000000;
	1157	++ts.tv_sec;
	1158	}
	1159
	1160	(*pseq)++;
	1161	*tsp = ts;
	1162
	1163	if (foff) {
	1164	timespecadd(tsp, osp);
	1165	if (tsp->tv_nsec < 0) {
	1166	tsp->tv_nsec += 1000000000;
	1167	tsp->tv_sec -= 1;
	1168	}
	1169	}
	1170	#ifdef PPS_SYNC
	1171	if (fhard) {
	1172	/* magic, at its best... */
	1173	tcount = count - pps->ppscount[2];
	1174	pps->ppscount[2] = count;
	1175	if (tcount >= sys_cputimer->freq) {
	1176	delta = (1000000000 * (tcount / sys_cputimer->freq) +
	1177	sys_cputimer->freq64_nsec *
	1178	(tcount % sys_cputimer->freq)) >> 32;
	1179	} else {
	1180	delta = (sys_cputimer->freq64_nsec * tcount) >> 32;
	1181	}
	1182	hardpps(tsp, delta);
	1183	}
	1184	#endif
	1185	}
	1186