gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2003,2004 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*
	34	* Copyright (c) 1997, 1998 Poul-Henning Kamp <phk@FreeBSD.org>
	35	* Copyright (c) 1982, 1986, 1991, 1993
	36	* The Regents of the University of California. All rights reserved.
	37	* (c) UNIX System Laboratories, Inc.
	38	* All or some portions of this file are derived from material licensed
	39	* to the University of California by American Telephone and Telegraph
	40	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
	41	* the permission of UNIX System Laboratories, Inc.
	42	*
	43	* Redistribution and use in source and binary forms, with or without
	44	* modification, are permitted provided that the following conditions
	45	* are met:
	46	* 1. Redistributions of source code must retain the above copyright
	47	* notice, this list of conditions and the following disclaimer.
	48	* 2. Redistributions in binary form must reproduce the above copyright
	49	* notice, this list of conditions and the following disclaimer in the
	50	* documentation and/or other materials provided with the distribution.
	51	* 3. All advertising materials mentioning features or use of this software
	52	* must display the following acknowledgement:
	53	* This product includes software developed by the University of
	54	* California, Berkeley and its contributors.
	55	* 4. Neither the name of the University nor the names of its contributors
	56	* may be used to endorse or promote products derived from this software
	57	* without specific prior written permission.
	58	*
	59	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	60	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	61	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	62	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	63	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	64	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	65	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	66	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	67	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	68	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	69	* SUCH DAMAGE.
	70	*
	71	* @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
	72	* $FreeBSD: src/sys/kern/kern_clock.c,v 1.105.2.10 2002/10/17 13:19:40 maxim Exp $
	73	* $DragonFly: src/sys/kern/kern_clock.c,v 1.44 2005/06/27 18:37:57 dillon Exp $
	74	*/
	75
	76	#include "opt_ntp.h"
	77
	78	#include <sys/param.h>
	79	#include <sys/systm.h>
	80	#include <sys/callout.h>
	81	#include <sys/kernel.h>
	82	#include <sys/kinfo.h>
	83	#include <sys/proc.h>
	84	#include <sys/malloc.h>
	85	#include <sys/resourcevar.h>
	86	#include <sys/signalvar.h>
	87	#include <sys/timex.h>
	88	#include <sys/timepps.h>
	89	#include <vm/vm.h>
	90	#include <sys/lock.h>
	91	#include <vm/pmap.h>
	92	#include <vm/vm_map.h>
	93	#include <sys/sysctl.h>
	94	#include <sys/thread2.h>
	95
	96	#include <machine/cpu.h>
	97	#include <machine/limits.h>
	98	#include <machine/smp.h>
	99
	100	#ifdef GPROF
	101	#include <sys/gmon.h>
	102	#endif
	103
	104	#ifdef DEVICE_POLLING
	105	extern void init_device_poll(void);
	106	extern void hardclock_device_poll(void);
	107	#endif /* DEVICE_POLLING */
	108
	109	static void initclocks (void *dummy);
	110	SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL)
	111
	112	/*
	113	* Some of these don't belong here, but it's easiest to concentrate them.
	114	* Note that cpu_time counts in microseconds, but most userland programs
	115	* just compare relative times against the total by delta.
	116	*/
	117	struct kinfo_cputime cputime_percpu[MAXCPU];
	118	#ifdef SMP
	119	static int
	120	sysctl_cputime(SYSCTL_HANDLER_ARGS)
	121	{
	122	int cpu, error = 0;
	123	size_t size = sizeof(struct kinfo_cputime);
	124
	125	for (cpu = 0; cpu < ncpus; ++cpu) {
	126	if ((error = SYSCTL_OUT(req, &cputime_percpu[cpu], size)))
	127	break;
	128	}
	129
	130	return (error);
	131	}
	132	SYSCTL_PROC(_kern, OID_AUTO, cputime, (CTLTYPE_OPAQUE\|CTLFLAG_RD), 0, 0,
	133	sysctl_cputime, "S,kinfo_cputime", "CPU time statistics");
	134	#else
	135	SYSCTL_STRUCT(_kern, OID_AUTO, cputime, CTLFLAG_RD, &cpu_time, kinfo_cputime,
	136	"CPU time statistics");
	137	#endif
	138
	139	/*
	140	* boottime is used to calculate the 'real' uptime. Do not confuse this with
	141	* microuptime(). microtime() is not drift compensated. The real uptime
	142	* with compensation is nanotime() - boottime. boottime is recalculated
	143	* whenever the real time is set based on the compensated elapsed time
	144	* in seconds (gd->gd_time_seconds).
	145	*
	146	* The gd_time_seconds and gd_cpuclock_base fields remain fairly monotonic.
	147	* Slight adjustments to gd_cpuclock_base are made to phase-lock it to
	148	* the real time.
	149	*/
	150	struct timespec boottime; /* boot time (realtime) for reference only */
	151	time_t time_second; /* read-only 'passive' uptime in seconds */
	152
	153	/*
	154	* basetime is used to calculate the compensated real time of day. The
	155	* basetime can be modified on a per-tick basis by the adjtime(),
	156	* ntp_adjtime(), and sysctl-based time correction APIs.
	157	*
	158	* Note that frequency corrections can also be made by adjusting
	159	* gd_cpuclock_base.
	160	*
	161	* basetime is a tail-chasing FIFO, updated only by cpu #0. The FIFO is
	162	* used on both SMP and UP systems to avoid MP races between cpu's and
	163	* interrupt races on UP systems.
	164	*/
	165	#define BASETIME_ARYSIZE 16
	166	#define BASETIME_ARYMASK (BASETIME_ARYSIZE - 1)
	167	static struct timespec basetime[BASETIME_ARYSIZE];
	168	static volatile int basetime_index;
	169
	170	static int
	171	sysctl_get_basetime(SYSCTL_HANDLER_ARGS)
	172	{
	173	struct timespec *bt;
	174	int error;
	175	int index;
	176
	177	/*
	178	* Because basetime data and index may be updated by another cpu,
	179	* a load fence is required to ensure that the data we read has
	180	* not been speculatively read relative to a possibly updated index.
	181	*/
	182	index = basetime_index;
	183	cpu_lfence();
	184	bt = &basetime[index];
	185	error = SYSCTL_OUT(req, bt, sizeof(*bt));
	186	return (error);
	187	}
	188
	189	SYSCTL_STRUCT(_kern, KERN_BOOTTIME, boottime, CTLFLAG_RD,
	190	&boottime, timespec, "System boottime");
	191	SYSCTL_PROC(_kern, OID_AUTO, basetime, CTLTYPE_STRUCT\|CTLFLAG_RD, 0, 0,
	192	sysctl_get_basetime, "S,timespec", "System basetime");
	193
	194	static void hardclock(systimer_t info, struct intrframe *frame);
	195	static void statclock(systimer_t info, struct intrframe *frame);
	196	static void schedclock(systimer_t info, struct intrframe *frame);
	197	static void getnanotime_nbt(struct timespec nbt, struct timespec tsp);
	198
	199	int ticks; /* system master ticks at hz */
	200	int clocks_running; /* tsleep/timeout clocks operational */
	201	int64_t nsec_adj; /* ntpd per-tick adjustment in nsec << 32 */
	202	int64_t nsec_acc; /* accumulator */
	203
	204	/* NTPD time correction fields */
	205	int64_t ntp_tick_permanent; /* per-tick adjustment in nsec << 32 */
	206	int64_t ntp_tick_acc; /* accumulator for per-tick adjustment */
	207	int64_t ntp_delta; /* one-time correction in nsec */
	208	int64_t ntp_big_delta = 1000000000;
	209	int32_t ntp_tick_delta; /* current adjustment rate */
	210	int32_t ntp_default_tick_delta; /* adjustment rate for ntp_delta */
	211	time_t ntp_leap_second; /* time of next leap second */
	212	int ntp_leap_insert; /* whether to insert or remove a second */
	213
	214	/*
	215	* Finish initializing clock frequencies and start all clocks running.
	216	*/
	217	/* ARGSUSED*/
	218	static void
	219	initclocks(void *dummy)
	220	{
	221	cpu_initclocks();
	222	#ifdef DEVICE_POLLING
	223	init_device_poll();
	224	#endif
	225	/psratio = profhz / stathz;/
	226	initclocks_pcpu();
	227	clocks_running = 1;
	228	}
	229
	230	/*
	231	* Called on a per-cpu basis
	232	*/
	233	void
	234	initclocks_pcpu(void)
	235	{
	236	struct globaldata *gd = mycpu;
	237
	238	crit_enter();
	239	if (gd->gd_cpuid == 0) {
	240	gd->gd_time_seconds = 1;
	241	gd->gd_cpuclock_base = sys_cputimer->count();
	242	} else {
	243	/* XXX */
	244	gd->gd_time_seconds = globaldata_find(0)->gd_time_seconds;
	245	gd->gd_cpuclock_base = globaldata_find(0)->gd_cpuclock_base;
	246	}
	247
	248	/*
	249	* Use a non-queued periodic systimer to prevent multiple ticks from
	250	* building up if the sysclock jumps forward (8254 gets reset). The
	251	* sysclock will never jump backwards. Our time sync is based on
	252	* the actual sysclock, not the ticks count.
	253	*/
	254	systimer_init_periodic_nq(&gd->gd_hardclock, hardclock, NULL, hz);
	255	systimer_init_periodic_nq(&gd->gd_statclock, statclock, NULL, stathz);
	256	/* XXX correct the frequency for scheduler / estcpu tests */
	257	systimer_init_periodic_nq(&gd->gd_schedclock, schedclock,
	258	NULL, ESTCPUFREQ);
	259	crit_exit();
	260	}
	261
	262	/*
	263	* This sets the current real time of day. Timespecs are in seconds and
	264	* nanoseconds. We do not mess with gd_time_seconds and gd_cpuclock_base,
	265	* instead we adjust basetime so basetime + gd_* results in the current
	266	* time of day. This way the gd_* fields are guarenteed to represent
	267	* a monotonically increasing 'uptime' value.
	268	*
	269	* When set_timeofday() is called from userland, the system call forces it
	270	* onto cpu #0 since only cpu #0 can update basetime_index.
	271	*/
	272	void
	273	set_timeofday(struct timespec *ts)
	274	{
	275	struct timespec *nbt;
	276	int ni;
	277
	278	/*
	279	* XXX SMP / non-atomic basetime updates
	280	*/
	281	crit_enter();
	282	ni = (basetime_index + 1) & BASETIME_ARYMASK;
	283	nbt = &basetime[ni];
	284	nanouptime(nbt);
	285	nbt->tv_sec = ts->tv_sec - nbt->tv_sec;
	286	nbt->tv_nsec = ts->tv_nsec - nbt->tv_nsec;
	287	if (nbt->tv_nsec < 0) {
	288	nbt->tv_nsec += 1000000000;
	289	--nbt->tv_sec;
	290	}
	291
	292	/*
	293	* Note that basetime diverges from boottime as the clock drift is
	294	* compensated for, so we cannot do away with boottime. When setting
	295	* the absolute time of day the drift is 0 (for an instant) and we
	296	* can simply assign boottime to basetime.
	297	*
	298	* Note that nanouptime() is based on gd_time_seconds which is drift
	299	* compensated up to a point (it is guarenteed to remain monotonically
	300	* increasing). gd_time_seconds is thus our best uptime guess and
	301	* suitable for use in the boottime calculation. It is already taken
	302	* into account in the basetime calculation above.
	303	*/
	304	boottime.tv_sec = nbt->tv_sec;
	305	ntp_delta = 0;
	306
	307	/*
	308	* We now have a new basetime, make sure all other cpus have it,
	309	* then update the index.
	310	*/
	311	cpu_sfence();
	312	basetime_index = ni;
	313
	314	crit_exit();
	315	}
	316
	317	/*
	318	* Each cpu has its own hardclock, but we only increments ticks and softticks
	319	* on cpu #0.
	320	*
	321	* NOTE! systimer! the MP lock might not be held here. We can only safely
	322	* manipulate objects owned by the current cpu.
	323	*/
	324	static void
	325	hardclock(systimer_t info, struct intrframe *frame)
	326	{
	327	sysclock_t cputicks;
	328	struct proc *p;
	329	struct pstats *pstats;
	330	struct globaldata *gd = mycpu;
	331
	332	/*
	333	* Realtime updates are per-cpu. Note that timer corrections as
	334	* returned by microtime() and friends make an additional adjustment
	335	* using a system-wise 'basetime', but the running time is always
	336	* taken from the per-cpu globaldata area. Since the same clock
	337	* is distributing (XXX SMP) to all cpus, the per-cpu timebases
	338	* stay in synch.
	339	*
	340	* Note that we never allow info->time (aka gd->gd_hardclock.time)
	341	* to reverse index gd_cpuclock_base, but that it is possible for
	342	* it to temporarily get behind in the seconds if something in the
	343	* system locks interrupts for a long period of time. Since periodic
	344	* timers count events, though everything should resynch again
	345	* immediately.
	346	*/
	347	cputicks = info->time - gd->gd_cpuclock_base;
	348	if (cputicks >= sys_cputimer->freq) {
	349	++gd->gd_time_seconds;
	350	gd->gd_cpuclock_base += sys_cputimer->freq;
	351	}
	352
	353	/*
	354	* The system-wide ticks counter and NTP related timedelta/tickdelta
	355	* adjustments only occur on cpu #0. NTP adjustments are accomplished
	356	* by updating basetime.
	357	*/
	358	if (gd->gd_cpuid == 0) {
	359	struct timespec *nbt;
	360	struct timespec nts;
	361	int leap;
	362	int ni;
	363
	364	++ticks;
	365
	366	#ifdef DEVICE_POLLING
	367	hardclock_device_poll(); /* mpsafe, short and quick */
	368	#endif /* DEVICE_POLLING */
	369
	370	#if 0
	371	if (tco->tc_poll_pps)
	372	tco->tc_poll_pps(tco);
	373	#endif
	374
	375	/*
	376	* Calculate the new basetime index. We are in a critical section
	377	* on cpu #0 and can safely play with basetime_index. Start
	378	* with the current basetime and then make adjustments.
	379	*/
	380	ni = (basetime_index + 1) & BASETIME_ARYMASK;
	381	nbt = &basetime[ni];
	382	*nbt = basetime[basetime_index];
	383
	384	/*
	385	* Apply adjtime corrections. (adjtime() API)
	386	*
	387	* adjtime() only runs on cpu #0 so our critical section is
	388	* sufficient to access these variables.
	389	*/
	390	if (ntp_delta != 0) {
	391	nbt->tv_nsec += ntp_tick_delta;
	392	ntp_delta -= ntp_tick_delta;
	393	if ((ntp_delta > 0 && ntp_delta < ntp_tick_delta) \|\|
	394	(ntp_delta < 0 && ntp_delta > ntp_tick_delta)) {
	395	ntp_tick_delta = ntp_delta;
	396	}
	397	}
	398
	399	/*
	400	* Apply permanent frequency corrections. (sysctl API)
	401	*/
	402	if (ntp_tick_permanent != 0) {
	403	ntp_tick_acc += ntp_tick_permanent;
	404	if (ntp_tick_acc >= (1LL << 32)) {
	405	nbt->tv_nsec += ntp_tick_acc >> 32;
	406	ntp_tick_acc -= (ntp_tick_acc >> 32) << 32;
	407	} else if (ntp_tick_acc <= -(1LL << 32)) {
	408	/* Negate ntp_tick_acc to avoid shifting the sign bit. */
	409	nbt->tv_nsec -= (-ntp_tick_acc) >> 32;
	410	ntp_tick_acc += ((-ntp_tick_acc) >> 32) << 32;
	411	}
	412	}
	413
	414	if (nbt->tv_nsec >= 1000000000) {
	415	nbt->tv_sec++;
	416	nbt->tv_nsec -= 1000000000;
	417	} else if (nbt->tv_nsec < 0) {
	418	nbt->tv_sec--;
	419	nbt->tv_nsec += 1000000000;
	420	}
	421
	422	/*
	423	* Another per-tick compensation. (for ntp_adjtime() API)
	424	*/
	425	if (nsec_adj != 0) {
	426	nsec_acc += nsec_adj;
	427	if (nsec_acc >= 0x100000000LL) {
	428	nbt->tv_nsec += nsec_acc >> 32;
	429	nsec_acc = (nsec_acc & 0xFFFFFFFFLL);
	430	} else if (nsec_acc <= -0x100000000LL) {
	431	nbt->tv_nsec -= -nsec_acc >> 32;
	432	nsec_acc = -(-nsec_acc & 0xFFFFFFFFLL);
	433	}
	434	if (nbt->tv_nsec >= 1000000000) {
	435	nbt->tv_nsec -= 1000000000;
	436	++nbt->tv_sec;
	437	} else if (nbt->tv_nsec < 0) {
	438	nbt->tv_nsec += 1000000000;
	439	--nbt->tv_sec;
	440	}
	441	}
	442
	443	/************************************************************
	444	* LEAP SECOND CORRECTION *
	445	************************************************************
	446	*
	447	* Taking into account all the corrections made above, figure
	448	* out the new real time. If the seconds field has changed
	449	* then apply any pending leap-second corrections.
	450	*/
	451	getnanotime_nbt(nbt, &nts);
	452
	453	if (time_second != nts.tv_sec) {
	454	/*
	455	* Apply leap second (sysctl API). Adjust nts for changes
	456	* so we do not have to call getnanotime_nbt again.
	457	*/
	458	if (ntp_leap_second) {
	459	if (ntp_leap_second == nts.tv_sec) {
	460	if (ntp_leap_insert) {
	461	nbt->tv_sec++;
	462	nts.tv_sec++;
	463	} else {
	464	nbt->tv_sec--;
	465	nts.tv_sec--;
	466	}
	467	ntp_leap_second--;
	468	}
	469	}
	470
	471	/*
	472	* Apply leap second (ntp_adjtime() API), calculate a new
	473	* nsec_adj field. ntp_update_second() returns nsec_adj
	474	* as a per-second value but we need it as a per-tick value.
	475	*/
	476	leap = ntp_update_second(time_second, &nsec_adj);
	477	nsec_adj /= hz;
	478	nbt->tv_sec += leap;
	479	nts.tv_sec += leap;
	480
	481	/*
	482	* Update the time_second 'approximate time' global.
	483	*/
	484	time_second = nts.tv_sec;
	485	}
	486
	487	/*
	488	* Finally, our new basetime is ready to go live!
	489	*/
	490	cpu_sfence();
	491	basetime_index = ni;
	492	}
	493
	494	/*
	495	* softticks are handled for all cpus
	496	*/
	497	hardclock_softtick(gd);
	498
	499	/*
	500	* ITimer handling is per-tick, per-cpu. I don't think psignal()
	501	* is mpsafe on curproc, so XXX get the mplock.
	502	*/
	503	if ((p = curproc) != NULL && try_mplock()) {
	504	pstats = p->p_stats;
	505	if (frame && CLKF_USERMODE(frame) &&
	506	timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
	507	itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
	508	psignal(p, SIGVTALRM);
	509	if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
	510	itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
	511	psignal(p, SIGPROF);
	512	rel_mplock();
	513	}
	514	setdelayed();
	515	}
	516
	517	/*
	518	* The statistics clock typically runs at a 125Hz rate, and is intended
	519	* to be frequency offset from the hardclock (typ 100Hz). It is per-cpu.
	520	*
	521	* NOTE! systimer! the MP lock might not be held here. We can only safely
	522	* manipulate objects owned by the current cpu.
	523	*
	524	* The stats clock is responsible for grabbing a profiling sample.
	525	* Most of the statistics are only used by user-level statistics programs.
	526	* The main exceptions are p->p_uticks, p->p_sticks, p->p_iticks, and
	527	* p->p_estcpu.
	528	*
	529	* Like the other clocks, the stat clock is called from what is effectively
	530	* a fast interrupt, so the context should be the thread/process that got
	531	* interrupted.
	532	*/
	533	static void
	534	statclock(systimer_t info, struct intrframe *frame)
	535	{
	536	#ifdef GPROF
	537	struct gmonparam *g;
	538	int i;
	539	#endif
	540	thread_t td;
	541	struct proc *p;
	542	int bump;
	543	struct timeval tv;
	544	struct timeval *stv;
	545
	546	/*
	547	* How big was our timeslice relative to the last time?
	548	*/
	549	microuptime(&tv); /* mpsafe */
	550	stv = &mycpu->gd_stattv;
	551	if (stv->tv_sec == 0) {
	552	bump = 1;
	553	} else {
	554	bump = tv.tv_usec - stv->tv_usec +
	555	(tv.tv_sec - stv->tv_sec) * 1000000;
	556	if (bump < 0)
	557	bump = 0;
	558	if (bump > 1000000)
	559	bump = 1000000;
	560	}
	561	*stv = tv;
	562
	563	td = curthread;
	564	p = td->td_proc;
	565
	566	if (frame && CLKF_USERMODE(frame)) {
	567	/*
	568	* Came from userland, handle user time and deal with
	569	* possible process.
	570	*/
	571	if (p && (p->p_flag & P_PROFIL))
	572	addupc_intr(p, CLKF_PC(frame), 1);
	573	td->td_uticks += bump;
	574
	575	/*
	576	* Charge the time as appropriate
	577	*/
	578	if (p && p->p_nice > NZERO)
	579	cpu_time.cp_nice += bump;
	580	else
	581	cpu_time.cp_user += bump;
	582	} else {
	583	#ifdef GPROF
	584	/*
	585	* Kernel statistics are just like addupc_intr, only easier.
	586	*/
	587	g = &_gmonparam;
	588	if (g->state == GMON_PROF_ON && frame) {
	589	i = CLKF_PC(frame) - g->lowpc;
	590	if (i < g->textsize) {
	591	i /= HISTFRACTION * sizeof(*g->kcount);
	592	g->kcount[i]++;
	593	}
	594	}
	595	#endif
	596	/*
	597	* Came from kernel mode, so we were:
	598	* - handling an interrupt,
	599	* - doing syscall or trap work on behalf of the current
	600	* user process, or
	601	* - spinning in the idle loop.
	602	* Whichever it is, charge the time as appropriate.
	603	* Note that we charge interrupts to the current process,
	604	* regardless of whether they are ``for'' that process,
	605	* so that we know how much of its real time was spent
	606	* in ``non-process'' (i.e., interrupt) work.
	607	*
	608	* XXX assume system if frame is NULL. A NULL frame
	609	* can occur if ipi processing is done from a crit_exit().
	610	*/
	611	if (frame && CLKF_INTR(frame))
	612	td->td_iticks += bump;
	613	else
	614	td->td_sticks += bump;
	615
	616	if (frame && CLKF_INTR(frame)) {
	617	cpu_time.cp_intr += bump;
	618	} else {
	619	if (td == &mycpu->gd_idlethread)
	620	cpu_time.cp_idle += bump;
	621	else
	622	cpu_time.cp_sys += bump;
	623	}
	624	}
	625	}
	626
	627	/*
	628	* The scheduler clock typically runs at a 50Hz rate. NOTE! systimer,
	629	* the MP lock might not be held. We can safely manipulate parts of curproc
	630	* but that's about it.
	631	*
	632	* Each cpu has its own scheduler clock.
	633	*/
	634	static void
	635	schedclock(systimer_t info, struct intrframe *frame)
	636	{
	637	struct proc *p;
	638	struct pstats *pstats;
	639	struct rusage *ru;
	640	struct vmspace *vm;
	641	long rss;
	642
	643	if ((p = lwkt_preempted_proc()) != NULL) {
	644	/*
	645	* Account for cpu time used and hit the scheduler. Note
	646	* that this call MUST BE MP SAFE, and the BGL IS NOT HELD
	647	* HERE.
	648	*/
	649	p->p_usched->schedulerclock(p, info->periodic, info->time);
	650	}
	651	if ((p = curproc) != NULL) {
	652	/*
	653	* Update resource usage integrals and maximums.
	654	*/
	655	if ((pstats = p->p_stats) != NULL &&
	656	(ru = &pstats->p_ru) != NULL &&
	657	(vm = p->p_vmspace) != NULL) {
	658	ru->ru_ixrss += pgtok(vm->vm_tsize);
	659	ru->ru_idrss += pgtok(vm->vm_dsize);
	660	ru->ru_isrss += pgtok(vm->vm_ssize);
	661	rss = pgtok(vmspace_resident_count(vm));
	662	if (ru->ru_maxrss < rss)
	663	ru->ru_maxrss = rss;
	664	}
	665	}
	666	}
	667
	668	/*
	669	* Compute number of ticks for the specified amount of time. The
	670	* return value is intended to be used in a clock interrupt timed
	671	* operation and guarenteed to meet or exceed the requested time.
	672	* If the representation overflows, return INT_MAX. The minimum return
	673	* value is 1 ticks and the function will average the calculation up.
	674	* If any value greater then 0 microseconds is supplied, a value
	675	* of at least 2 will be returned to ensure that a near-term clock
	676	* interrupt does not cause the timeout to occur (degenerately) early.
	677	*
	678	* Note that limit checks must take into account microseconds, which is
	679	* done simply by using the smaller signed long maximum instead of
	680	* the unsigned long maximum.
	681	*
	682	* If ints have 32 bits, then the maximum value for any timeout in
	683	* 10ms ticks is 248 days.
	684	*/
	685	int
	686	tvtohz_high(struct timeval *tv)
	687	{
	688	int ticks;
	689	long sec, usec;
	690
	691	sec = tv->tv_sec;
	692	usec = tv->tv_usec;
	693	if (usec < 0) {
	694	sec--;
	695	usec += 1000000;
	696	}
	697	if (sec < 0) {
	698	#ifdef DIAGNOSTIC
	699	if (usec > 0) {
	700	sec++;
	701	usec -= 1000000;
	702	}
	703	printf("tvotohz: negative time difference %ld sec %ld usec\n",
	704	sec, usec);
	705	#endif
	706	ticks = 1;
	707	} else if (sec <= INT_MAX / hz) {
	708	ticks = (int)(sec * hz +
	709	((u_long)usec + (tick - 1)) / tick) + 1;
	710	} else {
	711	ticks = INT_MAX;
	712	}
	713	return (ticks);
	714	}
	715
	716	/*
	717	* Compute number of ticks for the specified amount of time, erroring on
	718	* the side of it being too low to ensure that sleeping the returned number
	719	* of ticks will not result in a late return.
	720	*
	721	* The supplied timeval may not be negative and should be normalized. A
	722	* return value of 0 is possible if the timeval converts to less then
	723	* 1 tick.
	724	*
	725	* If ints have 32 bits, then the maximum value for any timeout in
	726	* 10ms ticks is 248 days.
	727	*/
	728	int
	729	tvtohz_low(struct timeval *tv)
	730	{
	731	int ticks;
	732	long sec;
	733
	734	sec = tv->tv_sec;
	735	if (sec <= INT_MAX / hz)
	736	ticks = (int)(sec * hz + (u_long)tv->tv_usec / tick);
	737	else
	738	ticks = INT_MAX;
	739	return (ticks);
	740	}
	741
	742
	743	/*
	744	* Start profiling on a process.
	745	*
	746	* Kernel profiling passes proc0 which never exits and hence
	747	* keeps the profile clock running constantly.
	748	*/
	749	void
	750	startprofclock(struct proc *p)
	751	{
	752	if ((p->p_flag & P_PROFIL) == 0) {
	753	p->p_flag \|= P_PROFIL;
	754	#if 0 /* XXX */
	755	if (++profprocs == 1 && stathz != 0) {
	756	crit_enter();
	757	psdiv = psratio;
	758	setstatclockrate(profhz);
	759	crit_exit();
	760	}
	761	#endif
	762	}
	763	}
	764
	765	/*
	766	* Stop profiling on a process.
	767	*/
	768	void
	769	stopprofclock(struct proc *p)
	770	{
	771	if (p->p_flag & P_PROFIL) {
	772	p->p_flag &= ~P_PROFIL;
	773	#if 0 /* XXX */
	774	if (--profprocs == 0 && stathz != 0) {
	775	crit_enter();
	776	psdiv = 1;
	777	setstatclockrate(stathz);
	778	crit_exit();
	779	}
	780	#endif
	781	}
	782	}
	783
	784	/*
	785	* Return information about system clocks.
	786	*/
	787	static int
	788	sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS)
	789	{
	790	struct kinfo_clockinfo clkinfo;
	791	/*
	792	* Construct clockinfo structure.
	793	*/
	794	clkinfo.ci_hz = hz;
	795	clkinfo.ci_tick = tick;
	796	clkinfo.ci_tickadj = ntp_default_tick_delta / 1000;
	797	clkinfo.ci_profhz = profhz;
	798	clkinfo.ci_stathz = stathz ? stathz : hz;
	799	return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
	800	}
	801
	802	SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT\|CTLFLAG_RD,
	803	0, 0, sysctl_kern_clockrate, "S,clockinfo","");
	804
	805	/*
	806	* We have eight functions for looking at the clock, four for
	807	* microseconds and four for nanoseconds. For each there is fast
	808	* but less precise version "get{nano\|micro}[up]time" which will
	809	* return a time which is up to 1/HZ previous to the call, whereas
	810	* the raw version "{nano\|micro}[up]time" will return a timestamp
	811	* which is as precise as possible. The "up" variants return the
	812	* time relative to system boot, these are well suited for time
	813	* interval measurements.
	814	*
	815	* Each cpu independantly maintains the current time of day, so all
	816	* we need to do to protect ourselves from changes is to do a loop
	817	* check on the seconds field changing out from under us.
	818	*
	819	* The system timer maintains a 32 bit count and due to various issues
	820	* it is possible for the calculated delta to occassionally exceed
	821	* sys_cputimer->freq. If this occurs the sys_cputimer->freq64_nsec
	822	* multiplication can easily overflow, so we deal with the case. For
	823	* uniformity we deal with the case in the usec case too.
	824	*/
	825	void
	826	getmicrouptime(struct timeval *tvp)
	827	{
	828	struct globaldata *gd = mycpu;
	829	sysclock_t delta;
	830
	831	do {
	832	tvp->tv_sec = gd->gd_time_seconds;
	833	delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
	834	} while (tvp->tv_sec != gd->gd_time_seconds);
	835
	836	if (delta >= sys_cputimer->freq) {
	837	tvp->tv_sec += delta / sys_cputimer->freq;
	838	delta %= sys_cputimer->freq;
	839	}
	840	tvp->tv_usec = (sys_cputimer->freq64_usec * delta) >> 32;
	841	if (tvp->tv_usec >= 1000000) {
	842	tvp->tv_usec -= 1000000;
	843	++tvp->tv_sec;
	844	}
	845	}
	846
	847	void
	848	getnanouptime(struct timespec *tsp)
	849	{
	850	struct globaldata *gd = mycpu;
	851	sysclock_t delta;
	852
	853	do {
	854	tsp->tv_sec = gd->gd_time_seconds;
	855	delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
	856	} while (tsp->tv_sec != gd->gd_time_seconds);
	857
	858	if (delta >= sys_cputimer->freq) {
	859	tsp->tv_sec += delta / sys_cputimer->freq;
	860	delta %= sys_cputimer->freq;
	861	}
	862	tsp->tv_nsec = (sys_cputimer->freq64_nsec * delta) >> 32;
	863	}
	864
	865	void
	866	microuptime(struct timeval *tvp)
	867	{
	868	struct globaldata *gd = mycpu;
	869	sysclock_t delta;
	870
	871	do {
	872	tvp->tv_sec = gd->gd_time_seconds;
	873	delta = sys_cputimer->count() - gd->gd_cpuclock_base;
	874	} while (tvp->tv_sec != gd->gd_time_seconds);
	875
	876	if (delta >= sys_cputimer->freq) {
	877	tvp->tv_sec += delta / sys_cputimer->freq;
	878	delta %= sys_cputimer->freq;
	879	}
	880	tvp->tv_usec = (sys_cputimer->freq64_usec * delta) >> 32;
	881	}
	882
	883	void
	884	nanouptime(struct timespec *tsp)
	885	{
	886	struct globaldata *gd = mycpu;
	887	sysclock_t delta;
	888
	889	do {
	890	tsp->tv_sec = gd->gd_time_seconds;
	891	delta = sys_cputimer->count() - gd->gd_cpuclock_base;
	892	} while (tsp->tv_sec != gd->gd_time_seconds);
	893
	894	if (delta >= sys_cputimer->freq) {
	895	tsp->tv_sec += delta / sys_cputimer->freq;
	896	delta %= sys_cputimer->freq;
	897	}
	898	tsp->tv_nsec = (sys_cputimer->freq64_nsec * delta) >> 32;
	899	}
	900
	901	/*
	902	* realtime routines
	903	*/
	904
	905	void
	906	getmicrotime(struct timeval *tvp)
	907	{
	908	struct globaldata *gd = mycpu;
	909	struct timespec *bt;
	910	sysclock_t delta;
	911
	912	do {
	913	tvp->tv_sec = gd->gd_time_seconds;
	914	delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
	915	} while (tvp->tv_sec != gd->gd_time_seconds);
	916
	917	if (delta >= sys_cputimer->freq) {
	918	tvp->tv_sec += delta / sys_cputimer->freq;
	919	delta %= sys_cputimer->freq;
	920	}
	921	tvp->tv_usec = (sys_cputimer->freq64_usec * delta) >> 32;
	922
	923	bt = &basetime[basetime_index];
	924	tvp->tv_sec += bt->tv_sec;
	925	tvp->tv_usec += bt->tv_nsec / 1000;
	926	while (tvp->tv_usec >= 1000000) {
	927	tvp->tv_usec -= 1000000;
	928	++tvp->tv_sec;
	929	}
	930	}
	931
	932	void
	933	getnanotime(struct timespec *tsp)
	934	{
	935	struct globaldata *gd = mycpu;
	936	struct timespec *bt;
	937	sysclock_t delta;
	938
	939	do {
	940	tsp->tv_sec = gd->gd_time_seconds;
	941	delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
	942	} while (tsp->tv_sec != gd->gd_time_seconds);
	943
	944	if (delta >= sys_cputimer->freq) {
	945	tsp->tv_sec += delta / sys_cputimer->freq;
	946	delta %= sys_cputimer->freq;
	947	}
	948	tsp->tv_nsec = (sys_cputimer->freq64_nsec * delta) >> 32;
	949
	950	bt = &basetime[basetime_index];
	951	tsp->tv_sec += bt->tv_sec;
	952	tsp->tv_nsec += bt->tv_nsec;
	953	while (tsp->tv_nsec >= 1000000000) {
	954	tsp->tv_nsec -= 1000000000;
	955	++tsp->tv_sec;
	956	}
	957	}
	958
	959	static void
	960	getnanotime_nbt(struct timespec nbt, struct timespec tsp)
	961	{
	962	struct globaldata *gd = mycpu;
	963	sysclock_t delta;
	964
	965	do {
	966	tsp->tv_sec = gd->gd_time_seconds;
	967	delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
	968	} while (tsp->tv_sec != gd->gd_time_seconds);
	969
	970	if (delta >= sys_cputimer->freq) {
	971	tsp->tv_sec += delta / sys_cputimer->freq;
	972	delta %= sys_cputimer->freq;
	973	}
	974	tsp->tv_nsec = (sys_cputimer->freq64_nsec * delta) >> 32;
	975
	976	tsp->tv_sec += nbt->tv_sec;
	977	tsp->tv_nsec += nbt->tv_nsec;
	978	while (tsp->tv_nsec >= 1000000000) {
	979	tsp->tv_nsec -= 1000000000;
	980	++tsp->tv_sec;
	981	}
	982	}
	983
	984
	985	void
	986	microtime(struct timeval *tvp)
	987	{
	988	struct globaldata *gd = mycpu;
	989	struct timespec *bt;
	990	sysclock_t delta;
	991
	992	do {
	993	tvp->tv_sec = gd->gd_time_seconds;
	994	delta = sys_cputimer->count() - gd->gd_cpuclock_base;
	995	} while (tvp->tv_sec != gd->gd_time_seconds);
	996
	997	if (delta >= sys_cputimer->freq) {
	998	tvp->tv_sec += delta / sys_cputimer->freq;
	999	delta %= sys_cputimer->freq;
	1000	}
	1001	tvp->tv_usec = (sys_cputimer->freq64_usec * delta) >> 32;
	1002
	1003	bt = &basetime[basetime_index];
	1004	tvp->tv_sec += bt->tv_sec;
	1005	tvp->tv_usec += bt->tv_nsec / 1000;
	1006	while (tvp->tv_usec >= 1000000) {
	1007	tvp->tv_usec -= 1000000;
	1008	++tvp->tv_sec;
	1009	}
	1010	}
	1011
	1012	void
	1013	nanotime(struct timespec *tsp)
	1014	{
	1015	struct globaldata *gd = mycpu;
	1016	struct timespec *bt;
	1017	sysclock_t delta;
	1018
	1019	do {
	1020	tsp->tv_sec = gd->gd_time_seconds;
	1021	delta = sys_cputimer->count() - gd->gd_cpuclock_base;
	1022	} while (tsp->tv_sec != gd->gd_time_seconds);
	1023
	1024	if (delta >= sys_cputimer->freq) {
	1025	tsp->tv_sec += delta / sys_cputimer->freq;
	1026	delta %= sys_cputimer->freq;
	1027	}
	1028	tsp->tv_nsec = (sys_cputimer->freq64_nsec * delta) >> 32;
	1029
	1030	bt = &basetime[basetime_index];
	1031	tsp->tv_sec += bt->tv_sec;
	1032	tsp->tv_nsec += bt->tv_nsec;
	1033	while (tsp->tv_nsec >= 1000000000) {
	1034	tsp->tv_nsec -= 1000000000;
	1035	++tsp->tv_sec;
	1036	}
	1037	}
	1038
	1039	/*
	1040	* note: this is not exactly synchronized with real time. To do that we
	1041	* would have to do what microtime does and check for a nanoseconds overflow.
	1042	*/
	1043	time_t
	1044	get_approximate_time_t(void)
	1045	{
	1046	struct globaldata *gd = mycpu;
	1047	struct timespec *bt;
	1048
	1049	bt = &basetime[basetime_index];
	1050	return(gd->gd_time_seconds + bt->tv_sec);
	1051	}
	1052
	1053	int
	1054	pps_ioctl(u_long cmd, caddr_t data, struct pps_state *pps)
	1055	{
	1056	pps_params_t *app;
	1057	struct pps_fetch_args *fapi;
	1058	#ifdef PPS_SYNC
	1059	struct pps_kcbind_args *kapi;
	1060	#endif
	1061
	1062	switch (cmd) {
	1063	case PPS_IOC_CREATE:
	1064	return (0);
	1065	case PPS_IOC_DESTROY:
	1066	return (0);
	1067	case PPS_IOC_SETPARAMS:
	1068	app = (pps_params_t *)data;
	1069	if (app->mode & ~pps->ppscap)
	1070	return (EINVAL);
	1071	pps->ppsparam = *app;
	1072	return (0);
	1073	case PPS_IOC_GETPARAMS:
	1074	app = (pps_params_t *)data;
	1075	*app = pps->ppsparam;
	1076	app->api_version = PPS_API_VERS_1;
	1077	return (0);
	1078	case PPS_IOC_GETCAP:
	1079	(int)data = pps->ppscap;
	1080	return (0);
	1081	case PPS_IOC_FETCH:
	1082	fapi = (struct pps_fetch_args *)data;
	1083	if (fapi->tsformat && fapi->tsformat != PPS_TSFMT_TSPEC)
	1084	return (EINVAL);
	1085	if (fapi->timeout.tv_sec \|\| fapi->timeout.tv_nsec)
	1086	return (EOPNOTSUPP);
	1087	pps->ppsinfo.current_mode = pps->ppsparam.mode;
	1088	fapi->pps_info_buf = pps->ppsinfo;
	1089	return (0);
	1090	case PPS_IOC_KCBIND:
	1091	#ifdef PPS_SYNC
	1092	kapi = (struct pps_kcbind_args *)data;
	1093	/* XXX Only root should be able to do this */
	1094	if (kapi->tsformat && kapi->tsformat != PPS_TSFMT_TSPEC)
	1095	return (EINVAL);
	1096	if (kapi->kernel_consumer != PPS_KC_HARDPPS)
	1097	return (EINVAL);
	1098	if (kapi->edge & ~pps->ppscap)
	1099	return (EINVAL);
	1100	pps->kcmode = kapi->edge;
	1101	return (0);
	1102	#else
	1103	return (EOPNOTSUPP);
	1104	#endif
	1105	default:
	1106	return (ENOTTY);
	1107	}
	1108	}
	1109
	1110	void
	1111	pps_init(struct pps_state *pps)
	1112	{
	1113	pps->ppscap \|= PPS_TSFMT_TSPEC;
	1114	if (pps->ppscap & PPS_CAPTUREASSERT)
	1115	pps->ppscap \|= PPS_OFFSETASSERT;
	1116	if (pps->ppscap & PPS_CAPTURECLEAR)
	1117	pps->ppscap \|= PPS_OFFSETCLEAR;
	1118	}
	1119
	1120	void
	1121	pps_event(struct pps_state *pps, sysclock_t count, int event)
	1122	{
	1123	struct globaldata *gd;
	1124	struct timespec *tsp;
	1125	struct timespec *osp;
	1126	struct timespec *bt;
	1127	struct timespec ts;
	1128	sysclock_t *pcount;
	1129	#ifdef PPS_SYNC
	1130	sysclock_t tcount;
	1131	#endif
	1132	sysclock_t delta;
	1133	pps_seq_t *pseq;
	1134	int foff;
	1135	int fhard;
	1136
	1137	gd = mycpu;
	1138
	1139	/* Things would be easier with arrays... */
	1140	if (event == PPS_CAPTUREASSERT) {
	1141	tsp = &pps->ppsinfo.assert_timestamp;
	1142	osp = &pps->ppsparam.assert_offset;
	1143	foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
	1144	fhard = pps->kcmode & PPS_CAPTUREASSERT;
	1145	pcount = &pps->ppscount[0];
	1146	pseq = &pps->ppsinfo.assert_sequence;
	1147	} else {
	1148	tsp = &pps->ppsinfo.clear_timestamp;
	1149	osp = &pps->ppsparam.clear_offset;
	1150	foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
	1151	fhard = pps->kcmode & PPS_CAPTURECLEAR;
	1152	pcount = &pps->ppscount[1];
	1153	pseq = &pps->ppsinfo.clear_sequence;
	1154	}
	1155
	1156	/* Nothing really happened */
	1157	if (*pcount == count)
	1158	return;
	1159
	1160	*pcount = count;
	1161
	1162	do {
	1163	ts.tv_sec = gd->gd_time_seconds;
	1164	delta = count - gd->gd_cpuclock_base;
	1165	} while (ts.tv_sec != gd->gd_time_seconds);
	1166
	1167	if (delta >= sys_cputimer->freq) {
	1168	ts.tv_sec += delta / sys_cputimer->freq;
	1169	delta %= sys_cputimer->freq;
	1170	}
	1171	ts.tv_nsec = (sys_cputimer->freq64_nsec * delta) >> 32;
	1172	bt = &basetime[basetime_index];
	1173	ts.tv_sec += bt->tv_sec;
	1174	ts.tv_nsec += bt->tv_nsec;
	1175	while (ts.tv_nsec >= 1000000000) {
	1176	ts.tv_nsec -= 1000000000;
	1177	++ts.tv_sec;
	1178	}
	1179
	1180	(*pseq)++;
	1181	*tsp = ts;
	1182
	1183	if (foff) {
	1184	timespecadd(tsp, osp);
	1185	if (tsp->tv_nsec < 0) {
	1186	tsp->tv_nsec += 1000000000;
	1187	tsp->tv_sec -= 1;
	1188	}
	1189	}
	1190	#ifdef PPS_SYNC
	1191	if (fhard) {
	1192	/* magic, at its best... */
	1193	tcount = count - pps->ppscount[2];
	1194	pps->ppscount[2] = count;
	1195	if (tcount >= sys_cputimer->freq) {
	1196	delta = (1000000000 * (tcount / sys_cputimer->freq) +
	1197	sys_cputimer->freq64_nsec *
	1198	(tcount % sys_cputimer->freq)) >> 32;
	1199	} else {
	1200	delta = (sys_cputimer->freq64_nsec * tcount) >> 32;
	1201	}
	1202	hardpps(tsp, delta);
	1203	}
	1204	#endif
	1205	}
	1206