gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2003,2004 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*
	34	* Copyright (c) 1997, 1998 Poul-Henning Kamp <phk@FreeBSD.org>
	35	* Copyright (c) 1982, 1986, 1991, 1993
	36	* The Regents of the University of California. All rights reserved.
	37	* (c) UNIX System Laboratories, Inc.
	38	* All or some portions of this file are derived from material licensed
	39	* to the University of California by American Telephone and Telegraph
	40	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
	41	* the permission of UNIX System Laboratories, Inc.
	42	*
	43	* Redistribution and use in source and binary forms, with or without
	44	* modification, are permitted provided that the following conditions
	45	* are met:
	46	* 1. Redistributions of source code must retain the above copyright
	47	* notice, this list of conditions and the following disclaimer.
	48	* 2. Redistributions in binary form must reproduce the above copyright
	49	* notice, this list of conditions and the following disclaimer in the
	50	* documentation and/or other materials provided with the distribution.
	51	* 3. All advertising materials mentioning features or use of this software
	52	* must display the following acknowledgement:
	53	* This product includes software developed by the University of
	54	* California, Berkeley and its contributors.
	55	* 4. Neither the name of the University nor the names of its contributors
	56	* may be used to endorse or promote products derived from this software
	57	* without specific prior written permission.
	58	*
	59	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	60	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	61	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	62	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	63	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	64	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	65	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	66	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	67	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	68	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	69	* SUCH DAMAGE.
	70	*
	71	* @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
	72	* $FreeBSD: src/sys/kern/kern_clock.c,v 1.105.2.10 2002/10/17 13:19:40 maxim Exp $
	73	* $DragonFly: src/sys/kern/kern_clock.c,v 1.30 2005/01/31 21:37:52 joerg Exp $
	74	*/
	75
	76	#include "opt_ntp.h"
	77
	78	#include <sys/param.h>
	79	#include <sys/systm.h>
	80	#include <sys/dkstat.h>
	81	#include <sys/callout.h>
	82	#include <sys/kernel.h>
	83	#include <sys/kinfo.h>
	84	#include <sys/proc.h>
	85	#include <sys/malloc.h>
	86	#include <sys/resourcevar.h>
	87	#include <sys/signalvar.h>
	88	#include <sys/timex.h>
	89	#include <sys/timepps.h>
	90	#include <vm/vm.h>
	91	#include <sys/lock.h>
	92	#include <vm/pmap.h>
	93	#include <vm/vm_map.h>
	94	#include <sys/sysctl.h>
	95	#include <sys/thread2.h>
	96
	97	#include <machine/cpu.h>
	98	#include <machine/limits.h>
	99	#include <machine/smp.h>
	100
	101	#ifdef GPROF
	102	#include <sys/gmon.h>
	103	#endif
	104
#ifdef DEVICE_POLLING
/*
 * Device polling hooks, defined outside this file.  init_device_poll()
 * is called once from initclocks(); hardclock_device_poll() is called
 * from hardclock() on every tick taken by cpu #0.
 */
extern void init_device_poll(void);
extern void hardclock_device_poll(void);
#endif /* DEVICE_POLLING */

/*
 * initclocks() is registered as a startup hook at SI_SUB_CLOCKS /
 * SI_ORDER_FIRST with a NULL argument.
 */
static void initclocks (void *dummy);
SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL)
	112
	113	/*
	114	* Some of these don't belong here, but it's easiest to concentrate them.
	115	* Note that cp_time counts in microseconds, but most userland programs
	116	* just compare relative times against the total by delta.
	117	*/
/*
 * System-wide cpu time accounting.  statclock() runs on each cpu and adds
 * its elapsed-microseconds 'bump' into the matching field of this single
 * structure.
 */
struct cp_time cp_time;

/* Read-only export of the raw structure under the _kern sysctl node. */
SYSCTL_OPAQUE(_kern, OID_AUTO, cp_time, CTLFLAG_RD, &cp_time, sizeof(cp_time),
    "LU", "CPU time statistics");
	122
	123	/*
	124	* boottime is used to calculate the 'real' uptime. Do not confuse this with
	125	* microuptime(). microtime() is not drift compensated. The real uptime
	126	* with compensation is nanotime() - boottime. boottime is recalculated
	127	* whenever the real time is set based on the compensated elapsed time
	128	* in seconds (gd->gd_time_seconds).
	129	*
	130	* basetime is used to calculate the compensated real time of day. Chunky
	131	* changes to the time, aka settimeofday(), are made by modifying basetime.
	132	*
	133	* The gd_time_seconds and gd_cpuclock_base fields remain fairly monotonic.
	134	* Slight adjustments to gd_cpuclock_base are made to phase-lock it to
	135	* the real time.
	136	*/
struct timespec boottime;	/* boot time (realtime) for reference only */
struct timespec basetime;	/* base time adjusts uptime -> realtime */
/*
 * NOTE(review): hardclock() assigns time_second from getnanotime() (plus
 * any leap second), i.e. from the realtime clock -- confirm whether
 * 'uptime' in the comment below is still accurate.
 */
time_t time_second;		/* read-only 'passive' uptime in seconds */

/*
 * NOTE(review): boottime and basetime are struct timespec but are exported
 * here with the 'timeval' structure tag -- confirm what layout consumers
 * of these sysctls expect in the second field.
 */
SYSCTL_STRUCT(_kern, KERN_BOOTTIME, boottime, CTLFLAG_RD,
    &boottime, timeval, "System boottime");
SYSCTL_STRUCT(_kern, OID_AUTO, basetime, CTLFLAG_RD,
    &basetime, timeval, "System basetime");
	145
/* Per-cpu systimer callbacks, installed by initclocks_pcpu(). */
static void hardclock(systimer_t info, struct intrframe *frame);
static void statclock(systimer_t info, struct intrframe *frame);
static void schedclock(systimer_t info, struct intrframe *frame);

/* Incremented once per hardclock tick, on cpu #0 only. */
int	ticks;			/* system master ticks at hz */
/* Set to 1 at the end of initclocks(). */
int	clocks_running;		/* tsleep/timeout clocks operational */
/*
 * hardclock() adds nsec_adj into nsec_acc every tick on cpu #0 and moves
 * the whole nanoseconds (the bits above the low 32) into basetime.
 */
int64_t	nsec_adj;		/* ntpd per-tick adjustment in nsec << 32 */
int64_t	nsec_acc;		/* accumulator */
	154
	155	/*
	156	* Finish initializing clock frequencies and start all clocks running.
	157	*/
	158	/* ARGSUSED*/
	159	static void
	160	initclocks(void *dummy)
	161	{
	162	cpu_initclocks();
	163	#ifdef DEVICE_POLLING
	164	init_device_poll();
	165	#endif
	166	/psratio = profhz / stathz;/
	167	initclocks_pcpu();
	168	clocks_running = 1;
	169	}
	170
	171	/*
	172	* Called on a per-cpu basis
	173	*/
	174	void
	175	initclocks_pcpu(void)
	176	{
	177	struct globaldata *gd = mycpu;
	178
	179	crit_enter();
	180	if (gd->gd_cpuid == 0) {
	181	gd->gd_time_seconds = 1;
	182	gd->gd_cpuclock_base = cputimer_count();
	183	} else {
	184	/* XXX */
	185	gd->gd_time_seconds = globaldata_find(0)->gd_time_seconds;
	186	gd->gd_cpuclock_base = globaldata_find(0)->gd_cpuclock_base;
	187	}
	188
	189	/*
	190	* Use a non-queued periodic systimer to prevent multiple ticks from
	191	* building up if the sysclock jumps forward (8254 gets reset). The
	192	* sysclock will never jump backwards. Our time sync is based on
	193	* the actual sysclock, not the ticks count.
	194	*/
	195	systimer_init_periodic_nq(&gd->gd_hardclock, hardclock, NULL, hz);
	196	systimer_init_periodic_nq(&gd->gd_statclock, statclock, NULL, stathz);
	197	/* XXX correct the frequency for scheduler / estcpu tests */
	198	systimer_init_periodic_nq(&gd->gd_schedclock, schedclock,
	199	NULL, ESTCPUFREQ);
	200	crit_exit();
	201	}
	202
/*
 * This sets the current real time of day.  Timespecs are in seconds and
 * nanoseconds.  We do not mess with gd_time_seconds and gd_cpuclock_base,
 * instead we adjust basetime so basetime + gd_* results in the current
 * time of day.  This way the gd_* fields are guaranteed to represent
 * a monotonically increasing 'uptime' value.
 *
 * ts - the new absolute time of day; it is only read.
 */
void
set_timeofday(struct timespec *ts)
{
	struct timespec ts2;

	/*
	 * XXX SMP / non-atomic basetime updates
	 */
	crit_enter();
	nanouptime(&ts2);
	/* basetime = requested realtime - current uptime, then normalize */
	basetime.tv_sec = ts->tv_sec - ts2.tv_sec;
	basetime.tv_nsec = ts->tv_nsec - ts2.tv_nsec;
	if (basetime.tv_nsec < 0) {
		basetime.tv_nsec += 1000000000;
		--basetime.tv_sec;
	}

	/*
	 * Note that basetime diverges from boottime as the clock drift is
	 * compensated for, so we cannot do away with boottime.  When setting
	 * the absolute time of day the drift is 0 (for an instant) and we
	 * can simply assign boottime to basetime.
	 *
	 * Note that nanouptime() is based on gd_time_seconds which is drift
	 * compensated up to a point (it is guaranteed to remain monotonically
	 * increasing).  gd_time_seconds is thus our best uptime guess and
	 * suitable for use in the boottime calculation.  It is already taken
	 * into account in the basetime calculation above.
	 *
	 * Only the seconds field of boottime is assigned here.
	 */
	boottime.tv_sec = basetime.tv_sec;
	/* a zero timedelta makes hardclock() skip its adjtime correction */
	timedelta = 0;
	crit_exit();
}
	243
/*
 * Each cpu has its own hardclock.  The system-wide 'ticks' counter and the
 * adjtime/NTP corrections to basetime are only handled on cpu #0; the
 * per-cpu time base, softticks and interval timers are handled on every
 * cpu.
 *
 * NOTE! systimer! the MP lock might not be held here.  We can only safely
 * manipulate objects owned by the current cpu.
 *
 * info  - the systimer that fired; info->time is compared against this
 *	   cpu's gd_cpuclock_base.
 * frame - the interrupted frame.  It is tested for NULL before use.
 */
static void
hardclock(systimer_t info, struct intrframe *frame)
{
	sysclock_t cputicks;
	struct proc *p;
	struct pstats *pstats;
	struct globaldata *gd = mycpu;

	/*
	 * Realtime updates are per-cpu.  Note that timer corrections as
	 * returned by microtime() and friends make an additional adjustment
	 * using a system-wide 'basetime', but the running time is always
	 * taken from the per-cpu globaldata area.  Since the same clock
	 * is distributing (XXX SMP) to all cpus, the per-cpu timebases
	 * stay in synch.
	 *
	 * Note that we never allow info->time (aka gd->gd_hardclock.time)
	 * to reverse index gd_cpuclock_base, but that it is possible for
	 * it to temporarily get behind in the seconds if something in the
	 * system locks interrupts for a long period of time.  Since periodic
	 * timers count events, though everything should resynch again
	 * immediately.
	 *
	 * At most one second is folded into gd_time_seconds per call.
	 */
	cputicks = info->time - gd->gd_cpuclock_base;
	if (cputicks >= cputimer_freq) {
		++gd->gd_time_seconds;
		gd->gd_cpuclock_base += cputimer_freq;
	}

	/*
	 * The system-wide ticks counter and NTP related timedelta/tickdelta
	 * adjustments only occur on cpu #0.  NTP adjustments are accomplished
	 * by updating basetime.
	 */
	if (gd->gd_cpuid == 0) {
		struct timespec nts;
		int leap;

		++ticks;

#ifdef DEVICE_POLLING
		hardclock_device_poll();	/* mpsafe, short and quick */
#endif /* DEVICE_POLLING */

#if 0
		if (tco->tc_poll_pps)
			tco->tc_poll_pps(tco);
#endif
		/*
		 * Apply adjtime corrections.  At the moment only do this if
		 * we can get the MP lock to interlock with adjtime's
		 * modification of these variables.  Note that basetime
		 * adjustments are not MP safe either XXX.
		 *
		 * If the MP lock cannot be obtained the correction is simply
		 * not applied on this tick.  tickdelta is scaled by 1000
		 * before being added to the nanoseconds field.
		 */
		if (timedelta != 0 && try_mplock()) {
			basetime.tv_nsec += tickdelta * 1000;
			if (basetime.tv_nsec >= 1000000000) {
				basetime.tv_nsec -= 1000000000;
				++basetime.tv_sec;
			} else if (basetime.tv_nsec < 0) {
				basetime.tv_nsec += 1000000000;
				--basetime.tv_sec;
			}
			timedelta -= tickdelta;
			rel_mplock();
		}

		/*
		 * Apply per-tick compensation.  nsec_adj adjusts for both
		 * offset and frequency, and could be negative.
		 *
		 * nsec_acc accumulates nsec_adj; once its magnitude reaches
		 * 2^32 the whole nanoseconds (the bits above the low 32) are
		 * moved into basetime and only the remainder is kept.
		 */
		if (nsec_adj != 0 && try_mplock()) {
			nsec_acc += nsec_adj;
			if (nsec_acc >= 0x100000000LL) {
				basetime.tv_nsec += nsec_acc >> 32;
				nsec_acc = (nsec_acc & 0xFFFFFFFFLL);
			} else if (nsec_acc <= -0x100000000LL) {
				basetime.tv_nsec -= -nsec_acc >> 32;
				nsec_acc = -(-nsec_acc & 0xFFFFFFFFLL);
			}
			if (basetime.tv_nsec >= 1000000000) {
				basetime.tv_nsec -= 1000000000;
				++basetime.tv_sec;
			} else if (basetime.tv_nsec < 0) {
				basetime.tv_nsec += 1000000000;
				--basetime.tv_sec;
			}
			rel_mplock();
		}

		/*
		 * If the realtime-adjusted seconds hand rolls over then tell
		 * ntp_update_second() what we did in the last second so it
		 * can calculate what to do in the next second.  It may also
		 * add or subtract a leap second.
		 *
		 * The adjustment written back into nsec_adj is divided by hz
		 * before it is used as the per-tick value above.
		 */
		getnanotime(&nts);
		if (time_second != nts.tv_sec) {
			leap = ntp_update_second(time_second, &nsec_adj);
			basetime.tv_sec += leap;
			time_second = nts.tv_sec + leap;
			nsec_adj /= hz;
		}
	}

	/*
	 * softticks are handled for all cpus
	 */
	hardclock_softtick(gd);

	/*
	 * ITimer handling is per-tick, per-cpu.  I don't think psignal()
	 * is mpsafe on curproc, so XXX get the mplock.
	 *
	 * If the MP lock cannot be obtained the timers are not decremented
	 * on this tick.  The virtual timer is only charged when the
	 * interrupted frame was in user mode; the profiling timer is charged
	 * regardless of mode.
	 */
	if ((p = curproc) != NULL && try_mplock()) {
		pstats = p->p_stats;
		if (frame && CLKF_USERMODE(frame) &&
		    timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
		    itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
			psignal(p, SIGVTALRM);
		if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
		    itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
			psignal(p, SIGPROF);
		rel_mplock();
	}
	setdelayed();
}
	378
	379	/*
	380	* The statistics clock typically runs at a 125Hz rate, and is intended
	381	* to be frequency offset from the hardclock (typ 100Hz). It is per-cpu.
	382	*
	383	* NOTE! systimer! the MP lock might not be held here. We can only safely
	384	* manipulate objects owned by the current cpu.
	385	*
	386	* The stats clock is responsible for grabbing a profiling sample.
	387	* Most of the statistics are only used by user-level statistics programs.
	388	* The main exceptions are p->p_uticks, p->p_sticks, p->p_iticks, and
	389	* p->p_estcpu.
	390	*
	391	* Like the other clocks, the stat clock is called from what is effectively
	392	* a fast interrupt, so the context should be the thread/process that got
	393	* interrupted.
	394	*/
	395	static void
	396	statclock(systimer_t info, struct intrframe *frame)
	397	{
	398	#ifdef GPROF
	399	struct gmonparam *g;
	400	int i;
	401	#endif
	402	thread_t td;
	403	struct proc *p;
	404	int bump;
	405	struct timeval tv;
	406	struct timeval *stv;
	407
	408	/*
	409	* How big was our timeslice relative to the last time?
	410	*/
	411	microuptime(&tv); /* mpsafe */
	412	stv = &mycpu->gd_stattv;
	413	if (stv->tv_sec == 0) {
	414	bump = 1;
	415	} else {
	416	bump = tv.tv_usec - stv->tv_usec +
	417	(tv.tv_sec - stv->tv_sec) * 1000000;
	418	if (bump < 0)
	419	bump = 0;
	420	if (bump > 1000000)
	421	bump = 1000000;
	422	}
	423	*stv = tv;
	424
	425	td = curthread;
	426	p = td->td_proc;
	427
	428	if (frame && CLKF_USERMODE(frame)) {
	429	/*
	430	* Came from userland, handle user time and deal with
	431	* possible process.
	432	*/
	433	if (p && (p->p_flag & P_PROFIL))
	434	addupc_intr(p, CLKF_PC(frame), 1);
	435	td->td_uticks += bump;
	436
	437	/*
	438	* Charge the time as appropriate
	439	*/
	440	if (p && p->p_nice > NZERO)
	441	cp_time.cp_nice += bump;
	442	else
	443	cp_time.cp_user += bump;
	444	} else {
	445	#ifdef GPROF
	446	/*
	447	* Kernel statistics are just like addupc_intr, only easier.
	448	*/
	449	g = &_gmonparam;
	450	if (g->state == GMON_PROF_ON && frame) {
	451	i = CLKF_PC(frame) - g->lowpc;
	452	if (i < g->textsize) {
	453	i /= HISTFRACTION * sizeof(*g->kcount);
	454	g->kcount[i]++;
	455	}
	456	}
	457	#endif
	458	/*
	459	* Came from kernel mode, so we were:
	460	* - handling an interrupt,
	461	* - doing syscall or trap work on behalf of the current
	462	* user process, or
	463	* - spinning in the idle loop.
	464	* Whichever it is, charge the time as appropriate.
	465	* Note that we charge interrupts to the current process,
	466	* regardless of whether they are ``for'' that process,
	467	* so that we know how much of its real time was spent
	468	* in ``non-process'' (i.e., interrupt) work.
	469	*
	470	* XXX assume system if frame is NULL. A NULL frame
	471	* can occur if ipi processing is done from an splx().
	472	*/
	473	if (frame && CLKF_INTR(frame))
	474	td->td_iticks += bump;
	475	else
	476	td->td_sticks += bump;
	477
	478	if (frame && CLKF_INTR(frame)) {
	479	cp_time.cp_intr += bump;
	480	} else {
	481	if (td == &mycpu->gd_idlethread)
	482	cp_time.cp_idle += bump;
	483	else
	484	cp_time.cp_sys += bump;
	485	}
	486	}
	487	}
	488
	489	/*
	490	* The scheduler clock typically runs at a 20Hz rate. NOTE! systimer,
	491	* the MP lock might not be held. We can safely manipulate parts of curproc
	492	* but that's about it.
	493	*/
	494	static void
	495	schedclock(systimer_t info, struct intrframe *frame)
	496	{
	497	struct proc *p;
	498	struct pstats *pstats;
	499	struct rusage *ru;
	500	struct vmspace *vm;
	501	long rss;
	502
	503	schedulerclock(NULL); /* mpsafe */
	504	if ((p = curproc) != NULL) {
	505	/* Update resource usage integrals and maximums. */
	506	if ((pstats = p->p_stats) != NULL &&
	507	(ru = &pstats->p_ru) != NULL &&
	508	(vm = p->p_vmspace) != NULL) {
	509	ru->ru_ixrss += pgtok(vm->vm_tsize);
	510	ru->ru_idrss += pgtok(vm->vm_dsize);
	511	ru->ru_isrss += pgtok(vm->vm_ssize);
	512	rss = pgtok(vmspace_resident_count(vm));
	513	if (ru->ru_maxrss < rss)
	514	ru->ru_maxrss = rss;
	515	}
	516	}
	517	}
	518
	519	/*
	520	* Compute number of ticks for the specified amount of time. The
	521	* return value is intended to be used in a clock interrupt timed
	522	* operation and guarenteed to meet or exceed the requested time.
	523	* If the representation overflows, return INT_MAX. The minimum return
	524	* value is 1 ticks and the function will average the calculation up.
	525	* If any value greater then 0 microseconds is supplied, a value
	526	* of at least 2 will be returned to ensure that a near-term clock
	527	* interrupt does not cause the timeout to occur (degenerately) early.
	528	*
	529	* Note that limit checks must take into account microseconds, which is
	530	* done simply by using the smaller signed long maximum instead of
	531	* the unsigned long maximum.
	532	*
	533	* If ints have 32 bits, then the maximum value for any timeout in
	534	* 10ms ticks is 248 days.
	535	*/
	536	int
	537	tvtohz_high(struct timeval *tv)
	538	{
	539	int ticks;
	540	long sec, usec;
	541
	542	sec = tv->tv_sec;
	543	usec = tv->tv_usec;
	544	if (usec < 0) {
	545	sec--;
	546	usec += 1000000;
	547	}
	548	if (sec < 0) {
	549	#ifdef DIAGNOSTIC
	550	if (usec > 0) {
	551	sec++;
	552	usec -= 1000000;
	553	}
	554	printf("tvotohz: negative time difference %ld sec %ld usec\n",
	555	sec, usec);
	556	#endif
	557	ticks = 1;
	558	} else if (sec <= INT_MAX / hz) {
	559	ticks = (int)(sec * hz +
	560	((u_long)usec + (tick - 1)) / tick) + 1;
	561	} else {
	562	ticks = INT_MAX;
	563	}
	564	return (ticks);
	565	}
	566
	567	/*
	568	* Compute number of ticks for the specified amount of time, erroring on
	569	* the side of it being too low to ensure that sleeping the returned number
	570	* of ticks will not result in a late return.
	571	*
	572	* The supplied timeval may not be negative and should be normalized. A
	573	* return value of 0 is possible if the timeval converts to less then
	574	* 1 tick.
	575	*
	576	* If ints have 32 bits, then the maximum value for any timeout in
	577	* 10ms ticks is 248 days.
	578	*/
	579	int
	580	tvtohz_low(struct timeval *tv)
	581	{
	582	int ticks;
	583	long sec;
	584
	585	sec = tv->tv_sec;
	586	if (sec <= INT_MAX / hz)
	587	ticks = (int)(sec * hz + (u_long)tv->tv_usec / tick);
	588	else
	589	ticks = INT_MAX;
	590	return (ticks);
	591	}
	592
	593
	594	/*
	595	* Start profiling on a process.
	596	*
	597	* Kernel profiling passes proc0 which never exits and hence
	598	* keeps the profile clock running constantly.
	599	*/
	600	void
	601	startprofclock(struct proc *p)
	602	{
	603	if ((p->p_flag & P_PROFIL) == 0) {
	604	p->p_flag \|= P_PROFIL;
	605	#if 0 /* XXX */
	606	if (++profprocs == 1 && stathz != 0) {
	607	s = splstatclock();
	608	psdiv = psratio;
	609	setstatclockrate(profhz);
	610	splx(s);
	611	}
	612	#endif
	613	}
	614	}
	615
	616	/*
	617	* Stop profiling on a process.
	618	*/
	619	void
	620	stopprofclock(struct proc *p)
	621	{
	622	if (p->p_flag & P_PROFIL) {
	623	p->p_flag &= ~P_PROFIL;
	624	#if 0 /* XXX */
	625	if (--profprocs == 0 && stathz != 0) {
	626	s = splstatclock();
	627	psdiv = 1;
	628	setstatclockrate(stathz);
	629	splx(s);
	630	}
	631	#endif
	632	}
	633	}
	634
	635	/*
	636	* Return information about system clocks.
	637	*/
	638	static int
	639	sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS)
	640	{
	641	struct kinfo_clockinfo clkinfo;
	642	/*
	643	* Construct clockinfo structure.
	644	*/
	645	clkinfo.ci_hz = hz;
	646	clkinfo.ci_tick = tick;
	647	clkinfo.ci_tickadj = tickadj;
	648	clkinfo.ci_profhz = profhz;
	649	clkinfo.ci_stathz = stathz ? stathz : hz;
	650	return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
	651	}
	652
	653	SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT\|CTLFLAG_RD,
	654	0, 0, sysctl_kern_clockrate, "S,clockinfo","");
	655
	656	/*
	657	* We have eight functions for looking at the clock, four for
	658	* microseconds and four for nanoseconds. For each there is fast
	659	* but less precise version "get{nano\|micro}[up]time" which will
	660	* return a time which is up to 1/HZ previous to the call, whereas
	661	* the raw version "{nano\|micro}[up]time" will return a timestamp
	662	* which is as precise as possible. The "up" variants return the
	663	* time relative to system boot, these are well suited for time
	664	* interval measurements.
	665	*
	666	* Each cpu independently maintains the current time of day, so all
	667	* we need to do to protect ourselves from changes is to do a loop
	668	* check on the seconds field changing out from under us.
	669	*
	670	* The system timer maintains a 32 bit count and due to various issues
	671	* it is possible for the calculated delta to occasionally exceed
	672	* cputimer_freq. If this occurs the cputimer_freq64_nsec multiplication
	673	* can easily overflow, so we deal with the case. For uniformity we deal
	674	* with the case in the usec case too.
	675	*/
	676	void
	677	getmicrouptime(struct timeval *tvp)
	678	{
	679	struct globaldata *gd = mycpu;
	680	sysclock_t delta;
	681
	682	do {
	683	tvp->tv_sec = gd->gd_time_seconds;
	684	delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
	685	} while (tvp->tv_sec != gd->gd_time_seconds);
	686
	687	if (delta >= cputimer_freq) {
	688	tvp->tv_sec += delta / cputimer_freq;
	689	delta %= cputimer_freq;
	690	}
	691	tvp->tv_usec = (cputimer_freq64_usec * delta) >> 32;
	692	if (tvp->tv_usec >= 1000000) {
	693	tvp->tv_usec -= 1000000;
	694	++tvp->tv_sec;
	695	}
	696	}
	697
	698	void
	699	getnanouptime(struct timespec *tsp)
	700	{
	701	struct globaldata *gd = mycpu;
	702	sysclock_t delta;
	703
	704	do {
	705	tsp->tv_sec = gd->gd_time_seconds;
	706	delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
	707	} while (tsp->tv_sec != gd->gd_time_seconds);
	708
	709	if (delta >= cputimer_freq) {
	710	tsp->tv_sec += delta / cputimer_freq;
	711	delta %= cputimer_freq;
	712	}
	713	tsp->tv_nsec = (cputimer_freq64_nsec * delta) >> 32;
	714	}
	715
	716	void
	717	microuptime(struct timeval *tvp)
	718	{
	719	struct globaldata *gd = mycpu;
	720	sysclock_t delta;
	721
	722	do {
	723	tvp->tv_sec = gd->gd_time_seconds;
	724	delta = cputimer_count() - gd->gd_cpuclock_base;
	725	} while (tvp->tv_sec != gd->gd_time_seconds);
	726
	727	if (delta >= cputimer_freq) {
	728	tvp->tv_sec += delta / cputimer_freq;
	729	delta %= cputimer_freq;
	730	}
	731	tvp->tv_usec = (cputimer_freq64_usec * delta) >> 32;
	732	}
	733
	734	void
	735	nanouptime(struct timespec *tsp)
	736	{
	737	struct globaldata *gd = mycpu;
	738	sysclock_t delta;
	739
	740	do {
	741	tsp->tv_sec = gd->gd_time_seconds;
	742	delta = cputimer_count() - gd->gd_cpuclock_base;
	743	} while (tsp->tv_sec != gd->gd_time_seconds);
	744
	745	if (delta >= cputimer_freq) {
	746	tsp->tv_sec += delta / cputimer_freq;
	747	delta %= cputimer_freq;
	748	}
	749	tsp->tv_nsec = (cputimer_freq64_nsec * delta) >> 32;
	750	}
	751
	752	/*
	753	* realtime routines
	754	*/
	755
	756	void
	757	getmicrotime(struct timeval *tvp)
	758	{
	759	struct globaldata *gd = mycpu;
	760	sysclock_t delta;
	761
	762	do {
	763	tvp->tv_sec = gd->gd_time_seconds;
	764	delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
	765	} while (tvp->tv_sec != gd->gd_time_seconds);
	766
	767	if (delta >= cputimer_freq) {
	768	tvp->tv_sec += delta / cputimer_freq;
	769	delta %= cputimer_freq;
	770	}
	771	tvp->tv_usec = (cputimer_freq64_usec * delta) >> 32;
	772
	773	tvp->tv_sec += basetime.tv_sec;
	774	tvp->tv_usec += basetime.tv_nsec / 1000;
	775	while (tvp->tv_usec >= 1000000) {
	776	tvp->tv_usec -= 1000000;
	777	++tvp->tv_sec;
	778	}
	779	}
	780
	781	void
	782	getnanotime(struct timespec *tsp)
	783	{
	784	struct globaldata *gd = mycpu;
	785	sysclock_t delta;
	786
	787	do {
	788	tsp->tv_sec = gd->gd_time_seconds;
	789	delta = gd->gd_hardclock.time - gd->gd_cpuclock_base;
	790	} while (tsp->tv_sec != gd->gd_time_seconds);
	791
	792	if (delta >= cputimer_freq) {
	793	tsp->tv_sec += delta / cputimer_freq;
	794	delta %= cputimer_freq;
	795	}
	796	tsp->tv_nsec = (cputimer_freq64_nsec * delta) >> 32;
	797
	798	tsp->tv_sec += basetime.tv_sec;
	799	tsp->tv_nsec += basetime.tv_nsec;
	800	while (tsp->tv_nsec >= 1000000000) {
	801	tsp->tv_nsec -= 1000000000;
	802	++tsp->tv_sec;
	803	}
	804	}
	805
	806	void
	807	microtime(struct timeval *tvp)
	808	{
	809	struct globaldata *gd = mycpu;
	810	sysclock_t delta;
	811
	812	do {
	813	tvp->tv_sec = gd->gd_time_seconds;
	814	delta = cputimer_count() - gd->gd_cpuclock_base;
	815	} while (tvp->tv_sec != gd->gd_time_seconds);
	816
	817	if (delta >= cputimer_freq) {
	818	tvp->tv_sec += delta / cputimer_freq;
	819	delta %= cputimer_freq;
	820	}
	821	tvp->tv_usec = (cputimer_freq64_usec * delta) >> 32;
	822
	823	tvp->tv_sec += basetime.tv_sec;
	824	tvp->tv_usec += basetime.tv_nsec / 1000;
	825	while (tvp->tv_usec >= 1000000) {
	826	tvp->tv_usec -= 1000000;
	827	++tvp->tv_sec;
	828	}
	829	}
	830
	831	void
	832	nanotime(struct timespec *tsp)
	833	{
	834	struct globaldata *gd = mycpu;
	835	sysclock_t delta;
	836
	837	do {
	838	tsp->tv_sec = gd->gd_time_seconds;
	839	delta = cputimer_count() - gd->gd_cpuclock_base;
	840	} while (tsp->tv_sec != gd->gd_time_seconds);
	841
	842	if (delta >= cputimer_freq) {
	843	tsp->tv_sec += delta / cputimer_freq;
	844	delta %= cputimer_freq;
	845	}
	846	tsp->tv_nsec = (cputimer_freq64_nsec * delta) >> 32;
	847
	848	tsp->tv_sec += basetime.tv_sec;
	849	tsp->tv_nsec += basetime.tv_nsec;
	850	while (tsp->tv_nsec >= 1000000000) {
	851	tsp->tv_nsec -= 1000000000;
	852	++tsp->tv_sec;
	853	}
	854	}
	855
	856	int
	857	pps_ioctl(u_long cmd, caddr_t data, struct pps_state *pps)
	858	{
	859	pps_params_t *app;
	860	struct pps_fetch_args *fapi;
	861	#ifdef PPS_SYNC
	862	struct pps_kcbind_args *kapi;
	863	#endif
	864
	865	switch (cmd) {
	866	case PPS_IOC_CREATE:
	867	return (0);
	868	case PPS_IOC_DESTROY:
	869	return (0);
	870	case PPS_IOC_SETPARAMS:
	871	app = (pps_params_t *)data;
	872	if (app->mode & ~pps->ppscap)
	873	return (EINVAL);
	874	pps->ppsparam = *app;
	875	return (0);
	876	case PPS_IOC_GETPARAMS:
	877	app = (pps_params_t *)data;
	878	*app = pps->ppsparam;
	879	app->api_version = PPS_API_VERS_1;
	880	return (0);
	881	case PPS_IOC_GETCAP:
	882	(int)data = pps->ppscap;
	883	return (0);
	884	case PPS_IOC_FETCH:
	885	fapi = (struct pps_fetch_args *)data;
	886	if (fapi->tsformat && fapi->tsformat != PPS_TSFMT_TSPEC)
	887	return (EINVAL);
	888	if (fapi->timeout.tv_sec \|\| fapi->timeout.tv_nsec)
	889	return (EOPNOTSUPP);
	890	pps->ppsinfo.current_mode = pps->ppsparam.mode;
	891	fapi->pps_info_buf = pps->ppsinfo;
	892	return (0);
	893	case PPS_IOC_KCBIND:
	894	#ifdef PPS_SYNC
	895	kapi = (struct pps_kcbind_args *)data;
	896	/* XXX Only root should be able to do this */
	897	if (kapi->tsformat && kapi->tsformat != PPS_TSFMT_TSPEC)
	898	return (EINVAL);
	899	if (kapi->kernel_consumer != PPS_KC_HARDPPS)
	900	return (EINVAL);
	901	if (kapi->edge & ~pps->ppscap)
	902	return (EINVAL);
	903	pps->kcmode = kapi->edge;
	904	return (0);
	905	#else
	906	return (EOPNOTSUPP);
	907	#endif
	908	default:
	909	return (ENOTTY);
	910	}
	911	}
	912
	913	void
	914	pps_init(struct pps_state *pps)
	915	{
	916	pps->ppscap \|= PPS_TSFMT_TSPEC;
	917	if (pps->ppscap & PPS_CAPTUREASSERT)
	918	pps->ppscap \|= PPS_OFFSETASSERT;
	919	if (pps->ppscap & PPS_CAPTURECLEAR)
	920	pps->ppscap \|= PPS_OFFSETCLEAR;
	921	}
	922
	923	void
	924	pps_event(struct pps_state *pps, sysclock_t count, int event)
	925	{
	926	struct globaldata *gd;
	927	struct timespec *tsp;
	928	struct timespec *osp;
	929	struct timespec ts;
	930	sysclock_t *pcount;
	931	#ifdef PPS_SYNC
	932	sysclock_t tcount;
	933	#endif
	934	sysclock_t delta;
	935	pps_seq_t *pseq;
	936	int foff;
	937	int fhard;
	938
	939	gd = mycpu;
	940
	941	/* Things would be easier with arrays... */
	942	if (event == PPS_CAPTUREASSERT) {
	943	tsp = &pps->ppsinfo.assert_timestamp;
	944	osp = &pps->ppsparam.assert_offset;
	945	foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
	946	fhard = pps->kcmode & PPS_CAPTUREASSERT;
	947	pcount = &pps->ppscount[0];
	948	pseq = &pps->ppsinfo.assert_sequence;
	949	} else {
	950	tsp = &pps->ppsinfo.clear_timestamp;
	951	osp = &pps->ppsparam.clear_offset;
	952	foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
	953	fhard = pps->kcmode & PPS_CAPTURECLEAR;
	954	pcount = &pps->ppscount[1];
	955	pseq = &pps->ppsinfo.clear_sequence;
	956	}
	957
	958	/* Nothing really happened */
	959	if (*pcount == count)
	960	return;
	961
	962	*pcount = count;
	963
	964	do {
	965	ts.tv_sec = gd->gd_time_seconds;
	966	delta = count - gd->gd_cpuclock_base;
	967	} while (ts.tv_sec != gd->gd_time_seconds);
	968
	969	if (delta >= cputimer_freq) {
	970	ts.tv_sec += delta / cputimer_freq;
	971	delta %= cputimer_freq;
	972	}
	973	ts.tv_nsec = (cputimer_freq64_nsec * delta) >> 32;
	974	ts.tv_sec += basetime.tv_sec;
	975	ts.tv_nsec += basetime.tv_nsec;
	976	while (ts.tv_nsec >= 1000000000) {
	977	ts.tv_nsec -= 1000000000;
	978	++ts.tv_sec;
	979	}
	980
	981	(*pseq)++;
	982	*tsp = ts;
	983
	984	if (foff) {
	985	timespecadd(tsp, osp);
	986	if (tsp->tv_nsec < 0) {
	987	tsp->tv_nsec += 1000000000;
	988	tsp->tv_sec -= 1;
	989	}
	990	}
	991	#ifdef PPS_SYNC
	992	if (fhard) {
	993	/* magic, at its best... */
	994	tcount = count - pps->ppscount[2];
	995	pps->ppscount[2] = count;
	996	if (tcount >= cputimer_freq) {
	997	delta = (1000000000 * (tcount / cputimer_freq) +
	998	cputimer_freq64_nsec *
	999	(tcount % cputimer_freq)) >> 32;
	1000	} else {
	1001	delta = (cputimer_freq64_nsec * tcount) >> 32;
	1002	}
	1003	hardpps(tsp, delta);
	1004	}
	1005	#endif
	1006	}
	1007