gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*-
	2	* Copyright (c) 1997, 1998 Poul-Henning Kamp <phk@FreeBSD.org>
	3	* Copyright (c) 1982, 1986, 1991, 1993
	4	* The Regents of the University of California. All rights reserved.
	5	* (c) UNIX System Laboratories, Inc.
	6	* All or some portions of this file are derived from material licensed
	7	* to the University of California by American Telephone and Telegraph
	8	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
	9	* the permission of UNIX System Laboratories, Inc.
	10	*
	11	* Redistribution and use in source and binary forms, with or without
	12	* modification, are permitted provided that the following conditions
	13	* are met:
	14	* 1. Redistributions of source code must retain the above copyright
	15	* notice, this list of conditions and the following disclaimer.
	16	* 2. Redistributions in binary form must reproduce the above copyright
	17	* notice, this list of conditions and the following disclaimer in the
	18	* documentation and/or other materials provided with the distribution.
	19	* 3. All advertising materials mentioning features or use of this software
	20	* must display the following acknowledgement:
	21	* This product includes software developed by the University of
	22	* California, Berkeley and its contributors.
	23	* 4. Neither the name of the University nor the names of its contributors
	24	* may be used to endorse or promote products derived from this software
	25	* without specific prior written permission.
	26	*
	27	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	28	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	29	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	30	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	31	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	32	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	33	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	34	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	35	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	36	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	37	* SUCH DAMAGE.
	38	*
	39	* @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
	40	* $FreeBSD: src/sys/kern/kern_clock.c,v 1.105.2.10 2002/10/17 13:19:40 maxim Exp $
	41	* $DragonFly: src/sys/kern/kern_clock.c,v 1.7 2003/07/10 04:47:54 dillon Exp $
	42	*/
	43
	44	#include "opt_ntp.h"
	45
	46	#include <sys/param.h>
	47	#include <sys/systm.h>
	48	#include <sys/dkstat.h>
	49	#include <sys/callout.h>
	50	#include <sys/kernel.h>
	51	#include <sys/proc.h>
	52	#include <sys/malloc.h>
	53	#include <sys/resourcevar.h>
	54	#include <sys/signalvar.h>
	55	#include <sys/timex.h>
	56	#include <sys/timepps.h>
	57	#include <vm/vm.h>
	58	#include <sys/lock.h>
	59	#include <vm/pmap.h>
	60	#include <vm/vm_map.h>
	61	#include <sys/sysctl.h>
	62
	63	#include <machine/cpu.h>
	64	#include <machine/limits.h>
	65	#include <machine/smp.h>
	66
	67	#ifdef GPROF
	68	#include <sys/gmon.h>
	69	#endif
	70
	71	#ifdef DEVICE_POLLING
	72	extern void init_device_poll(void);
	73	extern void hardclock_device_poll(void);
	74	#endif /* DEVICE_POLLING */
	75
	76	/*
	77	* Number of timecounters used to implement stable storage
	78	*/
	79	#ifndef NTIMECOUNTER
	80	#define NTIMECOUNTER 5
	81	#endif
	82
	83	static MALLOC_DEFINE(M_TIMECOUNTER, "timecounter",
	84	"Timecounter stable storage");
	85
	86	static void initclocks __P((void *dummy));
	87	SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL)
	88
	89	static void tco_forward __P((int force));
	90	static void tco_setscales __P((struct timecounter *tc));
	91	static __inline unsigned tco_delta __P((struct timecounter *tc));
	92
	93	/* Some of these don't belong here, but it's easiest to concentrate them. */
	94	long cp_time[CPUSTATES];
	95
	96	SYSCTL_OPAQUE(_kern, OID_AUTO, cp_time, CTLFLAG_RD, &cp_time, sizeof(cp_time),
	97	"LU", "CPU time statistics");
	98
	99	long tk_cancc;
	100	long tk_nin;
	101	long tk_nout;
	102	long tk_rawcc;
	103
	104	time_t time_second;
	105
	106	struct timeval boottime;
	107	SYSCTL_STRUCT(_kern, KERN_BOOTTIME, boottime, CTLFLAG_RD,
	108	&boottime, timeval, "System boottime");
	109
	110	/*
	111	* Which update policy to use.
	112	* 0 - every tick, bad hardware may fail with "calcru negative..."
	113	* 1 - more resistent to the above hardware, but less efficient.
	114	*/
	115	static int tco_method;
	116
	/*
	 * Counter read routine for the dummy timecounter, which stands in
	 * until a real timecounter is registered so that the console and
	 * other early code can already use the time services.
	 *
	 * Every call returns the previous value plus one (first call returns
	 * 1); the tc argument is not used.
	 */
	static unsigned
	dummy_get_timecount(struct timecounter *tc)
	{
		static unsigned counter;

		counter += 1;
		return (counter);
	}
	129
	130	static struct timecounter dummy_timecounter = {
	131	dummy_get_timecount,
	132	0,
	133	~0u,
	134	1000000,
	135	"dummy"
	136	};
	137
	138	struct timecounter *timecounter = &dummy_timecounter;
	139
	140	/*
	141	* Clock handling routines.
	142	*
	143	* This code is written to operate with two timers that run independently of
	144	* each other.
	145	*
	146	* The main timer, running hz times per second, is used to trigger interval
	147	* timers, timeouts and rescheduling as needed.
	148	*
	149	* The second timer handles kernel and user profiling,
	150	* and does resource use estimation. If the second timer is programmable,
	151	* it is randomized to avoid aliasing between the two clocks. For example,
	152	* the randomization prevents an adversary from always giving up the cpu
	153	* just before its quantum expires. Otherwise, it would never accumulate
	154	* cpu ticks. The mean frequency of the second timer is stathz.
	155	*
	156	* If no second timer exists, stathz will be zero; in this case we drive
	157	* profiling and statistics off the main clock. This WILL NOT be accurate;
	158	* do not do it unless absolutely necessary.
	159	*
	160	* The statistics clock may (or may not) be run at a higher rate while
	161	* profiling. This profile clock runs at profhz. We require that profhz
	162	* be an integral multiple of stathz.
	163	*
	164	* If the statistics clock is running fast, it must be divided by the ratio
	165	* profhz/stathz for statistics. (For profiling, every tick counts.)
	166	*
	167	* Time-of-day is maintained using a "timecounter", which may or may
	168	* not be related to the hardware generating the above mentioned
	169	* interrupts.
	170	*/
	171
	int stathz;		/* statistics clock rate; 0 means run it off hardclock */
	int profhz;		/* profiling clock rate */
	int psratio;		/* ratio: prof / stat */
	int ticks;		/* incremented once per hardclock() call */

	static int profprocs;	/* number of processes with P_PROFIL set */
	static int psdiv;	/* prof => stat divider (reload value) */
	static int pscnt;	/* prof => stat divider (running count) */
	178
	179	/*
	180	* Initialize clock frequencies and start both clocks running.
	181	*/
	182	/* ARGSUSED*/
	183	static void
	184	initclocks(dummy)
	185	void *dummy;
	186	{
	187	register int i;
	188
	189	/*
	190	* Set divisors to 1 (normal case) and let the machine-specific
	191	* code do its bit.
	192	*/
	193	psdiv = pscnt = 1;
	194	cpu_initclocks();
	195
	196	#ifdef DEVICE_POLLING
	197	init_device_poll();
	198	#endif
	199
	200	/*
	201	* Compute profhz/stathz, and fix profhz if needed.
	202	*/
	203	i = stathz ? stathz : hz;
	204	if (profhz == 0)
	205	profhz = i;
	206	psratio = profhz / i;
	207	}
	208
	209	/*
	210	* The real-time timer, interrupting hz times per second. This is implemented
	211	* as a FAST interrupt so it is in the context of the thread it interrupted,
	212	* and not in an interrupt thread. YYY needs help.
	213	*/
	214	void
	215	hardclock(frame)
	216	register struct clockframe *frame;
	217	{
	218	register struct proc *p;
	219
	220	p = curproc;
	221	if (p) {
	222	register struct pstats *pstats;
	223
	224	/*
	225	* Run current process's virtual and profile time, as needed.
	226	*/
	227	pstats = p->p_stats;
	228	if (CLKF_USERMODE(frame) &&
	229	timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
	230	itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
	231	psignal(p, SIGVTALRM);
	232	if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
	233	itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
	234	psignal(p, SIGPROF);
	235	}
	236
	237	#if defined(SMP) && defined(BETTER_CLOCK)
	238	forward_hardclock(pscnt);
	239	#endif
	240
	241	/*
	242	* If no separate statistics clock is available, run it from here.
	243	*/
	244	if (stathz == 0)
	245	statclock(frame);
	246
	247	tco_forward(0);
	248	ticks++;
	249
	250	#ifdef DEVICE_POLLING
	251	hardclock_device_poll(); /* this is very short and quick */
	252	#endif /* DEVICE_POLLING */
	253
	254	/*
	255	* Process callouts at a very low cpu priority, so we don't keep the
	256	* relatively high clock interrupt priority any longer than necessary.
	257	*/
	258	if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) {
	259	setsoftclock();
	260	} else if (softticks + 1 == ticks) {
	261	++softticks;
	262	}
	263	}
	264
	265	/*
	266	* Compute number of ticks in the specified amount of time.
	267	*/
	268	int
	269	tvtohz(tv)
	270	struct timeval *tv;
	271	{
	272	register unsigned long ticks;
	273	register long sec, usec;
	274
	275	/*
	276	* If the number of usecs in the whole seconds part of the time
	277	* difference fits in a long, then the total number of usecs will
	278	* fit in an unsigned long. Compute the total and convert it to
	279	* ticks, rounding up and adding 1 to allow for the current tick
	280	* to expire. Rounding also depends on unsigned long arithmetic
	281	* to avoid overflow.
	282	*
	283	* Otherwise, if the number of ticks in the whole seconds part of
	284	* the time difference fits in a long, then convert the parts to
	285	* ticks separately and add, using similar rounding methods and
	286	* overflow avoidance. This method would work in the previous
	287	* case but it is slightly slower and assumes that hz is integral.
	288	*
	289	* Otherwise, round the time difference down to the maximum
	290	* representable value.
	291	*
	292	* If ints have 32 bits, then the maximum value for any timeout in
	293	* 10ms ticks is 248 days.
	294	*/
	295	sec = tv->tv_sec;
	296	usec = tv->tv_usec;
	297	if (usec < 0) {
	298	sec--;
	299	usec += 1000000;
	300	}
	301	if (sec < 0) {
	302	#ifdef DIAGNOSTIC
	303	if (usec > 0) {
	304	sec++;
	305	usec -= 1000000;
	306	}
	307	printf("tvotohz: negative time difference %ld sec %ld usec\n",
	308	sec, usec);
	309	#endif
	310	ticks = 1;
	311	} else if (sec <= LONG_MAX / 1000000)
	312	ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1))
	313	/ tick + 1;
	314	else if (sec <= LONG_MAX / hz)
	315	ticks = sec * hz
	316	+ ((unsigned long)usec + (tick - 1)) / tick + 1;
	317	else
	318	ticks = LONG_MAX;
	319	if (ticks > INT_MAX)
	320	ticks = INT_MAX;
	321	return ((int)ticks);
	322	}
	323
	324	/*
	325	* Start profiling on a process.
	326	*
	327	* Kernel profiling passes proc0 which never exits and hence
	328	* keeps the profile clock running constantly.
	329	*/
	330	void
	331	startprofclock(p)
	332	register struct proc *p;
	333	{
	334	int s;
	335
	336	if ((p->p_flag & P_PROFIL) == 0) {
	337	p->p_flag \|= P_PROFIL;
	338	if (++profprocs == 1 && stathz != 0) {
	339	s = splstatclock();
	340	psdiv = pscnt = psratio;
	341	setstatclockrate(profhz);
	342	splx(s);
	343	}
	344	}
	345	}
	346
	347	/*
	348	* Stop profiling on a process.
	349	*/
	350	void
	351	stopprofclock(p)
	352	register struct proc *p;
	353	{
	354	int s;
	355
	356	if (p->p_flag & P_PROFIL) {
	357	p->p_flag &= ~P_PROFIL;
	358	if (--profprocs == 0 && stathz != 0) {
	359	s = splstatclock();
	360	psdiv = pscnt = 1;
	361	setstatclockrate(stathz);
	362	splx(s);
	363	}
	364	}
	365	}
	366
	367	/*
	368	* Statistics clock. Grab profile sample, and if divider reaches 0,
	369	* do process and kernel statistics. Most of the statistics are only
	370	* used by user-level statistics programs. The main exceptions are
	371	* p->p_uticks, p->p_sticks, p->p_iticks, and p->p_estcpu.
	372	*
	373	* The statclock should be called from an exclusive, fast interrupt,
	374	* so the context should be the thread/process that got interrupted and
	375	* not an interrupt thread.
	376	*/
	377	void
	378	statclock(frame)
	379	register struct clockframe *frame;
	380	{
	381	#ifdef GPROF
	382	register struct gmonparam *g;
	383	int i;
	384	#endif
	385	thread_t td;
	386	struct pstats *pstats;
	387	long rss;
	388	struct rusage *ru;
	389	struct vmspace *vm;
	390	struct proc *p;
	391	int bump;
	392	struct timeval tv;
	393	struct timeval *stv;
	394
	395	/*
	396	* How big was our timeslice relative to the last time
	397	*/
	398	microuptime(&tv);
	399	stv = &mycpu->gd_stattv;
	400	if (stv->tv_sec == 0) {
	401	bump = 1;
	402	} else {
	403	bump = tv.tv_usec - stv->tv_usec +
	404	(tv.tv_sec - stv->tv_sec) * 1000000;
	405	if (bump < 0)
	406	bump = 0;
	407	if (bump > 1000000)
	408	bump = 1000000;
	409	}
	410	*stv = tv;
	411
	412	td = curthread;
	413	p = td->td_proc;
	414
	415	if (CLKF_USERMODE(frame)) {
	416	/*
	417	* Came from userland, handle user time and deal with
	418	* possible process.
	419	*/
	420	if (p && (p->p_flag & P_PROFIL))
	421	addupc_intr(p, CLKF_PC(frame), 1);
	422	#if defined(SMP) && defined(BETTER_CLOCK)
	423	if (stathz != 0)
	424	forward_statclock(pscnt);
	425	#endif
	426	td->td_uticks += bump;
	427	if (--pscnt > 0)
	428	return;
	429
	430	/*
	431	* Charge the time as appropriate
	432	*/
	433	if (p && p->p_nice > NZERO)
	434	++cp_time[CP_NICE];
	435	else
	436	++cp_time[CP_USER];
	437	} else {
	438	#ifdef GPROF
	439	/*
	440	* Kernel statistics are just like addupc_intr, only easier.
	441	*/
	442	g = &_gmonparam;
	443	if (g->state == GMON_PROF_ON) {
	444	i = CLKF_PC(frame) - g->lowpc;
	445	if (i < g->textsize) {
	446	i /= HISTFRACTION * sizeof(*g->kcount);
	447	g->kcount[i]++;
	448	}
	449	}
	450	#endif
	451	#if defined(SMP) && defined(BETTER_CLOCK)
	452	if (stathz != 0)
	453	forward_statclock(pscnt);
	454	#endif
	455	/*
	456	* Came from kernel mode, so we were:
	457	* - handling an interrupt,
	458	* - doing syscall or trap work on behalf of the current
	459	* user process, or
	460	* - spinning in the idle loop.
	461	* Whichever it is, charge the time as appropriate.
	462	* Note that we charge interrupts to the current process,
	463	* regardless of whether they are ``for'' that process,
	464	* so that we know how much of its real time was spent
	465	* in ``non-process'' (i.e., interrupt) work.
	466	*/
	467	if (CLKF_INTR(frame))
	468	td->td_iticks += bump;
	469	else
	470	td->td_sticks += bump;
	471
	472	if (--pscnt > 0)
	473	return;
	474
	475	if (CLKF_INTR(frame)) {
	476	cp_time[CP_INTR]++;
	477	} else {
	478	if (td == &mycpu->gd_idlethread)
	479	++cp_time[CP_IDLE];
	480	else
	481	++cp_time[CP_SYS];
	482	}
	483	}
	484	pscnt = psdiv;
	485
	486	if (p != NULL) {
	487	schedclock(p);
	488
	489	/* Update resource usage integrals and maximums. */
	490	if ((pstats = p->p_stats) != NULL &&
	491	(ru = &pstats->p_ru) != NULL &&
	492	(vm = p->p_vmspace) != NULL) {
	493	ru->ru_ixrss += pgtok(vm->vm_tsize);
	494	ru->ru_idrss += pgtok(vm->vm_dsize);
	495	ru->ru_isrss += pgtok(vm->vm_ssize);
	496	rss = pgtok(vmspace_resident_count(vm));
	497	if (ru->ru_maxrss < rss)
	498	ru->ru_maxrss = rss;
	499	}
	500	}
	501	}
	502
	503	/*
	504	* Return information about system clocks.
	505	*/
	506	static int
	507	sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS)
	508	{
	509	struct clockinfo clkinfo;
	510	/*
	511	* Construct clockinfo structure.
	512	*/
	513	clkinfo.hz = hz;
	514	clkinfo.tick = tick;
	515	clkinfo.tickadj = tickadj;
	516	clkinfo.profhz = profhz;
	517	clkinfo.stathz = stathz ? stathz : hz;
	518	return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
	519	}
	520
	521	SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT\|CTLFLAG_RD,
	522	0, 0, sysctl_kern_clockrate, "S,clockinfo","");
	523
	524	static __inline unsigned
	525	tco_delta(struct timecounter *tc)
	526	{
	527
	528	return ((tc->tc_get_timecount(tc) - tc->tc_offset_count) &
	529	tc->tc_counter_mask);
	530	}
	531
	532	/*
	533	* We have eight functions for looking at the clock, four for
	534	* microseconds and four for nanoseconds. For each there is a fast
	535	* but less precise version "get{nano|micro}[up]time" which will
	536	* return a time which is up to 1/HZ previous to the call, whereas
	537	* the raw version "{nano|micro}[up]time" will return a timestamp
	538	* which is as precise as possible. The "up" variants return the
	539	* time relative to system boot; these are well suited for time
	540	* interval measurements.
	541	*/
	542
	543	void
	544	getmicrotime(struct timeval *tvp)
	545	{
	546	struct timecounter *tc;
	547
	548	if (!tco_method) {
	549	tc = timecounter;
	550	*tvp = tc->tc_microtime;
	551	} else {
	552	microtime(tvp);
	553	}
	554	}
	555
	556	void
	557	getnanotime(struct timespec *tsp)
	558	{
	559	struct timecounter *tc;
	560
	561	if (!tco_method) {
	562	tc = timecounter;
	563	*tsp = tc->tc_nanotime;
	564	} else {
	565	nanotime(tsp);
	566	}
	567	}
	568
	569	void
	570	microtime(struct timeval *tv)
	571	{
	572	struct timecounter *tc;
	573
	574	tc = timecounter;
	575	tv->tv_sec = tc->tc_offset_sec;
	576	tv->tv_usec = tc->tc_offset_micro;
	577	tv->tv_usec += ((u_int64_t)tco_delta(tc) * tc->tc_scale_micro) >> 32;
	578	tv->tv_usec += boottime.tv_usec;
	579	tv->tv_sec += boottime.tv_sec;
	580	while (tv->tv_usec < 0) {
	581	tv->tv_usec += 1000000;
	582	if (tv->tv_sec > 0)
	583	tv->tv_sec--;
	584	}
	585	while (tv->tv_usec >= 1000000) {
	586	tv->tv_usec -= 1000000;
	587	tv->tv_sec++;
	588	}
	589	}
	590
	591	void
	592	nanotime(struct timespec *ts)
	593	{
	594	unsigned count;
	595	u_int64_t delta;
	596	struct timecounter *tc;
	597
	598	tc = timecounter;
	599	ts->tv_sec = tc->tc_offset_sec;
	600	count = tco_delta(tc);
	601	delta = tc->tc_offset_nano;
	602	delta += ((u_int64_t)count * tc->tc_scale_nano_f);
	603	delta >>= 32;
	604	delta += ((u_int64_t)count * tc->tc_scale_nano_i);
	605	delta += boottime.tv_usec * 1000;
	606	ts->tv_sec += boottime.tv_sec;
	607	while (delta < 0) {
	608	delta += 1000000000;
	609	if (ts->tv_sec > 0)
	610	ts->tv_sec--;
	611	}
	612	while (delta >= 1000000000) {
	613	delta -= 1000000000;
	614	ts->tv_sec++;
	615	}
	616	ts->tv_nsec = delta;
	617	}
	618
	619	void
	620	getmicrouptime(struct timeval *tvp)
	621	{
	622	struct timecounter *tc;
	623
	624	if (!tco_method) {
	625	tc = timecounter;
	626	tvp->tv_sec = tc->tc_offset_sec;
	627	tvp->tv_usec = tc->tc_offset_micro;
	628	} else {
	629	microuptime(tvp);
	630	}
	631	}
	632
	633	void
	634	getnanouptime(struct timespec *tsp)
	635	{
	636	struct timecounter *tc;
	637
	638	if (!tco_method) {
	639	tc = timecounter;
	640	tsp->tv_sec = tc->tc_offset_sec;
	641	tsp->tv_nsec = tc->tc_offset_nano >> 32;
	642	} else {
	643	nanouptime(tsp);
	644	}
	645	}
	646
	647	void
	648	microuptime(struct timeval *tv)
	649	{
	650	struct timecounter *tc;
	651
	652	tc = timecounter;
	653	tv->tv_sec = tc->tc_offset_sec;
	654	tv->tv_usec = tc->tc_offset_micro;
	655	tv->tv_usec += ((u_int64_t)tco_delta(tc) * tc->tc_scale_micro) >> 32;
	656	while (tv->tv_usec < 0) {
	657	tv->tv_usec += 1000000;
	658	if (tv->tv_sec > 0)
	659	tv->tv_sec--;
	660	}
	661	while (tv->tv_usec >= 1000000) {
	662	tv->tv_usec -= 1000000;
	663	tv->tv_sec++;
	664	}
	665	}
	666
	667	void
	668	nanouptime(struct timespec *ts)
	669	{
	670	unsigned count;
	671	u_int64_t delta;
	672	struct timecounter *tc;
	673
	674	tc = timecounter;
	675	ts->tv_sec = tc->tc_offset_sec;
	676	count = tco_delta(tc);
	677	delta = tc->tc_offset_nano;
	678	delta += ((u_int64_t)count * tc->tc_scale_nano_f);
	679	delta >>= 32;
	680	delta += ((u_int64_t)count * tc->tc_scale_nano_i);
	681	while (delta < 0) {
	682	delta += 1000000000;
	683	if (ts->tv_sec > 0)
	684	ts->tv_sec--;
	685	}
	686	while (delta >= 1000000000) {
	687	delta -= 1000000000;
	688	ts->tv_sec++;
	689	}
	690	ts->tv_nsec = delta;
	691	}
	692
	693	static void
	694	tco_setscales(struct timecounter *tc)
	695	{
	696	u_int64_t scale;
	697
	698	scale = 1000000000LL << 32;
	699	scale += tc->tc_adjustment;
	700	scale /= tc->tc_tweak->tc_frequency;
	701	tc->tc_scale_micro = scale / 1000;
	702	tc->tc_scale_nano_f = scale & 0xffffffff;
	703	tc->tc_scale_nano_i = scale >> 32;
	704	}
	705
	/*
	 * Public entry point for recomputing the scale factors of tc; simply
	 * forwards to tco_setscales().
	 */
	void
	update_timecounter(struct timecounter *tc)
	{

		tco_setscales(tc);
	}
	711
	712	void
	713	init_timecounter(struct timecounter *tc)
	714	{
	715	struct timespec ts1;
	716	struct timecounter t1, t2, *t3;
	717	unsigned u;
	718	int i;
	719
	720	u = tc->tc_frequency / tc->tc_counter_mask;
	721	if (u > hz) {
	722	printf("Timecounter \"%s\" frequency %lu Hz"
	723	" -- Insufficient hz, needs at least %u\n",
	724	tc->tc_name, (u_long) tc->tc_frequency, u);
	725	return;
	726	}
	727
	728	tc->tc_adjustment = 0;
	729	tc->tc_tweak = tc;
	730	tco_setscales(tc);
	731	tc->tc_offset_count = tc->tc_get_timecount(tc);
	732	if (timecounter == &dummy_timecounter)
	733	tc->tc_avail = tc;
	734	else {
	735	tc->tc_avail = timecounter->tc_tweak->tc_avail;
	736	timecounter->tc_tweak->tc_avail = tc;
	737	}
	738	MALLOC(t1, struct timecounter , sizeof t1, M_TIMECOUNTER, M_WAITOK);
	739	tc->tc_other = t1;
	740	t1 = tc;
	741	t2 = t1;
	742	for (i = 1; i < NTIMECOUNTER; i++) {
	743	MALLOC(t3, struct timecounter , sizeof t3,
	744	M_TIMECOUNTER, M_WAITOK);
	745	t3 = tc;
	746	t3->tc_other = t2;
	747	t2 = t3;
	748	}
	749	t1->tc_other = t3;
	750	tc = t1;
	751
	752	printf("Timecounter \"%s\" frequency %lu Hz\n",
	753	tc->tc_name, (u_long)tc->tc_frequency);
	754
	755	/* XXX: For now always start using the counter. */
	756	tc->tc_offset_count = tc->tc_get_timecount(tc);
	757	nanouptime(&ts1);
	758	tc->tc_offset_nano = (u_int64_t)ts1.tv_nsec << 32;
	759	tc->tc_offset_micro = ts1.tv_nsec / 1000;
	760	tc->tc_offset_sec = ts1.tv_sec;
	761	timecounter = tc;
	762	}
	763
	764	void
	765	set_timecounter(struct timespec *ts)
	766	{
	767	struct timespec ts2;
	768
	769	nanouptime(&ts2);
	770	boottime.tv_sec = ts->tv_sec - ts2.tv_sec;
	771	boottime.tv_usec = (ts->tv_nsec - ts2.tv_nsec) / 1000;
	772	if (boottime.tv_usec < 0) {
	773	boottime.tv_usec += 1000000;
	774	boottime.tv_sec--;
	775	}
	776	/* fiddle all the little crinkly bits around the fiords... */
	777	tco_forward(1);
	778	}
	779
	780	static void
	781	switch_timecounter(struct timecounter *newtc)
	782	{
	783	int s;
	784	struct timecounter *tc;
	785	struct timespec ts;
	786
	787	s = splclock();
	788	tc = timecounter;
	789	if (newtc->tc_tweak == tc->tc_tweak) {
	790	splx(s);
	791	return;
	792	}
	793	newtc = newtc->tc_tweak->tc_other;
	794	nanouptime(&ts);
	795	newtc->tc_offset_sec = ts.tv_sec;
	796	newtc->tc_offset_nano = (u_int64_t)ts.tv_nsec << 32;
	797	newtc->tc_offset_micro = ts.tv_nsec / 1000;
	798	newtc->tc_offset_count = newtc->tc_get_timecount(newtc);
	799	tco_setscales(newtc);
	800	timecounter = newtc;
	801	splx(s);
	802	}
	803
	804	static struct timecounter *
	805	sync_other_counter(void)
	806	{
	807	struct timecounter tc, tcn, *tco;
	808	unsigned delta;
	809
	810	tco = timecounter;
	811	tc = tco->tc_other;
	812	tcn = tc->tc_other;
	813	tc = tco;
	814	tc->tc_other = tcn;
	815	delta = tco_delta(tc);
	816	tc->tc_offset_count += delta;
	817	tc->tc_offset_count &= tc->tc_counter_mask;
	818	tc->tc_offset_nano += (u_int64_t)delta * tc->tc_scale_nano_f;
	819	tc->tc_offset_nano += (u_int64_t)delta * tc->tc_scale_nano_i << 32;
	820	return (tc);
	821	}
	822
	823	static void
	824	tco_forward(int force)
	825	{
	826	struct timecounter tc, tco;
	827	struct timeval tvt;
	828
	829	tco = timecounter;
	830	tc = sync_other_counter();
	831	/*
	832	* We may be inducing a tiny error here, the tc_poll_pps() may
	833	* process a latched count which happens after the tco_delta()
	834	* in sync_other_counter(), which would extend the previous
	835	* counters parameters into the domain of this new one.
	836	* Since the timewindow is very small for this, the error is
	837	* going to be only a few weenieseconds (as Dave Mills would
	838	* say), so lets just not talk more about it, OK ?
	839	*/
	840	if (tco->tc_poll_pps)
	841	tco->tc_poll_pps(tco);
	842	if (timedelta != 0) {
	843	tvt = boottime;
	844	tvt.tv_usec += tickdelta;
	845	if (tvt.tv_usec >= 1000000) {
	846	tvt.tv_sec++;
	847	tvt.tv_usec -= 1000000;
	848	} else if (tvt.tv_usec < 0) {
	849	tvt.tv_sec--;
	850	tvt.tv_usec += 1000000;
	851	}
	852	boottime = tvt;
	853	timedelta -= tickdelta;
	854	}
	855
	856	while (tc->tc_offset_nano >= 1000000000ULL << 32) {
	857	tc->tc_offset_nano -= 1000000000ULL << 32;
	858	tc->tc_offset_sec++;
	859	ntp_update_second(tc); /* XXX only needed if xntpd runs */
	860	tco_setscales(tc);
	861	force++;
	862	}
	863
	864	if (tco_method && !force)
	865	return;
	866
	867	tc->tc_offset_micro = (tc->tc_offset_nano / 1000) >> 32;
	868
	869	/* Figure out the wall-clock time */
	870	tc->tc_nanotime.tv_sec = tc->tc_offset_sec + boottime.tv_sec;
	871	tc->tc_nanotime.tv_nsec =
	872	(tc->tc_offset_nano >> 32) + boottime.tv_usec * 1000;
	873	tc->tc_microtime.tv_usec = tc->tc_offset_micro + boottime.tv_usec;
	874	while (tc->tc_nanotime.tv_nsec >= 1000000000) {
	875	tc->tc_nanotime.tv_nsec -= 1000000000;
	876	tc->tc_microtime.tv_usec -= 1000000;
	877	tc->tc_nanotime.tv_sec++;
	878	}
	879	time_second = tc->tc_microtime.tv_sec = tc->tc_nanotime.tv_sec;
	880
	881	timecounter = tc;
	882	}
	883
	884	SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, "");
	885
	886	SYSCTL_INT(_kern_timecounter, OID_AUTO, method, CTLFLAG_RW, &tco_method, 0,
	887	"This variable determines the method used for updating timecounters. "
	888	"If the default algorithm (0) fails with \"calcru negative...\" messages "
	889	"try the alternate algorithm (1) which handles bad hardware better."
	890
	891	);
	892
	893	static int
	894	sysctl_kern_timecounter_hardware(SYSCTL_HANDLER_ARGS)
	895	{
	896	char newname[32];
	897	struct timecounter newtc, tc;
	898	int error;
	899
	900	tc = timecounter->tc_tweak;
	901	strncpy(newname, tc->tc_name, sizeof(newname));
	902	error = sysctl_handle_string(oidp, &newname[0], sizeof(newname), req);
	903	if (error == 0 && req->newptr != NULL &&
	904	strcmp(newname, tc->tc_name) != 0) {
	905	for (newtc = tc->tc_avail; newtc != tc;
	906	newtc = newtc->tc_avail) {
	907	if (strcmp(newname, newtc->tc_name) == 0) {
	908	/* Warm up new timecounter. */
	909	(void)newtc->tc_get_timecount(newtc);
	910
	911	switch_timecounter(newtc);
	912	return (0);
	913	}
	914	}
	915	return (EINVAL);
	916	}
	917	return (error);
	918	}
	919
	920	SYSCTL_PROC(_kern_timecounter, OID_AUTO, hardware, CTLTYPE_STRING \| CTLFLAG_RW,
	921	0, 0, sysctl_kern_timecounter_hardware, "A", "");
	922
	923
	924	int
	925	pps_ioctl(u_long cmd, caddr_t data, struct pps_state *pps)
	926	{
	927	pps_params_t *app;
	928	struct pps_fetch_args *fapi;
	929	#ifdef PPS_SYNC
	930	struct pps_kcbind_args *kapi;
	931	#endif
	932
	933	switch (cmd) {
	934	case PPS_IOC_CREATE:
	935	return (0);
	936	case PPS_IOC_DESTROY:
	937	return (0);
	938	case PPS_IOC_SETPARAMS:
	939	app = (pps_params_t *)data;
	940	if (app->mode & ~pps->ppscap)
	941	return (EINVAL);
	942	pps->ppsparam = *app;
	943	return (0);
	944	case PPS_IOC_GETPARAMS:
	945	app = (pps_params_t *)data;
	946	*app = pps->ppsparam;
	947	app->api_version = PPS_API_VERS_1;
	948	return (0);
	949	case PPS_IOC_GETCAP:
	950	(int)data = pps->ppscap;
	951	return (0);
	952	case PPS_IOC_FETCH:
	953	fapi = (struct pps_fetch_args *)data;
	954	if (fapi->tsformat && fapi->tsformat != PPS_TSFMT_TSPEC)
	955	return (EINVAL);
	956	if (fapi->timeout.tv_sec \|\| fapi->timeout.tv_nsec)
	957	return (EOPNOTSUPP);
	958	pps->ppsinfo.current_mode = pps->ppsparam.mode;
	959	fapi->pps_info_buf = pps->ppsinfo;
	960	return (0);
	961	case PPS_IOC_KCBIND:
	962	#ifdef PPS_SYNC
	963	kapi = (struct pps_kcbind_args *)data;
	964	/* XXX Only root should be able to do this */
	965	if (kapi->tsformat && kapi->tsformat != PPS_TSFMT_TSPEC)
	966	return (EINVAL);
	967	if (kapi->kernel_consumer != PPS_KC_HARDPPS)
	968	return (EINVAL);
	969	if (kapi->edge & ~pps->ppscap)
	970	return (EINVAL);
	971	pps->kcmode = kapi->edge;
	972	return (0);
	973	#else
	974	return (EOPNOTSUPP);
	975	#endif
	976	default:
	977	return (ENOTTY);
	978	}
	979	}
	980
	981	void
	982	pps_init(struct pps_state *pps)
	983	{
	984	pps->ppscap \|= PPS_TSFMT_TSPEC;
	985	if (pps->ppscap & PPS_CAPTUREASSERT)
	986	pps->ppscap \|= PPS_OFFSETASSERT;
	987	if (pps->ppscap & PPS_CAPTURECLEAR)
	988	pps->ppscap \|= PPS_OFFSETCLEAR;
	989	}
	990
	991	void
	992	pps_event(struct pps_state pps, struct timecounter tc, unsigned count, int event)
	993	{
	994	struct timespec ts, tsp, osp;
	995	u_int64_t delta;
	996	unsigned tcount, *pcount;
	997	int foff, fhard;
	998	pps_seq_t *pseq;
	999
	1000	/* Things would be easier with arrays... */
	1001	if (event == PPS_CAPTUREASSERT) {
	1002	tsp = &pps->ppsinfo.assert_timestamp;
	1003	osp = &pps->ppsparam.assert_offset;
	1004	foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
	1005	fhard = pps->kcmode & PPS_CAPTUREASSERT;
	1006	pcount = &pps->ppscount[0];
	1007	pseq = &pps->ppsinfo.assert_sequence;
	1008	} else {
	1009	tsp = &pps->ppsinfo.clear_timestamp;
	1010	osp = &pps->ppsparam.clear_offset;
	1011	foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
	1012	fhard = pps->kcmode & PPS_CAPTURECLEAR;
	1013	pcount = &pps->ppscount[1];
	1014	pseq = &pps->ppsinfo.clear_sequence;
	1015	}
	1016
	1017	/* The timecounter changed: bail */
	1018	if (!pps->ppstc \|\|
	1019	pps->ppstc->tc_name != tc->tc_name \|\|
	1020	tc->tc_name != timecounter->tc_name) {
	1021	pps->ppstc = tc;
	1022	*pcount = count;
	1023	return;
	1024	}
	1025
	1026	/* Nothing really happened */
	1027	if (*pcount == count)
	1028	return;
	1029
	1030	*pcount = count;
	1031
	1032	/* Convert the count to timespec */
	1033	ts.tv_sec = tc->tc_offset_sec;
	1034	tcount = count - tc->tc_offset_count;
	1035	tcount &= tc->tc_counter_mask;
	1036	delta = tc->tc_offset_nano;
	1037	delta += ((u_int64_t)tcount * tc->tc_scale_nano_f);
	1038	delta >>= 32;
	1039	delta += ((u_int64_t)tcount * tc->tc_scale_nano_i);
	1040	delta += boottime.tv_usec * 1000;
	1041	ts.tv_sec += boottime.tv_sec;
	1042	while (delta >= 1000000000) {
	1043	delta -= 1000000000;
	1044	ts.tv_sec++;
	1045	}
	1046	ts.tv_nsec = delta;
	1047
	1048	(*pseq)++;
	1049	*tsp = ts;
	1050
	1051	if (foff) {
	1052	timespecadd(tsp, osp);
	1053	if (tsp->tv_nsec < 0) {
	1054	tsp->tv_nsec += 1000000000;
	1055	tsp->tv_sec -= 1;
	1056	}
	1057	}
	1058	#ifdef PPS_SYNC
	1059	if (fhard) {
	1060	/* magic, at its best... */
	1061	tcount = count - pps->ppscount[2];
	1062	pps->ppscount[2] = count;
	1063	tcount &= tc->tc_counter_mask;
	1064	delta = ((u_int64_t)tcount * tc->tc_tweak->tc_scale_nano_f);
	1065	delta >>= 32;
	1066	delta += ((u_int64_t)tcount * tc->tc_tweak->tc_scale_nano_i);
	1067	hardpps(tsp, delta);
	1068	}
	1069	#endif
	1070	}