Commit | Line | Data |
---|---|---|
8c10bfcf MD |
1 | /* |
2 | * Copyright (c) 2003,2004 The DragonFly Project. All rights reserved. | |
bbf175be | 3 | * |
8c10bfcf MD |
4 | * This code is derived from software contributed to The DragonFly Project |
5 | * by Matthew Dillon <dillon@backplane.com> | |
bbf175be | 6 | * |
8c10bfcf MD |
7 | * Redistribution and use in source and binary forms, with or without |
8 | * modification, are permitted provided that the following conditions | |
9 | * are met: | |
bbf175be | 10 | * |
8c10bfcf MD |
11 | * 1. Redistributions of source code must retain the above copyright |
12 | * notice, this list of conditions and the following disclaimer. | |
13 | * 2. Redistributions in binary form must reproduce the above copyright | |
14 | * notice, this list of conditions and the following disclaimer in | |
15 | * the documentation and/or other materials provided with the | |
16 | * distribution. | |
17 | * 3. Neither the name of The DragonFly Project nor the names of its | |
18 | * contributors may be used to endorse or promote products derived | |
19 | * from this software without specific, prior written permission. | |
bbf175be | 20 | * |
8c10bfcf MD |
21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
22 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | |
24 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE | |
25 | * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, | |
26 | * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, | |
27 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
28 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED | |
29 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
30 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT | |
31 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
32 | * SUCH DAMAGE. | |
bbf175be | 33 | * |
984263bc MD |
34 | * Copyright (c) 1997, 1998 Poul-Henning Kamp <phk@FreeBSD.org> |
35 | * Copyright (c) 1982, 1986, 1991, 1993 | |
36 | * The Regents of the University of California. All rights reserved. | |
37 | * (c) UNIX System Laboratories, Inc. | |
38 | * All or some portions of this file are derived from material licensed | |
39 | * to the University of California by American Telephone and Telegraph | |
40 | * Co. or Unix System Laboratories, Inc. and are reproduced herein with | |
41 | * the permission of UNIX System Laboratories, Inc. | |
42 | * | |
43 | * Redistribution and use in source and binary forms, with or without | |
44 | * modification, are permitted provided that the following conditions | |
45 | * are met: | |
46 | * 1. Redistributions of source code must retain the above copyright | |
47 | * notice, this list of conditions and the following disclaimer. | |
48 | * 2. Redistributions in binary form must reproduce the above copyright | |
49 | * notice, this list of conditions and the following disclaimer in the | |
50 | * documentation and/or other materials provided with the distribution. | |
dc71b7ab | 51 | * 3. Neither the name of the University nor the names of its contributors |
984263bc MD |
52 | * may be used to endorse or promote products derived from this software |
53 | * without specific prior written permission. | |
54 | * | |
55 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
56 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
57 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
58 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
59 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
60 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
61 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
62 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
63 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
64 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
65 | * SUCH DAMAGE. | |
66 | * | |
67 | * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 | |
68 | * $FreeBSD: src/sys/kern/kern_clock.c,v 1.105.2.10 2002/10/17 13:19:40 maxim Exp $ | |
69 | */ | |
70 | ||
71 | #include "opt_ntp.h" | |
07522099 | 72 | #include "opt_pctrack.h" |
984263bc MD |
73 | |
74 | #include <sys/param.h> | |
75 | #include <sys/systm.h> | |
984263bc MD |
76 | #include <sys/callout.h> |
77 | #include <sys/kernel.h> | |
f5d21610 | 78 | #include <sys/kinfo.h> |
984263bc MD |
79 | #include <sys/proc.h> |
80 | #include <sys/malloc.h> | |
d70eef28 | 81 | #include <sys/resource.h> |
984263bc MD |
82 | #include <sys/resourcevar.h> |
83 | #include <sys/signalvar.h> | |
2b3f93ea | 84 | #include <sys/caps.h> |
984263bc MD |
85 | #include <sys/timex.h> |
86 | #include <sys/timepps.h> | |
0adbcbd6 | 87 | #include <sys/upmap.h> |
984263bc | 88 | #include <sys/lock.h> |
77bc82e1 MD |
89 | #include <sys/sysctl.h> |
90 | #include <sys/kcollect.h> | |
8f7f5bd5 | 91 | #include <sys/exislock.h> |
fac0eb3c | 92 | #include <sys/exislock2.h> |
77bc82e1 MD |
93 | |
94 | #include <vm/vm.h> | |
984263bc MD |
95 | #include <vm/pmap.h> |
96 | #include <vm/vm_map.h> | |
5ffd1608 | 97 | #include <vm/vm_extern.h> |
684a93c4 | 98 | |
2689779e | 99 | #include <sys/thread2.h> |
a55bb12d | 100 | #include <sys/spinlock2.h> |
984263bc MD |
101 | |
102 | #include <machine/cpu.h> | |
103 | #include <machine/limits.h> | |
104 | #include <machine/smp.h> | |
d2412a2e MD |
105 | #include <machine/cpufunc.h> |
106 | #include <machine/specialreg.h> | |
107 | #include <machine/clock.h> | |
984263bc | 108 | |
07522099 MD |
109 | #ifdef DEBUG_PCTRACK |
110 | static void do_pctrack(struct intrframe *frame, int which); | |
111 | #endif | |
112 | ||
402ed7e1 | 113 | static void initclocks (void *dummy); |
f3f3eadb | 114 | SYSINIT(clocks, SI_BOOT2_CLOCKS, SI_ORDER_FIRST, initclocks, NULL); |
984263bc | 115 | |
6ad39cae MD |
116 | /* |
117 | * Some of these don't belong here, but it's easiest to concentrate them. | |
9eea7f0c | 118 | * Note that cpu_time counts in microseconds, but most userland programs |
6ad39cae MD |
119 | * just compare relative times against the total by delta. |
120 | */ | |
9eea7f0c | 121 | struct kinfo_cputime cputime_percpu[MAXCPU]; |
07522099 MD |
122 | #ifdef DEBUG_PCTRACK |
123 | struct kinfo_pcheader cputime_pcheader = { PCTRACK_SIZE, PCTRACK_ARYSIZE }; | |
124 | struct kinfo_pctrack cputime_pctrack[MAXCPU][PCTRACK_SIZE]; | |
125 | #endif | |
126 | ||
63823918 MD |
/*
 * Tunables consulted by the kern.cputime handler (sysctl_cputime()):
 *
 * sniff_enable	- when non-zero, a privileged read of kern.cputime
 *		  finishes by calling smp_sniff() or cpu_sniff().
 * sniff_target	- selects what is sniffed: a negative value calls
 *		  smp_sniff(), a value in [0, ncpus) calls cpu_sniff() on
 *		  that cpu, anything else does nothing.
 * clock_debug2	- debug knob; NOTE(review): its consumer is not in this
 *		  part of the file.
 */
__read_mostly static int sniff_enable = 1;
__read_mostly static int sniff_target = -1;
__read_mostly static int clock_debug2 = 0;
SYSCTL_INT(_kern, OID_AUTO, sniff_enable, CTLFLAG_RW, &sniff_enable, 0 , "");
SYSCTL_INT(_kern, OID_AUTO, sniff_target, CTLFLAG_RW, &sniff_target, 0 , "");
SYSCTL_INT(_debug, OID_AUTO, clock_debug2, CTLFLAG_RW, &clock_debug2, 0 , "");
67534613 | 133 | |
8f7f5bd5 MD |
134 | __read_mostly long pseudo_ticks = 1; /* existential timed locks */ |
135 | ||
9eea7f0c HP |
136 | static int |
137 | sysctl_cputime(SYSCTL_HANDLER_ARGS) | |
138 | { | |
139 | int cpu, error = 0; | |
82f8b550 | 140 | int root_error; |
9eea7f0c | 141 | size_t size = sizeof(struct kinfo_cputime); |
e32d3244 | 142 | struct kinfo_cputime tmp; |
9eea7f0c | 143 | |
82f8b550 MD |
144 | /* |
145 | * NOTE: For security reasons, only root can sniff %rip | |
146 | */ | |
2b3f93ea | 147 | root_error = caps_priv_check_self(SYSCAP_RESTRICTEDROOT); |
82f8b550 | 148 | |
9eea7f0c | 149 | for (cpu = 0; cpu < ncpus; ++cpu) { |
e32d3244 | 150 | tmp = cputime_percpu[cpu]; |
82f8b550 MD |
151 | if (root_error == 0) { |
152 | tmp.cp_sample_pc = | |
153 | (int64_t)globaldata_find(cpu)->gd_sample_pc; | |
154 | tmp.cp_sample_sp = | |
155 | (int64_t)globaldata_find(cpu)->gd_sample_sp; | |
156 | } | |
e32d3244 | 157 | if ((error = SYSCTL_OUT(req, &tmp, size)) != 0) |
9eea7f0c HP |
158 | break; |
159 | } | |
82f8b550 | 160 | |
67534613 MD |
161 | if (root_error == 0) { |
162 | if (sniff_enable) { | |
163 | int n = sniff_target; | |
164 | if (n < 0) | |
165 | smp_sniff(); | |
166 | else if (n < ncpus) | |
167 | cpu_sniff(n); | |
168 | } | |
169 | } | |
984263bc | 170 | |
9eea7f0c HP |
171 | return (error); |
172 | } | |
173 | SYSCTL_PROC(_kern, OID_AUTO, cputime, (CTLTYPE_OPAQUE|CTLFLAG_RD), 0, 0, | |
174 | sysctl_cputime, "S,kinfo_cputime", "CPU time statistics"); | |
984263bc | 175 | |
06636a8e AHJ |
176 | static int |
177 | sysctl_cp_time(SYSCTL_HANDLER_ARGS) | |
178 | { | |
5c13d0f3 | 179 | long cpu_states[CPUSTATES] = {0}; |
06636a8e AHJ |
180 | int cpu, error = 0; |
181 | size_t size = sizeof(cpu_states); | |
182 | ||
183 | for (cpu = 0; cpu < ncpus; ++cpu) { | |
d70eef28 SG |
184 | cpu_states[CP_USER] += cputime_percpu[cpu].cp_user; |
185 | cpu_states[CP_NICE] += cputime_percpu[cpu].cp_nice; | |
186 | cpu_states[CP_SYS] += cputime_percpu[cpu].cp_sys; | |
187 | cpu_states[CP_INTR] += cputime_percpu[cpu].cp_intr; | |
188 | cpu_states[CP_IDLE] += cputime_percpu[cpu].cp_idle; | |
06636a8e AHJ |
189 | } |
190 | ||
191 | error = SYSCTL_OUT(req, cpu_states, size); | |
192 | ||
193 | return (error); | |
194 | } | |
195 | ||
196 | SYSCTL_PROC(_kern, OID_AUTO, cp_time, (CTLTYPE_LONG|CTLFLAG_RD), 0, 0, | |
4276b194 SW |
197 | sysctl_cp_time, "LU", "CPU time statistics"); |
198 | ||
199 | static int | |
200 | sysctl_cp_times(SYSCTL_HANDLER_ARGS) | |
201 | { | |
202 | long cpu_states[CPUSTATES] = {0}; | |
203 | int cpu, error; | |
204 | size_t size = sizeof(cpu_states); | |
205 | ||
206 | for (error = 0, cpu = 0; error == 0 && cpu < ncpus; ++cpu) { | |
207 | cpu_states[CP_USER] = cputime_percpu[cpu].cp_user; | |
208 | cpu_states[CP_NICE] = cputime_percpu[cpu].cp_nice; | |
209 | cpu_states[CP_SYS] = cputime_percpu[cpu].cp_sys; | |
210 | cpu_states[CP_INTR] = cputime_percpu[cpu].cp_intr; | |
211 | cpu_states[CP_IDLE] = cputime_percpu[cpu].cp_idle; | |
212 | error = SYSCTL_OUT(req, cpu_states, size); | |
213 | } | |
214 | ||
215 | return (error); | |
216 | } | |
217 | ||
218 | SYSCTL_PROC(_kern, OID_AUTO, cp_times, (CTLTYPE_LONG|CTLFLAG_RD), 0, 0, | |
219 | sysctl_cp_times, "LU", "per-CPU time statistics"); | |
06636a8e | 220 | |
88c4d2f6 MD |
221 | /* |
222 | * boottime is used to calculate the 'real' uptime. Do not confuse this with | |
223 | * microuptime(). microtime() is not drift compensated. The real uptime | |
60b2809b MD |
224 | * with compensation is nanotime() - boottime. boottime is recalculated |
225 | * whenever the real time is set based on the compensated elapsed time | |
226 | * in seconds (gd->gd_time_seconds). | |
88c4d2f6 | 227 | * |
88c4d2f6 MD |
228 | * The gd_time_seconds and gd_cpuclock_base fields remain fairly monotonic. |
229 | * Slight adjustments to gd_cpuclock_base are made to phase-lock it to | |
230 | * the real time. | |
3dc002ae | 231 | * |
1fceee21 | 232 | * WARNING! time_second can backstep on time corrections. Also, unlike |
2ed58723 | 233 | * time_second, time_uptime is not a "real" time_t (seconds |
1fceee21 | 234 | * since the Epoch) but seconds since booting. |
88c4d2f6 | 235 | */ |
2ff21866 | 236 | __read_mostly struct timespec boottime; /* boot time (realtime) for ref only */ |
63823918 MD |
237 | __read_mostly struct timespec ticktime0;/* updated every tick */ |
238 | __read_mostly struct timespec ticktime2;/* updated every tick */ | |
239 | __read_mostly int ticktime_update; | |
2ff21866 MD |
240 | __read_mostly time_t time_second; /* read-only 'passive' rt in seconds */ |
241 | __read_mostly time_t time_uptime; /* read-only 'passive' ut in seconds */ | |
984263bc | 242 | |
5eb5a6bc MD |
243 | /* |
244 | * basetime is used to calculate the compensated real time of day. The | |
bbf175be | 245 | * basetime can be modified on a per-tick basis by the adjtime(), |
5eb5a6bc MD |
246 | * ntp_adjtime(), and sysctl-based time correction APIs. |
247 | * | |
248 | * Note that frequency corrections can also be made by adjusting | |
249 | * gd_cpuclock_base. | |
250 | * | |
251 | * basetime is a tail-chasing FIFO, updated only by cpu #0. The FIFO is | |
252 | * used on both SMP and UP systems to avoid MP races between cpu's and | |
253 | * interrupt races on UP systems. | |
254 | */ | |
2ed58723 MD |
/*
 * Snapshot of cpu #0's time base.  Entries live in the hardtime[] FIFO,
 * which has the same size as basetime[] and is selected with
 * basetime_index: in hardclock() cpu #0 stores its gd_time_seconds and
 * gd_cpuclock_base into the next slot, and every other cpu loads the slot
 * at basetime_index into its own globaldata.
 */
struct hardtime {
	__uint32_t time_second;		/* copy of cpu #0 gd_time_seconds */
	sysclock_t cpuclock_base;	/* copy of cpu #0 gd_cpuclock_base */
};
259 | ||
5eb5a6bc MD |
260 | #define BASETIME_ARYSIZE 16 |
261 | #define BASETIME_ARYMASK (BASETIME_ARYSIZE - 1) | |
262 | static struct timespec basetime[BASETIME_ARYSIZE]; | |
2ed58723 | 263 | static struct hardtime hardtime[BASETIME_ARYSIZE]; |
5eb5a6bc MD |
264 | static volatile int basetime_index; |
265 | ||
266 | static int | |
267 | sysctl_get_basetime(SYSCTL_HANDLER_ARGS) | |
268 | { | |
269 | struct timespec *bt; | |
270 | int error; | |
35238fa5 | 271 | int index; |
5eb5a6bc | 272 | |
35238fa5 MD |
273 | /* |
274 | * Because basetime data and index may be updated by another cpu, | |
275 | * a load fence is required to ensure that the data we read has | |
276 | * not been speculatively read relative to a possibly updated index. | |
277 | */ | |
278 | index = basetime_index; | |
279 | cpu_lfence(); | |
280 | bt = &basetime[index]; | |
08f95c49 | 281 | error = SYSCTL_OUT(req, bt, sizeof(*bt)); |
5eb5a6bc MD |
282 | return (error); |
283 | } | |
284 | ||
984263bc | 285 | SYSCTL_STRUCT(_kern, KERN_BOOTTIME, boottime, CTLFLAG_RD, |
08f95c49 | 286 | &boottime, timespec, "System boottime"); |
5eb5a6bc | 287 | SYSCTL_PROC(_kern, OID_AUTO, basetime, CTLTYPE_STRUCT|CTLFLAG_RD, 0, 0, |
08f95c49 | 288 | sysctl_get_basetime, "S,timespec", "System basetime"); |
984263bc | 289 | |
96d52ac8 SZ |
290 | static void hardclock(systimer_t info, int, struct intrframe *frame); |
291 | static void statclock(systimer_t info, int, struct intrframe *frame); | |
292 | static void schedclock(systimer_t info, int, struct intrframe *frame); | |
5eb5a6bc | 293 | static void getnanotime_nbt(struct timespec *nbt, struct timespec *tsp); |
88c4d2f6 | 294 | |
2ff21866 MD |
295 | /* |
296 | * Use __read_mostly for ticks and sched_ticks because these variables are | |
297 | * used all over the kernel and only updated once per tick. | |
298 | */ | |
6d3dff5f | 299 | __read_mostly sbintime_t sbticks; /* system master ticks at hz (64bit) */ |
2ff21866 MD |
300 | __read_mostly int ticks; /* system master ticks at hz */ |
301 | __read_mostly int sched_ticks; /* global schedule clock ticks */ | |
302 | __read_mostly int clocks_running; /* tsleep/timeout clocks operational */ | |
88c4d2f6 MD |
303 | int64_t nsec_adj; /* ntpd per-tick adjustment in nsec << 32 */ |
304 | int64_t nsec_acc; /* accumulator */ | |
984263bc | 305 | |
4026c000 JS |
306 | /* NTPD time correction fields */ |
307 | int64_t ntp_tick_permanent; /* per-tick adjustment in nsec << 32 */ | |
308 | int64_t ntp_tick_acc; /* accumulator for per-tick adjustment */ | |
309 | int64_t ntp_delta; /* one-time correction in nsec */ | |
310 | int64_t ntp_big_delta = 1000000000; | |
311 | int32_t ntp_tick_delta; /* current adjustment rate */ | |
312 | int32_t ntp_default_tick_delta; /* adjustment rate for ntp_delta */ | |
48590578 JS |
313 | time_t ntp_leap_second; /* time of next leap second */ |
314 | int ntp_leap_insert; /* whether to insert or remove a second */ | |
a55bb12d | 315 | struct spinlock ntp_spin; |
4026c000 | 316 | |
984263bc | 317 | /* |
88c4d2f6 | 318 | * Finish initializing clock frequencies and start all clocks running. |
984263bc | 319 | */ |
88c4d2f6 MD |
320 | /* ARGSUSED*/ |
321 | static void | |
322 | initclocks(void *dummy) | |
984263bc | 323 | { |
88c4d2f6 | 324 | /*psratio = profhz / stathz;*/ |
a55bb12d | 325 | spin_init(&ntp_spin, "ntp"); |
88c4d2f6 | 326 | initclocks_pcpu(); |
da3639ef | 327 | clocks_running = 1; |
0adbcbd6 | 328 | if (kpmap) { |
5b49787b | 329 | kpmap->tsc_freq = tsc_frequency; |
0adbcbd6 MD |
330 | kpmap->tick_freq = hz; |
331 | } | |
984263bc MD |
332 | } |
333 | ||
88c4d2f6 | 334 | /* |
1997b4c2 MD |
335 | * Called on a per-cpu basis from the idle thread bootstrap on each cpu |
336 | * during SMP initialization. | |
337 | * | |
338 | * This routine is called concurrently during low-level SMP initialization | |
339 | * and may not block in any way. Meaning, among other things, we can't | |
340 | * acquire any tokens. | |
88c4d2f6 MD |
341 | */ |
342 | void | |
343 | initclocks_pcpu(void) | |
344 | { | |
345 | struct globaldata *gd = mycpu; | |
984263bc | 346 | |
88c4d2f6 MD |
347 | crit_enter(); |
348 | if (gd->gd_cpuid == 0) { | |
349 | gd->gd_time_seconds = 1; | |
044ee7c4 | 350 | gd->gd_cpuclock_base = sys_cputimer->count(); |
2ed58723 MD |
351 | hardtime[0].time_second = gd->gd_time_seconds; |
352 | hardtime[0].cpuclock_base = gd->gd_cpuclock_base; | |
88c4d2f6 | 353 | } else { |
88c4d2f6 MD |
354 | gd->gd_time_seconds = globaldata_find(0)->gd_time_seconds; |
355 | gd->gd_cpuclock_base = globaldata_find(0)->gd_cpuclock_base; | |
356 | } | |
0d1dffdf | 357 | |
43adde98 SZ |
358 | systimer_intr_enable(); |
359 | ||
1997b4c2 MD |
360 | crit_exit(); |
361 | } | |
362 | ||
77bc82e1 MD |
363 | /* |
364 | * Called on a 10-second interval after the system is operational. | |
365 | * Return the collection data for USERPCT and install the data for | |
366 | * SYSTPCT and IDLEPCT. | |
367 | */ | |
368 | static | |
369 | uint64_t | |
370 | collect_cputime_callback(int n) | |
371 | { | |
372 | static long cpu_base[CPUSTATES]; | |
373 | long cpu_states[CPUSTATES]; | |
374 | long total; | |
375 | long acc; | |
376 | long lsb; | |
377 | ||
378 | bzero(cpu_states, sizeof(cpu_states)); | |
379 | for (n = 0; n < ncpus; ++n) { | |
380 | cpu_states[CP_USER] += cputime_percpu[n].cp_user; | |
381 | cpu_states[CP_NICE] += cputime_percpu[n].cp_nice; | |
382 | cpu_states[CP_SYS] += cputime_percpu[n].cp_sys; | |
383 | cpu_states[CP_INTR] += cputime_percpu[n].cp_intr; | |
384 | cpu_states[CP_IDLE] += cputime_percpu[n].cp_idle; | |
385 | } | |
386 | ||
387 | acc = 0; | |
388 | for (n = 0; n < CPUSTATES; ++n) { | |
389 | total = cpu_states[n] - cpu_base[n]; | |
390 | cpu_base[n] = cpu_states[n]; | |
391 | cpu_states[n] = total; | |
392 | acc += total; | |
393 | } | |
394 | if (acc == 0) /* prevent degenerate divide by 0 */ | |
395 | acc = 1; | |
396 | lsb = acc / (10000 * 2); | |
397 | kcollect_setvalue(KCOLLECT_SYSTPCT, | |
398 | (cpu_states[CP_SYS] + lsb) * 10000 / acc); | |
399 | kcollect_setvalue(KCOLLECT_IDLEPCT, | |
400 | (cpu_states[CP_IDLE] + lsb) * 10000 / acc); | |
401 | kcollect_setvalue(KCOLLECT_INTRPCT, | |
402 | (cpu_states[CP_INTR] + lsb) * 10000 / acc); | |
403 | return((cpu_states[CP_USER] + cpu_states[CP_NICE] + lsb) * 10000 / acc); | |
404 | } | |
405 | ||
1997b4c2 MD |
406 | /* |
407 | * This routine is called on just the BSP, just after SMP initialization | |
408 | * completes to finish initializing any clocks that might contend/block |
409 | * (e.g. like on a token). We can't do this in initclocks_pcpu() because | |
410 | * that function is called from the idle thread bootstrap for each cpu and | |
411 | * not allowed to block at all. | |
412 | */ | |
413 | static | |
414 | void | |
415 | initclocks_other(void *dummy) | |
416 | { | |
417 | struct globaldata *ogd = mycpu; | |
418 | struct globaldata *gd; | |
419 | int n; | |
420 | ||
421 | for (n = 0; n < ncpus; ++n) { | |
422 | lwkt_setcpu_self(globaldata_find(n)); | |
423 | gd = mycpu; | |
424 | ||
425 | /* | |
426 | * Use a non-queued periodic systimer to prevent multiple | |
427 | * ticks from building up if the sysclock jumps forward | |
428 | * (8254 gets reset). The sysclock will never jump backwards. | |
429 | * Our time sync is based on the actual sysclock, not the | |
430 | * ticks count. | |
c91894e0 MD |
431 | * |
432 | * Install statclock before hardclock to prevent statclock | |
433 | * from misinterpreting gd_flags for tick assignment when | |
880fb308 MD |
434 | * they overlap. Also offset the statclock by half of |
435 | * its interval to try to avoid being coincident with | |
436 | * callouts. | |
1997b4c2 | 437 | */ |
c6a766f4 MD |
438 | systimer_init_periodic_flags(&gd->gd_statclock, statclock, |
439 | NULL, stathz, | |
880fb308 | 440 | SYSTF_MSSYNC | SYSTF_FIRST | |
91dc43dd | 441 | SYSTF_OFFSET50 | SYSTF_OFFSETCPU); |
c6a766f4 | 442 | systimer_init_periodic_flags(&gd->gd_hardclock, hardclock, |
91dc43dd MD |
443 | NULL, hz, |
444 | SYSTF_MSSYNC | SYSTF_OFFSETCPU); | |
1997b4c2 MD |
445 | } |
446 | lwkt_setcpu_self(ogd); | |
77bc82e1 MD |
447 | |
448 | /* | |
449 | * Regular data collection | |
450 | */ | |
451 | kcollect_register(KCOLLECT_USERPCT, "user", collect_cputime_callback, | |
452 | KCOLLECT_SCALE(KCOLLECT_USERPCT_FORMAT, 0)); | |
453 | kcollect_register(KCOLLECT_SYSTPCT, "syst", NULL, | |
454 | KCOLLECT_SCALE(KCOLLECT_SYSTPCT_FORMAT, 0)); | |
455 | kcollect_register(KCOLLECT_IDLEPCT, "idle", NULL, | |
456 | KCOLLECT_SCALE(KCOLLECT_IDLEPCT_FORMAT, 0)); | |
88c4d2f6 | 457 | } |
f3f3eadb | 458 | SYSINIT(clocks2, SI_BOOT2_POST_SMP, SI_ORDER_ANY, initclocks_other, NULL); |
984263bc | 459 | |
39799749 IV |
460 | /* |
461 | * This method is called on just the BSP, after all the usched implementations | |
462 | * are initialized. This avoids races between usched initialization functions | |
463 | * and usched_schedulerclock(). | |
464 | */ | |
465 | static | |
466 | void | |
467 | initclocks_usched(void *dummy) | |
468 | { | |
469 | struct globaldata *ogd = mycpu; | |
470 | struct globaldata *gd; | |
471 | int n; | |
472 | ||
473 | for (n = 0; n < ncpus; ++n) { | |
474 | lwkt_setcpu_self(globaldata_find(n)); | |
475 | gd = mycpu; | |
476 | ||
477 | /* XXX correct the frequency for scheduler / estcpu tests */ | |
478 | systimer_init_periodic_flags(&gd->gd_schedclock, schedclock, | |
4341238d MD |
479 | NULL, ESTCPUFREQ, |
480 | SYSTF_MSSYNC | SYSTF_OFFSETCPU); | |
39799749 IV |
481 | } |
482 | lwkt_setcpu_self(ogd); | |
483 | } | |
484 | SYSINIT(clocks3, SI_BOOT2_USCHED, SI_ORDER_ANY, initclocks_usched, NULL); | |
485 | ||
984263bc | 486 | /* |
88c4d2f6 MD |
487 | * This sets the current real time of day. Timespecs are in seconds and |
488 | * nanoseconds. We do not mess with gd_time_seconds and gd_cpuclock_base, | |
489 | * instead we adjust basetime so basetime + gd_* results in the current | |
317c3bd2 | 490 | * time of day. This way the gd_* fields are guaranteed to represent |
88c4d2f6 | 491 | * a monotonically increasing 'uptime' value. |
5eb5a6bc MD |
492 | * |
493 | * When set_timeofday() is called from userland, the system call forces it | |
494 | * onto cpu #0 since only cpu #0 can update basetime_index. | |
984263bc | 495 | */ |
88c4d2f6 MD |
void
set_timeofday(struct timespec *ts)
{
	struct timespec *nbt;
	int ni;

	/*
	 * XXX SMP / non-atomic basetime updates
	 */
	crit_enter();

	/*
	 * Build the new basetime in the FIFO slot following the published
	 * one; readers keep using the old slot until basetime_index is
	 * advanced below.
	 */
	ni = (basetime_index + 1) & BASETIME_ARYMASK;
	cpu_lfence();
	nbt = &basetime[ni];

	/*
	 * new basetime = requested realtime - current uptime.  The slot
	 * first receives the uptime and is then converted in place, with
	 * a borrow from tv_sec when the nanoseconds go negative.
	 */
	nanouptime(nbt);
	nbt->tv_sec = ts->tv_sec - nbt->tv_sec;
	nbt->tv_nsec = ts->tv_nsec - nbt->tv_nsec;
	if (nbt->tv_nsec < 0) {
		nbt->tv_nsec += 1000000000;
		--nbt->tv_sec;
	}

	/*
	 * Note that basetime diverges from boottime as the clock drift is
	 * compensated for, so we cannot do away with boottime.  When setting
	 * the absolute time of day the drift is 0 (for an instant) and we
	 * can simply set boottime from the new basetime (seconds only).
	 *
	 * Note that nanouptime() is based on gd_time_seconds which is drift
	 * compensated up to a point (it is guaranteed to remain monotonically
	 * increasing).  gd_time_seconds is thus our best uptime guess and
	 * suitable for use in the boottime calculation.  It is already taken
	 * into account in the basetime calculation above.
	 */
	spin_lock(&ntp_spin);
	boottime.tv_sec = nbt->tv_sec;
	ntp_delta = 0;		/* drop any pending one-time correction */

	/*
	 * We now have a new basetime, make sure all other cpus have it,
	 * then update the index.
	 */
	cpu_sfence();
	basetime_index = ni;
	spin_unlock(&ntp_spin);

	crit_exit();
}
bbf175be | 543 | |
984263bc | 544 | /* |
4871f0f4 | 545 | * Each cpu has its own hardclock, but we only increment ticks and softticks |
88c4d2f6 MD |
546 | * on cpu #0. |
547 | * | |
548 | * NOTE! systimer! the MP lock might not be held here. We can only safely | |
549 | * manipulate objects owned by the current cpu. | |
984263bc | 550 | */ |
984263bc | 551 | static void |
e76d2ad3 | 552 | hardclock(systimer_t info, int in_ipi, struct intrframe *frame) |
984263bc | 553 | { |
88c4d2f6 MD |
554 | sysclock_t cputicks; |
555 | struct proc *p; | |
88c4d2f6 | 556 | struct globaldata *gd = mycpu; |
984263bc | 557 | |
e76d2ad3 SZ |
558 | if ((gd->gd_reqflags & RQF_IPIQ) == 0 && lwkt_need_ipiq_process(gd)) { |
559 | /* Defer to doreti on passive IPIQ processing */ | |
560 | need_ipiq(); | |
561 | } | |
562 | ||
984263bc | 563 | /* |
2ed58723 MD |
564 | * We update the compensation base to calculate fine-grained time |
565 | * from the sys_cputimer on a per-cpu basis in order to avoid | |
566 | * having to mess around with locks. sys_cputimer is assumed to | |
567 | * be consistent across all cpus. CPU N copies the base state from | |
568 | * CPU 0 using the same FIFO trick that we use for basetime (so we | |
569 | * don't catch a CPU 0 update in the middle). | |
88c4d2f6 MD |
570 | * |
571 | * Note that we never allow info->time (aka gd->gd_hardclock.time) | |
fad57d0e MD |
572 | * to reverse index gd_cpuclock_base, but that it is possible for |
573 | * it to temporarily get behind in the seconds if something in the | |
574 | * system locks interrupts for a long period of time. Since periodic | |
575 | * timers count events, though everything should resynch again | |
576 | * immediately. | |
984263bc | 577 | */ |
2ed58723 MD |
578 | if (gd->gd_cpuid == 0) { |
579 | int ni; | |
580 | ||
581 | cputicks = info->time - gd->gd_cpuclock_base; | |
582 | if (cputicks >= sys_cputimer->freq) { | |
583 | cputicks /= sys_cputimer->freq; | |
584 | if (cputicks != 0 && cputicks != 1) | |
585 | kprintf("Warning: hardclock missed > 1 sec\n"); | |
586 | gd->gd_time_seconds += cputicks; | |
587 | gd->gd_cpuclock_base += sys_cputimer->freq * cputicks; | |
588 | /* uncorrected monotonic 1-sec gran */ | |
589 | time_uptime += cputicks; | |
590 | } | |
591 | ni = (basetime_index + 1) & BASETIME_ARYMASK; | |
592 | hardtime[ni].time_second = gd->gd_time_seconds; | |
593 | hardtime[ni].cpuclock_base = gd->gd_cpuclock_base; | |
594 | } else { | |
595 | int ni; | |
596 | ||
597 | ni = basetime_index; | |
598 | cpu_lfence(); | |
599 | gd->gd_time_seconds = hardtime[ni].time_second; | |
600 | gd->gd_cpuclock_base = hardtime[ni].cpuclock_base; | |
88c4d2f6 | 601 | } |
984263bc MD |
602 | |
603 | /* | |
92b561b7 MD |
604 | * The system-wide ticks counter and NTP related timedelta/tickdelta |
605 | * adjustments only occur on cpu #0. NTP adjustments are accomplished | |
606 | * by updating basetime. | |
984263bc | 607 | */ |
88c4d2f6 | 608 | if (gd->gd_cpuid == 0) { |
5eb5a6bc | 609 | struct timespec *nbt; |
88c4d2f6 MD |
610 | struct timespec nts; |
611 | int leap; | |
5eb5a6bc | 612 | int ni; |
984263bc | 613 | |
63823918 MD |
614 | /* |
615 | * Update system-wide ticks | |
616 | */ | |
88c4d2f6 | 617 | ++ticks; |
6d3dff5f | 618 | ++sbticks; |
984263bc | 619 | |
63823918 MD |
620 | /* |
621 | * Update system-wide ticktime for getnanotime() and getmicrotime() | |
622 | */ | |
623 | nanotime(&nts); | |
624 | atomic_add_int_nonlocked(&ticktime_update, 1); | |
625 | cpu_sfence(); | |
626 | if (ticktime_update & 2) | |
627 | ticktime2 = nts; | |
628 | else | |
629 | ticktime0 = nts; | |
630 | cpu_sfence(); | |
631 | atomic_add_int_nonlocked(&ticktime_update, 1); | |
632 | ||
88c4d2f6 | 633 | #if 0 |
bbf175be | 634 | if (tco->tc_poll_pps) |
88c4d2f6 MD |
635 | tco->tc_poll_pps(tco); |
636 | #endif | |
5eb5a6bc | 637 | |
88c4d2f6 | 638 | /* |
5eb5a6bc MD |
639 | * Calculate the new basetime index. We are in a critical section |
640 | * on cpu #0 and can safely play with basetime_index. Start | |
641 | * with the current basetime and then make adjustments. | |
642 | */ | |
643 | ni = (basetime_index + 1) & BASETIME_ARYMASK; | |
644 | nbt = &basetime[ni]; | |
645 | *nbt = basetime[basetime_index]; | |
646 | ||
a55bb12d MD |
647 | /* |
648 | * ntp adjustments only occur on cpu 0 and are protected by | |
649 | * ntp_spin. This spinlock virtually never conflicts. | |
650 | */ | |
651 | spin_lock(&ntp_spin); | |
652 | ||
5eb5a6bc MD |
653 | /* |
654 | * Apply adjtime corrections. (adjtime() API) | |
655 | * | |
656 | * adjtime() only runs on cpu #0 so our critical section is | |
657 | * sufficient to access these variables. | |
88c4d2f6 | 658 | */ |
4026c000 | 659 | if (ntp_delta != 0) { |
5eb5a6bc | 660 | nbt->tv_nsec += ntp_tick_delta; |
4026c000 JS |
661 | ntp_delta -= ntp_tick_delta; |
662 | if ((ntp_delta > 0 && ntp_delta < ntp_tick_delta) || | |
663 | (ntp_delta < 0 && ntp_delta > ntp_tick_delta)) { | |
5eb5a6bc | 664 | ntp_tick_delta = ntp_delta; |
4026c000 JS |
665 | } |
666 | } | |
667 | ||
5eb5a6bc MD |
668 | /* |
669 | * Apply permanent frequency corrections. (sysctl API) | |
670 | */ | |
4026c000 JS |
671 | if (ntp_tick_permanent != 0) { |
672 | ntp_tick_acc += ntp_tick_permanent; | |
673 | if (ntp_tick_acc >= (1LL << 32)) { | |
5eb5a6bc | 674 | nbt->tv_nsec += ntp_tick_acc >> 32; |
331bc6f8 | 675 | ntp_tick_acc -= (ntp_tick_acc >> 32) << 32; |
4026c000 | 676 | } else if (ntp_tick_acc <= -(1LL << 32)) { |
331bc6f8 | 677 | /* Negate ntp_tick_acc to avoid shifting the sign bit. */ |
5eb5a6bc | 678 | nbt->tv_nsec -= (-ntp_tick_acc) >> 32; |
331bc6f8 | 679 | ntp_tick_acc += ((-ntp_tick_acc) >> 32) << 32; |
4026c000 JS |
680 | } |
681 | } | |
682 | ||
5eb5a6bc MD |
683 | if (nbt->tv_nsec >= 1000000000) { |
684 | nbt->tv_sec++; | |
685 | nbt->tv_nsec -= 1000000000; | |
686 | } else if (nbt->tv_nsec < 0) { | |
687 | nbt->tv_sec--; | |
688 | nbt->tv_nsec += 1000000000; | |
88c4d2f6 MD |
689 | } |
690 | ||
691 | /* | |
5eb5a6bc | 692 | * Another per-tick compensation. (for ntp_adjtime() API) |
88c4d2f6 | 693 | */ |
5eb5a6bc | 694 | if (nsec_adj != 0) { |
88c4d2f6 MD |
695 | nsec_acc += nsec_adj; |
696 | if (nsec_acc >= 0x100000000LL) { | |
5eb5a6bc | 697 | nbt->tv_nsec += nsec_acc >> 32; |
88c4d2f6 MD |
698 | nsec_acc = (nsec_acc & 0xFFFFFFFFLL); |
699 | } else if (nsec_acc <= -0x100000000LL) { | |
5eb5a6bc | 700 | nbt->tv_nsec -= -nsec_acc >> 32; |
88c4d2f6 MD |
701 | nsec_acc = -(-nsec_acc & 0xFFFFFFFFLL); |
702 | } | |
5eb5a6bc MD |
703 | if (nbt->tv_nsec >= 1000000000) { |
704 | nbt->tv_nsec -= 1000000000; | |
705 | ++nbt->tv_sec; | |
706 | } else if (nbt->tv_nsec < 0) { | |
707 | nbt->tv_nsec += 1000000000; | |
708 | --nbt->tv_sec; | |
709 | } | |
710 | } | |
a55bb12d | 711 | spin_unlock(&ntp_spin); |
5eb5a6bc MD |
712 | |
713 | /************************************************************ | |
714 | * LEAP SECOND CORRECTION * | |
715 | ************************************************************ | |
716 | * | |
717 | * Taking into account all the corrections made above, figure | |
718 | * out the new real time. If the seconds field has changed | |
719 | * then apply any pending leap-second corrections. | |
720 | */ | |
721 | getnanotime_nbt(nbt, &nts); | |
722 | ||
32040d57 MD |
723 | if (time_second != nts.tv_sec) { |
724 | /* | |
725 | * Apply leap second (sysctl API). Adjust nts for changes | |
726 | * so we do not have to call getnanotime_nbt again. | |
727 | */ | |
728 | if (ntp_leap_second) { | |
729 | if (ntp_leap_second == nts.tv_sec) { | |
730 | if (ntp_leap_insert) { | |
731 | nbt->tv_sec++; | |
732 | nts.tv_sec++; | |
733 | } else { | |
734 | nbt->tv_sec--; | |
735 | nts.tv_sec--; | |
736 | } | |
5eb5a6bc | 737 | ntp_leap_second--; |
32040d57 | 738 | } |
88c4d2f6 | 739 | } |
88c4d2f6 | 740 | |
32040d57 MD |
741 | /* |
742 | * Apply leap second (ntp_adjtime() API), calculate a new | |
743 | * nsec_adj field. ntp_update_second() returns nsec_adj | |
744 | * as a per-second value but we need it as a per-tick value. | |
745 | */ | |
88c4d2f6 | 746 | leap = ntp_update_second(time_second, &nsec_adj); |
88c4d2f6 | 747 | nsec_adj /= hz; |
32040d57 MD |
748 | nbt->tv_sec += leap; |
749 | nts.tv_sec += leap; | |
750 | ||
751 | /* | |
752 | * Update the time_second 'approximate time' global. | |
753 | */ | |
754 | time_second = nts.tv_sec; | |
4871f0f4 MD |
755 | |
756 | /* | |
757 | * Clear the IPC hint for the currently running thread once | |
758 | * per second, allowing us to disconnect the hint from a | |
759 | * thread which may no longer care. | |
760 | */ | |
761 | curthread->td_wakefromcpu = -1; | |
88c4d2f6 | 762 | } |
5eb5a6bc MD |
763 | |
764 | /* | |
765 | * Finally, our new basetime is ready to go live! | |
766 | */ | |
35238fa5 | 767 | cpu_sfence(); |
5eb5a6bc | 768 | basetime_index = ni; |
0adbcbd6 MD |
769 | |
770 | /* | |
12081e87 MD |
771 | * Update kpmap on each tick. TS updates are integrated with |
772 | * fences and upticks allowing userland to read the data | |
773 | * deterministically. | |
0adbcbd6 MD |
774 | */ |
775 | if (kpmap) { | |
12081e87 MD |
776 | int w; |
777 | ||
778 | w = (kpmap->upticks + 1) & 1; | |
779 | getnanouptime(&kpmap->ts_uptime[w]); | |
780 | getnanotime(&kpmap->ts_realtime[w]); | |
781 | cpu_sfence(); | |
782 | ++kpmap->upticks; | |
783 | cpu_sfence(); | |
0adbcbd6 | 784 | } |
8f7f5bd5 MD |
785 | |
786 | /* | |
787 | * Handle exislock pseudo_ticks. We make things as simple as | |
788 | * possible for the critical path arming code by adding a little | |
789 | * complication here. | |
790 | * | |
791 | * When we find that all cores have been armed, we increment | |
792 | * pseudo_ticks and disarm all the cores. | |
793 | */ | |
794 | { | |
795 | globaldata_t gd; | |
796 | int n; | |
797 | ||
798 | for (n = 0; n < ncpus; ++n) { | |
799 | gd = globaldata_find(n); | |
800 | if (gd->gd_exisarmed == 0) | |
801 | break; | |
802 | } | |
803 | ||
804 | if (n == ncpus) { | |
805 | for (n = 0; n < ncpus; ++n) { | |
806 | gd = globaldata_find(n); | |
807 | gd->gd_exisarmed = 0; | |
808 | } | |
809 | ++pseudo_ticks; | |
810 | } | |
811 | } | |
88c4d2f6 MD |
812 | } |
813 | ||
f9235b6d MD |
814 | /* |
815 | * lwkt thread scheduler fair queueing | |
816 | */ | |
85946b6c | 817 | lwkt_schedulerclock(curthread); |
f9235b6d | 818 | |
8f7f5bd5 MD |
819 | /* |
820 | * Cycle the existential lock system on odd ticks in order to re-arm | |
821 | * our cpu (in case the cpu is idle or nobody is using any exis locks). | |
822 | */ | |
823 | if (ticks & 1) { | |
824 | exis_hold_gd(gd); | |
825 | exis_drop_gd(gd); | |
826 | } | |
827 | ||
92b561b7 MD |
828 | /* |
829 | * softticks are handled for all cpus | |
830 | */ | |
831 | hardclock_softtick(gd); | |
832 | ||
5ba14d44 | 833 | /* |
75979118 | 834 | * Rollup accumulated vmstats, copy-back for critical path checks. |
5ba14d44 MD |
835 | */ |
836 | vmstats_rollup_cpu(gd); | |
bf3f67a7 | 837 | vfscache_rollup_cpu(gd); |
75979118 | 838 | mycpu->gd_vmstats = vmstats; |
5ba14d44 | 839 | |
88c4d2f6 | 840 | /* |
8582ec21 MD |
841 | * ITimer handling is per-tick, per-cpu. |
842 | * | |
843 | * We must acquire the per-process token in order for ksignal() | |
898e34b3 MD |
844 | * to be non-blocking. For the moment this requires an AST fault, |
845 | * the ksignal() cannot be safely issued from this hard interrupt. | |
846 | * | |
847 | * XXX Even the trytoken here isn't right, and itimer operation in | |
848 | * a multi threaded environment is going to be weird at the | |
849 | * very least. | |
88c4d2f6 | 850 | */ |
8582ec21 | 851 | if ((p = curproc) != NULL && lwkt_trytoken(&p->p_token)) { |
3dbbd6dd | 852 | crit_enter_hard(); |
0adbcbd6 MD |
853 | if (p->p_upmap) |
854 | ++p->p_upmap->runticks; | |
855 | ||
88c4d2f6 | 856 | if (frame && CLKF_USERMODE(frame) && |
93328593 | 857 | timevalisset(&p->p_timer[ITIMER_VIRTUAL].it_value) && |
898e34b3 | 858 | itimerdecr(&p->p_timer[ITIMER_VIRTUAL], ustick) == 0) { |
4643740a | 859 | p->p_flags |= P_SIGVTALRM; |
898e34b3 MD |
860 | need_user_resched(); |
861 | } | |
93328593 | 862 | if (timevalisset(&p->p_timer[ITIMER_PROF].it_value) && |
898e34b3 | 863 | itimerdecr(&p->p_timer[ITIMER_PROF], ustick) == 0) { |
4643740a | 864 | p->p_flags |= P_SIGPROF; |
898e34b3 MD |
865 | need_user_resched(); |
866 | } | |
3dbbd6dd | 867 | crit_exit_hard(); |
8582ec21 | 868 | lwkt_reltoken(&p->p_token); |
984263bc | 869 | } |
604e1e09 | 870 | setdelayed(); |
88c4d2f6 | 871 | } |
984263bc | 872 | |
88c4d2f6 MD |
873 | /* |
874 | * The statistics clock typically runs at a 125Hz rate, and is intended | |
875 | * to be frequency offset from the hardclock (typ 100Hz). It is per-cpu. | |
876 | * | |
877 | * NOTE! systimer! the MP lock might not be held here. We can only safely | |
878 | * manipulate objects owned by the current cpu. | |
879 | * | |
880 | * The stats clock is responsible for grabbing a profiling sample. | |
881 | * Most of the statistics are only used by user-level statistics programs. | |
882 | * The main exceptions are p->p_uticks, p->p_sticks, p->p_iticks, and | |
883 | * p->p_estcpu. | |
884 | * | |
885 | * Like the other clocks, the stat clock is called from what is effectively | |
886 | * a fast interrupt, so the context should be the thread/process that got | |
887 | * interrupted. | |
888 | */ | |
889 | static void | |
96d52ac8 | 890 | statclock(systimer_t info, int in_ipi, struct intrframe *frame) |
88c4d2f6 | 891 | { |
c91894e0 | 892 | globaldata_t gd = mycpu; |
88c4d2f6 MD |
893 | thread_t td; |
894 | struct proc *p; | |
895 | int bump; | |
1997b4c2 MD |
896 | sysclock_t cv; |
897 | sysclock_t scv; | |
984263bc MD |
898 | |
899 | /* | |
1997b4c2 MD |
900 | * How big was our timeslice relative to the last time? Calculate |
901 | * in microseconds. | |
902 | * | |
903 | * NOTE: Use of microuptime() is typically MPSAFE, but usually not | |
904 | * during early boot. Just use the systimer count to be nice | |
905 | * to e.g. qemu. The systimer has a better chance of being | |
906 | * MPSAFE at early boot. | |
984263bc | 907 | */ |
1997b4c2 | 908 | cv = sys_cputimer->count(); |
c91894e0 | 909 | scv = gd->statint.gd_statcv; |
1997b4c2 MD |
910 | if (scv == 0) { |
911 | bump = 1; | |
912 | } else { | |
8fbc264d MD |
913 | bump = muldivu64(sys_cputimer->freq64_usec, |
914 | (cv - scv), 1L << 32); | |
1997b4c2 MD |
915 | if (bump < 0) |
916 | bump = 0; | |
917 | if (bump > 1000000) | |
918 | bump = 1000000; | |
919 | } | |
c91894e0 | 920 | gd->statint.gd_statcv = cv; |
1997b4c2 MD |
921 | |
922 | #if 0 | |
c91894e0 | 923 | stv = &gd->gd_stattv; |
88c4d2f6 MD |
924 | if (stv->tv_sec == 0) { |
925 | bump = 1; | |
926 | } else { | |
927 | bump = tv.tv_usec - stv->tv_usec + | |
928 | (tv.tv_sec - stv->tv_sec) * 1000000; | |
929 | if (bump < 0) | |
930 | bump = 0; | |
931 | if (bump > 1000000) | |
932 | bump = 1000000; | |
933 | } | |
934 | *stv = tv; | |
1997b4c2 | 935 | #endif |
984263bc | 936 | |
88c4d2f6 MD |
937 | td = curthread; |
938 | p = td->td_proc; | |
984263bc | 939 | |
63823918 MD |
940 | /* |
941 | * If this is an interrupt thread used for the clock interrupt, adjust | |
942 | * td to the thread it is preempting. If a frame is available, it will | |
943 | * be related to the thread being preempted. | |
944 | */ | |
945 | if ((td->td_flags & TDF_CLKTHREAD) && td->td_preempted) | |
946 | td = td->td_preempted; | |
947 | ||
88c4d2f6 MD |
948 | if (frame && CLKF_USERMODE(frame)) { |
949 | /* | |
950 | * Came from userland, handle user time and deal with | |
951 | * possible process. | |
952 | */ | |
4643740a | 953 | if (p && (p->p_flags & P_PROFIL)) |
88c4d2f6 MD |
954 | addupc_intr(p, CLKF_PC(frame), 1); |
955 | td->td_uticks += bump; | |
984263bc | 956 | |
88c4d2f6 MD |
957 | /* |
958 | * Charge the time as appropriate | |
959 | */ | |
960 | if (p && p->p_nice > NZERO) | |
9eea7f0c | 961 | cpu_time.cp_nice += bump; |
88c4d2f6 | 962 | else |
9eea7f0c | 963 | cpu_time.cp_user += bump; |
88c4d2f6 | 964 | } else { |
c91894e0 | 965 | int intr_nest = gd->gd_intr_nesting_level; |
96d52ac8 SZ |
966 | |
967 | if (in_ipi) { | |
968 | /* | |
969 | * IPI processing code will bump gd_intr_nesting_level | |
970 | * up by one, which breaks following CLKF_INTR testing, | |
317c3bd2 | 971 | * so we subtract it by one here. |
96d52ac8 SZ |
972 | */ |
973 | --intr_nest; | |
974 | } | |
6026c54d | 975 | |
88c4d2f6 MD |
976 | /* |
977 | * Came from kernel mode, so we were: | |
978 | * - handling an interrupt, | |
979 | * - doing syscall or trap work on behalf of the current | |
980 | * user process, or | |
981 | * - spinning in the idle loop. | |
982 | * Whichever it is, charge the time as appropriate. | |
983 | * Note that we charge interrupts to the current process, | |
984 | * regardless of whether they are ``for'' that process, | |
985 | * so that we know how much of its real time was spent | |
986 | * in ``non-process'' (i.e., interrupt) work. | |
987 | * | |
bbf175be | 988 | * XXX assume system if frame is NULL. A NULL frame |
e43a034f | 989 | * can occur if ipi processing is done from a crit_exit(). |
88c4d2f6 | 990 | */ |
63823918 MD |
991 | if ((frame && CLKF_INTR(intr_nest)) || |
992 | cpu_interrupt_running(td)) { | |
e2b92533 MD |
993 | /* |
994 | * If we interrupted an interrupt thread, well, | |
995 | * count it as interrupt time. | |
996 | */ | |
c91894e0 | 997 | td->td_iticks += bump; |
07522099 | 998 | #ifdef DEBUG_PCTRACK |
6026c54d SZ |
999 | if (frame) |
1000 | do_pctrack(frame, PCTRACK_INT); | |
07522099 | 1001 | #endif |
9eea7f0c | 1002 | cpu_time.cp_intr += bump; |
c91894e0 MD |
1003 | } else if (gd->gd_flags & GDF_VIRTUSER) { |
1004 | /* | |
1005 | * The vkernel doesn't do a good job providing trap | |
1006 | * frames that we can test. If the GDF_VIRTUSER | |
1007 | * flag is set we probably interrupted user mode. | |
1008 | */ | |
1009 | td->td_uticks += bump; | |
1010 | ||
1011 | /* | |
1012 | * Charge the time as appropriate | |
1013 | */ | |
1014 | if (p && p->p_nice > NZERO) | |
1015 | cpu_time.cp_nice += bump; | |
1016 | else | |
1017 | cpu_time.cp_user += bump; | |
88c4d2f6 | 1018 | } else { |
63823918 MD |
1019 | if (clock_debug2 > 0) { |
1020 | --clock_debug2; | |
1021 | kprintf("statclock preempt %s (%p %p)\n", td->td_comm, td, &gd->gd_idlethread); | |
1022 | } | |
c91894e0 MD |
1023 | td->td_sticks += bump; |
1024 | if (td == &gd->gd_idlethread) { | |
e2b92533 | 1025 | /* |
c6a766f4 MD |
1026 | * We want to count token contention as |
1027 | * system time. When token contention occurs | |
1028 | * the cpu may only be outside its critical | |
1029 | * section while switching through the idle | |
1030 | * thread. In this situation, various flags | |
1031 | * will be set in gd_reqflags. | |
63823918 MD |
1032 | * |
1033 | * INTPEND is not necessarily useful because | |
1034 | * it will be set if the clock interrupt | |
1035 | * happens to be on an interrupt thread, the | |
1036 | * cpu_interrupt_running() call does a better | |
1037 | * job so we've already handled it. | |
e2b92533 | 1038 | */ |
63823918 MD |
1039 | if (gd->gd_reqflags & |
1040 | (RQF_IDLECHECK_WK_MASK & ~RQF_INTPEND)) { | |
76f1911e | 1041 | cpu_time.cp_sys += bump; |
63823918 | 1042 | } else { |
76f1911e | 1043 | cpu_time.cp_idle += bump; |
63823918 | 1044 | } |
07522099 | 1045 | } else { |
e2b92533 MD |
1046 | /* |
1047 | * System thread was running. | |
1048 | */ | |
07522099 MD |
1049 | #ifdef DEBUG_PCTRACK |
1050 | if (frame) | |
1051 | do_pctrack(frame, PCTRACK_SYS); | |
1052 | #endif | |
9eea7f0c | 1053 | cpu_time.cp_sys += bump; |
07522099 | 1054 | } |
88c4d2f6 MD |
1055 | } |
1056 | } | |
1057 | } | |
1058 | ||
07522099 MD |
1059 | #ifdef DEBUG_PCTRACK |
1060 | /* | |
1061 | * Sample the PC when in the kernel or in an interrupt. User code can | |
1062 | * retrieve the information and generate a histogram or other output. | |
1063 | */ | |
1064 | ||
1065 | static void | |
1066 | do_pctrack(struct intrframe *frame, int which) | |
1067 | { | |
1068 | struct kinfo_pctrack *pctrack; | |
1069 | ||
1070 | pctrack = &cputime_pctrack[mycpu->gd_cpuid][which]; | |
bbf175be | 1071 | pctrack->pc_array[pctrack->pc_index & PCTRACK_ARYMASK] = |
07522099 MD |
1072 | (void *)CLKF_PC(frame); |
1073 | ++pctrack->pc_index; | |
1074 | } | |
1075 | ||
1076 | static int | |
1077 | sysctl_pctrack(SYSCTL_HANDLER_ARGS) | |
1078 | { | |
1079 | struct kinfo_pcheader head; | |
1080 | int error; | |
1081 | int cpu; | |
1082 | int ntrack; | |
1083 | ||
1084 | head.pc_ntrack = PCTRACK_SIZE; | |
1085 | head.pc_arysize = PCTRACK_ARYSIZE; | |
1086 | ||
1087 | if ((error = SYSCTL_OUT(req, &head, sizeof(head))) != 0) | |
1088 | return (error); | |
1089 | ||
1090 | for (cpu = 0; cpu < ncpus; ++cpu) { | |
1091 | for (ntrack = 0; ntrack < PCTRACK_SIZE; ++ntrack) { | |
1092 | error = SYSCTL_OUT(req, &cputime_pctrack[cpu][ntrack], | |
1093 | sizeof(struct kinfo_pctrack)); | |
1094 | if (error) | |
1095 | break; | |
1096 | } | |
1097 | if (error) | |
1098 | break; | |
1099 | } | |
1100 | return (error); | |
1101 | } | |
1102 | SYSCTL_PROC(_kern, OID_AUTO, pctrack, (CTLTYPE_OPAQUE|CTLFLAG_RD), 0, 0, | |
1103 | sysctl_pctrack, "S,kinfo_pcheader", "CPU PC tracking"); | |
1104 | ||
1105 | #endif | |
1106 | ||
88c4d2f6 | 1107 | /* |
dcc99b62 | 1108 | * The scheduler clock typically runs at a 50Hz rate. NOTE! systimer, |
88c4d2f6 MD |
1109 | * the MP lock might not be held. We can safely manipulate parts of curproc |
1110 | * but that's about it. | |
dcc99b62 MD |
1111 | * |
1112 | * Each cpu has its own scheduler clock. | |
88c4d2f6 MD |
1113 | */ |
1114 | static void | |
96d52ac8 | 1115 | schedclock(systimer_t info, int in_ipi __unused, struct intrframe *frame) |
88c4d2f6 | 1116 | { |
553ea3c8 | 1117 | struct lwp *lp; |
88c4d2f6 MD |
1118 | struct rusage *ru; |
1119 | struct vmspace *vm; | |
1120 | long rss; | |
1121 | ||
553ea3c8 | 1122 | if ((lp = lwkt_preempted_proc()) != NULL) { |
dcc99b62 MD |
1123 | /* |
1124 | * Account for cpu time used and hit the scheduler. Note | |
1125 | * that this call MUST BE MP SAFE, and the BGL IS NOT HELD | |
1126 | * HERE. | |
1127 | */ | |
553ea3c8 | 1128 | ++lp->lwp_cpticks; |
de4d4cb0 MD |
1129 | usched_schedulerclock(lp, info->periodic, info->time); |
1130 | } else { | |
1131 | usched_schedulerclock(NULL, info->periodic, info->time); | |
dcc99b62 | 1132 | } |
553ea3c8 | 1133 | if ((lp = curthread->td_lwp) != NULL) { |
dcc99b62 MD |
1134 | /* |
1135 | * Update resource usage integrals and maximums. | |
1136 | */ | |
fde7ac71 | 1137 | if ((ru = &lp->lwp_proc->p_ru) && |
553ea3c8 | 1138 | (vm = lp->lwp_proc->p_vmspace) != NULL) { |
4b566556 MD |
1139 | ru->ru_ixrss += pgtok(btoc(vm->vm_tsize)); |
1140 | ru->ru_idrss += pgtok(btoc(vm->vm_dsize)); | |
1141 | ru->ru_isrss += pgtok(btoc(vm->vm_ssize)); | |
b12defdc MD |
1142 | if (lwkt_trytoken(&vm->vm_map.token)) { |
1143 | rss = pgtok(vmspace_resident_count(vm)); | |
1144 | if (ru->ru_maxrss < rss) | |
1145 | ru->ru_maxrss = rss; | |
1146 | lwkt_reltoken(&vm->vm_map.token); | |
1147 | } | |
88c4d2f6 | 1148 | } |
b68b7282 | 1149 | } |
d6d39bc7 MC |
1150 | /* Increment the global sched_ticks */ |
1151 | if (mycpu->gd_cpuid == 0) | |
1152 | ++sched_ticks; | |
984263bc MD |
1153 | } |
1154 | ||
1155 | /* | |
bbf175be | 1156 | * Compute number of ticks for the specified amount of time. The |
a94976ad | 1157 | * return value is intended to be used in a clock interrupt timed |
317c3bd2 | 1158 | * operation and guaranteed to meet or exceed the requested time. |
a94976ad MD |
1159 | * If the representation overflows, return INT_MAX. The minimum return |
1160 | * value is 1 ticks and the function will average the calculation up. | |
1161 | * If any value greater then 0 microseconds is supplied, a value | |
1162 | * of at least 2 will be returned to ensure that a near-term clock | |
1163 | * interrupt does not cause the timeout to occur (degenerately) early. | |
1164 | * | |
1165 | * Note that limit checks must take into account microseconds, which is | |
1166 | * done simply by using the smaller signed long maximum instead of | |
1167 | * the unsigned long maximum. | |
1168 | * | |
1169 | * If ints have 32 bits, then the maximum value for any timeout in | |
1170 | * 10ms ticks is 248 days. | |
984263bc MD |
1171 | */ |
1172 | int | |
a94976ad | 1173 | tvtohz_high(struct timeval *tv) |
984263bc | 1174 | { |
a94976ad | 1175 | int ticks; |
1fd87d54 | 1176 | long sec, usec; |
984263bc | 1177 | |
984263bc MD |
1178 | sec = tv->tv_sec; |
1179 | usec = tv->tv_usec; | |
1180 | if (usec < 0) { | |
1181 | sec--; | |
1182 | usec += 1000000; | |
1183 | } | |
1184 | if (sec < 0) { | |
1185 | #ifdef DIAGNOSTIC | |
1186 | if (usec > 0) { | |
1187 | sec++; | |
1188 | usec -= 1000000; | |
1189 | } | |
a591f597 MD |
1190 | kprintf("tvtohz_high: negative time difference " |
1191 | "%ld sec %ld usec\n", | |
1192 | sec, usec); | |
984263bc MD |
1193 | #endif |
1194 | ticks = 1; | |
a94976ad | 1195 | } else if (sec <= INT_MAX / hz) { |
6e875644 | 1196 | ticks = (int)(sec * hz + howmany((u_long)usec, ustick)) + 1; |
a94976ad MD |
1197 | } else { |
1198 | ticks = INT_MAX; | |
1199 | } | |
1200 | return (ticks); | |
1201 | } | |
1202 | ||
a591f597 MD |
1203 | int |
1204 | tstohz_high(struct timespec *ts) | |
1205 | { | |
1206 | int ticks; | |
1207 | long sec, nsec; | |
1208 | ||
1209 | sec = ts->tv_sec; | |
1210 | nsec = ts->tv_nsec; | |
1211 | if (nsec < 0) { | |
1212 | sec--; | |
1213 | nsec += 1000000000; | |
1214 | } | |
1215 | if (sec < 0) { | |
1216 | #ifdef DIAGNOSTIC | |
1217 | if (nsec > 0) { | |
1218 | sec++; | |
1219 | nsec -= 1000000000; | |
1220 | } | |
1221 | kprintf("tstohz_high: negative time difference " | |
1222 | "%ld sec %ld nsec\n", | |
1223 | sec, nsec); | |
1224 | #endif | |
1225 | ticks = 1; | |
1226 | } else if (sec <= INT_MAX / hz) { | |
6e875644 | 1227 | ticks = (int)(sec * hz + howmany((u_long)nsec, nstick)) + 1; |
a591f597 MD |
1228 | } else { |
1229 | ticks = INT_MAX; | |
1230 | } | |
1231 | return (ticks); | |
1232 | } | |
1233 | ||
1234 | ||
a94976ad MD |
1235 | /* |
1236 | * Compute number of ticks for the specified amount of time, erroring on | |
1237 | * the side of it being too low to ensure that sleeping the returned number | |
1238 | * of ticks will not result in a late return. | |
1239 | * | |
1240 | * The supplied timeval may not be negative and should be normalized. A | |
1241 | * return value of 0 is possible if the timeval converts to less then | |
1242 | * 1 tick. | |
1243 | * | |
1244 | * If ints have 32 bits, then the maximum value for any timeout in | |
1245 | * 10ms ticks is 248 days. | |
1246 | */ | |
1247 | int | |
1248 | tvtohz_low(struct timeval *tv) | |
1249 | { | |
1250 | int ticks; | |
1251 | long sec; | |
1252 | ||
1253 | sec = tv->tv_sec; | |
1254 | if (sec <= INT_MAX / hz) | |
a591f597 | 1255 | ticks = (int)(sec * hz + (u_long)tv->tv_usec / ustick); |
984263bc | 1256 | else |
984263bc | 1257 | ticks = INT_MAX; |
a94976ad | 1258 | return (ticks); |
984263bc MD |
1259 | } |
1260 | ||
a591f597 MD |
1261 | int |
1262 | tstohz_low(struct timespec *ts) | |
1263 | { | |
1264 | int ticks; | |
1265 | long sec; | |
1266 | ||
1267 | sec = ts->tv_sec; | |
1268 | if (sec <= INT_MAX / hz) | |
1269 | ticks = (int)(sec * hz + (u_long)ts->tv_nsec / nstick); | |
1270 | else | |
1271 | ticks = INT_MAX; | |
1272 | return (ticks); | |
1273 | } | |
a94976ad | 1274 | |
984263bc MD |
1275 | /* |
1276 | * Start profiling on a process. | |
1277 | * | |
282f3194 MD |
1278 | * Caller must hold p->p_token(); |
1279 | * | |
984263bc MD |
1280 | * Kernel profiling passes proc0 which never exits and hence |
1281 | * keeps the profile clock running constantly. | |
1282 | */ | |
1283 | void | |
88c4d2f6 | 1284 | startprofclock(struct proc *p) |
984263bc | 1285 | { |
4643740a MD |
1286 | if ((p->p_flags & P_PROFIL) == 0) { |
1287 | p->p_flags |= P_PROFIL; | |
88c4d2f6 | 1288 | #if 0 /* XXX */ |
984263bc | 1289 | if (++profprocs == 1 && stathz != 0) { |
e43a034f | 1290 | crit_enter(); |
6ad39cae | 1291 | psdiv = psratio; |
984263bc | 1292 | setstatclockrate(profhz); |
e43a034f | 1293 | crit_exit(); |
984263bc | 1294 | } |
88c4d2f6 | 1295 | #endif |
984263bc MD |
1296 | } |
1297 | } | |
1298 | ||
1299 | /* | |
1300 | * Stop profiling on a process. | |
616516c8 MD |
1301 | * |
1302 | * caller must hold p->p_token | |
984263bc MD |
1303 | */ |
1304 | void | |
88c4d2f6 | 1305 | stopprofclock(struct proc *p) |
984263bc | 1306 | { |
4643740a MD |
1307 | if (p->p_flags & P_PROFIL) { |
1308 | p->p_flags &= ~P_PROFIL; | |
88c4d2f6 | 1309 | #if 0 /* XXX */ |
984263bc | 1310 | if (--profprocs == 0 && stathz != 0) { |
e43a034f | 1311 | crit_enter(); |
6ad39cae | 1312 | psdiv = 1; |
984263bc | 1313 | setstatclockrate(stathz); |
e43a034f | 1314 | crit_exit(); |
984263bc | 1315 | } |
984263bc | 1316 | #endif |
984263bc MD |
1317 | } |
1318 | } | |
1319 | ||
1320 | /* | |
1321 | * Return information about system clocks. | |
1322 | */ | |
1323 | static int | |
1324 | sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS) | |
1325 | { | |
f5d21610 | 1326 | struct kinfo_clockinfo clkinfo; |
984263bc MD |
1327 | /* |
1328 | * Construct clockinfo structure. | |
1329 | */ | |
f5d21610 | 1330 | clkinfo.ci_hz = hz; |
a591f597 | 1331 | clkinfo.ci_tick = ustick; |
4026c000 | 1332 | clkinfo.ci_tickadj = ntp_default_tick_delta / 1000; |
f5d21610 JS |
1333 | clkinfo.ci_profhz = profhz; |
1334 | clkinfo.ci_stathz = stathz ? stathz : hz; | |
984263bc MD |
1335 | return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req)); |
1336 | } | |
1337 | ||
1338 | SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD, | |
1339 | 0, 0, sysctl_kern_clockrate, "S,clockinfo",""); | |
1340 | ||
984263bc MD |
1341 | /* |
1342 | * We have eight functions for looking at the clock, four for | |
1343 | * microseconds and four for nanoseconds. For each there is fast | |
1344 | * but less precise version "get{nano|micro}[up]time" which will | |
1345 | * return a time which is up to 1/HZ previous to the call, whereas | |
1346 | * the raw version "{nano|micro}[up]time" will return a timestamp | |
1347 | * which is as precise as possible. The "up" variants return the | |
1348 | * time relative to system boot, these are well suited for time | |
1349 | * interval measurements. | |
88c4d2f6 | 1350 | * |
317c3bd2 | 1351 | * Each cpu independently maintains the current time of day, so all |
88c4d2f6 MD |
1352 | * we need to do to protect ourselves from changes is to do a loop |
1353 | * check on the seconds field changing out from under us. | |
fad57d0e MD |
1354 | * |
1355 | * The system timer maintains a 32 bit count and due to various issues | |
317c3bd2 | 1356 | * it is possible for the calculated delta to occasionally exceed |
044ee7c4 MD |
1357 | * sys_cputimer->freq. If this occurs the sys_cputimer->freq64_nsec |
1358 | * multiplication can easily overflow, so we deal with the case. For | |
1359 | * uniformity we deal with the case in the usec case too. | |
627531fa MD |
1360 | * |
1361 | * All the [get][micro,nano][time,uptime]() routines are MPSAFE. | |
63823918 MD |
1362 | * |
1363 | * NEW CODE (!) | |
1364 | * | |
1365 | * cpu 0 now maintains global ticktimes and an update counter. The | |
1366 | * getnanotime() and getmicrotime() routines use these globals. | |
984263bc | 1367 | */ |
984263bc MD |
1368 | void |
1369 | getmicrouptime(struct timeval *tvp) | |
1370 | { | |
88c4d2f6 MD |
1371 | struct globaldata *gd = mycpu; |
1372 | sysclock_t delta; | |
1373 | ||
1374 | do { | |
1375 | tvp->tv_sec = gd->gd_time_seconds; | |
1376 | delta = gd->gd_hardclock.time - gd->gd_cpuclock_base; | |
1377 | } while (tvp->tv_sec != gd->gd_time_seconds); | |
fad57d0e | 1378 | |
044ee7c4 MD |
1379 | if (delta >= sys_cputimer->freq) { |
1380 | tvp->tv_sec += delta / sys_cputimer->freq; | |
1381 | delta %= sys_cputimer->freq; | |
fad57d0e | 1382 | } |
8fbc264d | 1383 | tvp->tv_usec = muldivu64(sys_cputimer->freq64_usec, delta, 1L << 32); |
88c4d2f6 MD |
1384 | if (tvp->tv_usec >= 1000000) { |
1385 | tvp->tv_usec -= 1000000; | |
1386 | ++tvp->tv_sec; | |
984263bc MD |
1387 | } |
1388 | } | |
1389 | ||
1390 | void | |
1391 | getnanouptime(struct timespec *tsp) | |
1392 | { | |
88c4d2f6 MD |
1393 | struct globaldata *gd = mycpu; |
1394 | sysclock_t delta; | |
1395 | ||
1396 | do { | |
1397 | tsp->tv_sec = gd->gd_time_seconds; | |
1398 | delta = gd->gd_hardclock.time - gd->gd_cpuclock_base; | |
1399 | } while (tsp->tv_sec != gd->gd_time_seconds); | |
fad57d0e | 1400 | |
044ee7c4 MD |
1401 | if (delta >= sys_cputimer->freq) { |
1402 | tsp->tv_sec += delta / sys_cputimer->freq; | |
1403 | delta %= sys_cputimer->freq; | |
984263bc | 1404 | } |
8fbc264d | 1405 | tsp->tv_nsec = muldivu64(sys_cputimer->freq64_nsec, delta, 1L << 32); |
984263bc MD |
1406 | } |
1407 | ||
1408 | void | |
88c4d2f6 | 1409 | microuptime(struct timeval *tvp) |
984263bc | 1410 | { |
88c4d2f6 MD |
1411 | struct globaldata *gd = mycpu; |
1412 | sysclock_t delta; | |
1413 | ||
1414 | do { | |
1415 | tvp->tv_sec = gd->gd_time_seconds; | |
044ee7c4 | 1416 | delta = sys_cputimer->count() - gd->gd_cpuclock_base; |
88c4d2f6 | 1417 | } while (tvp->tv_sec != gd->gd_time_seconds); |
fad57d0e | 1418 | |
044ee7c4 MD |
1419 | if (delta >= sys_cputimer->freq) { |
1420 | tvp->tv_sec += delta / sys_cputimer->freq; | |
1421 | delta %= sys_cputimer->freq; | |
984263bc | 1422 | } |
8fbc264d | 1423 | tvp->tv_usec = muldivu64(sys_cputimer->freq64_usec, delta, 1L << 32); |
984263bc MD |
1424 | } |
1425 | ||
1426 | void | |
88c4d2f6 | 1427 | nanouptime(struct timespec *tsp) |
984263bc | 1428 | { |
88c4d2f6 MD |
1429 | struct globaldata *gd = mycpu; |
1430 | sysclock_t delta; | |
1431 | ||
1432 | do { | |
1433 | tsp->tv_sec = gd->gd_time_seconds; | |
044ee7c4 | 1434 | delta = sys_cputimer->count() - gd->gd_cpuclock_base; |
88c4d2f6 | 1435 | } while (tsp->tv_sec != gd->gd_time_seconds); |
fad57d0e | 1436 | |
044ee7c4 MD |
1437 | if (delta >= sys_cputimer->freq) { |
1438 | tsp->tv_sec += delta / sys_cputimer->freq; | |
1439 | delta %= sys_cputimer->freq; | |
984263bc | 1440 | } |
8fbc264d | 1441 | tsp->tv_nsec = muldivu64(sys_cputimer->freq64_nsec, delta, 1L << 32); |
984263bc MD |
1442 | } |
1443 | ||
88c4d2f6 MD |
1444 | /* |
1445 | * realtime routines | |
1446 | */ | |
984263bc | 1447 | void |
88c4d2f6 | 1448 | getmicrotime(struct timeval *tvp) |
984263bc | 1449 | { |
63823918 MD |
1450 | struct timespec ts; |
1451 | int counter; | |
984263bc | 1452 | |
88c4d2f6 | 1453 | do { |
63823918 MD |
1454 | counter = *(volatile int *)&ticktime_update; |
1455 | cpu_lfence(); | |
1456 | switch(counter & 3) { | |
1457 | case 0: /* ticktime2 completed update */ | |
1458 | ts = ticktime2; | |
1459 | break; | |
1460 | case 1: /* ticktime0 update in progress */ | |
1461 | ts = ticktime2; | |
1462 | break; | |
1463 | case 2: /* ticktime0 completed update */ | |
1464 | ts = ticktime0; | |
1465 | break; | |
1466 | case 3: /* ticktime2 update in progress */ | |
1467 | ts = ticktime0; | |
1468 | break; | |
1469 | } | |
1470 | cpu_lfence(); | |
1471 | } while (counter != *(volatile int *)&ticktime_update); | |
1472 | tvp->tv_sec = ts.tv_sec; | |
1473 | tvp->tv_usec = ts.tv_nsec / 1000; | |
984263bc MD |
1474 | } |
1475 | ||
1476 | void | |
88c4d2f6 | 1477 | getnanotime(struct timespec *tsp) |
984263bc | 1478 | { |
63823918 MD |
1479 | struct timespec ts; |
1480 | int counter; | |
984263bc | 1481 | |
88c4d2f6 | 1482 | do { |
63823918 MD |
1483 | counter = *(volatile int *)&ticktime_update; |
1484 | cpu_lfence(); | |
1485 | switch(counter & 3) { | |
1486 | case 0: /* ticktime2 completed update */ | |
1487 | ts = ticktime2; | |
1488 | break; | |
1489 | case 1: /* ticktime0 update in progress */ | |
1490 | ts = ticktime2; | |
1491 | break; | |
1492 | case 2: /* ticktime0 completed update */ | |
1493 | ts = ticktime0; | |
1494 | break; | |
1495 | case 3: /* ticktime2 update in progress */ | |
1496 | ts = ticktime0; | |
1497 | break; | |
1498 | } | |
1499 | cpu_lfence(); | |
1500 | } while (counter != *(volatile int *)&ticktime_update); | |
1501 | *tsp = ts; | |
984263bc MD |
1502 | } |
1503 | ||
5eb5a6bc MD |
1504 | static void |
1505 | getnanotime_nbt(struct timespec *nbt, struct timespec *tsp) | |
1506 | { | |
1507 | struct globaldata *gd = mycpu; | |
1508 | sysclock_t delta; | |
1509 | ||
1510 | do { | |
1511 | tsp->tv_sec = gd->gd_time_seconds; | |
1512 | delta = gd->gd_hardclock.time - gd->gd_cpuclock_base; | |
1513 | } while (tsp->tv_sec != gd->gd_time_seconds); | |
1514 | ||
044ee7c4 MD |
1515 | if (delta >= sys_cputimer->freq) { |
1516 | tsp->tv_sec += delta / sys_cputimer->freq; | |
1517 | delta %= sys_cputimer->freq; | |
5eb5a6bc | 1518 | } |
8fbc264d | 1519 | tsp->tv_nsec = muldivu64(sys_cputimer->freq64_nsec, delta, 1L << 32); |
5eb5a6bc MD |
1520 | |
1521 | tsp->tv_sec += nbt->tv_sec; | |
1522 | tsp->tv_nsec += nbt->tv_nsec; | |
1523 | while (tsp->tv_nsec >= 1000000000) { | |
1524 | tsp->tv_nsec -= 1000000000; | |
1525 | ++tsp->tv_sec; | |
1526 | } | |
1527 | } | |
1528 | ||
1529 | ||
88c4d2f6 MD |
1530 | void |
1531 | microtime(struct timeval *tvp) | |
984263bc | 1532 | { |
88c4d2f6 | 1533 | struct globaldata *gd = mycpu; |
5eb5a6bc | 1534 | struct timespec *bt; |
88c4d2f6 | 1535 | sysclock_t delta; |
984263bc | 1536 | |
88c4d2f6 MD |
1537 | do { |
1538 | tvp->tv_sec = gd->gd_time_seconds; | |
044ee7c4 | 1539 | delta = sys_cputimer->count() - gd->gd_cpuclock_base; |
88c4d2f6 | 1540 | } while (tvp->tv_sec != gd->gd_time_seconds); |
fad57d0e | 1541 | |
044ee7c4 MD |
1542 | if (delta >= sys_cputimer->freq) { |
1543 | tvp->tv_sec += delta / sys_cputimer->freq; | |
1544 | delta %= sys_cputimer->freq; | |
fad57d0e | 1545 | } |
8fbc264d | 1546 | tvp->tv_usec = muldivu64(sys_cputimer->freq64_usec, delta, 1L << 32); |
984263bc | 1547 | |
5eb5a6bc | 1548 | bt = &basetime[basetime_index]; |
2ed58723 | 1549 | cpu_lfence(); |
5eb5a6bc MD |
1550 | tvp->tv_sec += bt->tv_sec; |
1551 | tvp->tv_usec += bt->tv_nsec / 1000; | |
88c4d2f6 MD |
1552 | while (tvp->tv_usec >= 1000000) { |
1553 | tvp->tv_usec -= 1000000; | |
1554 | ++tvp->tv_sec; | |
984263bc | 1555 | } |
984263bc MD |
1556 | } |
1557 | ||
88c4d2f6 MD |
1558 | void |
1559 | nanotime(struct timespec *tsp) | |
1560 | { | |
1561 | struct globaldata *gd = mycpu; | |
5eb5a6bc | 1562 | struct timespec *bt; |
88c4d2f6 | 1563 | sysclock_t delta; |
984263bc | 1564 | |
88c4d2f6 MD |
1565 | do { |
1566 | tsp->tv_sec = gd->gd_time_seconds; | |
044ee7c4 | 1567 | delta = sys_cputimer->count() - gd->gd_cpuclock_base; |
88c4d2f6 | 1568 | } while (tsp->tv_sec != gd->gd_time_seconds); |
fad57d0e | 1569 | |
044ee7c4 MD |
1570 | if (delta >= sys_cputimer->freq) { |
1571 | tsp->tv_sec += delta / sys_cputimer->freq; | |
1572 | delta %= sys_cputimer->freq; | |
fad57d0e | 1573 | } |
8fbc264d | 1574 | tsp->tv_nsec = muldivu64(sys_cputimer->freq64_nsec, delta, 1L << 32); |
984263bc | 1575 | |
5eb5a6bc | 1576 | bt = &basetime[basetime_index]; |
2ed58723 | 1577 | cpu_lfence(); |
5eb5a6bc MD |
1578 | tsp->tv_sec += bt->tv_sec; |
1579 | tsp->tv_nsec += bt->tv_nsec; | |
88c4d2f6 MD |
1580 | while (tsp->tv_nsec >= 1000000000) { |
1581 | tsp->tv_nsec -= 1000000000; | |
1582 | ++tsp->tv_sec; | |
984263bc | 1583 | } |
984263bc MD |
1584 | } |
1585 | ||
25b804e7 | 1586 | /* |
2ed58723 MD |
1587 | * Get an approximate time_t. It does not have to be accurate. This |
1588 | * function is called only from KTR and can be called with the system in | |
1589 | * any state so do not use a critical section or other complex operation | |
1590 | * here. | |
1591 | * | |
1592 | * NOTE: This is not exactly synchronized with real time. To do that we | |
1593 | * would have to do what microtime does and check for a nanoseconds | |
1594 | * overflow. | |
25b804e7 MD |
1595 | */ |
1596 | time_t | |
1597 | get_approximate_time_t(void) | |
1598 | { | |
1599 | struct globaldata *gd = mycpu; | |
5eb5a6bc MD |
1600 | struct timespec *bt; |
1601 | ||
1602 | bt = &basetime[basetime_index]; | |
1603 | return(gd->gd_time_seconds + bt->tv_sec); | |
25b804e7 MD |
1604 | } |
1605 | ||
0c4dbac1 DC |
1606 | static int |
1607 | pps_fetch_timeout(struct timespec *timeout, struct pps_state *pps) | |
1608 | { | |
1609 | int to, err; | |
1610 | pps_seq_t *ap, *cp; | |
1611 | pps_seq_t a, c; | |
1612 | ||
1613 | to = INT_MAX; | |
1614 | if (timeout->tv_sec > -1) | |
1615 | to = tstohz_low(timeout); | |
1616 | ||
1617 | ap = &pps->ppsinfo.assert_sequence; | |
1618 | cp = &pps->ppsinfo.clear_sequence; | |
1619 | a = atomic_load_acq_int(ap); | |
1620 | c = atomic_load_acq_int(cp); | |
1621 | ||
1622 | while (a == atomic_load_acq_int(ap) && c == atomic_load_acq_int(cp)) { | |
1623 | err = tsleep(pps, PCATCH, "ppsfch", to); | |
1624 | if (err == EWOULDBLOCK) { | |
1625 | if (timeout->tv_sec < 0) | |
1626 | continue; | |
1627 | return (ETIMEDOUT); | |
1628 | } | |
1629 | if (err != 0) | |
1630 | return (err); | |
1631 | } | |
1632 | ||
1633 | return (0); | |
1634 | } | |
1635 | ||
984263bc MD |
1636 | int |
1637 | pps_ioctl(u_long cmd, caddr_t data, struct pps_state *pps) | |
1638 | { | |
1639 | pps_params_t *app; | |
1640 | struct pps_fetch_args *fapi; | |
1641 | #ifdef PPS_SYNC | |
1642 | struct pps_kcbind_args *kapi; | |
1643 | #endif | |
0c4dbac1 | 1644 | int err; |
984263bc MD |
1645 | |
1646 | switch (cmd) { | |
1647 | case PPS_IOC_CREATE: | |
1648 | return (0); | |
1649 | case PPS_IOC_DESTROY: | |
1650 | return (0); | |
1651 | case PPS_IOC_SETPARAMS: | |
1652 | app = (pps_params_t *)data; | |
1653 | if (app->mode & ~pps->ppscap) | |
1654 | return (EINVAL); | |
bbf175be | 1655 | pps->ppsparam = *app; |
984263bc MD |
1656 | return (0); |
1657 | case PPS_IOC_GETPARAMS: | |
1658 | app = (pps_params_t *)data; | |
1659 | *app = pps->ppsparam; | |
1660 | app->api_version = PPS_API_VERS_1; | |
1661 | return (0); | |
1662 | case PPS_IOC_GETCAP: | |
1663 | *(int*)data = pps->ppscap; | |
1664 | return (0); | |
1665 | case PPS_IOC_FETCH: | |
1666 | fapi = (struct pps_fetch_args *)data; | |
1667 | if (fapi->tsformat && fapi->tsformat != PPS_TSFMT_TSPEC) | |
1668 | return (EINVAL); | |
0c4dbac1 DC |
1669 | if (fapi->timeout.tv_sec != 0 || fapi->timeout.tv_nsec != 0) { |
1670 | err = pps_fetch_timeout(&fapi->timeout, pps); | |
1671 | if (err != 0) | |
1672 | return (err); | |
1673 | } | |
bbf175be | 1674 | pps->ppsinfo.current_mode = pps->ppsparam.mode; |
984263bc MD |
1675 | fapi->pps_info_buf = pps->ppsinfo; |
1676 | return (0); | |
1677 | case PPS_IOC_KCBIND: | |
1678 | #ifdef PPS_SYNC | |
1679 | kapi = (struct pps_kcbind_args *)data; | |
1680 | /* XXX Only root should be able to do this */ | |
1681 | if (kapi->tsformat && kapi->tsformat != PPS_TSFMT_TSPEC) | |
1682 | return (EINVAL); | |
1683 | if (kapi->kernel_consumer != PPS_KC_HARDPPS) | |
1684 | return (EINVAL); | |
1685 | if (kapi->edge & ~pps->ppscap) | |
1686 | return (EINVAL); | |
1687 | pps->kcmode = kapi->edge; | |
1688 | return (0); | |
1689 | #else | |
1690 | return (EOPNOTSUPP); | |
1691 | #endif | |
1692 | default: | |
1693 | return (ENOTTY); | |
1694 | } | |
1695 | } | |
1696 | ||
1697 | void | |
1698 | pps_init(struct pps_state *pps) | |
1699 | { | |
0c4dbac1 | 1700 | pps->ppscap |= PPS_TSFMT_TSPEC | PPS_CANWAIT; |
984263bc MD |
1701 | if (pps->ppscap & PPS_CAPTUREASSERT) |
1702 | pps->ppscap |= PPS_OFFSETASSERT; | |
1703 | if (pps->ppscap & PPS_CAPTURECLEAR) | |
1704 | pps->ppscap |= PPS_OFFSETCLEAR; | |
1705 | } | |
1706 | ||
1707 | void | |
88c4d2f6 | 1708 | pps_event(struct pps_state *pps, sysclock_t count, int event) |
984263bc | 1709 | { |
88c4d2f6 MD |
1710 | struct globaldata *gd; |
1711 | struct timespec *tsp; | |
1712 | struct timespec *osp; | |
5eb5a6bc | 1713 | struct timespec *bt; |
88c4d2f6 MD |
1714 | struct timespec ts; |
1715 | sysclock_t *pcount; | |
1716 | #ifdef PPS_SYNC | |
1717 | sysclock_t tcount; | |
1718 | #endif | |
1719 | sysclock_t delta; | |
1720 | pps_seq_t *pseq; | |
1721 | int foff; | |
aa85218e | 1722 | #ifdef PPS_SYNC |
88c4d2f6 | 1723 | int fhard; |
aa85218e | 1724 | #endif |
2ed58723 | 1725 | int ni; |
88c4d2f6 MD |
1726 | |
1727 | gd = mycpu; | |
984263bc MD |
1728 | |
1729 | /* Things would be easier with arrays... */ | |
1730 | if (event == PPS_CAPTUREASSERT) { | |
1731 | tsp = &pps->ppsinfo.assert_timestamp; | |
1732 | osp = &pps->ppsparam.assert_offset; | |
1733 | foff = pps->ppsparam.mode & PPS_OFFSETASSERT; | |
c246e343 | 1734 | #ifdef PPS_SYNC |
984263bc | 1735 | fhard = pps->kcmode & PPS_CAPTUREASSERT; |
c246e343 | 1736 | #endif |
984263bc MD |
1737 | pcount = &pps->ppscount[0]; |
1738 | pseq = &pps->ppsinfo.assert_sequence; | |
1739 | } else { | |
1740 | tsp = &pps->ppsinfo.clear_timestamp; | |
1741 | osp = &pps->ppsparam.clear_offset; | |
1742 | foff = pps->ppsparam.mode & PPS_OFFSETCLEAR; | |
c246e343 | 1743 | #ifdef PPS_SYNC |
984263bc | 1744 | fhard = pps->kcmode & PPS_CAPTURECLEAR; |
c246e343 | 1745 | #endif |
984263bc MD |
1746 | pcount = &pps->ppscount[1]; |
1747 | pseq = &pps->ppsinfo.clear_sequence; | |
1748 | } | |
1749 | ||
984263bc MD |
1750 | /* Nothing really happened */ |
1751 | if (*pcount == count) | |
1752 | return; | |
1753 | ||
1754 | *pcount = count; | |
1755 | ||
88c4d2f6 MD |
1756 | do { |
1757 | ts.tv_sec = gd->gd_time_seconds; | |
1758 | delta = count - gd->gd_cpuclock_base; | |
1759 | } while (ts.tv_sec != gd->gd_time_seconds); | |
fad57d0e | 1760 | |
044ee7c4 MD |
1761 | if (delta >= sys_cputimer->freq) { |
1762 | ts.tv_sec += delta / sys_cputimer->freq; | |
1763 | delta %= sys_cputimer->freq; | |
88c4d2f6 | 1764 | } |
8fbc264d | 1765 | ts.tv_nsec = muldivu64(sys_cputimer->freq64_nsec, delta, 1L << 32); |
2ed58723 MD |
1766 | ni = basetime_index; |
1767 | cpu_lfence(); | |
1768 | bt = &basetime[ni]; | |
5eb5a6bc MD |
1769 | ts.tv_sec += bt->tv_sec; |
1770 | ts.tv_nsec += bt->tv_nsec; | |
88c4d2f6 MD |
1771 | while (ts.tv_nsec >= 1000000000) { |
1772 | ts.tv_nsec -= 1000000000; | |
1773 | ++ts.tv_sec; | |
984263bc | 1774 | } |
984263bc | 1775 | |
0c4dbac1 | 1776 | atomic_add_rel_int(pseq, 1); |
984263bc MD |
1777 | *tsp = ts; |
1778 | ||
1779 | if (foff) { | |
944cd60c | 1780 | timespecadd(tsp, osp, tsp); |
984263bc MD |
1781 | if (tsp->tv_nsec < 0) { |
1782 | tsp->tv_nsec += 1000000000; | |
1783 | tsp->tv_sec -= 1; | |
1784 | } | |
1785 | } | |
1786 | #ifdef PPS_SYNC | |
1787 | if (fhard) { | |
1788 | /* magic, at its best... */ | |
1789 | tcount = count - pps->ppscount[2]; | |
1790 | pps->ppscount[2] = count; | |
044ee7c4 MD |
1791 | if (tcount >= sys_cputimer->freq) { |
1792 | delta = (1000000000 * (tcount / sys_cputimer->freq) + | |
bbf175be | 1793 | sys_cputimer->freq64_nsec * |
044ee7c4 | 1794 | (tcount % sys_cputimer->freq)) >> 32; |
fad57d0e | 1795 | } else { |
8fbc264d MD |
1796 | delta = muldivu64(sys_cputimer->freq64_nsec, |
1797 | tcount, 1L << 32); | |
fad57d0e | 1798 | } |
984263bc MD |
1799 | hardpps(tsp, delta); |
1800 | } | |
1801 | #endif | |
0c4dbac1 | 1802 | wakeup(pps); |
984263bc | 1803 | } |
88c4d2f6 | 1804 | |
d2412a2e MD |
1805 | /* |
1806 | * Return the tsc target value for a delay of (ns). | |
1807 | * | |
1808 | * Returns -1 if the TSC is not supported. | |
1809 | */ | |
5b49787b | 1810 | tsc_uclock_t |
d2412a2e MD |
1811 | tsc_get_target(int ns) |
1812 | { | |
1813 | #if defined(_RDTSC_SUPPORTED_) | |
1814 | if (cpu_feature & CPUID_TSC) { | |
1815 | return (rdtsc() + tsc_frequency * ns / (int64_t)1000000000); | |
1816 | } | |
1817 | #endif | |
1818 | return(-1); | |
1819 | } | |
1820 | ||
/*
 * Check whether the TSC has reached (target).
 *
 * Returns:
 *	+1	the target has been reached
 *	 0	the target has not been reached yet
 *	-1	the TSC is not supported
 *
 * Typical use: while (tsc_test_target(target) == 0) { ...poll... }
 */
int
tsc_test_target(int64_t target)
{
#if defined(_RDTSC_SUPPORTED_)
    if (cpu_feature & CPUID_TSC) {
	/* Compare via the signed difference of target and the TSC. */
	return (((int64_t)(target - rdtsc()) <= 0) ? 1 : 0);
    }
#endif
    return (-1);
}
b12defdc MD |
1842 | |
/*
 * Busy-wait for (ns) nanoseconds using the TSC.  When the TSC is not
 * supported tsc_test_target() never returns 0, so the function returns
 * right away.  Two cpu_pause() calls are always issued before the first
 * test, and four more after every test that finds the target not yet
 * reached.
 */
void
tsc_delay(int ns)
{
    int64_t target = tsc_get_target(ns);

    cpu_pause();
    cpu_pause();
    for (;;) {
	if (tsc_test_target(target) != 0)
	    break;
	cpu_pause();
	cpu_pause();
	cpu_pause();
	cpu_pause();
    }
}