kernel - Refactor sysclock_t from 32 to 64 bits (2)
[dragonfly.git] / sys / platform / pc64 / isa / clock.c
1 /*-
2  * Copyright (c) 1990 The Regents of the University of California.
3  * Copyright (c) 2008 The DragonFly Project.
4  * All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * William Jolitz and Don Ahn.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *      from: @(#)clock.c       7.2 (Berkeley) 5/12/91
34  * $FreeBSD: src/sys/i386/isa/clock.c,v 1.149.2.6 2002/11/02 04:41:50 iwasaki Exp $
35  */
36
37 /*
38  * Routines to handle clock hardware.
39  */
40
41 /*
42  * inittodr, settodr and support routines written
43  * by Christoph Robitschko <chmr@edvz.tu-graz.ac.at>
44  *
45  * reintroduced and updated by Chris Stenton <chris@gnome.co.uk> 8/10/94
46  */
47
48 #if 0
49 #include "opt_clock.h"
50 #endif
51
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/eventhandler.h>
55 #include <sys/time.h>
56 #include <sys/kernel.h>
57 #include <sys/bus.h>
58 #include <sys/sysctl.h>
59 #include <sys/cons.h>
60 #include <sys/kbio.h>
61 #include <sys/systimer.h>
62 #include <sys/globaldata.h>
63 #include <sys/machintr.h>
64 #include <sys/interrupt.h>
65
66 #include <sys/thread2.h>
67
68 #include <machine/clock.h>
69 #include <machine/cputypes.h>
70 #include <machine/frame.h>
71 #include <machine/ipl.h>
72 #include <machine/limits.h>
73 #include <machine/md_var.h>
74 #include <machine/psl.h>
75 #include <machine/segments.h>
76 #include <machine/smp.h>
77 #include <machine/specialreg.h>
78 #include <machine/intr_machdep.h>
79
80 #include <machine_base/apic/ioapic.h>
81 #include <machine_base/apic/ioapic_abi.h>
82 #include <machine_base/icu/icu.h>
83 #include <bus/isa/isa.h>
84 #include <bus/isa/rtc.h>
85 #include <machine_base/isa/timerreg.h>
86
87 SET_DECLARE(timecounter_init_set, const timecounter_init_t);
88 TIMECOUNTER_INIT(placeholder, NULL);
89
90 static void i8254_restore(void);
91 static void resettodr_on_shutdown(void *arg __unused);
92
93 /*
94  * 32-bit time_t's can't reach leap years before 1904 or after 2036, so we
95  * can use a simple formula for leap years.
96  */
#define LEAPYEAR(y)	((((u_int)(y)) & 3) == 0)	/* divisible by 4 */
#define DAYSPERYEAR	365	/* 31+28+31+30+31+30+31+31+30+31+30+31 */

/* i8254 input clock in Hz; may be overridden at build time. */
#if !defined(TIMER_FREQ)
#define TIMER_FREQ	1193182
#endif
103
104 static uint8_t i8254_walltimer_sel;
105 static uint16_t i8254_walltimer_cntr;
106
107 int     adjkerntz;              /* local offset from GMT in seconds */
108 int     disable_rtc_set;        /* disable resettodr() if != 0 */
109 int     tsc_present;
110 int     tsc_invariant;
111 int     tsc_mpsync;
112 int     wall_cmos_clock;        /* wall CMOS clock assumed if != 0 */
113 int     timer0_running;
114 tsc_uclock_t tsc_frequency;
115 tsc_uclock_t tsc_oneus_approx;  /* always at least 1, approx only */
116
117 enum tstate { RELEASED, ACQUIRED };
118 enum tstate timer0_state;
119 enum tstate timer1_state;
120 enum tstate timer2_state;
121
122 int     i8254_cputimer_disable; /* No need to initialize i8254 cputimer. */
123
124 static  int     beeping = 0;
125 static  const u_char daysinmonth[] = {31,28,31,30,31,30,31,31,30,31,30,31};
126 static  u_char  rtc_statusa = RTCSA_DIVIDER | RTCSA_NOPROF;
127 static  u_char  rtc_statusb = RTCSB_24HR | RTCSB_PINTR;
128 static  int     rtc_loaded;
129
130 static  sysclock_t i8254_cputimer_div;
131
132 static int i8254_nointr;
133 static int i8254_intr_disable = 1;
134 TUNABLE_INT("hw.i8254.intr_disable", &i8254_intr_disable);
135
136 static int calibrate_timers_with_rtc = 0;
137 TUNABLE_INT("hw.calibrate_timers_with_rtc", &calibrate_timers_with_rtc);
138
139 static int calibrate_tsc_fast = 1;
140 TUNABLE_INT("hw.calibrate_tsc_fast", &calibrate_tsc_fast);
141
142 static int calibrate_test;
143 TUNABLE_INT("hw.tsc_calibrate_test", &calibrate_test);
144
145 static struct callout sysbeepstop_ch;
146
147 static sysclock_t i8254_cputimer_count(void);
148 static void i8254_cputimer_construct(struct cputimer *cputimer, sysclock_t last);
149 static void i8254_cputimer_destruct(struct cputimer *cputimer);
150
151 static struct cputimer  i8254_cputimer = {
152     .next               = SLIST_ENTRY_INITIALIZER,
153     .name               = "i8254",
154     .pri                = CPUTIMER_PRI_8254,
155     .type               = 0,    /* determined later */
156     .count              = i8254_cputimer_count,
157     .fromhz             = cputimer_default_fromhz,
158     .fromus             = cputimer_default_fromus,
159     .construct          = i8254_cputimer_construct,
160     .destruct           = i8254_cputimer_destruct,
161     .freq               = TIMER_FREQ
162 };
163
164 static void i8254_intr_reload(struct cputimer_intr *, sysclock_t);
165 static void i8254_intr_config(struct cputimer_intr *, const struct cputimer *);
166 static void i8254_intr_initclock(struct cputimer_intr *, boolean_t);
167
168 static struct cputimer_intr i8254_cputimer_intr = {
169     .freq = TIMER_FREQ,
170     .reload = i8254_intr_reload,
171     .enable = cputimer_intr_default_enable,
172     .config = i8254_intr_config,
173     .restart = cputimer_intr_default_restart,
174     .pmfixup = cputimer_intr_default_pmfixup,
175     .initclock = i8254_intr_initclock,
176     .pcpuhand = NULL,
177     .next = SLIST_ENTRY_INITIALIZER,
178     .name = "i8254",
179     .type = CPUTIMER_INTR_8254,
180     .prio = CPUTIMER_INTR_PRIO_8254,
181     .caps = CPUTIMER_INTR_CAP_PS,
182     .priv = NULL
183 };
184
185 /*
186  * Use this to lwkt_switch() when the scheduler clock is not
187  * yet running, otherwise lwkt_switch() won't do anything.
188  * XXX needs cleaning up in lwkt_thread.c
189  */
190 static void
191 lwkt_force_switch(void)
192 {
193         crit_enter();
194         lwkt_schedulerclock(curthread);
195         crit_exit();
196         lwkt_switch();
197 }
198
199 /*
200  * timer0 clock interrupt.  Timer0 is in one-shot mode and has stopped
201  * counting as of this interrupt.  We use timer1 in free-running mode (not
202  * generating any interrupts) as our main counter.  Each cpu has timeouts
203  * pending.
204  *
205  * This code is INTR_MPSAFE and may be called without the BGL held.
206  */
207 static void
208 clkintr(void *dummy, void *frame_arg)
209 {
210         static sysclock_t sysclock_count;       /* NOTE! Must be static */
211         struct globaldata *gd = mycpu;
212         struct globaldata *gscan;
213         int n;
214
215         /*
216          * SWSTROBE mode is a one-shot, the timer is no longer running
217          */
218         timer0_running = 0;
219
220         /*
221          * XXX the dispatcher needs work.  right now we call systimer_intr()
222          * directly or via IPI for any cpu with systimers queued, which is
223          * usually *ALL* of them.  We need to use the LAPIC timer for this.
224          */
225         sysclock_count = sys_cputimer->count();
226         for (n = 0; n < ncpus; ++n) {
227             gscan = globaldata_find(n);
228             if (TAILQ_FIRST(&gscan->gd_systimerq) == NULL)
229                 continue;
230             if (gscan != gd) {
231                 lwkt_send_ipiq3(gscan, (ipifunc3_t)systimer_intr, 
232                                 &sysclock_count, 1);
233             } else {
234                 systimer_intr(&sysclock_count, 0, frame_arg);
235             }
236         }
237 }
238
239
240 /*
241  * NOTE! not MP safe.
242  */
243 int
244 acquire_timer2(int mode)
245 {
246         if (timer2_state != RELEASED)
247                 return (-1);
248         timer2_state = ACQUIRED;
249
250         /*
251          * This access to the timer registers is as atomic as possible
252          * because it is a single instruction.  We could do better if we
253          * knew the rate.
254          */
255         outb(TIMER_MODE, TIMER_SEL2 | (mode & 0x3f));
256         return (0);
257 }
258
259 int
260 release_timer2(void)
261 {
262         if (timer2_state != ACQUIRED)
263                 return (-1);
264         outb(TIMER_MODE, TIMER_SEL2 | TIMER_SQWAVE | TIMER_16BIT);
265         timer2_state = RELEASED;
266         return (0);
267 }
268
269 #include "opt_ddb.h"
270 #ifdef DDB
271 #include <ddb/ddb.h>
272
273 DB_SHOW_COMMAND(rtc, rtc)
274 {
275         kprintf("%02x/%02x/%02x %02x:%02x:%02x, A = %02x, B = %02x, C = %02x\n",
276                rtcin(RTC_YEAR), rtcin(RTC_MONTH), rtcin(RTC_DAY),
277                rtcin(RTC_HRS), rtcin(RTC_MIN), rtcin(RTC_SEC),
278                rtcin(RTC_STATUSA), rtcin(RTC_STATUSB), rtcin(RTC_INTR));
279 }
280 #endif /* DDB */
281
282 /*
283  * Return the current cpu timer count as a 32 bit integer.
284  */
285 static
286 sysclock_t
287 i8254_cputimer_count(void)
288 {
289         static uint16_t cputimer_last;
290         uint16_t count;
291         sysclock_t ret;
292
293         clock_lock();
294         outb(TIMER_MODE, i8254_walltimer_sel | TIMER_LATCH);
295         count = (uint8_t)inb(i8254_walltimer_cntr);     /* get countdown */
296         count |= ((uint8_t)inb(i8254_walltimer_cntr) << 8);
297         count = -count;                                 /* -> countup */
298         if (count < cputimer_last)                      /* rollover */
299                 i8254_cputimer.base += 0x00010000U;
300         ret = i8254_cputimer.base | count;
301         cputimer_last = count;
302         clock_unlock();
303
304         return(ret);
305 }
306
307 /*
308  * This function is called whenever the system timebase changes, allowing
309  * us to calculate what is needed to convert a system timebase tick 
310  * into an 8254 tick for the interrupt timer.  If we can convert to a
311  * simple shift, multiplication, or division, we do so.  Otherwise 64
312  * bit arithmatic is required every time the interrupt timer is reloaded.
313  */
314 static void
315 i8254_intr_config(struct cputimer_intr *cti, const struct cputimer *timer)
316 {
317     sysclock_t freq;
318     sysclock_t div;
319
320     /*
321      * Will a simple divide do the trick?
322      */
323     div = (timer->freq + (cti->freq / 2)) / cti->freq;
324     freq = cti->freq * div;
325
326     if (freq >= timer->freq - 1 && freq <= timer->freq + 1)
327         i8254_cputimer_div = div;
328     else
329         i8254_cputimer_div = 0;
330 }
331
332 /*
333  * Reload for the next timeout.  It is possible for the reload value
334  * to be 0 or negative, indicating that an immediate timer interrupt
335  * is desired.  For now make the minimum 2 ticks.
336  *
337  * We may have to convert from the system timebase to the 8254 timebase.
338  */
339 static void
340 i8254_intr_reload(struct cputimer_intr *cti, sysclock_t reload)
341 {
342     uint16_t count;
343
344     if ((ssysclock_t)reload < 0)
345             reload = 1;
346     if (i8254_cputimer_div)
347         reload /= i8254_cputimer_div;
348     else
349         reload = muldivu64(reload, cti->freq, sys_cputimer->freq);
350
351     if (reload < 2)
352         reload = 2;             /* minimum count */
353     if (reload > 0xFFFF)
354         reload = 0xFFFF;        /* almost full count (0 is full count) */
355
356     clock_lock();
357     if (timer0_running) {
358         outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH);     /* count-down timer */
359         count = (uint8_t)inb(TIMER_CNTR0);              /* lsb */
360         count |= ((uint8_t)inb(TIMER_CNTR0) << 8);      /* msb */
361         if (reload < count) {
362             outb(TIMER_MODE, TIMER_SEL0 | TIMER_SWSTROBE | TIMER_16BIT);
363             outb(TIMER_CNTR0, (uint8_t)reload);         /* lsb */
364             outb(TIMER_CNTR0, (uint8_t)(reload >> 8));  /* msb */
365         }
366     } else {
367         timer0_running = 1;
368         outb(TIMER_MODE, TIMER_SEL0 | TIMER_SWSTROBE | TIMER_16BIT);
369         outb(TIMER_CNTR0, (uint8_t)reload);             /* lsb */
370         outb(TIMER_CNTR0, (uint8_t)(reload >> 8));      /* msb */
371     }
372     clock_unlock();
373 }
374
375 /*
376  * DELAY(usec)       - Spin for the specified number of microseconds.
377  * DRIVERSLEEP(usec) - Spin for the specified number of microseconds,
378  *                     but do a thread switch in the loop
379  *
380  * Relies on timer 1 counting down from (cputimer_freq / hz)
381  * Note: timer had better have been programmed before this is first used!
382  */
383 static void
384 DODELAY(int n, int doswitch)
385 {
386         ssysclock_t delta, ticks_left;
387         sysclock_t prev_tick, tick;
388
389 #ifdef DELAYDEBUG
390         int getit_calls = 1;
391         int n1;
392         static int state = 0;
393
394         if (state == 0) {
395                 state = 1;
396                 for (n1 = 1; n1 <= 10000000; n1 *= 10)
397                         DELAY(n1);
398                 state = 2;
399         }
400         if (state == 1)
401                 kprintf("DELAY(%d)...", n);
402 #endif
403         /*
404          * Guard against the timer being uninitialized if we are called
405          * early for console i/o.
406          */
407         if (timer0_state == RELEASED && i8254_cputimer_disable == 0)
408                 i8254_restore();
409
410         /*
411          * Read the counter first, so that the rest of the setup overhead is
412          * counted.  Then calculate the number of hardware timer ticks
413          * required, rounding up to be sure we delay at least the requested
414          * number of microseconds.
415          */
416         prev_tick = sys_cputimer->count();
417         ticks_left = muldivu64(n, sys_cputimer->freq + 999999, 1000000);
418
419         /*
420          * Loop until done.
421          */
422         while (ticks_left > 0) {
423                 tick = sys_cputimer->count();
424 #ifdef DELAYDEBUG
425                 ++getit_calls;
426 #endif
427                 delta = tick - prev_tick;
428                 prev_tick = tick;
429                 if (delta < 0)
430                         delta = 0;
431                 ticks_left -= delta;
432                 if (doswitch && ticks_left > 0)
433                         lwkt_switch();
434                 cpu_pause();
435         }
436 #ifdef DELAYDEBUG
437         if (state == 1)
438                 kprintf(" %d calls to getit() at %d usec each\n",
439                        getit_calls, (n + 5) / getit_calls);
440 #endif
441 }
442
/*
 * Busy-wait for n microseconds.  DELAY() never switches threads.
 */
void
DELAY(int n)
{
	const int doswitch = 0;

	DODELAY(n, doswitch);
}
451
452 /*
453  * Returns non-zero if the specified time period has elapsed.  Call
454  * first with last_clock set to 0.
455  */
456 int
457 CHECKTIMEOUT(TOTALDELAY *tdd)
458 {
459         sysclock_t delta;
460         int us;
461
462         if (tdd->started == 0) {
463                 if (timer0_state == RELEASED && i8254_cputimer_disable == 0)
464                         i8254_restore();
465                 tdd->last_clock = sys_cputimer->count();
466                 tdd->started = 1;
467                 return(0);
468         }
469         delta = sys_cputimer->count() - tdd->last_clock;
470         us = muldivu64(delta, 1000000, sys_cputimer->freq);
471         tdd->last_clock += muldivu64(us, sys_cputimer->freq, 1000000);
472         tdd->us -= us;
473
474         return (tdd->us < 0);
475 }
476
477
478 /*
479  * DRIVERSLEEP() does not switch if called with a spinlock held or
480  * from a hard interrupt.
481  */
482 void
483 DRIVERSLEEP(int usec)
484 {
485         globaldata_t gd = mycpu;
486
487         if (gd->gd_intr_nesting_level || gd->gd_spinlocks) {
488                 DODELAY(usec, 0);
489         } else {
490                 DODELAY(usec, 1);
491         }
492 }
493
494 static void
495 sysbeepstop(void *chan)
496 {
497         outb(IO_PPI, inb(IO_PPI)&0xFC); /* disable counter2 output to speaker */
498         beeping = 0;
499         release_timer2();
500 }
501
502 int
503 sysbeep(int pitch, int period)
504 {
505         if (acquire_timer2(TIMER_SQWAVE|TIMER_16BIT))
506                 return(-1);
507         if (sysbeep_enable == 0)
508                 return(-1);
509         /*
510          * Nobody else is using timer2, we do not need the clock lock
511          */
512         outb(TIMER_CNTR2, pitch);
513         outb(TIMER_CNTR2, (pitch>>8));
514         if (!beeping) {
515                 /* enable counter2 output to speaker */
516                 outb(IO_PPI, inb(IO_PPI) | 3);
517                 beeping = period;
518                 callout_reset(&sysbeepstop_ch, period, sysbeepstop, NULL);
519         }
520         return (0);
521 }
522
523 /*
524  * RTC support routines
525  */
526
527 int
528 rtcin(int reg)
529 {
530         u_char val;
531
532         crit_enter();
533         outb(IO_RTC, reg);
534         inb(0x84);
535         val = inb(IO_RTC + 1);
536         inb(0x84);
537         crit_exit();
538         return (val);
539 }
540
541 static __inline void
542 writertc(u_char reg, u_char val)
543 {
544         crit_enter();
545         inb(0x84);
546         outb(IO_RTC, reg);
547         inb(0x84);
548         outb(IO_RTC + 1, val);
549         inb(0x84);              /* XXX work around wrong order in rtcin() */
550         crit_exit();
551 }
552
/*
 * Read an RTC register and convert its BCD contents to binary.
 */
static __inline int
readrtc(int port)
{
	int bcd = rtcin(port);

	return (bcd2bin(bcd));
}
558
559 static u_int
560 calibrate_clocks(void)
561 {
562         tsc_uclock_t old_tsc;
563         sysclock_t tot_count;
564         sysclock_t count, prev_count;
565         int sec, start_sec, timeout;
566
567         if (bootverbose)
568                 kprintf("Calibrating clock(s) ...\n");
569         if (!(rtcin(RTC_STATUSD) & RTCSD_PWR))
570                 goto fail;
571         timeout = 100000000;
572
573         /* Read the mc146818A seconds counter. */
574         for (;;) {
575                 if (!(rtcin(RTC_STATUSA) & RTCSA_TUP)) {
576                         sec = rtcin(RTC_SEC);
577                         break;
578                 }
579                 if (--timeout == 0)
580                         goto fail;
581         }
582
583         /* Wait for the mC146818A seconds counter to change. */
584         start_sec = sec;
585         for (;;) {
586                 if (!(rtcin(RTC_STATUSA) & RTCSA_TUP)) {
587                         sec = rtcin(RTC_SEC);
588                         if (sec != start_sec)
589                                 break;
590                 }
591                 if (--timeout == 0)
592                         goto fail;
593         }
594
595         /* Start keeping track of the i8254 counter. */
596         prev_count = sys_cputimer->count();
597         tot_count = 0;
598
599         if (tsc_present) 
600                 old_tsc = rdtsc();
601         else
602                 old_tsc = 0;            /* shut up gcc */
603
604         /*
605          * Wait for the mc146818A seconds counter to change.  Read the i8254
606          * counter for each iteration since this is convenient and only
607          * costs a few usec of inaccuracy. The timing of the final reads
608          * of the counters almost matches the timing of the initial reads,
609          * so the main cause of inaccuracy is the varying latency from 
610          * inside getit() or rtcin(RTC_STATUSA) to the beginning of the
611          * rtcin(RTC_SEC) that returns a changed seconds count.  The
612          * maximum inaccuracy from this cause is < 10 usec on 486's.
613          */
614         start_sec = sec;
615         for (;;) {
616                 if (!(rtcin(RTC_STATUSA) & RTCSA_TUP))
617                         sec = rtcin(RTC_SEC);
618                 count = sys_cputimer->count();
619                 tot_count += (sysclock_t)(count - prev_count);
620                 prev_count = count;
621                 if (sec != start_sec)
622                         break;
623                 if (--timeout == 0)
624                         goto fail;
625         }
626
627         /*
628          * Read the cpu cycle counter.  The timing considerations are
629          * similar to those for the i8254 clock.
630          */
631         if (tsc_present) {
632                 tsc_frequency = rdtsc() - old_tsc;
633                 if (bootverbose) {
634                         kprintf("TSC clock: %jd Hz (Method A)\n",
635                             (intmax_t)tsc_frequency);
636                 }
637         }
638         tsc_oneus_approx = ((tsc_frequency|1) + 999999) / 1000000;
639
640         kprintf("i8254 clock: %lu Hz\n", tot_count);
641         return (tot_count);
642
643 fail:
644         kprintf("failed, using default i8254 clock of %lu Hz\n",
645                 i8254_cputimer.freq);
646         return (i8254_cputimer.freq);
647 }
648
649 static void
650 i8254_restore(void)
651 {
652         timer0_state = ACQUIRED;
653
654         clock_lock();
655
656         /*
657          * Timer0 is our fine-grained variable clock interrupt
658          */
659         outb(TIMER_MODE, TIMER_SEL0 | TIMER_SWSTROBE | TIMER_16BIT);
660         outb(TIMER_CNTR0, 2);   /* lsb */
661         outb(TIMER_CNTR0, 0);   /* msb */
662         clock_unlock();
663
664         if (!i8254_nointr) {
665                 cputimer_intr_register(&i8254_cputimer_intr);
666                 cputimer_intr_select(&i8254_cputimer_intr, 0);
667         }
668
669         /*
670          * Timer1 or timer2 is our free-running clock, but only if another
671          * has not been selected.
672          */
673         cputimer_register(&i8254_cputimer);
674         cputimer_select(&i8254_cputimer, 0);
675 }
676
677 static void
678 i8254_cputimer_construct(struct cputimer *timer, sysclock_t oldclock)
679 {
680         int which;
681
682         /*
683          * Should we use timer 1 or timer 2 ?
684          */
685         which = 0;
686         TUNABLE_INT_FETCH("hw.i8254.walltimer", &which);
687         if (which != 1 && which != 2)
688                 which = 2;
689
690         switch(which) {
691         case 1:
692                 timer->name = "i8254_timer1";
693                 timer->type = CPUTIMER_8254_SEL1;
694                 i8254_walltimer_sel = TIMER_SEL1;
695                 i8254_walltimer_cntr = TIMER_CNTR1;
696                 timer1_state = ACQUIRED;
697                 break;
698         case 2:
699                 timer->name = "i8254_timer2";
700                 timer->type = CPUTIMER_8254_SEL2;
701                 i8254_walltimer_sel = TIMER_SEL2;
702                 i8254_walltimer_cntr = TIMER_CNTR2;
703                 timer2_state = ACQUIRED;
704                 break;
705         }
706
707         timer->base = (oldclock + 0xFFFF) & 0xFFFFFFFFFFFF0000LU;
708
709         clock_lock();
710         outb(TIMER_MODE, i8254_walltimer_sel | TIMER_RATEGEN | TIMER_16BIT);
711         outb(i8254_walltimer_cntr, 0);  /* lsb */
712         outb(i8254_walltimer_cntr, 0);  /* msb */
713         outb(IO_PPI, inb(IO_PPI) | 1);  /* bit 0: enable gate, bit 1: spkr */
714         clock_unlock();
715 }
716
717 static void
718 i8254_cputimer_destruct(struct cputimer *timer)
719 {
720         switch(timer->type) {
721         case CPUTIMER_8254_SEL1:
722             timer1_state = RELEASED;
723             break;
724         case CPUTIMER_8254_SEL2:
725             timer2_state = RELEASED;
726             break;
727         default:
728             break;
729         }
730         timer->type = 0;
731 }
732
733 static void
734 rtc_restore(void)
735 {
736         /* Restore all of the RTC's "status" (actually, control) registers. */
737         writertc(RTC_STATUSB, RTCSB_24HR);
738         writertc(RTC_STATUSA, rtc_statusa);
739         writertc(RTC_STATUSB, rtc_statusb);
740 }
741
742 /*
743  * Restore all the timers.
744  *
745  * This function is called to resynchronize our core timekeeping after a
746  * long halt, e.g. from apm_default_resume() and friends.  It is also 
747  * called if after a BIOS call we have detected munging of the 8254.
748  * It is necessary because cputimer_count() counter's delta may have grown
749  * too large for nanouptime() and friends to handle, or (in the case of 8254
750  * munging) might cause the SYSTIMER code to prematurely trigger.
751  */
752 void
753 timer_restore(void)
754 {
755         crit_enter();
756         if (i8254_cputimer_disable == 0)
757                 i8254_restore();        /* restore timer_freq and hz */
758         rtc_restore();                  /* reenable RTC interrupts */
759         crit_exit();
760 }
761
762 #define MAX_MEASURE_RETRIES     100
763
764 static u_int64_t
765 do_measure(u_int64_t timer_latency, u_int64_t *latency, sysclock_t *time,
766     int *retries)
767 {
768         u_int64_t tsc1, tsc2;
769         u_int64_t threshold;
770         sysclock_t val;
771         int cnt = 0;
772
773         do {
774                 if (cnt > MAX_MEASURE_RETRIES/2)
775                         threshold = timer_latency << 1;
776                 else
777                         threshold = timer_latency + (timer_latency >> 2);
778
779                 cnt++;
780                 tsc1 = rdtsc_ordered();
781                 val = sys_cputimer->count();
782                 tsc2 = rdtsc_ordered();
783         } while (timer_latency > 0 && cnt < MAX_MEASURE_RETRIES &&
784             tsc2 - tsc1 > threshold);
785
786         *retries = cnt - 1;
787         *latency = tsc2 - tsc1;
788         *time = val;
789         return tsc1;
790 }
791
792 static u_int64_t
793 do_calibrate_cputimer(u_int usecs, u_int64_t timer_latency)
794 {
795         if (calibrate_tsc_fast) {
796                 u_int64_t old_tsc1, start_lat1, new_tsc1, end_lat1;
797                 u_int64_t old_tsc2, start_lat2, new_tsc2, end_lat2;
798                 u_int64_t freq1, freq2;
799                 sysclock_t start1, end1, start2, end2;
800                 int retries1, retries2, retries3, retries4;
801
802                 DELAY(1000);
803                 old_tsc1 = do_measure(timer_latency, &start_lat1, &start1,
804                     &retries1);
805                 DELAY(20000);
806                 old_tsc2 = do_measure(timer_latency, &start_lat2, &start2,
807                     &retries2);
808                 DELAY(usecs);
809                 new_tsc1 = do_measure(timer_latency, &end_lat1, &end1,
810                     &retries3);
811                 DELAY(20000);
812                 new_tsc2 = do_measure(timer_latency, &end_lat2, &end2,
813                     &retries4);
814
815                 old_tsc1 += start_lat1;
816                 old_tsc2 += start_lat2;
817                 freq1 = (new_tsc1 - old_tsc1) + (start_lat1 + end_lat1) / 2;
818                 freq2 = (new_tsc2 - old_tsc2) + (start_lat2 + end_lat2) / 2;
819                 end1 -= start1;
820                 end2 -= start2;
821                 /* This should in practice be safe from overflows. */
822                 freq1 = muldivu64(freq1, sys_cputimer->freq, end1);
823                 freq2 = muldivu64(freq2, sys_cputimer->freq, end2);
824                 if (calibrate_test && (retries1 > 0 || retries2 > 0)) {
825                         kprintf("%s: retries: %d, %d, %d, %d\n",
826                             __func__, retries1, retries2, retries3, retries4);
827                 }
828                 if (calibrate_test) {
829                         kprintf("%s: freq1=%ju freq2=%ju avg=%ju\n",
830                             __func__, freq1, freq2, (freq1 + freq2) / 2);
831                 }
832                 return (freq1 + freq2) / 2;
833         } else {
834                 u_int64_t old_tsc, new_tsc;
835                 u_int64_t freq;
836
837                 old_tsc = rdtsc_ordered();
838                 DELAY(usecs);
839                 new_tsc = rdtsc();
840                 freq = new_tsc - old_tsc;
841                 /* This should in practice be safe from overflows. */
842                 freq = (freq * 1000 * 1000) / usecs;
843                 return freq;
844         }
845 }
846
847 /*
848  * Initialize 8254 timer 0 early so that it can be used in DELAY().
849  */
850 void
851 startrtclock(void)
852 {
853         const timecounter_init_t **list;
854         sysclock_t delta, freq;
855
856         callout_init_mp(&sysbeepstop_ch);
857
858         /* 
859          * Can we use the TSC?
860          *
861          * NOTE: If running under qemu, probably a good idea to force the
862          *       TSC because we are not likely to detect it as being
863          *       invariant or mpsyncd if you don't.  This will greatly
864          *       reduce SMP contention.
865          */
866         if (cpu_feature & CPUID_TSC) {
867                 tsc_present = 1;
868                 TUNABLE_INT_FETCH("hw.tsc_cputimer_force", &tsc_invariant);
869
870                 if ((cpu_vendor_id == CPU_VENDOR_INTEL ||
871                      cpu_vendor_id == CPU_VENDOR_AMD) &&
872                     cpu_exthigh >= 0x80000007) {
873                         u_int regs[4];
874
875                         do_cpuid(0x80000007, regs);
876                         if (regs[3] & 0x100)
877                                 tsc_invariant = 1;
878                 }
879         } else {
880                 tsc_present = 0;
881         }
882
883         /*
884          * Initial RTC state, don't do anything unexpected
885          */
886         writertc(RTC_STATUSA, rtc_statusa);
887         writertc(RTC_STATUSB, RTCSB_24HR);
888
889         SET_FOREACH(list, timecounter_init_set) {
890                 if ((*list)->configure != NULL)
891                         (*list)->configure();
892         }
893
894         /*
895          * If tsc_frequency is already initialized now, and a flag is set
896          * that i8254 timer is unneeded, we are done.
897          */
898         if (tsc_frequency != 0 && i8254_cputimer_disable != 0)
899                 goto done;
900
901         /*
902          * Set the 8254 timer0 in TIMER_SWSTROBE mode and cause it to 
903          * generate an interrupt, which we will ignore for now.
904          *
905          * Set the 8254 timer1 in TIMER_RATEGEN mode and load 0x0000
906          * (so it counts a full 2^16 and repeats).  We will use this timer
907          * for our counting.
908          */
909         if (i8254_cputimer_disable == 0)
910                 i8254_restore();
911
912         kprintf("Using cputimer %s for TSC calibration\n", sys_cputimer->name);
913
914         /*
915          * When booting without verbose messages, it's pointless to run the
916          * calibrate_clocks() calibration code, when we don't use the
917          * results in any way. With bootverbose, we are at least printing
918          *  this information to the kernel log.
919          */
920         if (i8254_cputimer_disable != 0 ||
921             (calibrate_timers_with_rtc == 0 && !bootverbose)) {
922                 goto skip_rtc_based;
923         }
924
925         freq = calibrate_clocks();
926 #ifdef CLK_CALIBRATION_LOOP
927         if (bootverbose) {
928                 int c;
929
930                 cnpoll(TRUE);
931                 kprintf("Press a key on the console to "
932                         "abort clock calibration\n");
933                 while ((c = cncheckc()) == -1 || c == NOKEY)
934                         calibrate_clocks();
935                 cnpoll(FALSE);
936         }
937 #endif
938
939         /*
940          * Use the calibrated i8254 frequency if it seems reasonable.
941          * Otherwise use the default, and don't use the calibrated i586
942          * frequency.
943          */
944         delta = freq > i8254_cputimer.freq ? 
945                 freq - i8254_cputimer.freq : i8254_cputimer.freq - freq;
946         if (delta < i8254_cputimer.freq / 100) {
947                 if (calibrate_timers_with_rtc == 0) {
948                         kprintf(
949 "hw.calibrate_timers_with_rtc not set - using default i8254 frequency\n");
950                         freq = i8254_cputimer.freq;
951                 }
952                 /*
953                  * NOTE:
954                  * Interrupt timer's freq must be adjusted
955                  * before we change the cuptimer's frequency.
956                  */
957                 i8254_cputimer_intr.freq = freq;
958                 cputimer_set_frequency(&i8254_cputimer, freq);
959         } else {
960                 if (bootverbose)
961                         kprintf("%lu Hz differs from default of %lu Hz "
962                                 "by more than 1%%\n",
963                                 freq, i8254_cputimer.freq);
964                 tsc_frequency = 0;
965         }
966
967         if (tsc_frequency != 0 && calibrate_timers_with_rtc == 0) {
968                 kprintf("hw.calibrate_timers_with_rtc not "
969                         "set - using old calibration method\n");
970                 tsc_frequency = 0;
971         }
972
973 skip_rtc_based:
974         if (tsc_present && tsc_frequency == 0) {
975                 u_int cnt;
976                 u_int64_t cputime_latency_tsc = 0, max = 0, min = 0;
977                 int i;
978
979                 for (i = 0; i < 10; i++) {
980                         /* Warm up */
981                         (void)sys_cputimer->count();
982                 }
983                 for (i = 0; i < 100; i++) {
984                         u_int64_t old_tsc, new_tsc;
985
986                         old_tsc = rdtsc_ordered();
987                         (void)sys_cputimer->count();
988                         new_tsc = rdtsc_ordered();
989                         cputime_latency_tsc += (new_tsc - old_tsc);
990                         if (max < (new_tsc - old_tsc))
991                                 max = new_tsc - old_tsc;
992                         if (min == 0 || min > (new_tsc - old_tsc))
993                                 min = new_tsc - old_tsc;
994                 }
995                 cputime_latency_tsc /= 100;
996                 kprintf(
997                     "Timer latency (in TSC ticks): %lu min=%lu max=%lu\n",
998                     cputime_latency_tsc, min, max);
999                 /* XXX Instead of this, properly filter out outliers. */
1000                 cputime_latency_tsc = min;
1001
1002                 if (calibrate_test > 0) {
1003                         u_int64_t values[20], avg = 0;
1004                         for (i = 1; i <= 20; i++) {
1005                                 u_int64_t freq;
1006
1007                                 freq = do_calibrate_cputimer(i * 100 * 1000,
1008                                     cputime_latency_tsc);
1009                                 values[i - 1] = freq;
1010                         }
1011                         /* Compute an average TSC for the 1s to 2s delays. */
1012                         for (i = 10; i < 20; i++)
1013                                 avg += values[i];
1014                         avg /= 10;
1015                         for (i = 0; i < 20; i++) {
1016                                 kprintf("%ums: %lu (Diff from average: %ld)\n",
1017                                     (i + 1) * 100, values[i],
1018                                     (int64_t)(values[i] - avg));
1019                         }
1020                 }
1021
1022                 if (calibrate_tsc_fast > 0) {
1023                         /* HPET would typically be >10MHz */
1024                         if (sys_cputimer->freq >= 10000000)
1025                                 cnt = 200000;
1026                         else
1027                                 cnt = 500000;
1028                 } else {
1029                         cnt = 1000000;
1030                 }
1031
1032                 tsc_frequency = do_calibrate_cputimer(cnt, cputime_latency_tsc);
1033                 if (bootverbose && calibrate_timers_with_rtc) {
1034                         kprintf("TSC clock: %jd Hz (Method B)\n",
1035                             (intmax_t)tsc_frequency);
1036                 }
1037         }
1038
1039 done:
1040         if (tsc_present) {
1041                 kprintf("TSC%s clock: %jd Hz\n",
1042                     tsc_invariant ? " invariant" : "",
1043                     (intmax_t)tsc_frequency);
1044         }
1045         tsc_oneus_approx = ((tsc_frequency|1) + 999999) / 1000000;
1046
1047         EVENTHANDLER_REGISTER(shutdown_post_sync, resettodr_on_shutdown,
1048                               NULL, SHUTDOWN_PRI_LAST);
1049 }
1050
1051 /*
1052  * Sync the time of day back to the RTC on shutdown, but only if
1053  * we have already loaded it and have not crashed.
1054  */
1055 static void
1056 resettodr_on_shutdown(void *arg __unused)
1057 {
1058         if (rtc_loaded && panicstr == NULL) {
1059                 resettodr();
1060         }
1061 }
1062
1063 /*
1064  * Initialize the time of day register, based on the time base which is, e.g.
1065  * from a filesystem.
1066  */
1067 void
1068 inittodr(time_t base)
1069 {
1070         unsigned long   sec, days;
1071         int             year, month;
1072         int             y, m;
1073         struct timespec ts;
1074
1075         if (base) {
1076                 ts.tv_sec = base;
1077                 ts.tv_nsec = 0;
1078                 set_timeofday(&ts);
1079         }
1080
1081         /* Look if we have a RTC present and the time is valid */
1082         if (!(rtcin(RTC_STATUSD) & RTCSD_PWR))
1083                 goto wrong_time;
1084
1085         /* wait for time update to complete */
1086         /* If RTCSA_TUP is zero, we have at least 244us before next update */
1087         crit_enter();
1088         while (rtcin(RTC_STATUSA) & RTCSA_TUP) {
1089                 crit_exit();
1090                 crit_enter();
1091         }
1092
1093         days = 0;
1094 #ifdef USE_RTC_CENTURY
1095         year = readrtc(RTC_YEAR) + readrtc(RTC_CENTURY) * 100;
1096 #else
1097         year = readrtc(RTC_YEAR) + 1900;
1098         if (year < 1970)
1099                 year += 100;
1100 #endif
1101         if (year < 1970) {
1102                 crit_exit();
1103                 goto wrong_time;
1104         }
1105         month = readrtc(RTC_MONTH);
1106         for (m = 1; m < month; m++)
1107                 days += daysinmonth[m-1];
1108         if ((month > 2) && LEAPYEAR(year))
1109                 days ++;
1110         days += readrtc(RTC_DAY) - 1;
1111         for (y = 1970; y < year; y++)
1112                 days += DAYSPERYEAR + LEAPYEAR(y);
1113         sec = ((( days * 24 +
1114                   readrtc(RTC_HRS)) * 60 +
1115                   readrtc(RTC_MIN)) * 60 +
1116                   readrtc(RTC_SEC));
1117         /* sec now contains the number of seconds, since Jan 1 1970,
1118            in the local time zone */
1119
1120         sec += tz.tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0);
1121
1122         y = (int)(time_second - sec);
1123         if (y <= -2 || y >= 2) {
1124                 /* badly off, adjust it */
1125                 ts.tv_sec = sec;
1126                 ts.tv_nsec = 0;
1127                 set_timeofday(&ts);
1128         }
1129         rtc_loaded = 1;
1130         crit_exit();
1131         return;
1132
1133 wrong_time:
1134         kprintf("Invalid time in real time clock.\n");
1135         kprintf("Check and reset the date immediately!\n");
1136 }
1137
1138 /*
1139  * Write system time back to RTC
1140  */
1141 void
1142 resettodr(void)
1143 {
1144         struct timeval tv;
1145         unsigned long tm;
1146         int m;
1147         int y;
1148
1149         if (disable_rtc_set)
1150                 return;
1151
1152         microtime(&tv);
1153         tm = tv.tv_sec;
1154
1155         crit_enter();
1156         /* Disable RTC updates and interrupts. */
1157         writertc(RTC_STATUSB, RTCSB_HALT | RTCSB_24HR);
1158
1159         /* Calculate local time to put in RTC */
1160
1161         tm -= tz.tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0);
1162
1163         writertc(RTC_SEC, bin2bcd(tm%60)); tm /= 60;    /* Write back Seconds */
1164         writertc(RTC_MIN, bin2bcd(tm%60)); tm /= 60;    /* Write back Minutes */
1165         writertc(RTC_HRS, bin2bcd(tm%24)); tm /= 24;    /* Write back Hours   */
1166
1167         /* We have now the days since 01-01-1970 in tm */
1168         writertc(RTC_WDAY, (tm+4)%7);                   /* Write back Weekday */
1169         for (y = 1970, m = DAYSPERYEAR + LEAPYEAR(y);
1170              tm >= m;
1171              y++,      m = DAYSPERYEAR + LEAPYEAR(y))
1172              tm -= m;
1173
1174         /* Now we have the years in y and the day-of-the-year in tm */
1175         writertc(RTC_YEAR, bin2bcd(y%100));             /* Write back Year    */
1176 #ifdef USE_RTC_CENTURY
1177         writertc(RTC_CENTURY, bin2bcd(y/100));          /* ... and Century    */
1178 #endif
1179         for (m = 0; ; m++) {
1180                 int ml;
1181
1182                 ml = daysinmonth[m];
1183                 if (m == 1 && LEAPYEAR(y))
1184                         ml++;
1185                 if (tm < ml)
1186                         break;
1187                 tm -= ml;
1188         }
1189
1190         writertc(RTC_MONTH, bin2bcd(m + 1));            /* Write back Month   */
1191         writertc(RTC_DAY, bin2bcd(tm + 1));             /* Write back Month Day */
1192
1193         /* Reenable RTC updates and interrupts. */
1194         writertc(RTC_STATUSB, rtc_statusb);
1195         crit_exit();
1196 }
1197
1198 static int
1199 i8254_ioapic_trial(int irq, struct cputimer_intr *cti)
1200 {
1201         sysclock_t base;
1202         long lastcnt;
1203
1204         /*
1205          * Following code assumes the 8254 is the cpu timer,
1206          * so make sure it is.
1207          */
1208         /*KKASSERT(sys_cputimer == &i8254_cputimer); (tested by CuteLarva) */
1209         KKASSERT(cti == &i8254_cputimer_intr);
1210
1211         lastcnt = get_interrupt_counter(irq, mycpuid);
1212
1213         /*
1214          * Force an 8254 Timer0 interrupt and wait 1/100s for
1215          * it to happen, then see if we got it.
1216          */
1217         kprintf("IOAPIC: testing 8254 interrupt delivery...");
1218
1219         i8254_intr_reload(cti, sys_cputimer->fromus(2));
1220         base = sys_cputimer->count();
1221         while (sys_cputimer->count() - base < sys_cputimer->freq / 100)
1222                 ; /* nothing */
1223
1224         if (get_interrupt_counter(irq, mycpuid) - lastcnt == 0) {
1225                 kprintf(" failed\n");
1226                 return ENOENT;
1227         } else {
1228                 kprintf(" success\n");
1229         }
1230         return 0;
1231 }
1232
1233 /*
1234  * Start both clocks running.  DragonFly note: the stat clock is no longer
1235  * used.  Instead, 8254 based systimers are used for all major clock
1236  * interrupts.
1237  */
1238 static void
1239 i8254_intr_initclock(struct cputimer_intr *cti, boolean_t selected)
1240 {
1241         void *clkdesc = NULL;
1242         int irq = 0, mixed_mode = 0, error;
1243
1244         KKASSERT(mycpuid == 0);
1245
1246         if (!selected && i8254_intr_disable)
1247                 goto nointr;
1248
1249         /*
1250          * The stat interrupt mask is different without the
1251          * statistics clock.  Also, don't set the interrupt
1252          * flag which would normally cause the RTC to generate
1253          * interrupts.
1254          */
1255         rtc_statusb = RTCSB_24HR;
1256
1257         /* Finish initializing 8254 timer 0. */
1258         if (ioapic_enable) {
1259                 irq = machintr_legacy_intr_find(0, INTR_TRIGGER_EDGE,
1260                         INTR_POLARITY_HIGH);
1261                 if (irq < 0) {
1262 mixed_mode_setup:
1263                         error = ioapic_conf_legacy_extint(0);
1264                         if (!error) {
1265                                 irq = machintr_legacy_intr_find(0,
1266                                     INTR_TRIGGER_EDGE, INTR_POLARITY_HIGH);
1267                                 if (irq < 0)
1268                                         error = ENOENT;
1269                         }
1270
1271                         if (error) {
1272                                 if (!selected) {
1273                                         kprintf("IOAPIC: setup mixed mode for "
1274                                                 "irq 0 failed: %d\n", error);
1275                                         goto nointr;
1276                                 } else {
1277                                         panic("IOAPIC: setup mixed mode for "
1278                                               "irq 0 failed: %d\n", error);
1279                                 }
1280                         }
1281                         mixed_mode = 1;
1282                 }
1283                 clkdesc = register_int(irq, clkintr, NULL, "clk",
1284                                        NULL,
1285                                        INTR_EXCL | INTR_CLOCK |
1286                                        INTR_NOPOLL | INTR_MPSAFE |
1287                                        INTR_NOENTROPY, 0);
1288         } else {
1289                 register_int(0, clkintr, NULL, "clk", NULL,
1290                              INTR_EXCL | INTR_CLOCK |
1291                              INTR_NOPOLL | INTR_MPSAFE |
1292                              INTR_NOENTROPY, 0);
1293         }
1294
1295         /* Initialize RTC. */
1296         writertc(RTC_STATUSA, rtc_statusa);
1297         writertc(RTC_STATUSB, RTCSB_24HR);
1298
1299         if (ioapic_enable) {
1300                 error = i8254_ioapic_trial(irq, cti);
1301                 if (error) {
1302                         if (mixed_mode) {
1303                                 if (!selected) {
1304                                         kprintf("IOAPIC: mixed mode for irq %d "
1305                                                 "trial failed: %d\n",
1306                                                 irq, error);
1307                                         goto nointr;
1308                                 } else {
1309                                         panic("IOAPIC: mixed mode for irq %d "
1310                                               "trial failed: %d\n", irq, error);
1311                                 }
1312                         } else {
1313                                 kprintf("IOAPIC: warning 8254 is not connected "
1314                                         "to the correct pin, try mixed mode\n");
1315                                 unregister_int(clkdesc, 0);
1316                                 goto mixed_mode_setup;
1317                         }
1318                 }
1319         }
1320         return;
1321
1322 nointr:
1323         i8254_nointr = 1; /* don't try to register again */
1324         cputimer_intr_deregister(cti);
1325 }
1326
1327 void
1328 setstatclockrate(int newhz)
1329 {
1330         if (newhz == RTC_PROFRATE)
1331                 rtc_statusa = RTCSA_DIVIDER | RTCSA_PROF;
1332         else
1333                 rtc_statusa = RTCSA_DIVIDER | RTCSA_NOPROF;
1334         writertc(RTC_STATUSA, rtc_statusa);
1335 }
1336
#if 0
/*
 * Disabled: timecounter read method returning the current TSC value
 * truncated to `unsigned'.  The tc argument is not used.
 */
static unsigned
tsc_get_timecount(struct timecounter *tc)
{
        unsigned now = rdtsc();

        return (now);
}
#endif
1344
#ifdef KERN_TIMESTAMP
/*
 * Kernel timestamp ring, exported read-only as debug.timestamp.
 *
 * Entries are stored in pairs: the low 32 bits of the TSC followed by
 * the caller supplied tag.  The slot following the most recent pair is
 * zeroed to mark the end of the recorded data.
 */
#define KERN_TIMESTAMP_SIZE 16384
static u_long tsc[KERN_TIMESTAMP_SIZE];
SYSCTL_OPAQUE(_debug, OID_AUTO, timestamp, CTLFLAG_RD, tsc,
        sizeof(tsc), "LU", "Kernel timestamps");

/*
 * Record one (TSC, x) pair in the timestamp ring, wrapping to the start
 * of the ring when the end is reached.
 */
void
_TSTMP(u_int32_t x)
{
        static int slot;

        tsc[slot] = (u_int32_t)rdtsc();
        tsc[slot + 1] = x;

        slot += 2;
        if (slot >= KERN_TIMESTAMP_SIZE)
                slot = 0;

        /* mark last entry */
        tsc[slot] = 0;
}
#endif /* KERN_TIMESTAMP */
1363
1364 /*
1365  *
1366  */
1367
1368 static int
1369 hw_i8254_timestamp(SYSCTL_HANDLER_ARGS)
1370 {
1371     sysclock_t count;
1372     uint64_t tscval;
1373     char buf[32];
1374
1375     crit_enter();
1376     if (sys_cputimer == &i8254_cputimer)
1377         count = sys_cputimer->count();
1378     else
1379         count = 0;
1380     if (tsc_present)
1381         tscval = rdtsc();
1382     else
1383         tscval = 0;
1384     crit_exit();
1385     ksnprintf(buf, sizeof(buf), "%016lx %016lx", count, tscval);
1386     return(SYSCTL_OUT(req, buf, strlen(buf) + 1));
1387 }
1388
1389 struct tsc_mpsync_info {
1390         volatile int            tsc_ready_cnt;
1391         volatile int            tsc_done_cnt;
1392         volatile int            tsc_command;
1393         volatile int            unused01[5];
1394         struct {
1395                 uint64_t        v;
1396                 uint64_t        unused02;
1397         } tsc_saved[MAXCPU];
1398 } __cachealign;
1399
#if 0
/*
 * Disabled: older TSC MP synchronization test driven by lwkt_cpusync.
 *
 * Repeatedly publishes the local TSC in arg->tsc_target while the other
 * cpus are interlocked, for at least TSC_TEST_TRYMIN iterations and
 * roughly 100ms worth of TSC ticks, giving up after TSC_TEST_TRYMAX
 * iterations or as soon as arg->tsc_mpsync has been cleared.
 *
 * NOTE(review): the parameter is named `arg' to match the body, which
 * would not compile with any other name.  struct tsc_mpsync_thr and
 * tsc_mpsync_test_remote are not defined in the visible part of this
 * file; confirm them before re-enabling this code.
 */
static void
tsc_mpsync_test_loop(struct tsc_mpsync_thr *arg)
{
        struct globaldata *gd = mycpu;
        tsc_uclock_t test_end, test_begin;
        u_int i;

        if (bootverbose) {
                kprintf("cpu%d: TSC testing MP synchronization ...\n",
                    gd->gd_cpuid);
        }

        test_begin = rdtsc_ordered();
        /* Run test for 100ms */
        test_end = test_begin + (tsc_frequency / 10);

        arg->tsc_mpsync = 1;
        arg->tsc_target = test_begin;

#define TSC_TEST_TRYMAX         1000000 /* Make sure we could stop */
#define TSC_TEST_TRYMIN         50000

        for (i = 0; i < TSC_TEST_TRYMAX; ++i) {
                struct lwkt_cpusync cs;

                crit_enter();
                lwkt_cpusync_init(&cs, gd->gd_other_cpus,
                    tsc_mpsync_test_remote, arg);
                lwkt_cpusync_interlock(&cs);
                cpu_pause();
                arg->tsc_target = rdtsc_ordered();
                cpu_mfence();
                lwkt_cpusync_deinterlock(&cs);
                crit_exit();
                cpu_pause();

                if (!arg->tsc_mpsync) {
                        kprintf("cpu%d: TSC is not MP synchronized @%u\n",
                            gd->gd_cpuid, i);
                        break;
                }
                if (arg->tsc_target > test_end && i >= TSC_TEST_TRYMIN)
                        break;
        }

#undef TSC_TEST_TRYMIN
#undef TSC_TEST_TRYMAX

        /* The target never moved past its initial value */
        if (arg->tsc_target == test_begin) {
                kprintf("cpu%d: TSC does not tick?!\n", gd->gd_cpuid);
                /* XXX disable TSC? */
                tsc_invariant = 0;
                arg->tsc_mpsync = 0;
                return;
        }

        if (arg->tsc_mpsync && bootverbose) {
                kprintf("cpu%d: TSC is MP synchronized after %u tries\n",
                    gd->gd_cpuid, i);
        }
}

#endif
1464
1465 #define TSC_TEST_COUNT          50000
1466
1467 static void
1468 tsc_mpsync_ap_thread(void *xinfo)
1469 {
1470         struct tsc_mpsync_info *info = xinfo;
1471         int cpu = mycpuid;
1472         int i;
1473
1474         /*
1475          * Tell main loop that we are ready and wait for initiation
1476          */
1477         atomic_add_int(&info->tsc_ready_cnt, 1);
1478         while (info->tsc_command == 0) {
1479                 lwkt_force_switch();
1480         }
1481
1482         /*
1483          * Run test for 10000 loops or until tsc_done_cnt != 0 (another
1484          * cpu has finished its test), then increment done.
1485          */
1486         crit_enter();
1487         for (i = 0; i < TSC_TEST_COUNT && info->tsc_done_cnt == 0; ++i) {
1488                 info->tsc_saved[cpu].v = rdtsc_ordered();
1489         }
1490         crit_exit();
1491         atomic_add_int(&info->tsc_done_cnt, 1);
1492
1493         lwkt_exit();
1494 }
1495
1496 static void
1497 tsc_mpsync_test(void)
1498 {
1499         int cpu;
1500         int try;
1501
1502         if (!tsc_invariant) {
1503                 /* Not even invariant TSC */
1504                 return;
1505         }
1506
1507         if (ncpus == 1) {
1508                 /* Only one CPU */
1509                 tsc_mpsync = 1;
1510                 return;
1511         }
1512
1513         /*
1514          * Forcing can be used w/qemu to reduce contention
1515          */
1516         TUNABLE_INT_FETCH("hw.tsc_cputimer_force", &tsc_mpsync);
1517
1518         if (tsc_mpsync == 0) {
1519                 switch (cpu_vendor_id) {
1520                 case CPU_VENDOR_INTEL:
1521                         /*
1522                          * Intel probably works
1523                          */
1524                         break;
1525
1526                 case CPU_VENDOR_AMD:
1527                         /*
1528                          * For AMD 15h and 16h (i.e. The Bulldozer and Jaguar
1529                          * architectures) we have to watch out for
1530                          * Erratum 778:
1531                          *     "Processor Core Time Stamp Counters May
1532                          *      Experience Drift"
1533                          * This Erratum is only listed for cpus in Family
1534                          * 15h < Model 30h and for 16h < Model 30h.
1535                          *
1536                          * AMD < Bulldozer probably doesn't work
1537                          */
1538                         if (CPUID_TO_FAMILY(cpu_id) == 0x15 ||
1539                             CPUID_TO_FAMILY(cpu_id) == 0x16) {
1540                                 if (CPUID_TO_MODEL(cpu_id) < 0x30)
1541                                         return;
1542                         } else if (CPUID_TO_FAMILY(cpu_id) < 0x17) {
1543                                 return;
1544                         }
1545                         break;
1546
1547                 default:
1548                         /* probably won't work */
1549                         return;
1550                 }
1551         } else if (tsc_mpsync < 0) {
1552                 kprintf("TSC MP synchronization test is disabled\n");
1553                 tsc_mpsync = 0;
1554                 return;
1555         }
1556
1557         /*
1558          * Test even if forced to 1 above.  If forced, we will use the TSC
1559          * even if the test fails.  (set forced to -1 to disable entirely).
1560          */
1561         kprintf("TSC testing MP synchronization ...\n");
1562
1563         /*
1564          * Test TSC MP synchronization on APs.  Try up to 4 times.
1565          */
1566         for (try = 0; try < 4; ++try) {
1567                 struct tsc_mpsync_info info;
1568                 uint64_t last;
1569                 int64_t xdelta;
1570                 int64_t delta;
1571
1572                 bzero(&info, sizeof(info));
1573
1574                 for (cpu = 0; cpu < ncpus; ++cpu) {
1575                         thread_t td;
1576                         lwkt_create(tsc_mpsync_ap_thread, &info, &td,
1577                                     NULL, TDF_NOSTART, cpu,
1578                                     "tsc mpsync %d", cpu);
1579                         lwkt_setpri_initial(td, curthread->td_pri);
1580                         lwkt_schedule(td);
1581                 }
1582                 while (info.tsc_ready_cnt != ncpus)
1583                         lwkt_force_switch();
1584
1585                 /*
1586                  * All threads are ready, start the test and wait for
1587                  * completion.
1588                  */
1589                 info.tsc_command = 1;
1590                 while (info.tsc_done_cnt != ncpus)
1591                         lwkt_force_switch();
1592
1593                 /*
1594                  * Process results
1595                  */
1596                 last = info.tsc_saved[0].v;
1597                 delta = 0;
1598                 for (cpu = 0; cpu < ncpus; ++cpu) {
1599                         xdelta = (int64_t)(info.tsc_saved[cpu].v - last);
1600                         last = info.tsc_saved[cpu].v;
1601                         if (xdelta < 0)
1602                                 xdelta = -xdelta;
1603                         delta += xdelta;
1604
1605                 }
1606
1607                 /*
1608                  * Result from attempt.  If its too wild just stop now.
1609                  * Also break out if we succeed, no need to try further.
1610                  */
1611                 kprintf("TSC MPSYNC TEST %jd %d -> %jd (10uS=%jd)\n",
1612                         delta, ncpus, delta / ncpus,
1613                         tsc_frequency / 100000);
1614                 if (delta / ncpus > tsc_frequency / 100)
1615                         break;
1616                 if (delta / ncpus < tsc_frequency / 100000) {
1617                         tsc_mpsync = 1;
1618                         break;
1619                 }
1620         }
1621
1622         if (tsc_mpsync)
1623                 kprintf("TSC is MP synchronized\n");
1624         else
1625                 kprintf("TSC is not MP synchronized\n");
1626 }
1627 SYSINIT(tsc_mpsync, SI_BOOT2_FINISH_SMP, SI_ORDER_ANY, tsc_mpsync_test, NULL);
1628
1629 SYSCTL_NODE(_hw, OID_AUTO, i8254, CTLFLAG_RW, 0, "I8254");
1630 SYSCTL_UINT(_hw_i8254, OID_AUTO, freq, CTLFLAG_RD, &i8254_cputimer.freq, 0,
1631             "frequency");
1632 SYSCTL_PROC(_hw_i8254, OID_AUTO, timestamp, CTLTYPE_STRING|CTLFLAG_RD,
1633             0, 0, hw_i8254_timestamp, "A", "");
1634
1635 SYSCTL_INT(_hw, OID_AUTO, tsc_present, CTLFLAG_RD,
1636             &tsc_present, 0, "TSC Available");
1637 SYSCTL_INT(_hw, OID_AUTO, tsc_invariant, CTLFLAG_RD,
1638             &tsc_invariant, 0, "Invariant TSC");
1639 SYSCTL_INT(_hw, OID_AUTO, tsc_mpsync, CTLFLAG_RD,
1640             &tsc_mpsync, 0, "TSC is synchronized across CPUs");
1641 SYSCTL_QUAD(_hw, OID_AUTO, tsc_frequency, CTLFLAG_RD,
1642             &tsc_frequency, 0, "TSC Frequency");