sys/kern/kern_time.c

   1 /*
   2  * Copyright (c) 1982, 1986, 1989, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  * 3. All advertising materials mentioning features or use of this software
  14  *    must display the following acknowledgement:
  15  *      This product includes software developed by the University of
  16  *      California, Berkeley and its contributors.
  17  * 4. Neither the name of the University nor the names of its contributors
  18  *    may be used to endorse or promote products derived from this software
  19  *    without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  31  * SUCH DAMAGE.
  32  *
  33  *      @(#)kern_time.c 8.1 (Berkeley) 6/10/93
  34  * $FreeBSD: src/sys/kern/kern_time.c,v 1.68.2.1 2002/10/01 08:00:41 bde Exp $
  35  */
  36
  37 #include <sys/param.h>
  38 #include <sys/systm.h>
  39 #include <sys/buf.h>
  40 #include <sys/sysproto.h>
  41 #include <sys/resourcevar.h>
  42 #include <sys/signalvar.h>
  43 #include <sys/kernel.h>
  44 #include <sys/sysent.h>
  45 #include <sys/sysunion.h>
  46 #include <sys/proc.h>
  47 #include <sys/priv.h>
  48 #include <sys/time.h>
  49 #include <sys/vnode.h>
  50 #include <sys/sysctl.h>
  51 #include <sys/kern_syscall.h>
  52 #include <vm/vm.h>
  53 #include <vm/vm_extern.h>
  54
  55 #include <sys/msgport2.h>
  56 #include <sys/thread2.h>
  57 #include <sys/mplock2.h>
  58
  59 struct timezone tz;
  60
  61 /*
  62  * Time of day and interval timer support.
  63  *
  64  * These routines provide the kernel entry points to get and set
  65  * the time-of-day and per-process interval timers.  Subroutines
  66  * here provide support for adding and subtracting timeval structures
  67  * and decrementing interval timers, optionally reloading the interval
  68  * timers when they expire.
  69  */
  70
  71 static int      settime(struct timeval *);
  72 static void     timevalfix(struct timeval *);
  73
  74 /*
  75  * Nanosleep tries very hard to sleep for a precisely requested time
  76  * interval, down to 1uS.  The administrator can impose a minimum delay
  77  * and a delay below which we hard-loop instead of initiate a timer
  78  * interrupt and sleep.
  79  *
  80  * For machines under high loads it might be beneficial to increase min_us
  81  * to e.g. 1000uS (1ms) so spining processes sleep meaningfully.
  82  */
  83 static int     nanosleep_min_us = 10;
  84 static int     nanosleep_hard_us = 100;
  85 static int     gettimeofday_quick = 0;
  86 SYSCTL_INT(_kern, OID_AUTO, nanosleep_min_us, CTLFLAG_RW,
  87            &nanosleep_min_us, 0, "")
  88 SYSCTL_INT(_kern, OID_AUTO, nanosleep_hard_us, CTLFLAG_RW,
  89            &nanosleep_hard_us, 0, "")
  90 SYSCTL_INT(_kern, OID_AUTO, gettimeofday_quick, CTLFLAG_RW,
  91            &gettimeofday_quick, 0, "")
  92
  93 static int
  94 settime(struct timeval *tv)
  95 {
  96         struct timeval delta, tv1, tv2;
  97         static struct timeval maxtime, laststep;
  98         struct timespec ts;
  99         int origcpu;
 100
 101         if ((origcpu = mycpu->gd_cpuid) != 0)
 102                 lwkt_setcpu_self(globaldata_find(0));
 103
 104         crit_enter();
 105         microtime(&tv1);
 106         delta = *tv;
 107         timevalsub(&delta, &tv1);
 108
 109         /*
 110          * If the system is secure, we do not allow the time to be
 111          * set to a value earlier than 1 second less than the highest
 112          * time we have yet seen. The worst a miscreant can do in
 113          * this circumstance is "freeze" time. He couldn't go
 114          * back to the past.
 115          *
 116          * We similarly do not allow the clock to be stepped more
 117          * than one second, nor more than once per second. This allows
 118          * a miscreant to make the clock march double-time, but no worse.
 119          */
 120         if (securelevel > 1) {
 121                 if (delta.tv_sec < 0 || delta.tv_usec < 0) {
 122                         /*
 123                          * Update maxtime to latest time we've seen.
 124                          */
 125                         if (tv1.tv_sec > maxtime.tv_sec)
 126                                 maxtime = tv1;
 127                         tv2 = *tv;
 128                         timevalsub(&tv2, &maxtime);
 129                         if (tv2.tv_sec < -1) {
 130                                 tv->tv_sec = maxtime.tv_sec - 1;
 131                                 kprintf("Time adjustment clamped to -1 second\n");
 132                         }
 133                 } else {
 134                         if (tv1.tv_sec == laststep.tv_sec) {
 135                                 crit_exit();
 136                                 return (EPERM);
 137                         }
 138                         if (delta.tv_sec > 1) {
 139                                 tv->tv_sec = tv1.tv_sec + 1;
 140                                 kprintf("Time adjustment clamped to +1 second\n");
 141                         }
 142                         laststep = *tv;
 143                 }
 144         }
 145
 146         ts.tv_sec = tv->tv_sec;
 147         ts.tv_nsec = tv->tv_usec * 1000;
 148         set_timeofday(&ts);
 149         crit_exit();
 150
 151         if (origcpu != 0)
 152                 lwkt_setcpu_self(globaldata_find(origcpu));
 153
 154         resettodr();
 155         return (0);
 156 }
 157
 158 /*
 159  * MPSAFE
 160  */
 161 int
 162 kern_clock_gettime(clockid_t clock_id, struct timespec *ats)
 163 {
 164         int error = 0;
 165         struct proc *p;
 166
 167         switch(clock_id) {
 168         case CLOCK_REALTIME:
 169         case CLOCK_REALTIME_PRECISE:
 170                 nanotime(ats);
 171                 break;
 172         case CLOCK_REALTIME_FAST:
 173                 getnanotime(ats);
 174                 break;
 175         case CLOCK_MONOTONIC:
 176         case CLOCK_MONOTONIC_PRECISE:
 177         case CLOCK_UPTIME:
 178         case CLOCK_UPTIME_PRECISE:
 179                 nanouptime(ats);
 180                 break;
 181         case CLOCK_MONOTONIC_FAST:
 182         case CLOCK_UPTIME_FAST:
 183                 getnanouptime(ats);
 184                 break;
 185         case CLOCK_VIRTUAL:
 186                 p = curproc;
 187                 ats->tv_sec = p->p_timer[ITIMER_VIRTUAL].it_value.tv_sec;
 188                 ats->tv_nsec = p->p_timer[ITIMER_VIRTUAL].it_value.tv_usec *
 189                                1000;
 190                 break;
 191         case CLOCK_PROF:
 192                 p = curproc;
 193                 ats->tv_sec = p->p_timer[ITIMER_PROF].it_value.tv_sec;
 194                 ats->tv_nsec = p->p_timer[ITIMER_PROF].it_value.tv_usec *
 195                                1000;
 196                 break;
 197         case CLOCK_SECOND:
 198                 ats->tv_sec = time_second;
 199                 ats->tv_nsec = 0;
 200                 break;
 201         default:
 202                 error = EINVAL;
 203                 break;
 204         }
 205         return (error);
 206 }
 207
 208 /*
 209  * MPSAFE
 210  */
 211 int
 212 sys_clock_gettime(struct clock_gettime_args *uap)
 213 {
 214         struct timespec ats;
 215         int error;
 216
 217         error = kern_clock_gettime(uap->clock_id, &ats);
 218         if (error == 0)
 219                 error = copyout(&ats, uap->tp, sizeof(ats));
 220
 221         return (error);
 222 }
 223
 224 int
 225 kern_clock_settime(clockid_t clock_id, struct timespec *ats)
 226 {
 227         struct thread *td = curthread;
 228         struct timeval atv;
 229         int error;
 230
 231         if ((error = priv_check(td, PRIV_CLOCK_SETTIME)) != 0)
 232                 return (error);
 233         if (clock_id != CLOCK_REALTIME)
 234                 return (EINVAL);
 235         if (ats->tv_nsec < 0 || ats->tv_nsec >= 1000000000)
 236                 return (EINVAL);
 237
 238         TIMESPEC_TO_TIMEVAL(&atv, ats);
 239         error = settime(&atv);
 240         return (error);
 241 }
 242
 243 /*
 244  * MPALMOSTSAFE
 245  */
 246 int
 247 sys_clock_settime(struct clock_settime_args *uap)
 248 {
 249         struct timespec ats;
 250         int error;
 251
 252         if ((error = copyin(uap->tp, &ats, sizeof(ats))) != 0)
 253                 return (error);
 254
 255         get_mplock();
 256         error = kern_clock_settime(uap->clock_id, &ats);
 257         rel_mplock();
 258         return (error);
 259 }
 260
 261 /*
 262  * MPSAFE
 263  */
 264 int
 265 kern_clock_getres(clockid_t clock_id, struct timespec *ts)
 266 {
 267         int error;
 268
 269         switch(clock_id) {
 270         case CLOCK_REALTIME:
 271         case CLOCK_REALTIME_FAST:
 272         case CLOCK_REALTIME_PRECISE:
 273         case CLOCK_MONOTONIC:
 274         case CLOCK_MONOTONIC_FAST:
 275         case CLOCK_MONOTONIC_PRECISE:
 276         case CLOCK_UPTIME:
 277         case CLOCK_UPTIME_FAST:
 278         case CLOCK_UPTIME_PRECISE:
 279                 /*
 280                  * Round up the result of the division cheaply
 281                  * by adding 1.  Rounding up is especially important
 282                  * if rounding down would give 0.  Perfect rounding
 283                  * is unimportant.
 284                  */
 285                 ts->tv_sec = 0;
 286                 ts->tv_nsec = 1000000000 / sys_cputimer->freq + 1;
 287                 error = 0;
 288                 break;
 289         case CLOCK_VIRTUAL:
 290         case CLOCK_PROF:
 291                 /* Accurately round up here because we can do so cheaply. */
 292                 ts->tv_sec = 0;
 293                 ts->tv_nsec = (1000000000 + hz - 1) / hz;
 294                 error = 0;
 295                 break;
 296         case CLOCK_SECOND:
 297                 ts->tv_sec = 1;
 298                 ts->tv_nsec = 0;
 299                 error = 0;
 300                 break;
 301         default:
 302                 error = EINVAL;
 303                 break;
 304         }
 305
 306         return(error);
 307 }
 308
 309 /*
 310  * MPSAFE
 311  */
 312 int
 313 sys_clock_getres(struct clock_getres_args *uap)
 314 {
 315         int error;
 316         struct timespec ts;
 317
 318         error = kern_clock_getres(uap->clock_id, &ts);
 319         if (error == 0)
 320                 error = copyout(&ts, uap->tp, sizeof(ts));
 321
 322         return (error);
 323 }
 324
/*
 * nanosleep1()
 *
 *      This is a general helper function for nanosleep() (aka sleep() aka
 *      usleep()).
 *
 *      If there is less than one tick's worth of time left and
 *      we haven't done a yield, or the remaining microseconds is
 *      ridiculously low, do a yield.  This avoids having
 *      to deal with systimer overheads when the system is under
 *      heavy loads.  If we have done a yield already then use
 *      a systimer and an uninterruptible thread wait.
 *
 *      If there is more than a tick's worth of time left,
 *      calculate the baseline ticks and use an interruptible
 *      tsleep, then handle the fine-grained delay on the next
 *      loop.  This usually results in two sleeps occurring, a long one
 *      and a short one.
 *
 * MPSAFE
 */
 346 static void
 347 ns1_systimer(systimer_t info, int in_ipi __unused,
 348     struct intrframe *frame __unused)
 349 {
 350         lwkt_schedule(info->data);
 351 }
 352
 353 int
 354 nanosleep1(struct timespec *rqt, struct timespec *rmt)
 355 {
 356         static int nanowait;
 357         struct timespec ts, ts2, ts3;
 358         struct timeval tv;
 359         int error;
 360
 361         if (rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000)
 362                 return (EINVAL);
 363         /* XXX: imho this should return EINVAL at least for tv_sec < 0 */
 364         if (rqt->tv_sec < 0 || (rqt->tv_sec == 0 && rqt->tv_nsec == 0))
 365                 return (0);
 366         nanouptime(&ts);
 367         timespecadd(&ts, rqt);          /* ts = target timestamp compare */
 368         TIMESPEC_TO_TIMEVAL(&tv, rqt);  /* tv = sleep interval */
 369
 370         for (;;) {
 371                 int ticks;
 372                 struct systimer info;
 373
 374                 ticks = tv.tv_usec / ustick;    /* approximate */
 375
 376                 if (tv.tv_sec == 0 && ticks == 0) {
 377                         thread_t td = curthread;
 378                         if (tv.tv_usec > 0 && tv.tv_usec < nanosleep_min_us)
 379                                 tv.tv_usec = nanosleep_min_us;
 380                         if (tv.tv_usec < nanosleep_hard_us) {
 381                                 lwkt_user_yield();
 382                                 cpu_pause();
 383                         } else {
 384                                 crit_enter_quick(td);
 385                                 systimer_init_oneshot(&info, ns1_systimer,
 386                                                 td, tv.tv_usec);
 387                                 lwkt_deschedule_self(td);
 388                                 crit_exit_quick(td);
 389                                 lwkt_switch();
 390                                 systimer_del(&info); /* make sure it's gone */
 391                         }
 392                         error = iscaught(td->td_lwp);
 393                 } else if (tv.tv_sec == 0) {
 394                         error = tsleep(&nanowait, PCATCH, "nanslp", ticks);
 395                 } else {
 396                         ticks = tvtohz_low(&tv); /* also handles overflow */
 397                         error = tsleep(&nanowait, PCATCH, "nanslp", ticks);
 398                 }
 399                 nanouptime(&ts2);
 400                 if (error && error != EWOULDBLOCK) {
 401                         if (error == ERESTART)
 402                                 error = EINTR;
 403                         if (rmt != NULL) {
 404                                 timespecsub(&ts, &ts2);
 405                                 if (ts.tv_sec < 0)
 406                                         timespecclear(&ts);
 407                                 *rmt = ts;
 408                         }
 409                         return (error);
 410                 }
 411                 if (timespeccmp(&ts2, &ts, >=))
 412                         return (0);
 413                 ts3 = ts;
 414                 timespecsub(&ts3, &ts2);
 415                 TIMESPEC_TO_TIMEVAL(&tv, &ts3);
 416         }
 417 }
 418
 419 /*
 420  * MPSAFE
 421  */
 422 int
 423 sys_nanosleep(struct nanosleep_args *uap)
 424 {
 425         int error;
 426         struct timespec rqt;
 427         struct timespec rmt;
 428
 429         error = copyin(uap->rqtp, &rqt, sizeof(rqt));
 430         if (error)
 431                 return (error);
 432
 433         error = nanosleep1(&rqt, &rmt);
 434
 435         /*
 436          * copyout the residual if nanosleep was interrupted.
 437          */
 438         if (error && uap->rmtp) {
 439                 int error2;
 440
 441                 error2 = copyout(&rmt, uap->rmtp, sizeof(rmt));
 442                 if (error2)
 443                         error = error2;
 444         }
 445         return (error);
 446 }
 447
 448 /*
 449  * The gettimeofday() system call is supposed to return a fine-grained
 450  * realtime stamp.  However, acquiring a fine-grained stamp can create a
 451  * bottleneck when multiple cpu cores are trying to accessing e.g. the
 452  * HPET hardware timer all at the same time, so we have a sysctl that
 453  * allows its behavior to be changed to a more coarse-grained timestamp
 454  * which does not have to access a hardware timer.
 455  */
 456 int
 457 sys_gettimeofday(struct gettimeofday_args *uap)
 458 {
 459         struct timeval atv;
 460         int error = 0;
 461
 462         if (uap->tp) {
 463                 if (gettimeofday_quick)
 464                         getmicrotime(&atv);
 465                 else
 466                         microtime(&atv);
 467                 if ((error = copyout((caddr_t)&atv, (caddr_t)uap->tp,
 468                     sizeof (atv))))
 469                         return (error);
 470         }
 471         if (uap->tzp)
 472                 error = copyout((caddr_t)&tz, (caddr_t)uap->tzp,
 473                     sizeof (tz));
 474         return (error);
 475 }
 476
 477 /*
 478  * MPALMOSTSAFE
 479  */
 480 int
 481 sys_settimeofday(struct settimeofday_args *uap)
 482 {
 483         struct thread *td = curthread;
 484         struct timeval atv;
 485         struct timezone atz;
 486         int error;
 487
 488         if ((error = priv_check(td, PRIV_SETTIMEOFDAY)))
 489                 return (error);
 490         /*
 491          * Verify all parameters before changing time.
 492          *
 493          * NOTE: We do not allow the time to be set to 0.0, which also by
 494          *       happy coincidence works around a pkgsrc bulk build bug.
 495          */
 496         if (uap->tv) {
 497                 if ((error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
 498                     sizeof(atv))))
 499                         return (error);
 500                 if (atv.tv_usec < 0 || atv.tv_usec >= 1000000)
 501                         return (EINVAL);
 502                 if (atv.tv_sec == 0 && atv.tv_usec == 0)
 503                         return (EINVAL);
 504         }
 505         if (uap->tzp &&
 506             (error = copyin((caddr_t)uap->tzp, (caddr_t)&atz, sizeof(atz))))
 507                 return (error);
 508
 509         get_mplock();
 510         if (uap->tv && (error = settime(&atv))) {
 511                 rel_mplock();
 512                 return (error);
 513         }
 514         rel_mplock();
 515         if (uap->tzp)
 516                 tz = atz;
 517         return (0);
 518 }
 519
 520 static void
 521 kern_adjtime_common(void)
 522 {
 523         if ((ntp_delta >= 0 && ntp_delta < ntp_default_tick_delta) ||
 524             (ntp_delta < 0 && ntp_delta > -ntp_default_tick_delta))
 525                 ntp_tick_delta = ntp_delta;
 526         else if (ntp_delta > ntp_big_delta)
 527                 ntp_tick_delta = 10 * ntp_default_tick_delta;
 528         else if (ntp_delta < -ntp_big_delta)
 529                 ntp_tick_delta = -10 * ntp_default_tick_delta;
 530         else if (ntp_delta > 0)
 531                 ntp_tick_delta = ntp_default_tick_delta;
 532         else
 533                 ntp_tick_delta = -ntp_default_tick_delta;
 534 }
 535
 536 void
 537 kern_adjtime(int64_t delta, int64_t *odelta)
 538 {
 539         int origcpu;
 540
 541         if ((origcpu = mycpu->gd_cpuid) != 0)
 542                 lwkt_setcpu_self(globaldata_find(0));
 543
 544         crit_enter();
 545         *odelta = ntp_delta;
 546         ntp_delta = delta;
 547         kern_adjtime_common();
 548         crit_exit();
 549
 550         if (origcpu != 0)
 551                 lwkt_setcpu_self(globaldata_find(origcpu));
 552 }
 553
 554 static void
 555 kern_get_ntp_delta(int64_t *delta)
 556 {
 557         int origcpu;
 558
 559         if ((origcpu = mycpu->gd_cpuid) != 0)
 560                 lwkt_setcpu_self(globaldata_find(0));
 561
 562         crit_enter();
 563         *delta = ntp_delta;
 564         crit_exit();
 565
 566         if (origcpu != 0)
 567                 lwkt_setcpu_self(globaldata_find(origcpu));
 568 }
 569
 570 void
 571 kern_reladjtime(int64_t delta)
 572 {
 573         int origcpu;
 574
 575         if ((origcpu = mycpu->gd_cpuid) != 0)
 576                 lwkt_setcpu_self(globaldata_find(0));
 577
 578         crit_enter();
 579         ntp_delta += delta;
 580         kern_adjtime_common();
 581         crit_exit();
 582
 583         if (origcpu != 0)
 584                 lwkt_setcpu_self(globaldata_find(origcpu));
 585 }
 586
 587 static void
 588 kern_adjfreq(int64_t rate)
 589 {
 590         int origcpu;
 591
 592         if ((origcpu = mycpu->gd_cpuid) != 0)
 593                 lwkt_setcpu_self(globaldata_find(0));
 594
 595         crit_enter();
 596         ntp_tick_permanent = rate;
 597         crit_exit();
 598
 599         if (origcpu != 0)
 600                 lwkt_setcpu_self(globaldata_find(origcpu));
 601 }
 602
 603 /*
 604  * MPALMOSTSAFE
 605  */
 606 int
 607 sys_adjtime(struct adjtime_args *uap)
 608 {
 609         struct thread *td = curthread;
 610         struct timeval atv;
 611         int64_t ndelta, odelta;
 612         int error;
 613
 614         if ((error = priv_check(td, PRIV_ADJTIME)))
 615                 return (error);
 616         error = copyin(uap->delta, &atv, sizeof(struct timeval));
 617         if (error)
 618                 return (error);
 619
 620         /*
 621          * Compute the total correction and the rate at which to apply it.
 622          * Round the adjustment down to a whole multiple of the per-tick
 623          * delta, so that after some number of incremental changes in
 624          * hardclock(), tickdelta will become zero, lest the correction
 625          * overshoot and start taking us away from the desired final time.
 626          */
 627         ndelta = (int64_t)atv.tv_sec * 1000000000 + atv.tv_usec * 1000;
 628         get_mplock();
 629         kern_adjtime(ndelta, &odelta);
 630         rel_mplock();
 631
 632         if (uap->olddelta) {
 633                 atv.tv_sec = odelta / 1000000000;
 634                 atv.tv_usec = odelta % 1000000000 / 1000;
 635                 copyout(&atv, uap->olddelta, sizeof(struct timeval));
 636         }
 637         return (0);
 638 }
 639
 640 static int
 641 sysctl_adjtime(SYSCTL_HANDLER_ARGS)
 642 {
 643         int64_t delta;
 644         int error;
 645
 646         if (req->newptr != NULL) {
 647                 if (priv_check(curthread, PRIV_ROOT))
 648                         return (EPERM);
 649                 error = SYSCTL_IN(req, &delta, sizeof(delta));
 650                 if (error)
 651                         return (error);
 652                 kern_reladjtime(delta);
 653         }
 654
 655         if (req->oldptr)
 656                 kern_get_ntp_delta(&delta);
 657         error = SYSCTL_OUT(req, &delta, sizeof(delta));
 658         return (error);
 659 }
 660
 661 /*
 662  * delta is in nanoseconds.
 663  */
 664 static int
 665 sysctl_delta(SYSCTL_HANDLER_ARGS)
 666 {
 667         int64_t delta, old_delta;
 668         int error;
 669
 670         if (req->newptr != NULL) {
 671                 if (priv_check(curthread, PRIV_ROOT))
 672                         return (EPERM);
 673                 error = SYSCTL_IN(req, &delta, sizeof(delta));
 674                 if (error)
 675                         return (error);
 676                 kern_adjtime(delta, &old_delta);
 677         }
 678
 679         if (req->oldptr != NULL)
 680                 kern_get_ntp_delta(&old_delta);
 681         error = SYSCTL_OUT(req, &old_delta, sizeof(old_delta));
 682         return (error);
 683 }
 684
 685 /*
 686  * frequency is in nanoseconds per second shifted left 32.
 687  * kern_adjfreq() needs it in nanoseconds per tick shifted left 32.
 688  */
 689 static int
 690 sysctl_adjfreq(SYSCTL_HANDLER_ARGS)
 691 {
 692         int64_t freqdelta;
 693         int error;
 694
 695         if (req->newptr != NULL) {
 696                 if (priv_check(curthread, PRIV_ROOT))
 697                         return (EPERM);
 698                 error = SYSCTL_IN(req, &freqdelta, sizeof(freqdelta));
 699                 if (error)
 700                         return (error);
 701
 702                 freqdelta /= hz;
 703                 kern_adjfreq(freqdelta);
 704         }
 705
 706         if (req->oldptr != NULL)
 707                 freqdelta = ntp_tick_permanent * hz;
 708         error = SYSCTL_OUT(req, &freqdelta, sizeof(freqdelta));
 709         if (error)
 710                 return (error);
 711
 712         return (0);
 713 }
 714
 715 SYSCTL_NODE(_kern, OID_AUTO, ntp, CTLFLAG_RW, 0, "NTP related controls");
 716 SYSCTL_PROC(_kern_ntp, OID_AUTO, permanent,
 717     CTLTYPE_QUAD|CTLFLAG_RW, 0, 0,
 718     sysctl_adjfreq, "Q", "permanent correction per second");
 719 SYSCTL_PROC(_kern_ntp, OID_AUTO, delta,
 720     CTLTYPE_QUAD|CTLFLAG_RW, 0, 0,
 721     sysctl_delta, "Q", "one-time delta");
 722 SYSCTL_OPAQUE(_kern_ntp, OID_AUTO, big_delta, CTLFLAG_RD,
 723     &ntp_big_delta, sizeof(ntp_big_delta), "Q",
 724     "threshold for fast adjustment");
 725 SYSCTL_OPAQUE(_kern_ntp, OID_AUTO, tick_delta, CTLFLAG_RD,
 726     &ntp_tick_delta, sizeof(ntp_tick_delta), "LU",
 727     "per-tick adjustment");
 728 SYSCTL_OPAQUE(_kern_ntp, OID_AUTO, default_tick_delta, CTLFLAG_RD,
 729     &ntp_default_tick_delta, sizeof(ntp_default_tick_delta), "LU",
 730     "default per-tick adjustment");
 731 SYSCTL_OPAQUE(_kern_ntp, OID_AUTO, next_leap_second, CTLFLAG_RW,
 732     &ntp_leap_second, sizeof(ntp_leap_second), "LU",
 733     "next leap second");
 734 SYSCTL_INT(_kern_ntp, OID_AUTO, insert_leap_second, CTLFLAG_RW,
 735     &ntp_leap_insert, 0, "insert or remove leap second");
 736 SYSCTL_PROC(_kern_ntp, OID_AUTO, adjust,
 737     CTLTYPE_QUAD|CTLFLAG_RW, 0, 0,
 738     sysctl_adjtime, "Q", "relative adjust for delta");
 739
 740 /*
 741  * Get value of an interval timer.  The process virtual and
 742  * profiling virtual time timers are kept in the p_stats area, since
 743  * they can be swapped out.  These are kept internally in the
 744  * way they are specified externally: in time until they expire.
 745  *
 746  * The real time interval timer is kept in the process table slot
 747  * for the process, and its value (it_value) is kept as an
 748  * absolute time rather than as a delta, so that it is easy to keep
 749  * periodic real-time signals from drifting.
 750  *
 751  * Virtual time timers are processed in the hardclock() routine of
 752  * kern_clock.c.  The real time timer is processed by a timeout
 753  * routine, called from the softclock() routine.  Since a callout
 754  * may be delayed in real time due to interrupt processing in the system,
 755  * it is possible for the real time timeout routine (realitexpire, given below),
 756  * to be delayed in real time past when it is supposed to occur.  It
 757  * does not suffice, therefore, to reload the real timer .it_value from the
 758  * real time timers .it_interval.  Rather, we compute the next time in
 759  * absolute time the timer should go off.
 760  *
 761  * MPALMOSTSAFE
 762  */
 763 int
 764 sys_getitimer(struct getitimer_args *uap)
 765 {
 766         struct proc *p = curproc;
 767         struct timeval ctv;
 768         struct itimerval aitv;
 769
 770         if (uap->which > ITIMER_PROF)
 771                 return (EINVAL);
 772         lwkt_gettoken(&p->p_token);
 773         if (uap->which == ITIMER_REAL) {
 774                 /*
 775                  * Convert from absolute to relative time in .it_value
 776                  * part of real time timer.  If time for real time timer
 777                  * has passed return 0, else return difference between
 778                  * current time and time for the timer to go off.
 779                  */
 780                 aitv = p->p_realtimer;
 781                 if (timevalisset(&aitv.it_value)) {
 782                         getmicrouptime(&ctv);
 783                         if (timevalcmp(&aitv.it_value, &ctv, <))
 784                                 timevalclear(&aitv.it_value);
 785                         else
 786                                 timevalsub(&aitv.it_value, &ctv);
 787                 }
 788         } else {
 789                 aitv = p->p_timer[uap->which];
 790         }
 791         lwkt_reltoken(&p->p_token);
 792         return (copyout(&aitv, uap->itv, sizeof (struct itimerval)));
 793 }
 794
 795 /*
 796  * MPALMOSTSAFE
 797  */
 798 int
 799 sys_setitimer(struct setitimer_args *uap)
 800 {
 801         struct itimerval aitv;
 802         struct timeval ctv;
 803         struct itimerval *itvp;
 804         struct proc *p = curproc;
 805         int error;
 806
 807         if (uap->which > ITIMER_PROF)
 808                 return (EINVAL);
 809         itvp = uap->itv;
 810         if (itvp && (error = copyin((caddr_t)itvp, (caddr_t)&aitv,
 811             sizeof(struct itimerval))))
 812                 return (error);
 813         if ((uap->itv = uap->oitv) &&
 814             (error = sys_getitimer((struct getitimer_args *)uap)))
 815                 return (error);
 816         if (itvp == NULL)
 817                 return (0);
 818         if (itimerfix(&aitv.it_value))
 819                 return (EINVAL);
 820         if (!timevalisset(&aitv.it_value))
 821                 timevalclear(&aitv.it_interval);
 822         else if (itimerfix(&aitv.it_interval))
 823                 return (EINVAL);
 824         lwkt_gettoken(&p->p_token);
 825         if (uap->which == ITIMER_REAL) {
 826                 if (timevalisset(&p->p_realtimer.it_value))
 827                         callout_stop_sync(&p->p_ithandle);
 828                 if (timevalisset(&aitv.it_value))
 829                         callout_reset(&p->p_ithandle,
 830                             tvtohz_high(&aitv.it_value), realitexpire, p);
 831                 getmicrouptime(&ctv);
 832                 timevaladd(&aitv.it_value, &ctv);
 833                 p->p_realtimer = aitv;
 834         } else {
 835                 p->p_timer[uap->which] = aitv;
 836                 switch(uap->which) {
 837                 case ITIMER_VIRTUAL:
 838                         p->p_flags &= ~P_SIGVTALRM;
 839                         break;
 840                 case ITIMER_PROF:
 841                         p->p_flags &= ~P_SIGPROF;
 842                         break;
 843                 }
 844         }
 845         lwkt_reltoken(&p->p_token);
 846         return (0);
 847 }
 848
 849 /*
 850  * Real interval timer expired:
 851  * send process whose timer expired an alarm signal.
 852  * If time is not set up to reload, then just return.
 853  * Else compute next time timer should go off which is > current time.
 854  * This is where delay in processing this timeout causes multiple
 855  * SIGALRM calls to be compressed into one.
 856  * tvtohz_high() always adds 1 to allow for the time until the next clock
 857  * interrupt being strictly less than 1 clock tick, but we don't want
 858  * that here since we want to appear to be in sync with the clock
 859  * interrupt even when we're delayed.
 860  */
 861 void
 862 realitexpire(void *arg)
 863 {
 864         struct proc *p;
 865         struct timeval ctv, ntv;
 866
 867         p = (struct proc *)arg;
 868         PHOLD(p);
 869         lwkt_gettoken(&p->p_token);
 870         ksignal(p, SIGALRM);
 871         if (!timevalisset(&p->p_realtimer.it_interval)) {
 872                 timevalclear(&p->p_realtimer.it_value);
 873                 goto done;
 874         }
 875         for (;;) {
 876                 timevaladd(&p->p_realtimer.it_value,
 877                            &p->p_realtimer.it_interval);
 878                 getmicrouptime(&ctv);
 879                 if (timevalcmp(&p->p_realtimer.it_value, &ctv, >)) {
 880                         ntv = p->p_realtimer.it_value;
 881                         timevalsub(&ntv, &ctv);
 882                         callout_reset(&p->p_ithandle, tvtohz_low(&ntv),
 883                                       realitexpire, p);
 884                         goto done;
 885                 }
 886         }
 887 done:
 888         lwkt_reltoken(&p->p_token);
 889         PRELE(p);
 890 }
 891
 892 /*
 893  * Check that a proposed value to load into the .it_value or
 894  * .it_interval part of an interval timer is acceptable, and
 895  * fix it to have at least minimal value (i.e. if it is less
 896  * than the resolution of the clock, round it up.)
 897  *
 898  * MPSAFE
 899  */
 900 int
 901 itimerfix(struct timeval *tv)
 902 {
 903
 904         if (tv->tv_sec < 0 || tv->tv_sec > 100000000 ||
 905             tv->tv_usec < 0 || tv->tv_usec >= 1000000)
 906                 return (EINVAL);
 907         if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < ustick)
 908                 tv->tv_usec = ustick;
 909         return (0);
 910 }
 911
 912 /*
 913  * Decrement an interval timer by a specified number
 914  * of microseconds, which must be less than a second,
 915  * i.e. < 1000000.  If the timer expires, then reload
 916  * it.  In this case, carry over (usec - old value) to
 917  * reduce the value reloaded into the timer so that
 918  * the timer does not drift.  This routine assumes
 919  * that it is called in a context where the timers
 920  * on which it is operating cannot change in value.
 921  */
 922 int
 923 itimerdecr(struct itimerval *itp, int usec)
 924 {
 925
 926         if (itp->it_value.tv_usec < usec) {
 927                 if (itp->it_value.tv_sec == 0) {
 928                         /* expired, and already in next interval */
 929                         usec -= itp->it_value.tv_usec;
 930                         goto expire;
 931                 }
 932                 itp->it_value.tv_usec += 1000000;
 933                 itp->it_value.tv_sec--;
 934         }
 935         itp->it_value.tv_usec -= usec;
 936         usec = 0;
 937         if (timevalisset(&itp->it_value))
 938                 return (1);
 939         /* expired, exactly at end of interval */
 940 expire:
 941         if (timevalisset(&itp->it_interval)) {
 942                 itp->it_value = itp->it_interval;
 943                 itp->it_value.tv_usec -= usec;
 944                 if (itp->it_value.tv_usec < 0) {
 945                         itp->it_value.tv_usec += 1000000;
 946                         itp->it_value.tv_sec--;
 947                 }
 948         } else
 949                 itp->it_value.tv_usec = 0;              /* sec is already 0 */
 950         return (0);
 951 }
 952
 953 /*
 954  * Add and subtract routines for timevals.
 955  * N.B.: subtract routine doesn't deal with
 956  * results which are before the beginning,
 957  * it just gets very confused in this case.
 958  * Caveat emptor.
 959  */
 960 void
 961 timevaladd(struct timeval *t1, const struct timeval *t2)
 962 {
 963
 964         t1->tv_sec += t2->tv_sec;
 965         t1->tv_usec += t2->tv_usec;
 966         timevalfix(t1);
 967 }
 968
 969 void
 970 timevalsub(struct timeval *t1, const struct timeval *t2)
 971 {
 972
 973         t1->tv_sec -= t2->tv_sec;
 974         t1->tv_usec -= t2->tv_usec;
 975         timevalfix(t1);
 976 }
 977
 978 static void
 979 timevalfix(struct timeval *t1)
 980 {
 981
 982         if (t1->tv_usec < 0) {
 983                 t1->tv_sec--;
 984                 t1->tv_usec += 1000000;
 985         }
 986         if (t1->tv_usec >= 1000000) {
 987                 t1->tv_sec++;
 988                 t1->tv_usec -= 1000000;
 989         }
 990 }
 991
 992 /*
 993  * ratecheck(): simple time-based rate-limit checking.
 994  */
 995 int
 996 ratecheck(struct timeval *lasttime, const struct timeval *mininterval)
 997 {
 998         struct timeval tv, delta;
 999         int rv = 0;
1000
1001         getmicrouptime(&tv);            /* NB: 10ms precision */
1002         delta = tv;
1003         timevalsub(&delta, lasttime);
1004
1005         /*
1006          * check for 0,0 is so that the message will be seen at least once,
1007          * even if interval is huge.
1008          */
1009         if (timevalcmp(&delta, mininterval, >=) ||
1010             (lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) {
1011                 *lasttime = tv;
1012                 rv = 1;
1013         }
1014
1015         return (rv);
1016 }
1017
1018 /*
1019  * ppsratecheck(): packets (or events) per second limitation.
1020  *
1021  * Return 0 if the limit is to be enforced (e.g. the caller
1022  * should drop a packet because of the rate limitation).
1023  *
1024  * maxpps of 0 always causes zero to be returned.  maxpps of -1
1025  * always causes 1 to be returned; this effectively defeats rate
1026  * limiting.
1027  *
1028  * Note that we maintain the struct timeval for compatibility
1029  * with other bsd systems.  We reuse the storage and just monitor
1030  * clock ticks for minimal overhead.
1031  */
1032 int
1033 ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps)
1034 {
1035         int now;
1036
1037         /*
1038          * Reset the last time and counter if this is the first call
1039          * or more than a second has passed since the last update of
1040          * lasttime.
1041          */
1042         now = ticks;
1043         if (lasttime->tv_sec == 0 || (u_int)(now - lasttime->tv_sec) >= hz) {
1044                 lasttime->tv_sec = now;
1045                 *curpps = 1;
1046                 return (maxpps != 0);
1047         } else {
1048                 (*curpps)++;            /* NB: ignore potential overflow */
1049                 return (maxpps < 0 || *curpps < maxpps);
1050         }
1051 }
1052