/*
 * Copyright (c) 2004,2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Copyright (c) 1982, 1986, 1991, 1993
 *      The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * The original callout mechanism was based on the work of Adam M. Costello
 * and George Varghese, published in a technical report entitled "Redesigning
 * the BSD Callout and Timer Facilities" and modified slightly for inclusion
 * in FreeBSD by Justin T. Gibbs.  The original work on the data structures
 * used in this implementation was published by G. Varghese and T. Lauck in
 * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for
 * the Efficient Implementation of a Timer Facility" in the Proceedings of
 * the 11th ACM Annual Symposium on Operating Systems Principles,
 * Austin, Texas, November 1987.
 *
 * The per-cpu augmentation was done by Matthew Dillon, who also rewrote
 * this file essentially from scratch.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/interrupt.h>
#include <sys/thread.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>

#include <vm/vm_extern.h>
struct softclock_pcpu {
        struct callout_tailq *callwheel;
        struct callout * volatile next;
        intptr_t running;       /* NOTE! Bit 0 used to flag wakeup */
        int softticks;          /* softticks index */
        int curticks;           /* per-cpu ticks counter */
        struct thread thread;
};

typedef struct softclock_pcpu *softclock_pcpu_t;
static MALLOC_DEFINE(M_CALLOUT, "callout", "callout structures");

static int cwheelsize;
static int cwheelmask;
static softclock_pcpu_t softclock_pcpu_ary[MAXCPU];

static void softclock_handler(void *arg);
static void slotimer_callback(void *arg);
static void callout_reset_ipi(void *arg);
static void callout_stop_ipi(void *arg, int issync, struct intrframe *frame);
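/*
 * Atomically set the bits in sflags and clear the bits in cflags in
 * c->c_flags.  Judging from the callers below (see the comment in
 * softclock_handler() about the value returned prior to clearing),
 * this helper returns the flags as they were before the update.
 */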
static __inline int
callout_setclear(struct callout *c, int sflags, int cflags)
{
        nflags = (flags | sflags) & ~cflags;
        if (atomic_cmpset_int(&c->c_flags, flags, nflags))
static void
swi_softclock_setup(void *arg)
{
        /*
         * Figure out how large a callwheel we need.  It must be a power of 2.
         *
         * ncallout is primarily based on available memory; don't explode
         * the allocations if the system has a lot of cpus.
         */
        target = ncallout / ncpus + 16;

        while (cwheelsize < target)
                cwheelsize <<= 1;
        cwheelmask = cwheelsize - 1;
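        /*
         * For example (illustrative numbers only): with ncallout = 16384
         * and ncpus = 8 the target is 16384 / 8 + 16 = 2064, which rounds
         * up to a cwheelsize of 4096 buckets per cpu and a cwheelmask of
         * 0x0FFF.
         */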
        /*
         * Initialize per-cpu data structures.
         */
        for (cpu = 0; cpu < ncpus; ++cpu) {
                sc = (void *)kmem_alloc3(&kernel_map, sizeof(*sc),
                                         VM_SUBSYS_GD, KM_CPU(cpu));
                memset(sc, 0, sizeof(*sc));
                softclock_pcpu_ary[cpu] = sc;

                wheel_sz = sizeof(*sc->callwheel) * cwheelsize;
                sc->callwheel = (void *)kmem_alloc3(&kernel_map, wheel_sz,
                                                    VM_SUBSYS_GD, KM_CPU(cpu));
                memset(sc->callwheel, 0, wheel_sz);
                for (i = 0; i < cwheelsize; ++i)
                        TAILQ_INIT(&sc->callwheel[i]);

                /*
                 * Mark the softclock handler as being an interrupt thread
                 * even though it really isn't, but do not allow it to
                 * preempt other threads (do not assign td_preemptable).
                 *
                 * Kernel code now assumes that callouts do not preempt
                 * the cpu they were scheduled on.
                 */
                lwkt_create(softclock_handler, sc, NULL, &sc->thread,
                            TDF_NOSTART | TDF_INTTHREAD,
                            cpu, "softclock %d", cpu);
        }
}

/*
 * Must occur after ncpus has been initialized.
 */
SYSINIT(softclock_setup, SI_BOOT2_SOFTCLOCK, SI_ORDER_SECOND,
        swi_softclock_setup, NULL);
/*
 * This routine is called from the hardclock() (basically a FASTint/IPI) on
 * each cpu in the system.  sc->curticks is this cpu's notion of the timebase.
 * It IS NOT NECESSARILY SYNCHRONIZED WITH 'ticks'!  sc->softticks is where
 * the callwheel is currently indexed.
 *
 * WARNING!  The MP lock is not necessarily held on call, nor can it be
 * safely obtained.
 *
 * sc->softticks is adjusted by either this routine or our helper thread
 * depending on whether the helper thread is running or not.
 */
void
hardclock_softtick(globaldata_t gd)
{
        sc = softclock_pcpu_ary[gd->gd_cpuid];

        if (sc->softticks == sc->curticks) {
                /*
                 * In sync, only wakeup the thread if there is something to
                 * do.
                 */
                if (TAILQ_FIRST(&sc->callwheel[sc->softticks & cwheelmask])) {
                        lwkt_schedule(&sc->thread);
                }
        } else {
                /*
                 * Out of sync, wakeup the thread unconditionally so it can
                 * catch up.
                 */
                lwkt_schedule(&sc->thread);
        }
}
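/*
 * Rough illustration of the wheel indexing (numbers assume the 4096-bucket
 * example above): a callout due at c_time = curticks + 100 lives in bucket
 * (curticks + 100) & cwheelmask.  Each hardclock presumably advances this
 * cpu's curticks by one, and the helper thread walks softticks forward one
 * bucket at a time until it catches up with curticks.
 */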
/*
 * This procedure is the main loop of our per-cpu helper thread.  The
 * sc->isrunning flag prevents us from racing hardclock_softtick() and
 * a critical section is sufficient to interlock sc->curticks and protect
 * us from remote IPI's / list removal.
 *
 * The thread starts with the MP lock released and not in a critical
 * section.  The loop itself is MP safe while individual callbacks
 * may or may not be, so we obtain or release the MP lock as appropriate.
 */
static void
softclock_handler(void *arg)
{
        struct callout_tailq *bucket;
        struct callout slotimer;

        /*
         * Setup pcpu slow clocks which we want to run from the callout
         * thread.
         */
        callout_init_mp(&slotimer);
        callout_reset(&slotimer, hz * 10, slotimer_callback, &slotimer);

        /*
         * Run the callout thread at the same priority as other kernel
         * threads so it can be round-robined.
         */
        /*lwkt_setpri_self(TDPRI_SOFT_NORM);*/
        /*
         * Loop critical section against ipi operations to this cpu.
         */
        while (sc->softticks != (int)(sc->curticks + 1)) {
                bucket = &sc->callwheel[sc->softticks & cwheelmask];

                for (c = TAILQ_FIRST(bucket); c; c = sc->next) {
                        void (*c_func)(void *);

                        if (c->c_time != sc->softticks) {
                                sc->next = TAILQ_NEXT(c, c_links.tqe);

                        /*
                         * Synchronize with mpsafe requirements
                         */
                        if (flags & CALLOUT_MPSAFE) {

                        /*
                         * The request might be removed while we
                         * are waiting to get the MP lock.  If it
                         * was removed sc->next will point to the
                         * next valid request or NULL, loop up.
                         */

                        /*
                         * Queue protection only exists while we hold the
                         * critical section uninterrupted.
                         *
                         * Adjust sc->next when removing (c) from the queue,
                         * note that an IPI on this cpu may make further
                         * adjustments to sc->next.
                         */
                        sc->next = TAILQ_NEXT(c, c_links.tqe);
                        TAILQ_REMOVE(bucket, c, c_links.tqe);

                        KASSERT((c->c_flags & CALLOUT_DID_INIT) &&
                                (c->c_flags & CALLOUT_PENDING) &&
                                CALLOUT_FLAGS_TO_CPU(c->c_flags) ==
                                mycpu->gd_cpuid,
                                ("callout %p: bad flags %08x", c, c->c_flags));

                        /*
                         * Once CALLOUT_PENDING is cleared only the IPI_MASK
                         * prevents the callout from being moved to another
                         * cpu.  However, callout_stop() will also check
                         * sc->running on the assigned cpu if CALLOUT_EXECUTED
                         * is set.  CALLOUT_EXECUTED implies a callback
                         * interlock is needed when cross-cpu.
                         */
                        sc->running = (intptr_t)c;

                        if ((flags & (CALLOUT_AUTOLOCK | CALLOUT_ACTIVE)) ==
                            (CALLOUT_AUTOLOCK | CALLOUT_ACTIVE)) {
                                error = lockmgr(c_lk, LK_EXCLUSIVE |
                                flags = callout_setclear(c,
                                lockmgr(c_lk, LK_RELEASE);
                                flags = callout_setclear(c,
                        } else if (flags & CALLOUT_ACTIVE) {
                                flags = callout_setclear(c,
                                flags = callout_setclear(c,

                        /*
                         * Read and clear sc->running.  If bit 0 was set,
                         * a callout_stop() is likely blocked waiting for
                         * the callback to complete.
                         *
                         * The setclear above also cleared CALLOUT_WAITING
                         * and returns the contents of flags prior to clearing
                         * any bits.
                         *
                         * Interlock wakeup any _stop's waiting on us.  Note
                         * that once c_func() was called, the callout
                         * structure (c) pointer may no longer be valid.  It
                         * can only be used for the wakeup.
                         */
                        if ((atomic_readandclear_ptr(&sc->running) & 1) ||
                            (flags & CALLOUT_WAITING)) {
                                wakeup(c);
                        }
                        /* NOTE: list may have changed */

        /*
         * Don't leave us holding the MP lock when we deschedule ourselves.
         */
        lwkt_deschedule_self(&sc->thread);      /* == curthread */
/*
 * A very slow system cleanup timer (10 second interval).
 */
static void
slotimer_callback(void *arg)
{
        struct callout *c = arg;

        callout_reset(c, hz * 10, slotimer_callback, c);
}
/*
 * Start or restart a timeout.  Installs the callout structure on the
 * callwheel of the current cpu.  Callers may legally pass any value, even
 * if 0 or negative, but since the sc->curticks index may have already
 * been processed, a minimum timeout of 1 tick will be enforced.
 *
 * This function will block if the callout is currently queued to a different
 * cpu or the callback is currently running in another thread.
 */
void
callout_reset(struct callout *c, int to_ticks, void (*ftn)(void *), void *arg)
{
        if ((c->c_flags & CALLOUT_DID_INIT) == 0) {
                kprintf(
                    "callout_reset(%p) from %p: callout was not initialized\n",
                    c, ((int **)&c)[-1]);
        }

        sc = softclock_pcpu_ary[gd->gd_cpuid];

        /*
         * Our cpu must gain ownership of the callout and cancel anything
         * still running, which is complex.  The easiest way to do it is to
         * issue a callout_stop_sync().  callout_stop_sync() will also
         * handle CALLOUT_EXECUTED (dispatch waiting), and clear it.
         *
         * WARNING: callout_stop_sync()'s return state can race other
         * callout_*() calls due to blocking, so we must re-check.
         */
        if (c->c_flags & (CALLOUT_ARMED_MASK | CALLOUT_EXECUTED))
                callout_stop_sync(c);
        flags = c->c_flags & ~(CALLOUT_ARMED_MASK | CALLOUT_EXECUTED);
        nflags = (flags & ~CALLOUT_CPU_MASK) |
                 CALLOUT_CPU_TO_FLAGS(gd->gd_cpuid) |
        if (atomic_cmpset_int(&c->c_flags, flags, nflags))

        /*
         * With the critical section held and PENDING set we now 'own' the
         * callout.
         */
        c->c_time = sc->curticks + to_ticks;

        TAILQ_INSERT_TAIL(&sc->callwheel[c->c_time & cwheelmask],
                          c, c_links.tqe);
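/*
 * Typical usage sketch.  The names mydev_softc, mydev_timeout and sc are
 * hypothetical and only illustrate the calling pattern; hz ticks equal
 * one second.
 *
 *	struct mydev_softc {
 *		struct callout	timer;
 *	};
 *
 *	static void
 *	mydev_timeout(void *arg)
 *	{
 *		struct mydev_softc *sc = arg;
 *
 *		// periodic work here, then re-arm for another second
 *		callout_reset(&sc->timer, hz, mydev_timeout, sc);
 *	}
 *
 *	// once, at attach time:
 *	callout_init_mp(&sc->timer);
 *	callout_reset(&sc->timer, hz, mydev_timeout, sc);
 */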
/*
 * Set up a callout to run on the specified cpu.  Should generally be used
 * to run a callout on a specific cpu which does not nominally change.  This
 * callout_reset() will be issued asynchronously via an IPI.
 */
void
callout_reset_bycpu(struct callout *c, int to_ticks, void (*ftn)(void *),
                    void *arg, int cpuid)
{
        if ((c->c_flags & CALLOUT_DID_INIT) == 0) {
                kprintf(
                    "callout_reset(%p) from %p: callout was not initialized\n",
                    c, ((int **)&c)[-1]);
        }

        tgd = globaldata_find(cpuid);

        /*
         * This code is similar to the code in callout_reset() but we assign
         * the callout to the target cpu.  We cannot set PENDING here since
         * we cannot atomically add the callout to the target cpu's queue.
         * However, incrementing the IPI count has the effect of locking
         * the cpu assignment.
         *
         * WARNING: callout_stop_sync()'s return state can race other
         * callout_*() calls due to blocking, so we must re-check.
         */
        if (c->c_flags & (CALLOUT_ARMED_MASK | CALLOUT_EXECUTED))
                callout_stop_sync(c);
        flags = c->c_flags & ~(CALLOUT_ARMED_MASK | CALLOUT_EXECUTED);
        nflags = (flags & ~(CALLOUT_CPU_MASK |
                 CALLOUT_CPU_TO_FLAGS(tgd->gd_cpuid) |
        nflags = nflags + 1;            /* bump IPI count */
        if (atomic_cmpset_int(&c->c_flags, flags, nflags))

        /*
         * Since we control our +1 in the IPI count, the target cpu cannot
         * now change until our IPI is processed.
         */
        c->c_load = to_ticks;   /* IPI will add curticks */

        lwkt_send_ipiq(tgd, callout_reset_ipi, c);
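/*
 * Usage sketch (hypothetical names): arm a callout on cpu 0 regardless of
 * which cpu issues the call.  The actual queueing is performed on the
 * target cpu by the IPI handler below.
 *
 *	callout_init_mp(&sc->timer);
 *	callout_reset_bycpu(&sc->timer, hz * 5, mydev_timeout, sc, 0);
 */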
/*
 * Remote IPI for callout_reset_bycpu().  The cpu assignment cannot be
 * ripped out from under us due to the count in IPI_MASK, but it is possible
 * that other IPIs have executed, so we must deal with other flags that might
 * have been set or cleared.
 */
static void
callout_reset_ipi(void *arg)
{
        struct callout *c = arg;
        globaldata_t gd = mycpu;

        sc = softclock_pcpu_ary[gd->gd_cpuid];

        KKASSERT((flags & CALLOUT_IPI_MASK) > 0 &&
                 CALLOUT_FLAGS_TO_CPU(flags) == gd->gd_cpuid);

        nflags = (flags - 1) & ~(CALLOUT_EXECUTED | CALLOUT_WAITING);
        nflags |= CALLOUT_PENDING;

        /*
         * Put us on the queue
         */
        if (atomic_cmpset_int(&c->c_flags, flags, nflags)) {
                if (flags & CALLOUT_PENDING) {
                        sc->next = TAILQ_NEXT(c, c_links.tqe);
                        TAILQ_REMOVE(
                            &sc->callwheel[c->c_time & cwheelmask],
                            c, c_links.tqe);
                }
                c->c_time = sc->curticks + c->c_load;
                TAILQ_INSERT_TAIL(
                    &sc->callwheel[c->c_time & cwheelmask],
                    c, c_links.tqe);
        }

        /*
         * Issue wakeup if requested.
         */
        if (flags & CALLOUT_WAITING)
                wakeup(c);
/*
 * Stop a running timer and ensure that any running callout completes before
 * returning.  If the timer is running on another cpu this function may block
 * to interlock against the callout.  If the callout is currently executing
 * or blocked in another thread this function may also block to interlock
 * against the callout.
 *
 * The caller must be careful to avoid deadlocks, either by using
 * callout_init_lk() (which uses the lockmgr lock cancelation feature),
 * by using tokens and dealing with breaks in the serialization, or by using
 * the lockmgr lock cancelation feature yourself in the callout callback
 * function.
 *
 * callout_stop() returns non-zero if the callout was pending.
 */
static int
_callout_stop(struct callout *c, int issync)
{
        globaldata_t gd = mycpu;

        if ((c->c_flags & CALLOUT_DID_INIT) == 0) {
                kprintf(
                    "callout_stop(%p) from %p: callout was not initialized\n",
                    c, ((int **)&c)[-1]);
        }

        /*
         * Adjust flags for the required operation.  If the callout is
         * armed on another cpu we break out into the remote-cpu code which
         * will issue an IPI.  If it is not armed we are trivially done,
         * but may still need to test EXECUTED.
         */
        cpuid = CALLOUT_FLAGS_TO_CPU(flags);

        /*
         * Armed on remote cpu (break to remote-cpu code)
         */
        if ((flags & CALLOUT_ARMED_MASK) && gd->gd_cpuid != cpuid) {
                if (atomic_cmpset_int(&c->c_flags, flags, nflags)) {
                        /*
                         * BREAK TO REMOTE-CPU CODE HERE
                         */

        /*
         * Armed or armable on current cpu
         */
        if (flags & CALLOUT_IPI_MASK) {
                continue;       /* retry */

        /*
         * If PENDING is set we can remove the callout from our
         * queue and also use the side effect that the bit causes
         * the callout to be locked to our cpu.
         */
        if (flags & CALLOUT_PENDING) {
                sc = softclock_pcpu_ary[gd->gd_cpuid];
                sc->next = TAILQ_NEXT(c, c_links.tqe);
                TAILQ_REMOVE(
                    &sc->callwheel[c->c_time & cwheelmask],
                    c, c_links.tqe);

                nflags = flags & ~(CALLOUT_ACTIVE |
                if (atomic_cmpset_int(&c->c_flags,

        /*
         * If PENDING was not set the callout might not be locked
         * to our cpu.
         */
        nflags = flags & ~(CALLOUT_ACTIVE |
        if (atomic_cmpset_int(&c->c_flags, flags, nflags)) {
        /*
         * Remote cpu path.  We incremented the IPI_MASK count so the callout
         * is now locked to the remote cpu and we can safely send an IPI
         * to it.
         *
         * Once sent, wait for all IPIs to be processed.  If PENDING remains
         * set after all IPIs have processed we raced a callout or
         * callout_reset() and must retry.  Callers expect the callout to
         * be completely stopped upon return, so make sure it is.
         */
        tgd = globaldata_find(cpuid);
        lwkt_send_ipiq3(tgd, callout_stop_ipi, c, issync);

        if ((flags & CALLOUT_IPI_MASK) == 0)

        nflags = flags | CALLOUT_WAITING;
        tsleep_interlock(c, 0);
        if (atomic_cmpset_int(&c->c_flags, flags, nflags)) {
                tsleep(c, PINTERLOCKED, "cstp1", 0);
        }

        if (flags & CALLOUT_PENDING)

        /*
         * Caller expects callout_stop_sync() to clear EXECUTED and return
         * its previous status.
         */
        atomic_clear_int(&c->c_flags, CALLOUT_EXECUTED);

        if (flags & CALLOUT_WAITING)
                wakeup(c);

        /*
         * If (issync) we must also wait for any in-progress callbacks to
         * complete, unless the stop is being executed from the callback
         * itself.  The EXECUTED flag is set prior to the callback
         * being made so our existing flags status already has it.
         *
         * If auto-lock mode is being used, this is where we cancel any
         * blocked lock that is potentially preventing the target cpu
         * from completing the callback.
         */
        sc = softclock_pcpu_ary[cpuid];
        if (gd->gd_curthread == &sc->thread)    /* stop from cb */

        if ((runco & ~(intptr_t)1) != (intptr_t)c)

        if (c->c_flags & CALLOUT_AUTOLOCK)
                lockmgr(c->c_lk, LK_CANCEL_BEG);
        tsleep_interlock(c, 0);
        if (atomic_cmpset_long(runp, runco, runco | 1))
                tsleep(c, PINTERLOCKED, "cstp3", 0);
        if (c->c_flags & CALLOUT_AUTOLOCK)
                lockmgr(c->c_lk, LK_CANCEL_END);

        rc = (flags & CALLOUT_EXECUTED) != 0;
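/*
 * Deadlock note: if the callback acquires a lock that the stopping thread
 * already holds, a synchronous stop issued while holding that lock can
 * deadlock waiting for the callback to finish.  A sketch of the intended
 * pattern, assuming the callout was initialized with callout_init_lk() on
 * the same lockmgr lock (hypothetical names):
 *
 *	lockmgr(&sc->lk, LK_EXCLUSIVE);
 *	callout_stop(&sc->timer);	// autolock cancelation interlocks
 *	lockmgr(&sc->lk, LK_RELEASE);	// against the running callback
 */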
/*
 * IPI for the stop function.  The callout is locked to the receiving cpu
 * by the IPI_MASK count.
 */
static void
callout_stop_ipi(void *arg, int issync, struct intrframe *frame)
{
        globaldata_t gd = mycpu;
        struct callout *c = arg;

        KKASSERT(CALLOUT_FLAGS_TO_CPU(flags) == gd->gd_cpuid);

        /*
         * We can handle the PENDING flag immediately.
         */
        if (flags & CALLOUT_PENDING) {
                sc = softclock_pcpu_ary[gd->gd_cpuid];
                sc->next = TAILQ_NEXT(c, c_links.tqe);
                TAILQ_REMOVE(
                    &sc->callwheel[c->c_time & cwheelmask],
                    c, c_links.tqe);
        }

        /*
         * Transition to the stopped state and decrement the IPI count.
         * Leave the EXECUTED bit alone (the next callout_reset() will
         * have to deal with it).
         */
        nflags = (flags - 1) & ~(CALLOUT_ACTIVE |
        if (atomic_cmpset_int(&c->c_flags, flags, nflags))

        if (flags & CALLOUT_WAITING)
                wakeup(c);
}
int
callout_stop(struct callout *c)
{
        return _callout_stop(c, 0);
}

int
callout_stop_sync(struct callout *c)
{
        return _callout_stop(c, 1);
}

void
callout_stop_async(struct callout *c)

void
callout_terminate(struct callout *c)
{
        atomic_clear_int(&c->c_flags, CALLOUT_DID_INIT);
}
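/*
 * Teardown sketch (hypothetical detach path): stop the callout
 * synchronously so no callback can still be running, terminate it, and
 * only then free the structure that embeds it.
 *
 *	callout_stop_sync(&sc->timer);
 *	callout_terminate(&sc->timer);
 *	kfree(sc, M_DEVBUF);
 */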
/*
 * Prepare a callout structure for use by callout_reset() and/or
 * callout_stop().
 *
 * The MP version of this routine requires that the callback
 * function installed by callout_reset() be MP safe.
 *
 * The LK version of this routine is also MPsafe and will automatically
 * acquire the specified lock for the duration of the function call,
 * and release it after the function returns.  In addition, when autolocking
 * is used, callout_stop() becomes synchronous if the caller owns the lock.
 * callout_reset(), callout_stop(), and callout_stop_sync() will block
 * normally instead of spinning when a cpu race occurs.  Lock cancelation
 * is used to avoid deadlocks against the callout ring dispatch.
 *
 * The init functions can be called from any cpu and do not have to be
 * called from the cpu that the timer will eventually run on.
 */
static void
_callout_init(struct callout *c, int flags)

void
callout_init(struct callout *c)
{
        _callout_init(c, CALLOUT_DID_INIT);
}

void
callout_init_mp(struct callout *c)
{
        _callout_init(c, CALLOUT_DID_INIT | CALLOUT_MPSAFE);
}

void
callout_init_lk(struct callout *c, struct lock *lk)
{
        _callout_init(c, CALLOUT_DID_INIT | CALLOUT_MPSAFE | CALLOUT_AUTOLOCK);
        c->c_lk = lk;
}
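/*
 * Usage sketch for the auto-lock variant (hypothetical names): the callback
 * then runs with sc->lk held, and stop/reset operations interlock against
 * it via lockmgr cancelation as described above.
 *
 *	lockinit(&sc->lk, "mydev", 0, 0);
 *	callout_init_lk(&sc->timer, &sc->lk);
 *	callout_reset(&sc->timer, hz, mydev_timeout, sc);
 */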