kernel - kqueue - refactor kqueue_scan(), rename tick to ustick
author Matthew Dillon <dillon@apollo.backplane.com>
Sun, 3 Jan 2010 03:40:15 +0000 (19:40 -0800)
committer Matthew Dillon <dillon@apollo.backplane.com>
Sun, 3 Jan 2010 03:40:15 +0000 (19:40 -0800)
* Refactor kqueue_scan() so it can be called stand-alone.

* Rename tick to ustick (microseconds per tick), and add nstick
  (nanoseconds per tick).

16 files changed:
sys/kern/init_main.c
sys/kern/kern_acct.c
sys/kern/kern_clock.c
sys/kern/kern_event.c
sys/kern/kern_exit.c
sys/kern/kern_fork.c
sys/kern/kern_synch.c
sys/kern/kern_time.c
sys/kern/subr_param.c
sys/kern/sys_mqueue.c
sys/kern/uipc_socket.c
sys/net/altq/altq_rmclass.c
sys/net/bpf.c
sys/sys/kernel.h
sys/sys/proc.h
sys/sys/time.h

index a39c37c..569bde2 100644 (file)
@@ -410,6 +410,8 @@ proc0_init(void *dummy __unused)
                    vmspace_pmap(&vmspace0));
        sysref_activate(&vmspace0.vm_sysref);
 
+       kqueue_init(&lwp0.lwp_kqueue, &filedesc0);
+
        /*
         * Charge root for one process.
         */
index 73df0a1..44502e9 100644 (file)
@@ -235,7 +235,7 @@ acct_process(struct proc *p)
        r = &p->p_ru;
        tmp = ru.ru_utime;;
        timevaladd(&tmp, &ru.ru_stime);
-       t = tmp.tv_sec * hz + tmp.tv_usec / tick;
+       t = tmp.tv_sec * hz + tmp.tv_usec / ustick;
        if (t)
                acct.ac_mem = (r->ru_ixrss + r->ru_idrss + r->ru_isrss) / t;
        else
index 65c737b..83818c2 100644 (file)
@@ -539,10 +539,10 @@ hardclock(systimer_t info, struct intrframe *frame)
        if ((p = curproc) != NULL && try_mplock()) {
                if (frame && CLKF_USERMODE(frame) &&
                    timevalisset(&p->p_timer[ITIMER_VIRTUAL].it_value) &&
-                   itimerdecr(&p->p_timer[ITIMER_VIRTUAL], tick) == 0)
+                   itimerdecr(&p->p_timer[ITIMER_VIRTUAL], ustick) == 0)
                        ksignal(p, SIGVTALRM);
                if (timevalisset(&p->p_timer[ITIMER_PROF].it_value) &&
-                   itimerdecr(&p->p_timer[ITIMER_PROF], tick) == 0)
+                   itimerdecr(&p->p_timer[ITIMER_PROF], ustick) == 0)
                        ksignal(p, SIGPROF);
                rel_mplock();
        }
@@ -791,19 +791,53 @@ tvtohz_high(struct timeval *tv)
                        sec++;
                        usec -= 1000000;
                }
-               kprintf("tvtohz_high: negative time difference %ld sec %ld usec\n",
-                      sec, usec);
+               kprintf("tvtohz_high: negative time difference "
+                       "%ld sec %ld usec\n",
+                       sec, usec);
 #endif
                ticks = 1;
        } else if (sec <= INT_MAX / hz) {
                ticks = (int)(sec * hz + 
-                           ((u_long)usec + (tick - 1)) / tick) + 1;
+                           ((u_long)usec + (ustick - 1)) / ustick) + 1;
        } else {
                ticks = INT_MAX;
        }
        return (ticks);
 }
 
+/*
+ * Compute number of ticks for the specified timespec, erring on the side
+ * of it being too high: the sub-second part is rounded up to a whole tick
+ * and one extra tick is added.  A negative time yields 1 tick and a time
+ * too large to represent as an int saturates at INT_MAX.
+ */
+int
+tstohz_high(struct timespec *ts)
+{
+       int ticks;
+       long sec, nsec;
+
+       sec = ts->tv_sec;
+       nsec = ts->tv_nsec;
+       if (nsec < 0) {
+               /* borrow one second so that nsec is non-negative */
+               sec--;
+               nsec += 1000000000;
+       }
+       if (sec < 0) {
+#ifdef DIAGNOSTIC
+               /* re-express as a negative sec/nsec pair for the message */
+               if (nsec > 0) {
+                       sec++;
+                       nsec -= 1000000000;
+               }
+               kprintf("tstohz_high: negative time difference "
+                       "%ld sec %ld nsec\n",
+                       sec, nsec);
+#endif
+               ticks = 1;
+       } else if (sec <= INT_MAX / hz) {
+               u_long frac;
+
+               /*
+                * sec * hz fits in an int here, but adding the rounded-up
+                * fraction plus one can still exceed INT_MAX right at the
+                * boundary, so saturate instead of overflowing.
+                */
+               frac = ((u_long)nsec + (nstick - 1)) / nstick + 1;
+               if (frac > (u_long)(INT_MAX - sec * hz))
+                       ticks = INT_MAX;
+               else
+                       ticks = (int)(sec * hz + frac);
+       } else {
+               ticks = INT_MAX;
+       }
+       return (ticks);
+}
+
+
 /*
  * Compute number of ticks for the specified amount of time, erring on
  * the side of it being too low to ensure that sleeping the returned number
@@ -824,12 +858,25 @@ tvtohz_low(struct timeval *tv)
 
        sec = tv->tv_sec;
        if (sec <= INT_MAX / hz)
-               ticks = (int)(sec * hz + (u_long)tv->tv_usec / tick);
+               ticks = (int)(sec * hz + (u_long)tv->tv_usec / ustick);
        else
                ticks = INT_MAX;
        return (ticks);
 }
 
+/*
+ * Compute number of ticks for the specified timespec, erring on the low
+ * side: the sub-second part is truncated to whole ticks.  A seconds value
+ * too large to convert yields INT_MAX.
+ */
+int
+tstohz_low(struct timespec *ts)
+{
+       long secs = ts->tv_sec;
+
+       /* seconds alone would not fit in an int worth of ticks */
+       if (secs > INT_MAX / hz)
+               return (INT_MAX);
+       return ((int)(secs * hz + (u_long)ts->tv_nsec / nstick));
+}
 
 /*
  * Start profiling on a process.
@@ -883,7 +930,7 @@ sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS)
         * Construct clockinfo structure.
         */
        clkinfo.ci_hz = hz;
-       clkinfo.ci_tick = tick;
+       clkinfo.ci_tick = ustick;
        clkinfo.ci_tickadj = ntp_default_tick_delta / 1000;
        clkinfo.ci_profhz = profhz;
        clkinfo.ci_stathz = stathz ? stathz : hz;
index d70f053..38c6065 100644 (file)
@@ -59,9 +59,8 @@
 
 MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
 
-static int     kqueue_scan(struct file *fp, int maxevents,
-                   struct kevent *ulistp, const struct timespec *timeout,
-                   struct thread *td, int *res);
+static int     kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
+                   struct timespec *tsp, int *errorp);
 static int     kqueue_read(struct file *fp, struct uio *uio,
                    struct ucred *cred, int flags);
 static int     kqueue_write(struct file *fp, struct uio *uio,
@@ -471,10 +470,10 @@ sys_kevent(struct kevent_args *uap)
        struct kqueue *kq;
        struct file *fp = NULL;
        struct timespec ts;
-       int i, n, nerrors, error;
+       struct timespec *tsp;
+       int i, n, total, nerrors, error;
        struct kevent kev[KQ_NEVENTS];
 
-
        fp = holdfp(p->p_fd, uap->fd, -1);
        if (fp == NULL)
                return (EBADF);
@@ -483,11 +482,13 @@ sys_kevent(struct kevent_args *uap)
                return (EBADF);
        }
 
-       if (uap->timeout != NULL) {
+       if (uap->timeout) {
                error = copyin(uap->timeout, &ts, sizeof(ts));
                if (error)
                        goto done;
-               uap->timeout = &ts;
+               tsp = &ts;
+       } else {
+               tsp = NULL;
        }
 
        kq = (struct kqueue *)fp->f_data;
@@ -507,9 +508,8 @@ sys_kevent(struct kevent_args *uap)
                                if (uap->nevents != 0) {
                                        kevp->flags = EV_ERROR;
                                        kevp->data = error;
-                                       (void) copyout((caddr_t)kevp,
-                                           (caddr_t)uap->eventlist,
-                                           sizeof(*kevp));
+                                       copyout(kevp, uap->eventlist,
+                                               sizeof(*kevp));
                                        uap->eventlist++;
                                        uap->nevents--;
                                        nerrors++;
@@ -527,8 +527,42 @@ sys_kevent(struct kevent_args *uap)
                goto done;
        }
 
-       error = kqueue_scan(fp, uap->nevents, uap->eventlist,
-                           uap->timeout, td, &uap->sysmsg_result);
+       /*
+        * Acquire/wait for events - setup timeout
+        */
+       if (tsp != NULL) {
+               struct timespec ats;
+
+               if (tsp->tv_sec || tsp->tv_nsec) {
+                       nanouptime(&ats);
+                       timespecadd(tsp, &ats);         /* tsp = target time */
+               }
+       }
+
+       /*
+        * Loop as required.
+        *
+        * Collect as many events as we can.  The timeout on successive
+        * loops is disabled (kqueue_scan() becomes non-blocking).
+        */
+       total = 0;
+       error = 0;
+       while ((n = uap->nevents - total) > 0) {
+               if (n > KQ_NEVENTS)
+                       n = KQ_NEVENTS;
+               i = kqueue_scan(kq, kev, n, tsp, &error);
+               if (i == 0)
+                       break;
+               error = copyout(kev, uap->eventlist + total,
+                               (size_t)i * sizeof(struct kevent));
+               total += i;
+               if (error || i != n)
+                       break;
+               tsp = &ts;              /* successive loops non-blocking */
+               tsp->tv_sec = 0;
+               tsp->tv_nsec = 0;
+       }
+       uap->sysmsg_result = total;
 done:
        rel_mplock();
        if (fp != NULL)
@@ -665,83 +699,63 @@ done:
        return (error);
 }
 
+/*
+ * Scan the kqueue and store up to 'count' pending events in kevp[],
+ * blocking if necessary until the target time is reached.
+ *
+ * tsp is an absolute uptime target.  If tsp is NULL we block indefinitely.
+ * If tsp->tv_sec and tsp->tv_nsec are both 0 we do not block at all.
+ *
+ * Returns the number of events stored.  *errorp is only written when the
+ * queue was empty and we had to consider sleeping: ERESTART is reported
+ * as EINTR and EWOULDBLOCK (timeout / non-blocking) is reported as 0.
+ */
 static int
-kqueue_scan(struct file *fp, int maxevents, struct kevent *ulistp,
-           const struct timespec *tsp, struct thread *td, int *res)
+kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
+           struct timespec *tsp, int *errorp)
 {
-       struct kqueue *kq = (struct kqueue *)fp->f_data;
-       struct kevent *kevp;
-       struct timeval atv, rtv, ttv;
        struct knote *kn, marker;
-       int count, timeout, nkev = 0, error = 0;
-       struct kevent kev[KQ_NEVENTS];
-
-       count = maxevents;
-       if (count == 0)
-               goto done;
-
-       if (tsp != NULL) {
-               TIMESPEC_TO_TIMEVAL(&atv, tsp);
-               if (itimerfix(&atv)) {
-                       error = EINVAL;
-                       goto done;
-               }
-               if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
-                       timeout = -1;
-               else 
-                       timeout = atv.tv_sec > 24 * 60 * 60 ?
-                           24 * 60 * 60 * hz : tvtohz_high(&atv);
-               getmicrouptime(&rtv);
-               timevaladd(&atv, &rtv);
-       } else {
-               atv.tv_sec = 0;
-               atv.tv_usec = 0;
-               timeout = 0;
-       }
-       goto start;
-
-retry:
-       if (atv.tv_sec || atv.tv_usec) {
-               getmicrouptime(&rtv);
-               if (timevalcmp(&rtv, &atv, >=))
-                       goto done;
-               ttv = atv;
-               timevalsub(&ttv, &rtv);
-               timeout = ttv.tv_sec > 24 * 60 * 60 ?
-                       24 * 60 * 60 * hz : tvtohz_high(&ttv);
-       }
+       int total;
 
-start:
-       kevp = &kev[0];
+       total = 0;
+again:
        crit_enter();
        if (kq->kq_count == 0) {
-               if (timeout < 0) { 
-                       error = EWOULDBLOCK;
-               } else {
+               if (tsp == NULL) {
                        kq->kq_state |= KQ_SLEEP;
-                       error = tsleep(kq, PCATCH, "kqread", timeout);
+                       *errorp = tsleep(kq, PCATCH, "kqread", 0);
+               } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
+                       *errorp = EWOULDBLOCK;
+               } else {
+                       struct timespec ats;
+                       struct timespec atx = *tsp;
+                       int timeout;
+
+                       nanouptime(&ats);
+                       timespecsub(&atx, &ats);        /* atx = time left */
+                       if (atx.tv_sec < 0) {
+                               *errorp = EWOULDBLOCK;
+                       } else {
+                               timeout = atx.tv_sec > 24 * 60 * 60 ?
+                                       24 * 60 * 60 * hz : tstohz_high(&atx);
+                               kq->kq_state |= KQ_SLEEP;
+                               *errorp = tsleep(kq, PCATCH, "kqread", timeout);
+                       }
                }
                crit_exit();
-               if (error == 0)
-                       goto retry;
+               if (*errorp == 0)
+                       goto again;
                /* don't restart after signals... */
-               if (error == ERESTART)
-                       error = EINTR;
-               else if (error == EWOULDBLOCK)
-                       error = 0;
+               if (*errorp == ERESTART)
+                       *errorp = EINTR;
+               else if (*errorp == EWOULDBLOCK)
+                       *errorp = 0;
                goto done;
        }
 
+       /*
+        * Collect events
+        */
        TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
        while (count) {
                kn = TAILQ_FIRST(&kq->kq_knpend);
                TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
-               if (kn == &marker) {
-                       crit_exit();
-                       if (count == maxevents)
-                               goto retry;
-                       goto done;
-               }
+               if (kn == &marker)
+                       break;
                if (kn->kn_status & KN_DISABLED) {
                        kn->kn_status &= ~KN_QUEUED;
                        kq->kq_count--;
@@ -753,9 +767,13 @@ start:
                        kq->kq_count--;
                        continue;
                }
-               *kevp = kn->kn_kevent;
-               kevp++;
-               nkev++;
+               *kevp++ = kn->kn_kevent;
+               ++total;
+               --count;
+
+               /*
+                * Post-event action on the note
+                */
                if (kn->kn_flags & EV_ONESHOT) {
                        kn->kn_status &= ~KN_QUEUED;
                        kq->kq_count--;
@@ -771,26 +789,13 @@ start:
                } else {
                        TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe);
                }
-               count--;
-               if (nkev == KQ_NEVENTS) {
-                       crit_exit();
-                       error = copyout(kev, ulistp,
-                                       sizeof(struct kevent) * nkev);
-                       ulistp += nkev;
-                       nkev = 0;
-                       kevp = &kev[0];
-                       crit_enter();
-                       if (error)
-                               break;
-               }
        }
        TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
        crit_exit();
+       if (total == 0)
+               goto again;
 done:
-       if (nkev != 0)
-               error = copyout(kev, ulistp, sizeof(struct kevent) * nkev);
-        *res = maxevents - count;
-       return (error);
+       return (total);
 }
 
 /*
index ffcc1c2..a9b3952 100644 (file)
@@ -568,6 +568,11 @@ lwp_exit(int masterexit)
        if (lp->lwp_vkernel)
                vkernel_lwp_exit(lp);
 
+       /*
+        * Clean up select/poll support
+        */
+       kqueue_terminate(&lp->lwp_kqueue);
+
        /*
         * Clean up any syscall-cached ucred
         */
index a804e8b..12b1fca 100644 (file)
@@ -626,6 +626,7 @@ lwp_fork(struct lwp *origlp, struct proc *destproc, int flags)
         */
        cpu_fork(origlp, lp, flags);
        caps_fork(origlp->lwp_thread, lp->lwp_thread);
+       kqueue_init(&lp->lwp_kqueue, destproc->p_fd);
 
        return (lp);
 }
index 04114e7..a8c3ddf 100644 (file)
@@ -128,13 +128,13 @@ sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
 {
        int error, new_val;
 
-       new_val = sched_quantum * tick;
+       new_val = sched_quantum * ustick;
        error = sysctl_handle_int(oidp, &new_val, 0, req);
         if (error != 0 || req->newptr == NULL)
                return (error);
-       if (new_val < tick)
+       if (new_val < ustick)
                return (EINVAL);
-       sched_quantum = new_val / tick;
+       sched_quantum = new_val / ustick;
        hogticks = 2 * sched_quantum;
        return (0);
 }
index 2f3ec84..ec55d71 100644 (file)
@@ -311,7 +311,7 @@ nanosleep1(struct timespec *rqt, struct timespec *rmt)
                int ticks;
                struct systimer info;
 
-               ticks = tv.tv_usec / tick;      /* approximate */
+               ticks = tv.tv_usec / ustick;    /* approximate */
 
                if (tv.tv_sec == 0 && ticks == 0) {
                        thread_t td = curthread;
@@ -821,8 +821,8 @@ itimerfix(struct timeval *tv)
        if (tv->tv_sec < 0 || tv->tv_sec > 100000000 ||
            tv->tv_usec < 0 || tv->tv_usec >= 1000000)
                return (EINVAL);
-       if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < tick)
-               tv->tv_usec = tick;
+       if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < ustick)
+               tv->tv_usec = ustick;
        return (0);
 }
 
index 2497fa5..7ef25d3 100644 (file)
@@ -73,7 +73,8 @@
 int    hz;
 int    stathz;
 int    profhz;
-int    tick;                           /* tick interval in microseconds */
+int    ustick;                         /* tick interval in microseconds */
+int    nstick;                         /* tick interval in nanoseconds */
 int    maxusers;                       /* base tunable */
 int    maxproc;                        /* maximum # of processes */
 int    maxprocperuid;                  /* max # of procs per user */
@@ -114,7 +115,8 @@ init_param1(void)
        TUNABLE_INT_FETCH("kern.hz", &hz);
        stathz = hz * 128 / 100;
        profhz = stathz;
-       tick = 1000000 / hz;
+       ustick = 1000000 / hz;
+       nstick = 1000000000 / hz;
        /* can adjust 30ms in 60s */
        ntp_default_tick_delta = howmany(30000000, 60 * hz);
 
index 74cdb6f..08347d2 100644 (file)
@@ -246,8 +246,8 @@ itimespecfix(struct timespec *ts)
 {
        if (ts->tv_sec < 0 || ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
                return (EINVAL);
-       if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < tick * 1000)
-               ts->tv_nsec = tick * 1000;
+       if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < nstick)
+               ts->tv_nsec = nstick;
        return (0);
 }
 
index 900172e..97ae97e 100644 (file)
@@ -1430,7 +1430,7 @@ sosetopt(struct socket *so, struct sockopt *sopt)
                        }
                        /* assert(ustick > 0); */
                        /* assert(ULONG_MAX - SHRT_MAX >= 1000000); */
-                       val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
+                       val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / ustick;
                        if (val > SHRT_MAX) {
                                error = EDOM;
                                goto bad;
@@ -1582,7 +1582,7 @@ integer:
                                  so->so_snd.ssb_timeo : so->so_rcv.ssb_timeo);
 
                        tv.tv_sec = optval / hz;
-                       tv.tv_usec = (optval % hz) * tick;
+                       tv.tv_usec = (optval % hz) * ustick;
                        error = sooptcopyout(sopt, &tv, sizeof tv);
                        break;                  
 
index 10d8bc0..28824ff 100644 (file)
@@ -1480,8 +1480,8 @@ rmc_delay_action(struct rm_class *cl, struct rm_class *borrow)
                 * NOTE:  If there's no other traffic, we need the timer as
                 * a 'backstop' to restart this class.
                 */
-               if (delay > tick * 2)
-                       t = (delay + tick - 1) / tick;
+               if (delay > ustick * 2)
+                       t = (delay + ustick - 1) / ustick;
                else
                        t = 2;
                callout_reset(&cl->callout_, t, rmc_restart, cl);
index cb795b7..a92c207 100644 (file)
@@ -840,7 +840,7 @@ bpfioctl(struct dev_ioctl_args *ap)
                        struct timeval *tv = (struct timeval *)ap->a_data;
 
                        tv->tv_sec = d->bd_rtout / hz;
-                       tv->tv_usec = (d->bd_rtout % hz) * tick;
+                       tv->tv_usec = (d->bd_rtout % hz) * ustick;
                        break;
                }
 
index ef9c04f..950eded 100644 (file)
@@ -76,7 +76,8 @@ extern struct timespec boottime;
 
 extern struct timezone tz;                     /* XXX */
 
-extern int tick;                       /* usec per tick (1000000 / hz) */
+extern int ustick;                     /* usec per tick (1000000 / hz) */
+extern int nstick;                     /* nsec per tick (1000000000 / hz) */
 extern int tickadj;                    /* "standard" clock skew, us./tick */
 extern int hz;                         /* system clock's frequency */
 extern int psratio;                    /* ratio: prof / stat */
index 9d1d42a..3e074c8 100644 (file)
@@ -61,6 +61,7 @@
 #endif
 #include <sys/ucred.h>
 #include <sys/event.h>                 /* For struct klist */
+#include <sys/eventvar.h>
 #include <sys/sysent.h>                        /* For struct sysentvec */
 #include <sys/thread.h>
 #include <sys/varsym.h>
@@ -216,6 +217,7 @@ struct lwp {
 
        struct thread   *lwp_thread;    /* backpointer to proc's thread */
        struct upcall   *lwp_upcall;    /* REGISTERED USERLAND POINTER! */
+       struct kqueue   lwp_kqueue;     /* for select/poll */
 };
 
 struct proc {
index af6c630..27f17a2 100644 (file)
@@ -223,6 +223,8 @@ void        timevaladd (struct timeval *, const struct timeval *);
 void   timevalsub (struct timeval *, const struct timeval *);
 int    tvtohz_high (struct timeval *);
 int    tvtohz_low (struct timeval *);
+int    tstohz_high (struct timespec *);
+int    tstohz_low (struct timespec *);
 int64_t        tsc_get_target(int ns);
 int    tsc_test_target(int64_t target);