kernel - Provide descriptions for lwkt.* and debug.* sysctl's
[dragonfly.git] / sys / kern / kern_event.c
index b2bfbcf..1aefd31 100644
@@ -36,7 +36,6 @@
 #include <sys/file.h>
 #include <sys/lock.h>
 #include <sys/fcntl.h>
-#include <sys/select.h>
 #include <sys/queue.h>
 #include <sys/event.h>
 #include <sys/eventvar.h>
@@ -46,6 +45,7 @@
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
+#include <sys/thread.h>
 #include <sys/uio.h>
 #include <sys/signalvar.h>
 #include <sys/filio.h>
 
 #include <vm/vm_zone.h>
 
+/*
+ * Global token for kqueue subsystem
+ */
+struct lwkt_token kq_token = LWKT_TOKEN_UP_INITIALIZER(kq_token);
+SYSCTL_INT(_lwkt, OID_AUTO, kq_mpsafe,
+    CTLFLAG_RW, &kq_token.t_flags, 0,
+    "Require MP lock for kq_token");
+SYSCTL_LONG(_lwkt, OID_AUTO, kq_collisions,
+    CTLFLAG_RW, &kq_token.t_collisions, 0,
+    "Collision counter of kq_token");
+
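A note for the rest of this diff: an LWKT token serializes the whole kqueue
subsystem, but unlike a lock it is transparently released whenever the holding
thread blocks and reacquired before it resumes.  A minimal sketch of the
pattern the patch uses everywhere (kernel context, not a standalone program):

    lwkt_gettoken(&kq_token);	/* serialize all kqueue structures */
    /*
     * Any tsleep() in this region silently drops kq_token and regains
     * it on wakeup, so lists may have changed across the call.  That
     * is why the per-knote KN_PROCESSING soft lock added below is
     * still required on top of the token.
     */
    lwkt_reltoken(&kq_token);
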
 MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
 
 struct kevent_copyin_args {
@@ -77,6 +88,9 @@ static int    kqueue_kqfilter(struct file *fp, struct knote *kn);
 static int     kqueue_stat(struct file *fp, struct stat *st,
                    struct ucred *cred);
 static int     kqueue_close(struct file *fp);
+static void    kqueue_wakeup(struct kqueue *kq);
+static int     filter_attach(struct knote *kn);
+static int     filter_event(struct knote *kn, long hint);
 
 /*
  * MPSAFE
@@ -93,6 +107,7 @@ static struct fileops kqueueops = {
 
 static void    knote_attach(struct knote *kn);
 static void    knote_drop(struct knote *kn);
+static void    knote_detach_and_drop(struct knote *kn);
 static void    knote_enqueue(struct knote *kn);
 static void    knote_dequeue(struct knote *kn);
 static void    knote_init(void);
@@ -111,9 +126,9 @@ static void filt_timerdetach(struct knote *kn);
 static int     filt_timer(struct knote *kn, long hint);
 
 static struct filterops file_filtops =
-       { 1, filt_fileattach, NULL, NULL };
+       { FILTEROP_ISFD, filt_fileattach, NULL, NULL };
 static struct filterops kqread_filtops =
-       { 1, NULL, filt_kqdetach, filt_kqueue };
+       { FILTEROP_ISFD, NULL, filt_kqdetach, filt_kqueue };
 static struct filterops proc_filtops =
        { 0, filt_procattach, filt_procdetach, filt_proc };
 static struct filterops timer_filtops =
@@ -124,6 +139,9 @@ static int          kq_ncallouts = 0;
 static int             kq_calloutmax = (4 * 1024);
 SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
     &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
+static int             kq_checkloop = 1000000;
+SYSCTL_INT(_kern, OID_AUTO, kq_checkloop, CTLFLAG_RW,
+    &kq_checkloop, 0, "Maximum number of loops/events handled during kevent processing");
 
 #define KNOTE_ACTIVATE(kn) do {                                        \
        kn->kn_status |= KN_ACTIVE;                                     \
@@ -158,22 +176,18 @@ filt_fileattach(struct knote *kn)
 }
 
 /*
- * MPALMOSTSAFE - acquires mplock
+ * MPSAFE
  */
 static int
 kqueue_kqfilter(struct file *fp, struct knote *kn)
 {
        struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
 
-       get_mplock();
-       if (kn->kn_filter != EVFILT_READ) {
-               rel_mplock();
+       if (kn->kn_filter != EVFILT_READ)
                return (EOPNOTSUPP);
-       }
 
        kn->kn_fop = &kqread_filtops;
-       SLIST_INSERT_HEAD(&kq->kq_sel.si_note, kn, kn_selnext);
-       rel_mplock();
+       knote_insert(&kq->kq_kqinfo.ki_note, kn);
        return (0);
 }
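
kqueue_kqfilter() is what makes a kqueue descriptor itself monitorable via
EVFILT_READ.  A small runnable userland illustration of that facility
(standard kevent(2) usage, nothing specific to this patch):

    #include <sys/types.h>
    #include <sys/event.h>
    #include <sys/time.h>
    #include <err.h>
    #include <stdio.h>

    int
    main(void)
    {
        struct kevent kev;
        int inner, outer;

        if ((inner = kqueue()) < 0 || (outer = kqueue()) < 0)
            err(1, "kqueue");
        /* watch the inner kqueue for readability from the outer one */
        EV_SET(&kev, inner, EVFILT_READ, EV_ADD, 0, 0, NULL);
        if (kevent(outer, &kev, 1, NULL, 0, NULL) < 0)
            err(1, "kevent");
        printf("inner kqueue %d registered on outer kqueue %d\n",
            inner, outer);
        return (0);
    }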
 
@@ -182,7 +196,7 @@ filt_kqdetach(struct knote *kn)
 {
        struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
 
-       SLIST_REMOVE(&kq->kq_sel.si_note, kn, knote, kn_selnext);
+       knote_remove(&kq->kq_kqinfo.ki_note, kn);
 }
 
 /*ARGSUSED*/
@@ -229,8 +243,7 @@ filt_procattach(struct knote *kn)
                kn->kn_flags &= ~EV_FLAG1;
        }
 
-       /* XXX lock the proc here while adding to the list? */
-       SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
+       knote_insert(&p->p_klist, kn);
 
        /*
         * Immediately activate any exit notes if the target process is a
@@ -259,9 +272,9 @@ filt_procdetach(struct knote *kn)
 
        if (kn->kn_status & KN_DETACHED)
                return;
-       /* XXX locking?  this might modify another process. */
+       /* XXX locking? take proc_token here? */
        p = kn->kn_ptr.p_proc;
-       SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
+       knote_remove(&p->p_klist, kn);
 }
 
 static int
@@ -288,7 +301,7 @@ filt_proc(struct knote *kn, long hint)
        if (event == NOTE_EXIT) {
                struct proc *p = kn->kn_ptr.p_proc;
                if ((kn->kn_status & KN_DETACHED) == 0) {
-                       SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
+                       knote_remove(&p->p_klist, kn);
                        kn->kn_status |= KN_DETACHED;
                        kn->kn_data = p->p_xstat;
                        kn->kn_ptr.p_proc = NULL;
@@ -323,6 +336,12 @@ filt_proc(struct knote *kn, long hint)
        return (kn->kn_fflags != 0);
 }
 
+/*
+ * The callout interlocks with callout_stop() (or should), so the
+ * knote should still be a valid structure.  However the timeout
+ * can race a deletion so if KN_DELETING is set we just don't touch
+ * the knote.
+ */
 static void
 filt_timerexpire(void *knx)
 {
@@ -331,16 +350,20 @@ filt_timerexpire(void *knx)
        struct timeval tv;
        int tticks;
 
-       kn->kn_data++;
-       KNOTE_ACTIVATE(kn);
+       lwkt_gettoken(&kq_token);
+       if ((kn->kn_status & KN_DELETING) == 0) {
+               kn->kn_data++;
+               KNOTE_ACTIVATE(kn);
 
-       if ((kn->kn_flags & EV_ONESHOT) == 0) {
-               tv.tv_sec = kn->kn_sdata / 1000;
-               tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
-               tticks = tvtohz_high(&tv);
-               calloutp = (struct callout *)kn->kn_hook;
-               callout_reset(calloutp, tticks, filt_timerexpire, kn);
+               if ((kn->kn_flags & EV_ONESHOT) == 0) {
+                       tv.tv_sec = kn->kn_sdata / 1000;
+                       tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
+                       tticks = tvtohz_high(&tv);
+                       calloutp = (struct callout *)kn->kn_hook;
+                       callout_reset(calloutp, tticks, filt_timerexpire, kn);
+               }
        }
+       lwkt_reltoken(&kq_token);
 }
 
 /*
@@ -389,6 +412,55 @@ filt_timer(struct knote *kn, long hint)
        return (kn->kn_data != 0);
 }
 
+/*
+ * Acquire a knote, return non-zero on success, 0 on failure.
+ *
+ * If we cannot acquire the knote we sleep and return 0.  The knote
+ * may be stale on return in this case and the caller must restart
+ * whatever loop they are in.
+ */
+static __inline
+int
+knote_acquire(struct knote *kn)
+{
+       if (kn->kn_status & KN_PROCESSING) {
+               kn->kn_status |= KN_WAITING | KN_REPROCESS;
+               tsleep(kn, 0, "kqepts", hz);
+               /* knote may be stale now */
+               return(0);
+       }
+       kn->kn_status |= KN_PROCESSING;
+       return(1);
+}
+
+/*
+ * Release an acquired knote, clearing KN_PROCESSING and handling any
+ * KN_REPROCESS events.
+ *
+ * Non-zero is returned if the knote is destroyed.
+ */
+static __inline
+int
+knote_release(struct knote *kn)
+{
+       while (kn->kn_status & KN_REPROCESS) {
+               kn->kn_status &= ~KN_REPROCESS;
+               if (kn->kn_status & KN_WAITING) {
+                       kn->kn_status &= ~KN_WAITING;
+                       wakeup(kn);
+               }
+               if (kn->kn_status & KN_DELETING) {
+                       knote_detach_and_drop(kn);
+                       return(1);
+                       /* NOT REACHED */
+               }
+               if (filter_event(kn, 0))
+                       KNOTE_ACTIVATE(kn);
+       }
+       kn->kn_status &= ~KN_PROCESSING;
+       return(0);
+}
+
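knote_acquire()/knote_release() together form a sleepable soft lock built out
of kn_status bits.  A hedged sketch of the caller pattern they impose
(kn_matches() is a hypothetical predicate; the real lookup loops are in
kqueue_register() below):

    struct knote *kn;
    again:
        SLIST_FOREACH(kn, list, kn_link) {
            if (kn_matches(kn)) {           /* hypothetical match test */
                if (knote_acquire(kn) == 0)
                    goto again;             /* we slept; kn may be stale */
                break;                      /* KN_PROCESSING now set */
            }
        }
        if (kn != NULL) {
            /* ... operate on kn; we may block without losing it ... */
            if (knote_release(kn))
                kn = NULL;                  /* reprocessing destroyed it */
        }
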
 /*
  * Initialize a kqueue.
  *
@@ -403,43 +475,32 @@ kqueue_init(struct kqueue *kq, struct filedesc *fdp)
        TAILQ_INIT(&kq->kq_knlist);
        kq->kq_count = 0;
        kq->kq_fdp = fdp;
-       SLIST_INIT(&kq->kq_sel.si_note);
+       SLIST_INIT(&kq->kq_kqinfo.ki_note);
 }
 
 /*
  * Terminate a kqueue.  Freeing the actual kq itself is left up to the
  * caller (it might be embedded in a lwp so we don't do it here).
+ *
+ * The kq's knlist must be completely eradicated, so block on any
+ * processing races.
  */
 void
 kqueue_terminate(struct kqueue *kq)
 {
        struct knote *kn;
-       struct klist *list;
-       int hv;
 
+       lwkt_gettoken(&kq_token);
        while ((kn = TAILQ_FIRST(&kq->kq_knlist)) != NULL) {
-               kn->kn_fop->f_detach(kn);
-               if (kn->kn_fop->f_isfd) {
-                       list = &kn->kn_fp->f_klist;
-                       SLIST_REMOVE(list, kn, knote, kn_link);
-                       fdrop(kn->kn_fp);
-                       kn->kn_fp = NULL;
-               } else {
-                       hv = KN_HASH(kn->kn_id, kq->kq_knhashmask);
-                       list = &kq->kq_knhash[hv];
-                       SLIST_REMOVE(list, kn, knote, kn_link);
-               }
-               TAILQ_REMOVE(&kq->kq_knlist, kn, kn_kqlink);
-               if (kn->kn_status & KN_QUEUED)
-                       knote_dequeue(kn);
-               knote_free(kn);
+               if (knote_acquire(kn))
+                       knote_detach_and_drop(kn);
        }
-
        if (kq->kq_knhash) {
                kfree(kq->kq_knhash, M_KQUEUE);
                kq->kq_knhash = NULL;
                kq->kq_knhashmask = 0;
        }
+       lwkt_reltoken(&kq_token);
 }
 
 /*
@@ -516,7 +577,7 @@ kevent_copyin(void *arg, struct kevent *kevp, int max, int *events)
 }
 
 /*
- * MPALMOSTSAFE
+ * MPSAFE
  */
 int
 kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap,
@@ -527,13 +588,14 @@ kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap,
        struct timespec *tsp;
        int i, n, total, error, nerrors = 0;
        int lres;
+       int limit = kq_checkloop;
        struct kevent kev[KQ_NEVENTS];
        struct knote marker;
 
        tsp = tsp_in;
        *res = 0;
 
-       get_mplock();
+       lwkt_gettoken(&kq_token);
        for ( ;; ) {
                n = 0;
                error = kevent_copyinfn(uap, kev, KQ_NEVENTS, &n);
@@ -558,17 +620,13 @@ kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap,
                         * ignore it too.
                         */
                        if (error) {
-                               if (nevents != 0) {
-                                       kevp->flags = EV_ERROR;
-                                       kevp->data = error;
-                                       lres = *res;
-                                       kevent_copyoutfn(uap, kevp, 1, res);
-                                       if (lres != *res) {
-                                               nevents--;
-                                               nerrors++;
-                                       }
-                               } else {
-                                       goto done;
+                               kevp->flags = EV_ERROR;
+                               kevp->data = error;
+                               lres = *res;
+                               kevent_copyoutfn(uap, kevp, 1, res);
+                               if (lres != *res) {
+                                       nevents--;
+                                       nerrors++;
                                }
                        }
                }
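
With the nevents == 0 special case gone, registration failures are always
reported in-band: the error is folded into the event as EV_ERROR and copied
back out.  Consumer-side, that is handled with the standard kevent(2)
pattern, shown here as a runnable userland example:

    #include <sys/types.h>
    #include <sys/event.h>
    #include <sys/time.h>
    #include <err.h>
    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
        struct kevent change, result;
        int kq, n;

        if ((kq = kqueue()) < 0)
            err(1, "kqueue");
        /* deliberately register a bogus descriptor to provoke EV_ERROR */
        EV_SET(&change, 12345, EVFILT_READ, EV_ADD, 0, 0, NULL);
        if ((n = kevent(kq, &change, 1, &result, 1, NULL)) < 0)
            err(1, "kevent");
        if (n > 0 && (result.flags & EV_ERROR))
            printf("ident %d failed: %s\n", (int)result.ident,
                strerror((int)result.data));
        return (0);
    }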
@@ -608,9 +666,8 @@ kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap,
        total = 0;
        error = 0;
        marker.kn_filter = EVFILT_MARKER;
-       crit_enter();
+       marker.kn_status = KN_PROCESSING;
        TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
-       crit_exit();
        while ((n = nevents - total) > 0) {
                if (n > KQ_NEVENTS)
                        n = KQ_NEVENTS;
@@ -625,25 +682,27 @@ kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap,
                 */
                if (kq->kq_count == 0 && *res == 0) {
                        error = kqueue_sleep(kq, tsp);
-
                        if (error)
                                break;
-                       crit_enter();
+
                        TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
                        TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
-                       crit_exit();
                }
 
                /*
                 * Process all received events
+                * Account for all non-spurious events in our total
                 */
                i = kqueue_scan(kq, kev, n, &marker);
                if (i) {
+                       lres = *res;
                        error = kevent_copyoutfn(uap, kev, i, res);
-                       total += i;
+                       total += *res - lres;
                        if (error)
                                break;
                }
+               if (limit && --limit == 0)
+                       panic("kqueue: checkloop failed i=%d", i);
 
                /*
                 * Normally when fewer events are returned than requested
@@ -668,22 +727,18 @@ kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap,
                 *       same event.
                 */
                if (i == 0) {
-                       crit_enter();
                        TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
                        TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
-                       crit_exit();
                }
        }
-       crit_enter();
        TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
-       crit_exit();
 
        /* Timeouts do not return EWOULDBLOCK. */
        if (error == EWOULDBLOCK)
                error = 0;
 
 done:
-       rel_mplock();
+       lwkt_reltoken(&kq_token);
        return (error);
 }
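
Two changes in the loop above are easy to miss: total now advances by the
delta the callback applied to *res, so a copyout function may legitimately
consume an event without delivering it (the poll/select emulation layers do
this for spurious events), and kq_checkloop is purely a sanity brake against
an endless scan/copyout cycle.  A hedged sketch of a callback honoring that
contract (all names here are illustrative, not the file's real
copyin/copyout helpers):

    /* hypothetical callback obeying the contract assumed by kern_kevent() */
    static int
    my_copyoutfn(void *arg, struct kevent *kevp, int count, int *res)
    {
        int i, error = 0;

        for (i = 0; i < count; ++i) {
            if (is_spurious(&kevp[i]))      /* hypothetical filter */
                continue;                   /* consumed, not delivered */
            error = deliver(arg, &kevp[i]); /* hypothetical copyout */
            if (error)
                break;
            ++*res;                         /* only delivered events count */
        }
        return (error);
    }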
 
@@ -755,16 +810,22 @@ kqueue_register(struct kqueue *kq, struct kevent *kev)
                return (EINVAL);
        }
 
-       if (fops->f_isfd) {
+       lwkt_gettoken(&kq_token);
+       if (fops->f_flags & FILTEROP_ISFD) {
                /* validate descriptor */
                fp = holdfp(fdp, kev->ident, -1);
-               if (fp == NULL)
+               if (fp == NULL) {
+                       lwkt_reltoken(&kq_token);
                        return (EBADF);
+               }
 
+again1:
                SLIST_FOREACH(kn, &fp->f_klist, kn_link) {
                        if (kn->kn_kq == kq &&
                            kn->kn_filter == kev->filter &&
                            kn->kn_id == kev->ident) {
+                               if (knote_acquire(kn) == 0)
+                                       goto again1;
                                break;
                        }
                }
@@ -774,14 +835,22 @@ kqueue_register(struct kqueue *kq, struct kevent *kev)
                        
                        list = &kq->kq_knhash[
                            KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
+again2:
                        SLIST_FOREACH(kn, list, kn_link) {
                                if (kn->kn_id == kev->ident &&
-                                   kn->kn_filter == kev->filter)
+                                   kn->kn_filter == kev->filter) {
+                                       if (knote_acquire(kn) == 0)
+                                               goto again2;
                                        break;
+                               }
                        }
                }
        }
 
+       /*
+        * NOTE: At this point if kn is non-NULL we will have acquired
+        *       it and set KN_PROCESSING.
+        */
        if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
                error = ENOENT;
                goto done;
@@ -813,49 +882,87 @@ kqueue_register(struct kqueue *kq, struct kevent *kev)
                        kev->data = 0;
                        kn->kn_kevent = *kev;
 
+                       /*
+                        * KN_PROCESSING prevents the knote from getting
+                        * ripped out from under us while we are trying
+                        * to attach it, in case the attach blocks.
+                        */
+                       kn->kn_status = KN_PROCESSING;
                        knote_attach(kn);
-                       if ((error = fops->f_attach(kn)) != 0) {
+                       if ((error = filter_attach(kn)) != 0) {
+                               kn->kn_status |= KN_DELETING | KN_REPROCESS;
                                knote_drop(kn);
                                goto done;
                        }
+
+                       /*
+                        * Interlock against close races which either tried
+                        * to remove our knote while we were blocked or missed
+                        * it entirely prior to our attachment.  We do not
+                        * want to end up with a knote on a closed descriptor.
+                        */
+                       if ((fops->f_flags & FILTEROP_ISFD) &&
+                           checkfdclosed(fdp, kev->ident, kn->kn_fp)) {
+                               kn->kn_status |= KN_DELETING | KN_REPROCESS;
+                       }
                } else {
                        /*
                         * The user may change some filter values after the
                         * initial EV_ADD, but doing so will not reset any 
                         * filters which have already been triggered.
                         */
+                       KKASSERT(kn->kn_status & KN_PROCESSING);
                        kn->kn_sfflags = kev->fflags;
                        kn->kn_sdata = kev->data;
                        kn->kn_kevent.udata = kev->udata;
                }
 
-               crit_enter();
-               if (kn->kn_fop->f_event(kn, 0))
-                       KNOTE_ACTIVATE(kn);
-               crit_exit();
+               /*
+                * Execute the filter event to immediately activate the
+                * knote if necessary.  If reprocessing events are pending
+                * due to blocking above we do not run the filter here
+                * but instead let knote_release() do it.  Otherwise we
+                * might run the filter on a deleted event.
+                */
+               if ((kn->kn_status & KN_REPROCESS) == 0) {
+                       if (filter_event(kn, 0))
+                               KNOTE_ACTIVATE(kn);
+               }
        } else if (kev->flags & EV_DELETE) {
-               kn->kn_fop->f_detach(kn);
-               knote_drop(kn);
+               /*
+                * Delete the existing knote
+                */
+               knote_detach_and_drop(kn);
                goto done;
        }
 
+       /*
+        * Disablement does not deactivate a knote here.
+        */
        if ((kev->flags & EV_DISABLE) &&
            ((kn->kn_status & KN_DISABLED) == 0)) {
-               crit_enter();
                kn->kn_status |= KN_DISABLED;
-               crit_exit();
        }
 
+       /*
+        * Re-enablement may have to immediately enqueue an active knote.
+        */
        if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
-               crit_enter();
                kn->kn_status &= ~KN_DISABLED;
                if ((kn->kn_status & KN_ACTIVE) &&
-                   ((kn->kn_status & KN_QUEUED) == 0))
+                   ((kn->kn_status & KN_QUEUED) == 0)) {
                        knote_enqueue(kn);
-               crit_exit();
+               }
        }
 
+       /*
+        * Handle any required reprocessing
+        */
+       knote_release(kn);
+       /* kn may be invalid now */
+
 done:
+       lwkt_reltoken(&kq_token);
        if (fp != NULL)
                fdrop(fp);
        return (error);
@@ -871,7 +978,6 @@ kqueue_sleep(struct kqueue *kq, struct timespec *tsp)
 {
        int error = 0;
 
-       crit_enter();
        if (tsp == NULL) {
                kq->kq_state |= KQ_SLEEP;
                error = tsleep(kq, PCATCH, "kqread", 0);
@@ -893,7 +999,6 @@ kqueue_sleep(struct kqueue *kq, struct timespec *tsp)
                        error = tsleep(kq, PCATCH, "kqread", timeout);
                }
        }
-       crit_exit();
 
        /* don't restart after signals... */
        if (error == ERESTART)
@@ -918,7 +1023,7 @@ kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
 
         total = 0;
        local_marker.kn_filter = EVFILT_MARKER;
-        crit_enter();
+       local_marker.kn_status = KN_PROCESSING;
 
        /*
         * Collect events.
@@ -938,44 +1043,88 @@ kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
                        continue;
                }
 
-               TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
-               if (kn->kn_status & KN_DISABLED) {
-                       kn->kn_status &= ~KN_QUEUED;
-                       kq->kq_count--;
-                       continue;
-               }
-               if ((kn->kn_flags & EV_ONESHOT) == 0 &&
-                   kn->kn_fop->f_event(kn, 0) == 0) {
-                       kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
-                       kq->kq_count--;
+               /*
+                * We can't skip a knote undergoing processing, otherwise
+                * we risk not returning it when the user process expects
+                * it to be returned.  Sleep and retry.
+                */
+               if (knote_acquire(kn) == 0)
                        continue;
-               }
-               *kevp++ = kn->kn_kevent;
-               ++total;
-               --count;
 
                /*
-                * Post-event action on the note
+                * Remove the event for processing.
+                *
+                * WARNING!  We must leave KN_QUEUED set to prevent the
+                *           event from being KNOTE_ACTIVATE()d while
+                *           the queue state is in limbo, in case we
+                *           block.
+                *
+                * WARNING!  We must set KN_PROCESSING to avoid races
+                *           against deletion or another thread's
+                *           processing.
+                */
+               TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
+               kq->kq_count--;
+
+               /*
+                * We have to deal with an extremely important race against
+                * file descriptor close()s here.  The file descriptor can
+                * be closed out from under us MPSAFE, and there is a small
+                * window of opportunity between that and the call to
+                * knote_fdclose().
+                *
+                * If we hit that window here while doselect or dopoll is
+                * trying to delete a spurious event, they will not be able
+                * to match up the event against a knote and will go haywire.
                 */
-               if (kn->kn_flags & EV_ONESHOT) {
+               if ((kn->kn_fop->f_flags & FILTEROP_ISFD) &&
+                   checkfdclosed(kq->kq_fdp, kn->kn_kevent.ident, kn->kn_fp)) {
+                       kn->kn_status |= KN_DELETING | KN_REPROCESS;
+               }
+
+               if (kn->kn_status & KN_DISABLED) {
+                       /*
+                        * If disabled we ensure the event is not queued
+                        * but leave its active bit set.  On re-enablement
+                        * the event may be immediately triggered.
+                        */
                        kn->kn_status &= ~KN_QUEUED;
-                       kq->kq_count--;
-                       crit_exit();
-                       kn->kn_fop->f_detach(kn);
-                       knote_drop(kn);
-                       crit_enter();
-               } else if (kn->kn_flags & EV_CLEAR) {
-                       kn->kn_data = 0;
-                       kn->kn_fflags = 0;
+               } else if ((kn->kn_flags & EV_ONESHOT) == 0 &&
+                          (kn->kn_status & KN_DELETING) == 0 &&
+                          filter_event(kn, 0) == 0) {
+                       /*
+                        * If not running in one-shot mode and the event
+                        * is no longer present we ensure it is removed
+                        * from the queue and ignore it.
+                        */
                        kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
-                       kq->kq_count--;
                } else {
-                       TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe);
+                       /*
+                        * Post the event
+                        */
+                       *kevp++ = kn->kn_kevent;
+                       ++total;
+                       --count;
+
+                       if (kn->kn_flags & EV_ONESHOT) {
+                               kn->kn_status &= ~KN_QUEUED;
+                               kn->kn_status |= KN_DELETING | KN_REPROCESS;
+                       } else if (kn->kn_flags & EV_CLEAR) {
+                               kn->kn_data = 0;
+                               kn->kn_fflags = 0;
+                               kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
+                       } else {
+                               TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe);
+                               kq->kq_count++;
+                       }
                }
+
+               /*
+                * Handle any post-processing states
+                */
+               knote_release(kn);
        }
        TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe);
 
-       crit_exit();
        return (total);
 }
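
The marker technique above is what lets the scan tolerate blocking: events
are processed strictly up to the marker, and anything re-queued lands behind
it, so it cannot be rescanned in the same pass.  A simplified, runnable
userland model of the idea (the real kqueue_scan() additionally has to step
over other threads' markers):

    #include <sys/queue.h>
    #include <stdio.h>

    struct node {
        TAILQ_ENTRY(node) link;
        int value;
    };
    TAILQ_HEAD(nlist, node);

    int
    main(void)
    {
        struct nlist list = TAILQ_HEAD_INITIALIZER(list);
        struct node a = { .value = 1 }, b = { .value = 2 }, marker;
        struct node *n;

        TAILQ_INSERT_TAIL(&list, &a, link);
        TAILQ_INSERT_TAIL(&list, &b, link);
        TAILQ_INSERT_TAIL(&list, &marker, link);    /* scan boundary */
        while ((n = TAILQ_FIRST(&list)) != &marker) {
            TAILQ_REMOVE(&list, n, link);
            printf("processing %d\n", n->value);
            /* re-queued work lands after the marker: not rescanned */
            TAILQ_INSERT_TAIL(&list, n, link);
        }
        TAILQ_REMOVE(&list, &marker, link);
        return (0);
    }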
 
@@ -1010,7 +1159,7 @@ kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
        struct kqueue *kq;
        int error;
 
-       get_mplock();
+       lwkt_gettoken(&kq_token);
        kq = (struct kqueue *)fp->f_data;
 
        switch(com) {
@@ -1028,7 +1177,7 @@ kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
                error = ENOTTY;
                break;
        }
-       rel_mplock();
+       lwkt_reltoken(&kq_token);
        return (error);
 }
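
The ioctl path now also runs under kq_token.  Judging by the
pgsigio()/KQ_ASYNC logic in knote_enqueue() further down, the (elided)
switch supports async notification; assuming it handles FIOASYNC and
FIOSETOWN, arming a kqueue for SIGIO from userland would look like this
sketch:

    #include <sys/types.h>
    #include <sys/event.h>
    #include <sys/filio.h>
    #include <sys/ioctl.h>
    #include <err.h>
    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    static void
    on_sigio(int sig)
    {
        (void)sig;      /* a real handler would wake the event loop */
    }

    int
    main(void)
    {
        int kq, on = 1;
        int owner = getpid();

        if ((kq = kqueue()) < 0)
            err(1, "kqueue");
        signal(SIGIO, on_sigio);
        if (ioctl(kq, FIOSETOWN, &owner) < 0)   /* who receives SIGIO */
            err(1, "FIOSETOWN");
        if (ioctl(kq, FIOASYNC, &on) < 0)       /* arm async notification */
            err(1, "FIOASYNC");
        printf("kqueue %d armed for SIGIO\n", kq);
        return (0);
    }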
 
@@ -1048,87 +1197,251 @@ kqueue_stat(struct file *fp, struct stat *st, struct ucred *cred)
 }
 
 /*
- * MPALMOSTSAFE - acquires mplock
+ * MPSAFE
  */
 static int
 kqueue_close(struct file *fp)
 {
        struct kqueue *kq = (struct kqueue *)fp->f_data;
 
-       get_mplock();
-
        kqueue_terminate(kq);
 
        fp->f_data = NULL;
        funsetown(kq->kq_sigio);
-       rel_mplock();
 
        kfree(kq, M_KQUEUE);
        return (0);
 }
 
-void
+static void
 kqueue_wakeup(struct kqueue *kq)
 {
        if (kq->kq_state & KQ_SLEEP) {
                kq->kq_state &= ~KQ_SLEEP;
                wakeup(kq);
        }
-       KNOTE(&kq->kq_sel.si_note, 0);
+       KNOTE(&kq->kq_kqinfo.ki_note, 0);
+}
+
+/*
+ * Calls filterops f_attach function, acquiring mplock if filter is not
+ * marked as FILTEROP_MPSAFE.
+ */
+static int
+filter_attach(struct knote *kn)
+{
+       int ret;
+
+       if (!(kn->kn_fop->f_flags & FILTEROP_MPSAFE)) {
+               get_mplock();
+               ret = kn->kn_fop->f_attach(kn);
+               rel_mplock();
+       } else {
+               ret = kn->kn_fop->f_attach(kn);
+       }
+
+       return (ret);
 }
 
 /*
- * walk down a list of knotes, activating them if their event has triggered.
+ * Detach the knote and drop it, destroying the knote.
+ *
+ * Calls filterops f_detach function, acquiring mplock if filter is not
+ * marked as FILTEROP_MPSAFE.
+ */
+static void
+knote_detach_and_drop(struct knote *kn)
+{
+       kn->kn_status |= KN_DELETING | KN_REPROCESS;
+       if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
+               kn->kn_fop->f_detach(kn);
+       } else {
+               get_mplock();
+               kn->kn_fop->f_detach(kn);
+               rel_mplock();
+       }
+       knote_drop(kn);
+}
+
+/*
+ * Calls filterops f_event function, acquiring mplock if filter is not
+ * marked as FILTEROP_MPSAFE.
+ *
+ * If the knote is in the middle of being created or deleted we cannot
+ * safely call the filter op.
+ */
+static int
+filter_event(struct knote *kn, long hint)
+{
+       int ret;
+
+       if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
+               ret = kn->kn_fop->f_event(kn, hint);
+       } else {
+               get_mplock();
+               ret = kn->kn_fop->f_event(kn, hint);
+               rel_mplock();
+       }
+       return (ret);
+}
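
These wrappers are the reason the old boolean f_isfd became the f_flags
bitfield at the top of this diff: FILTEROP_ISFD replaces the boolean, and
FILTEROP_MPSAFE lets a filter opt out of the Giant bracketing entirely.  A
hypothetical filter that is both fd-based and MPSAFE would be declared like
this (the myfilt_* callbacks are placeholders):

    static struct filterops myfilt_filtops =
        { FILTEROP_ISFD | FILTEROP_MPSAFE,
          myfilt_attach, myfilt_detach, myfilt_event };

A filter that leaves FILTEROP_MPSAFE clear keeps running under the mplock
exactly as before, so legacy filters need no source changes.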
+
+/*
+ * Walk down a list of knotes, activating them if their event has triggered.
+ *
+ * If we encounter any knotes which are undergoing processing we just mark
+ * them for reprocessing and do not try to [re]activate the knote.  However,
+ * if a hint is being passed we have to wait and that makes things a bit
+ * sticky.
  */
 void
 knote(struct klist *list, long hint)
 {
        struct knote *kn;
 
-       SLIST_FOREACH(kn, list, kn_selnext)
-               if (kn->kn_fop->f_event(kn, hint))
-                       KNOTE_ACTIVATE(kn);
+       lwkt_gettoken(&kq_token);
+restart:
+       SLIST_FOREACH(kn, list, kn_next) {
+               if (kn->kn_status & KN_PROCESSING) {
+                       /*
+                        * Someone else is processing the knote, ask the
+                        * other thread to reprocess it and don't mess
+                        * with it otherwise.
+                        */
+                       if (hint == 0) {
+                               kn->kn_status |= KN_REPROCESS;
+                               continue;
+                       }
+
+                       /*
+                        * If the hint is non-zero we have to wait or risk
+                        * losing the state the caller is trying to update.
+                        *
+                        * XXX This is a real problem, certain process
+                        *     and signal filters will bump kn_data for
+                        *     already-processed notes more than once if
+                        *     we restart the list scan.  FIXME.
+                        */
+                       kn->kn_status |= KN_WAITING | KN_REPROCESS;
+                       tsleep(kn, 0, "knotec", hz);
+                       goto restart;
+               }
+
+               /*
+                * Become the reprocessing master ourselves.
+                *
+                * If hint is non-zero, running the event is mandatory
+                * when not deleting so do it whether reprocessing is
+                * set or not.
+                */
+               kn->kn_status |= KN_PROCESSING;
+               if ((kn->kn_status & KN_DELETING) == 0) {
+                       if (filter_event(kn, hint))
+                               KNOTE_ACTIVATE(kn);
+               }
+               if (knote_release(kn))
+                       goto restart;
+       }
+       lwkt_reltoken(&kq_token);
+}
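
From the producer side, the hint selects between the two branches above: a
zero hint may simply flag a busy knote with KN_REPROCESS, while a non-zero
hint carries state that must not be lost and may therefore sleep.  A hedged
kernel-context sketch (sc is a hypothetical driver softc; the NOTE_EXIT line
mirrors what exit1() does against p_klist):

    KNOTE(&sc->sc_kqinfo.ki_note, 0);   /* cheap: poll-style wakeup */
    KNOTE(&p->p_klist, NOTE_EXIT);      /* stateful: may wait on a busy knote */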
+
+/*
+ * Insert knote at head of klist.
+ *
+ * This function may only be called via a filter function and thus
+ * kq_token should already be held and the knote marked for processing.
+ */
+void
+knote_insert(struct klist *klist, struct knote *kn)
+{
+       KKASSERT(kn->kn_status & KN_PROCESSING);
+       ASSERT_LWKT_TOKEN_HELD(&kq_token);
+       SLIST_INSERT_HEAD(klist, kn, kn_next);
+}
+
+/*
+ * Remove knote from a klist
+ *
+ * This function may only be called via a filter function and thus
+ * kq_token should already be held and the knote marked for processing.
+ */
+void
+knote_remove(struct klist *klist, struct knote *kn)
+{
+       KKASSERT(kn->kn_status & KN_PROCESSING);
+       ASSERT_LWKT_TOKEN_HELD(&kq_token);
+       SLIST_REMOVE(klist, kn, knote, kn_next);
 }
 
 /*
- * remove all knotes from a specified klist
+ * Remove all knotes from a specified klist
+ *
+ * Only called from aio.
  */
 void
-knote_remove(struct klist *list)
+knote_empty(struct klist *list)
 {
        struct knote *kn;
 
+       lwkt_gettoken(&kq_token);
        while ((kn = SLIST_FIRST(list)) != NULL) {
-               kn->kn_fop->f_detach(kn);
-               knote_drop(kn);
+               if (knote_acquire(kn))
+                       knote_detach_and_drop(kn);
        }
+       lwkt_reltoken(&kq_token);
+}
+
+void
+knote_assume_knotes(struct kqinfo *src, struct kqinfo *dst,
+                   struct filterops *ops, void *hook)
+{
+       struct knote *kn;
+
+       lwkt_gettoken(&kq_token);
+       while ((kn = SLIST_FIRST(&src->ki_note)) != NULL) {
+               if (knote_acquire(kn)) {
+                       knote_remove(&src->ki_note, kn);
+                       kn->kn_fop = ops;
+                       kn->kn_hook = hook;
+                       knote_insert(&dst->ki_note, kn);
+                       knote_release(kn);
+                       /* kn may be invalid now */
+               }
+       }
+       lwkt_reltoken(&kq_token);
 }
 
 /*
- * remove all knotes referencing a specified fd
+ * Remove all knotes referencing a specified fd
  */
 void
 knote_fdclose(struct file *fp, struct filedesc *fdp, int fd)
 {
        struct knote *kn;
 
+       lwkt_gettoken(&kq_token);
 restart:
        SLIST_FOREACH(kn, &fp->f_klist, kn_link) {
                if (kn->kn_kq->kq_fdp == fdp && kn->kn_id == fd) {
-                       kn->kn_fop->f_detach(kn);
-                       knote_drop(kn);
+                       if (knote_acquire(kn))
+                               knote_detach_and_drop(kn);
                        goto restart;
                }
        }
+       lwkt_reltoken(&kq_token);
 }
 
+/*
+ * Low level attach function.
+ *
+ * The knote should already be marked for processing.
+ */
 static void
 knote_attach(struct knote *kn)
 {
        struct klist *list;
        struct kqueue *kq = kn->kn_kq;
 
-       if (kn->kn_fop->f_isfd) {
+       if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
                KKASSERT(kn->kn_fp);
                list = &kn->kn_fp->f_klist;
        } else {
@@ -1139,12 +1452,12 @@ knote_attach(struct knote *kn)
        }
        SLIST_INSERT_HEAD(list, kn, kn_link);
        TAILQ_INSERT_HEAD(&kq->kq_knlist, kn, kn_kqlink);
-       kn->kn_status = 0;
 }
 
 /*
- * should be called outside of a critical section, since we don't want to
- * hold a critical section while calling fdrop and free.
+ * Low level drop function.
+ *
+ * The knote should already be marked for processing.
  */
 static void
 knote_drop(struct knote *kn)
@@ -1154,7 +1467,7 @@ knote_drop(struct knote *kn)
 
        kq = kn->kn_kq;
 
-       if (kn->kn_fop->f_isfd)
+       if (kn->kn_fop->f_flags & FILTEROP_ISFD)
                list = &kn->kn_fp->f_klist;
        else
                list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
@@ -1163,20 +1476,24 @@ knote_drop(struct knote *kn)
        TAILQ_REMOVE(&kq->kq_knlist, kn, kn_kqlink);
        if (kn->kn_status & KN_QUEUED)
                knote_dequeue(kn);
-       if (kn->kn_fop->f_isfd)
+       if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
                fdrop(kn->kn_fp);
+               kn->kn_fp = NULL;
+       }
        knote_free(kn);
 }
 
-
+/*
+ * Low level enqueue function.
+ *
+ * The knote should already be marked for processing.
+ */
 static void
 knote_enqueue(struct knote *kn)
 {
        struct kqueue *kq = kn->kn_kq;
 
-       crit_enter();
        KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
-
        TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe);
        kn->kn_status |= KN_QUEUED;
        ++kq->kq_count;
@@ -1186,22 +1503,24 @@ knote_enqueue(struct knote *kn)
         */
        if (kq->kq_sigio && (kq->kq_state & KQ_ASYNC) && kq->kq_count == 1)
                pgsigio(kq->kq_sigio, SIGIO, 0);
-       crit_exit();
+
        kqueue_wakeup(kq);
 }
 
+/*
+ * Low level dequeue function.
+ *
+ * The knote should already be marked for processing.
+ */
 static void
 knote_dequeue(struct knote *kn)
 {
        struct kqueue *kq = kn->kn_kq;
 
        KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
-       crit_enter();
-
        TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
        kn->kn_status &= ~KN_QUEUED;
        kq->kq_count--;
-       crit_exit();
 }
 
 static void