kernel - Provide descriptions for lwkt.* and debug.* sysctl's
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index 611f41a..1aefd31 100644
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c
 #include <sys/file.h>
 #include <sys/lock.h>
 #include <sys/fcntl.h>
-#include <sys/select.h>
 #include <sys/queue.h>
 #include <sys/event.h>
 #include <sys/eventvar.h>
-#include <sys/poll.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
+#include <sys/thread.h>
 #include <sys/uio.h>
 #include <sys/signalvar.h>
 #include <sys/filio.h>
+#include <sys/ktr.h>
 
 #include <sys/thread2.h>
 #include <sys/file2.h>
 
 #include <vm/vm_zone.h>
 
+/*
+ * Global token for kqueue subsystem
+ */
+struct lwkt_token kq_token = LWKT_TOKEN_UP_INITIALIZER(kq_token);
+SYSCTL_INT(_lwkt, OID_AUTO, kq_mpsafe,
+    CTLFLAG_RW, &kq_token.t_flags, 0,
+    "Require MP lock for kq_token");
+SYSCTL_LONG(_lwkt, OID_AUTO, kq_collisions,
+    CTLFLAG_RW, &kq_token.t_collisions, 0,
+    "Collision counter of kq_token");
+
 MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
 
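+/*
+ * Argument glue for the default sys_kevent() copyin/copyout callbacks;
+ * pchanges tracks how many changelist entries have been copied in so far.
+ */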
+struct kevent_copyin_args {
+       struct kevent_args      *ka;
+       int                     pchanges;
+};
+
+static int     kqueue_sleep(struct kqueue *kq, struct timespec *tsp);
 static int     kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
-                   struct timespec *tsp, int *errorp);
+                   struct knote *marker);
 static int     kqueue_read(struct file *fp, struct uio *uio,
                    struct ucred *cred, int flags);
 static int     kqueue_write(struct file *fp, struct uio *uio,
                    struct ucred *cred, int flags);
 static int     kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
                    struct ucred *cred, struct sysmsg *msg);
-static int     kqueue_poll(struct file *fp, int events, struct ucred *cred);
 static int     kqueue_kqfilter(struct file *fp, struct knote *kn);
 static int     kqueue_stat(struct file *fp, struct stat *st,
                    struct ucred *cred);
 static int     kqueue_close(struct file *fp);
-static void    kqueue_wakeup(struct kqueue *kq);
+static void    kqueue_wakeup(struct kqueue *kq);
+static int     filter_attach(struct knote *kn);
+static int     filter_event(struct knote *kn, long hint);
 
 /*
  * MPSAFE
@@ -81,7 +99,6 @@ static struct fileops kqueueops = {
        .fo_read = kqueue_read,
        .fo_write = kqueue_write,
        .fo_ioctl = kqueue_ioctl,
-       .fo_poll = kqueue_poll,
        .fo_kqfilter = kqueue_kqfilter,
        .fo_stat = kqueue_stat,
        .fo_close = kqueue_close,
@@ -90,6 +107,7 @@ static struct fileops kqueueops = {
 
 static void    knote_attach(struct knote *kn);
 static void    knote_drop(struct knote *kn);
+static void    knote_detach_and_drop(struct knote *kn);
 static void    knote_enqueue(struct knote *kn);
 static void    knote_dequeue(struct knote *kn);
 static void    knote_init(void);
@@ -108,9 +126,9 @@ static void filt_timerdetach(struct knote *kn);
 static int     filt_timer(struct knote *kn, long hint);
 
 static struct filterops file_filtops =
-       { 1, filt_fileattach, NULL, NULL };
+       { FILTEROP_ISFD, filt_fileattach, NULL, NULL };
 static struct filterops kqread_filtops =
-       { 1, NULL, filt_kqdetach, filt_kqueue };
+       { FILTEROP_ISFD, NULL, filt_kqdetach, filt_kqueue };
 static struct filterops proc_filtops =
        { 0, filt_procattach, filt_procdetach, filt_proc };
 static struct filterops timer_filtops =
@@ -121,6 +139,9 @@ static int          kq_ncallouts = 0;
 static int             kq_calloutmax = (4 * 1024);
 SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
     &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
+static int             kq_checkloop = 1000000;
+SYSCTL_INT(_kern, OID_AUTO, kq_checkloop, CTLFLAG_RW,
+    &kq_checkloop, 0, "Maximum number of kevent collection loops before panicking");
 
 #define KNOTE_ACTIVATE(kn) do {                                        \
        kn->kn_status |= KN_ACTIVE;                                     \
@@ -145,6 +166,7 @@ static struct filterops *sysfilt_ops[] = {
        &proc_filtops,                  /* EVFILT_PROC */
        &sig_filtops,                   /* EVFILT_SIGNAL */
        &timer_filtops,                 /* EVFILT_TIMER */
+       &file_filtops,                  /* EVFILT_EXCEPT */
 };
 
 static int
@@ -154,22 +176,18 @@ filt_fileattach(struct knote *kn)
 }
 
 /*
- * MPALMOSTSAFE - acquires mplock
+ * MPSAFE
  */
 static int
 kqueue_kqfilter(struct file *fp, struct knote *kn)
 {
        struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
 
-       get_mplock();
-       if (kn->kn_filter != EVFILT_READ) {
-               rel_mplock();
-               return (1);
-       }
+       if (kn->kn_filter != EVFILT_READ)
+               return (EOPNOTSUPP);
 
        kn->kn_fop = &kqread_filtops;
-       SLIST_INSERT_HEAD(&kq->kq_sel.si_note, kn, kn_selnext);
-       rel_mplock();
+       knote_insert(&kq->kq_kqinfo.ki_note, kn);
        return (0);
 }
 
@@ -178,7 +196,7 @@ filt_kqdetach(struct knote *kn)
 {
        struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
 
-       SLIST_REMOVE(&kq->kq_sel.si_note, kn, knote, kn_selnext);
+       knote_remove(&kq->kq_kqinfo.ki_note, kn);
 }
 
 /*ARGSUSED*/
@@ -198,15 +216,20 @@ filt_procattach(struct knote *kn)
        int immediate;
 
        immediate = 0;
+       lwkt_gettoken(&proc_token);
        p = pfind(kn->kn_id);
        if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
                p = zpfind(kn->kn_id);
                immediate = 1;
        }
-       if (p == NULL)
+       if (p == NULL) {
+               lwkt_reltoken(&proc_token);
                return (ESRCH);
-       if (!PRISON_CHECK(curthread->td_ucred, p->p_ucred))
+       }
+       if (!PRISON_CHECK(curthread->td_ucred, p->p_ucred)) {
+               lwkt_reltoken(&proc_token);
                return (EACCES);
+       }
 
        kn->kn_ptr.p_proc = p;
        kn->kn_flags |= EV_CLEAR;               /* automatically set */
@@ -220,16 +243,16 @@ filt_procattach(struct knote *kn)
                kn->kn_flags &= ~EV_FLAG1;
        }
 
-       /* XXX lock the proc here while adding to the list? */
-       SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
+       knote_insert(&p->p_klist, kn);
 
        /*
         * Immediately activate any exit notes if the target process is a
         * zombie.  This is necessary to handle the case where the target
-        * process, e.g. a child, dies before the kevent is registered.
+        * process, e.g. a child, dies before the kevent is registered.
         */
        if (immediate && filt_proc(kn, NOTE_EXIT))
                KNOTE_ACTIVATE(kn);
+       lwkt_reltoken(&proc_token);
 
        return (0);
 }
@@ -249,9 +272,9 @@ filt_procdetach(struct knote *kn)
 
        if (kn->kn_status & KN_DETACHED)
                return;
-       /* XXX locking?  this might modify another process. */
+       /* XXX locking? take proc_token here? */
        p = kn->kn_ptr.p_proc;
-       SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
+       knote_remove(&p->p_klist, kn);
 }
 
 static int
@@ -278,7 +301,7 @@ filt_proc(struct knote *kn, long hint)
        if (event == NOTE_EXIT) {
                struct proc *p = kn->kn_ptr.p_proc;
                if ((kn->kn_status & KN_DETACHED) == 0) {
-                       SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
+                       knote_remove(&p->p_klist, kn);
                        kn->kn_status |= KN_DETACHED;
                        kn->kn_data = p->p_xstat;
                        kn->kn_ptr.p_proc = NULL;
@@ -313,6 +336,12 @@ filt_proc(struct knote *kn, long hint)
        return (kn->kn_fflags != 0);
 }
 
+/*
+ * The callout interlocks with callout_stop() (or should), so the
+ * knote should still be a valid structure.  However the timeout
+ * can race a deletion so if KN_DELETING is set we just don't touch
+ * the knote.
+ */
 static void
 filt_timerexpire(void *knx)
 {
@@ -321,16 +350,20 @@ filt_timerexpire(void *knx)
        struct timeval tv;
        int tticks;
 
-       kn->kn_data++;
-       KNOTE_ACTIVATE(kn);
+       lwkt_gettoken(&kq_token);
+       if ((kn->kn_status & KN_DELETING) == 0) {
+               kn->kn_data++;
+               KNOTE_ACTIVATE(kn);
 
-       if ((kn->kn_flags & EV_ONESHOT) == 0) {
-               tv.tv_sec = kn->kn_sdata / 1000;
-               tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
-               tticks = tvtohz_high(&tv);
-               calloutp = (struct callout *)kn->kn_hook;
-               callout_reset(calloutp, tticks, filt_timerexpire, kn);
+               if ((kn->kn_flags & EV_ONESHOT) == 0) {
+                       tv.tv_sec = kn->kn_sdata / 1000;
+                       tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
+                       tticks = tvtohz_high(&tv);
+                       calloutp = (struct callout *)kn->kn_hook;
+                       callout_reset(calloutp, tticks, filt_timerexpire, kn);
+               }
        }
+       lwkt_reltoken(&kq_token);
 }
 
 /*
@@ -379,6 +412,55 @@ filt_timer(struct knote *kn, long hint)
        return (kn->kn_data != 0);
 }
 
+/*
+ * Acquire a knote, return non-zero on success, 0 on failure.
+ *
+ * If we cannot acquire the knote we sleep and return 0.  The knote
+ * may be stale on return in this case and the caller must restart
+ * whatever loop they are in.
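+ *
+ * A typical caller pattern (cf. knote_empty() below) simply retries
+ * its loop when the acquire fails:
+ *
+ *     while ((kn = SLIST_FIRST(list)) != NULL) {
+ *             if (knote_acquire(kn))
+ *                     knote_detach_and_drop(kn);
+ *     }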
+ */
+static __inline
+int
+knote_acquire(struct knote *kn)
+{
+       if (kn->kn_status & KN_PROCESSING) {
+               kn->kn_status |= KN_WAITING | KN_REPROCESS;
+               tsleep(kn, 0, "kqepts", hz);
+               /* knote may be stale now */
+               return(0);
+       }
+       kn->kn_status |= KN_PROCESSING;
+       return(1);
+}
+
+/*
+ * Release an acquired knote, clearing KN_PROCESSING and handling any
+ * KN_REPROCESS events.
+ *
+ * Non-zero is returned if the knote is destroyed.
+ */
+static __inline
+int
+knote_release(struct knote *kn)
+{
+       while (kn->kn_status & KN_REPROCESS) {
+               kn->kn_status &= ~KN_REPROCESS;
+               if (kn->kn_status & KN_WAITING) {
+                       kn->kn_status &= ~KN_WAITING;
+                       wakeup(kn);
+               }
+               if (kn->kn_status & KN_DELETING) {
+                       knote_detach_and_drop(kn);
+                       return(1);
+                       /* NOT REACHED */
+               }
+               if (filter_event(kn, 0))
+                       KNOTE_ACTIVATE(kn);
+       }
+       kn->kn_status &= ~KN_PROCESSING;
+       return(0);
+}
+
 /*
  * Initialize a kqueue.
  *
@@ -391,43 +473,34 @@ kqueue_init(struct kqueue *kq, struct filedesc *fdp)
 {
        TAILQ_INIT(&kq->kq_knpend);
        TAILQ_INIT(&kq->kq_knlist);
+       kq->kq_count = 0;
        kq->kq_fdp = fdp;
+       SLIST_INIT(&kq->kq_kqinfo.ki_note);
 }
 
 /*
  * Terminate a kqueue.  Freeing the actual kq itself is left up to the
  * caller (it might be embedded in a lwp so we don't do it here).
+ *
+ * The kq's knlist must be completely eradicated so block on any
+ * processing races.
  */
 void
 kqueue_terminate(struct kqueue *kq)
 {
        struct knote *kn;
-       struct klist *list;
-       int hv;
 
+       lwkt_gettoken(&kq_token);
        while ((kn = TAILQ_FIRST(&kq->kq_knlist)) != NULL) {
-               kn->kn_fop->f_detach(kn);
-               if (kn->kn_fop->f_isfd) {
-                       list = &kn->kn_fp->f_klist;
-                       SLIST_REMOVE(list, kn, knote, kn_link);
-                       fdrop(kn->kn_fp);
-                       kn->kn_fp = NULL;
-               } else {
-                       hv = KN_HASH(kn->kn_id, kq->kq_knhashmask);
-                       list = &kq->kq_knhash[hv];
-                       SLIST_REMOVE(list, kn, knote, kn_link);
-               }
-               TAILQ_REMOVE(&kq->kq_knlist, kn, kn_kqlink);
-               if (kn->kn_status & KN_QUEUED)
-                       knote_dequeue(kn);
-               knote_free(kn);
+               if (knote_acquire(kn))
+                       knote_detach_and_drop(kn);
        }
-
        if (kq->kq_knhash) {
                kfree(kq->kq_knhash, M_KQUEUE);
                kq->kq_knhash = NULL;
                kq->kq_knhashmask = 0;
        }
+       lwkt_reltoken(&kq_token);
 }
 
 /*
@@ -459,70 +532,106 @@ sys_kqueue(struct kqueue_args *uap)
 }
 
 /*
- * MPALMOSTSAFE
+ * Copy 'count' items into the destination list pointed to by uap->eventlist.
  */
-int
-sys_kevent(struct kevent_args *uap)
+static int
+kevent_copyout(void *arg, struct kevent *kevp, int count, int *res)
 {
-       struct thread *td = curthread;
-       struct proc *p = td->td_proc;
-       struct kevent *kevp;
-       struct kqueue *kq;
-       struct file *fp = NULL;
-       struct timespec ts;
-       struct timespec *tsp;
-       int i, n, total, nerrors, error;
-       struct kevent kev[KQ_NEVENTS];
+       struct kevent_copyin_args *kap;
+       int error;
 
-       fp = holdfp(p->p_fd, uap->fd, -1);
-       if (fp == NULL)
-               return (EBADF);
-       if (fp->f_type != DTYPE_KQUEUE) {
-               fdrop(fp);
-               return (EBADF);
-       }
+       kap = (struct kevent_copyin_args *)arg;
 
-       if (uap->timeout) {
-               error = copyin(uap->timeout, &ts, sizeof(ts));
-               if (error)
-                       goto done;
-               tsp = &ts;
+       error = copyout(kevp, kap->ka->eventlist, count * sizeof(*kevp));
+       if (error == 0) {
+               kap->ka->eventlist += count;
+               *res += count;
        } else {
-               tsp = NULL;
+               *res = -1;
        }
 
-       kq = (struct kqueue *)fp->f_data;
-       nerrors = 0;
+       return (error);
+}
+
+/*
+ * Copy at most 'max' items from the list pointed to by kap->changelist,
+ * return number of items in 'events'.
+ */
+static int
+kevent_copyin(void *arg, struct kevent *kevp, int max, int *events)
+{
+       struct kevent_copyin_args *kap;
+       int error, count;
+
+       kap = (struct kevent_copyin_args *)arg;
+
+       count = min(kap->ka->nchanges - kap->pchanges, max);
+       error = copyin(kap->ka->changelist, kevp, count * sizeof *kevp);
+       if (error == 0) {
+               kap->ka->changelist += count;
+               kap->pchanges += count;
+               *events = count;
+       }
 
-       get_mplock();
-       while (uap->nchanges > 0) {
-               n = uap->nchanges > KQ_NEVENTS ? KQ_NEVENTS : uap->nchanges;
-               error = copyin(uap->changelist, kev, n * sizeof(struct kevent));
+       return (error);
+}
+
+/*
+ * MPSAFE
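+ *
+ * Common core for kevent(2).  The caller supplies the copyin/copyout
+ * callbacks (sys_kevent() passes kevent_copyin()/kevent_copyout()), which
+ * also lets the select/poll support code referenced in the notes below
+ * (doselect()/sys_poll()) drive the same collection loop.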
+ */
+int
+kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap,
+           k_copyin_fn kevent_copyinfn, k_copyout_fn kevent_copyoutfn,
+           struct timespec *tsp_in)
+{
+       struct kevent *kevp;
+       struct timespec *tsp;
+       int i, n, total, error, nerrors = 0;
+       int lres;
+       int limit = kq_checkloop;
+       struct kevent kev[KQ_NEVENTS];
+       struct knote marker;
+
+       tsp = tsp_in;
+       *res = 0;
+
+       lwkt_gettoken(&kq_token);
+       for ( ;; ) {
+               n = 0;
+               error = kevent_copyinfn(uap, kev, KQ_NEVENTS, &n);
                if (error)
                        goto done;
+               if (n == 0)
+                       break;
                for (i = 0; i < n; i++) {
                        kevp = &kev[i];
                        kevp->flags &= ~EV_SYSFLAGS;
                        error = kqueue_register(kq, kevp);
+
+                       /*
+                        * If a registration returns an error we
+                        * immediately post the error.  The kevent()
+                        * call itself will fail with the error if
+                        * no space is available for posting.
+                        *
+                        * Such errors normally bypass the timeout/blocking
+                        * code.  However, if the copyoutfn function refuses
+                        * to post the error (see sys_poll()), then we
+                        * ignore it too.
+                        */
                        if (error) {
-                               if (uap->nevents != 0) {
-                                       kevp->flags = EV_ERROR;
-                                       kevp->data = error;
-                                       copyout(kevp, uap->eventlist,
-                                               sizeof(*kevp));
-                                       uap->eventlist++;
-                                       uap->nevents--;
+                               kevp->flags = EV_ERROR;
+                               kevp->data = error;
+                               lres = *res;
+                               kevent_copyoutfn(uap, kevp, 1, res);
+                               if (lres != *res) {
+                                       nevents--;
                                        nerrors++;
-                               } else {
-                                       goto done;
                                }
                        }
                }
-               uap->nchanges -= n;
-               uap->changelist += n;
        }
        if (nerrors) {
-               uap->sysmsg_result = nerrors;
                error = 0;
                goto done;
        }
@@ -542,31 +651,139 @@ sys_kevent(struct kevent_args *uap)
        /*
         * Loop as required.
         *
-        * Collect as many events as we can.  The timeout on successive
-        * loops is disabled (kqueue_scan() becomes non-blocking).
+        * Collect as many events as we can. Sleeping on successive
+        * loops is disabled if copyoutfn has incremented (*res).
+        *
+        * The loop stops if an error occurs, all events have been
+        * scanned (the marker has been reached), or fewer than the
+        * maximum number of events is found.
+        *
+        * The copyoutfn function does not have to increment (*res) in
+        * order for the loop to continue.
+        *
+        * NOTE: doselect() usually passes 0x7FFFFFFF for nevents.
         */
        total = 0;
        error = 0;
-       while ((n = uap->nevents - total) > 0) {
+       marker.kn_filter = EVFILT_MARKER;
+       marker.kn_status = KN_PROCESSING;
+       TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
+       while ((n = nevents - total) > 0) {
                if (n > KQ_NEVENTS)
                        n = KQ_NEVENTS;
-               i = kqueue_scan(kq, kev, n, tsp, &error);
-               if (i == 0)
-                       break;
-               error = copyout(kev, uap->eventlist + total,
-                               (size_t)i * sizeof(struct kevent));
-               total += i;
-               if (error || i != n)
+
+               /*
+                * If no events are pending sleep until timeout (if any)
+                * or an event occurs.
+                *
+                * After the sleep completes the marker is moved to the
+                * end of the list, making any received events available
+                * to our scan.
+                */
+               if (kq->kq_count == 0 && *res == 0) {
+                       error = kqueue_sleep(kq, tsp);
+                       if (error)
+                               break;
+
+                       TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
+                       TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
+               }
+
+               /*
+                * Process all received events
+                * Account for all non-spurious events in our total
+                */
+               i = kqueue_scan(kq, kev, n, &marker);
+               if (i) {
+                       lres = *res;
+                       error = kevent_copyoutfn(uap, kev, i, res);
+                       total += *res - lres;
+                       if (error)
+                               break;
+               }
+               if (limit && --limit == 0)
+                       panic("kqueue: checkloop failed i=%d", i);
+
+               /*
+                * Normally when fewer events are returned than requested
+                * we can stop.  However, if only spurious events were
+                * collected the copyout will not bump (*res) and we have
+                * to continue.
+                */
+               if (i < n && *res)
                        break;
-               tsp = &ts;              /* successive loops non-blocking */
-               tsp->tv_sec = 0;
-               tsp->tv_nsec = 0;
+
+               /*
+                * Deal with an edge case where spurious events can cause
+                * a loop to occur without moving the marker.  This can
+                * prevent kqueue_scan() from picking up new events which
+                * race us.  We must be sure to move the marker for this
+                * case.
+                *
+                * NOTE: We do not want to move the marker if events
+                *       were scanned because normal kqueue operations
+                *       may reactivate events.  Moving the marker in
+                *       that case could result in duplicates for the
+                *       same event.
+                */
+               if (i == 0) {
+                       TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
+                       TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
+               }
        }
-       uap->sysmsg_result = total;
+       TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
+
+       /* Timeouts do not return EWOULDBLOCK. */
+       if (error == EWOULDBLOCK)
+               error = 0;
+
 done:
-       rel_mplock();
-       if (fp != NULL)
+       lwkt_reltoken(&kq_token);
+       return (error);
+}
+
+/*
+ * MPALMOSTSAFE
+ */
+int
+sys_kevent(struct kevent_args *uap)
+{
+       struct thread *td = curthread;
+       struct proc *p = td->td_proc;
+       struct timespec ts, *tsp;
+       struct kqueue *kq;
+       struct file *fp = NULL;
+       struct kevent_copyin_args *kap, ka;
+       int error;
+
+       if (uap->timeout) {
+               error = copyin(uap->timeout, &ts, sizeof(ts));
+               if (error)
+                       return (error);
+               tsp = &ts;
+       } else {
+               tsp = NULL;
+       }
+
+       fp = holdfp(p->p_fd, uap->fd, -1);
+       if (fp == NULL)
+               return (EBADF);
+       if (fp->f_type != DTYPE_KQUEUE) {
                fdrop(fp);
+               return (EBADF);
+       }
+
+       kq = (struct kqueue *)fp->f_data;
+
+       kap = &ka;
+       kap->ka = uap;
+       kap->pchanges = 0;
+
+       error = kern_kevent(kq, uap->nevents, &uap->sysmsg_result, kap,
+                           kevent_copyin, kevent_copyout, tsp);
+
+       fdrop(fp);
+
        return (error);
 }
 
@@ -593,16 +810,22 @@ kqueue_register(struct kqueue *kq, struct kevent *kev)
                return (EINVAL);
        }
 
-       if (fops->f_isfd) {
+       lwkt_gettoken(&kq_token);
+       if (fops->f_flags & FILTEROP_ISFD) {
                /* validate descriptor */
                fp = holdfp(fdp, kev->ident, -1);
-               if (fp == NULL)
+               if (fp == NULL) {
+                       lwkt_reltoken(&kq_token);
                        return (EBADF);
+               }
 
+again1:
                SLIST_FOREACH(kn, &fp->f_klist, kn_link) {
                        if (kn->kn_kq == kq &&
                            kn->kn_filter == kev->filter &&
                            kn->kn_id == kev->ident) {
+                               if (knote_acquire(kn) == 0)
+                                       goto again1;
                                break;
                        }
                }
@@ -612,14 +835,22 @@ kqueue_register(struct kqueue *kq, struct kevent *kev)
                        
                        list = &kq->kq_knhash[
                            KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
+again2:
                        SLIST_FOREACH(kn, list, kn_link) {
                                if (kn->kn_id == kev->ident &&
-                                   kn->kn_filter == kev->filter)
+                                   kn->kn_filter == kev->filter) {
+                                       if (knote_acquire(kn) == 0)
+                                               goto again2;
                                        break;
+                               }
                        }
                }
        }
 
+       /*
+        * NOTE: At this point if kn is non-NULL we will have acquired
+        *       it and set KN_PROCESSING.
+        */
        if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
                error = ENOENT;
                goto done;
@@ -651,152 +882,249 @@ kqueue_register(struct kqueue *kq, struct kevent *kev)
                        kev->data = 0;
                        kn->kn_kevent = *kev;
 
+                       /*
+                        * KN_PROCESSING prevents the knote from getting
+                        * ripped out from under us while we are trying
+                        * to attach it, in case the attach blocks.
+                        */
+                       kn->kn_status = KN_PROCESSING;
                        knote_attach(kn);
-                       if ((error = fops->f_attach(kn)) != 0) {
+                       if ((error = filter_attach(kn)) != 0) {
+                               kn->kn_status |= KN_DELETING | KN_REPROCESS;
                                knote_drop(kn);
                                goto done;
                        }
+
+                       /*
+                        * Interlock against close races which either tried
+                        * to remove our knote while we were blocked or missed
+                        * it entirely prior to our attachment.  We do not
+                        * want to end up with a knote on a closed descriptor.
+                        */
+                       if ((fops->f_flags & FILTEROP_ISFD) &&
+                           checkfdclosed(fdp, kev->ident, kn->kn_fp)) {
+                               kn->kn_status |= KN_DELETING | KN_REPROCESS;
+                       }
                } else {
                        /*
                         * The user may change some filter values after the
                         * initial EV_ADD, but doing so will not reset any
                         * filters which have already been triggered.
                         */
+                       KKASSERT(kn->kn_status & KN_PROCESSING);
                        kn->kn_sfflags = kev->fflags;
                        kn->kn_sdata = kev->data;
                        kn->kn_kevent.udata = kev->udata;
                }
 
-               crit_enter();
-               if (kn->kn_fop->f_event(kn, 0))
-                       KNOTE_ACTIVATE(kn);
-               crit_exit();
+               /*
+                * Execute the filter event to immediately activate the
+                * knote if necessary.  If reprocessing events are pending
+                * due to blocking above we do not run the filter here
+                * but instead let knote_release() do it.  Otherwise we
+                * might run the filter on a deleted event.
+                */
+               if ((kn->kn_status & KN_REPROCESS) == 0) {
+                       if (filter_event(kn, 0))
+                               KNOTE_ACTIVATE(kn);
+               }
        } else if (kev->flags & EV_DELETE) {
-               kn->kn_fop->f_detach(kn);
-               knote_drop(kn);
+               /*
+                * Delete the existing knote
+                */
+               knote_detach_and_drop(kn);
                goto done;
        }
 
+       /*
+        * Disablement does not deactivate a knote here.
+        */
        if ((kev->flags & EV_DISABLE) &&
            ((kn->kn_status & KN_DISABLED) == 0)) {
-               crit_enter();
                kn->kn_status |= KN_DISABLED;
-               crit_exit();
        }
 
+       /*
+        * Re-enablement may have to immediately enqueue an active knote.
+        */
        if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
-               crit_enter();
                kn->kn_status &= ~KN_DISABLED;
                if ((kn->kn_status & KN_ACTIVE) &&
-                   ((kn->kn_status & KN_QUEUED) == 0))
+                   ((kn->kn_status & KN_QUEUED) == 0)) {
                        knote_enqueue(kn);
-               crit_exit();
+               }
        }
 
+       /*
+        * Handle any required reprocessing
+        */
+       knote_release(kn);
+       /* kn may be invalid now */
+
 done:
+       lwkt_reltoken(&kq_token);
        if (fp != NULL)
                fdrop(fp);
        return (error);
 }
 
 /*
- * Scan the kqueue, blocking if necessary until the target time is reached.
+ * Block as necessary until the target time is reached.
  * If tsp is NULL we block indefinitely.  If tsp->tv_sec/tv_nsec are both
  * 0 we do not block at all.
  */
 static int
-kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
-           struct timespec *tsp, int *errorp)
+kqueue_sleep(struct kqueue *kq, struct timespec *tsp)
 {
-       struct knote *kn, marker;
-       int total;
+       int error = 0;
 
-       total = 0;
-again:
-       crit_enter();
-       if (kq->kq_count == 0) {
-               if (tsp == NULL) {
-                       kq->kq_state |= KQ_SLEEP;
-                       *errorp = tsleep(kq, PCATCH, "kqread", 0);
-               } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
-                       *errorp = EWOULDBLOCK;
-               } else {
-                       struct timespec ats;
-                       struct timespec atx = *tsp;
-                       int timeout;
+       if (tsp == NULL) {
+               kq->kq_state |= KQ_SLEEP;
+               error = tsleep(kq, PCATCH, "kqread", 0);
+       } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
+               error = EWOULDBLOCK;
+       } else {
+               struct timespec ats;
+               struct timespec atx = *tsp;
+               int timeout;
 
-                       nanouptime(&ats);
-                       timespecsub(&atx, &ats);
-                       if (ats.tv_sec < 0) {
-                               *errorp = EWOULDBLOCK;
-                       } else {
-                               timeout = atx.tv_sec > 24 * 60 * 60 ?
-                                       24 * 60 * 60 * hz : tstohz_high(&atx);
-                               kq->kq_state |= KQ_SLEEP;
-                               *errorp = tsleep(kq, PCATCH, "kqread", timeout);
-                       }
+               nanouptime(&ats);
+               timespecsub(&atx, &ats);
+               if (ats.tv_sec < 0) {
+                       error = EWOULDBLOCK;
+               } else {
+                       timeout = atx.tv_sec > 24 * 60 * 60 ?
+                               24 * 60 * 60 * hz : tstohz_high(&atx);
+                       kq->kq_state |= KQ_SLEEP;
+                       error = tsleep(kq, PCATCH, "kqread", timeout);
                }
-               crit_exit();
-               if (*errorp == 0)
-                       goto again;
-               /* don't restart after signals... */
-               if (*errorp == ERESTART)
-                       *errorp = EINTR;
-               else if (*errorp == EWOULDBLOCK)
-                       *errorp = 0;
-               goto done;
        }
 
+       /* don't restart after signals... */
+       if (error == ERESTART)
+               return (EINTR);
+
+       return (error);
+}
+
+/*
+ * Scan the kqueue, return the number of active events placed in kevp up
+ * to count.
+ *
+ * Continuous mode events may get recycled, do not continue scanning past
+ * marker unless no events have been collected.
+ */
+static int
+kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
+            struct knote *marker)
+{
+        struct knote *kn, local_marker;
+        int total;
+
+        total = 0;
+       local_marker.kn_filter = EVFILT_MARKER;
+       local_marker.kn_status = KN_PROCESSING;
+
        /*
-        * Collect events.  Continuous mode events may get recycled
-        * past the marker so we stop when we hit it unless no events
-        * have been collected.
+        * Collect events.
         */
-       TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
+       TAILQ_INSERT_HEAD(&kq->kq_knpend, &local_marker, kn_tqe);
        while (count) {
-               kn = TAILQ_FIRST(&kq->kq_knpend);
-               if (kn == &marker)
-                       break;
-               TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
-               if (kn->kn_status & KN_DISABLED) {
-                       kn->kn_status &= ~KN_QUEUED;
-                       kq->kq_count--;
+               kn = TAILQ_NEXT(&local_marker, kn_tqe);
+               if (kn->kn_filter == EVFILT_MARKER) {
+                       /* Marker reached, we are done */
+                       if (kn == marker)
+                               break;
+
+                       /* Move local marker past some other thread's marker */
+                       kn = TAILQ_NEXT(kn, kn_tqe);
+                       TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe);
+                       TAILQ_INSERT_BEFORE(kn, &local_marker, kn_tqe);
                        continue;
                }
-               if ((kn->kn_flags & EV_ONESHOT) == 0 &&
-                   kn->kn_fop->f_event(kn, 0) == 0) {
-                       kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
-                       kq->kq_count--;
+
+               /*
+                * We can't skip a knote undergoing processing, otherwise
+                * we risk not returning it when the user process expects
+                * it to be returned.  Sleep and retry.
+                */
+               if (knote_acquire(kn) == 0)
                        continue;
-               }
-               *kevp++ = kn->kn_kevent;
-               ++total;
-               --count;
 
                /*
-                * Post-event action on the note
+                * Remove the event for processing.
+                *
+                * WARNING!  We must leave KN_QUEUED set to prevent the
+                *           event from being KNOTE_ACTIVATE()d while
+                *           the queue state is in limbo, in case we
+                *           block.
+                *
+                * WARNING!  We must set KN_PROCESSING to avoid races
+                *           against deletion or another thread's
+                *           processing.
                 */
-               if (kn->kn_flags & EV_ONESHOT) {
+               TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
+               kq->kq_count--;
+
+               /*
+                * We have to deal with an extremely important race against
+                * file descriptor close()s here.  The file descriptor can
+                * disappear MPSAFE, and there is a small window of
+                * opportunity between that and the call to knote_fdclose().
+                *
+                * If we hit that window here while doselect or dopoll is
+                * trying to delete a spurious event they will not be able
+                * to match up the event against a knote and will go haywire.
+                */
+               if ((kn->kn_fop->f_flags & FILTEROP_ISFD) &&
+                   checkfdclosed(kq->kq_fdp, kn->kn_kevent.ident, kn->kn_fp)) {
+                       kn->kn_status |= KN_DELETING | KN_REPROCESS;
+               }
+
+               if (kn->kn_status & KN_DISABLED) {
+                       /*
+                        * If disabled we ensure the event is not queued
+                        * but leave its active bit set.  On re-enablement
+                        * the event may be immediately triggered.
+                        */
                        kn->kn_status &= ~KN_QUEUED;
-                       kq->kq_count--;
-                       crit_exit();
-                       kn->kn_fop->f_detach(kn);
-                       knote_drop(kn);
-                       crit_enter();
-               } else if (kn->kn_flags & EV_CLEAR) {
-                       kn->kn_data = 0;
-                       kn->kn_fflags = 0;
+               } else if ((kn->kn_flags & EV_ONESHOT) == 0 &&
+                          (kn->kn_status & KN_DELETING) == 0 &&
+                          filter_event(kn, 0) == 0) {
+                       /*
+                        * If not running in one-shot mode and the event
+                        * is no longer present we ensure it is removed
+                        * from the queue and ignore it.
+                        */
                        kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
-                       kq->kq_count--;
                } else {
-                       TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe);
+                       /*
+                        * Post the event
+                        */
+                       *kevp++ = kn->kn_kevent;
+                       ++total;
+                       --count;
+
+                       if (kn->kn_flags & EV_ONESHOT) {
+                               kn->kn_status &= ~KN_QUEUED;
+                               kn->kn_status |= KN_DELETING | KN_REPROCESS;
+                       } else if (kn->kn_flags & EV_CLEAR) {
+                               kn->kn_data = 0;
+                               kn->kn_fflags = 0;
+                               kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
+                       } else {
+                               TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe);
+                               kq->kq_count++;
+                       }
                }
+
+               /*
+                * Handle any post-processing states
+                */
+               knote_release(kn);
        }
-       TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
-       crit_exit();
-       if (total == 0)
-               goto again;
-done:
+       TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe);
+
        return (total);
 }
 
@@ -831,7 +1159,7 @@ kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
        struct kqueue *kq;
        int error;
 
-       get_mplock();
+       lwkt_gettoken(&kq_token);
        kq = (struct kqueue *)fp->f_data;
 
        switch(com) {
@@ -849,34 +1177,10 @@ kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
                error = ENOTTY;
                break;
        }
-       rel_mplock();
+       lwkt_reltoken(&kq_token);
        return (error);
 }
 
-/*
- * MPALMOSTSAFE - acquires mplock
- */
-static int
-kqueue_poll(struct file *fp, int events, struct ucred *cred)
-{
-       struct kqueue *kq = (struct kqueue *)fp->f_data;
-       int revents = 0;
-
-       get_mplock();
-       crit_enter();
-        if (events & (POLLIN | POLLRDNORM)) {
-                if (kq->kq_count) {
-                        revents |= events & (POLLIN | POLLRDNORM);
-               } else {
-                        selrecord(curthread, &kq->kq_sel);
-                       kq->kq_state |= KQ_SEL;
-               }
-       }
-       crit_exit();
-       rel_mplock();
-       return (revents);
-}
-
 /*
  * MPSAFE
  */
@@ -893,20 +1197,17 @@ kqueue_stat(struct file *fp, struct stat *st, struct ucred *cred)
 }
 
 /*
- * MPALMOSTSAFE - acquires mplock
+ * MPSAFE
  */
 static int
 kqueue_close(struct file *fp)
 {
        struct kqueue *kq = (struct kqueue *)fp->f_data;
 
-       get_mplock();
-
        kqueue_terminate(kq);
 
        fp->f_data = NULL;
        funsetown(kq->kq_sigio);
-       rel_mplock();
 
        kfree(kq, M_KQUEUE);
        return (0);
@@ -919,65 +1220,228 @@ kqueue_wakeup(struct kqueue *kq)
                kq->kq_state &= ~KQ_SLEEP;
                wakeup(kq);
        }
-       if (kq->kq_state & KQ_SEL) {
-               kq->kq_state &= ~KQ_SEL;
-               selwakeup(&kq->kq_sel);
+       KNOTE(&kq->kq_kqinfo.ki_note, 0);
+}
+
+/*
+ * Calls filterops f_attach function, acquiring mplock if filter is not
+ * marked as FILTEROP_MPSAFE.
+ */
+static int
+filter_attach(struct knote *kn)
+{
+       int ret;
+
+       if (!(kn->kn_fop->f_flags & FILTEROP_MPSAFE)) {
+               get_mplock();
+               ret = kn->kn_fop->f_attach(kn);
+               rel_mplock();
+       } else {
+               ret = kn->kn_fop->f_attach(kn);
+       }
+
+       return (ret);
+}
+
+/*
+ * Detach the knote and drop it, destroying the knote.
+ *
+ * Calls filterops f_detach function, acquiring mplock if filter is not
+ * marked as FILTEROP_MPSAFE.
+ */
+static void
+knote_detach_and_drop(struct knote *kn)
+{
+       kn->kn_status |= KN_DELETING | KN_REPROCESS;
+       if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
+               kn->kn_fop->f_detach(kn);
+       } else {
+               get_mplock();
+               kn->kn_fop->f_detach(kn);
+               rel_mplock();
        }
-       KNOTE(&kq->kq_sel.si_note, 0);
+       knote_drop(kn);
 }
 
 /*
- * walk down a list of knotes, activating them if their event has triggered.
+ * Calls filterops f_event function, acquiring mplock if filter is not
+ * marked as FILTEROP_MPSAFE.
+ *
+ * If the knote is in the middle of being created or deleted we cannot
+ * safely call the filter op.
+ */
+static int
+filter_event(struct knote *kn, long hint)
+{
+       int ret;
+
+       if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
+               ret = kn->kn_fop->f_event(kn, hint);
+       } else {
+               get_mplock();
+               ret = kn->kn_fop->f_event(kn, hint);
+               rel_mplock();
+       }
+       return (ret);
+}
+
+/*
+ * Walk down a list of knotes, activating them if their event has triggered.
+ *
+ * If we encounter any knotes which are undergoing processing we just mark
+ * them for reprocessing and do not try to [re]activate the knote.  However,
+ * if a hint is being passed we have to wait and that makes things a bit
+ * sticky.
  */
 void
 knote(struct klist *list, long hint)
 {
        struct knote *kn;
 
-       SLIST_FOREACH(kn, list, kn_selnext)
-               if (kn->kn_fop->f_event(kn, hint))
-                       KNOTE_ACTIVATE(kn);
+       lwkt_gettoken(&kq_token);
+restart:
+       SLIST_FOREACH(kn, list, kn_next) {
+               if (kn->kn_status & KN_PROCESSING) {
+                       /*
+                        * Someone else is processing the knote, ask the
+                        * other thread to reprocess it and don't mess
+                        * with it otherwise.
+                        */
+                       if (hint == 0) {
+                               kn->kn_status |= KN_REPROCESS;
+                               continue;
+                       }
+
+                       /*
+                        * If the hint is non-zero we have to wait or risk
+                        * losing the state the caller is trying to update.
+                        *
+                        * XXX This is a real problem, certain process
+                        *     and signal filters will bump kn_data for
+                        *     already-processed notes more than once if
+                        *     we restart the list scan.  FIXME.
+                        */
+                       kn->kn_status |= KN_WAITING | KN_REPROCESS;
+                       tsleep(kn, 0, "knotec", hz);
+                       goto restart;
+               }
+
+               /*
+                * Become the reprocessing master ourselves.
+                *
+                * If hint is non-zero, running the event is mandatory
+                * when not deleting, so do it whether reprocessing is
+                * set or not.
+                */
+               kn->kn_status |= KN_PROCESSING;
+               if ((kn->kn_status & KN_DELETING) == 0) {
+                       if (filter_event(kn, hint))
+                               KNOTE_ACTIVATE(kn);
+               }
+               if (knote_release(kn))
+                       goto restart;
+       }
+       lwkt_reltoken(&kq_token);
 }
 
 /*
- * remove all knotes from a specified klist
+ * Insert knote at head of klist.
+ *
+ * This function may only be called via a filter function and thus
+ * kq_token should already be held and marked for processing.
+ */
+void
+knote_insert(struct klist *klist, struct knote *kn)
+{
+       KKASSERT(kn->kn_status & KN_PROCESSING);
+       ASSERT_LWKT_TOKEN_HELD(&kq_token);
+       SLIST_INSERT_HEAD(klist, kn, kn_next);
+}
+
+/*
+ * Remove knote from a klist
+ *
+ * This function may only be called via a filter function and thus
+ * kq_token should already be held and marked for processing.
  */
 void
-knote_remove(struct klist *list)
+knote_remove(struct klist *klist, struct knote *kn)
+{
+       KKASSERT(kn->kn_status & KN_PROCESSING);
+       ASSERT_LWKT_TOKEN_HELD(&kq_token);
+       SLIST_REMOVE(klist, kn, knote, kn_next);
+}
+
+/*
+ * Remove all knotes from a specified klist
+ *
+ * Only called from aio.
+ */
+void
+knote_empty(struct klist *list)
 {
        struct knote *kn;
 
+       lwkt_gettoken(&kq_token);
        while ((kn = SLIST_FIRST(list)) != NULL) {
-               kn->kn_fop->f_detach(kn);
-               knote_drop(kn);
+               if (knote_acquire(kn))
+                       knote_detach_and_drop(kn);
+       }
+       lwkt_reltoken(&kq_token);
+}
+
+void
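+/*
+ * Move all knotes from the src kqinfo's note list to the dst kqinfo's
+ * note list, switching them to the specified filterops and hook.
+ */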
+knote_assume_knotes(struct kqinfo *src, struct kqinfo *dst,
+                   struct filterops *ops, void *hook)
+{
+       struct knote *kn;
+
+       lwkt_gettoken(&kq_token);
+       while ((kn = SLIST_FIRST(&src->ki_note)) != NULL) {
+               if (knote_acquire(kn)) {
+                       knote_remove(&src->ki_note, kn);
+                       kn->kn_fop = ops;
+                       kn->kn_hook = hook;
+                       knote_insert(&dst->ki_note, kn);
+                       knote_release(kn);
+                       /* kn may be invalid now */
+               }
        }
+       lwkt_reltoken(&kq_token);
 }
 
 /*
- * remove all knotes referencing a specified fd
+ * Remove all knotes referencing a specified fd
  */
 void
 knote_fdclose(struct file *fp, struct filedesc *fdp, int fd)
 {
        struct knote *kn;
 
+       lwkt_gettoken(&kq_token);
 restart:
        SLIST_FOREACH(kn, &fp->f_klist, kn_link) {
                if (kn->kn_kq->kq_fdp == fdp && kn->kn_id == fd) {
-                       kn->kn_fop->f_detach(kn);
-                       knote_drop(kn);
+                       if (knote_acquire(kn))
+                               knote_detach_and_drop(kn);
                        goto restart;
                }
        }
+       lwkt_reltoken(&kq_token);
 }
 
+/*
+ * Low level attach function.
+ *
+ * The knote should already be marked for processing.
+ */
 static void
 knote_attach(struct knote *kn)
 {
        struct klist *list;
        struct kqueue *kq = kn->kn_kq;
 
-       if (kn->kn_fop->f_isfd) {
+       if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
                KKASSERT(kn->kn_fp);
                list = &kn->kn_fp->f_klist;
        } else {
@@ -988,12 +1452,12 @@ knote_attach(struct knote *kn)
        }
        SLIST_INSERT_HEAD(list, kn, kn_link);
        TAILQ_INSERT_HEAD(&kq->kq_knlist, kn, kn_kqlink);
-       kn->kn_status = 0;
 }
 
 /*
- * should be called outside of a critical section, since we don't want to
- * hold a critical section while calling fdrop and free.
+ * Low level drop function.
+ *
+ * The knote should already be marked for processing.
  */
 static void
 knote_drop(struct knote *kn)
@@ -1003,7 +1467,7 @@ knote_drop(struct knote *kn)
 
        kq = kn->kn_kq;
 
-       if (kn->kn_fop->f_isfd)
+       if (kn->kn_fop->f_flags & FILTEROP_ISFD)
                list = &kn->kn_fp->f_klist;
        else
                list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
@@ -1012,20 +1476,24 @@ knote_drop(struct knote *kn)
        TAILQ_REMOVE(&kq->kq_knlist, kn, kn_kqlink);
        if (kn->kn_status & KN_QUEUED)
                knote_dequeue(kn);
-       if (kn->kn_fop->f_isfd)
+       if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
                fdrop(kn->kn_fp);
+               kn->kn_fp = NULL;
+       }
        knote_free(kn);
 }
 
-
+/*
+ * Low level enqueue function.
+ *
+ * The knote should already be marked for processing.
+ */
 static void
 knote_enqueue(struct knote *kn)
 {
        struct kqueue *kq = kn->kn_kq;
 
-       crit_enter();
        KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
-
        TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe);
        kn->kn_status |= KN_QUEUED;
        ++kq->kq_count;
@@ -1035,22 +1503,24 @@ knote_enqueue(struct knote *kn)
         */
        if (kq->kq_sigio && (kq->kq_state & KQ_ASYNC) && kq->kq_count == 1)
                pgsigio(kq->kq_sigio, SIGIO, 0);
-       crit_exit();
+
        kqueue_wakeup(kq);
 }
 
+/*
+ * Low level dequeue function.
+ *
+ * The knote should already be marked for processing.
+ */
 static void
 knote_dequeue(struct knote *kn)
 {
        struct kqueue *kq = kn->kn_kq;
 
        KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
-       crit_enter();
-
        TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
        kn->kn_status &= ~KN_QUEUED;
        kq->kq_count--;
-       crit_exit();
 }
 
 static void