kernel - Provide descriptions for lwkt.* and debug.* sysctl's
[dragonfly.git] / sys / kern / kern_event.c
index 5d0bdd5..1aefd31 100644 (file)
 #include <sys/file.h>
 #include <sys/lock.h>
 #include <sys/fcntl.h>
-#include <sys/select.h>
 #include <sys/queue.h>
 #include <sys/event.h>
 #include <sys/eventvar.h>
-#include <sys/poll.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
+#include <sys/thread.h>
 #include <sys/uio.h>
-#include <sys/thread2.h>
 #include <sys/signalvar.h>
 #include <sys/filio.h>
+#include <sys/ktr.h>
+
+#include <sys/thread2.h>
 #include <sys/file2.h>
+#include <sys/mplock2.h>
 
 #include <vm/vm_zone.h>
 
+/*
+ * Global token for kqueue subsystem
+ */
+struct lwkt_token kq_token = LWKT_TOKEN_UP_INITIALIZER(kq_token);
+SYSCTL_INT(_lwkt, OID_AUTO, kq_mpsafe,
+    CTLFLAG_RW, &kq_token.t_flags, 0,
+    "Require MP lock for kq_token");
+SYSCTL_LONG(_lwkt, OID_AUTO, kq_collisions,
+    CTLFLAG_RW, &kq_token.t_collisions, 0,
+    "Collision counter of kq_token");
+
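The two sysctls above expose the token's state under the lwkt tree; a minimal userland sketch for reading the collision counter via sysctlbyname(3) follows (only the lwkt.kq_collisions name comes from the code above, the program itself is illustrative):

/* Illustrative userland reader for the counter exported above. */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	long collisions;
	size_t len = sizeof(collisions);

	if (sysctlbyname("lwkt.kq_collisions", &collisions, &len, NULL, 0) == 0)
		printf("kq_token collisions: %ld\n", collisions);
	return (0);
}
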
 MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
 
-static int     kqueue_scan(struct file *fp, int maxevents,
-                   struct kevent *ulistp, const struct timespec *timeout,
-                   struct thread *td, int *res);
+struct kevent_copyin_args {
+       struct kevent_args      *ka;
+       int                     pchanges;
+};
+
+static int     kqueue_sleep(struct kqueue *kq, struct timespec *tsp);
+static int     kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
+                   struct knote *marker);
 static int     kqueue_read(struct file *fp, struct uio *uio,
                    struct ucred *cred, int flags);
 static int     kqueue_write(struct file *fp, struct uio *uio,
                    struct ucred *cred, int flags);
 static int     kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
                    struct ucred *cred, struct sysmsg *msg);
-static int     kqueue_poll(struct file *fp, int events, struct ucred *cred);
 static int     kqueue_kqfilter(struct file *fp, struct knote *kn);
 static int     kqueue_stat(struct file *fp, struct stat *st,
                    struct ucred *cred);
 static int     kqueue_close(struct file *fp);
-static void    kqueue_wakeup(struct kqueue *kq);
+static void    kqueue_wakeup(struct kqueue *kq);
+static int     filter_attach(struct knote *kn);
+static int     filter_event(struct knote *kn, long hint);
 
 /*
  * MPSAFE
@@ -80,15 +99,15 @@ static struct fileops kqueueops = {
        .fo_read = kqueue_read,
        .fo_write = kqueue_write,
        .fo_ioctl = kqueue_ioctl,
-       .fo_poll = kqueue_poll,
        .fo_kqfilter = kqueue_kqfilter,
        .fo_stat = kqueue_stat,
        .fo_close = kqueue_close,
        .fo_shutdown = nofo_shutdown
 };
 
-static void    knote_attach(struct knote *kn, struct filedesc *fdp);
-static void    knote_drop(struct knote *kn, struct thread *td);
+static void    knote_attach(struct knote *kn);
+static void    knote_drop(struct knote *kn);
+static void    knote_detach_and_drop(struct knote *kn);
 static void    knote_enqueue(struct knote *kn);
 static void    knote_dequeue(struct knote *kn);
 static void    knote_init(void);
@@ -107,9 +126,9 @@ static void filt_timerdetach(struct knote *kn);
 static int     filt_timer(struct knote *kn, long hint);
 
 static struct filterops file_filtops =
-       { 1, filt_fileattach, NULL, NULL };
+       { FILTEROP_ISFD, filt_fileattach, NULL, NULL };
 static struct filterops kqread_filtops =
-       { 1, NULL, filt_kqdetach, filt_kqueue };
+       { FILTEROP_ISFD, NULL, filt_kqdetach, filt_kqueue };
 static struct filterops proc_filtops =
        { 0, filt_procattach, filt_procdetach, filt_proc };
 static struct filterops timer_filtops =
@@ -120,6 +139,9 @@ static int          kq_ncallouts = 0;
 static int             kq_calloutmax = (4 * 1024);
 SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
     &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
+static int             kq_checkloop = 1000000;
+SYSCTL_INT(_kern, OID_AUTO, kq_checkloop, CTLFLAG_RW,
+    &kq_checkloop, 0, "Maximum number of kevent collection loops before panic");
 
 #define KNOTE_ACTIVATE(kn) do {                                        \
        kn->kn_status |= KN_ACTIVE;                                     \
@@ -144,6 +166,7 @@ static struct filterops *sysfilt_ops[] = {
        &proc_filtops,                  /* EVFILT_PROC */
        &sig_filtops,                   /* EVFILT_SIGNAL */
        &timer_filtops,                 /* EVFILT_TIMER */
+       &file_filtops,                  /* EVFILT_EXCEPT */
 };
 
 static int
@@ -153,22 +176,18 @@ filt_fileattach(struct knote *kn)
 }
 
 /*
- * MPALMOSTSAFE - acquires mplock
+ * MPSAFE
  */
 static int
 kqueue_kqfilter(struct file *fp, struct knote *kn)
 {
        struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
 
-       get_mplock();
-       if (kn->kn_filter != EVFILT_READ) {
-               rel_mplock();
-               return (1);
-       }
+       if (kn->kn_filter != EVFILT_READ)
+               return (EOPNOTSUPP);
 
        kn->kn_fop = &kqread_filtops;
-       SLIST_INSERT_HEAD(&kq->kq_sel.si_note, kn, kn_selnext);
-       rel_mplock();
+       knote_insert(&kq->kq_kqinfo.ki_note, kn);
        return (0);
 }
 
@@ -177,7 +196,7 @@ filt_kqdetach(struct knote *kn)
 {
        struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
 
-       SLIST_REMOVE(&kq->kq_sel.si_note, kn, knote, kn_selnext);
+       knote_remove(&kq->kq_kqinfo.ki_note, kn);
 }
 
 /*ARGSUSED*/
@@ -197,15 +216,20 @@ filt_procattach(struct knote *kn)
        int immediate;
 
        immediate = 0;
+       lwkt_gettoken(&proc_token);
        p = pfind(kn->kn_id);
        if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
                p = zpfind(kn->kn_id);
                immediate = 1;
        }
-       if (p == NULL)
+       if (p == NULL) {
+               lwkt_reltoken(&proc_token);
                return (ESRCH);
-       if (! PRISON_CHECK(curproc->p_ucred, p->p_ucred))
+       }
+       if (!PRISON_CHECK(curthread->td_ucred, p->p_ucred)) {
+               lwkt_reltoken(&proc_token);
                return (EACCES);
+       }
 
        kn->kn_ptr.p_proc = p;
        kn->kn_flags |= EV_CLEAR;               /* automatically set */
@@ -219,16 +243,16 @@ filt_procattach(struct knote *kn)
                kn->kn_flags &= ~EV_FLAG1;
        }
 
-       /* XXX lock the proc here while adding to the list? */
-       SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
+       knote_insert(&p->p_klist, kn);
 
        /*
         * Immediately activate any exit notes if the target process is a
         * zombie.  This is necessary to handle the case where the target
-        * process, e.g. a child, dies before the kevent is registered.
+        * process, e.g. a child, dies before the kevent is registered.
         */
        if (immediate && filt_proc(kn, NOTE_EXIT))
                KNOTE_ACTIVATE(kn);
+       lwkt_reltoken(&proc_token);
 
        return (0);
 }
@@ -248,9 +272,9 @@ filt_procdetach(struct knote *kn)
 
        if (kn->kn_status & KN_DETACHED)
                return;
-       /* XXX locking?  this might modify another process. */
+       /* XXX locking? take proc_token here? */
        p = kn->kn_ptr.p_proc;
-       SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
+       knote_remove(&p->p_klist, kn);
 }
 
 static int
@@ -277,7 +301,7 @@ filt_proc(struct knote *kn, long hint)
        if (event == NOTE_EXIT) {
                struct proc *p = kn->kn_ptr.p_proc;
                if ((kn->kn_status & KN_DETACHED) == 0) {
-                       SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
+                       knote_remove(&p->p_klist, kn);
                        kn->kn_status |= KN_DETACHED;
                        kn->kn_data = p->p_xstat;
                        kn->kn_ptr.p_proc = NULL;
@@ -304,7 +328,7 @@ filt_proc(struct knote *kn, long hint)
                kev.fflags = kn->kn_sfflags;
                kev.data = kn->kn_id;                   /* parent */
                kev.udata = kn->kn_kevent.udata;        /* preserve udata */
-               error = kqueue_register(kn->kn_kq, &kev, NULL);
+               error = kqueue_register(kn->kn_kq, &kev);
                if (error)
                        kn->kn_fflags |= NOTE_TRACKERR;
        }
@@ -312,6 +336,12 @@ filt_proc(struct knote *kn, long hint)
        return (kn->kn_fflags != 0);
 }
 
+/*
+ * The callout interlocks with callout_stop() (or should), so the
+ * knote should still be a valid structure.  However the timeout
+ * can race a deletion so if KN_DELETING is set we just don't touch
+ * the knote.
+ */
 static void
 filt_timerexpire(void *knx)
 {
@@ -320,16 +350,20 @@ filt_timerexpire(void *knx)
        struct timeval tv;
        int tticks;
 
-       kn->kn_data++;
-       KNOTE_ACTIVATE(kn);
+       lwkt_gettoken(&kq_token);
+       if ((kn->kn_status & KN_DELETING) == 0) {
+               kn->kn_data++;
+               KNOTE_ACTIVATE(kn);
 
-       if ((kn->kn_flags & EV_ONESHOT) == 0) {
-               tv.tv_sec = kn->kn_sdata / 1000;
-               tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
-               tticks = tvtohz_high(&tv);
-               calloutp = (struct callout *)kn->kn_hook;
-               callout_reset(calloutp, tticks, filt_timerexpire, kn);
+               if ((kn->kn_flags & EV_ONESHOT) == 0) {
+                       tv.tv_sec = kn->kn_sdata / 1000;
+                       tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
+                       tticks = tvtohz_high(&tv);
+                       calloutp = (struct callout *)kn->kn_hook;
+                       callout_reset(calloutp, tticks, filt_timerexpire, kn);
+               }
        }
+       lwkt_reltoken(&kq_token);
 }
 
 /*
@@ -378,16 +412,109 @@ filt_timer(struct knote *kn, long hint)
        return (kn->kn_data != 0);
 }
 
+/*
+ * Acquire a knote, return non-zero on success, 0 on failure.
+ *
+ * If we cannot acquire the knote we sleep and return 0.  The knote
+ * may be stale on return in this case and the caller must restart
+ * whatever loop they are in.
+ */
+static __inline
+int
+knote_acquire(struct knote *kn)
+{
+       if (kn->kn_status & KN_PROCESSING) {
+               kn->kn_status |= KN_WAITING | KN_REPROCESS;
+               tsleep(kn, 0, "kqepts", hz);
+               /* knote may be stale now */
+               return(0);
+       }
+       kn->kn_status |= KN_PROCESSING;
+       return(1);
+}
+
+/*
+ * Release an acquired knote, clearing KN_PROCESSING and handling any
+ * KN_REPROCESS events.
+ *
+ * Non-zero is returned if the knote is destroyed.
+ */
+static __inline
+int
+knote_release(struct knote *kn)
+{
+       while (kn->kn_status & KN_REPROCESS) {
+               kn->kn_status &= ~KN_REPROCESS;
+               if (kn->kn_status & KN_WAITING) {
+                       kn->kn_status &= ~KN_WAITING;
+                       wakeup(kn);
+               }
+               if (kn->kn_status & KN_DELETING) {
+                       knote_detach_and_drop(kn);
+                       return(1);
+                       /* NOT REACHED */
+               }
+               if (filter_event(kn, 0))
+                       KNOTE_ACTIVATE(kn);
+       }
+       kn->kn_status &= ~KN_PROCESSING;
+       return(0);
+}
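
knote_acquire()/knote_release() are intended to be used in a lookup-and-retry pattern: a failed acquire means the caller slept and the knote may have been freed, so the whole scan must be restarted. A minimal sketch of that convention, mirroring the again1:/again2: loops in kqueue_register() below (the helper itself is illustrative and assumes kq_token is held):

/* Illustrative lookup-and-acquire helper; not part of this commit. */
static struct knote *
lookup_and_acquire(struct klist *list, uintptr_t ident, short filter)
{
	struct knote *kn;

again:
	SLIST_FOREACH(kn, list, kn_link) {
		if (kn->kn_id == ident && kn->kn_filter == filter) {
			if (knote_acquire(kn) == 0)
				goto again;	/* slept; list may have changed */
			return (kn);		/* KN_PROCESSING is now set */
		}
	}
	return (NULL);
}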
+
+/*
+ * Initialize a kqueue.
+ *
+ * NOTE: The lwp/proc code initializes a kqueue for select/poll ops.
+ *
+ * MPSAFE
+ */
+void
+kqueue_init(struct kqueue *kq, struct filedesc *fdp)
+{
+       TAILQ_INIT(&kq->kq_knpend);
+       TAILQ_INIT(&kq->kq_knlist);
+       kq->kq_count = 0;
+       kq->kq_fdp = fdp;
+       SLIST_INIT(&kq->kq_kqinfo.ki_note);
+}
+
+/*
+ * Terminate a kqueue.  Freeing the actual kq itself is left up to the
+ * caller (it might be embedded in a lwp so we don't do it here).
+ *
+ * The kq's knlist must be completely eradicated so block on any
+ * processing races.
+ */
+void
+kqueue_terminate(struct kqueue *kq)
+{
+       struct knote *kn;
+
+       lwkt_gettoken(&kq_token);
+       while ((kn = TAILQ_FIRST(&kq->kq_knlist)) != NULL) {
+               if (knote_acquire(kn))
+                       knote_detach_and_drop(kn);
+       }
+       if (kq->kq_knhash) {
+               kfree(kq->kq_knhash, M_KQUEUE);
+               kq->kq_knhash = NULL;
+               kq->kq_knhashmask = 0;
+       }
+       lwkt_reltoken(&kq_token);
+}
+
+/*
+ * MPSAFE
+ */
 int
 sys_kqueue(struct kqueue_args *uap)
 {
-       struct proc *p = curproc;
-       struct filedesc *fdp = p->p_fd;
+       struct thread *td = curthread;
        struct kqueue *kq;
        struct file *fp;
        int fd, error;
 
-       error = falloc(p, &fp, &fd);
+       error = falloc(td->td_lwp, &fp, &fd);
        if (error)
                return (error);
        fp->f_flag = FREAD | FWRITE;
@@ -395,90 +522,273 @@ sys_kqueue(struct kqueue_args *uap)
        fp->f_ops = &kqueueops;
 
        kq = kmalloc(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO);
-       TAILQ_INIT(&kq->kq_head);
-       kq->kq_fdp = fdp;
+       kqueue_init(kq, td->td_proc->p_fd);
        fp->f_data = kq;
 
-       fsetfd(p, fp, fd);
+       fsetfd(kq->kq_fdp, fp, fd);
        uap->sysmsg_result = fd;
        fdrop(fp);
        return (error);
 }
 
-int
-sys_kevent(struct kevent_args *uap)
+/*
+ * Copy 'count' items into the destination list pointed to by uap->eventlist.
+ */
+static int
+kevent_copyout(void *arg, struct kevent *kevp, int count, int *res)
 {
-       struct thread *td = curthread;
-       struct proc *p = td->td_proc;
-       struct kevent *kevp;
-       struct kqueue *kq;
-       struct file *fp = NULL;
-       struct timespec ts;
-       int i, n, nerrors, error;
+       struct kevent_copyin_args *kap;
+       int error;
 
-       KKASSERT(p);
+       kap = (struct kevent_copyin_args *)arg;
 
-       fp = holdfp(p->p_fd, uap->fd, -1);
-       if (fp == NULL)
-               return (EBADF);
-       if (fp->f_type != DTYPE_KQUEUE) {
-               fdrop(fp);
-               return (EBADF);
+       error = copyout(kevp, kap->ka->eventlist, count * sizeof(*kevp));
+       if (error == 0) {
+               kap->ka->eventlist += count;
+               *res += count;
+       } else {
+               *res = -1;
        }
 
-       if (uap->timeout != NULL) {
-               error = copyin(uap->timeout, &ts, sizeof(ts));
-               if (error)
-                       goto done;
-               uap->timeout = &ts;
+       return (error);
+}
+
+/*
+ * Copy at most 'max' items from the list pointed to by kap->changelist,
+ * return number of items in 'events'.
+ */
+static int
+kevent_copyin(void *arg, struct kevent *kevp, int max, int *events)
+{
+       struct kevent_copyin_args *kap;
+       int error, count;
+
+       kap = (struct kevent_copyin_args *)arg;
+
+       count = min(kap->ka->nchanges - kap->pchanges, max);
+       error = copyin(kap->ka->changelist, kevp, count * sizeof *kevp);
+       if (error == 0) {
+               kap->ka->changelist += count;
+               kap->pchanges += count;
+               *events = count;
        }
 
-       kq = (struct kqueue *)fp->f_data;
-       nerrors = 0;
+       return (error);
+}
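
Because kern_kevent() is driven entirely through the k_copyin_fn/k_copyout_fn callbacks, an in-kernel consumer could, in principle, feed it kernel-resident arrays instead of user memory. A hypothetical sketch under that assumption (none of these names exist in the tree):

/* Hypothetical in-kernel changelist/eventlist carrier. */
struct kkevent_args {
	struct kevent	*changes;	/* kernel-resident changelist */
	int		nchanges;
	int		pchanges;
	struct kevent	*events;	/* kernel-resident result buffer */
};

static int
kkevent_copyin(void *arg, struct kevent *kevp, int max, int *events)
{
	struct kkevent_args *ka = arg;
	int count = min(ka->nchanges - ka->pchanges, max);

	bcopy(ka->changes + ka->pchanges, kevp, count * sizeof(*kevp));
	ka->pchanges += count;
	*events = count;
	return (0);
}

static int
kkevent_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	struct kkevent_args *ka = arg;

	bcopy(kevp, ka->events + *res, count * sizeof(*kevp));
	*res += count;
	return (0);
}

/*
 * Such a consumer would then call, e.g.:
 *	error = kern_kevent(kq, maxevents, &res, &ka,
 *			    kkevent_copyin, kkevent_copyout, NULL);
 */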
 
-       while (uap->nchanges > 0) {
-               n = uap->nchanges > KQ_NEVENTS ? KQ_NEVENTS : uap->nchanges;
-               error = copyin(uap->changelist, kq->kq_kev,
-                   n * sizeof(struct kevent));
+/*
+ * MPSAFE
+ */
+int
+kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap,
+           k_copyin_fn kevent_copyinfn, k_copyout_fn kevent_copyoutfn,
+           struct timespec *tsp_in)
+{
+       struct kevent *kevp;
+       struct timespec *tsp;
+       int i, n, total, error, nerrors = 0;
+       int lres;
+       int limit = kq_checkloop;
+       struct kevent kev[KQ_NEVENTS];
+       struct knote marker;
+
+       tsp = tsp_in;
+       *res = 0;
+
+       lwkt_gettoken(&kq_token);
+       for ( ;; ) {
+               n = 0;
+               error = kevent_copyinfn(uap, kev, KQ_NEVENTS, &n);
                if (error)
                        goto done;
+               if (n == 0)
+                       break;
                for (i = 0; i < n; i++) {
-                       kevp = &kq->kq_kev[i];
+                       kevp = &kev[i];
                        kevp->flags &= ~EV_SYSFLAGS;
-                       error = kqueue_register(kq, kevp, td);
+                       error = kqueue_register(kq, kevp);
+
+                       /*
+                        * If a registration returns an error we
+                        * immediately post the error.  The kevent()
+                        * call itself will fail with the error if
+                        * no space is available for posting.
+                        *
+                        * Such errors normally bypass the timeout/blocking
+                        * code.  However, if the copyoutfn function refuses
+                        * to post the error (see sys_poll()), then we
+                        * ignore it too.
+                        */
                        if (error) {
-                               if (uap->nevents != 0) {
-                                       kevp->flags = EV_ERROR;
-                                       kevp->data = error;
-                                       (void) copyout((caddr_t)kevp,
-                                           (caddr_t)uap->eventlist,
-                                           sizeof(*kevp));
-                                       uap->eventlist++;
-                                       uap->nevents--;
+                               kevp->flags = EV_ERROR;
+                               kevp->data = error;
+                               lres = *res;
+                               kevent_copyoutfn(uap, kevp, 1, res);
+                               if (lres != *res) {
+                                       nevents--;
                                        nerrors++;
-                               } else {
-                                       goto done;
                                }
                        }
                }
-               uap->nchanges -= n;
-               uap->changelist += n;
        }
        if (nerrors) {
-               uap->sysmsg_result = nerrors;
                error = 0;
                goto done;
        }
 
-       error = kqueue_scan(fp, uap->nevents, uap->eventlist, uap->timeout, td, &uap->sysmsg_result);
+       /*
+        * Acquire/wait for events - setup timeout
+        */
+       if (tsp != NULL) {
+               struct timespec ats;
+
+               if (tsp->tv_sec || tsp->tv_nsec) {
+                       nanouptime(&ats);
+                       timespecadd(tsp, &ats);         /* tsp = target time */
+               }
+       }
+
+       /*
+        * Loop as required.
+        *
+        * Collect as many events as we can. Sleeping on successive
+        * loops is disabled if copyoutfn has incremented (*res).
+        *
+        * The loop stops if an error occurs, all events have been
+        * scanned (the marker has been reached), or fewer than the
+        * maximum number of events is found.
+        *
+        * The copyoutfn function does not have to increment (*res) in
+        * order for the loop to continue.
+        *
+        * NOTE: doselect() usually passes 0x7FFFFFFF for nevents.
+        */
+       total = 0;
+       error = 0;
+       marker.kn_filter = EVFILT_MARKER;
+       marker.kn_status = KN_PROCESSING;
+       TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
+       while ((n = nevents - total) > 0) {
+               if (n > KQ_NEVENTS)
+                       n = KQ_NEVENTS;
+
+               /*
+                * If no events are pending sleep until timeout (if any)
+                * or an event occurs.
+                *
+                * After the sleep completes the marker is moved to the
+                * end of the list, making any received events available
+                * to our scan.
+                */
+               if (kq->kq_count == 0 && *res == 0) {
+                       error = kqueue_sleep(kq, tsp);
+                       if (error)
+                               break;
+
+                       TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
+                       TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
+               }
+
+               /*
+                * Process all received events
+                * Account for all non-spurious events in our total
+                */
+               i = kqueue_scan(kq, kev, n, &marker);
+               if (i) {
+                       lres = *res;
+                       error = kevent_copyoutfn(uap, kev, i, res);
+                       total += *res - lres;
+                       if (error)
+                               break;
+               }
+               if (limit && --limit == 0)
+                       panic("kqueue: checkloop failed i=%d", i);
+
+               /*
+                * Normally when fewer events are returned than requested
+                * we can stop.  However, if only spurious events were
+                * collected the copyout will not bump (*res) and we have
+                * to continue.
+                */
+               if (i < n && *res)
+                       break;
+
+               /*
+                * Deal with an edge case where spurious events can cause
+                * a loop to occur without moving the marker.  This can
+                * prevent kqueue_scan() from picking up new events which
+                * race us.  We must be sure to move the marker for this
+                * case.
+                *
+                * NOTE: We do not want to move the marker if events
+                *       were scanned because normal kqueue operations
+                *       may reactivate events.  Moving the marker in
+                *       that case could result in duplicates for the
+                *       same event.
+                */
+               if (i == 0) {
+                       TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
+                       TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe);
+               }
+       }
+       TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
+
+       /* Timeouts do not return EWOULDBLOCK. */
+       if (error == EWOULDBLOCK)
+               error = 0;
+
 done:
-       if (fp != NULL)
+       lwkt_reltoken(&kq_token);
+       return (error);
+}
+
+/*
+ * MPALMOSTSAFE
+ */
+int
+sys_kevent(struct kevent_args *uap)
+{
+       struct thread *td = curthread;
+       struct proc *p = td->td_proc;
+       struct timespec ts, *tsp;
+       struct kqueue *kq;
+       struct file *fp = NULL;
+       struct kevent_copyin_args *kap, ka;
+       int error;
+
+       if (uap->timeout) {
+               error = copyin(uap->timeout, &ts, sizeof(ts));
+               if (error)
+                       return (error);
+               tsp = &ts;
+       } else {
+               tsp = NULL;
+       }
+
+       fp = holdfp(p->p_fd, uap->fd, -1);
+       if (fp == NULL)
+               return (EBADF);
+       if (fp->f_type != DTYPE_KQUEUE) {
                fdrop(fp);
+               return (EBADF);
+       }
+
+       kq = (struct kqueue *)fp->f_data;
+
+       kap = &ka;
+       kap->ka = uap;
+       kap->pchanges = 0;
+
+       error = kern_kevent(kq, uap->nevents, &uap->sysmsg_result, kap,
+                           kevent_copyin, kevent_copyout, tsp);
+
+       fdrop(fp);
+
        return (error);
 }
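
For reference, the userland side of this path is unchanged by the commit; a minimal consumer registers a read filter and blocks with a timeout, exercising exactly the kern_kevent() loop above (illustrative only):

/* Minimal userland sketch of the kevent(2) path. */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>
#include <unistd.h>

int
wait_readable(int fd)
{
	struct kevent change, event;
	struct timespec ts = { 5, 0 };		/* 5 second timeout */
	int kq, n;

	if ((kq = kqueue()) < 0)
		err(1, "kqueue");
	EV_SET(&change, fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL);
	if ((n = kevent(kq, &change, 1, &event, 1, &ts)) < 0)
		err(1, "kevent");
	close(kq);
	return (n);	/* 0 on timeout, 1 if fd became readable */
}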
 
 int
-kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td)
+kqueue_register(struct kqueue *kq, struct kevent *kev)
 {
        struct filedesc *fdp = kq->kq_fdp;
        struct filterops *fops;
@@ -500,32 +810,47 @@ kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td)
                return (EINVAL);
        }
 
-       if (fops->f_isfd) {
+       lwkt_gettoken(&kq_token);
+       if (fops->f_flags & FILTEROP_ISFD) {
                /* validate descriptor */
                fp = holdfp(fdp, kev->ident, -1);
-               if (fp == NULL)
+               if (fp == NULL) {
+                       lwkt_reltoken(&kq_token);
                        return (EBADF);
+               }
 
-               if (kev->ident < fdp->fd_knlistsize) {
-                       SLIST_FOREACH(kn, &fdp->fd_knlist[kev->ident], kn_link)
-                               if (kq == kn->kn_kq &&
-                                   kev->filter == kn->kn_filter)
-                                       break;
+again1:
+               SLIST_FOREACH(kn, &fp->f_klist, kn_link) {
+                       if (kn->kn_kq == kq &&
+                           kn->kn_filter == kev->filter &&
+                           kn->kn_id == kev->ident) {
+                               if (knote_acquire(kn) == 0)
+                                       goto again1;
+                               break;
+                       }
                }
        } else {
-               if (fdp->fd_knhashmask != 0) {
+               if (kq->kq_knhashmask) {
                        struct klist *list;
                        
-                       list = &fdp->fd_knhash[
-                           KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
-                       SLIST_FOREACH(kn, list, kn_link)
-                               if (kev->ident == kn->kn_id &&
-                                   kq == kn->kn_kq &&
-                                   kev->filter == kn->kn_filter)
+                       list = &kq->kq_knhash[
+                           KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
+again2:
+                       SLIST_FOREACH(kn, list, kn_link) {
+                               if (kn->kn_id == kev->ident &&
+                                   kn->kn_filter == kev->filter) {
+                                       if (knote_acquire(kn) == 0)
+                                               goto again2;
                                        break;
+                               }
+                       }
                }
        }
 
+       /*
+        * NOTE: At this point if kn is non-NULL we will have acquired
+        *       it and set KN_PROCESSING.
+        */
        if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
                error = ENOENT;
                goto done;
@@ -535,7 +860,6 @@ kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td)
         * kn now contains the matching knote, or NULL if no match
         */
        if (kev->flags & EV_ADD) {
-
                if (kn == NULL) {
                        kn = knote_alloc();
                        if (kn == NULL) {
@@ -558,180 +882,250 @@ kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td)
                        kev->data = 0;
                        kn->kn_kevent = *kev;
 
-                       knote_attach(kn, fdp);
-                       if ((error = fops->f_attach(kn)) != 0) {
-                               knote_drop(kn, td);
+                       /*
+                        * KN_PROCESSING prevents the knote from getting
+                        * ripped out from under us while we are trying
+                        * to attach it, in case the attach blocks.
+                        */
+                       kn->kn_status = KN_PROCESSING;
+                       knote_attach(kn);
+                       if ((error = filter_attach(kn)) != 0) {
+                               kn->kn_status |= KN_DELETING | KN_REPROCESS;
+                               knote_drop(kn);
                                goto done;
                        }
+
+                       /*
+                        * Interlock against close races which either tried
+                        * to remove our knote while we were blocked or missed
+                        * it entirely prior to our attachment.  We do not
+                        * want to end up with a knote on a closed descriptor.
+                        */
+                       if ((fops->f_flags & FILTEROP_ISFD) &&
+                           checkfdclosed(fdp, kev->ident, kn->kn_fp)) {
+                               kn->kn_status |= KN_DELETING | KN_REPROCESS;
+                       }
                } else {
                        /*
                         * The user may change some filter values after the
                         * initial EV_ADD, but doing so will not reset any 
                         * filter which have already been triggered.
                         */
+                       KKASSERT(kn->kn_status & KN_PROCESSING);
                        kn->kn_sfflags = kev->fflags;
                        kn->kn_sdata = kev->data;
                        kn->kn_kevent.udata = kev->udata;
                }
 
-               crit_enter();
-               if (kn->kn_fop->f_event(kn, 0))
-                       KNOTE_ACTIVATE(kn);
-               crit_exit();
+               /*
+                * Execute the filter event to immediately activate the
+                * knote if necessary.  If reprocessing events are pending
+                * due to blocking above we do not run the filter here
+                * but instead let knote_release() do it.  Otherwise we
+                * might run the filter on a deleted event.
+                */
+               if ((kn->kn_status & KN_REPROCESS) == 0) {
+                       if (filter_event(kn, 0))
+                               KNOTE_ACTIVATE(kn);
+               }
        } else if (kev->flags & EV_DELETE) {
-               kn->kn_fop->f_detach(kn);
-               knote_drop(kn, td);
+               /*
+                * Delete the existing knote
+                */
+               knote_detach_and_drop(kn);
                goto done;
        }
 
+       /*
+        * Disablement does not deactivate a knote here.
+        */
        if ((kev->flags & EV_DISABLE) &&
            ((kn->kn_status & KN_DISABLED) == 0)) {
-               crit_enter();
                kn->kn_status |= KN_DISABLED;
-               crit_exit();
        }
 
+       /*
+        * Re-enablement may have to immediately enqueue an active knote.
+        */
        if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
-               crit_enter();
                kn->kn_status &= ~KN_DISABLED;
                if ((kn->kn_status & KN_ACTIVE) &&
-                   ((kn->kn_status & KN_QUEUED) == 0))
+                   ((kn->kn_status & KN_QUEUED) == 0)) {
                        knote_enqueue(kn);
-               crit_exit();
+               }
        }
 
+       /*
+        * Handle any required reprocessing
+        */
+       knote_release(kn);
+       /* kn may be invalid now */
+
 done:
+       lwkt_reltoken(&kq_token);
        if (fp != NULL)
                fdrop(fp);
        return (error);
 }
 
+/*
+ * Block as necessary until the target time is reached.
+ * If tsp is NULL we block indefinitely.  If tsp->tv_sec/tv_nsec are both
+ * 0 we do not block at all.
+ */
 static int
-kqueue_scan(struct file *fp, int maxevents, struct kevent *ulistp,
-       const struct timespec *tsp, struct thread *td, int *res)
+kqueue_sleep(struct kqueue *kq, struct timespec *tsp)
 {
-       struct kqueue *kq = (struct kqueue *)fp->f_data;
-       struct kevent *kevp;
-       struct timeval atv, rtv, ttv;
-       struct knote *kn, marker;
-       int count, timeout, nkev = 0, error = 0;
-
-       count = maxevents;
-       if (count == 0)
-               goto done;
+       int error = 0;
 
-       if (tsp != NULL) {
-               TIMESPEC_TO_TIMEVAL(&atv, tsp);
-               if (itimerfix(&atv)) {
-                       error = EINVAL;
-                       goto done;
-               }
-               if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
-                       timeout = -1;
-               else 
-                       timeout = atv.tv_sec > 24 * 60 * 60 ?
-                           24 * 60 * 60 * hz : tvtohz_high(&atv);
-               getmicrouptime(&rtv);
-               timevaladd(&atv, &rtv);
+       if (tsp == NULL) {
+               kq->kq_state |= KQ_SLEEP;
+               error = tsleep(kq, PCATCH, "kqread", 0);
+       } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
+               error = EWOULDBLOCK;
        } else {
-               atv.tv_sec = 0;
-               atv.tv_usec = 0;
-               timeout = 0;
-       }
-       goto start;
+               struct timespec ats;
+               struct timespec atx = *tsp;
+               int timeout;
 
-retry:
-       if (atv.tv_sec || atv.tv_usec) {
-               getmicrouptime(&rtv);
-               if (timevalcmp(&rtv, &atv, >=))
-                       goto done;
-               ttv = atv;
-               timevalsub(&ttv, &rtv);
-               timeout = ttv.tv_sec > 24 * 60 * 60 ?
-                       24 * 60 * 60 * hz : tvtohz_high(&ttv);
-       }
-
-start:
-       kevp = kq->kq_kev;
-       crit_enter();
-       if (kq->kq_count == 0) {
-               if (timeout < 0) { 
+               nanouptime(&ats);
+               timespecsub(&atx, &ats);
+               if (atx.tv_sec < 0) {
                        error = EWOULDBLOCK;
                } else {
+                       timeout = atx.tv_sec > 24 * 60 * 60 ?
+                               24 * 60 * 60 * hz : tstohz_high(&atx);
                        kq->kq_state |= KQ_SLEEP;
                        error = tsleep(kq, PCATCH, "kqread", timeout);
                }
-               crit_exit();
-               if (error == 0)
-                       goto retry;
-               /* don't restart after signals... */
-               if (error == ERESTART)
-                       error = EINTR;
-               else if (error == EWOULDBLOCK)
-                       error = 0;
-               goto done;
        }
 
-       TAILQ_INSERT_TAIL(&kq->kq_head, &marker, kn_tqe); 
+       /* don't restart after signals... */
+       if (error == ERESTART)
+               return (EINTR);
+
+       return (error);
+}
+
+/*
+ * Scan the kqueue, return the number of active events placed in kevp up
+ * to count.
+ *
+ * Continuous mode events may get recycled, do not continue scanning past
+ * marker unless no events have been collected.
+ */
+static int
+kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
+            struct knote *marker)
+{
+       struct knote *kn, local_marker;
+       int total;
+
+       total = 0;
+       local_marker.kn_filter = EVFILT_MARKER;
+       local_marker.kn_status = KN_PROCESSING;
+
+       /*
+        * Collect events.
+        */
+       TAILQ_INSERT_HEAD(&kq->kq_knpend, &local_marker, kn_tqe);
        while (count) {
-               kn = TAILQ_FIRST(&kq->kq_head);
-               TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); 
-               if (kn == &marker) {
-                       crit_exit();
-                       if (count == maxevents)
-                               goto retry;
-                       goto done;
-               }
-               if (kn->kn_status & KN_DISABLED) {
-                       kn->kn_status &= ~KN_QUEUED;
-                       kq->kq_count--;
+               kn = TAILQ_NEXT(&local_marker, kn_tqe);
+               if (kn->kn_filter == EVFILT_MARKER) {
+                       /* Marker reached, we are done */
+                       if (kn == marker)
+                               break;
+
+                       /* Move local marker past some other thread's marker */
+                       kn = TAILQ_NEXT(kn, kn_tqe);
+                       TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe);
+                       TAILQ_INSERT_BEFORE(kn, &local_marker, kn_tqe);
                        continue;
                }
-               if ((kn->kn_flags & EV_ONESHOT) == 0 &&
-                   kn->kn_fop->f_event(kn, 0) == 0) {
-                       kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
-                       kq->kq_count--;
+
+               /*
+                * We can't skip a knote undergoing processing, otherwise
+                * we risk not returning it when the user process expects
+                * it should be returned.  Sleep and retry.
+                */
+               if (knote_acquire(kn) == 0)
                        continue;
+
+               /*
+                * Remove the event for processing.
+                *
+                * WARNING!  We must leave KN_QUEUED set to prevent the
+                *           event from being KNOTE_ACTIVATE()d while
+                *           the queue state is in limbo, in case we
+                *           block.
+                *
+                * WARNING!  We must set KN_PROCESSING to avoid races
+                *           against deletion or another thread's
+                *           processing.
+                */
+               TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
+               kq->kq_count--;
+
+               /*
+                * We have to deal with an extremely important race against
+                * file descriptor close()s here.  The file descriptor can
+                * disappear MPSAFE, and there is a small window of
+                * opportunity between that and the call to knote_fdclose().
+                *
+                * If we hit that window here while doselect or dopoll is
+                * trying to delete a spurious event they will not be able
+                * to match up the event against a knote and will go haywire.
+                */
+               if ((kn->kn_fop->f_flags & FILTEROP_ISFD) &&
+                   checkfdclosed(kq->kq_fdp, kn->kn_kevent.ident, kn->kn_fp)) {
+                       kn->kn_status |= KN_DELETING | KN_REPROCESS;
                }
-               *kevp = kn->kn_kevent;
-               kevp++;
-               nkev++;
-               if (kn->kn_flags & EV_ONESHOT) {
+
+               if (kn->kn_status & KN_DISABLED) {
+                       /*
+                        * If disabled we ensure the event is not queued
+                        * but leave its active bit set.  On re-enablement
+                        * the event may be immediately triggered.
+                        */
                        kn->kn_status &= ~KN_QUEUED;
-                       kq->kq_count--;
-                       crit_exit();
-                       kn->kn_fop->f_detach(kn);
-                       knote_drop(kn, td);
-                       crit_enter();
-               } else if (kn->kn_flags & EV_CLEAR) {
-                       kn->kn_data = 0;
-                       kn->kn_fflags = 0;
+               } else if ((kn->kn_flags & EV_ONESHOT) == 0 &&
+                          (kn->kn_status & KN_DELETING) == 0 &&
+                          filter_event(kn, 0) == 0) {
+                       /*
+                        * If not running in one-shot mode and the event
+                        * is no longer present we ensure it is removed
+                        * from the queue and ignore it.
+                        */
                        kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
-                       kq->kq_count--;
                } else {
-                       TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); 
-               }
-               count--;
-               if (nkev == KQ_NEVENTS) {
-                       crit_exit();
-                       error = copyout((caddr_t)&kq->kq_kev, (caddr_t)ulistp,
-                           sizeof(struct kevent) * nkev);
-                       ulistp += nkev;
-                       nkev = 0;
-                       kevp = kq->kq_kev;
-                       crit_enter();
-                       if (error)
-                               break;
+                       /*
+                        * Post the event
+                        */
+                       *kevp++ = kn->kn_kevent;
+                       ++total;
+                       --count;
+
+                       if (kn->kn_flags & EV_ONESHOT) {
+                               kn->kn_status &= ~KN_QUEUED;
+                               kn->kn_status |= KN_DELETING | KN_REPROCESS;
+                       } else if (kn->kn_flags & EV_CLEAR) {
+                               kn->kn_data = 0;
+                               kn->kn_fflags = 0;
+                               kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
+                       } else {
+                               TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe);
+                               kq->kq_count++;
+                       }
                }
+
+               /*
+                * Handle any post-processing states
+                */
+               knote_release(kn);
        }
-       TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe); 
-       crit_exit();
-done:
-       if (nkev != 0)
-               error = copyout((caddr_t)&kq->kq_kev, (caddr_t)ulistp,
-                   sizeof(struct kevent) * nkev);
-        *res = maxevents - count;
-       return (error);
+       TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe);
+
+       return (total);
 }
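
The local-marker technique used by kqueue_scan() generalizes to any TAILQ that several scanners walk concurrently while entries may be removed and requeued at the tail; a stripped-down sketch with an illustrative element type:

/*
 * Stripped-down illustration of the local-marker scan: each scanner
 * inserts its own marker at the head and steps it past other scanners'
 * markers, so elements requeued at the tail are not revisited and
 * concurrent scans do not interfere.  Illustrative types only.
 */
#include <sys/queue.h>

struct item {
	TAILQ_ENTRY(item) link;
	int		is_marker;
};
TAILQ_HEAD(itemq, item);

static void
scan_items(struct itemq *q, struct item *end_marker,
	   void (*process)(struct item *))
{
	struct item local_marker;
	struct item *it;

	local_marker.is_marker = 1;
	TAILQ_INSERT_HEAD(q, &local_marker, link);
	while ((it = TAILQ_NEXT(&local_marker, link)) != NULL) {
		if (it->is_marker) {
			if (it == end_marker)
				break;		/* reached our end marker */
			/* step past another scanner's marker */
			TAILQ_REMOVE(q, &local_marker, link);
			TAILQ_INSERT_AFTER(q, it, &local_marker, link);
			continue;
		}
		TAILQ_REMOVE(q, it, link);
		process(it);			/* may requeue it at the tail */
	}
	TAILQ_REMOVE(q, &local_marker, link);
}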
 
 /*
@@ -756,7 +1150,7 @@ kqueue_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
 }
 
 /*
- * MPSAFE
+ * MPALMOSTSAFE
  */
 static int
 kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
@@ -765,7 +1159,7 @@ kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
        struct kqueue *kq;
        int error;
 
-       get_mplock();
+       lwkt_gettoken(&kq_token);
        kq = (struct kqueue *)fp->f_data;
 
        switch(com) {
@@ -783,34 +1177,10 @@ kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
                error = ENOTTY;
                break;
        }
-       rel_mplock();
+       lwkt_reltoken(&kq_token);
        return (error);
 }
 
-/*
- * MPALMOSTSAFE - acquires mplock
- */
-static int
-kqueue_poll(struct file *fp, int events, struct ucred *cred)
-{
-       struct kqueue *kq = (struct kqueue *)fp->f_data;
-       int revents = 0;
-
-       get_mplock();
-       crit_enter();
-        if (events & (POLLIN | POLLRDNORM)) {
-                if (kq->kq_count) {
-                        revents |= events & (POLLIN | POLLRDNORM);
-               } else {
-                        selrecord(curthread, &kq->kq_sel);
-                       kq->kq_state |= KQ_SEL;
-               }
-       }
-       crit_exit();
-       rel_mplock();
-       return (revents);
-}
-
 /*
  * MPSAFE
  */
@@ -827,58 +1197,17 @@ kqueue_stat(struct file *fp, struct stat *st, struct ucred *cred)
 }
 
 /*
- * MPALMOSTSAFE - acquires mplock
+ * MPSAFE
  */
 static int
 kqueue_close(struct file *fp)
 {
-       struct thread *td = curthread;
-       struct proc *p = td->td_proc;
        struct kqueue *kq = (struct kqueue *)fp->f_data;
-       struct filedesc *fdp;
-       struct knote **knp, *kn, *kn0;
-       int i;
-
-       KKASSERT(p);
-       get_mplock();
-       fdp = p->p_fd;
-       for (i = 0; i < fdp->fd_knlistsize; i++) {
-               knp = &SLIST_FIRST(&fdp->fd_knlist[i]);
-               kn = *knp;
-               while (kn != NULL) {
-                       kn0 = SLIST_NEXT(kn, kn_link);
-                       if (kq == kn->kn_kq) {
-                               kn->kn_fop->f_detach(kn);
-                               fdrop(kn->kn_fp);
-                               knote_free(kn);
-                               *knp = kn0;
-                       } else {
-                               knp = &SLIST_NEXT(kn, kn_link);
-                       }
-                       kn = kn0;
-               }
-       }
-       if (fdp->fd_knhashmask != 0) {
-               for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
-                       knp = &SLIST_FIRST(&fdp->fd_knhash[i]);
-                       kn = *knp;
-                       while (kn != NULL) {
-                               kn0 = SLIST_NEXT(kn, kn_link);
-                               if (kq == kn->kn_kq) {
-                                       kn->kn_fop->f_detach(kn);
-               /* XXX non-fd release of kn->kn_ptr */
-                                       knote_free(kn);
-                                       *knp = kn0;
-                               } else {
-                                       knp = &SLIST_NEXT(kn, kn_link);
-                               }
-                               kn = kn0;
-                       }
-               }
-       }
+
+       kqueue_terminate(kq);
+
        fp->f_data = NULL;
        funsetown(kq->kq_sigio);
-       rel_mplock();
 
        kfree(kq, M_KQUEUE);
        return (0);
@@ -891,125 +1220,281 @@ kqueue_wakeup(struct kqueue *kq)
                kq->kq_state &= ~KQ_SLEEP;
                wakeup(kq);
        }
-       if (kq->kq_state & KQ_SEL) {
-               kq->kq_state &= ~KQ_SEL;
-               selwakeup(&kq->kq_sel);
+       KNOTE(&kq->kq_kqinfo.ki_note, 0);
+}
+
+/*
+ * Calls filterops f_attach function, acquiring mplock if filter is not
+ * marked as FILTEROP_MPSAFE.
+ */
+static int
+filter_attach(struct knote *kn)
+{
+       int ret;
+
+       if (!(kn->kn_fop->f_flags & FILTEROP_MPSAFE)) {
+               get_mplock();
+               ret = kn->kn_fop->f_attach(kn);
+               rel_mplock();
+       } else {
+               ret = kn->kn_fop->f_attach(kn);
+       }
+
+       return (ret);
+}
+
+/*
+ * Detach the knote and drop it, destroying the knote.
+ *
+ * Calls filterops f_detach function, acquiring mplock if filter is not
+ * marked as FILTEROP_MPSAFE.
+ */
+static void
+knote_detach_and_drop(struct knote *kn)
+{
+       kn->kn_status |= KN_DELETING | KN_REPROCESS;
+       if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
+               kn->kn_fop->f_detach(kn);
+       } else {
+               get_mplock();
+               kn->kn_fop->f_detach(kn);
+               rel_mplock();
+       }
+       knote_drop(kn);
+}
+
+/*
+ * Calls filterops f_event function, acquiring mplock if filter is not
+ * marked as FILTEROP_MPSAFE.
+ *
+ * If the knote is in the middle of being created or deleted we cannot
+ * safely call the filter op.
+ */
+static int
+filter_event(struct knote *kn, long hint)
+{
+       int ret;
+
+       if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
+               ret = kn->kn_fop->f_event(kn, hint);
+       } else {
+               get_mplock();
+               ret = kn->kn_fop->f_event(kn, hint);
+               rel_mplock();
        }
-       KNOTE(&kq->kq_sel.si_note, 0);
+       return (ret);
 }
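
A filter that is fully MP safe can opt out of these wrappers by setting FILTEROP_MPSAFE in its flags; a hypothetical driver-side declaration might look like this (function names are illustrative, only the FILTEROP_* flags come from this change):

/* Hypothetical MPSAFE read filter for some driver. */
static void	mydev_filt_detach(struct knote *kn);
static int	mydev_filt_read(struct knote *kn, long hint);

static struct filterops mydev_read_filtops =
	{ FILTEROP_ISFD | FILTEROP_MPSAFE,
	  NULL, mydev_filt_detach, mydev_filt_read };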
 
 /*
- * walk down a list of knotes, activating them if their event has triggered.
+ * Walk down a list of knotes, activating them if their event has triggered.
+ *
+ * If we encounter any knotes which are undergoing processing we just mark
+ * them for reprocessing and do not try to [re]activate the knote.  However,
+ * if a hint is being passed we have to wait and that makes things a bit
+ * sticky.
  */
 void
 knote(struct klist *list, long hint)
 {
        struct knote *kn;
 
-       SLIST_FOREACH(kn, list, kn_selnext)
-               if (kn->kn_fop->f_event(kn, hint))
-                       KNOTE_ACTIVATE(kn);
+       lwkt_gettoken(&kq_token);
+restart:
+       SLIST_FOREACH(kn, list, kn_next) {
+               if (kn->kn_status & KN_PROCESSING) {
+                       /*
+                        * Someone else is processing the knote, ask the
+                        * other thread to reprocess it and don't mess
+                        * with it otherwise.
+                        */
+                       if (hint == 0) {
+                               kn->kn_status |= KN_REPROCESS;
+                               continue;
+                       }
+
+                       /*
+                        * If the hint is non-zero we have to wait or risk
+                        * losing the state the caller is trying to update.
+                        *
+                        * XXX This is a real problem, certain process
+                        *     and signal filters will bump kn_data for
+                        *     already-processed notes more than once if
+                        *     we restart the list scan.  FIXME.
+                        */
+                       kn->kn_status |= KN_WAITING | KN_REPROCESS;
+                       tsleep(kn, 0, "knotec", hz);
+                       goto restart;
+               }
+
+               /*
+                * Become the reprocessing master ourselves.
+                *
+                * If hint is non-zero, running the event is mandatory
+                * when not deleting so do it whether reprocessing is
+                * set or not.
+                */
+               kn->kn_status |= KN_PROCESSING;
+               if ((kn->kn_status & KN_DELETING) == 0) {
+                       if (filter_event(kn, hint))
+                               KNOTE_ACTIVATE(kn);
+               }
+               if (knote_release(kn))
+                       goto restart;
+       }
+       lwkt_reltoken(&kq_token);
+}
+
+/*
+ * Insert knote at head of klist.
+ *
+ * This function may only be called via a filter function and thus
+ * kq_token should already be held and the knote marked for processing.
+ */
+void
+knote_insert(struct klist *klist, struct knote *kn)
+{
+       KKASSERT(kn->kn_status & KN_PROCESSING);
+       ASSERT_LWKT_TOKEN_HELD(&kq_token);
+       SLIST_INSERT_HEAD(klist, kn, kn_next);
+}
+
+/*
+ * Remove knote from a klist
+ *
+ * This function may only be called via a filter function and thus
+ * kq_token should already be held and the knote marked for processing.
+ */
+void
+knote_remove(struct klist *klist, struct knote *kn)
+{
+       KKASSERT(kn->kn_status & KN_PROCESSING);
+       ASSERT_LWKT_TOKEN_HELD(&kq_token);
+       SLIST_REMOVE(klist, kn, knote, kn_next);
 }
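
Correspondingly, a filter's detach routine now uses these helpers rather than open-coded SLIST operations; a hypothetical detach for the filterops sketched earlier (the softc type is illustrative):

/* Hypothetical per-device state carrying a kqinfo for its klist. */
struct mydev_softc {
	struct kqinfo	sc_kqinfo;
};

/*
 * Runs via knote_detach_and_drop() with kq_token held and the knote
 * marked KN_PROCESSING, exactly what knote_remove() asserts above.
 */
static void
mydev_filt_detach(struct knote *kn)
{
	struct mydev_softc *sc = (struct mydev_softc *)kn->kn_hook;

	knote_remove(&sc->sc_kqinfo.ki_note, kn);
}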
 
 /*
- * remove all knotes from a specified klist
+ * Remove all knotes from a specified klist
+ *
+ * Only called from aio.
  */
 void
-knote_remove(struct thread *td, struct klist *list)
+knote_empty(struct klist *list)
 {
        struct knote *kn;
 
+       lwkt_gettoken(&kq_token);
        while ((kn = SLIST_FIRST(list)) != NULL) {
-               kn->kn_fop->f_detach(kn);
-               knote_drop(kn, td);
+               if (knote_acquire(kn))
+                       knote_detach_and_drop(kn);
+       }
+       lwkt_reltoken(&kq_token);
+}
+
+void
+knote_assume_knotes(struct kqinfo *src, struct kqinfo *dst,
+                   struct filterops *ops, void *hook)
+{
+       struct knote *kn;
+
+       lwkt_gettoken(&kq_token);
+       while ((kn = SLIST_FIRST(&src->ki_note)) != NULL) {
+               if (knote_acquire(kn)) {
+                       knote_remove(&src->ki_note, kn);
+                       kn->kn_fop = ops;
+                       kn->kn_hook = hook;
+                       knote_insert(&dst->ki_note, kn);
+                       knote_release(kn);
+                       /* kn may be invalid now */
+               }
        }
+       lwkt_reltoken(&kq_token);
 }
 
 /*
- * remove all knotes referencing a specified fd
+ * Remove all knotes referencing a specified fd
  */
 void
-knote_fdclose(struct proc *p, int fd)
+knote_fdclose(struct file *fp, struct filedesc *fdp, int fd)
 {
-       struct filedesc *fdp = p->p_fd;
-       struct klist *list = &fdp->fd_knlist[fd];
-       /* Take any thread of p */
-       struct thread *td = FIRST_LWP_IN_PROC(p)->lwp_thread;
+       struct knote *kn;
 
-       knote_remove(td, list);
+       lwkt_gettoken(&kq_token);
+restart:
+       SLIST_FOREACH(kn, &fp->f_klist, kn_link) {
+               if (kn->kn_kq->kq_fdp == fdp && kn->kn_id == fd) {
+                       if (knote_acquire(kn))
+                               knote_detach_and_drop(kn);
+                       goto restart;
+               }
+       }
+       lwkt_reltoken(&kq_token);
 }
 
+/*
+ * Low level attach function.
+ *
+ * The knote should already be marked for processing.
+ */
 static void
-knote_attach(struct knote *kn, struct filedesc *fdp)
+knote_attach(struct knote *kn)
 {
        struct klist *list;
-       int size;
-
-       if (! kn->kn_fop->f_isfd) {
-               if (fdp->fd_knhashmask == 0)
-                       fdp->fd_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
-                           &fdp->fd_knhashmask);
-               list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
-               goto done;
-       }
+       struct kqueue *kq = kn->kn_kq;
 
-       if (fdp->fd_knlistsize <= kn->kn_id) {
-               size = fdp->fd_knlistsize;
-               while (size <= kn->kn_id)
-                       size += KQEXTENT;
-               MALLOC(list, struct klist *,
-                   size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
-               bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list,
-                   fdp->fd_knlistsize * sizeof(struct klist *));
-               bzero((caddr_t)list +
-                   fdp->fd_knlistsize * sizeof(struct klist *),
-                   (size - fdp->fd_knlistsize) * sizeof(struct klist *));
-               if (fdp->fd_knlist != NULL)
-                       FREE(fdp->fd_knlist, M_KQUEUE);
-               fdp->fd_knlistsize = size;
-               fdp->fd_knlist = list;
+       if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
+               KKASSERT(kn->kn_fp);
+               list = &kn->kn_fp->f_klist;
+       } else {
+               if (kq->kq_knhashmask == 0)
+                       kq->kq_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
+                                                &kq->kq_knhashmask);
+               list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
        }
-       list = &fdp->fd_knlist[kn->kn_id];
-done:
        SLIST_INSERT_HEAD(list, kn, kn_link);
-       kn->kn_status = 0;
+       TAILQ_INSERT_HEAD(&kq->kq_knlist, kn, kn_kqlink);
 }
 
 /*
- * should be called outside of a critical section, since we don't want to
- * hold a critical section while calling fdrop and free.
+ * Low level drop function.
+ *
+ * The knote should already be marked for processing.
  */
 static void
-knote_drop(struct knote *kn, struct thread *td)
+knote_drop(struct knote *kn)
 {
-        struct filedesc *fdp;
+       struct kqueue *kq;
        struct klist *list;
 
-       KKASSERT(td->td_proc);
-        fdp = td->td_proc->p_fd;
-       if (kn->kn_fop->f_isfd)
-               list = &fdp->fd_knlist[kn->kn_id];
+       kq = kn->kn_kq;
+
+       if (kn->kn_fop->f_flags & FILTEROP_ISFD)
+               list = &kn->kn_fp->f_klist;
        else
-               list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
+               list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
 
        SLIST_REMOVE(list, kn, knote, kn_link);
+       TAILQ_REMOVE(&kq->kq_knlist, kn, kn_kqlink);
        if (kn->kn_status & KN_QUEUED)
                knote_dequeue(kn);
-       if (kn->kn_fop->f_isfd)
+       if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
                fdrop(kn->kn_fp);
+               kn->kn_fp = NULL;
+       }
        knote_free(kn);
 }
 
-
+/*
+ * Low level enqueue function.
+ *
+ * The knote should already be marked for processing.
+ */
 static void
 knote_enqueue(struct knote *kn)
 {
        struct kqueue *kq = kn->kn_kq;
 
-       crit_enter();
        KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
-
-       TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); 
+       TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe);
        kn->kn_status |= KN_QUEUED;
        ++kq->kq_count;
 
@@ -1018,22 +1503,24 @@ knote_enqueue(struct knote *kn)
         */
        if (kq->kq_sigio && (kq->kq_state & KQ_ASYNC) && kq->kq_count == 1)
                pgsigio(kq->kq_sigio, SIGIO, 0);
-       crit_exit();
+
        kqueue_wakeup(kq);
 }
 
+/*
+ * Low level dequeue function.
+ *
+ * The knote should already be marked for processing.
+ */
 static void
 knote_dequeue(struct knote *kn)
 {
        struct kqueue *kq = kn->kn_kq;
 
        KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
-       crit_enter();
-
-       TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); 
+       TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
        kn->kn_status &= ~KN_QUEUED;
        kq->kq_count--;
-       crit_exit();
 }
 
 static void