| 1 | /*- |
| 2 | * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org> |
| 3 | * All rights reserved. |
| 4 | * |
| 5 | * Redistribution and use in source and binary forms, with or without |
| 6 | * modification, are permitted provided that the following conditions |
| 7 | * are met: |
| 8 | * 1. Redistributions of source code must retain the above copyright |
| 9 | * notice, this list of conditions and the following disclaimer. |
| 10 | * 2. Redistributions in binary form must reproduce the above copyright |
| 11 | * notice, this list of conditions and the following disclaimer in the |
| 12 | * documentation and/or other materials provided with the distribution. |
| 13 | * |
| 14 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
| 15 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 17 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
| 18 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 19 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 20 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 21 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 22 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 23 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 24 | * SUCH DAMAGE. |
| 25 | * |
| 26 | * $FreeBSD: src/sys/kern/kern_event.c,v 1.2.2.10 2004/04/04 07:03:14 cperciva Exp $ |
| 27 | * $DragonFly: src/sys/kern/kern_event.c,v 1.33 2007/02/03 17:05:57 corecode Exp $ |
| 28 | */ |
| 29 | |
| 30 | #include <sys/param.h> |
| 31 | #include <sys/systm.h> |
| 32 | #include <sys/kernel.h> |
| 33 | #include <sys/proc.h> |
| 34 | #include <sys/malloc.h> |
| 35 | #include <sys/unistd.h> |
| 36 | #include <sys/file.h> |
| 37 | #include <sys/lock.h> |
| 38 | #include <sys/fcntl.h> |
| 39 | #include <sys/queue.h> |
| 40 | #include <sys/event.h> |
| 41 | #include <sys/eventvar.h> |
| 42 | #include <sys/protosw.h> |
| 43 | #include <sys/socket.h> |
| 44 | #include <sys/socketvar.h> |
| 45 | #include <sys/stat.h> |
| 46 | #include <sys/sysctl.h> |
| 47 | #include <sys/sysproto.h> |
| 48 | #include <sys/thread.h> |
| 49 | #include <sys/uio.h> |
| 50 | #include <sys/signalvar.h> |
| 51 | #include <sys/filio.h> |
| 52 | #include <sys/ktr.h> |
| 53 | |
| 54 | #include <sys/thread2.h> |
| 55 | #include <sys/file2.h> |
| 56 | #include <sys/mplock2.h> |
| 57 | |
| 58 | /* |
| 59 | * Global token for kqueue subsystem |
| 60 | */ |
| 61 | struct lwkt_token kq_token = LWKT_TOKEN_INITIALIZER(kq_token); |
| 62 | SYSCTL_LONG(_lwkt, OID_AUTO, kq_collisions, |
| 63 | CTLFLAG_RW, &kq_token.t_collisions, 0, |
| 64 | "Collision counter of kq_token"); |
| 65 | |
| 66 | MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); |
| 67 | |
| 68 | struct kevent_copyin_args { |
| 69 | struct kevent_args *ka; |
| 70 | int pchanges; |
| 71 | }; |
| 72 | |
| 73 | static int kqueue_sleep(struct kqueue *kq, struct timespec *tsp); |
| 74 | static int kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count, |
| 75 | struct knote *marker); |
| 76 | static int kqueue_read(struct file *fp, struct uio *uio, |
| 77 | struct ucred *cred, int flags); |
| 78 | static int kqueue_write(struct file *fp, struct uio *uio, |
| 79 | struct ucred *cred, int flags); |
| 80 | static int kqueue_ioctl(struct file *fp, u_long com, caddr_t data, |
| 81 | struct ucred *cred, struct sysmsg *msg); |
| 82 | static int kqueue_kqfilter(struct file *fp, struct knote *kn); |
| 83 | static int kqueue_stat(struct file *fp, struct stat *st, |
| 84 | struct ucred *cred); |
| 85 | static int kqueue_close(struct file *fp); |
| 86 | static void kqueue_wakeup(struct kqueue *kq); |
| 87 | static int filter_attach(struct knote *kn); |
| 88 | static int filter_event(struct knote *kn, long hint); |
| 89 | |
| 90 | /* |
| 91 | * MPSAFE |
| 92 | */ |
| 93 | static struct fileops kqueueops = { |
| 94 | .fo_read = kqueue_read, |
| 95 | .fo_write = kqueue_write, |
| 96 | .fo_ioctl = kqueue_ioctl, |
| 97 | .fo_kqfilter = kqueue_kqfilter, |
| 98 | .fo_stat = kqueue_stat, |
| 99 | .fo_close = kqueue_close, |
| 100 | .fo_shutdown = nofo_shutdown |
| 101 | }; |
| 102 | |
| 103 | static void knote_attach(struct knote *kn); |
| 104 | static void knote_drop(struct knote *kn); |
| 105 | static void knote_detach_and_drop(struct knote *kn); |
| 106 | static void knote_enqueue(struct knote *kn); |
| 107 | static void knote_dequeue(struct knote *kn); |
| 108 | static struct knote *knote_alloc(void); |
| 109 | static void knote_free(struct knote *kn); |
| 110 | |
| 111 | static void filt_kqdetach(struct knote *kn); |
| 112 | static int filt_kqueue(struct knote *kn, long hint); |
| 113 | static int filt_procattach(struct knote *kn); |
| 114 | static void filt_procdetach(struct knote *kn); |
| 115 | static int filt_proc(struct knote *kn, long hint); |
| 116 | static int filt_fileattach(struct knote *kn); |
| 117 | static void filt_timerexpire(void *knx); |
| 118 | static int filt_timerattach(struct knote *kn); |
| 119 | static void filt_timerdetach(struct knote *kn); |
| 120 | static int filt_timer(struct knote *kn, long hint); |
| 121 | |
| 122 | static struct filterops file_filtops = |
| 123 | { FILTEROP_ISFD, filt_fileattach, NULL, NULL }; |
| 124 | static struct filterops kqread_filtops = |
| 125 | { FILTEROP_ISFD, NULL, filt_kqdetach, filt_kqueue }; |
| 126 | static struct filterops proc_filtops = |
| 127 | { 0, filt_procattach, filt_procdetach, filt_proc }; |
| 128 | static struct filterops timer_filtops = |
| 129 | { 0, filt_timerattach, filt_timerdetach, filt_timer }; |
| 130 | |
| 131 | static int kq_ncallouts = 0; |
| 132 | static int kq_calloutmax = (4 * 1024); |
| 133 | SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, |
| 134 | &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); |
| 135 | static int kq_checkloop = 1000000; |
| 136 | SYSCTL_INT(_kern, OID_AUTO, kq_checkloop, CTLFLAG_RW, |
	&kq_checkloop, 0, "Maximum number of loops for kqueue scan");
| 138 | |
#define KNOTE_ACTIVATE(kn) do { 					\
	(kn)->kn_status |= KN_ACTIVE;					\
	if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)	\
		knote_enqueue(kn);					\
} while(0)
| 144 | |
| 145 | #define KN_HASHSIZE 64 /* XXX should be tunable */ |
#define KN_HASH(val, mask)	(((val) ^ ((val) >> 8)) & (mask))
| 147 | |
| 148 | extern struct filterops aio_filtops; |
| 149 | extern struct filterops sig_filtops; |
| 150 | |
| 151 | /* |
 * Table for all system-defined filters.
| 153 | */ |
| 154 | static struct filterops *sysfilt_ops[] = { |
| 155 | &file_filtops, /* EVFILT_READ */ |
| 156 | &file_filtops, /* EVFILT_WRITE */ |
| 157 | &aio_filtops, /* EVFILT_AIO */ |
| 158 | &file_filtops, /* EVFILT_VNODE */ |
| 159 | &proc_filtops, /* EVFILT_PROC */ |
| 160 | &sig_filtops, /* EVFILT_SIGNAL */ |
| 161 | &timer_filtops, /* EVFILT_TIMER */ |
| 162 | &file_filtops, /* EVFILT_EXCEPT */ |
| 163 | }; |
| 164 | |
| 165 | static int |
| 166 | filt_fileattach(struct knote *kn) |
| 167 | { |
| 168 | return (fo_kqfilter(kn->kn_fp, kn)); |
| 169 | } |
| 170 | |
| 171 | /* |
| 172 | * MPSAFE |
| 173 | */ |
| 174 | static int |
| 175 | kqueue_kqfilter(struct file *fp, struct knote *kn) |
| 176 | { |
| 177 | struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; |
| 178 | |
| 179 | if (kn->kn_filter != EVFILT_READ) |
| 180 | return (EOPNOTSUPP); |
| 181 | |
| 182 | kn->kn_fop = &kqread_filtops; |
| 183 | knote_insert(&kq->kq_kqinfo.ki_note, kn); |
| 184 | return (0); |
| 185 | } |
| 186 | |
| 187 | static void |
| 188 | filt_kqdetach(struct knote *kn) |
| 189 | { |
| 190 | struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; |
| 191 | |
| 192 | knote_remove(&kq->kq_kqinfo.ki_note, kn); |
| 193 | } |
| 194 | |
| 195 | /*ARGSUSED*/ |
| 196 | static int |
| 197 | filt_kqueue(struct knote *kn, long hint) |
| 198 | { |
| 199 | struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; |
| 200 | |
| 201 | kn->kn_data = kq->kq_count; |
| 202 | return (kn->kn_data > 0); |
| 203 | } |
| 204 | |
| 205 | static int |
| 206 | filt_procattach(struct knote *kn) |
| 207 | { |
| 208 | struct proc *p; |
| 209 | int immediate; |
| 210 | |
| 211 | immediate = 0; |
| 212 | p = pfind(kn->kn_id); |
| 213 | if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) { |
| 214 | p = zpfind(kn->kn_id); |
| 215 | immediate = 1; |
| 216 | } |
| 217 | if (p == NULL) { |
| 218 | return (ESRCH); |
| 219 | } |
	if (!PRISON_CHECK(curthread->td_ucred, p->p_ucred)) {
		PRELE(p);
		return (EACCES);
	}
| 225 | |
| 226 | lwkt_gettoken(&p->p_token); |
| 227 | kn->kn_ptr.p_proc = p; |
| 228 | kn->kn_flags |= EV_CLEAR; /* automatically set */ |
| 229 | |
| 230 | /* |
| 231 | * internal flag indicating registration done by kernel |
| 232 | */ |
| 233 | if (kn->kn_flags & EV_FLAG1) { |
| 234 | kn->kn_data = kn->kn_sdata; /* ppid */ |
| 235 | kn->kn_fflags = NOTE_CHILD; |
| 236 | kn->kn_flags &= ~EV_FLAG1; |
| 237 | } |
| 238 | |
| 239 | knote_insert(&p->p_klist, kn); |
| 240 | |
| 241 | /* |
| 242 | * Immediately activate any exit notes if the target process is a |
| 243 | * zombie. This is necessary to handle the case where the target |
	 * process, e.g. a child, dies before the kevent is registered.
| 245 | */ |
| 246 | if (immediate && filt_proc(kn, NOTE_EXIT)) |
| 247 | KNOTE_ACTIVATE(kn); |
| 248 | lwkt_reltoken(&p->p_token); |
| 249 | PRELE(p); |
| 250 | |
| 251 | return (0); |
| 252 | } |
| 253 | |
| 254 | /* |
| 255 | * The knote may be attached to a different process, which may exit, |
| 256 | * leaving nothing for the knote to be attached to. So when the process |
| 257 | * exits, the knote is marked as DETACHED and also flagged as ONESHOT so |
| 258 | * it will be deleted when read out. However, as part of the knote deletion, |
| 259 | * this routine is called, so a check is needed to avoid actually performing |
| 260 | * a detach, because the original process does not exist any more. |
| 261 | */ |
| 262 | static void |
| 263 | filt_procdetach(struct knote *kn) |
| 264 | { |
| 265 | struct proc *p; |
| 266 | |
| 267 | if (kn->kn_status & KN_DETACHED) |
| 268 | return; |
| 269 | /* XXX locking? take proc_token here? */ |
| 270 | p = kn->kn_ptr.p_proc; |
| 271 | knote_remove(&p->p_klist, kn); |
| 272 | } |
| 273 | |
| 274 | static int |
| 275 | filt_proc(struct knote *kn, long hint) |
| 276 | { |
| 277 | u_int event; |
| 278 | |
| 279 | /* |
| 280 | * mask off extra data |
| 281 | */ |
| 282 | event = (u_int)hint & NOTE_PCTRLMASK; |
| 283 | |
| 284 | /* |
| 285 | * if the user is interested in this event, record it. |
| 286 | */ |
| 287 | if (kn->kn_sfflags & event) |
| 288 | kn->kn_fflags |= event; |
| 289 | |
| 290 | /* |
| 291 | * Process is gone, so flag the event as finished. Detach the |
	 * knote from the process now because the process itself will be
	 * gone later on.
| 294 | */ |
| 295 | if (event == NOTE_EXIT) { |
| 296 | struct proc *p = kn->kn_ptr.p_proc; |
| 297 | if ((kn->kn_status & KN_DETACHED) == 0) { |
| 298 | knote_remove(&p->p_klist, kn); |
| 299 | kn->kn_status |= KN_DETACHED; |
| 300 | kn->kn_data = p->p_xstat; |
| 301 | kn->kn_ptr.p_proc = NULL; |
| 302 | } |
| 303 | kn->kn_flags |= (EV_EOF | EV_ONESHOT); |
| 304 | return (1); |
| 305 | } |
| 306 | |
| 307 | /* |
| 308 | * process forked, and user wants to track the new process, |
| 309 | * so attach a new knote to it, and immediately report an |
| 310 | * event with the parent's pid. |
| 311 | */ |
| 312 | if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) { |
| 313 | struct kevent kev; |
| 314 | int error; |
| 315 | |
| 316 | /* |
| 317 | * register knote with new process. |
| 318 | */ |
| 319 | kev.ident = hint & NOTE_PDATAMASK; /* pid */ |
| 320 | kev.filter = kn->kn_filter; |
| 321 | kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; |
| 322 | kev.fflags = kn->kn_sfflags; |
| 323 | kev.data = kn->kn_id; /* parent */ |
| 324 | kev.udata = kn->kn_kevent.udata; /* preserve udata */ |
| 325 | error = kqueue_register(kn->kn_kq, &kev); |
| 326 | if (error) |
| 327 | kn->kn_fflags |= NOTE_TRACKERR; |
| 328 | } |
| 329 | |
| 330 | return (kn->kn_fflags != 0); |
| 331 | } |
| 332 | |
| 333 | /* |
| 334 | * The callout interlocks with callout_terminate() but can still |
| 335 | * race a deletion so if KN_DELETING is set we just don't touch |
| 336 | * the knote. |
| 337 | */ |
| 338 | static void |
| 339 | filt_timerexpire(void *knx) |
| 340 | { |
| 341 | struct knote *kn = knx; |
| 342 | struct callout *calloutp; |
| 343 | struct timeval tv; |
| 344 | int tticks; |
| 345 | |
| 346 | lwkt_gettoken(&kq_token); |
| 347 | if ((kn->kn_status & KN_DELETING) == 0) { |
| 348 | kn->kn_data++; |
| 349 | KNOTE_ACTIVATE(kn); |
| 350 | |
| 351 | if ((kn->kn_flags & EV_ONESHOT) == 0) { |
| 352 | tv.tv_sec = kn->kn_sdata / 1000; |
| 353 | tv.tv_usec = (kn->kn_sdata % 1000) * 1000; |
| 354 | tticks = tvtohz_high(&tv); |
| 355 | calloutp = (struct callout *)kn->kn_hook; |
| 356 | callout_reset(calloutp, tticks, filt_timerexpire, kn); |
| 357 | } |
| 358 | } |
| 359 | lwkt_reltoken(&kq_token); |
| 360 | } |
| 361 | |
| 362 | /* |
| 363 | * data contains amount of time to sleep, in milliseconds |
| 364 | */ |
| 365 | static int |
| 366 | filt_timerattach(struct knote *kn) |
| 367 | { |
| 368 | struct callout *calloutp; |
| 369 | struct timeval tv; |
| 370 | int tticks; |
| 371 | |
| 372 | if (kq_ncallouts >= kq_calloutmax) { |
| 373 | kn->kn_hook = NULL; |
| 374 | return (ENOMEM); |
| 375 | } |
| 376 | kq_ncallouts++; |
| 377 | |
| 378 | tv.tv_sec = kn->kn_sdata / 1000; |
| 379 | tv.tv_usec = (kn->kn_sdata % 1000) * 1000; |
| 380 | tticks = tvtohz_high(&tv); |
| 381 | |
| 382 | kn->kn_flags |= EV_CLEAR; /* automatically set */ |
	calloutp = kmalloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK);
| 385 | callout_init(calloutp); |
| 386 | kn->kn_hook = (caddr_t)calloutp; |
| 387 | callout_reset(calloutp, tticks, filt_timerexpire, kn); |
| 388 | |
| 389 | return (0); |
| 390 | } |
| 391 | |
| 392 | /* |
 * This function is called with the knote flagged as locked, but it is
| 394 | * still possible to race a callout event due to the callback blocking. |
| 395 | * We must call callout_terminate() instead of callout_stop() to deal |
| 396 | * with the race. |
| 397 | */ |
| 398 | static void |
| 399 | filt_timerdetach(struct knote *kn) |
| 400 | { |
| 401 | struct callout *calloutp; |
| 402 | |
| 403 | calloutp = (struct callout *)kn->kn_hook; |
| 404 | callout_terminate(calloutp); |
	kfree(calloutp, M_KQUEUE);
| 406 | kq_ncallouts--; |
| 407 | } |
| 408 | |
| 409 | static int |
| 410 | filt_timer(struct knote *kn, long hint) |
| 411 | { |
| 412 | |
| 413 | return (kn->kn_data != 0); |
| 414 | } |
| 415 | |
| 416 | /* |
| 417 | * Acquire a knote, return non-zero on success, 0 on failure. |
| 418 | * |
| 419 | * If we cannot acquire the knote we sleep and return 0. The knote |
| 420 | * may be stale on return in this case and the caller must restart |
| 421 | * whatever loop they are in. |
| 422 | */ |
| 423 | static __inline |
| 424 | int |
| 425 | knote_acquire(struct knote *kn) |
| 426 | { |
| 427 | if (kn->kn_status & KN_PROCESSING) { |
| 428 | kn->kn_status |= KN_WAITING | KN_REPROCESS; |
| 429 | tsleep(kn, 0, "kqepts", hz); |
| 430 | /* knote may be stale now */ |
| 431 | return(0); |
| 432 | } |
| 433 | kn->kn_status |= KN_PROCESSING; |
| 434 | return(1); |
| 435 | } |
| 436 | |
| 437 | /* |
| 438 | * Release an acquired knote, clearing KN_PROCESSING and handling any |
| 439 | * KN_REPROCESS events. |
| 440 | * |
| 441 | * Non-zero is returned if the knote is destroyed. |
| 442 | */ |
| 443 | static __inline |
| 444 | int |
| 445 | knote_release(struct knote *kn) |
| 446 | { |
| 447 | while (kn->kn_status & KN_REPROCESS) { |
| 448 | kn->kn_status &= ~KN_REPROCESS; |
| 449 | if (kn->kn_status & KN_WAITING) { |
| 450 | kn->kn_status &= ~KN_WAITING; |
| 451 | wakeup(kn); |
| 452 | } |
| 453 | if (kn->kn_status & KN_DELETING) { |
| 454 | knote_detach_and_drop(kn); |
| 455 | return(1); |
| 456 | /* NOT REACHED */ |
| 457 | } |
| 458 | if (filter_event(kn, 0)) |
| 459 | KNOTE_ACTIVATE(kn); |
| 460 | } |
| 461 | kn->kn_status &= ~KN_PROCESSING; |
| 462 | return(0); |
| 463 | } |
| 464 | |
| 465 | /* |
| 466 | * Initialize a kqueue. |
| 467 | * |
| 468 | * NOTE: The lwp/proc code initializes a kqueue for select/poll ops. |
| 469 | * |
| 470 | * MPSAFE |
| 471 | */ |
| 472 | void |
| 473 | kqueue_init(struct kqueue *kq, struct filedesc *fdp) |
| 474 | { |
| 475 | TAILQ_INIT(&kq->kq_knpend); |
| 476 | TAILQ_INIT(&kq->kq_knlist); |
| 477 | kq->kq_count = 0; |
| 478 | kq->kq_fdp = fdp; |
| 479 | SLIST_INIT(&kq->kq_kqinfo.ki_note); |
| 480 | } |
| 481 | |
| 482 | /* |
| 483 | * Terminate a kqueue. Freeing the actual kq itself is left up to the |
| 484 | * caller (it might be embedded in a lwp so we don't do it here). |
| 485 | * |
| 486 | * The kq's knlist must be completely eradicated so block on any |
| 487 | * processing races. |
| 488 | */ |
| 489 | void |
| 490 | kqueue_terminate(struct kqueue *kq) |
| 491 | { |
| 492 | struct knote *kn; |
| 493 | |
| 494 | lwkt_gettoken(&kq_token); |
| 495 | while ((kn = TAILQ_FIRST(&kq->kq_knlist)) != NULL) { |
| 496 | if (knote_acquire(kn)) |
| 497 | knote_detach_and_drop(kn); |
| 498 | } |
| 499 | if (kq->kq_knhash) { |
| 500 | kfree(kq->kq_knhash, M_KQUEUE); |
| 501 | kq->kq_knhash = NULL; |
| 502 | kq->kq_knhashmask = 0; |
| 503 | } |
| 504 | lwkt_reltoken(&kq_token); |
| 505 | } |
| 506 | |
| 507 | /* |
| 508 | * MPSAFE |
| 509 | */ |
| 510 | int |
| 511 | sys_kqueue(struct kqueue_args *uap) |
| 512 | { |
| 513 | struct thread *td = curthread; |
| 514 | struct kqueue *kq; |
| 515 | struct file *fp; |
| 516 | int fd, error; |
| 517 | |
| 518 | error = falloc(td->td_lwp, &fp, &fd); |
| 519 | if (error) |
| 520 | return (error); |
| 521 | fp->f_flag = FREAD | FWRITE; |
| 522 | fp->f_type = DTYPE_KQUEUE; |
| 523 | fp->f_ops = &kqueueops; |
| 524 | |
| 525 | kq = kmalloc(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO); |
| 526 | kqueue_init(kq, td->td_proc->p_fd); |
| 527 | fp->f_data = kq; |
| 528 | |
| 529 | fsetfd(kq->kq_fdp, fp, fd); |
| 530 | uap->sysmsg_result = fd; |
| 531 | fdrop(fp); |
| 532 | return (error); |
| 533 | } |
| 534 | |
| 535 | /* |
| 536 | * Copy 'count' items into the destination list pointed to by uap->eventlist. |
| 537 | */ |
| 538 | static int |
| 539 | kevent_copyout(void *arg, struct kevent *kevp, int count, int *res) |
| 540 | { |
| 541 | struct kevent_copyin_args *kap; |
| 542 | int error; |
| 543 | |
| 544 | kap = (struct kevent_copyin_args *)arg; |
| 545 | |
| 546 | error = copyout(kevp, kap->ka->eventlist, count * sizeof(*kevp)); |
| 547 | if (error == 0) { |
| 548 | kap->ka->eventlist += count; |
| 549 | *res += count; |
| 550 | } else { |
| 551 | *res = -1; |
| 552 | } |
| 553 | |
| 554 | return (error); |
| 555 | } |
| 556 | |
| 557 | /* |
| 558 | * Copy at most 'max' items from the list pointed to by kap->changelist, |
| 559 | * return number of items in 'events'. |
| 560 | */ |
| 561 | static int |
| 562 | kevent_copyin(void *arg, struct kevent *kevp, int max, int *events) |
| 563 | { |
| 564 | struct kevent_copyin_args *kap; |
| 565 | int error, count; |
| 566 | |
| 567 | kap = (struct kevent_copyin_args *)arg; |
| 568 | |
| 569 | count = min(kap->ka->nchanges - kap->pchanges, max); |
	error = copyin(kap->ka->changelist, kevp, count * sizeof(*kevp));
| 571 | if (error == 0) { |
| 572 | kap->ka->changelist += count; |
| 573 | kap->pchanges += count; |
| 574 | *events = count; |
| 575 | } |
| 576 | |
| 577 | return (error); |
| 578 | } |
| 579 | |
| 580 | /* |
| 581 | * MPSAFE |
| 582 | */ |
| 583 | int |
| 584 | kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap, |
| 585 | k_copyin_fn kevent_copyinfn, k_copyout_fn kevent_copyoutfn, |
| 586 | struct timespec *tsp_in) |
| 587 | { |
| 588 | struct kevent *kevp; |
| 589 | struct timespec *tsp; |
| 590 | int i, n, total, error, nerrors = 0; |
| 591 | int lres; |
| 592 | int limit = kq_checkloop; |
| 593 | struct kevent kev[KQ_NEVENTS]; |
| 594 | struct knote marker; |
| 595 | |
| 596 | tsp = tsp_in; |
| 597 | *res = 0; |
| 598 | |
| 599 | lwkt_gettoken(&kq_token); |
| 600 | for ( ;; ) { |
| 601 | n = 0; |
| 602 | error = kevent_copyinfn(uap, kev, KQ_NEVENTS, &n); |
| 603 | if (error) |
| 604 | goto done; |
| 605 | if (n == 0) |
| 606 | break; |
| 607 | for (i = 0; i < n; i++) { |
| 608 | kevp = &kev[i]; |
| 609 | kevp->flags &= ~EV_SYSFLAGS; |
| 610 | error = kqueue_register(kq, kevp); |
| 611 | |
| 612 | /* |
| 613 | * If a registration returns an error we |
| 614 | * immediately post the error. The kevent() |
| 615 | * call itself will fail with the error if |
| 616 | * no space is available for posting. |
| 617 | * |
| 618 | * Such errors normally bypass the timeout/blocking |
| 619 | * code. However, if the copyoutfn function refuses |
| 620 | * to post the error (see sys_poll()), then we |
| 621 | * ignore it too. |
| 622 | */ |
| 623 | if (error) { |
| 624 | kevp->flags = EV_ERROR; |
| 625 | kevp->data = error; |
| 626 | lres = *res; |
| 627 | kevent_copyoutfn(uap, kevp, 1, res); |
| 628 | if (*res < 0) { |
| 629 | goto done; |
| 630 | } else if (lres != *res) { |
| 631 | nevents--; |
| 632 | nerrors++; |
| 633 | } |
| 634 | } |
| 635 | } |
| 636 | } |
| 637 | if (nerrors) { |
| 638 | error = 0; |
| 639 | goto done; |
| 640 | } |
| 641 | |
| 642 | /* |
| 643 | * Acquire/wait for events - setup timeout |
| 644 | */ |
| 645 | if (tsp != NULL) { |
| 646 | struct timespec ats; |
| 647 | |
| 648 | if (tsp->tv_sec || tsp->tv_nsec) { |
| 649 | nanouptime(&ats); |
| 650 | timespecadd(tsp, &ats); /* tsp = target time */ |
| 651 | } |
| 652 | } |
| 653 | |
| 654 | /* |
| 655 | * Loop as required. |
| 656 | * |
| 657 | * Collect as many events as we can. Sleeping on successive |
| 658 | * loops is disabled if copyoutfn has incremented (*res). |
| 659 | * |
| 660 | * The loop stops if an error occurs, all events have been |
| 661 | * scanned (the marker has been reached), or fewer than the |
| 662 | * maximum number of events is found. |
| 663 | * |
| 664 | * The copyoutfn function does not have to increment (*res) in |
| 665 | * order for the loop to continue. |
| 666 | * |
| 667 | * NOTE: doselect() usually passes 0x7FFFFFFF for nevents. |
| 668 | */ |
| 669 | total = 0; |
| 670 | error = 0; |
| 671 | marker.kn_filter = EVFILT_MARKER; |
| 672 | marker.kn_status = KN_PROCESSING; |
| 673 | TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe); |
| 674 | while ((n = nevents - total) > 0) { |
| 675 | if (n > KQ_NEVENTS) |
| 676 | n = KQ_NEVENTS; |
| 677 | |
| 678 | /* |
| 679 | * If no events are pending sleep until timeout (if any) |
| 680 | * or an event occurs. |
| 681 | * |
| 682 | * After the sleep completes the marker is moved to the |
| 683 | * end of the list, making any received events available |
| 684 | * to our scan. |
| 685 | */ |
| 686 | if (kq->kq_count == 0 && *res == 0) { |
| 687 | error = kqueue_sleep(kq, tsp); |
| 688 | if (error) |
| 689 | break; |
| 690 | |
| 691 | TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe); |
| 692 | TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe); |
| 693 | } |
| 694 | |
| 695 | /* |
| 696 | * Process all received events |
| 697 | * Account for all non-spurious events in our total |
| 698 | */ |
| 699 | i = kqueue_scan(kq, kev, n, &marker); |
| 700 | if (i) { |
| 701 | lres = *res; |
| 702 | error = kevent_copyoutfn(uap, kev, i, res); |
| 703 | total += *res - lres; |
| 704 | if (error) |
| 705 | break; |
| 706 | } |
| 707 | if (limit && --limit == 0) |
| 708 | panic("kqueue: checkloop failed i=%d", i); |
| 709 | |
| 710 | /* |
| 711 | * Normally when fewer events are returned than requested |
| 712 | * we can stop. However, if only spurious events were |
| 713 | * collected the copyout will not bump (*res) and we have |
| 714 | * to continue. |
| 715 | */ |
| 716 | if (i < n && *res) |
| 717 | break; |
| 718 | |
| 719 | /* |
| 720 | * Deal with an edge case where spurious events can cause |
| 721 | * a loop to occur without moving the marker. This can |
| 722 | * prevent kqueue_scan() from picking up new events which |
| 723 | * race us. We must be sure to move the marker for this |
| 724 | * case. |
| 725 | * |
| 726 | * NOTE: We do not want to move the marker if events |
| 727 | * were scanned because normal kqueue operations |
| 728 | * may reactivate events. Moving the marker in |
| 729 | * that case could result in duplicates for the |
| 730 | * same event. |
| 731 | */ |
| 732 | if (i == 0) { |
| 733 | TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe); |
| 734 | TAILQ_INSERT_TAIL(&kq->kq_knpend, &marker, kn_tqe); |
| 735 | } |
| 736 | } |
| 737 | TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe); |
| 738 | |
| 739 | /* Timeouts do not return EWOULDBLOCK. */ |
| 740 | if (error == EWOULDBLOCK) |
| 741 | error = 0; |
| 742 | |
| 743 | done: |
| 744 | lwkt_reltoken(&kq_token); |
| 745 | return (error); |
| 746 | } |
| 747 | |
| 748 | /* |
| 749 | * MPALMOSTSAFE |
| 750 | */ |
| 751 | int |
| 752 | sys_kevent(struct kevent_args *uap) |
| 753 | { |
| 754 | struct thread *td = curthread; |
| 755 | struct proc *p = td->td_proc; |
| 756 | struct timespec ts, *tsp; |
| 757 | struct kqueue *kq; |
| 758 | struct file *fp = NULL; |
| 759 | struct kevent_copyin_args *kap, ka; |
| 760 | int error; |
| 761 | |
| 762 | if (uap->timeout) { |
| 763 | error = copyin(uap->timeout, &ts, sizeof(ts)); |
| 764 | if (error) |
| 765 | return (error); |
| 766 | tsp = &ts; |
| 767 | } else { |
| 768 | tsp = NULL; |
| 769 | } |
| 770 | |
| 771 | fp = holdfp(p->p_fd, uap->fd, -1); |
| 772 | if (fp == NULL) |
| 773 | return (EBADF); |
| 774 | if (fp->f_type != DTYPE_KQUEUE) { |
| 775 | fdrop(fp); |
| 776 | return (EBADF); |
| 777 | } |
| 778 | |
| 779 | kq = (struct kqueue *)fp->f_data; |
| 780 | |
| 781 | kap = &ka; |
| 782 | kap->ka = uap; |
| 783 | kap->pchanges = 0; |
| 784 | |
| 785 | error = kern_kevent(kq, uap->nevents, &uap->sysmsg_result, kap, |
| 786 | kevent_copyin, kevent_copyout, tsp); |
| 787 | |
| 788 | fdrop(fp); |
| 789 | |
| 790 | return (error); |
| 791 | } |
| 792 | |
| 793 | int |
| 794 | kqueue_register(struct kqueue *kq, struct kevent *kev) |
| 795 | { |
| 796 | struct filedesc *fdp = kq->kq_fdp; |
| 797 | struct filterops *fops; |
| 798 | struct file *fp = NULL; |
| 799 | struct knote *kn = NULL; |
| 800 | int error = 0; |
| 801 | |
| 802 | if (kev->filter < 0) { |
| 803 | if (kev->filter + EVFILT_SYSCOUNT < 0) |
| 804 | return (EINVAL); |
| 805 | fops = sysfilt_ops[~kev->filter]; /* to 0-base index */ |
| 806 | } else { |
| 807 | /* |
| 808 | * XXX |
		 * filter attach routine is responsible for ensuring that
| 810 | * the identifier can be attached to it. |
| 811 | */ |
| 812 | kprintf("unknown filter: %d\n", kev->filter); |
| 813 | return (EINVAL); |
| 814 | } |
| 815 | |
| 816 | lwkt_gettoken(&kq_token); |
| 817 | if (fops->f_flags & FILTEROP_ISFD) { |
| 818 | /* validate descriptor */ |
| 819 | fp = holdfp(fdp, kev->ident, -1); |
| 820 | if (fp == NULL) { |
| 821 | lwkt_reltoken(&kq_token); |
| 822 | return (EBADF); |
| 823 | } |
| 824 | |
| 825 | again1: |
| 826 | SLIST_FOREACH(kn, &fp->f_klist, kn_link) { |
| 827 | if (kn->kn_kq == kq && |
| 828 | kn->kn_filter == kev->filter && |
| 829 | kn->kn_id == kev->ident) { |
| 830 | if (knote_acquire(kn) == 0) |
| 831 | goto again1; |
| 832 | break; |
| 833 | } |
| 834 | } |
| 835 | } else { |
| 836 | if (kq->kq_knhashmask) { |
| 837 | struct klist *list; |
| 838 | |
| 839 | list = &kq->kq_knhash[ |
| 840 | KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; |
| 841 | again2: |
| 842 | SLIST_FOREACH(kn, list, kn_link) { |
| 843 | if (kn->kn_id == kev->ident && |
| 844 | kn->kn_filter == kev->filter) { |
| 845 | if (knote_acquire(kn) == 0) |
| 846 | goto again2; |
| 847 | break; |
| 848 | } |
| 849 | } |
| 850 | } |
| 851 | } |
| 852 | |
| 853 | /* |
| 854 | * NOTE: At this point if kn is non-NULL we will have acquired |
| 855 | * it and set KN_PROCESSING. |
| 856 | */ |
| 857 | if (kn == NULL && ((kev->flags & EV_ADD) == 0)) { |
| 858 | error = ENOENT; |
| 859 | goto done; |
| 860 | } |
| 861 | |
| 862 | /* |
| 863 | * kn now contains the matching knote, or NULL if no match |
| 864 | */ |
| 865 | if (kev->flags & EV_ADD) { |
| 866 | if (kn == NULL) { |
| 867 | kn = knote_alloc(); |
| 868 | if (kn == NULL) { |
| 869 | error = ENOMEM; |
| 870 | goto done; |
| 871 | } |
| 872 | kn->kn_fp = fp; |
| 873 | kn->kn_kq = kq; |
| 874 | kn->kn_fop = fops; |
| 875 | |
| 876 | /* |
| 877 | * apply reference count to knote structure, and |
| 878 | * do not release it at the end of this routine. |
| 879 | */ |
| 880 | fp = NULL; |
| 881 | |
| 882 | kn->kn_sfflags = kev->fflags; |
| 883 | kn->kn_sdata = kev->data; |
| 884 | kev->fflags = 0; |
| 885 | kev->data = 0; |
| 886 | kn->kn_kevent = *kev; |
| 887 | |
| 888 | /* |
| 889 | * KN_PROCESSING prevents the knote from getting |
| 890 | * ripped out from under us while we are trying |
| 891 | * to attach it, in case the attach blocks. |
| 892 | */ |
| 893 | kn->kn_status = KN_PROCESSING; |
| 894 | knote_attach(kn); |
| 895 | if ((error = filter_attach(kn)) != 0) { |
| 896 | kn->kn_status |= KN_DELETING | KN_REPROCESS; |
| 897 | knote_drop(kn); |
| 898 | goto done; |
| 899 | } |
| 900 | |
| 901 | /* |
| 902 | * Interlock against close races which either tried |
| 903 | * to remove our knote while we were blocked or missed |
| 904 | * it entirely prior to our attachment. We do not |
| 905 | * want to end up with a knote on a closed descriptor. |
| 906 | */ |
| 907 | if ((fops->f_flags & FILTEROP_ISFD) && |
| 908 | checkfdclosed(fdp, kev->ident, kn->kn_fp)) { |
| 909 | kn->kn_status |= KN_DELETING | KN_REPROCESS; |
| 910 | } |
| 911 | } else { |
| 912 | /* |
| 913 | * The user may change some filter values after the |
| 914 | * initial EV_ADD, but doing so will not reset any |
			 * filters which have already been triggered.
| 916 | */ |
| 917 | KKASSERT(kn->kn_status & KN_PROCESSING); |
| 918 | kn->kn_sfflags = kev->fflags; |
| 919 | kn->kn_sdata = kev->data; |
| 920 | kn->kn_kevent.udata = kev->udata; |
| 921 | } |
| 922 | |
| 923 | /* |
| 924 | * Execute the filter event to immediately activate the |
| 925 | * knote if necessary. If reprocessing events are pending |
| 926 | * due to blocking above we do not run the filter here |
| 927 | * but instead let knote_release() do it. Otherwise we |
| 928 | * might run the filter on a deleted event. |
| 929 | */ |
| 930 | if ((kn->kn_status & KN_REPROCESS) == 0) { |
| 931 | if (filter_event(kn, 0)) |
| 932 | KNOTE_ACTIVATE(kn); |
| 933 | } |
| 934 | } else if (kev->flags & EV_DELETE) { |
| 935 | /* |
| 936 | * Delete the existing knote |
| 937 | */ |
| 938 | knote_detach_and_drop(kn); |
| 939 | goto done; |
| 940 | } |
| 941 | |
| 942 | /* |
| 943 | * Disablement does not deactivate a knote here. |
| 944 | */ |
| 945 | if ((kev->flags & EV_DISABLE) && |
| 946 | ((kn->kn_status & KN_DISABLED) == 0)) { |
| 947 | kn->kn_status |= KN_DISABLED; |
| 948 | } |
| 949 | |
| 950 | /* |
| 951 | * Re-enablement may have to immediately enqueue an active knote. |
| 952 | */ |
| 953 | if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) { |
| 954 | kn->kn_status &= ~KN_DISABLED; |
| 955 | if ((kn->kn_status & KN_ACTIVE) && |
| 956 | ((kn->kn_status & KN_QUEUED) == 0)) { |
| 957 | knote_enqueue(kn); |
| 958 | } |
| 959 | } |
| 960 | |
| 961 | /* |
| 962 | * Handle any required reprocessing |
| 963 | */ |
| 964 | knote_release(kn); |
| 965 | /* kn may be invalid now */ |
| 966 | |
| 967 | done: |
| 968 | lwkt_reltoken(&kq_token); |
| 969 | if (fp != NULL) |
| 970 | fdrop(fp); |
| 971 | return (error); |
| 972 | } |
| 973 | |
| 974 | /* |
| 975 | * Block as necessary until the target time is reached. |
 * If tsp is NULL we block indefinitely.  If tsp->tv_sec/tv_nsec are both
| 977 | * 0 we do not block at all. |
| 978 | */ |
| 979 | static int |
| 980 | kqueue_sleep(struct kqueue *kq, struct timespec *tsp) |
| 981 | { |
| 982 | int error = 0; |
| 983 | |
| 984 | if (tsp == NULL) { |
| 985 | kq->kq_state |= KQ_SLEEP; |
| 986 | error = tsleep(kq, PCATCH, "kqread", 0); |
| 987 | } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) { |
| 988 | error = EWOULDBLOCK; |
| 989 | } else { |
| 990 | struct timespec ats; |
| 991 | struct timespec atx = *tsp; |
| 992 | int timeout; |
| 993 | |
| 994 | nanouptime(&ats); |
| 995 | timespecsub(&atx, &ats); |
		if (atx.tv_sec < 0) {
| 997 | error = EWOULDBLOCK; |
| 998 | } else { |
| 999 | timeout = atx.tv_sec > 24 * 60 * 60 ? |
| 1000 | 24 * 60 * 60 * hz : tstohz_high(&atx); |
| 1001 | kq->kq_state |= KQ_SLEEP; |
| 1002 | error = tsleep(kq, PCATCH, "kqread", timeout); |
| 1003 | } |
| 1004 | } |
| 1005 | |
| 1006 | /* don't restart after signals... */ |
| 1007 | if (error == ERESTART) |
| 1008 | return (EINTR); |
| 1009 | |
| 1010 | return (error); |
| 1011 | } |
| 1012 | |
| 1013 | /* |
| 1014 | * Scan the kqueue, return the number of active events placed in kevp up |
| 1015 | * to count. |
| 1016 | * |
| 1017 | * Continuous mode events may get recycled, do not continue scanning past |
| 1018 | * marker unless no events have been collected. |
| 1019 | */ |
| 1020 | static int |
| 1021 | kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count, |
| 1022 | struct knote *marker) |
| 1023 | { |
| 1024 | struct knote *kn, local_marker; |
| 1025 | int total; |
| 1026 | |
| 1027 | total = 0; |
| 1028 | local_marker.kn_filter = EVFILT_MARKER; |
| 1029 | local_marker.kn_status = KN_PROCESSING; |
| 1030 | |
| 1031 | /* |
| 1032 | * Collect events. |
| 1033 | */ |
| 1034 | TAILQ_INSERT_HEAD(&kq->kq_knpend, &local_marker, kn_tqe); |
| 1035 | while (count) { |
| 1036 | kn = TAILQ_NEXT(&local_marker, kn_tqe); |
| 1037 | if (kn->kn_filter == EVFILT_MARKER) { |
| 1038 | /* Marker reached, we are done */ |
| 1039 | if (kn == marker) |
| 1040 | break; |
| 1041 | |
			/* Move local marker past some other thread's marker */
| 1043 | kn = TAILQ_NEXT(kn, kn_tqe); |
| 1044 | TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe); |
| 1045 | TAILQ_INSERT_BEFORE(kn, &local_marker, kn_tqe); |
| 1046 | continue; |
| 1047 | } |
| 1048 | |
| 1049 | /* |
| 1050 | * We can't skip a knote undergoing processing, otherwise |
		 * we risk not returning it when the user process expects
		 * it to be returned.  Sleep and retry.
| 1053 | */ |
| 1054 | if (knote_acquire(kn) == 0) |
| 1055 | continue; |
| 1056 | |
| 1057 | /* |
| 1058 | * Remove the event for processing. |
| 1059 | * |
| 1060 | * WARNING! We must leave KN_QUEUED set to prevent the |
| 1061 | * event from being KNOTE_ACTIVATE()d while |
| 1062 | * the queue state is in limbo, in case we |
| 1063 | * block. |
| 1064 | * |
| 1065 | * WARNING! We must set KN_PROCESSING to avoid races |
| 1066 | * against deletion or another thread's |
| 1067 | * processing. |
| 1068 | */ |
| 1069 | TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe); |
| 1070 | kq->kq_count--; |
| 1071 | |
| 1072 | /* |
| 1073 | * We have to deal with an extremely important race against |
		 * file descriptor close()s here.  The file descriptor can
		 * be closed out from under us MPSAFE (without the MP lock
		 * held), leaving a small window of opportunity between
		 * that and the call to knote_fdclose().
| 1077 | * |
| 1078 | * If we hit that window here while doselect or dopoll is |
| 1079 | * trying to delete a spurious event they will not be able |
| 1080 | * to match up the event against a knote and will go haywire. |
| 1081 | */ |
| 1082 | if ((kn->kn_fop->f_flags & FILTEROP_ISFD) && |
| 1083 | checkfdclosed(kq->kq_fdp, kn->kn_kevent.ident, kn->kn_fp)) { |
| 1084 | kn->kn_status |= KN_DELETING | KN_REPROCESS; |
| 1085 | } |
| 1086 | |
| 1087 | if (kn->kn_status & KN_DISABLED) { |
| 1088 | /* |
| 1089 | * If disabled we ensure the event is not queued |
| 1090 | * but leave its active bit set. On re-enablement |
| 1091 | * the event may be immediately triggered. |
| 1092 | */ |
| 1093 | kn->kn_status &= ~KN_QUEUED; |
| 1094 | } else if ((kn->kn_flags & EV_ONESHOT) == 0 && |
| 1095 | (kn->kn_status & KN_DELETING) == 0 && |
| 1096 | filter_event(kn, 0) == 0) { |
| 1097 | /* |
| 1098 | * If not running in one-shot mode and the event |
| 1099 | * is no longer present we ensure it is removed |
| 1100 | * from the queue and ignore it. |
| 1101 | */ |
| 1102 | kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); |
| 1103 | } else { |
| 1104 | /* |
| 1105 | * Post the event |
| 1106 | */ |
| 1107 | *kevp++ = kn->kn_kevent; |
| 1108 | ++total; |
| 1109 | --count; |
| 1110 | |
| 1111 | if (kn->kn_flags & EV_ONESHOT) { |
| 1112 | kn->kn_status &= ~KN_QUEUED; |
| 1113 | kn->kn_status |= KN_DELETING | KN_REPROCESS; |
| 1114 | } else if (kn->kn_flags & EV_CLEAR) { |
| 1115 | kn->kn_data = 0; |
| 1116 | kn->kn_fflags = 0; |
| 1117 | kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); |
| 1118 | } else { |
| 1119 | TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe); |
| 1120 | kq->kq_count++; |
| 1121 | } |
| 1122 | } |
| 1123 | |
| 1124 | /* |
| 1125 | * Handle any post-processing states |
| 1126 | */ |
| 1127 | knote_release(kn); |
| 1128 | } |
| 1129 | TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe); |
| 1130 | |
| 1131 | return (total); |
| 1132 | } |
| 1133 | |
| 1134 | /* |
| 1135 | * XXX |
| 1136 | * This could be expanded to call kqueue_scan, if desired. |
| 1137 | * |
| 1138 | * MPSAFE |
| 1139 | */ |
| 1140 | static int |
| 1141 | kqueue_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags) |
| 1142 | { |
| 1143 | return (ENXIO); |
| 1144 | } |
| 1145 | |
| 1146 | /* |
| 1147 | * MPSAFE |
| 1148 | */ |
| 1149 | static int |
| 1150 | kqueue_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags) |
| 1151 | { |
| 1152 | return (ENXIO); |
| 1153 | } |
| 1154 | |
| 1155 | /* |
| 1156 | * MPALMOSTSAFE |
| 1157 | */ |
| 1158 | static int |
| 1159 | kqueue_ioctl(struct file *fp, u_long com, caddr_t data, |
| 1160 | struct ucred *cred, struct sysmsg *msg) |
| 1161 | { |
| 1162 | struct kqueue *kq; |
| 1163 | int error; |
| 1164 | |
| 1165 | lwkt_gettoken(&kq_token); |
| 1166 | kq = (struct kqueue *)fp->f_data; |
| 1167 | |
| 1168 | switch(com) { |
| 1169 | case FIOASYNC: |
| 1170 | if (*(int *)data) |
| 1171 | kq->kq_state |= KQ_ASYNC; |
| 1172 | else |
| 1173 | kq->kq_state &= ~KQ_ASYNC; |
| 1174 | error = 0; |
| 1175 | break; |
| 1176 | case FIOSETOWN: |
| 1177 | error = fsetown(*(int *)data, &kq->kq_sigio); |
| 1178 | break; |
| 1179 | default: |
| 1180 | error = ENOTTY; |
| 1181 | break; |
| 1182 | } |
| 1183 | lwkt_reltoken(&kq_token); |
| 1184 | return (error); |
| 1185 | } |
| 1186 | |
| 1187 | /* |
| 1188 | * MPSAFE |
| 1189 | */ |
| 1190 | static int |
| 1191 | kqueue_stat(struct file *fp, struct stat *st, struct ucred *cred) |
| 1192 | { |
| 1193 | struct kqueue *kq = (struct kqueue *)fp->f_data; |
| 1194 | |
| 1195 | bzero((void *)st, sizeof(*st)); |
| 1196 | st->st_size = kq->kq_count; |
| 1197 | st->st_blksize = sizeof(struct kevent); |
| 1198 | st->st_mode = S_IFIFO; |
| 1199 | return (0); |
| 1200 | } |
| 1201 | |
| 1202 | /* |
| 1203 | * MPSAFE |
| 1204 | */ |
| 1205 | static int |
| 1206 | kqueue_close(struct file *fp) |
| 1207 | { |
| 1208 | struct kqueue *kq = (struct kqueue *)fp->f_data; |
| 1209 | |
| 1210 | kqueue_terminate(kq); |
| 1211 | |
| 1212 | fp->f_data = NULL; |
| 1213 | funsetown(&kq->kq_sigio); |
| 1214 | |
| 1215 | kfree(kq, M_KQUEUE); |
| 1216 | return (0); |
| 1217 | } |
| 1218 | |
| 1219 | static void |
| 1220 | kqueue_wakeup(struct kqueue *kq) |
| 1221 | { |
| 1222 | if (kq->kq_state & KQ_SLEEP) { |
| 1223 | kq->kq_state &= ~KQ_SLEEP; |
| 1224 | wakeup(kq); |
| 1225 | } |
| 1226 | KNOTE(&kq->kq_kqinfo.ki_note, 0); |
| 1227 | } |
| 1228 | |
| 1229 | /* |
| 1230 | * Calls filterops f_attach function, acquiring mplock if filter is not |
| 1231 | * marked as FILTEROP_MPSAFE. |
| 1232 | */ |
| 1233 | static int |
| 1234 | filter_attach(struct knote *kn) |
| 1235 | { |
| 1236 | int ret; |
| 1237 | |
| 1238 | if (!(kn->kn_fop->f_flags & FILTEROP_MPSAFE)) { |
| 1239 | get_mplock(); |
| 1240 | ret = kn->kn_fop->f_attach(kn); |
| 1241 | rel_mplock(); |
| 1242 | } else { |
| 1243 | ret = kn->kn_fop->f_attach(kn); |
| 1244 | } |
| 1245 | |
| 1246 | return (ret); |
| 1247 | } |
| 1248 | |
| 1249 | /* |
| 1250 | * Detach the knote and drop it, destroying the knote. |
| 1251 | * |
| 1252 | * Calls filterops f_detach function, acquiring mplock if filter is not |
| 1253 | * marked as FILTEROP_MPSAFE. |
| 1254 | */ |
| 1255 | static void |
| 1256 | knote_detach_and_drop(struct knote *kn) |
| 1257 | { |
| 1258 | kn->kn_status |= KN_DELETING | KN_REPROCESS; |
| 1259 | if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) { |
| 1260 | kn->kn_fop->f_detach(kn); |
| 1261 | } else { |
| 1262 | get_mplock(); |
| 1263 | kn->kn_fop->f_detach(kn); |
| 1264 | rel_mplock(); |
| 1265 | } |
| 1266 | knote_drop(kn); |
| 1267 | } |
| 1268 | |
| 1269 | /* |
| 1270 | * Calls filterops f_event function, acquiring mplock if filter is not |
| 1271 | * marked as FILTEROP_MPSAFE. |
| 1272 | * |
| 1273 | * If the knote is in the middle of being created or deleted we cannot |
| 1274 | * safely call the filter op. |
| 1275 | */ |
| 1276 | static int |
| 1277 | filter_event(struct knote *kn, long hint) |
| 1278 | { |
| 1279 | int ret; |
| 1280 | |
| 1281 | if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) { |
| 1282 | ret = kn->kn_fop->f_event(kn, hint); |
| 1283 | } else { |
| 1284 | get_mplock(); |
| 1285 | ret = kn->kn_fop->f_event(kn, hint); |
| 1286 | rel_mplock(); |
| 1287 | } |
| 1288 | return (ret); |
| 1289 | } |
| 1290 | |
| 1291 | /* |
| 1292 | * Walk down a list of knotes, activating them if their event has triggered. |
| 1293 | * |
| 1294 | * If we encounter any knotes which are undergoing processing we just mark |
| 1295 | * them for reprocessing and do not try to [re]activate the knote. However, |
| 1296 | * if a hint is being passed we have to wait and that makes things a bit |
| 1297 | * sticky. |
| 1298 | */ |
| 1299 | void |
| 1300 | knote(struct klist *list, long hint) |
| 1301 | { |
| 1302 | struct knote *kn; |
| 1303 | |
| 1304 | lwkt_gettoken(&kq_token); |
| 1305 | restart: |
| 1306 | SLIST_FOREACH(kn, list, kn_next) { |
| 1307 | if (kn->kn_status & KN_PROCESSING) { |
| 1308 | /* |
| 1309 | * Someone else is processing the knote, ask the |
| 1310 | * other thread to reprocess it and don't mess |
| 1311 | * with it otherwise. |
| 1312 | */ |
| 1313 | if (hint == 0) { |
| 1314 | kn->kn_status |= KN_REPROCESS; |
| 1315 | continue; |
| 1316 | } |
| 1317 | |
| 1318 | /* |
| 1319 | * If the hint is non-zero we have to wait or risk |
| 1320 | * losing the state the caller is trying to update. |
| 1321 | * |
| 1322 | * XXX This is a real problem, certain process |
| 1323 | * and signal filters will bump kn_data for |
| 1324 | * already-processed notes more than once if |
| 1325 | * we restart the list scan. FIXME. |
| 1326 | */ |
| 1327 | kn->kn_status |= KN_WAITING | KN_REPROCESS; |
| 1328 | tsleep(kn, 0, "knotec", hz); |
| 1329 | goto restart; |
| 1330 | } |
| 1331 | |
| 1332 | /* |
| 1333 | * Become the reprocessing master ourselves. |
| 1334 | * |
		 * If the hint is non-zero, running the event is mandatory
		 * when not deleting, so do it whether reprocessing is
| 1337 | * set or not. |
| 1338 | */ |
| 1339 | kn->kn_status |= KN_PROCESSING; |
| 1340 | if ((kn->kn_status & KN_DELETING) == 0) { |
| 1341 | if (filter_event(kn, hint)) |
| 1342 | KNOTE_ACTIVATE(kn); |
| 1343 | } |
| 1344 | if (knote_release(kn)) |
| 1345 | goto restart; |
| 1346 | } |
| 1347 | lwkt_reltoken(&kq_token); |
| 1348 | } |
| 1349 | |
| 1350 | /* |
| 1351 | * Insert knote at head of klist. |
| 1352 | * |
| 1353 | * This function may only be called via a filter function and thus |
| 1354 | * kq_token should already be held and marked for processing. |
| 1355 | */ |
| 1356 | void |
| 1357 | knote_insert(struct klist *klist, struct knote *kn) |
| 1358 | { |
| 1359 | KKASSERT(kn->kn_status & KN_PROCESSING); |
| 1360 | ASSERT_LWKT_TOKEN_HELD(&kq_token); |
| 1361 | SLIST_INSERT_HEAD(klist, kn, kn_next); |
| 1362 | } |
| 1363 | |
| 1364 | /* |
| 1365 | * Remove knote from a klist |
| 1366 | * |
| 1367 | * This function may only be called via a filter function and thus |
| 1368 | * kq_token should already be held and marked for processing. |
| 1369 | */ |
| 1370 | void |
| 1371 | knote_remove(struct klist *klist, struct knote *kn) |
| 1372 | { |
| 1373 | KKASSERT(kn->kn_status & KN_PROCESSING); |
| 1374 | ASSERT_LWKT_TOKEN_HELD(&kq_token); |
| 1375 | SLIST_REMOVE(klist, kn, knote, kn_next); |
| 1376 | } |
| 1377 | |
| 1378 | /* |
| 1379 | * Remove all knotes from a specified klist |
| 1380 | * |
| 1381 | * Only called from aio. |
| 1382 | */ |
| 1383 | void |
| 1384 | knote_empty(struct klist *list) |
| 1385 | { |
| 1386 | struct knote *kn; |
| 1387 | |
| 1388 | lwkt_gettoken(&kq_token); |
| 1389 | while ((kn = SLIST_FIRST(list)) != NULL) { |
| 1390 | if (knote_acquire(kn)) |
| 1391 | knote_detach_and_drop(kn); |
| 1392 | } |
| 1393 | lwkt_reltoken(&kq_token); |
| 1394 | } |
| 1395 | |
| 1396 | void |
| 1397 | knote_assume_knotes(struct kqinfo *src, struct kqinfo *dst, |
| 1398 | struct filterops *ops, void *hook) |
| 1399 | { |
| 1400 | struct knote *kn; |
| 1401 | |
| 1402 | lwkt_gettoken(&kq_token); |
| 1403 | while ((kn = SLIST_FIRST(&src->ki_note)) != NULL) { |
| 1404 | if (knote_acquire(kn)) { |
| 1405 | knote_remove(&src->ki_note, kn); |
| 1406 | kn->kn_fop = ops; |
| 1407 | kn->kn_hook = hook; |
| 1408 | knote_insert(&dst->ki_note, kn); |
| 1409 | knote_release(kn); |
| 1410 | /* kn may be invalid now */ |
| 1411 | } |
| 1412 | } |
| 1413 | lwkt_reltoken(&kq_token); |
| 1414 | } |
| 1415 | |
| 1416 | /* |
| 1417 | * Remove all knotes referencing a specified fd |
| 1418 | */ |
| 1419 | void |
| 1420 | knote_fdclose(struct file *fp, struct filedesc *fdp, int fd) |
| 1421 | { |
| 1422 | struct knote *kn; |
| 1423 | |
| 1424 | lwkt_gettoken(&kq_token); |
| 1425 | restart: |
| 1426 | SLIST_FOREACH(kn, &fp->f_klist, kn_link) { |
| 1427 | if (kn->kn_kq->kq_fdp == fdp && kn->kn_id == fd) { |
| 1428 | if (knote_acquire(kn)) |
| 1429 | knote_detach_and_drop(kn); |
| 1430 | goto restart; |
| 1431 | } |
| 1432 | } |
| 1433 | lwkt_reltoken(&kq_token); |
| 1434 | } |
| 1435 | |
| 1436 | /* |
| 1437 | * Low level attach function. |
| 1438 | * |
| 1439 | * The knote should already be marked for processing. |
| 1440 | */ |
| 1441 | static void |
| 1442 | knote_attach(struct knote *kn) |
| 1443 | { |
| 1444 | struct klist *list; |
| 1445 | struct kqueue *kq = kn->kn_kq; |
| 1446 | |
| 1447 | if (kn->kn_fop->f_flags & FILTEROP_ISFD) { |
| 1448 | KKASSERT(kn->kn_fp); |
| 1449 | list = &kn->kn_fp->f_klist; |
| 1450 | } else { |
| 1451 | if (kq->kq_knhashmask == 0) |
| 1452 | kq->kq_knhash = hashinit(KN_HASHSIZE, M_KQUEUE, |
| 1453 | &kq->kq_knhashmask); |
| 1454 | list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; |
| 1455 | } |
| 1456 | SLIST_INSERT_HEAD(list, kn, kn_link); |
| 1457 | TAILQ_INSERT_HEAD(&kq->kq_knlist, kn, kn_kqlink); |
| 1458 | } |
| 1459 | |
| 1460 | /* |
| 1461 | * Low level drop function. |
| 1462 | * |
| 1463 | * The knote should already be marked for processing. |
| 1464 | */ |
| 1465 | static void |
| 1466 | knote_drop(struct knote *kn) |
| 1467 | { |
| 1468 | struct kqueue *kq; |
| 1469 | struct klist *list; |
| 1470 | |
| 1471 | kq = kn->kn_kq; |
| 1472 | |
| 1473 | if (kn->kn_fop->f_flags & FILTEROP_ISFD) |
| 1474 | list = &kn->kn_fp->f_klist; |
| 1475 | else |
| 1476 | list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; |
| 1477 | |
| 1478 | SLIST_REMOVE(list, kn, knote, kn_link); |
| 1479 | TAILQ_REMOVE(&kq->kq_knlist, kn, kn_kqlink); |
| 1480 | if (kn->kn_status & KN_QUEUED) |
| 1481 | knote_dequeue(kn); |
| 1482 | if (kn->kn_fop->f_flags & FILTEROP_ISFD) { |
| 1483 | fdrop(kn->kn_fp); |
| 1484 | kn->kn_fp = NULL; |
| 1485 | } |
| 1486 | knote_free(kn); |
| 1487 | } |
| 1488 | |
| 1489 | /* |
| 1490 | * Low level enqueue function. |
| 1491 | * |
| 1492 | * The knote should already be marked for processing. |
| 1493 | */ |
| 1494 | static void |
| 1495 | knote_enqueue(struct knote *kn) |
| 1496 | { |
| 1497 | struct kqueue *kq = kn->kn_kq; |
| 1498 | |
| 1499 | KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued")); |
| 1500 | TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe); |
| 1501 | kn->kn_status |= KN_QUEUED; |
| 1502 | ++kq->kq_count; |
| 1503 | |
| 1504 | /* |
| 1505 | * Send SIGIO on request (typically set up as a mailbox signal) |
| 1506 | */ |
| 1507 | if (kq->kq_sigio && (kq->kq_state & KQ_ASYNC) && kq->kq_count == 1) |
| 1508 | pgsigio(kq->kq_sigio, SIGIO, 0); |
| 1509 | |
| 1510 | kqueue_wakeup(kq); |
| 1511 | } |
| 1512 | |
| 1513 | /* |
| 1514 | * Low level dequeue function. |
| 1515 | * |
| 1516 | * The knote should already be marked for processing. |
| 1517 | */ |
| 1518 | static void |
| 1519 | knote_dequeue(struct knote *kn) |
| 1520 | { |
| 1521 | struct kqueue *kq = kn->kn_kq; |
| 1522 | |
| 1523 | KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); |
| 1524 | TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe); |
| 1525 | kn->kn_status &= ~KN_QUEUED; |
| 1526 | kq->kq_count--; |
| 1527 | } |
| 1528 | |
| 1529 | static struct knote * |
| 1530 | knote_alloc(void) |
| 1531 | { |
| 1532 | return kmalloc(sizeof(struct knote), M_KQUEUE, M_WAITOK); |
| 1533 | } |
| 1534 | |
| 1535 | static void |
| 1536 | knote_free(struct knote *kn) |
| 1537 | { |
| 1538 | kfree(kn, M_KQUEUE); |
| 1539 | } |