gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1997 John S. Dyson. All rights reserved.
	3	*
	4	* Redistribution and use in source and binary forms, with or without
	5	* modification, are permitted provided that the following conditions
	6	* are met:
	7	* 1. Redistributions of source code must retain the above copyright
	8	* notice, this list of conditions and the following disclaimer.
	9	* 2. John S. Dyson's name may not be used to endorse or promote products
	10	* derived from this software without specific prior written permission.
	11	*
	12	* DISCLAIMER: This code isn't warranted to do anything useful. Anything
	13	* bad that happens because of using this software isn't the responsibility
	14	* of the author. This software is distributed AS-IS.
	15	*
	16	* $FreeBSD: src/sys/kern/vfs_aio.c,v 1.70.2.28 2003/05/29 06:15:35 alc Exp $
	17	* $DragonFly: src/sys/kern/vfs_aio.c,v 1.4 2003/06/23 17:55:41 dillon Exp $
	18	*/
	19
	20	/*
	21	* This file contains support for the POSIX 1003.1B AIO/LIO facility.
	22	*/
	23
	24	#include <sys/param.h>
	25	#include <sys/systm.h>
	26	#include <sys/buf.h>
	27	#include <sys/sysproto.h>
	28	#include <sys/filedesc.h>
	29	#include <sys/kernel.h>
	30	#include <sys/fcntl.h>
	31	#include <sys/file.h>
	32	#include <sys/lock.h>
	33	#include <sys/unistd.h>
	34	#include <sys/proc.h>
	35	#include <sys/resourcevar.h>
	36	#include <sys/signalvar.h>
	37	#include <sys/protosw.h>
	38	#include <sys/socketvar.h>
	39	#include <sys/sysctl.h>
	40	#include <sys/vnode.h>
	41	#include <sys/conf.h>
	42	#include <sys/event.h>
	43
	44	#include <vm/vm.h>
	45	#include <vm/vm_extern.h>
	46	#include <vm/pmap.h>
	47	#include <vm/vm_map.h>
	48	#include <vm/vm_zone.h>
	49	#include <sys/aio.h>
	50
	51	#include <machine/limits.h>
	52	#include "opt_vfs_aio.h"
	53
	54	#ifdef VFS_AIO
	55
	56	/*
	57	* Counter for allocating reference ids to new jobs. Wrapped to 1 on
	58	* overflow. Seeded with 1 by aio_onceonly().
	59	*/
	60	static long jobrefid;
	61
	62	#define JOBST_NULL 0x0 /* entry is free; set by aio_free_entry() */
	63	#define JOBST_JOBQGLOBAL 0x2 /* queued, not yet picked up by an aio daemon */
	64	#define JOBST_JOBRUNNING 0x3 /* being processed by an aio daemon */
	65	#define JOBST_JOBFINISHED 0x4 /* daemon job done; on kaio_jobdone */
	66	#define JOBST_JOBQBUF 0x5 /* physio request issued; on kaio_bufqueue */
	67	#define JOBST_JOBBFINISHED 0x6 /* physio request done; on kaio_bufdone */
	68
	69	#ifndef MAX_AIO_PER_PROC
	70	#define MAX_AIO_PER_PROC 32 /* initial value of max_aio_per_proc */
	71	#endif
	72
	73	#ifndef MAX_AIO_QUEUE_PER_PROC
	74	#define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */
	75	#endif
	76
	77	#ifndef MAX_AIO_PROCS
	78	#define MAX_AIO_PROCS 32 /* initial value of max_aio_procs */
	79	#endif
	80
	81	#ifndef MAX_AIO_QUEUE
	82	#define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */
	83	#endif
	84
	85	#ifndef TARGET_AIO_PROCS
	86	#define TARGET_AIO_PROCS 4 /* initial value of target_aio_procs */
	87	#endif
	88
	89	#ifndef MAX_BUF_AIO
	90	#define MAX_BUF_AIO 16 /* initial value of max_buf_aio */
	91	#endif
	92
	93	#ifndef AIOD_TIMEOUT_DEFAULT
	94	#define AIOD_TIMEOUT_DEFAULT (10 * hz) /* assigned to aiod_timeout in aio_onceonly() */
	95	#endif
	96
	97	#ifndef AIOD_LIFETIME_DEFAULT
	98	#define AIOD_LIFETIME_DEFAULT (30 * hz) /* assigned to aiod_lifetime in aio_onceonly() */
	99	#endif
	100
	101	SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");
	102
	103	static int max_aio_procs = MAX_AIO_PROCS;
	104	SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
	105	CTLFLAG_RW, &max_aio_procs, 0,
	106	"Maximum number of kernel threads to use for handling async IO");
	107
	108	static int num_aio_procs = 0; /* bumped by aio_newproc(), dropped when an aiod exits */
	109	SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
	110	CTLFLAG_RD, &num_aio_procs, 0,
	111	"Number of presently active kernel threads for async IO");
	112
	113	/*
	114	* The code will adjust the actual number of AIO processes towards this
	115	* number when it gets a chance.
	116	*/
	117	static int target_aio_procs = TARGET_AIO_PROCS;
	118	SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
	119	0, "Preferred number of ready kernel threads for async IO");
	120
	121	static int max_queue_count = MAX_AIO_QUEUE;
	122	SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
	123	"Maximum number of aio requests to queue, globally");
	124
	125	static int num_queue_count = 0;
	126	SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
	127	"Number of queued aio requests");
	128
	129	static int num_buf_aio = 0;
	130	SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
	131	"Number of aio requests presently handled by the buf subsystem");
	132
	133	/* Number of async I/O threads in the process of being started */
	134	/* XXX This should be local to _aio_aqueue() */
	135	static int num_aio_resv_start = 0;
	136
	137	static int aiod_timeout; /* tsleep() timeout; set at boot by aio_onceonly() */
	138	SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0,
	139	"Timeout value for synchronous aio operations");
	140
	141	static int aiod_lifetime; /* idle-daemon tsleep() timeout; set by aio_onceonly() */
	142	SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
	143	"Maximum lifetime for idle aiod");
	144
	145	static int max_aio_per_proc = MAX_AIO_PER_PROC; /* copied to kaio_maxactive_count */
	146	SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
	147	0, "Maximum active aio requests per process (stored in the process)");
	148
	149	static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; /* copied to kaio_qallowed_count */
	150	SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
	151	&max_aio_queue_per_proc, 0,
	152	"Maximum queued aio requests per process (stored in the process)");
	153
	154	static int max_buf_aio = MAX_BUF_AIO; /* copied to kaio_ballowed_count */
	155	SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
	156	"Maximum buf aio requests per process (stored in the process)");
	157
	158	/*
	159	 * AIO process info. One aioproclist exists per aio daemon (allocated in
	160	 * aio_daemon()) and is linked on either aio_freeproc or aio_activeproc.
	161	 */
	162	#define AIOP_FREE 0x1 /* proc on free queue */
	163	#define AIOP_SCHED 0x2 /* proc explicitly scheduled */
	164	struct aioproclist {
	165		int aioprocflags; /* AIO proc flags (AIOP_*) */
	166		TAILQ_ENTRY(aioproclist) list; /* List of processes */
	167		struct proc *aioproc; /* The AIO thread */
	168	};
	169
	170	/*
	171	* data-structure for lio signal management
	172	*/
	173	struct aio_liojob {
	174	int lioj_flags; /* LIOJ_* flags below */
	175	int lioj_buffer_count; /* physio jobs of this lio; bumped in aio_qphysio() */
	176	int lioj_buffer_finished_count; /* physio jobs of this lio that are done */
	177	int lioj_queue_count; /* non-physio jobs of this lio */
	178	int lioj_queue_finished_count; /* bumped by aio_daemon() when a job completes */
	179	struct sigevent lioj_signal; /* signal on all I/O done */
	180	TAILQ_ENTRY(aio_liojob) lioj_list; /* linkage on kaioinfo.kaio_liojoblist */
	181	struct kaioinfo *lioj_ki; /* presumably the owning kaioinfo -- not set in this part of the file */
	182	};
	183	#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */
	184	#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */
	185
	186	/*
	187	 * per process aio data structure; allocated by aio_init_aioinfo(), hung off
	188	 * p->p_aioinfo and released by aio_proc_rundown()
	189	 */ struct kaioinfo {
	190		int kaio_flags; /* per process kaio flags (KAIO_*) */
	191		int kaio_maxactive_count; /* maximum number of AIOs */
	192		int kaio_active_count; /* number of currently used AIOs */
	193		int kaio_qallowed_count; /* maximum size of AIO queue */
	194		int kaio_queue_count; /* size of AIO queue */
	195		int kaio_ballowed_count; /* maximum number of buffers */
	196		int kaio_queue_finished_count; /* number of daemon jobs finished */
	197		int kaio_buffer_count; /* number of physio buffers */
	198		int kaio_buffer_finished_count; /* count of I/O done */
	199		struct proc *kaio_p; /* process that uses this kaio block */
	200		TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */
	201		TAILQ_HEAD(,aiocblist) kaio_jobqueue; /* job queue for process */
	202		TAILQ_HEAD(,aiocblist) kaio_jobdone; /* done queue for process */
	203		TAILQ_HEAD(,aiocblist) kaio_bufqueue; /* buffer job queue for process */
	204		TAILQ_HEAD(,aiocblist) kaio_bufdone; /* buffer done queue for process */
	205		TAILQ_HEAD(,aiocblist) kaio_sockqueue; /* queue for aios waiting on sockets */
	206	};
	207
	208	#define KAIO_RUNDOWN 0x1 /* process is being run down */
	209	#define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */
	210
	211	static TAILQ_HEAD(,aioproclist) aio_freeproc, aio_activeproc; /* idle / busy aio daemons */
	212	static TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */
	213	static TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */
	214	static TAILQ_HEAD(,aiocblist) aio_freejobs; /* Pool of free jobs */
	215
	216	static void aio_init_aioinfo(struct proc *p);
	217	static void aio_onceonly(void *);
	218	static int aio_free_entry(struct aiocblist *aiocbe);
	219	static void aio_process(struct aiocblist *aiocbe);
	220	static int aio_newproc(void);
	221	static int aio_aqueue(struct aiocb *job, int type);
	222	static void aio_physwakeup(struct buf *bp);
	223	static int aio_fphysio(struct aiocblist *aiocbe);
	224	static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
	225	static void aio_daemon(void *uproc);
	226	static void process_signal(void *aioj);
	227
	228	SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);
	229
	230	/*
	231	 * Zones (all created in aio_onceonly()) for:
	232	 * kaio Per process async io info
	233	 * aiop async io thread data
	234	 * aiocb async io jobs
	235	 * aiol list io job pointer - internal to aio_suspend XXX
	236	 * aiolio list io jobs
	237	 */
	238	static vm_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
	239
	240	/*
	241	 * One-time setup run through SYSINIT: seed the hz-based tunables and the job id counter, create the zones, empty the global queues.
	242	 */
	243	static void
	244	aio_onceonly(void *na)
	245	{
	246		jobrefid = 1;
	247		aiod_lifetime = AIOD_LIFETIME_DEFAULT;
	248		aiod_timeout = AIOD_TIMEOUT_DEFAULT;
	249		kaio_zone = zinit("AIO", sizeof(struct kaioinfo), 0, 0, 1);
	250		aiop_zone = zinit("AIOP", sizeof(struct aioproclist), 0, 0, 1);
	251		aiocb_zone = zinit("AIOCB", sizeof(struct aiocblist), 0, 0, 1);
	252		aiol_zone = zinit("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t), 0, 0, 1);
	253		aiolio_zone = zinit("AIOLIO", sizeof(struct aio_liojob), 0, 0, 1);
	254		TAILQ_INIT(&aio_freeproc);
	255		TAILQ_INIT(&aio_activeproc);
	256		TAILQ_INIT(&aio_jobs);
	257		TAILQ_INIT(&aio_bufjobs);
	258		TAILQ_INIT(&aio_freejobs);
	259	}
	260
	261	/*
	262	 * Init the per-process aioinfo structure if p has none yet. The aioinfo limits are
	263	 * copied per-process for user limit (resource) management; daemons are then spawned up to target_aio_procs.
	264	 */
	265	static void
	266	aio_init_aioinfo(struct proc *p)
	267	{
	268		struct kaioinfo *ki;
	269		if (p->p_aioinfo == NULL) {
	270			ki = zalloc(kaio_zone);
	271			p->p_aioinfo = ki;
	272			ki->kaio_flags = 0;
	273			ki->kaio_maxactive_count = max_aio_per_proc;
	274			ki->kaio_active_count = 0;
	275			ki->kaio_qallowed_count = max_aio_queue_per_proc;
	276			ki->kaio_queue_count = 0;
	277			ki->kaio_queue_finished_count = 0; /* zeroed explicitly like every other counter */
	278			ki->kaio_ballowed_count = max_buf_aio;
	279			ki->kaio_buffer_count = 0;
	280			ki->kaio_buffer_finished_count = 0;
	281			ki->kaio_p = p;
	282			TAILQ_INIT(&ki->kaio_jobdone);
	283			TAILQ_INIT(&ki->kaio_jobqueue);
	284			TAILQ_INIT(&ki->kaio_bufdone);
	285			TAILQ_INIT(&ki->kaio_bufqueue);
	286			TAILQ_INIT(&ki->kaio_liojoblist);
	287			TAILQ_INIT(&ki->kaio_sockqueue);
	288		}
	289		while (num_aio_procs < target_aio_procs)
	290			aio_newproc();
	291	}
	292
	293	/*
	294	 * Free a job entry: sleep while it is JOBST_JOBRUNNING, fix up the lio,
	295	 * per-process and global counters, unlink it from the queue its jobstate implies and
	296	 * put it on aio_freejobs. Non-zero (the aio_fphysio() error) tells the caller to restart its queue scan.
	297	 */
	298	static int
	299	aio_free_entry(struct aiocblist *aiocbe)
	300	{
	301		struct kaioinfo *ki;
	302		struct aio_liojob *lj;
	303		struct proc *p;
	304		int error;
	305		int s;
	306
	307		if (aiocbe->jobstate == JOBST_NULL)
	308			panic("aio_free_entry: freeing already free job");
	309
	310		p = aiocbe->userproc;
	311		ki = p->p_aioinfo;
	312		lj = aiocbe->lio;
	313		if (ki == NULL)
	314			panic("aio_free_entry: missing p->p_aioinfo");
	315
	316		while (aiocbe->jobstate == JOBST_JOBRUNNING) {
	317			aiocbe->jobflags |= AIOCBLIST_RUNDOWN; /* makes aio_daemon() wakeup(aiocbe) */
	318			tsleep(aiocbe, PRIBIO, "jobwai", 0);
	319		}
	320		if (aiocbe->bp == NULL) { /* no physio buffer: accounted as a queue job */
	321			if (ki->kaio_queue_count <= 0)
	322				panic("aio_free_entry: process queue size <= 0");
	323			if (num_queue_count <= 0)
	324				panic("aio_free_entry: system wide queue size <= 0");
	325
	326			if (lj) {
	327				lj->lioj_queue_count--;
	328				if (aiocbe->jobflags & AIOCBLIST_DONE)
	329					lj->lioj_queue_finished_count--;
	330			}
	331			ki->kaio_queue_count--;
	332			if (aiocbe->jobflags & AIOCBLIST_DONE)
	333				ki->kaio_queue_finished_count--;
	334			num_queue_count--;
	335		} else { /* accounted as a buffer (physio) job */
	336			if (lj) {
	337				lj->lioj_buffer_count--;
	338				if (aiocbe->jobflags & AIOCBLIST_DONE)
	339					lj->lioj_buffer_finished_count--;
	340			}
	341			if (aiocbe->jobflags & AIOCBLIST_DONE)
	342				ki->kaio_buffer_finished_count--;
	343			ki->kaio_buffer_count--;
	344			num_buf_aio--;
	345		}
	346
	347		/* aiocbe is going away, we need to destroy any knotes */
	348		knote_remove(p, &aiocbe->klist);
	349
	350		if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
	351		    && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
	352			ki->kaio_flags &= ~KAIO_WAKEUP;
	353			wakeup(p);
	354		}
	355
	356		if (aiocbe->jobstate == JOBST_JOBQBUF) {
	357			if ((error = aio_fphysio(aiocbe)) != 0)
	358				return error; /* NOTE(review): the counters above were already adjusted */
	359			if (aiocbe->jobstate != JOBST_JOBBFINISHED)
	360				panic("aio_free_entry: invalid physio finish-up state");
	361			s = splbio();
	362			TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
	363			splx(s);
	364		} else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
	365			s = splnet();
	366			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
	367			TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
	368			splx(s);
	369		} else if (aiocbe->jobstate == JOBST_JOBFINISHED)
	370			TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
	371		else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
	372			s = splbio();
	373			TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
	374			splx(s);
	375			if (aiocbe->bp) {
	376				vunmapbuf(aiocbe->bp);
	377				relpbuf(aiocbe->bp, NULL);
	378				aiocbe->bp = NULL;
	379			}
	380		}
	381		if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
	382			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
	383			zfree(aiolio_zone, lj);
	384		}
	385		aiocbe->jobstate = JOBST_NULL;
	386		untimeout(process_signal, aiocbe, aiocbe->timeouthandle);
	387		fdrop(aiocbe->fd_file, curproc);
	388		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
	389		return 0;
	390	}
	391	#endif /* VFS_AIO */
	392
	393	/*
	394	 * Rundown the jobs for a given process: wait for active I/O, free every queued or finished job, then release p->p_aioinfo.
	395	 */
	396	void
	397	aio_proc_rundown(struct proc *p)
	398	{
	399	#ifndef VFS_AIO
	400		return;
	401	#else
	402		int s;
	403		struct kaioinfo *ki;
	404		struct aio_liojob *lj, *ljn;
	405		struct aiocblist *aiocbe, *aiocbn;
	406		struct file *fp;
	407		struct socket *so;
	408
	409		ki = p->p_aioinfo;
	410		if (ki == NULL)
	411			return;
	412
	413		ki->kaio_flags |= KAIO_WAKEUP; /* kaio_flags takes KAIO_* bits; aio_daemon() then does wakeup(p) */
	414		while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
	415		    ki->kaio_buffer_finished_count)) {
	416			ki->kaio_flags |= KAIO_RUNDOWN;
	417			if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
	418				break;
	419		}
	420
	421		/*
	422		 * Move any aio ops that are waiting on socket I/O to the normal job
	423		 * queues so they are cleaned up with any others.
	424		 */
	425		s = splnet();
	426		for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
	427		    aiocbn) {
	428			aiocbn = TAILQ_NEXT(aiocbe, plist);
	429			fp = aiocbe->fd_file;
	430			if (fp != NULL) {
	431				so = (struct socket *)fp->f_data;
	432				TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
	433				if (TAILQ_EMPTY(&so->so_aiojobq)) {
	434					so->so_snd.sb_flags &= ~SB_AIO;
	435					so->so_rcv.sb_flags &= ~SB_AIO;
	436				}
	437			}
	438			TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
	439			TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
	440			TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
	441		}
	442		splx(s);
	443
	444	restart1:
	445		for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
	446			aiocbn = TAILQ_NEXT(aiocbe, plist);
	447			if (aio_free_entry(aiocbe))
	448				goto restart1;
	449		}
	450
	451	restart2:
	452		for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
	453		    aiocbn) {
	454			aiocbn = TAILQ_NEXT(aiocbe, plist);
	455			if (aio_free_entry(aiocbe))
	456				goto restart2;
	457		}
	458
	459	/*
	460	 * Note the use of lots of splbio here, trying to avoid splbio for long chains
	461	 * of I/O. Probably unnecessary.
	462	 */
	463	restart3:
	464		s = splbio();
	465		while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
	466			ki->kaio_flags |= KAIO_WAKEUP;
	467			tsleep(p, PRIBIO, "aioprn", 0);
	468			splx(s);
	469			goto restart3;
	470		}
	471		splx(s);
	472
	473	restart4:
	474		s = splbio();
	475		for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
	476			aiocbn = TAILQ_NEXT(aiocbe, plist);
	477			if (aio_free_entry(aiocbe)) {
	478				splx(s);
	479				goto restart4;
	480			}
	481		}
	482		splx(s);
	483
	484		/*
	485		 * If we've slept, jobs might have moved from one queue to another.
	486		 * Retry rundown if we didn't manage to empty the queues.
	487		 */
	488		if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL ||
	489		    TAILQ_FIRST(&ki->kaio_jobqueue) != NULL ||
	490		    TAILQ_FIRST(&ki->kaio_bufqueue) != NULL ||
	491		    TAILQ_FIRST(&ki->kaio_bufdone) != NULL)
	492			goto restart1;
	493
	494		for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
	495			ljn = TAILQ_NEXT(lj, lioj_list);
	496			if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
	497			    0)) {
	498				TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
	499				zfree(aiolio_zone, lj);
	500			} else {
	501	#ifdef DIAGNOSTIC
	502				printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
	503				    "QF:%d\n", lj->lioj_buffer_count,
	504				    lj->lioj_buffer_finished_count,
	505				    lj->lioj_queue_count,
	506				    lj->lioj_queue_finished_count);
	507	#endif
	508			}
	509		}
	510
	511		zfree(kaio_zone, ki);
	512		p->p_aioinfo = NULL;
	513	#endif /* VFS_AIO */
	514	}
	515
	516	#ifdef VFS_AIO
	517	/*
	518	 * Select a job to run (called by an AIO daemon): the first aio_jobs entry whose owner is below its active-job limit, or NULL.
	519	 */
	520	static struct aiocblist *
	521	aio_selectjob(struct aioproclist *aiop)
	522	{
	523		struct aiocblist *job;
	524		struct kaioinfo *info;
	525		int ipl;
	526
	527		/* Walk the global job queue under splnet(). */
	528		ipl = splnet();
	529		job = TAILQ_FIRST(&aio_jobs);
	530		while (job != NULL) {
	531			info = job->userproc->p_aioinfo;
	532
	533			/* Take the first job whose owner still has a free active slot. */
	534			if (info->kaio_active_count < info->kaio_maxactive_count) {
	535				TAILQ_REMOVE(&aio_jobs, job, list);
	536				break;
	537			}
	538			job = TAILQ_NEXT(job, list);
	539		}
	540		splx(ipl);
	541
	542		return job;
	543	}
	544
	545	/*
	546	 * The AIO processing activity. This is the code that does the I/O request for
	547	 * the non-physio version of the operations, via fo_read()/fo_write() on the job's
	548	 * file; it should work for every type of file (pipes, sockets, fifos, regular files).
	549	 * The resulting errno and byte count are left in aiocbe->uaiocb._aiocb_private.
	550	 */
	551	static void
	552	aio_process(struct aiocblist *aiocbe)
	553	{
	554		struct proc *mycp;
	555		struct aiocb *cb;
	556		struct file *fp;
	557		struct uio auio;
	558		struct iovec aiov;
	559		int cnt;
	560		int error;
	561		int oublock_st, oublock_end;
	562		int inblock_st, inblock_end;
	563
	564		mycp = curproc;
	565		cb = &aiocbe->uaiocb;
	566		fp = aiocbe->fd_file;
	567
	568		aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
	569		aiov.iov_len = cb->aio_nbytes;
	570
	571		auio.uio_iov = &aiov;
	572		auio.uio_iovcnt = 1;
	573		auio.uio_offset = cb->aio_offset;
	574		auio.uio_resid = cb->aio_nbytes;
	575		cnt = cb->aio_nbytes;
	576		auio.uio_segflg = UIO_USERSPACE;
	577		auio.uio_procp = mycp;
	578
	579		inblock_st = mycp->p_stats->p_ru.ru_inblock; /* snapshot to charge the job below */
	580		oublock_st = mycp->p_stats->p_ru.ru_oublock;
	581		/*
	582		 * _aio_aqueue() acquires a reference to the file that is
	583		 * released in aio_free_entry().
	584		 */
	585		if (cb->aio_lio_opcode == LIO_READ) {
	586			auio.uio_rw = UIO_READ;
	587			error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
	588		} else {
	589			auio.uio_rw = UIO_WRITE;
	590			error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
	591		}
	592		inblock_end = mycp->p_stats->p_ru.ru_inblock;
	593		oublock_end = mycp->p_stats->p_ru.ru_oublock;
	594
	595		aiocbe->inputcharge = inblock_end - inblock_st;
	596		aiocbe->outputcharge = oublock_end - oublock_st;
	597
	598		if ((error) && (auio.uio_resid != cnt)) { /* failed after a partial transfer */
	599			if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
	600				error = 0;
	601			if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
	602				psignal(aiocbe->userproc, SIGPIPE);
	603		}
	604
	605		cnt -= auio.uio_resid; /* bytes actually transferred */
	606		cb->_aiocb_private.error = error;
	607		cb->_aiocb_private.status = cnt;
	608	}
	609
	610	/*
	611	 * The AIO daemon, most of the actual work is done in aio_process,
	612	 * but the setup (and address space mgmt) is done in this routine. uproc is not referenced here.
	613	 */
	614	static void
	615	aio_daemon(void *uproc)
	616	{
	617		int s;
	618		struct aio_liojob *lj;
	619		struct aiocb *cb;
	620		struct aiocblist *aiocbe;
	621		struct aioproclist *aiop;
	622		struct kaioinfo *ki;
	623		struct proc *curcp, *mycp, *userp;
	624		struct vmspace *myvm, *tmpvm;
	625
	626		/*
	627		 * Local copies of curproc (cp) and vmspace (myvm)
	628		 */
	629		mycp = curproc;
	630		myvm = mycp->p_vmspace;
	631
	632		if (mycp->p_textvp) {
	633			vrele(mycp->p_textvp);
	634			mycp->p_textvp = NULL;
	635		}
	636
	637		/*
	638		 * Allocate and ready the aio control info. There is one aiop structure
	639		 * per daemon.
	640		 */
	641		aiop = zalloc(aiop_zone);
	642		aiop->aioproc = mycp;
	643		aiop->aioprocflags = AIOP_FREE; /* assign: the freshly allocated flags word holds no defined value yet */
	644
	645		s = splnet();
	646
	647		/*
	648		 * Place thread (lightweight process) onto the AIO free thread list.
	649		 */
	650		if (TAILQ_EMPTY(&aio_freeproc))
	651			wakeup(&aio_freeproc);
	652		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
	653
	654		splx(s);
	655
	656		/* Make up a name for the daemon. */
	657		strcpy(mycp->p_comm, "aiod");
	658
	659		/*
	660		 * Get rid of our current filedescriptors. AIOD's don't need any
	661		 * filedescriptors, except as temporarily inherited from the client.
	662		 * Credentials are also cloned, and made equivalent to "root".
	663		 */
	664		fdfree(mycp);
	665		mycp->p_fd = NULL;
	666		mycp->p_ucred = crcopy(mycp->p_ucred);
	667		mycp->p_ucred->cr_uid = 0;
	668		uifree(mycp->p_ucred->cr_uidinfo);
	669		mycp->p_ucred->cr_uidinfo = uifind(0);
	670		mycp->p_ucred->cr_ngroups = 1;
	671		mycp->p_ucred->cr_groups[0] = 1;
	672
	673		/* The daemon resides in its own pgrp. */
	674		enterpgrp(mycp, mycp->p_pid, 1);
	675
	676		/* Mark special process type. */
	677		mycp->p_flag |= P_SYSTEM | P_KTHREADP;
	678
	679		/*
	680		 * Wakeup parent process. (Parent sleeps to keep from blasting away
	681		 * and creating too many daemons.)
	682		 */
	683		wakeup(mycp);
	684
	685		for (;;) {
	686			/*
	687			 * curcp is the current daemon process context.
	688			 * userp is the current user process context.
	689			 */
	690			curcp = mycp;
	691
	692			/*
	693			 * Take daemon off of free queue
	694			 */
	695			if (aiop->aioprocflags & AIOP_FREE) {
	696				s = splnet();
	697				TAILQ_REMOVE(&aio_freeproc, aiop, list);
	698				TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
	699				aiop->aioprocflags &= ~AIOP_FREE;
	700				splx(s);
	701			}
	702			aiop->aioprocflags &= ~AIOP_SCHED;
	703
	704			/*
	705			 * Check for jobs.
	706			 */
	707			while ((aiocbe = aio_selectjob(aiop)) != NULL) {
	708				cb = &aiocbe->uaiocb;
	709				userp = aiocbe->userproc;
	710
	711				aiocbe->jobstate = JOBST_JOBRUNNING;
	712
	713				/*
	714				 * Connect to process address space for user program.
	715				 */
	716				if (userp != curcp) {
	717					/*
	718					 * Save the current address space that we are
	719					 * connected to.
	720					 */
	721					tmpvm = mycp->p_vmspace;
	722
	723					/*
	724					 * Point to the new user address space, and
	725					 * refer to it.
	726					 */
	727					mycp->p_vmspace = userp->p_vmspace;
	728					mycp->p_vmspace->vm_refcnt++;
	729
	730					/* Activate the new mapping. */
	731					pmap_activate(mycp);
	732
	733					/*
	734					 * If the old address space wasn't the daemons
	735					 * own address space, then we need to remove the
	736					 * daemon's reference from the other process
	737					 * that it was acting on behalf of.
	738					 */
	739					if (tmpvm != myvm) {
	740						vmspace_free(tmpvm);
	741					}
	742					curcp = userp;
	743				}
	744
	745				ki = userp->p_aioinfo;
	746				lj = aiocbe->lio;
	747
	748				/* Account for currently active jobs. */
	749				ki->kaio_active_count++;
	750
	751				/* Do the I/O function. */
	752				aio_process(aiocbe);
	753
	754				/* Decrement the active job count. */
	755				ki->kaio_active_count--;
	756
	757				/*
	758				 * Increment the completion count for wakeup/signal
	759				 * comparisons.
	760				 */
	761				aiocbe->jobflags |= AIOCBLIST_DONE;
	762				ki->kaio_queue_finished_count++;
	763				if (lj)
	764					lj->lioj_queue_finished_count++;
	765				if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
	766				    & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
	767					ki->kaio_flags &= ~KAIO_WAKEUP;
	768					wakeup(userp);
	769				}
	770
	771				s = splbio();
	772				if (lj && (lj->lioj_flags &
	773				    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
	774					if ((lj->lioj_queue_finished_count ==
	775					    lj->lioj_queue_count) &&
	776					    (lj->lioj_buffer_finished_count ==
	777					    lj->lioj_buffer_count)) {
	778						psignal(userp,
	779						    lj->lioj_signal.sigev_signo);
	780						lj->lioj_flags |=
	781						    LIOJ_SIGNAL_POSTED;
	782					}
	783				}
	784				splx(s);
	785
	786				aiocbe->jobstate = JOBST_JOBFINISHED;
	787
	788				s = splnet();
	789				TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
	790				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, plist);
	791				splx(s);
	792				KNOTE(&aiocbe->klist, 0);
	793
	794				if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
	795					wakeup(aiocbe); /* aio_free_entry() sleeps on the job */
	796					aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
	797				}
	798
	799				if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
	800					psignal(userp, cb->aio_sigevent.sigev_signo);
	801				}
	802			}
	803
	804			/*
	805			 * Disconnect from user address space.
	806			 */
	807			if (curcp != mycp) {
	808				/* Get the user address space to disconnect from. */
	809				tmpvm = mycp->p_vmspace;
	810
	811				/* Get original address space for daemon. */
	812				mycp->p_vmspace = myvm;
	813
	814				/* Activate the daemon's address space. */
	815				pmap_activate(mycp);
	816	#ifdef DIAGNOSTIC
	817				if (tmpvm == myvm) {
	818					printf("AIOD: vmspace problem -- %d\n",
	819					    mycp->p_pid);
	820				}
	821	#endif
	822				/* Remove our vmspace reference. */
	823				vmspace_free(tmpvm);
	824
	825				curcp = mycp;
	826			}
	827
	828			/*
	829			 * If we are the first to be put onto the free queue, wakeup
	830			 * anyone waiting for a daemon.
	831			 */
	832			s = splnet();
	833			TAILQ_REMOVE(&aio_activeproc, aiop, list);
	834			if (TAILQ_EMPTY(&aio_freeproc))
	835				wakeup(&aio_freeproc);
	836			TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
	837			aiop->aioprocflags |= AIOP_FREE;
	838			splx(s);
	839
	840			/*
	841			 * If daemon is inactive for a long time, allow it to exit,
	842			 * thereby freeing resources.
	843			 */
	844			if (((aiop->aioprocflags & AIOP_SCHED) == 0) && tsleep(mycp,
	845			    PRIBIO, "aiordy", aiod_lifetime)) {
	846				s = splnet();
	847				if (TAILQ_EMPTY(&aio_jobs)) {
	848					if ((aiop->aioprocflags & AIOP_FREE) &&
	849					    (num_aio_procs > target_aio_procs)) {
	850						TAILQ_REMOVE(&aio_freeproc, aiop, list);
	851						splx(s);
	852						zfree(aiop_zone, aiop);
	853						num_aio_procs--;
	854	#ifdef DIAGNOSTIC
	855						if (mycp->p_vmspace->vm_refcnt <= 1) {
	856							printf("AIOD: bad vm refcnt for"
	857							    " exiting daemon: %d\n",
	858							    mycp->p_vmspace->vm_refcnt);
	859						}
	860	#endif
	861						exit1(mycp, 0);
	862					}
	863				}
	864				splx(s);
	865			}
	866		}
	867	}
	868
	869	/*
	870	 * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
	871	 * AIO daemon modifies its environment itself. Returns the fork1() error, else the tsleep() result.
	872	 */
	873	static int
	874	aio_newproc(void)
	875	{
	876		int error;
	877		struct proc *p, *np;
	878
	879		p = &proc0;
	880		error = fork1(p, RFPROC|RFMEM|RFNOWAIT, &np);
	881		if (error)
	882			return error; /* num_aio_procs is left untouched on this path */
	883		cpu_set_fork_handler(np, aio_daemon, curproc);
	884		start_forked_proc(p, np);
	885
	886		/*
	887		 * Wait until daemon is started, but continue on just in case to
	888		 * handle error conditions.
	889		 */
	890		error = tsleep(np, PZERO, "aiosta", aiod_timeout); /* aio_daemon() does wakeup() on its own proc */
	891		num_aio_procs++;
	892
	893		return error;
	894	}
	895
	896	/*
	897	 * Try the high-performance, low-overhead physio method for eligible
	898	 * VCHR devices. This method doesn't use an aio helper thread, and
	899	 * thus has very low overhead. Returns 0 once the buffer was handed to
	900	 * BUF_STRATEGY(), -1 if the request is not eligible for physio, or a
	901	 * positive errno (from vn_isdisk(), or EFAULT when vmapbuf() fails).
	902	 * Assumes that the caller, _aio_aqueue(), holds a reference on the file
	903	 * structure, preventing its deallocation for the duration of this call.
	904	 */
	905	static int
	906	aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
	907	{
	908		int error;
	909		struct aiocb *cb;
	910		struct file *fp;
	911		struct buf *bp;
	912		struct vnode *vp;
	913		struct kaioinfo *ki;
	914		struct aio_liojob *lj;
	915		int s;
	916		int notify;
	917
	918		cb = &aiocbe->uaiocb;
	919		fp = aiocbe->fd_file;
	920
	921		if (fp->f_type != DTYPE_VNODE)
	922			return (-1);
	923
	924		vp = (struct vnode *)fp->f_data;
	925
	926		/*
	927		 * If its not a disk, we don't want to return a positive error.
	928		 * It causes the aio code to not fall through to try the thread
	929		 * way when you're talking to a regular file.
	930		 */
	931		if (!vn_isdisk(vp, &error)) {
	932			if (error == ENOTBLK)
	933				return (-1);
	934			else
	935				return (error);
	936		}
	937
	938		if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys) /* not a whole number of device blocks */
	939			return (-1);
	940
	941		if (cb->aio_nbytes >
	942		    MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
	943			return (-1);
	944
	945		ki = p->p_aioinfo;
	946		if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
	947			return (-1);
	948
	949		ki->kaio_buffer_count++;
	950
	951		lj = aiocbe->lio;
	952		if (lj)
	953			lj->lioj_buffer_count++;
	954
	955		/* Create and build a buffer header for a transfer. */
	956		bp = (struct buf *)getpbuf(NULL);
	957		BUF_KERNPROC(bp);
	958
	959		/*
	960		 * Get a copy of the kva from the physical buffer.
	961		 */
	962		bp->b_caller1 = p;
	963		bp->b_dev = vp->v_rdev;
	964		error = 0;
	965
	966		bp->b_bcount = cb->aio_nbytes;
	967		bp->b_bufsize = cb->aio_nbytes;
	968		bp->b_flags = B_PHYS | B_CALL | (cb->aio_lio_opcode == LIO_WRITE ?
	969		    B_WRITE : B_READ);
	970		bp->b_iodone = aio_physwakeup;
	971		bp->b_saveaddr = bp->b_data;
	972		bp->b_data = (void *)(uintptr_t)cb->aio_buf;
	973		bp->b_blkno = btodb(cb->aio_offset);
	974
	975		/* Bring buffer into kernel space. */
	976		if (vmapbuf(bp) < 0) {
	977			error = EFAULT;
	978			goto doerror;
	979		}
	980
	981		s = splbio();
	982		aiocbe->bp = bp;
	983		bp->b_spc = (void *)aiocbe;
	984		TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
	985		TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
	986		aiocbe->jobstate = JOBST_JOBQBUF;
	987		cb->_aiocb_private.status = cb->aio_nbytes;
	988		num_buf_aio++;
	989		bp->b_error = 0;
	990
	991		splx(s);
	992
	993		/* Perform transfer. */
	994		BUF_STRATEGY(bp, 0);
	995
	996		notify = 0;
	997		s = splbio();
	998
	999		/*
	1000		 * If we had an error invoking the request, or an error in processing
	1001		 * the request before we have returned, we process it as an error in
	1002		 * transfer. Note that such an I/O error is not indicated immediately,
	1003		 * but is returned using the aio_error mechanism. In this case,
	1004		 * aio_suspend will return immediately.
	1005		 */
	1006		if (bp->b_error || (bp->b_flags & B_ERROR)) {
	1007			struct aiocb *job = aiocbe->uuaiocb;
	1008
	1009			aiocbe->uaiocb._aiocb_private.status = 0;
	1010			suword(&job->_aiocb_private.status, 0);
	1011			aiocbe->uaiocb._aiocb_private.error = bp->b_error;
	1012			suword(&job->_aiocb_private.error, bp->b_error);
	1013
	1014			ki->kaio_buffer_finished_count++;
	1015
	1016			if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
	1017				aiocbe->jobstate = JOBST_JOBBFINISHED;
	1018				aiocbe->jobflags |= AIOCBLIST_DONE;
	1019				TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
	1020				TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
	1021				TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
	1022				notify = 1;
	1023			}
	1024		}
	1025		splx(s);
	1026		if (notify)
	1027			KNOTE(&aiocbe->klist, 0);
	1028		return 0;
	1029
	1030	doerror:
	1031		ki->kaio_buffer_count--; /* undo the accounting done before getpbuf() */
	1032		if (lj)
	1033			lj->lioj_buffer_count--;
	1034		aiocbe->bp = NULL;
	1035		relpbuf(bp, NULL);
	1036		return error;
	1037	}
	1038
	1039	/*
	1040	 * Wait for / test completion of the physio request behind iocb, then unmap and release its buffer.
	1041	 */
	1042	static int
	1043	aio_fphysio(struct aiocblist *iocb)
	1044	{
	1045		struct buf *pbuf;
	1046		int ipl;
	1047		int rv;
	1048
	1049		pbuf = iocb->bp;
	1050
	1051		ipl = splbio();
	1052		while (!(pbuf->b_flags & B_DONE)) {
	1053			/* tsleep() returned 0: loop and re-test B_DONE. */
	1054			if (tsleep(pbuf, PRIBIO, "physstr", aiod_timeout) == 0)
	1055				continue;
	1056			/* Non-zero return: check whether the I/O completed anyway. */
	1057			if (pbuf->b_flags & B_DONE)
	1058				break;
	1059			/* Still not done: leave the buffer alone and report it. */
	1060			splx(ipl);
	1061			return EINPROGRESS;
	1062		}
	1063		splx(ipl);
	1064
	1065		/* Release mapping into kernel space. */
	1066		vunmapbuf(pbuf);
	1067		iocb->bp = 0;
	1068
	1069		/* Pick up the error code before relpbuf() gives the buffer back. */
	1070		rv = (pbuf->b_flags & B_ERROR) ? pbuf->b_error : 0;
	1071		relpbuf(pbuf, NULL);
	1072
	1073		/* 0, or the b_error value of a failed transfer. */
	1074		return (rv);
	1075	}
	1076	#endif /* VFS_AIO */
	1077
	1078	/*
	1079	 * Wake up aio requests that may be serviceable now: move so's queued jobs for sb's direction onto the run queues, then wake one idle daemon per moved job.
	1080	 */
	1081	void
	1082	aio_swake(struct socket *so, struct sockbuf *sb)
	1083	{
	1084	#ifndef VFS_AIO
	1085		return;
	1086	#else
	1087		struct aiocblist *cb, *cbn;
	1088		struct proc *p;
	1089		struct kaioinfo *ki = NULL;
	1090		int opcode, wakecount = 0;
	1091		struct aioproclist *aiop;
	1092
	1093		if (sb == &so->so_snd) { /* send buffer: pending writes may proceed */
	1094			opcode = LIO_WRITE;
	1095			so->so_snd.sb_flags &= ~SB_AIO;
	1096		} else {
	1097			opcode = LIO_READ;
	1098			so->so_rcv.sb_flags &= ~SB_AIO;
	1099		}
	1100
	1101		for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
	1102			cbn = TAILQ_NEXT(cb, list); /* fetched first: cb is unlinked below */
	1103			if (opcode == cb->uaiocb.aio_lio_opcode) {
	1104				p = cb->userproc;
	1105				ki = p->p_aioinfo;
	1106				TAILQ_REMOVE(&so->so_aiojobq, cb, list);
	1107				TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
	1108				TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
	1109				TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
	1110				wakecount++;
	1111				if (cb->jobstate != JOBST_JOBQGLOBAL)
	1112					panic("invalid queue value");
	1113			}
	1114		}
	1115
	1116		while (wakecount--) {
	1117			if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) {
	1118				TAILQ_REMOVE(&aio_freeproc, aiop, list);
	1119				TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
	1120				aiop->aioprocflags &= ~AIOP_FREE;
	1121				wakeup(aiop->aioproc);
	1122			}
	1123		}
	1124	#endif /* VFS_AIO */
	1125	}
	1126
	1127	#ifdef VFS_AIO
	1128	/*
	1129	* Queue a new AIO request. Choosing either the threaded or direct physio VCHR
	1130	* technique is done in this code.
	1131	*/
	1132	static int
	1133	_aio_aqueue(struct aiocb job, struct aio_liojob lj, int type)
	1134	{
	1135	struct proc *p = curprpoc;
	1136	struct filedesc *fdp;
	1137	struct file *fp;
	1138	unsigned int fd;
	1139	struct socket *so;
	1140	int s;
	1141	int error;
	1142	int opcode, user_opcode;
	1143	struct aiocblist *aiocbe;
	1144	struct aioproclist *aiop;
	1145	struct kaioinfo *ki;
	1146	struct kevent kev;
	1147	struct kqueue *kq;
	1148	struct file *kq_fp;
	1149
	1150	if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL)
	1151	TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
	1152	else
	1153	aiocbe = zalloc (aiocb_zone);
	1154
	1155	aiocbe->inputcharge = 0;
	1156	aiocbe->outputcharge = 0;
	1157	callout_handle_init(&aiocbe->timeouthandle);
	1158	SLIST_INIT(&aiocbe->klist);
	1159
	1160	suword(&job->_aiocb_private.status, -1);
	1161	suword(&job->_aiocb_private.error, 0);
	1162	suword(&job->_aiocb_private.kernelinfo, -1);
	1163
	1164	error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb));
	1165	if (error) {
	1166	suword(&job->_aiocb_private.error, error);
	1167	TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
	1168	return error;
	1169	}
	1170	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
	1171	!_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
	1172	TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
	1173	return EINVAL;
	1174	}
	1175
	1176	/* Save userspace address of the job info. */
	1177	aiocbe->uuaiocb = job;
	1178
	1179	/* Get the opcode. */
	1180	user_opcode = aiocbe->uaiocb.aio_lio_opcode;
	1181	if (type != LIO_NOP)
	1182	aiocbe->uaiocb.aio_lio_opcode = type;
	1183	opcode = aiocbe->uaiocb.aio_lio_opcode;
	1184
	1185	/* Get the fd info for process. */
	1186	fdp = p->p_fd;
	1187
	1188	/*
	1189	* Range check file descriptor.
	1190	*/
	1191	fd = aiocbe->uaiocb.aio_fildes;
	1192	if (fd >= fdp->fd_nfiles) {
	1193	TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
	1194	if (type == 0)
	1195	suword(&job->_aiocb_private.error, EBADF);
	1196	return EBADF;
	1197	}
	1198
	1199	fp = aiocbe->fd_file = fdp->fd_ofiles[fd];
	1200	if ((fp == NULL) \|\| ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) ==
	1201	0))) {
	1202	TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
	1203	if (type == 0)
	1204	suword(&job->_aiocb_private.error, EBADF);
	1205	return EBADF;
	1206	}
	1207	fhold(fp);
	1208
	1209	if (aiocbe->uaiocb.aio_offset == -1LL) {
	1210	error = EINVAL;
	1211	goto aqueue_fail;
	1212	}
	1213	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
	1214	if (error) {
	1215	error = EINVAL;
	1216	goto aqueue_fail;
	1217	}
	1218	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
	1219	if (jobrefid == LONG_MAX)
	1220	jobrefid = 1;
	1221	else
	1222	jobrefid++;
	1223
	1224	if (opcode == LIO_NOP) {
	1225	fdrop(fp, p);
	1226	TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
	1227	if (type == 0) {
	1228	suword(&job->_aiocb_private.error, 0);
	1229	suword(&job->_aiocb_private.status, 0);
	1230	suword(&job->_aiocb_private.kernelinfo, 0);
	1231	}
	1232	return 0;
	1233	}
	1234	if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
	1235	if (type == 0)
	1236	suword(&job->_aiocb_private.status, 0);
	1237	error = EINVAL;
	1238	goto aqueue_fail;
	1239	}
	1240
	1241	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
	1242	kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
	1243	kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr;
	1244	}
	1245	else {
	1246	/*
	1247	* This method for requesting kevent-based notification won't
	1248	* work on the alpha, since we're passing in a pointer
	1249	* via aio_lio_opcode, which is an int. Use the SIGEV_KEVENT-
	1250	* based method instead.
	1251	*/
	1252	if (user_opcode == LIO_NOP \|\| user_opcode == LIO_READ \|\|
	1253	user_opcode == LIO_WRITE)
	1254	goto no_kqueue;
	1255
	1256	error = copyin((struct kevent *)(uintptr_t)user_opcode,
	1257	&kev, sizeof(kev));
	1258	if (error)
	1259	goto aqueue_fail;
	1260	}
	1261	if ((u_int)kev.ident >= fdp->fd_nfiles \|\|
	1262	(kq_fp = fdp->fd_ofiles[kev.ident]) == NULL \|\|
	1263	(kq_fp->f_type != DTYPE_KQUEUE)) {
	1264	error = EBADF;
	1265	goto aqueue_fail;
	1266	}
	1267	kq = (struct kqueue *)kq_fp->f_data;
	1268	kev.ident = (uintptr_t)aiocbe->uuaiocb;
	1269	kev.filter = EVFILT_AIO;
	1270	kev.flags = EV_ADD \| EV_ENABLE \| EV_FLAG1;
	1271	kev.data = (intptr_t)aiocbe;
	1272	error = kqueue_register(kq, &kev, p);
	1273	aqueue_fail:
	1274	if (error) {
	1275	fdrop(fp, p);
	1276	TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
	1277	if (type == 0)
	1278	suword(&job->_aiocb_private.error, error);
	1279	goto done;
	1280	}
	1281	no_kqueue:
	1282
	1283	suword(&job->_aiocb_private.error, EINPROGRESS);
	1284	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
	1285	aiocbe->userproc = p;
	1286	aiocbe->jobflags = 0;
	1287	aiocbe->lio = lj;
	1288	ki = p->p_aioinfo;
	1289
	1290	if (fp->f_type == DTYPE_SOCKET) {
	1291	/*
	1292	* Alternate queueing for socket ops: Reach down into the
	1293	* descriptor to get the socket data. Then check to see if the
	1294	* socket is ready to be read or written (based on the requested
	1295	* operation).
	1296	*
	1297	* If it is not ready for io, then queue the aiocbe on the
	1298	* socket, and set the flags so we get a call when sbnotify()
	1299	* happens.
	1300	*/
	1301	so = (struct socket *)fp->f_data;
	1302	s = splnet();
	1303	if (((opcode == LIO_READ) && (!soreadable(so))) \|\| ((opcode ==
	1304	LIO_WRITE) && (!sowriteable(so)))) {
	1305	TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
	1306	TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
	1307	if (opcode == LIO_READ)
	1308	so->so_rcv.sb_flags \|= SB_AIO;
	1309	else
	1310	so->so_snd.sb_flags \|= SB_AIO;
	1311	aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
	1312	ki->kaio_queue_count++;
	1313	num_queue_count++;
	1314	splx(s);
	1315	error = 0;
	1316	goto done;
	1317	}
	1318	splx(s);
	1319	}
	1320
	1321	if ((error = aio_qphysio(p, aiocbe)) == 0)
	1322	goto done;
	1323	if (error > 0) {
	1324	suword(&job->_aiocb_private.status, 0);
	1325	aiocbe->uaiocb._aiocb_private.error = error;
	1326	suword(&job->_aiocb_private.error, error);
	1327	goto done;
	1328	}
	1329
	1330	/* No buffer for daemon I/O. */
	1331	aiocbe->bp = NULL;
	1332
	1333	ki->kaio_queue_count++;
	1334	if (lj)
	1335	lj->lioj_queue_count++;
	1336	s = splnet();
	1337	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
	1338	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
	1339	splx(s);
	1340	aiocbe->jobstate = JOBST_JOBQGLOBAL;
	1341
	1342	num_queue_count++;
	1343	error = 0;
	1344
	1345	/*
	1346	* If we don't have a free AIO process, and we are below our quota, then
	1347	* start one. Otherwise, depend on the subsequent I/O completions to
	1348	* pick-up this job. If we don't sucessfully create the new process
	1349	* (thread) due to resource issues, we return an error for now (EAGAIN),
	1350	* which is likely not the correct thing to do.
	1351	*/
	1352	s = splnet();
	1353	retryproc:
	1354	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
	1355	TAILQ_REMOVE(&aio_freeproc, aiop, list);
	1356	TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
	1357	aiop->aioprocflags &= ~AIOP_FREE;
	1358	wakeup(aiop->aioproc);
	1359	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
	1360	((ki->kaio_active_count + num_aio_resv_start) <
	1361	ki->kaio_maxactive_count)) {
	1362	num_aio_resv_start++;
	1363	if ((error = aio_newproc()) == 0) {
	1364	num_aio_resv_start--;
	1365	goto retryproc;
	1366	}
	1367	num_aio_resv_start--;
	1368	}
	1369	splx(s);
	1370	done:
	1371	return error;
	1372	}
	1373
	1374	/*
	1375	* This routine queues an AIO request, checking for quotas.
	1376	*/
	1377	static int
	1378	aio_aqueue(struct aiocb *job, int type)
	1379	{
	1380	struct proc *p = curprpoc;
	1381	struct kaioinfo *ki;
	1382
	1383	if (p->p_aioinfo == NULL)
	1384	aio_init_aioinfo(p);
	1385
	1386	if (num_queue_count >= max_queue_count)
	1387	return EAGAIN;
	1388
	1389	ki = p->p_aioinfo;
	1390	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
	1391	return EAGAIN;
	1392
	1393	return _aio_aqueue(job, NULL, type);
	1394	}
	1395	#endif /* VFS_AIO */
	1396
	1397	/*
	1398	* Support the aio_return system call, as a side-effect, kernel resources are
	1399	* released.
	1400	*/
	1401	int
	1402	aio_return(struct aio_return_args *uap)
	1403	{
	1404	#ifndef VFS_AIO
	1405	return ENOSYS;
	1406	#else
	1407	struct proc *p = curproc;
	1408	int s;
	1409	long jobref;
	1410	struct aiocblist cb, ncb;
	1411	struct aiocb *ujob;
	1412	struct kaioinfo *ki;
	1413
	1414	ki = p->p_aioinfo;
	1415	if (ki == NULL)
	1416	return EINVAL;
	1417
	1418	ujob = uap->aiocbp;
	1419
	1420	jobref = fuword(&ujob->_aiocb_private.kernelinfo);
	1421	if (jobref == -1 \|\| jobref == 0)
	1422	return EINVAL;
	1423
	1424	TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
	1425	if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
	1426	jobref) {
	1427	if (ujob == cb->uuaiocb) {
	1428	p->p_retval[0] =
	1429	cb->uaiocb._aiocb_private.status;
	1430	} else
	1431	p->p_retval[0] = EFAULT;
	1432	if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
	1433	p->p_stats->p_ru.ru_oublock +=
	1434	cb->outputcharge;
	1435	cb->outputcharge = 0;
	1436	} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
	1437	p->p_stats->p_ru.ru_inblock += cb->inputcharge;
	1438	cb->inputcharge = 0;
	1439	}
	1440	aio_free_entry(cb);
	1441	return 0;
	1442	}
	1443	}
	1444	s = splbio();
	1445	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
	1446	ncb = TAILQ_NEXT(cb, plist);
	1447	if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
	1448	== jobref) {
	1449	splx(s);
	1450	if (ujob == cb->uuaiocb) {
	1451	p->p_retval[0] =
	1452	cb->uaiocb._aiocb_private.status;
	1453	} else
	1454	p->p_retval[0] = EFAULT;
	1455	aio_free_entry(cb);
	1456	return 0;
	1457	}
	1458	}
	1459	splx(s);
	1460
	1461	return (EINVAL);
	1462	#endif /* VFS_AIO */
	1463	}
	1464
	1465	/*
	1466	* Allow a process to wakeup when any of the I/O requests are completed.
	1467	*/
	1468	int
	1469	aio_suspend(struct aio_suspend_args *uap)
	1470	{
	1471	#ifndef VFS_AIO
	1472	return ENOSYS;
	1473	#else
	1474	struct proc *p = curproc;
	1475	struct timeval atv;
	1476	struct timespec ts;
	1477	struct aiocb const cbptr, *cbp;
	1478	struct kaioinfo *ki;
	1479	struct aiocblist *cb;
	1480	int i;
	1481	int njoblist;
	1482	int error, s, timo;
	1483	long *ijoblist;
	1484	struct aiocb **ujoblist;
	1485
	1486	if (uap->nent > AIO_LISTIO_MAX)
	1487	return EINVAL;
	1488
	1489	timo = 0;
	1490	if (uap->timeout) {
	1491	/* Get timespec struct. */
	1492	if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
	1493	return error;
	1494
	1495	if (ts.tv_nsec < 0 \|\| ts.tv_nsec >= 1000000000)
	1496	return (EINVAL);
	1497
	1498	TIMESPEC_TO_TIMEVAL(&atv, &ts);
	1499	if (itimerfix(&atv))
	1500	return (EINVAL);
	1501	timo = tvtohz(&atv);
	1502	}
	1503
	1504	ki = p->p_aioinfo;
	1505	if (ki == NULL)
	1506	return EAGAIN;
	1507
	1508	njoblist = 0;
	1509	ijoblist = zalloc(aiol_zone);
	1510	ujoblist = zalloc(aiol_zone);
	1511	cbptr = uap->aiocbp;
	1512
	1513	for (i = 0; i < uap->nent; i++) {
	1514	cbp = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
	1515	if (cbp == 0)
	1516	continue;
	1517	ujoblist[njoblist] = cbp;
	1518	ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
	1519	njoblist++;
	1520	}
	1521
	1522	if (njoblist == 0) {
	1523	zfree(aiol_zone, ijoblist);
	1524	zfree(aiol_zone, ujoblist);
	1525	return 0;
	1526	}
	1527
	1528	error = 0;
	1529	for (;;) {
	1530	TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
	1531	for (i = 0; i < njoblist; i++) {
	1532	if (((intptr_t)
	1533	cb->uaiocb._aiocb_private.kernelinfo) ==
	1534	ijoblist[i]) {
	1535	if (ujoblist[i] != cb->uuaiocb)
	1536	error = EINVAL;
	1537	zfree(aiol_zone, ijoblist);
	1538	zfree(aiol_zone, ujoblist);
	1539	return error;
	1540	}
	1541	}
	1542	}
	1543
	1544	s = splbio();
	1545	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
	1546	TAILQ_NEXT(cb, plist)) {
	1547	for (i = 0; i < njoblist; i++) {
	1548	if (((intptr_t)
	1549	cb->uaiocb._aiocb_private.kernelinfo) ==
	1550	ijoblist[i]) {
	1551	splx(s);
	1552	if (ujoblist[i] != cb->uuaiocb)
	1553	error = EINVAL;
	1554	zfree(aiol_zone, ijoblist);
	1555	zfree(aiol_zone, ujoblist);
	1556	return error;
	1557	}
	1558	}
	1559	}
	1560
	1561	ki->kaio_flags \|= KAIO_WAKEUP;
	1562	error = tsleep(p, PRIBIO \| PCATCH, "aiospn", timo);
	1563	splx(s);
	1564
	1565	if (error == ERESTART \|\| error == EINTR) {
	1566	zfree(aiol_zone, ijoblist);
	1567	zfree(aiol_zone, ujoblist);
	1568	return EINTR;
	1569	} else if (error == EWOULDBLOCK) {
	1570	zfree(aiol_zone, ijoblist);
	1571	zfree(aiol_zone, ujoblist);
	1572	return EAGAIN;
	1573	}
	1574	}
	1575
	1576	/* NOTREACHED */
	1577	return EINVAL;
	1578	#endif /* VFS_AIO */
	1579	}
	1580
	1581	/*
	1582	* aio_cancel cancels any non-physio aio operations not currently in
	1583	* progress.
	1584	*/
	1585	int
	1586	aio_cancel(struct aio_cancel_args *uap)
	1587	{
	1588	#ifndef VFS_AIO
	1589	return ENOSYS;
	1590	#else
	1591	struct proc *p = curproc;
	1592	struct kaioinfo *ki;
	1593	struct aiocblist cbe, cbn;
	1594	struct file *fp;
	1595	struct filedesc *fdp;
	1596	struct socket *so;
	1597	struct proc *po;
	1598	int s,error;
	1599	int cancelled=0;
	1600	int notcancelled=0;
	1601	struct vnode *vp;
	1602
	1603	fdp = p->p_fd;
	1604	if ((u_int)uap->fd >= fdp->fd_nfiles \|\|
	1605	(fp = fdp->fd_ofiles[uap->fd]) == NULL)
	1606	return (EBADF);
	1607
	1608	if (fp->f_type == DTYPE_VNODE) {
	1609	vp = (struct vnode *)fp->f_data;
	1610
	1611	if (vn_isdisk(vp,&error)) {
	1612	p->p_retval[0] = AIO_NOTCANCELED;
	1613	return 0;
	1614	}
	1615	} else if (fp->f_type == DTYPE_SOCKET) {
	1616	so = (struct socket *)fp->f_data;
	1617
	1618	s = splnet();
	1619
	1620	for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
	1621	cbn = TAILQ_NEXT(cbe, list);
	1622	if ((uap->aiocbp == NULL) \|\|
	1623	(uap->aiocbp == cbe->uuaiocb) ) {
	1624	po = cbe->userproc;
	1625	ki = po->p_aioinfo;
	1626	TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
	1627	TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
	1628	TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist);
	1629	if (ki->kaio_flags & KAIO_WAKEUP) {
	1630	wakeup(po);
	1631	}
	1632	cbe->jobstate = JOBST_JOBFINISHED;
	1633	cbe->uaiocb._aiocb_private.status=-1;
	1634	cbe->uaiocb._aiocb_private.error=ECANCELED;
	1635	cancelled++;
	1636	/* XXX cancelled, knote? */
	1637	if (cbe->uaiocb.aio_sigevent.sigev_notify ==
	1638	SIGEV_SIGNAL)
	1639	psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
	1640	if (uap->aiocbp)
	1641	break;
	1642	}
	1643	}
	1644	splx(s);
	1645
	1646	if ((cancelled) && (uap->aiocbp)) {
	1647	p->p_retval[0] = AIO_CANCELED;
	1648	return 0;
	1649	}
	1650	}
	1651	ki=p->p_aioinfo;
	1652	if (ki == NULL)
	1653	goto done;
	1654	s = splnet();
	1655
	1656	for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
	1657	cbn = TAILQ_NEXT(cbe, plist);
	1658
	1659	if ((uap->fd == cbe->uaiocb.aio_fildes) &&
	1660	((uap->aiocbp == NULL ) \|\|
	1661	(uap->aiocbp == cbe->uuaiocb))) {
	1662
	1663	if (cbe->jobstate == JOBST_JOBQGLOBAL) {
	1664	TAILQ_REMOVE(&aio_jobs, cbe, list);
	1665	TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
	1666	TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
	1667	plist);
	1668	cancelled++;
	1669	ki->kaio_queue_finished_count++;
	1670	cbe->jobstate = JOBST_JOBFINISHED;
	1671	cbe->uaiocb._aiocb_private.status = -1;
	1672	cbe->uaiocb._aiocb_private.error = ECANCELED;
	1673	/* XXX cancelled, knote? */
	1674	if (cbe->uaiocb.aio_sigevent.sigev_notify ==
	1675	SIGEV_SIGNAL)
	1676	psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
	1677	} else {
	1678	notcancelled++;
	1679	}
	1680	}
	1681	}
	1682	splx(s);
	1683	done:
	1684	if (notcancelled) {
	1685	p->p_retval[0] = AIO_NOTCANCELED;
	1686	return 0;
	1687	}
	1688	if (cancelled) {
	1689	p->p_retval[0] = AIO_CANCELED;
	1690	return 0;
	1691	}
	1692	p->p_retval[0] = AIO_ALLDONE;
	1693
	1694	return 0;
	1695	#endif /* VFS_AIO */
	1696	}
	1697
	1698	/*
	1699	* aio_error is implemented in the kernel level for compatibility purposes only.
	1700	* For a user mode async implementation, it would be best to do it in a userland
	1701	* subroutine.
	1702	*/
	1703	int
	1704	aio_error(struct aio_error_args *uap)
	1705	{
	1706	#ifndef VFS_AIO
	1707	return ENOSYS;
	1708	#else
	1709	struct proc *p = curproc;
	1710	int s;
	1711	struct aiocblist *cb;
	1712	struct kaioinfo *ki;
	1713	long jobref;
	1714
	1715	ki = p->p_aioinfo;
	1716	if (ki == NULL)
	1717	return EINVAL;
	1718
	1719	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
	1720	if ((jobref == -1) \|\| (jobref == 0))
	1721	return EINVAL;
	1722
	1723	TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
	1724	if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
	1725	jobref) {
	1726	p->p_retval[0] = cb->uaiocb._aiocb_private.error;
	1727	return 0;
	1728	}
	1729	}
	1730
	1731	s = splnet();
	1732
	1733	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
	1734	plist)) {
	1735	if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
	1736	jobref) {
	1737	p->p_retval[0] = EINPROGRESS;
	1738	splx(s);
	1739	return 0;
	1740	}
	1741	}
	1742
	1743	for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
	1744	plist)) {
	1745	if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
	1746	jobref) {
	1747	p->p_retval[0] = EINPROGRESS;
	1748	splx(s);
	1749	return 0;
	1750	}
	1751	}
	1752	splx(s);
	1753
	1754	s = splbio();
	1755	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
	1756	plist)) {
	1757	if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
	1758	jobref) {
	1759	p->p_retval[0] = cb->uaiocb._aiocb_private.error;
	1760	splx(s);
	1761	return 0;
	1762	}
	1763	}
	1764
	1765	for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
	1766	plist)) {
	1767	if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
	1768	jobref) {
	1769	p->p_retval[0] = EINPROGRESS;
	1770	splx(s);
	1771	return 0;
	1772	}
	1773	}
	1774	splx(s);
	1775
	1776	#if (0)
	1777	/*
	1778	* Hack for lio.
	1779	*/
	1780	status = fuword(&uap->aiocbp->_aiocb_private.status);
	1781	if (status == -1)
	1782	return fuword(&uap->aiocbp->_aiocb_private.error);
	1783	#endif
	1784	return EINVAL;
	1785	#endif /* VFS_AIO */
	1786	}
	1787
	1788	/* syscall - asynchronous read from a file (REALTIME) */
	1789	int
	1790	aio_read(struct aio_read_args *uap)
	1791	{
	1792	#ifndef VFS_AIO
	1793	return ENOSYS;
	1794	#else
	1795	return aio_aqueue(uap->aiocbp, LIO_READ);
	1796	#endif /* VFS_AIO */
	1797	}
	1798
	1799	/* syscall - asynchronous write to a file (REALTIME) */
	1800	int
	1801	aio_write(struct aio_write_args *uap)
	1802	{
	1803	#ifndef VFS_AIO
	1804	return ENOSYS;
	1805	#else
	1806	return aio_aqueue(uap->aiocbp, LIO_WRITE);
	1807	#endif /* VFS_AIO */
	1808	}
	1809
	1810	/* syscall - XXX undocumented */
	1811	int
	1812	lio_listio(struct lio_listio_args *uap)
	1813	{
	1814	#ifndef VFS_AIO
	1815	return ENOSYS;
	1816	#else
	1817	struct proc *p = curproc;
	1818	int nent, nentqueued;
	1819	struct aiocb iocb, const *cbptr;
	1820	struct aiocblist *cb;
	1821	struct kaioinfo *ki;
	1822	struct aio_liojob *lj;
	1823	int error, runningcode;
	1824	int nerror;
	1825	int i;
	1826	int s;
	1827
	1828	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
	1829	return EINVAL;
	1830
	1831	nent = uap->nent;
	1832	if (nent > AIO_LISTIO_MAX)
	1833	return EINVAL;
	1834
	1835	if (p->p_aioinfo == NULL)
	1836	aio_init_aioinfo(p);
	1837
	1838	if ((nent + num_queue_count) > max_queue_count)
	1839	return EAGAIN;
	1840
	1841	ki = p->p_aioinfo;
	1842	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
	1843	return EAGAIN;
	1844
	1845	lj = zalloc(aiolio_zone);
	1846	if (!lj)
	1847	return EAGAIN;
	1848
	1849	lj->lioj_flags = 0;
	1850	lj->lioj_buffer_count = 0;
	1851	lj->lioj_buffer_finished_count = 0;
	1852	lj->lioj_queue_count = 0;
	1853	lj->lioj_queue_finished_count = 0;
	1854	lj->lioj_ki = ki;
	1855
	1856	/*
	1857	* Setup signal.
	1858	*/
	1859	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
	1860	error = copyin(uap->sig, &lj->lioj_signal,
	1861	sizeof(lj->lioj_signal));
	1862	if (error) {
	1863	zfree(aiolio_zone, lj);
	1864	return error;
	1865	}
	1866	if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
	1867	zfree(aiolio_zone, lj);
	1868	return EINVAL;
	1869	}
	1870	lj->lioj_flags \|= LIOJ_SIGNAL;
	1871	lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
	1872	} else
	1873	lj->lioj_flags &= ~LIOJ_SIGNAL;
	1874
	1875	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
	1876	/*
	1877	* Get pointers to the list of I/O requests.
	1878	*/
	1879	nerror = 0;
	1880	nentqueued = 0;
	1881	cbptr = uap->acb_list;
	1882	for (i = 0; i < uap->nent; i++) {
	1883	iocb = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
	1884	if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) {
	1885	error = _aio_aqueue(iocb, lj, 0);
	1886	if (error == 0)
	1887	nentqueued++;
	1888	else
	1889	nerror++;
	1890	}
	1891	}
	1892
	1893	/*
	1894	* If we haven't queued any, then just return error.
	1895	*/
	1896	if (nentqueued == 0)
	1897	return 0;
	1898
	1899	/*
	1900	* Calculate the appropriate error return.
	1901	*/
	1902	runningcode = 0;
	1903	if (nerror)
	1904	runningcode = EIO;
	1905
	1906	if (uap->mode == LIO_WAIT) {
	1907	int command, found, jobref;
	1908
	1909	for (;;) {
	1910	found = 0;
	1911	for (i = 0; i < uap->nent; i++) {
	1912	/*
	1913	* Fetch address of the control buf pointer in
	1914	* user space.
	1915	*/
	1916	iocb = (struct aiocb *)
	1917	(intptr_t)fuword(&cbptr[i]);
	1918	if (((intptr_t)iocb == -1) \|\| ((intptr_t)iocb
	1919	== 0))
	1920	continue;
	1921
	1922	/*
	1923	* Fetch the associated command from user space.
	1924	*/
	1925	command = fuword(&iocb->aio_lio_opcode);
	1926	if (command == LIO_NOP) {
	1927	found++;
	1928	continue;
	1929	}
	1930
	1931	jobref = fuword(&iocb->_aiocb_private.kernelinfo);
	1932
	1933	TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
	1934	if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
	1935	== jobref) {
	1936	if (cb->uaiocb.aio_lio_opcode
	1937	== LIO_WRITE) {
	1938	p->p_stats->p_ru.ru_oublock
	1939	+=
	1940	cb->outputcharge;
	1941	cb->outputcharge = 0;
	1942	} else if (cb->uaiocb.aio_lio_opcode
	1943	== LIO_READ) {
	1944	p->p_stats->p_ru.ru_inblock
	1945	+= cb->inputcharge;
	1946	cb->inputcharge = 0;
	1947	}
	1948	found++;
	1949	break;
	1950	}
	1951	}
	1952
	1953	s = splbio();
	1954	TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) {
	1955	if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
	1956	== jobref) {
	1957	found++;
	1958	break;
	1959	}
	1960	}
	1961	splx(s);
	1962	}
	1963
	1964	/*
	1965	* If all I/Os have been disposed of, then we can
	1966	* return.
	1967	*/
	1968	if (found == nentqueued)
	1969	return runningcode;
	1970
	1971	ki->kaio_flags \|= KAIO_WAKEUP;
	1972	error = tsleep(p, PRIBIO \| PCATCH, "aiospn", 0);
	1973
	1974	if (error == EINTR)
	1975	return EINTR;
	1976	else if (error == EWOULDBLOCK)
	1977	return EAGAIN;
	1978	}
	1979	}
	1980
	1981	return runningcode;
	1982	#endif /* VFS_AIO */
	1983	}
	1984
	1985	#ifdef VFS_AIO
	1986	/*
	1987	* This is a weird hack so that we can post a signal. It is safe to do so from
	1988	* a timeout routine, but not from an interrupt routine.
	1989	*/
	1990	static void
	1991	process_signal(void *aioj)
	1992	{
	1993	struct aiocblist *aiocbe = aioj;
	1994	struct aio_liojob *lj = aiocbe->lio;
	1995	struct aiocb *cb = &aiocbe->uaiocb;
	1996
	1997	if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) &&
	1998	(lj->lioj_queue_count == lj->lioj_queue_finished_count)) {
	1999	psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
	2000	lj->lioj_flags \|= LIOJ_SIGNAL_POSTED;
	2001	}
	2002
	2003	if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL)
	2004	psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo);
	2005	}
	2006
	2007	/*
	2008	* Interrupt handler for physio, performs the necessary process wakeups, and
	2009	* signals.
	2010	*/
	2011	static void
	2012	aio_physwakeup(struct buf *bp)
	2013	{
	2014	struct aiocblist *aiocbe;
	2015	struct proc *p;
	2016	struct kaioinfo *ki;
	2017	struct aio_liojob *lj;
	2018
	2019	wakeup(bp);
	2020
	2021	aiocbe = (struct aiocblist *)bp->b_spc;
	2022	if (aiocbe) {
	2023	p = bp->b_caller1;
	2024
	2025	aiocbe->jobstate = JOBST_JOBBFINISHED;
	2026	aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
	2027	aiocbe->uaiocb._aiocb_private.error = 0;
	2028	aiocbe->jobflags \|= AIOCBLIST_DONE;
	2029
	2030	if (bp->b_flags & B_ERROR)
	2031	aiocbe->uaiocb._aiocb_private.error = bp->b_error;
	2032
	2033	lj = aiocbe->lio;
	2034	if (lj) {
	2035	lj->lioj_buffer_finished_count++;
	2036
	2037	/*
	2038	* wakeup/signal if all of the interrupt jobs are done.
	2039	*/
	2040	if (lj->lioj_buffer_finished_count ==
	2041	lj->lioj_buffer_count) {
	2042	/*
	2043	* Post a signal if it is called for.
	2044	*/
	2045	if ((lj->lioj_flags &
	2046	(LIOJ_SIGNAL\|LIOJ_SIGNAL_POSTED)) ==
	2047	LIOJ_SIGNAL) {
	2048	lj->lioj_flags \|= LIOJ_SIGNAL_POSTED;
	2049	aiocbe->timeouthandle =
	2050	timeout(process_signal,
	2051	aiocbe, 0);
	2052	}
	2053	}
	2054	}
	2055
	2056	ki = p->p_aioinfo;
	2057	if (ki) {
	2058	ki->kaio_buffer_finished_count++;
	2059	TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
	2060	TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
	2061	TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
	2062
	2063	KNOTE(&aiocbe->klist, 0);
	2064	/* Do the wakeup. */
	2065	if (ki->kaio_flags & (KAIO_RUNDOWN\|KAIO_WAKEUP)) {
	2066	ki->kaio_flags &= ~KAIO_WAKEUP;
	2067	wakeup(p);
	2068	}
	2069	}
	2070
	2071	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL)
	2072	aiocbe->timeouthandle =
	2073	timeout(process_signal, aiocbe, 0);
	2074	}
	2075	}
	2076	#endif /* VFS_AIO */
	2077
	2078	/* syscall - wait for the next completion of an aio request */
	2079	int
	2080	aio_waitcomplete(struct aio_waitcomplete_args *uap)
	2081	{
	2082	#ifndef VFS_AIO
	2083	return ENOSYS;
	2084	#else
	2085	struct proc *p = curproc;
	2086	struct timeval atv;
	2087	struct timespec ts;
	2088	struct kaioinfo *ki;
	2089	struct aiocblist *cb = NULL;
	2090	int error, s, timo;
	2091
	2092	suword(uap->aiocbp, (int)NULL);
	2093
	2094	timo = 0;
	2095	if (uap->timeout) {
	2096	/* Get timespec struct. */
	2097	error = copyin(uap->timeout, &ts, sizeof(ts));
	2098	if (error)
	2099	return error;
	2100
	2101	if ((ts.tv_nsec < 0) \|\| (ts.tv_nsec >= 1000000000))
	2102	return (EINVAL);
	2103
	2104	TIMESPEC_TO_TIMEVAL(&atv, &ts);
	2105	if (itimerfix(&atv))
	2106	return (EINVAL);
	2107	timo = tvtohz(&atv);
	2108	}
	2109
	2110	ki = p->p_aioinfo;
	2111	if (ki == NULL)
	2112	return EAGAIN;
	2113
	2114	for (;;) {
	2115	if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) {
	2116	suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
	2117	p->p_retval[0] = cb->uaiocb._aiocb_private.status;
	2118	if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
	2119	p->p_stats->p_ru.ru_oublock +=
	2120	cb->outputcharge;
	2121	cb->outputcharge = 0;
	2122	} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
	2123	p->p_stats->p_ru.ru_inblock += cb->inputcharge;
	2124	cb->inputcharge = 0;
	2125	}
	2126	aio_free_entry(cb);
	2127	return cb->uaiocb._aiocb_private.error;
	2128	}
	2129
	2130	s = splbio();
	2131	if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0 ) {
	2132	splx(s);
	2133	suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
	2134	p->p_retval[0] = cb->uaiocb._aiocb_private.status;
	2135	aio_free_entry(cb);
	2136	return cb->uaiocb._aiocb_private.error;
	2137	}
	2138
	2139	ki->kaio_flags \|= KAIO_WAKEUP;
	2140	error = tsleep(p, PRIBIO \| PCATCH, "aiowc", timo);
	2141	splx(s);
	2142
	2143	if (error == ERESTART)
	2144	return EINTR;
	2145	else if (error < 0)
	2146	return error;
	2147	else if (error == EINTR)
	2148	return EINTR;
	2149	else if (error == EWOULDBLOCK)
	2150	return EAGAIN;
	2151	}
	2152	#endif /* VFS_AIO */
	2153	}
	2154
	2155	#ifndef VFS_AIO
	2156	static int
	2157	filt_aioattach(struct knote *kn)
	2158	{
	2159
	2160	return (ENXIO);
	2161	}
	2162
	2163	struct filterops aio_filtops =
	2164	{ 0, filt_aioattach, NULL, NULL };
	2165
	2166	#else
	2167	/* kqueue attach function */
	2168	static int
	2169	filt_aioattach(struct knote *kn)
	2170	{
	2171	struct aiocblist aiocbe = (struct aiocblist )kn->kn_sdata;
	2172
	2173	/*
	2174	* The aiocbe pointer must be validated before using it, so
	2175	* registration is restricted to the kernel; the user cannot
	2176	* set EV_FLAG1.
	2177	*/
	2178	if ((kn->kn_flags & EV_FLAG1) == 0)
	2179	return (EPERM);
	2180	kn->kn_flags &= ~EV_FLAG1;
	2181
	2182	SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext);
	2183
	2184	return (0);
	2185	}
	2186
	2187	/* kqueue detach function */
	2188	static void
	2189	filt_aiodetach(struct knote *kn)
	2190	{
	2191	struct aiocblist aiocbe = (struct aiocblist )kn->kn_sdata;
	2192
	2193	SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext);
	2194	}
	2195
	2196	/* kqueue filter function */
	2197	/ARGSUSED/
	2198	static int
	2199	filt_aio(struct knote *kn, long hint)
	2200	{
	2201	struct aiocblist aiocbe = (struct aiocblist )kn->kn_sdata;
	2202
	2203	kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
	2204	if (aiocbe->jobstate != JOBST_JOBFINISHED &&
	2205	aiocbe->jobstate != JOBST_JOBBFINISHED)
	2206	return (0);
	2207	kn->kn_flags \|= EV_EOF;
	2208	return (1);
	2209	}
	2210
	2211	struct filterops aio_filtops =
	2212	{ 0, filt_aioattach, filt_aiodetach, filt_aio };
	2213	#endif /* VFS_AIO */