1/*
2 * Copyright (c) 1997 John S. Dyson. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. John S. Dyson's name may not be used to endorse or promote products
10 * derived from this software without specific prior written permission.
11 *
12 * DISCLAIMER: This code isn't warranted to do anything useful. Anything
13 * bad that happens because of using this software isn't the responsibility
14 * of the author. This software is distributed AS-IS.
15 *
16 * $FreeBSD: src/sys/kern/vfs_aio.c,v 1.70.2.28 2003/05/29 06:15:35 alc Exp $
17 * $DragonFly: src/sys/kern/vfs_aio.c,v 1.27 2006/05/06 06:38:38 dillon Exp $
18 */
19
20/*
21 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
22 */
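/*
 * Illustrative userland view (a sketch for orientation, not part of this
 * file): the kernel code below backs the POSIX <aio.h> API.  Assuming an
 * open descriptor fd and a buffer buf, a minimal polled read looks like:
 *
 *	struct aiocb cb;
 *
 *	bzero(&cb, sizeof(cb));
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *	if (aio_read(&cb) == -1)
 *		err(1, "aio_read");
 *	while (aio_error(&cb) == EINPROGRESS)
 *		;			(poll; aio_suspend() can sleep instead)
 *	n = aio_return(&cb);		(reap status, free the kernel job)
 */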
23
24#include <sys/param.h>
25#include <sys/systm.h>
26#include <sys/buf.h>
27#include <sys/sysproto.h>
28#include <sys/filedesc.h>
29#include <sys/kernel.h>
30#include <sys/fcntl.h>
31#include <sys/file.h>
32#include <sys/lock.h>
33#include <sys/unistd.h>
34#include <sys/proc.h>
35#include <sys/resourcevar.h>
36#include <sys/signalvar.h>
37#include <sys/protosw.h>
38#include <sys/socketvar.h>
39#include <sys/sysctl.h>
40#include <sys/vnode.h>
41#include <sys/conf.h>
42#include <sys/event.h>
43
44#include <vm/vm.h>
45#include <vm/vm_extern.h>
46#include <vm/pmap.h>
47#include <vm/vm_map.h>
48#include <vm/vm_zone.h>
49#include <sys/aio.h>
50#include <sys/file2.h>
51#include <sys/buf2.h>
52#include <sys/thread2.h>
53
54#include <machine/limits.h>
55#include "opt_vfs_aio.h"
56
57#ifdef VFS_AIO
58
59/*
60 * Counter for allocating reference ids to new jobs. Wrapped to 1 on
61 * overflow.
62 */
63static long jobrefid;
64
65#define JOBST_NULL 0x0
66#define JOBST_JOBQGLOBAL 0x2
67#define JOBST_JOBRUNNING 0x3
68#define JOBST_JOBFINISHED 0x4
69#define JOBST_JOBQBUF 0x5
70#define JOBST_JOBBFINISHED 0x6
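/*
 * Job state flow, as implemented below: a queued job normally moves
 * JOBST_NULL -> JOBST_JOBQGLOBAL -> JOBST_JOBRUNNING -> JOBST_JOBFINISHED
 * through the daemon path, or JOBST_NULL -> JOBST_JOBQBUF ->
 * JOBST_JOBBFINISHED through the physio path, and returns to JOBST_NULL
 * when the entry is freed.
 */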
71
72#ifndef MAX_AIO_PER_PROC
73#define MAX_AIO_PER_PROC 32
74#endif
75
76#ifndef MAX_AIO_QUEUE_PER_PROC
77#define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */
78#endif
79
80#ifndef MAX_AIO_PROCS
81#define MAX_AIO_PROCS 32
82#endif
83
84#ifndef MAX_AIO_QUEUE
85#define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */
86#endif
87
88#ifndef TARGET_AIO_PROCS
89#define TARGET_AIO_PROCS 4
90#endif
91
92#ifndef MAX_BUF_AIO
93#define MAX_BUF_AIO 16
94#endif
95
96#ifndef AIOD_TIMEOUT_DEFAULT
97#define AIOD_TIMEOUT_DEFAULT (10 * hz)
98#endif
99
100#ifndef AIOD_LIFETIME_DEFAULT
101#define AIOD_LIFETIME_DEFAULT (30 * hz)
102#endif
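/*
 * All of the limits above are compile-time tunables picked up through
 * opt_vfs_aio.h.  A hypothetical kernel config fragment (the option names
 * match the #ifndef guards above; the values are arbitrary examples):
 *
 *	options	VFS_AIO
 *	options	MAX_AIO_PROCS=64
 *	options	MAX_BUF_AIO=32
 */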
103
104SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");
105
106static int max_aio_procs = MAX_AIO_PROCS;
107SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
108 CTLFLAG_RW, &max_aio_procs, 0,
109 "Maximum number of kernel threads to use for handling async IO");
110
111static int num_aio_procs = 0;
112SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
113 CTLFLAG_RD, &num_aio_procs, 0,
114 "Number of presently active kernel threads for async IO");
115
116/*
117 * The code will adjust the actual number of AIO processes towards this
118 * number when it gets a chance.
119 */
120static int target_aio_procs = TARGET_AIO_PROCS;
121SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
122 0, "Preferred number of ready kernel threads for async IO");
123
124static int max_queue_count = MAX_AIO_QUEUE;
125SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
126 "Maximum number of aio requests to queue, globally");
127
128static int num_queue_count = 0;
129SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
130 "Number of queued aio requests");
131
132static int num_buf_aio = 0;
133SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
134 "Number of aio requests presently handled by the buf subsystem");
135
136/* Number of async I/O threads in the process of being started */
137/* XXX This should be local to _aio_aqueue() */
138static int num_aio_resv_start = 0;
139
140static int aiod_timeout;
141SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0,
142 "Timeout value for synchronous aio operations");
143
144static int aiod_lifetime;
145SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
146 "Maximum lifetime for idle aiod");
147
148static int max_aio_per_proc = MAX_AIO_PER_PROC;
149SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
150 0, "Maximum active aio requests per process (stored in the process)");
151
152static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
153SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
154 &max_aio_queue_per_proc, 0,
155 "Maximum queued aio requests per process (stored in the process)");
156
157static int max_buf_aio = MAX_BUF_AIO;
158SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
159 "Maximum buf aio requests per process (stored in the process)");
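/*
 * The CTLFLAG_RW knobs above can also be adjusted at runtime under the
 * vfs.aio sysctl node, e.g. "sysctl vfs.aio.max_aio_queue=2048" (example
 * value only); the CTLFLAG_RD entries are read-only counters.
 */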
160
161/*
162 * AIO process info
163 */
164#define AIOP_FREE 0x1 /* proc on free queue */
165#define AIOP_SCHED 0x2 /* proc explicitly scheduled */
166
167struct aioproclist {
168 int aioprocflags; /* AIO proc flags */
169 TAILQ_ENTRY(aioproclist) list; /* List of processes */
170 struct proc *aioproc; /* The AIO thread */
171};
172
173/*
174 * data-structure for lio signal management
175 */
176struct aio_liojob {
177 int lioj_flags;
178 int lioj_buffer_count;
179 int lioj_buffer_finished_count;
180 int lioj_queue_count;
181 int lioj_queue_finished_count;
182 struct sigevent lioj_signal; /* signal on all I/O done */
183 TAILQ_ENTRY(aio_liojob) lioj_list;
184 struct kaioinfo *lioj_ki;
185};
186#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */
187#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */
188
189/*
190 * per process aio data structure
191 */
192struct kaioinfo {
193 int kaio_flags; /* per process kaio flags */
194 int kaio_maxactive_count; /* maximum number of AIOs */
195 int kaio_active_count; /* number of currently used AIOs */
196 int kaio_qallowed_count; /* maximum size of AIO queue */
197 int kaio_queue_count; /* size of AIO queue */
198 int kaio_ballowed_count; /* maximum number of buffers */
199 int kaio_queue_finished_count; /* number of daemon jobs finished */
200 int kaio_buffer_count; /* number of physio buffers */
201 int kaio_buffer_finished_count; /* count of I/O done */
202 struct proc *kaio_p; /* process that uses this kaio block */
203 TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */
204 TAILQ_HEAD(,aiocblist) kaio_jobqueue; /* job queue for process */
205 TAILQ_HEAD(,aiocblist) kaio_jobdone; /* done queue for process */
206 TAILQ_HEAD(,aiocblist) kaio_bufqueue; /* buffer job queue for process */
207 TAILQ_HEAD(,aiocblist) kaio_bufdone; /* buffer done queue for process */
208 TAILQ_HEAD(,aiocblist) kaio_sockqueue; /* queue for aios waiting on sockets */
209};
210
211#define KAIO_RUNDOWN 0x1 /* process is being run down */
212#define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */
213
214static TAILQ_HEAD(,aioproclist) aio_freeproc, aio_activeproc;
215static TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */
216static TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */
217static TAILQ_HEAD(,aiocblist) aio_freejobs; /* Pool of free jobs */
218
219static void aio_init_aioinfo(struct proc *p);
220static void aio_onceonly(void *);
221static int aio_free_entry(struct aiocblist *aiocbe);
222static void aio_process(struct aiocblist *aiocbe);
223static int aio_newproc(void);
224static int aio_aqueue(struct aiocb *job, int type);
225static void aio_physwakeup(struct bio *bio);
226static int aio_fphysio(struct aiocblist *aiocbe);
227static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
228static void aio_daemon(void *uproc);
229static void process_signal(void *aioj);
230
231SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);
232
233/*
234 * Zones for:
235 * kaio Per process async io info
236 * aiop async io thread data
237 * aiocb async io jobs
238 * aiol list io job pointer - internal to aio_suspend XXX
239 * aiolio list io jobs
240 */
241static vm_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
242
243/*
244 * Startup initialization
245 */
246static void
247aio_onceonly(void *na)
248{
249 TAILQ_INIT(&aio_freeproc);
250 TAILQ_INIT(&aio_activeproc);
251 TAILQ_INIT(&aio_jobs);
252 TAILQ_INIT(&aio_bufjobs);
253 TAILQ_INIT(&aio_freejobs);
254 kaio_zone = zinit("AIO", sizeof(struct kaioinfo), 0, 0, 1);
255 aiop_zone = zinit("AIOP", sizeof(struct aioproclist), 0, 0, 1);
256 aiocb_zone = zinit("AIOCB", sizeof(struct aiocblist), 0, 0, 1);
257 aiol_zone = zinit("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t), 0, 0, 1);
258 aiolio_zone = zinit("AIOLIO", sizeof(struct aio_liojob), 0, 0, 1);
259 aiod_timeout = AIOD_TIMEOUT_DEFAULT;
260 aiod_lifetime = AIOD_LIFETIME_DEFAULT;
261 jobrefid = 1;
262}
263
264/*
265 * Init the per-process aioinfo structure. The aioinfo limits are set
266 * per-process for user limit (resource) management.
267 */
268static void
269aio_init_aioinfo(struct proc *p)
270{
271 struct kaioinfo *ki;
272 if (p->p_aioinfo == NULL) {
273 ki = zalloc(kaio_zone);
274 p->p_aioinfo = ki;
275 ki->kaio_flags = 0;
276 ki->kaio_maxactive_count = max_aio_per_proc;
277 ki->kaio_active_count = 0;
278 ki->kaio_qallowed_count = max_aio_queue_per_proc;
279 ki->kaio_queue_count = 0;
280 ki->kaio_ballowed_count = max_buf_aio;
281 ki->kaio_buffer_count = 0;
282 ki->kaio_buffer_finished_count = 0;
283 ki->kaio_p = p;
284 TAILQ_INIT(&ki->kaio_jobdone);
285 TAILQ_INIT(&ki->kaio_jobqueue);
286 TAILQ_INIT(&ki->kaio_bufdone);
287 TAILQ_INIT(&ki->kaio_bufqueue);
288 TAILQ_INIT(&ki->kaio_liojoblist);
289 TAILQ_INIT(&ki->kaio_sockqueue);
290 }
291
292 while (num_aio_procs < target_aio_procs)
293 aio_newproc();
294}
295
296/*
297 * Free a job entry. Wait for completion if it is currently active, but don't
298 * delay forever. If we delay, we return a flag that says that we have to
299 * restart the queue scan.
300 */
301static int
302aio_free_entry(struct aiocblist *aiocbe)
303{
304 struct kaioinfo *ki;
305 struct aio_liojob *lj;
306 struct proc *p;
307 int error;
308
309 if (aiocbe->jobstate == JOBST_NULL)
310 panic("aio_free_entry: freeing already free job");
311
312 p = aiocbe->userproc;
313 ki = p->p_aioinfo;
314 lj = aiocbe->lio;
315 if (ki == NULL)
316 panic("aio_free_entry: missing p->p_aioinfo");
317
318 while (aiocbe->jobstate == JOBST_JOBRUNNING) {
319 aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
320 tsleep(aiocbe, 0, "jobwai", 0);
321 }
322 if (aiocbe->bp == NULL) {
323 if (ki->kaio_queue_count <= 0)
324 panic("aio_free_entry: process queue size <= 0");
325 if (num_queue_count <= 0)
326 panic("aio_free_entry: system wide queue size <= 0");
327
328 if (lj) {
329 lj->lioj_queue_count--;
330 if (aiocbe->jobflags & AIOCBLIST_DONE)
331 lj->lioj_queue_finished_count--;
332 }
333 ki->kaio_queue_count--;
334 if (aiocbe->jobflags & AIOCBLIST_DONE)
335 ki->kaio_queue_finished_count--;
336 num_queue_count--;
337 } else {
338 if (lj) {
339 lj->lioj_buffer_count--;
340 if (aiocbe->jobflags & AIOCBLIST_DONE)
341 lj->lioj_buffer_finished_count--;
342 }
343 if (aiocbe->jobflags & AIOCBLIST_DONE)
344 ki->kaio_buffer_finished_count--;
345 ki->kaio_buffer_count--;
346 num_buf_aio--;
347 }
348
349 /* aiocbe is going away, we need to destroy any knotes */
350 knote_remove(p->p_thread, &aiocbe->klist);
351
352 if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
353 && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
354 ki->kaio_flags &= ~KAIO_WAKEUP;
355 wakeup(p);
356 }
357
358 if (aiocbe->jobstate == JOBST_JOBQBUF) {
359 if ((error = aio_fphysio(aiocbe)) != 0)
360 return error;
361 if (aiocbe->jobstate != JOBST_JOBBFINISHED)
362 panic("aio_free_entry: invalid physio finish-up state");
363 crit_enter();
364 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
365 crit_exit();
366 } else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
367 crit_enter();
368 TAILQ_REMOVE(&aio_jobs, aiocbe, list);
369 TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
370 crit_exit();
371 } else if (aiocbe->jobstate == JOBST_JOBFINISHED)
372 TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
373 else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
374 crit_enter();
375 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
376 crit_exit();
377 if (aiocbe->bp) {
378 vunmapbuf(aiocbe->bp);
379 relpbuf(aiocbe->bp, NULL);
380 aiocbe->bp = NULL;
381 }
382 }
383 if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
384 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
385 zfree(aiolio_zone, lj);
386 }
387 aiocbe->jobstate = JOBST_NULL;
388 callout_stop(&aiocbe->timeout);
389 fdrop(aiocbe->fd_file);
390 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
391 return 0;
392}
393#endif /* VFS_AIO */
394
395/*
396 * Rundown the jobs for a given process.
397 */
398void
399aio_proc_rundown(struct proc *p)
400{
401#ifndef VFS_AIO
402 return;
403#else
404 struct kaioinfo *ki;
405 struct aio_liojob *lj, *ljn;
406 struct aiocblist *aiocbe, *aiocbn;
407 struct file *fp;
408 struct socket *so;
409
410 ki = p->p_aioinfo;
411 if (ki == NULL)
412 return;
413
414 ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
415 while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
416 ki->kaio_buffer_finished_count)) {
417 ki->kaio_flags |= KAIO_RUNDOWN;
418 if (tsleep(p, 0, "kaiowt", aiod_timeout))
419 break;
420 }
421
422 /*
423 * Move any aio ops that are waiting on socket I/O to the normal job
424 * queues so they are cleaned up with any others.
425 */
426 crit_enter();
427 for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
428 aiocbn) {
429 aiocbn = TAILQ_NEXT(aiocbe, plist);
430 fp = aiocbe->fd_file;
431 if (fp != NULL) {
432 so = (struct socket *)fp->f_data;
433 TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
434 if (TAILQ_EMPTY(&so->so_aiojobq)) {
435 so->so_snd.sb_flags &= ~SB_AIO;
436 so->so_rcv.sb_flags &= ~SB_AIO;
437 }
438 }
439 TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
440 TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
441 TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
442 }
443 crit_exit();
444
445restart1:
446 for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
447 aiocbn = TAILQ_NEXT(aiocbe, plist);
448 if (aio_free_entry(aiocbe))
449 goto restart1;
450 }
451
452restart2:
453 for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
454 aiocbn) {
455 aiocbn = TAILQ_NEXT(aiocbe, plist);
456 if (aio_free_entry(aiocbe))
457 goto restart2;
458 }
459
460restart3:
461 crit_enter();
462 while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
463 ki->kaio_flags |= KAIO_WAKEUP;
464 tsleep(p, 0, "aioprn", 0);
465 crit_exit();
466 goto restart3;
467 }
468 crit_exit();
469
470restart4:
471 crit_enter();
472 for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
473 aiocbn = TAILQ_NEXT(aiocbe, plist);
474 if (aio_free_entry(aiocbe)) {
475 crit_exit();
476 goto restart4;
477 }
478 }
479 crit_exit();
480
481 /*
482 * If we've slept, jobs might have moved from one queue to another.
483 * Retry rundown if we didn't manage to empty the queues.
484 */
485 if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL ||
486 TAILQ_FIRST(&ki->kaio_jobqueue) != NULL ||
487 TAILQ_FIRST(&ki->kaio_bufqueue) != NULL ||
488 TAILQ_FIRST(&ki->kaio_bufdone) != NULL)
489 goto restart1;
490
491 for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
492 ljn = TAILQ_NEXT(lj, lioj_list);
493 if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
494 0)) {
495 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
496 zfree(aiolio_zone, lj);
497 } else {
498#ifdef DIAGNOSTIC
499 printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
500 "QF:%d\n", lj->lioj_buffer_count,
501 lj->lioj_buffer_finished_count,
502 lj->lioj_queue_count,
503 lj->lioj_queue_finished_count);
504#endif
505 }
506 }
507
508 zfree(kaio_zone, ki);
509 p->p_aioinfo = NULL;
510#endif /* VFS_AIO */
511}
512
513#ifdef VFS_AIO
514/*
515 * Select a job to run (called by an AIO daemon).
516 */
517static struct aiocblist *
518aio_selectjob(struct aioproclist *aiop)
519{
520 struct aiocblist *aiocbe;
521 struct kaioinfo *ki;
522 struct proc *userp;
523
524 crit_enter();
525 for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe =
526 TAILQ_NEXT(aiocbe, list)) {
527 userp = aiocbe->userproc;
528 ki = userp->p_aioinfo;
529
530 if (ki->kaio_active_count < ki->kaio_maxactive_count) {
531 TAILQ_REMOVE(&aio_jobs, aiocbe, list);
532 crit_exit();
533 return aiocbe;
534 }
535 }
536 crit_exit();
537
538 return NULL;
539}
540
541/*
542 * The AIO processing activity. This is the code that does the I/O request for
543 * the non-physio version of the operations. The normal vn operations are used,
544 * and this code should work in all instances for every type of file, including
545 * pipes, sockets, fifos, and regular files.
546 */
547static void
548aio_process(struct aiocblist *aiocbe)
549{
550 struct thread *mytd;
551 struct aiocb *cb;
552 struct file *fp;
553 struct uio auio;
554 struct iovec aiov;
555 int cnt;
556 int error;
557 int oublock_st, oublock_end;
558 int inblock_st, inblock_end;
559
560 mytd = curthread;
561 cb = &aiocbe->uaiocb;
562 fp = aiocbe->fd_file;
563
564 aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
565 aiov.iov_len = cb->aio_nbytes;
566
567 auio.uio_iov = &aiov;
568 auio.uio_iovcnt = 1;
569 auio.uio_offset = cb->aio_offset;
570 auio.uio_resid = cb->aio_nbytes;
571 cnt = cb->aio_nbytes;
572 auio.uio_segflg = UIO_USERSPACE;
573 auio.uio_td = mytd;
574
575 inblock_st = mytd->td_proc->p_stats->p_ru.ru_inblock;
576 oublock_st = mytd->td_proc->p_stats->p_ru.ru_oublock;
577 /*
578 * _aio_aqueue() acquires a reference to the file that is
579 * released in aio_free_entry().
580 */
581 if (cb->aio_lio_opcode == LIO_READ) {
582 auio.uio_rw = UIO_READ;
583 error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET);
584 } else {
585 auio.uio_rw = UIO_WRITE;
586 error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET);
587 }
588 inblock_end = mytd->td_proc->p_stats->p_ru.ru_inblock;
589 oublock_end = mytd->td_proc->p_stats->p_ru.ru_oublock;
590
591 aiocbe->inputcharge = inblock_end - inblock_st;
592 aiocbe->outputcharge = oublock_end - oublock_st;
593
594 if ((error) && (auio.uio_resid != cnt)) {
595 if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
596 error = 0;
597 if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
598 psignal(aiocbe->userproc, SIGPIPE);
599 }
600
601 cnt -= auio.uio_resid;
602 cb->_aiocb_private.error = error;
603 cb->_aiocb_private.status = cnt;
604}
605
606/*
607 * The AIO daemon. Most of the actual work is done in aio_process(),
608 * but the setup (and address space mgmt) is done in this routine.
609 *
610 * The MP lock is held on entry.
611 */
612static void
613aio_daemon(void *uproc)
614{
615 struct aio_liojob *lj;
616 struct aiocb *cb;
617 struct aiocblist *aiocbe;
618 struct aioproclist *aiop;
619 struct kaioinfo *ki;
620 struct proc *curcp, *mycp, *userp;
621 struct vmspace *myvm, *tmpvm;
622 struct ucred *cr;
623
624 /*
625 * Local copies of curproc (cp) and vmspace (myvm)
626 */
627 mycp = curproc;
628 myvm = mycp->p_vmspace;
629
630 if (mycp->p_textvp) {
631 vrele(mycp->p_textvp);
632 mycp->p_textvp = NULL;
633 }
634
635 /*
636 * Allocate and ready the aio control info. There is one aiop structure
637 * per daemon.
638 */
639 aiop = zalloc(aiop_zone);
640 aiop->aioproc = mycp;
641 aiop->aioprocflags |= AIOP_FREE;
642
643 crit_enter();
644
645 /*
646 * Place thread (lightweight process) onto the AIO free thread list.
647 */
648 if (TAILQ_EMPTY(&aio_freeproc))
649 wakeup(&aio_freeproc);
650 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
651
652 crit_exit();
653
654 /* Make up a name for the daemon. */
655 strcpy(mycp->p_comm, "aiod");
656
657 /*
658 * Get rid of our current filedescriptors. AIOD's don't need any
659 * filedescriptors, except as temporarily inherited from the client.
660 * Credentials are also cloned, and made equivalent to "root".
661 */
662 fdfree(mycp);
663 mycp->p_fd = NULL;
664 cr = cratom(&mycp->p_ucred);
665 cr->cr_uid = 0;
666 uireplace(&cr->cr_uidinfo, uifind(0));
667 cr->cr_ngroups = 1;
668 cr->cr_groups[0] = 1;
669
670 /* The daemon resides in its own pgrp. */
671 enterpgrp(mycp, mycp->p_pid, 1);
672
673 /* Mark special process type. */
674 mycp->p_flag |= P_SYSTEM | P_KTHREADP;
675
676 /*
677 * Wakeup parent process. (Parent sleeps to keep from blasting away
678 * and creating too many daemons.)
679 */
680 wakeup(mycp);
681
682 for (;;) {
683 /*
684 * curcp is the current daemon process context.
685 * userp is the current user process context.
686 */
687 curcp = mycp;
688
689 /*
690 * Take daemon off of free queue
691 */
692 if (aiop->aioprocflags & AIOP_FREE) {
693 crit_enter();
694 TAILQ_REMOVE(&aio_freeproc, aiop, list);
695 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
696 aiop->aioprocflags &= ~AIOP_FREE;
697 crit_exit();
698 }
699 aiop->aioprocflags &= ~AIOP_SCHED;
700
701 /*
702 * Check for jobs.
703 */
704 while ((aiocbe = aio_selectjob(aiop)) != NULL) {
705 cb = &aiocbe->uaiocb;
706 userp = aiocbe->userproc;
707
708 aiocbe->jobstate = JOBST_JOBRUNNING;
709
710 /*
711 * Connect to process address space for user program.
712 */
713 if (userp != curcp) {
714 /*
715 * Save the current address space that we are
716 * connected to.
717 */
718 tmpvm = mycp->p_vmspace;
719
720 /*
721 * Point to the new user address space, and
722 * refer to it.
723 */
724 mycp->p_vmspace = userp->p_vmspace;
725 mycp->p_vmspace->vm_refcnt++;
726
727 /* Activate the new mapping. */
728 pmap_activate(mycp);
729
730 /*
731 * If the old address space wasn't the daemon's
732 * own address space, then we need to remove the
733 * daemon's reference from the other process
734 * that it was acting on behalf of.
735 */
736 if (tmpvm != myvm) {
737 vmspace_free(tmpvm);
738 }
739 curcp = userp;
740 }
741
742 ki = userp->p_aioinfo;
743 lj = aiocbe->lio;
744
745 /* Account for currently active jobs. */
746 ki->kaio_active_count++;
747
748 /* Do the I/O function. */
749 aio_process(aiocbe);
750
751 /* Decrement the active job count. */
752 ki->kaio_active_count--;
753
754 /*
755 * Increment the completion count for wakeup/signal
756 * comparisons.
757 */
758 aiocbe->jobflags |= AIOCBLIST_DONE;
759 ki->kaio_queue_finished_count++;
760 if (lj)
761 lj->lioj_queue_finished_count++;
762 if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
763 & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
764 ki->kaio_flags &= ~KAIO_WAKEUP;
765 wakeup(userp);
766 }
767
768 crit_enter();
769 if (lj && (lj->lioj_flags &
770 (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
771 if ((lj->lioj_queue_finished_count ==
772 lj->lioj_queue_count) &&
773 (lj->lioj_buffer_finished_count ==
774 lj->lioj_buffer_count)) {
775 psignal(userp,
776 lj->lioj_signal.sigev_signo);
777 lj->lioj_flags |=
778 LIOJ_SIGNAL_POSTED;
779 }
780 }
781 crit_exit();
782
783 aiocbe->jobstate = JOBST_JOBFINISHED;
784
785 crit_enter();
786 TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
787 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, plist);
788 crit_exit();
789 KNOTE(&aiocbe->klist, 0);
790
791 if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
792 wakeup(aiocbe);
793 aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
794 }
795
796 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
797 psignal(userp, cb->aio_sigevent.sigev_signo);
798 }
799 }
800
801 /*
802 * Disconnect from user address space.
803 */
804 if (curcp != mycp) {
805 /* Get the user address space to disconnect from. */
806 tmpvm = mycp->p_vmspace;
807
808 /* Get original address space for daemon. */
809 mycp->p_vmspace = myvm;
810
811 /* Activate the daemon's address space. */
812 pmap_activate(mycp);
813#ifdef DIAGNOSTIC
814 if (tmpvm == myvm) {
815 printf("AIOD: vmspace problem -- %d\n",
816 mycp->p_pid);
817 }
818#endif
819 /* Remove our vmspace reference. */
820 vmspace_free(tmpvm);
821
822 curcp = mycp;
823 }
824
825 /*
826 * If we are the first to be put onto the free queue, wakeup
827 * anyone waiting for a daemon.
828 */
829 crit_enter();
830 TAILQ_REMOVE(&aio_activeproc, aiop, list);
831 if (TAILQ_EMPTY(&aio_freeproc))
832 wakeup(&aio_freeproc);
833 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
834 aiop->aioprocflags |= AIOP_FREE;
835 crit_exit();
836
837 /*
838 * If daemon is inactive for a long time, allow it to exit,
839 * thereby freeing resources.
840 */
841 if (((aiop->aioprocflags & AIOP_SCHED) == 0) && tsleep(mycp,
842 0, "aiordy", aiod_lifetime)) {
843 crit_enter();
844 if (TAILQ_EMPTY(&aio_jobs)) {
845 if ((aiop->aioprocflags & AIOP_FREE) &&
846 (num_aio_procs > target_aio_procs)) {
847 TAILQ_REMOVE(&aio_freeproc, aiop, list);
848 crit_exit();
849 zfree(aiop_zone, aiop);
850 num_aio_procs--;
851#ifdef DIAGNOSTIC
852 if (mycp->p_vmspace->vm_refcnt <= 1) {
853 printf("AIOD: bad vm refcnt for"
854 " exiting daemon: %d\n",
855 mycp->p_vmspace->vm_refcnt);
856 }
857#endif
858 exit1(0);
859 }
860 }
861 crit_exit();
862 }
863 }
864}
865
866/*
867 * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
868 * AIO daemon modifies its environment itself.
869 */
870static int
871aio_newproc()
872{
873 int error;
874 struct lwp *lp;
875 struct proc *np;
876
877 lp = &proc0.p_lwp;
878 error = fork1(lp, RFPROC|RFMEM|RFNOWAIT, &np);
879 if (error)
880 return error;
881 cpu_set_fork_handler(np, aio_daemon, curproc);
882 start_forked_proc(lp, np);
883
884 /*
885 * Wait until daemon is started, but continue on just in case to
886 * handle error conditions.
887 */
888 error = tsleep(np, 0, "aiosta", aiod_timeout);
889 num_aio_procs++;
890
891 return error;
892}
893
894/*
895 * Try the high-performance, low-overhead physio method for eligible
896 * VCHR devices. This method doesn't use an aio helper thread, and
897 * thus has very low overhead.
898 *
899 * Assumes that the caller, _aio_aqueue(), has incremented the file
900 * structure's reference count, preventing its deallocation for the
901 * duration of this call.
902 */
903static int
904aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
905{
906 int error;
907 struct aiocb *cb;
908 struct file *fp;
909 struct buf *bp;
910 struct vnode *vp;
911 struct kaioinfo *ki;
912 struct aio_liojob *lj;
913 int notify;
914
915 cb = &aiocbe->uaiocb;
916 fp = aiocbe->fd_file;
917
918 if (fp->f_type != DTYPE_VNODE)
919 return (-1);
920
921 vp = (struct vnode *)fp->f_data;
922
923 /*
924 * If it's not a disk, we don't want to return a positive error;
925 * a positive error keeps the aio code from falling through to the
926 * thread method when the target is a regular file.
927 */
928 if (!vn_isdisk(vp, &error)) {
929 if (error == ENOTBLK)
930 return (-1);
931 else
932 return (error);
933 }
934
935 if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
936 return (-1);
937
938 if (cb->aio_nbytes >
939 MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
940 return (-1);
941
942 ki = p->p_aioinfo;
943 if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
944 return (-1);
945
946 ki->kaio_buffer_count++;
947
948 lj = aiocbe->lio;
949 if (lj)
950 lj->lioj_buffer_count++;
951
952 /* Create and build a buffer header for a transfer. */
953 bp = getpbuf(NULL);
954 BUF_KERNPROC(bp);
955
956 /*
957 * Get a copy of the kva from the physical buffer.
958 */
959 bp->b_bio1.bio_caller_info1.ptr = p;
960 error = 0;
961
962 bp->b_cmd = (cb->aio_lio_opcode == LIO_WRITE) ?
963 BUF_CMD_WRITE : BUF_CMD_READ;
964 bp->b_bio1.bio_done = aio_physwakeup;
965 bp->b_bio1.bio_offset = cb->aio_offset;
966
967 /* Bring buffer into kernel space. */
968 if (vmapbuf(bp, __DEVOLATILE(char *, cb->aio_buf), cb->aio_nbytes) < 0) {
969 error = EFAULT;
970 goto doerror;
971 }
972
973 crit_enter();
974
975 aiocbe->bp = bp;
976 bp->b_bio1.bio_caller_info2.ptr = aiocbe;
977 TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
978 TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
979 aiocbe->jobstate = JOBST_JOBQBUF;
980 cb->_aiocb_private.status = cb->aio_nbytes;
981 num_buf_aio++;
982 bp->b_error = 0;
983
984 crit_exit();
985
986 /* Perform transfer. */
987 dev_dstrategy(vp->v_rdev, &bp->b_bio1);
988
989 notify = 0;
990 crit_enter();
991
992 /*
993 * If we had an error invoking the request, or an error in processing
994 * the request before we have returned, we process it as an error in
995 * transfer. Note that such an I/O error is not indicated immediately,
996 * but is returned using the aio_error mechanism. In this case,
997 * aio_suspend will return immediately.
998 */
999 if (bp->b_error || (bp->b_flags & B_ERROR)) {
1000 struct aiocb *job = aiocbe->uuaiocb;
1001
1002 aiocbe->uaiocb._aiocb_private.status = 0;
1003 suword(&job->_aiocb_private.status, 0);
1004 aiocbe->uaiocb._aiocb_private.error = bp->b_error;
1005 suword(&job->_aiocb_private.error, bp->b_error);
1006
1007 ki->kaio_buffer_finished_count++;
1008
1009 if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
1010 aiocbe->jobstate = JOBST_JOBBFINISHED;
1011 aiocbe->jobflags |= AIOCBLIST_DONE;
1012 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
1013 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
1014 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
1015 notify = 1;
1016 }
1017 }
1018 crit_exit();
1019 if (notify)
1020 KNOTE(&aiocbe->klist, 0);
1021 return 0;
1022
1023doerror:
1024 ki->kaio_buffer_count--;
1025 if (lj)
1026 lj->lioj_buffer_count--;
1027 aiocbe->bp = NULL;
1028 relpbuf(bp, NULL);
1029 return error;
1030}
1031
1032/*
1033 * This waits/tests physio completion.
1034 */
1035static int
1036aio_fphysio(struct aiocblist *iocb)
1037{
1038 struct buf *bp;
1039 int error;
1040
1041 bp = iocb->bp;
1042
1043 crit_enter();
1044 while (bp->b_cmd != BUF_CMD_DONE) {
1045 if (tsleep(bp, 0, "physstr", aiod_timeout)) {
1046 if (bp->b_cmd != BUF_CMD_DONE) {
1047 crit_exit();
1048 return EINPROGRESS;
1049 } else {
1050 break;
1051 }
1052 }
1053 }
1054 crit_exit();
1055
1056 /* Release mapping into kernel space. */
1057 vunmapbuf(bp);
1058 iocb->bp = 0;
1059
1060 error = 0;
1061
1062 /* Check for an error. */
1063 if (bp->b_flags & B_ERROR)
1064 error = bp->b_error;
1065
1066 relpbuf(bp, NULL);
1067 return (error);
1068}
1069#endif /* VFS_AIO */
1070
1071/*
1072 * Wake up aio requests that may be serviceable now.
1073 */
1074void
1075aio_swake(struct socket *so, struct sockbuf *sb)
1076{
1077#ifndef VFS_AIO
1078 return;
1079#else
1080 struct aiocblist *cb,*cbn;
1081 struct proc *p;
1082 struct kaioinfo *ki = NULL;
1083 int opcode, wakecount = 0;
1084 struct aioproclist *aiop;
1085
1086 if (sb == &so->so_snd) {
1087 opcode = LIO_WRITE;
1088 so->so_snd.sb_flags &= ~SB_AIO;
1089 } else {
1090 opcode = LIO_READ;
1091 so->so_rcv.sb_flags &= ~SB_AIO;
1092 }
1093
1094 for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
1095 cbn = TAILQ_NEXT(cb, list);
1096 if (opcode == cb->uaiocb.aio_lio_opcode) {
1097 p = cb->userproc;
1098 ki = p->p_aioinfo;
1099 TAILQ_REMOVE(&so->so_aiojobq, cb, list);
1100 TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
1101 TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
1102 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
1103 wakecount++;
1104 if (cb->jobstate != JOBST_JOBQGLOBAL)
1105 panic("invalid queue value");
1106 }
1107 }
1108
1109 while (wakecount--) {
1110 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) {
1111 TAILQ_REMOVE(&aio_freeproc, aiop, list);
1112 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1113 aiop->aioprocflags &= ~AIOP_FREE;
1114 wakeup(aiop->aioproc);
1115 }
1116 }
1117#endif /* VFS_AIO */
1118}
1119
1120#ifdef VFS_AIO
1121/*
1122 * Queue a new AIO request. The choice between the threaded and the direct
1123 * physio (VCHR) technique is made in this code.
1124 */
1125static int
1126_aio_aqueue(struct aiocb *job, struct aio_liojob *lj, int type)
1127{
1128 struct proc *p = curproc;
1129 struct filedesc *fdp;
1130 struct file *fp;
1131 unsigned int fd;
1132 struct socket *so;
1133 int error;
1134 int opcode, user_opcode;
1135 struct aiocblist *aiocbe;
1136 struct aioproclist *aiop;
1137 struct kaioinfo *ki;
1138 struct kevent kev;
1139 struct kqueue *kq;
1140 struct file *kq_fp;
1141
1142 if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL)
1143 TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
1144 else
1145 aiocbe = zalloc (aiocb_zone);
1146
1147 aiocbe->inputcharge = 0;
1148 aiocbe->outputcharge = 0;
1149 callout_init(&aiocbe->timeout);
1150 SLIST_INIT(&aiocbe->klist);
1151
1152 suword(&job->_aiocb_private.status, -1);
1153 suword(&job->_aiocb_private.error, 0);
1154 suword(&job->_aiocb_private.kernelinfo, -1);
1155
1156 error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb));
1157 if (error) {
1158 suword(&job->_aiocb_private.error, error);
1159 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1160 return error;
1161 }
1162 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1163 !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
1164 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1165 return EINVAL;
1166 }
1167
1168 /* Save userspace address of the job info. */
1169 aiocbe->uuaiocb = job;
1170
1171 /* Get the opcode. */
1172 user_opcode = aiocbe->uaiocb.aio_lio_opcode;
1173 if (type != LIO_NOP)
1174 aiocbe->uaiocb.aio_lio_opcode = type;
1175 opcode = aiocbe->uaiocb.aio_lio_opcode;
1176
1177 /* Get the fd info for process. */
1178 fdp = p->p_fd;
1179
1180 /*
1181 * Range check file descriptor.
1182 */
1183 fd = aiocbe->uaiocb.aio_fildes;
1184 if (fd >= fdp->fd_nfiles) {
1185 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1186 if (type == 0)
1187 suword(&job->_aiocb_private.error, EBADF);
1188 return EBADF;
1189 }
1190
1191 fp = aiocbe->fd_file = fdp->fd_files[fd].fp;
1192 if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) ==
1193 0))) {
1194 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1195 if (type == 0)
1196 suword(&job->_aiocb_private.error, EBADF);
1197 return EBADF;
1198 }
1199 fhold(fp);
1200
1201 if (aiocbe->uaiocb.aio_offset == -1LL) {
1202 error = EINVAL;
1203 goto aqueue_fail;
1204 }
1205 error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
1206 if (error) {
1207 error = EINVAL;
1208 goto aqueue_fail;
1209 }
1210 aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
1211 if (jobrefid == LONG_MAX)
1212 jobrefid = 1;
1213 else
1214 jobrefid++;
1215
1216 if (opcode == LIO_NOP) {
1217 fdrop(fp);
1218 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1219 if (type == 0) {
1220 suword(&job->_aiocb_private.error, 0);
1221 suword(&job->_aiocb_private.status, 0);
1222 suword(&job->_aiocb_private.kernelinfo, 0);
1223 }
1224 return 0;
1225 }
1226 if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
1227 if (type == 0)
1228 suword(&job->_aiocb_private.status, 0);
1229 error = EINVAL;
1230 goto aqueue_fail;
1231 }
1232
1233 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
1234 kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
1235 kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr;
1236 }
1237 else {
1238 /*
1239 * This method for requesting kevent-based notification won't
1240 * work on the alpha, since we're passing in a pointer
1241 * via aio_lio_opcode, which is an int. Use the SIGEV_KEVENT-
1242 * based method instead.
1243 */
1244 if (user_opcode == LIO_NOP || user_opcode == LIO_READ ||
1245 user_opcode == LIO_WRITE)
1246 goto no_kqueue;
1247
1248 error = copyin((struct kevent *)(uintptr_t)user_opcode,
1249 &kev, sizeof(kev));
1250 if (error)
1251 goto aqueue_fail;
1252 }
1253 if ((u_int)kev.ident >= fdp->fd_nfiles ||
1254 (kq_fp = fdp->fd_files[kev.ident].fp) == NULL ||
1255 (kq_fp->f_type != DTYPE_KQUEUE)) {
1256 error = EBADF;
1257 goto aqueue_fail;
1258 }
1259 kq = (struct kqueue *)kq_fp->f_data;
1260 kev.ident = (uintptr_t)aiocbe->uuaiocb;
1261 kev.filter = EVFILT_AIO;
1262 kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
1263 kev.data = (intptr_t)aiocbe;
1264 error = kqueue_register(kq, &kev, p->p_thread);
1265aqueue_fail:
1266 if (error) {
1267 fdrop(fp);
1268 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1269 if (type == 0)
1270 suword(&job->_aiocb_private.error, error);
1271 goto done;
1272 }
1273no_kqueue:
1274
1275 suword(&job->_aiocb_private.error, EINPROGRESS);
1276 aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
1277 aiocbe->userproc = p;
1278 aiocbe->jobflags = 0;
1279 aiocbe->lio = lj;
1280 ki = p->p_aioinfo;
1281
1282 if (fp->f_type == DTYPE_SOCKET) {
1283 /*
1284 * Alternate queueing for socket ops: Reach down into the
1285 * descriptor to get the socket data. Then check to see if the
1286 * socket is ready to be read or written (based on the requested
1287 * operation).
1288 *
1289 * If it is not ready for io, then queue the aiocbe on the
1290 * socket, and set the flags so we get a call when sbnotify()
1291 * happens.
1292 */
1293 so = (struct socket *)fp->f_data;
1294 crit_enter();
1295 if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
1296 LIO_WRITE) && (!sowriteable(so)))) {
1297 TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
1298 TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
1299 if (opcode == LIO_READ)
1300 so->so_rcv.sb_flags |= SB_AIO;
1301 else
1302 so->so_snd.sb_flags |= SB_AIO;
1303 aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
1304 ki->kaio_queue_count++;
1305 num_queue_count++;
1306 crit_exit();
1307 error = 0;
1308 goto done;
1309 }
1310 crit_exit();
1311 }
1312
1313 if ((error = aio_qphysio(p, aiocbe)) == 0)
1314 goto done;
1315 if (error > 0) {
1316 suword(&job->_aiocb_private.status, 0);
1317 aiocbe->uaiocb._aiocb_private.error = error;
1318 suword(&job->_aiocb_private.error, error);
1319 goto done;
1320 }
1321
1322 /* No buffer for daemon I/O. */
1323 aiocbe->bp = NULL;
1324
1325 ki->kaio_queue_count++;
1326 if (lj)
1327 lj->lioj_queue_count++;
1328 crit_enter();
1329 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1330 TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
1331 crit_exit();
1332 aiocbe->jobstate = JOBST_JOBQGLOBAL;
1333
1334 num_queue_count++;
1335 error = 0;
1336
1337 /*
1338 * If we don't have a free AIO process, and we are below our quota, then
1339 * start one. Otherwise, depend on the subsequent I/O completions to
1340 * pick up this job. If we don't successfully create the new process
1341 * (thread) due to resource issues, we return an error for now (EAGAIN),
1342 * which is likely not the correct thing to do.
1343 */
1344 crit_enter();
1345retryproc:
1346 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1347 TAILQ_REMOVE(&aio_freeproc, aiop, list);
1348 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1349 aiop->aioprocflags &= ~AIOP_FREE;
1350 wakeup(aiop->aioproc);
1351 } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1352 ((ki->kaio_active_count + num_aio_resv_start) <
1353 ki->kaio_maxactive_count)) {
1354 num_aio_resv_start++;
1355 if ((error = aio_newproc()) == 0) {
1356 num_aio_resv_start--;
1357 goto retryproc;
1358 }
1359 num_aio_resv_start--;
1360 }
1361 crit_exit();
1362done:
1363 return error;
1364}
1365
1366/*
1367 * This routine queues an AIO request, checking for quotas.
1368 */
1369static int
1370aio_aqueue(struct aiocb *job, int type)
1371{
1372 struct proc *p = curproc;
1373 struct kaioinfo *ki;
1374
1375 if (p->p_aioinfo == NULL)
1376 aio_init_aioinfo(p);
1377
1378 if (num_queue_count >= max_queue_count)
1379 return EAGAIN;
1380
1381 ki = p->p_aioinfo;
1382 if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
1383 return EAGAIN;
1384
1385 return _aio_aqueue(job, NULL, type);
1386}
1387#endif /* VFS_AIO */
1388
1389/*
1390 * Support the aio_return system call; as a side effect, kernel resources are
1391 * released.
1392 */
1393int
1394aio_return(struct aio_return_args *uap)
1395{
1396#ifndef VFS_AIO
1397 return ENOSYS;
1398#else
1399 struct proc *p = curproc;
1400 long jobref;
1401 struct aiocblist *cb, *ncb;
1402 struct aiocb *ujob;
1403 struct kaioinfo *ki;
1404
1405 ki = p->p_aioinfo;
1406 if (ki == NULL)
1407 return EINVAL;
1408
1409 ujob = uap->aiocbp;
1410
1411 jobref = fuword(&ujob->_aiocb_private.kernelinfo);
1412 if (jobref == -1 || jobref == 0)
1413 return EINVAL;
1414
1415 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1416 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
1417 jobref) {
1418 if (ujob == cb->uuaiocb) {
1419 uap->sysmsg_result =
1420 cb->uaiocb._aiocb_private.status;
1421 } else
1422 uap->sysmsg_result = EFAULT;
1423 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1424 p->p_stats->p_ru.ru_oublock +=
1425 cb->outputcharge;
1426 cb->outputcharge = 0;
1427 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1428 p->p_stats->p_ru.ru_inblock += cb->inputcharge;
1429 cb->inputcharge = 0;
1430 }
1431 aio_free_entry(cb);
1432 return 0;
1433 }
1434 }
1435 crit_enter();
1436 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
1437 ncb = TAILQ_NEXT(cb, plist);
1438 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
1439 == jobref) {
1440 crit_exit();
1441 if (ujob == cb->uuaiocb) {
1442 uap->sysmsg_result =
1443 cb->uaiocb._aiocb_private.status;
1444 } else
1445 uap->sysmsg_result = EFAULT;
1446 aio_free_entry(cb);
1447 return 0;
1448 }
1449 }
1450 crit_exit();
1451
1452 return (EINVAL);
1453#endif /* VFS_AIO */
1454}
1455
1456/*
1457 * Allow a process to wakeup when any of the I/O requests are completed.
1458 */
1459int
1460aio_suspend(struct aio_suspend_args *uap)
1461{
1462#ifndef VFS_AIO
1463 return ENOSYS;
1464#else
1465 struct proc *p = curproc;
1466 struct timeval atv;
1467 struct timespec ts;
1468 struct aiocb *const *cbptr, *cbp;
1469 struct kaioinfo *ki;
1470 struct aiocblist *cb;
1471 int i;
1472 int njoblist;
1473 int error, timo;
1474 long *ijoblist;
1475 struct aiocb **ujoblist;
1476
1477 if (uap->nent > AIO_LISTIO_MAX)
1478 return EINVAL;
1479
1480 timo = 0;
1481 if (uap->timeout) {
1482 /* Get timespec struct. */
1483 if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
1484 return error;
1485
1486 if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
1487 return (EINVAL);
1488
1489 TIMESPEC_TO_TIMEVAL(&atv, &ts);
1490 if (itimerfix(&atv))
1491 return (EINVAL);
1492 timo = tvtohz_high(&atv);
1493 }
1494
1495 ki = p->p_aioinfo;
1496 if (ki == NULL)
1497 return EAGAIN;
1498
1499 njoblist = 0;
1500 ijoblist = zalloc(aiol_zone);
1501 ujoblist = zalloc(aiol_zone);
1502 cbptr = uap->aiocbp;
1503
1504 for (i = 0; i < uap->nent; i++) {
1505 cbp = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
1506 if (cbp == 0)
1507 continue;
1508 ujoblist[njoblist] = cbp;
1509 ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
1510 njoblist++;
1511 }
1512
1513 if (njoblist == 0) {
1514 zfree(aiol_zone, ijoblist);
1515 zfree(aiol_zone, ujoblist);
1516 return 0;
1517 }
1518
1519 error = 0;
1520 for (;;) {
1521 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1522 for (i = 0; i < njoblist; i++) {
1523 if (((intptr_t)
1524 cb->uaiocb._aiocb_private.kernelinfo) ==
1525 ijoblist[i]) {
1526 if (ujoblist[i] != cb->uuaiocb)
1527 error = EINVAL;
1528 zfree(aiol_zone, ijoblist);
1529 zfree(aiol_zone, ujoblist);
1530 return error;
1531 }
1532 }
1533 }
1534
1535 crit_enter();
1536 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
1537 TAILQ_NEXT(cb, plist)) {
1538 for (i = 0; i < njoblist; i++) {
1539 if (((intptr_t)
1540 cb->uaiocb._aiocb_private.kernelinfo) ==
1541 ijoblist[i]) {
1542 crit_exit();
1543 if (ujoblist[i] != cb->uuaiocb)
1544 error = EINVAL;
1545 zfree(aiol_zone, ijoblist);
1546 zfree(aiol_zone, ujoblist);
1547 return error;
1548 }
1549 }
1550 }
1551
1552 ki->kaio_flags |= KAIO_WAKEUP;
1553 error = tsleep(p, PCATCH, "aiospn", timo);
1554 crit_exit();
1555
1556 if (error == ERESTART || error == EINTR) {
1557 zfree(aiol_zone, ijoblist);
1558 zfree(aiol_zone, ujoblist);
1559 return EINTR;
1560 } else if (error == EWOULDBLOCK) {
1561 zfree(aiol_zone, ijoblist);
1562 zfree(aiol_zone, ujoblist);
1563 return EAGAIN;
1564 }
1565 }
1566
1567/* NOTREACHED */
1568 return EINVAL;
1569#endif /* VFS_AIO */
1570}
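/*
 * Illustrative userland use of aio_suspend() (a sketch; cb0 and cb1 are
 * assumed to be previously queued requests): wait up to one second for
 * either of two outstanding operations to complete.
 *
 *	const struct aiocb *list[2] = { &cb0, &cb1 };
 *	struct timespec ts = { 1, 0 };
 *
 *	if (aio_suspend(list, 2, &ts) == -1 && errno == EAGAIN)
 *		printf("timed out\n");	(neither request finished in time)
 */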
1571
1572/*
1573 * aio_cancel cancels any non-physio aio operations not currently in
1574 * progress.
1575 */
1576int
1577aio_cancel(struct aio_cancel_args *uap)
1578{
1579#ifndef VFS_AIO
1580 return ENOSYS;
1581#else
1582 struct proc *p = curproc;
1583 struct kaioinfo *ki;
1584 struct aiocblist *cbe, *cbn;
1585 struct file *fp;
1586 struct filedesc *fdp;
1587 struct socket *so;
1588 struct proc *po;
1589 int error;
1590 int cancelled=0;
1591 int notcancelled=0;
1592 struct vnode *vp;
1593
1594 fdp = p->p_fd;
1595 if ((u_int)uap->fd >= fdp->fd_nfiles ||
1596 (fp = fdp->fd_files[uap->fd].fp) == NULL)
1597 return (EBADF);
1598
1599 if (fp->f_type == DTYPE_VNODE) {
1600 vp = (struct vnode *)fp->f_data;
1601
1602 if (vn_isdisk(vp,&error)) {
1603 uap->sysmsg_result = AIO_NOTCANCELED;
1604 return 0;
1605 }
1606 } else if (fp->f_type == DTYPE_SOCKET) {
1607 so = (struct socket *)fp->f_data;
1608
1609 crit_enter();
1610
1611 for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
1612 cbn = TAILQ_NEXT(cbe, list);
1613 if ((uap->aiocbp == NULL) ||
1614 (uap->aiocbp == cbe->uuaiocb) ) {
1615 po = cbe->userproc;
1616 ki = po->p_aioinfo;
1617 TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
1618 TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
1619 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist);
1620 if (ki->kaio_flags & KAIO_WAKEUP) {
1621 wakeup(po);
1622 }
1623 cbe->jobstate = JOBST_JOBFINISHED;
1624 cbe->uaiocb._aiocb_private.status=-1;
1625 cbe->uaiocb._aiocb_private.error=ECANCELED;
1626 cancelled++;
1627/* XXX cancelled, knote? */
1628 if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1629 SIGEV_SIGNAL)
1630 psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1631 if (uap->aiocbp)
1632 break;
1633 }
1634 }
1635 crit_exit();
1636
1637 if ((cancelled) && (uap->aiocbp)) {
1638 uap->sysmsg_result = AIO_CANCELED;
1639 return 0;
1640 }
1641 }
1642 ki=p->p_aioinfo;
1643 if (ki == NULL)
1644 goto done;
1645 crit_enter();
1646
1647 for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
1648 cbn = TAILQ_NEXT(cbe, plist);
1649
1650 if ((uap->fd == cbe->uaiocb.aio_fildes) &&
1651 ((uap->aiocbp == NULL ) ||
1652 (uap->aiocbp == cbe->uuaiocb))) {
1653
1654 if (cbe->jobstate == JOBST_JOBQGLOBAL) {
1655 TAILQ_REMOVE(&aio_jobs, cbe, list);
1656 TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
1657 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
1658 plist);
1659 cancelled++;
1660 ki->kaio_queue_finished_count++;
1661 cbe->jobstate = JOBST_JOBFINISHED;
1662 cbe->uaiocb._aiocb_private.status = -1;
1663 cbe->uaiocb._aiocb_private.error = ECANCELED;
1664/* XXX cancelled, knote? */
1665 if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1666 SIGEV_SIGNAL)
1667 psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1668 } else {
1669 notcancelled++;
1670 }
1671 }
1672 }
1673 crit_exit();
1674done:
1675 if (notcancelled) {
1676 uap->sysmsg_result = AIO_NOTCANCELED;
1677 return 0;
1678 }
1679 if (cancelled) {
1680 uap->sysmsg_result = AIO_CANCELED;
1681 return 0;
1682 }
1683 uap->sysmsg_result = AIO_ALLDONE;
1684
1685 return 0;
1686#endif /* VFS_AIO */
1687}
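/*
 * Illustrative userland use of aio_cancel() (a sketch; fd and cb are
 * assumed): the three result codes correspond to the cases above.
 *
 *	ret = aio_cancel(fd, &cb);
 *	if (ret == AIO_CANCELED)
 *		;	(dequeued before any I/O was started)
 *	else if (ret == AIO_NOTCANCELED)
 *		;	(still in progress, e.g. disk physio)
 *	else if (ret == AIO_ALLDONE)
 *		;	(already finished; reap it with aio_return())
 */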
1688
1689/*
1690 * aio_error is implemented at the kernel level for compatibility purposes only.
1691 * For a user mode async implementation, it would be best to do it in a userland
1692 * subroutine.
1693 */
1694int
1695aio_error(struct aio_error_args *uap)
1696{
1697#ifndef VFS_AIO
1698 return ENOSYS;
1699#else
1700 struct proc *p = curproc;
1701 struct aiocblist *cb;
1702 struct kaioinfo *ki;
1703 long jobref;
1704
1705 ki = p->p_aioinfo;
1706 if (ki == NULL)
1707 return EINVAL;
1708
1709 jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
1710 if ((jobref == -1) || (jobref == 0))
1711 return EINVAL;
1712
1713 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1714 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1715 jobref) {
1716 uap->sysmsg_result = cb->uaiocb._aiocb_private.error;
1717 return 0;
1718 }
1719 }
1720
1721 crit_enter();
1722
1723 for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
1724 plist)) {
1725 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1726 jobref) {
1727 uap->sysmsg_result = EINPROGRESS;
1728 crit_exit();
1729 return 0;
1730 }
1731 }
1732
1733 for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
1734 plist)) {
1735 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1736 jobref) {
1737 uap->sysmsg_result = EINPROGRESS;
1738 crit_exit();
1739 return 0;
1740 }
1741 }
1742 crit_exit();
1743
1744 crit_enter();
1745 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
1746 plist)) {
1747 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1748 jobref) {
1749 uap->sysmsg_result = cb->uaiocb._aiocb_private.error;
1750 crit_exit();
1751 return 0;
1752 }
1753 }
1754
1755 for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
1756 plist)) {
1757 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1758 jobref) {
1759 uap->sysmsg_result = EINPROGRESS;
1760 crit_exit();
1761 return 0;
1762 }
1763 }
1764 crit_exit();
1765
1766#if (0)
1767 /*
1768 * Hack for lio.
1769 */
1770 status = fuword(&uap->aiocbp->_aiocb_private.status);
1771 if (status == -1)
1772 return fuword(&uap->aiocbp->_aiocb_private.error);
1773#endif
1774 return EINVAL;
1775#endif /* VFS_AIO */
1776}
1777
1778/* syscall - asynchronous read from a file (REALTIME) */
1779int
1780aio_read(struct aio_read_args *uap)
1781{
1782#ifndef VFS_AIO
1783 return ENOSYS;
1784#else
1785 return aio_aqueue(uap->aiocbp, LIO_READ);
1786#endif /* VFS_AIO */
1787}
1788
1789/* syscall - asynchronous write to a file (REALTIME) */
1790int
1791aio_write(struct aio_write_args *uap)
1792{
1793#ifndef VFS_AIO
1794 return ENOSYS;
1795#else
1796 return aio_aqueue(uap->aiocbp, LIO_WRITE);
1797#endif /* VFS_AIO */
1798}
1799
1800/* syscall - XXX undocumented */
1801int
1802lio_listio(struct lio_listio_args *uap)
1803{
1804#ifndef VFS_AIO
1805 return ENOSYS;
1806#else
1807 struct proc *p = curproc;
1808 int nent, nentqueued;
1809 struct aiocb *iocb, * const *cbptr;
1810 struct aiocblist *cb;
1811 struct kaioinfo *ki;
1812 struct aio_liojob *lj;
1813 int error, runningcode;
1814 int nerror;
1815 int i;
1816
1817 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
1818 return EINVAL;
1819
1820 nent = uap->nent;
1821 if (nent > AIO_LISTIO_MAX)
1822 return EINVAL;
1823
1824 if (p->p_aioinfo == NULL)
1825 aio_init_aioinfo(p);
1826
1827 if ((nent + num_queue_count) > max_queue_count)
1828 return EAGAIN;
1829
1830 ki = p->p_aioinfo;
1831 if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
1832 return EAGAIN;
1833
1834 lj = zalloc(aiolio_zone);
1835 if (!lj)
1836 return EAGAIN;
1837
1838 lj->lioj_flags = 0;
1839 lj->lioj_buffer_count = 0;
1840 lj->lioj_buffer_finished_count = 0;
1841 lj->lioj_queue_count = 0;
1842 lj->lioj_queue_finished_count = 0;
1843 lj->lioj_ki = ki;
1844
1845 /*
1846 * Setup signal.
1847 */
1848 if (uap->sig && (uap->mode == LIO_NOWAIT)) {
1849 error = copyin(uap->sig, &lj->lioj_signal,
1850 sizeof(lj->lioj_signal));
1851 if (error) {
1852 zfree(aiolio_zone, lj);
1853 return error;
1854 }
1855 if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
1856 zfree(aiolio_zone, lj);
1857 return EINVAL;
1858 }
1859 lj->lioj_flags |= LIOJ_SIGNAL;
1860 lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
1861 } else
1862 lj->lioj_flags &= ~LIOJ_SIGNAL;
1863
1864 TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
1865 /*
1866 * Get pointers to the list of I/O requests.
1867 */
1868 nerror = 0;
1869 nentqueued = 0;
1870 cbptr = uap->acb_list;
1871 for (i = 0; i < uap->nent; i++) {
1872 iocb = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
1873 if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) {
1874 error = _aio_aqueue(iocb, lj, 0);
1875 if (error == 0)
1876 nentqueued++;
1877 else
1878 nerror++;
1879 }
1880 }
1881
1882 /*
1883 * If we haven't queued any, then just return error.
1884 */
1885 if (nentqueued == 0)
1886 return 0;
1887
1888 /*
1889 * Calculate the appropriate error return.
1890 */
1891 runningcode = 0;
1892 if (nerror)
1893 runningcode = EIO;
1894
1895 if (uap->mode == LIO_WAIT) {
1896 int command, found, jobref;
1897
1898 for (;;) {
1899 found = 0;
1900 for (i = 0; i < uap->nent; i++) {
1901 /*
1902 * Fetch address of the control buf pointer in
1903 * user space.
1904 */
1905 iocb = (struct aiocb *)
1906 (intptr_t)fuword(&cbptr[i]);
1907 if (((intptr_t)iocb == -1) || ((intptr_t)iocb
1908 == 0))
1909 continue;
1910
1911 /*
1912 * Fetch the associated command from user space.
1913 */
1914 command = fuword(&iocb->aio_lio_opcode);
1915 if (command == LIO_NOP) {
1916 found++;
1917 continue;
1918 }
1919
1920 jobref = fuword(&iocb->_aiocb_private.kernelinfo);
1921
1922 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1923 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
1924 == jobref) {
1925 if (cb->uaiocb.aio_lio_opcode
1926 == LIO_WRITE) {
1927 p->p_stats->p_ru.ru_oublock
1928 +=
1929 cb->outputcharge;
1930 cb->outputcharge = 0;
1931 } else if (cb->uaiocb.aio_lio_opcode
1932 == LIO_READ) {
1933 p->p_stats->p_ru.ru_inblock
1934 += cb->inputcharge;
1935 cb->inputcharge = 0;
1936 }
1937 found++;
1938 break;
1939 }
1940 }
1941
1942 crit_enter();
1943 TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) {
1944 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
1945 == jobref) {
1946 found++;
1947 break;
1948 }
1949 }
1950 crit_exit();
1951 }
1952
1953 /*
1954 * If all I/Os have been disposed of, then we can
1955 * return.
1956 */
1957 if (found == nentqueued)
1958 return runningcode;
1959
1960 ki->kaio_flags |= KAIO_WAKEUP;
1961 error = tsleep(p, PCATCH, "aiospn", 0);
1962
1963 if (error == EINTR)
1964 return EINTR;
1965 else if (error == EWOULDBLOCK)
1966 return EAGAIN;
1967 }
1968 }
1969
1970 return runningcode;
1971#endif /* VFS_AIO */
1972}
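/*
 * Illustrative userland use of lio_listio() (a sketch; rd and wr are
 * assumed to be initialized aiocbs): submit a read and a write as one
 * batch and wait for both.  EIO reports that at least one element
 * failed, matching the runningcode logic above.
 *
 *	struct aiocb *batch[2] = { &rd, &wr };
 *
 *	rd.aio_lio_opcode = LIO_READ;
 *	wr.aio_lio_opcode = LIO_WRITE;
 *	if (lio_listio(LIO_WAIT, batch, 2, NULL) == -1)
 *		err(1, "lio_listio");
 */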
1973
1974#ifdef VFS_AIO
1975/*
1976 * This is a weird hack so that we can post a signal. It is safe to do so from
1977 * a timeout routine, but *not* from an interrupt routine.
1978 */
1979static void
1980process_signal(void *aioj)
1981{
1982 struct aiocblist *aiocbe = aioj;
1983 struct aio_liojob *lj = aiocbe->lio;
1984 struct aiocb *cb = &aiocbe->uaiocb;
1985
1986 if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) &&
1987 (lj->lioj_queue_count == lj->lioj_queue_finished_count)) {
1988 psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
1989 lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
1990 }
1991
1992 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL)
1993 psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo);
1994}
1995
1996/*
1997 * Interrupt handler for physio, performs the necessary process wakeups, and
1998 * signals.
1999 */
2000static void
2001aio_physwakeup(struct bio *bio)
2002{
2003 struct buf *bp = bio->bio_buf;
2004 struct aiocblist *aiocbe;
2005 struct proc *p;
2006 struct kaioinfo *ki;
2007 struct aio_liojob *lj;
2008
2009 aiocbe = bio->bio_caller_info2.ptr;
2010
2011 if (aiocbe) {
2012 p = bio->bio_caller_info1.ptr;
2013
2014 aiocbe->jobstate = JOBST_JOBBFINISHED;
2015 aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
2016 aiocbe->uaiocb._aiocb_private.error = 0;
2017 aiocbe->jobflags |= AIOCBLIST_DONE;
2018
2019 if (bp->b_flags & B_ERROR)
2020 aiocbe->uaiocb._aiocb_private.error = bp->b_error;
2021
2022 lj = aiocbe->lio;
2023 if (lj) {
2024 lj->lioj_buffer_finished_count++;
2025
2026 /*
2027 * wakeup/signal if all of the interrupt jobs are done.
2028 */
2029 if (lj->lioj_buffer_finished_count ==
2030 lj->lioj_buffer_count) {
2031 /*
2032 * Post a signal if it is called for.
2033 */
2034 if ((lj->lioj_flags &
2035 (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
2036 LIOJ_SIGNAL) {
2037 lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2038 callout_reset(&aiocbe->timeout, 0,
2039 process_signal, aiocbe);
2040 }
2041 }
2042 }
2043
2044 ki = p->p_aioinfo;
2045 if (ki) {
2046 ki->kaio_buffer_finished_count++;
2047 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
2048 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
2049 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
2050
2051 KNOTE(&aiocbe->klist, 0);
2052 /* Do the wakeup. */
2053 if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
2054 ki->kaio_flags &= ~KAIO_WAKEUP;
2055 wakeup(p);
2056 }
2057 }
2058
2059 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
2060 callout_reset(&aiocbe->timeout, 0,
2061 process_signal, aiocbe);
2062 }
2063 }
2064 bp->b_cmd = BUF_CMD_DONE;
2065 wakeup(bp);
2066}
2067#endif /* VFS_AIO */
2068
2069/* syscall - wait for the next completion of an aio request */
2070int
2071aio_waitcomplete(struct aio_waitcomplete_args *uap)
2072{
2073#ifndef VFS_AIO
2074 return ENOSYS;
2075#else
2076 struct proc *p = curproc;
2077 struct timeval atv;
2078 struct timespec ts;
2079 struct kaioinfo *ki;
2080 struct aiocblist *cb = NULL;
2081 int error, timo;
2082
2083 suword(uap->aiocbp, (int)NULL);
2084
2085 timo = 0;
2086 if (uap->timeout) {
2087 /* Get timespec struct. */
2088 error = copyin(uap->timeout, &ts, sizeof(ts));
2089 if (error)
2090 return error;
2091
2092 if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
2093 return (EINVAL);
2094
2095 TIMESPEC_TO_TIMEVAL(&atv, &ts);
2096 if (itimerfix(&atv))
2097 return (EINVAL);
2098 timo = tvtohz_high(&atv);
2099 }
2100
2101 ki = p->p_aioinfo;
2102 if (ki == NULL)
2103 return EAGAIN;
2104
2105 for (;;) {
2106 if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) {
2107 suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
2108 uap->sysmsg_result = cb->uaiocb._aiocb_private.status;
2109 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
2110 p->p_stats->p_ru.ru_oublock +=
2111 cb->outputcharge;
2112 cb->outputcharge = 0;
2113 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
2114 p->p_stats->p_ru.ru_inblock += cb->inputcharge;
2115 cb->inputcharge = 0;
2116 }
2117 aio_free_entry(cb);
2118 return cb->uaiocb._aiocb_private.error;
2119 }
2120
2121 crit_enter();
2122 if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0 ) {
2123 crit_exit();
2124 suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
2125 uap->sysmsg_result = cb->uaiocb._aiocb_private.status;
2126 aio_free_entry(cb);
2127 return cb->uaiocb._aiocb_private.error;
2128 }
2129
2130 ki->kaio_flags |= KAIO_WAKEUP;
2131 error = tsleep(p, PCATCH, "aiowc", timo);
2132 crit_exit();
2133
2134 if (error == ERESTART)
2135 return EINTR;
2136 else if (error < 0)
2137 return error;
2138 else if (error == EINTR)
2139 return EINTR;
2140 else if (error == EWOULDBLOCK)
2141 return EAGAIN;
2142 }
2143#endif /* VFS_AIO */
2144}
2145
2146#ifndef VFS_AIO
2147static int
2148filt_aioattach(struct knote *kn)
2149{
2150
2151 return (ENXIO);
2152}
2153
2154struct filterops aio_filtops =
2155 { 0, filt_aioattach, NULL, NULL };
2156
2157#else
2158/* kqueue attach function */
2159static int
2160filt_aioattach(struct knote *kn)
2161{
2162 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
2163
2164 /*
2165 * The aiocbe pointer must be validated before using it, so
2166 * registration is restricted to the kernel; the user cannot
2167 * set EV_FLAG1.
2168 */
2169 if ((kn->kn_flags & EV_FLAG1) == 0)
2170 return (EPERM);
2171 kn->kn_flags &= ~EV_FLAG1;
2172
2173 SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext);
2174
2175 return (0);
2176}
2177
2178/* kqueue detach function */
2179static void
2180filt_aiodetach(struct knote *kn)
2181{
2182 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
2183
2184 SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext);
2185}
2186
2187/* kqueue filter function */
2188/*ARGSUSED*/
2189static int
2190filt_aio(struct knote *kn, long hint)
2191{
2192 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
2193
2194 kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
2195 if (aiocbe->jobstate != JOBST_JOBFINISHED &&
2196 aiocbe->jobstate != JOBST_JOBBFINISHED)
2197 return (0);
2198 kn->kn_flags |= EV_EOF;
2199 return (1);
2200}
2201
2202struct filterops aio_filtops =
2203 { 0, filt_aioattach, filt_aiodetach, filt_aio };
2204#endif /* VFS_AIO */
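/*
 * Illustrative userland pairing with the EVFILT_AIO filter above (a
 * sketch, using the SIGEV_KEVENT notification path that _aio_aqueue()
 * handles; kq, ev, and cb are assumed):
 *
 *	kq = kqueue();
 *	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *	cb.aio_sigevent.sigev_notify_kqueue = kq;
 *	cb.aio_sigevent.sigev_value.sigval_ptr = &cb;	(returned as udata)
 *	aio_read(&cb);
 *	kevent(kq, NULL, 0, &ev, 1, NULL);	(EVFILT_AIO event on completion)
 */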