Initial import from FreeBSD RELENG_4:
[dragonfly.git] / sys / kern / vfs_aio.c
1/*
2 * Copyright (c) 1997 John S. Dyson. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. John S. Dyson's name may not be used to endorse or promote products
10 * derived from this software without specific prior written permission.
11 *
12 * DISCLAIMER: This code isn't warranted to do anything useful. Anything
13 * bad that happens because of using this software isn't the responsibility
14 * of the author. This software is distributed AS-IS.
15 *
16 * $FreeBSD: src/sys/kern/vfs_aio.c,v 1.70.2.28 2003/05/29 06:15:35 alc Exp $
17 */
18
19/*
20 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
21 */
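/*
 * Example (userland sketch, not part of this file): the basic usage
 * pattern for the interface implemented below.  Submit a request with
 * aio_read(), poll its state with aio_error(), then collect the result
 * (and release the kernel resources) with aio_return().  The file name
 * and buffer size are arbitrary.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct aiocb cb;
 *		char buf[512];
 *		int fd = open("/etc/motd", O_RDONLY);
 *
 *		if (fd < 0)
 *			return (1);
 *		memset(&cb, 0, sizeof(cb));
 *		cb.aio_fildes = fd;
 *		cb.aio_buf = buf;
 *		cb.aio_nbytes = sizeof(buf);
 *		cb.aio_offset = 0;
 *		if (aio_read(&cb) != 0)
 *			return (1);
 *		while (aio_error(&cb) == EINPROGRESS)
 *			usleep(1000);
 *		printf("read %zd bytes\n", aio_return(&cb));
 *		close(fd);
 *		return (0);
 *	}
 */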
22
23#include <sys/param.h>
24#include <sys/systm.h>
25#include <sys/buf.h>
26#include <sys/sysproto.h>
27#include <sys/filedesc.h>
28#include <sys/kernel.h>
29#include <sys/fcntl.h>
30#include <sys/file.h>
31#include <sys/lock.h>
32#include <sys/unistd.h>
33#include <sys/proc.h>
34#include <sys/resourcevar.h>
35#include <sys/signalvar.h>
36#include <sys/protosw.h>
37#include <sys/socketvar.h>
38#include <sys/sysctl.h>
39#include <sys/vnode.h>
40#include <sys/conf.h>
41#include <sys/event.h>
42
43#include <vm/vm.h>
44#include <vm/vm_extern.h>
45#include <vm/pmap.h>
46#include <vm/vm_map.h>
47#include <vm/vm_zone.h>
48#include <sys/aio.h>
49
50#include <machine/limits.h>
51#include "opt_vfs_aio.h"
52
53#ifdef VFS_AIO
54
55/*
56 * Counter for allocating reference ids to new jobs. Wrapped to 1 on
57 * overflow.
58 */
59static long jobrefid;
60
61#define JOBST_NULL 0x0
62#define JOBST_JOBQGLOBAL 0x2
63#define JOBST_JOBRUNNING 0x3
64#define JOBST_JOBFINISHED 0x4
65#define JOBST_JOBQBUF 0x5
66#define JOBST_JOBBFINISHED 0x6
67
68#ifndef MAX_AIO_PER_PROC
69#define MAX_AIO_PER_PROC 32
70#endif
71
72#ifndef MAX_AIO_QUEUE_PER_PROC
73#define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */
74#endif
75
76#ifndef MAX_AIO_PROCS
77#define MAX_AIO_PROCS 32
78#endif
79
80#ifndef MAX_AIO_QUEUE
81#define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */
82#endif
83
84#ifndef TARGET_AIO_PROCS
85#define TARGET_AIO_PROCS 4
86#endif
87
88#ifndef MAX_BUF_AIO
89#define MAX_BUF_AIO 16
90#endif
91
92#ifndef AIOD_TIMEOUT_DEFAULT
93#define AIOD_TIMEOUT_DEFAULT (10 * hz)
94#endif
95
96#ifndef AIOD_LIFETIME_DEFAULT
97#define AIOD_LIFETIME_DEFAULT (30 * hz)
98#endif
99
100SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");
101
102static int max_aio_procs = MAX_AIO_PROCS;
103SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
104 CTLFLAG_RW, &max_aio_procs, 0,
105 "Maximum number of kernel threads to use for handling async IO");
106
107static int num_aio_procs = 0;
108SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
109 CTLFLAG_RD, &num_aio_procs, 0,
110 "Number of presently active kernel threads for async IO");
111
112/*
113 * The code will adjust the actual number of AIO processes towards this
114 * number when it gets a chance.
115 */
116static int target_aio_procs = TARGET_AIO_PROCS;
117SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
118 0, "Preferred number of ready kernel threads for async IO");
119
120static int max_queue_count = MAX_AIO_QUEUE;
121SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
122 "Maximum number of aio requests to queue, globally");
123
124static int num_queue_count = 0;
125SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
126 "Number of queued aio requests");
127
128static int num_buf_aio = 0;
129SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
130 "Number of aio requests presently handled by the buf subsystem");
131
132/* Number of async I/O threads in the process of being started */
133/* XXX This should be local to _aio_aqueue() */
134static int num_aio_resv_start = 0;
135
136static int aiod_timeout;
137SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0,
138 "Timeout value for synchronous aio operations");
139
140static int aiod_lifetime;
141SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
142 "Maximum lifetime for idle aiod");
143
144static int max_aio_per_proc = MAX_AIO_PER_PROC;
145SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
146 0, "Maximum active aio requests per process (stored in the process)");
147
148static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
149SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
150 &max_aio_queue_per_proc, 0,
151 "Maximum queued aio requests per process (stored in the process)");
152
153static int max_buf_aio = MAX_BUF_AIO;
154SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
155 "Maximum buf aio requests per process (stored in the process)");
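/*
 * Usage note (sketch): the CTLFLAG_RW knobs above can be inspected and
 * tuned at run time with sysctl(8), e.g. "sysctl vfs.aio.max_aio_procs=8"
 * or "sysctl vfs.aio.aiod_lifetime"; the CTLFLAG_RD entries are read-only
 * counters.  The value shown here is only illustrative.
 */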
156
157/*
158 * AIO process info
159 */
160#define AIOP_FREE 0x1 /* proc on free queue */
161#define AIOP_SCHED 0x2 /* proc explicitly scheduled */
162
163struct aioproclist {
164 int aioprocflags; /* AIO proc flags */
165 TAILQ_ENTRY(aioproclist) list; /* List of processes */
166 struct proc *aioproc; /* The AIO thread */
167};
168
169/*
170 * Data structure for lio signal management
171 */
172struct aio_liojob {
173 int lioj_flags;
174 int lioj_buffer_count;
175 int lioj_buffer_finished_count;
176 int lioj_queue_count;
177 int lioj_queue_finished_count;
178 struct sigevent lioj_signal; /* signal on all I/O done */
179 TAILQ_ENTRY(aio_liojob) lioj_list;
180 struct kaioinfo *lioj_ki;
181};
182#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */
183#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */
184
185/*
186 * per process aio data structure
187 */
188struct kaioinfo {
189 int kaio_flags; /* per process kaio flags */
190 int kaio_maxactive_count; /* maximum number of AIOs */
191 int kaio_active_count; /* number of currently used AIOs */
192	int	kaio_qallowed_count;	/* maximum size of AIO queue */
193 int kaio_queue_count; /* size of AIO queue */
194 int kaio_ballowed_count; /* maximum number of buffers */
195 int kaio_queue_finished_count; /* number of daemon jobs finished */
196 int kaio_buffer_count; /* number of physio buffers */
197 int kaio_buffer_finished_count; /* count of I/O done */
198 struct proc *kaio_p; /* process that uses this kaio block */
199 TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */
200 TAILQ_HEAD(,aiocblist) kaio_jobqueue; /* job queue for process */
201 TAILQ_HEAD(,aiocblist) kaio_jobdone; /* done queue for process */
202 TAILQ_HEAD(,aiocblist) kaio_bufqueue; /* buffer job queue for process */
203 TAILQ_HEAD(,aiocblist) kaio_bufdone; /* buffer done queue for process */
204 TAILQ_HEAD(,aiocblist) kaio_sockqueue; /* queue for aios waiting on sockets */
205};
206
207#define KAIO_RUNDOWN 0x1 /* process is being run down */
208#define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */
209
210static TAILQ_HEAD(,aioproclist) aio_freeproc, aio_activeproc;
211static TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */
212static TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */
213static TAILQ_HEAD(,aiocblist) aio_freejobs; /* Pool of free jobs */
214
215static void aio_init_aioinfo(struct proc *p);
216static void aio_onceonly(void *);
217static int aio_free_entry(struct aiocblist *aiocbe);
218static void aio_process(struct aiocblist *aiocbe);
219static int aio_newproc(void);
220static int aio_aqueue(struct proc *p, struct aiocb *job, int type);
221static void aio_physwakeup(struct buf *bp);
222static int aio_fphysio(struct aiocblist *aiocbe);
223static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
224static void aio_daemon(void *uproc);
225static void process_signal(void *aioj);
226
227SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);
228
229/*
230 * Zones for:
231 * kaio Per process async io info
232 * aiop async io thread data
233 * aiocb async io jobs
234 * aiol list io job pointer - internal to aio_suspend XXX
235 * aiolio list io jobs
236 */
237static vm_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
238
239/*
240 * Startup initialization
241 */
242static void
243aio_onceonly(void *na)
244{
245 TAILQ_INIT(&aio_freeproc);
246 TAILQ_INIT(&aio_activeproc);
247 TAILQ_INIT(&aio_jobs);
248 TAILQ_INIT(&aio_bufjobs);
249 TAILQ_INIT(&aio_freejobs);
250 kaio_zone = zinit("AIO", sizeof(struct kaioinfo), 0, 0, 1);
251 aiop_zone = zinit("AIOP", sizeof(struct aioproclist), 0, 0, 1);
252 aiocb_zone = zinit("AIOCB", sizeof(struct aiocblist), 0, 0, 1);
253 aiol_zone = zinit("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t), 0, 0, 1);
254 aiolio_zone = zinit("AIOLIO", sizeof(struct aio_liojob), 0, 0, 1);
255 aiod_timeout = AIOD_TIMEOUT_DEFAULT;
256 aiod_lifetime = AIOD_LIFETIME_DEFAULT;
257 jobrefid = 1;
258}
259
260/*
261 * Init the per-process aioinfo structure. The aioinfo limits are set
262 * per-process for user limit (resource) management.
263 */
264static void
265aio_init_aioinfo(struct proc *p)
266{
267 struct kaioinfo *ki;
268 if (p->p_aioinfo == NULL) {
269 ki = zalloc(kaio_zone);
270 p->p_aioinfo = ki;
271 ki->kaio_flags = 0;
272 ki->kaio_maxactive_count = max_aio_per_proc;
273 ki->kaio_active_count = 0;
274 ki->kaio_qallowed_count = max_aio_queue_per_proc;
275 ki->kaio_queue_count = 0;
276 ki->kaio_ballowed_count = max_buf_aio;
277 ki->kaio_buffer_count = 0;
278 ki->kaio_buffer_finished_count = 0;
279 ki->kaio_p = p;
280 TAILQ_INIT(&ki->kaio_jobdone);
281 TAILQ_INIT(&ki->kaio_jobqueue);
282 TAILQ_INIT(&ki->kaio_bufdone);
283 TAILQ_INIT(&ki->kaio_bufqueue);
284 TAILQ_INIT(&ki->kaio_liojoblist);
285 TAILQ_INIT(&ki->kaio_sockqueue);
286 }
287
288 while (num_aio_procs < target_aio_procs)
289 aio_newproc();
290}
291
292/*
293 * Free a job entry. Wait for completion if it is currently active, but don't
294 * delay forever. If we delay, we return a flag that says that we have to
295 * restart the queue scan.
296 */
297static int
298aio_free_entry(struct aiocblist *aiocbe)
299{
300 struct kaioinfo *ki;
301 struct aio_liojob *lj;
302 struct proc *p;
303 int error;
304 int s;
305
306 if (aiocbe->jobstate == JOBST_NULL)
307 panic("aio_free_entry: freeing already free job");
308
309 p = aiocbe->userproc;
310 ki = p->p_aioinfo;
311 lj = aiocbe->lio;
312 if (ki == NULL)
313 panic("aio_free_entry: missing p->p_aioinfo");
314
315 while (aiocbe->jobstate == JOBST_JOBRUNNING) {
316 aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
317 tsleep(aiocbe, PRIBIO, "jobwai", 0);
318 }
319 if (aiocbe->bp == NULL) {
320 if (ki->kaio_queue_count <= 0)
321 panic("aio_free_entry: process queue size <= 0");
322 if (num_queue_count <= 0)
323 panic("aio_free_entry: system wide queue size <= 0");
324
325 if (lj) {
326 lj->lioj_queue_count--;
327 if (aiocbe->jobflags & AIOCBLIST_DONE)
328 lj->lioj_queue_finished_count--;
329 }
330 ki->kaio_queue_count--;
331 if (aiocbe->jobflags & AIOCBLIST_DONE)
332 ki->kaio_queue_finished_count--;
333 num_queue_count--;
334 } else {
335 if (lj) {
336 lj->lioj_buffer_count--;
337 if (aiocbe->jobflags & AIOCBLIST_DONE)
338 lj->lioj_buffer_finished_count--;
339 }
340 if (aiocbe->jobflags & AIOCBLIST_DONE)
341 ki->kaio_buffer_finished_count--;
342 ki->kaio_buffer_count--;
343 num_buf_aio--;
344 }
345
346 /* aiocbe is going away, we need to destroy any knotes */
347 knote_remove(p, &aiocbe->klist);
348
349 if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
350 && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
351 ki->kaio_flags &= ~KAIO_WAKEUP;
352 wakeup(p);
353 }
354
355 if (aiocbe->jobstate == JOBST_JOBQBUF) {
356 if ((error = aio_fphysio(aiocbe)) != 0)
357 return error;
358 if (aiocbe->jobstate != JOBST_JOBBFINISHED)
359 panic("aio_free_entry: invalid physio finish-up state");
360 s = splbio();
361 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
362 splx(s);
363 } else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
364 s = splnet();
365 TAILQ_REMOVE(&aio_jobs, aiocbe, list);
366 TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
367 splx(s);
368 } else if (aiocbe->jobstate == JOBST_JOBFINISHED)
369 TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
370 else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
371 s = splbio();
372 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
373 splx(s);
374 if (aiocbe->bp) {
375 vunmapbuf(aiocbe->bp);
376 relpbuf(aiocbe->bp, NULL);
377 aiocbe->bp = NULL;
378 }
379 }
380 if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
381 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
382 zfree(aiolio_zone, lj);
383 }
384 aiocbe->jobstate = JOBST_NULL;
385 untimeout(process_signal, aiocbe, aiocbe->timeouthandle);
386 fdrop(aiocbe->fd_file, curproc);
387 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
388 return 0;
389}
390#endif /* VFS_AIO */
391
392/*
393 * Rundown the jobs for a given process.
394 */
395void
396aio_proc_rundown(struct proc *p)
397{
398#ifndef VFS_AIO
399 return;
400#else
401 int s;
402 struct kaioinfo *ki;
403 struct aio_liojob *lj, *ljn;
404 struct aiocblist *aiocbe, *aiocbn;
405 struct file *fp;
406 struct socket *so;
407
408 ki = p->p_aioinfo;
409 if (ki == NULL)
410 return;
411
412 ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
413 while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
414 ki->kaio_buffer_finished_count)) {
415 ki->kaio_flags |= KAIO_RUNDOWN;
416 if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
417 break;
418 }
419
420 /*
421 * Move any aio ops that are waiting on socket I/O to the normal job
422 * queues so they are cleaned up with any others.
423 */
424 s = splnet();
425 for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
426 aiocbn) {
427 aiocbn = TAILQ_NEXT(aiocbe, plist);
428 fp = aiocbe->fd_file;
429 if (fp != NULL) {
430 so = (struct socket *)fp->f_data;
431 TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
432 if (TAILQ_EMPTY(&so->so_aiojobq)) {
433 so->so_snd.sb_flags &= ~SB_AIO;
434 so->so_rcv.sb_flags &= ~SB_AIO;
435 }
436 }
437 TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
438 TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
439 TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
440 }
441 splx(s);
442
443restart1:
444 for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
445 aiocbn = TAILQ_NEXT(aiocbe, plist);
446 if (aio_free_entry(aiocbe))
447 goto restart1;
448 }
449
450restart2:
451 for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
452 aiocbn) {
453 aiocbn = TAILQ_NEXT(aiocbe, plist);
454 if (aio_free_entry(aiocbe))
455 goto restart2;
456 }
457
458/*
459 * Note the use of many short splbio sections here, done so that splbio is
460 * not held across long chains of I/O. Probably unnecessary.
461 */
462restart3:
463 s = splbio();
464 while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
465 ki->kaio_flags |= KAIO_WAKEUP;
466 tsleep(p, PRIBIO, "aioprn", 0);
467 splx(s);
468 goto restart3;
469 }
470 splx(s);
471
472restart4:
473 s = splbio();
474 for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
475 aiocbn = TAILQ_NEXT(aiocbe, plist);
476 if (aio_free_entry(aiocbe)) {
477 splx(s);
478 goto restart4;
479 }
480 }
481 splx(s);
482
483 /*
484 * If we've slept, jobs might have moved from one queue to another.
485 * Retry rundown if we didn't manage to empty the queues.
486 */
487 if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL ||
488 TAILQ_FIRST(&ki->kaio_jobqueue) != NULL ||
489 TAILQ_FIRST(&ki->kaio_bufqueue) != NULL ||
490 TAILQ_FIRST(&ki->kaio_bufdone) != NULL)
491 goto restart1;
492
493 for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
494 ljn = TAILQ_NEXT(lj, lioj_list);
495 if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
496 0)) {
497 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
498 zfree(aiolio_zone, lj);
499 } else {
500#ifdef DIAGNOSTIC
501 printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
502 "QF:%d\n", lj->lioj_buffer_count,
503 lj->lioj_buffer_finished_count,
504 lj->lioj_queue_count,
505 lj->lioj_queue_finished_count);
506#endif
507 }
508 }
509
510 zfree(kaio_zone, ki);
511 p->p_aioinfo = NULL;
512#endif /* VFS_AIO */
513}
514
515#ifdef VFS_AIO
516/*
517 * Select a job to run (called by an AIO daemon).
518 */
519static struct aiocblist *
520aio_selectjob(struct aioproclist *aiop)
521{
522 int s;
523 struct aiocblist *aiocbe;
524 struct kaioinfo *ki;
525 struct proc *userp;
526
527 s = splnet();
528 for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe =
529 TAILQ_NEXT(aiocbe, list)) {
530 userp = aiocbe->userproc;
531 ki = userp->p_aioinfo;
532
533 if (ki->kaio_active_count < ki->kaio_maxactive_count) {
534 TAILQ_REMOVE(&aio_jobs, aiocbe, list);
535 splx(s);
536 return aiocbe;
537 }
538 }
539 splx(s);
540
541 return NULL;
542}
543
544/*
545 * The AIO processing activity. This is the code that does the I/O request for
546 * the non-physio version of the operations. The normal vn operations are used,
547 * and this code should work in all instances for every type of file, including
548 * pipes, sockets, fifos, and regular files.
549 */
550static void
551aio_process(struct aiocblist *aiocbe)
552{
553 struct proc *mycp;
554 struct aiocb *cb;
555 struct file *fp;
556 struct uio auio;
557 struct iovec aiov;
558 int cnt;
559 int error;
560 int oublock_st, oublock_end;
561 int inblock_st, inblock_end;
562
563 mycp = curproc;
564 cb = &aiocbe->uaiocb;
565 fp = aiocbe->fd_file;
566
567 aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
568 aiov.iov_len = cb->aio_nbytes;
569
570 auio.uio_iov = &aiov;
571 auio.uio_iovcnt = 1;
572 auio.uio_offset = cb->aio_offset;
573 auio.uio_resid = cb->aio_nbytes;
574 cnt = cb->aio_nbytes;
575 auio.uio_segflg = UIO_USERSPACE;
576 auio.uio_procp = mycp;
577
578 inblock_st = mycp->p_stats->p_ru.ru_inblock;
579 oublock_st = mycp->p_stats->p_ru.ru_oublock;
580 /*
581 * _aio_aqueue() acquires a reference to the file that is
582 * released in aio_free_entry().
583 */
584 if (cb->aio_lio_opcode == LIO_READ) {
585 auio.uio_rw = UIO_READ;
586 error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
587 } else {
588 auio.uio_rw = UIO_WRITE;
589 error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
590 }
591 inblock_end = mycp->p_stats->p_ru.ru_inblock;
592 oublock_end = mycp->p_stats->p_ru.ru_oublock;
593
594 aiocbe->inputcharge = inblock_end - inblock_st;
595 aiocbe->outputcharge = oublock_end - oublock_st;
596
597 if ((error) && (auio.uio_resid != cnt)) {
598 if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
599 error = 0;
600 if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
601 psignal(aiocbe->userproc, SIGPIPE);
602 }
603
604 cnt -= auio.uio_resid;
605 cb->_aiocb_private.error = error;
606 cb->_aiocb_private.status = cnt;
607}
608
609/*
610 * The AIO daemon. Most of the actual work is done in aio_process(),
611 * but the setup (and address space management) is done in this routine.
612 */
613static void
614aio_daemon(void *uproc)
615{
616 int s;
617 struct aio_liojob *lj;
618 struct aiocb *cb;
619 struct aiocblist *aiocbe;
620 struct aioproclist *aiop;
621 struct kaioinfo *ki;
622 struct proc *curcp, *mycp, *userp;
623 struct vmspace *myvm, *tmpvm;
624
625 /*
626	 * Local copies of curproc (mycp) and vmspace (myvm)
627 */
628 mycp = curproc;
629 myvm = mycp->p_vmspace;
630
631 if (mycp->p_textvp) {
632 vrele(mycp->p_textvp);
633 mycp->p_textvp = NULL;
634 }
635
636 /*
637 * Allocate and ready the aio control info. There is one aiop structure
638 * per daemon.
639 */
640 aiop = zalloc(aiop_zone);
641 aiop->aioproc = mycp;
642 aiop->aioprocflags |= AIOP_FREE;
643
644 s = splnet();
645
646 /*
647 * Place thread (lightweight process) onto the AIO free thread list.
648 */
649 if (TAILQ_EMPTY(&aio_freeproc))
650 wakeup(&aio_freeproc);
651 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
652
653 splx(s);
654
655 /* Make up a name for the daemon. */
656 strcpy(mycp->p_comm, "aiod");
657
658 /*
659	 * Get rid of our current file descriptors. AIODs don't need any
660	 * file descriptors, except as temporarily inherited from the client.
661	 * Credentials are also cloned and made equivalent to "root".
662 */
663 fdfree(mycp);
664 mycp->p_fd = NULL;
665 mycp->p_ucred = crcopy(mycp->p_ucred);
666 mycp->p_ucred->cr_uid = 0;
667 uifree(mycp->p_ucred->cr_uidinfo);
668 mycp->p_ucred->cr_uidinfo = uifind(0);
669 mycp->p_ucred->cr_ngroups = 1;
670 mycp->p_ucred->cr_groups[0] = 1;
671
672 /* The daemon resides in its own pgrp. */
673 enterpgrp(mycp, mycp->p_pid, 1);
674
675 /* Mark special process type. */
676 mycp->p_flag |= P_SYSTEM | P_KTHREADP;
677
678 /*
679 * Wakeup parent process. (Parent sleeps to keep from blasting away
680 * and creating too many daemons.)
681 */
682 wakeup(mycp);
683
684 for (;;) {
685 /*
686 * curcp is the current daemon process context.
687 * userp is the current user process context.
688 */
689 curcp = mycp;
690
691 /*
692 * Take daemon off of free queue
693 */
694 if (aiop->aioprocflags & AIOP_FREE) {
695 s = splnet();
696 TAILQ_REMOVE(&aio_freeproc, aiop, list);
697 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
698 aiop->aioprocflags &= ~AIOP_FREE;
699 splx(s);
700 }
701 aiop->aioprocflags &= ~AIOP_SCHED;
702
703 /*
704 * Check for jobs.
705 */
706 while ((aiocbe = aio_selectjob(aiop)) != NULL) {
707 cb = &aiocbe->uaiocb;
708 userp = aiocbe->userproc;
709
710 aiocbe->jobstate = JOBST_JOBRUNNING;
711
712 /*
713 * Connect to process address space for user program.
714 */
715 if (userp != curcp) {
716 /*
717 * Save the current address space that we are
718 * connected to.
719 */
720 tmpvm = mycp->p_vmspace;
721
722 /*
723 * Point to the new user address space, and
724 * refer to it.
725 */
726 mycp->p_vmspace = userp->p_vmspace;
727 mycp->p_vmspace->vm_refcnt++;
728
729 /* Activate the new mapping. */
730 pmap_activate(mycp);
731
732 /*
733				 * If the old address space wasn't the daemon's
734 * own address space, then we need to remove the
735 * daemon's reference from the other process
736 * that it was acting on behalf of.
737 */
738 if (tmpvm != myvm) {
739 vmspace_free(tmpvm);
740 }
741 curcp = userp;
742 }
743
744 ki = userp->p_aioinfo;
745 lj = aiocbe->lio;
746
747 /* Account for currently active jobs. */
748 ki->kaio_active_count++;
749
750 /* Do the I/O function. */
751 aio_process(aiocbe);
752
753 /* Decrement the active job count. */
754 ki->kaio_active_count--;
755
756 /*
757 * Increment the completion count for wakeup/signal
758 * comparisons.
759 */
760 aiocbe->jobflags |= AIOCBLIST_DONE;
761 ki->kaio_queue_finished_count++;
762 if (lj)
763 lj->lioj_queue_finished_count++;
764 if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
765 & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
766 ki->kaio_flags &= ~KAIO_WAKEUP;
767 wakeup(userp);
768 }
769
770 s = splbio();
771 if (lj && (lj->lioj_flags &
772 (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
773 if ((lj->lioj_queue_finished_count ==
774 lj->lioj_queue_count) &&
775 (lj->lioj_buffer_finished_count ==
776 lj->lioj_buffer_count)) {
777 psignal(userp,
778 lj->lioj_signal.sigev_signo);
779 lj->lioj_flags |=
780 LIOJ_SIGNAL_POSTED;
781 }
782 }
783 splx(s);
784
785 aiocbe->jobstate = JOBST_JOBFINISHED;
786
787 s = splnet();
788 TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
789 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, plist);
790 splx(s);
791 KNOTE(&aiocbe->klist, 0);
792
793 if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
794 wakeup(aiocbe);
795 aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
796 }
797
798 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
799 psignal(userp, cb->aio_sigevent.sigev_signo);
800 }
801 }
802
803 /*
804 * Disconnect from user address space.
805 */
806 if (curcp != mycp) {
807 /* Get the user address space to disconnect from. */
808 tmpvm = mycp->p_vmspace;
809
810 /* Get original address space for daemon. */
811 mycp->p_vmspace = myvm;
812
813 /* Activate the daemon's address space. */
814 pmap_activate(mycp);
815#ifdef DIAGNOSTIC
816 if (tmpvm == myvm) {
817 printf("AIOD: vmspace problem -- %d\n",
818 mycp->p_pid);
819 }
820#endif
821 /* Remove our vmspace reference. */
822 vmspace_free(tmpvm);
823
824 curcp = mycp;
825 }
826
827 /*
828 * If we are the first to be put onto the free queue, wakeup
829 * anyone waiting for a daemon.
830 */
831 s = splnet();
832 TAILQ_REMOVE(&aio_activeproc, aiop, list);
833 if (TAILQ_EMPTY(&aio_freeproc))
834 wakeup(&aio_freeproc);
835 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
836 aiop->aioprocflags |= AIOP_FREE;
837 splx(s);
838
839 /*
840 * If daemon is inactive for a long time, allow it to exit,
841 * thereby freeing resources.
842 */
843 if (((aiop->aioprocflags & AIOP_SCHED) == 0) && tsleep(mycp,
844 PRIBIO, "aiordy", aiod_lifetime)) {
845 s = splnet();
846 if (TAILQ_EMPTY(&aio_jobs)) {
847 if ((aiop->aioprocflags & AIOP_FREE) &&
848 (num_aio_procs > target_aio_procs)) {
849 TAILQ_REMOVE(&aio_freeproc, aiop, list);
850 splx(s);
851 zfree(aiop_zone, aiop);
852 num_aio_procs--;
853#ifdef DIAGNOSTIC
854 if (mycp->p_vmspace->vm_refcnt <= 1) {
855 printf("AIOD: bad vm refcnt for"
856 " exiting daemon: %d\n",
857 mycp->p_vmspace->vm_refcnt);
858 }
859#endif
860 exit1(mycp, 0);
861 }
862 }
863 splx(s);
864 }
865 }
866}
867
868/*
869 * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
870 * AIO daemon modifies its environment itself.
871 */
872static int
873aio_newproc()
874{
875 int error;
876 struct proc *p, *np;
877
878 p = &proc0;
879 error = fork1(p, RFPROC|RFMEM|RFNOWAIT, &np);
880 if (error)
881 return error;
882 cpu_set_fork_handler(np, aio_daemon, curproc);
883
884 /*
885	 * Wait until the daemon has started, but continue on anyway so that
886	 * error conditions can be handled.
887 */
888 error = tsleep(np, PZERO, "aiosta", aiod_timeout);
889 num_aio_procs++;
890
891 return error;
892}
893
894/*
895 * Try the high-performance, low-overhead physio method for eligible
896 * VCHR devices. This method doesn't use an aio helper thread, and
897 * thus has very low overhead.
898 *
899 * Assumes that the caller, _aio_aqueue(), has incremented the file
900 * structure's reference count, preventing its deallocation for the
901 * duration of this call.
902 */
903static int
904aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
905{
906 int error;
907 struct aiocb *cb;
908 struct file *fp;
909 struct buf *bp;
910 struct vnode *vp;
911 struct kaioinfo *ki;
912 struct aio_liojob *lj;
913 int s;
914 int notify;
915
916 cb = &aiocbe->uaiocb;
917 fp = aiocbe->fd_file;
918
919 if (fp->f_type != DTYPE_VNODE)
920 return (-1);
921
922 vp = (struct vnode *)fp->f_data;
923
924 /*
925	 * If it's not a disk, we don't want to return a positive error;
926	 * a positive error keeps the aio code from falling back to the
927	 * threaded method when the request is against a regular file.
928 */
929 if (!vn_isdisk(vp, &error)) {
930 if (error == ENOTBLK)
931 return (-1);
932 else
933 return (error);
934 }
935
936 if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
937 return (-1);
938
939 if (cb->aio_nbytes >
940 MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
941 return (-1);
942
943 ki = p->p_aioinfo;
944 if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
945 return (-1);
946
947 ki->kaio_buffer_count++;
948
949 lj = aiocbe->lio;
950 if (lj)
951 lj->lioj_buffer_count++;
952
953 /* Create and build a buffer header for a transfer. */
954 bp = (struct buf *)getpbuf(NULL);
955 BUF_KERNPROC(bp);
956
957 /*
958 * Get a copy of the kva from the physical buffer.
959 */
960 bp->b_caller1 = p;
961 bp->b_dev = vp->v_rdev;
962 error = 0;
963
964 bp->b_bcount = cb->aio_nbytes;
965 bp->b_bufsize = cb->aio_nbytes;
966 bp->b_flags = B_PHYS | B_CALL | (cb->aio_lio_opcode == LIO_WRITE ?
967 B_WRITE : B_READ);
968 bp->b_iodone = aio_physwakeup;
969 bp->b_saveaddr = bp->b_data;
970 bp->b_data = (void *)(uintptr_t)cb->aio_buf;
971 bp->b_blkno = btodb(cb->aio_offset);
972
973 /* Bring buffer into kernel space. */
974 if (vmapbuf(bp) < 0) {
975 error = EFAULT;
976 goto doerror;
977 }
978
979 s = splbio();
980 aiocbe->bp = bp;
981 bp->b_spc = (void *)aiocbe;
982 TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
983 TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
984 aiocbe->jobstate = JOBST_JOBQBUF;
985 cb->_aiocb_private.status = cb->aio_nbytes;
986 num_buf_aio++;
987 bp->b_error = 0;
988
989 splx(s);
990
991 /* Perform transfer. */
992 BUF_STRATEGY(bp, 0);
993
994 notify = 0;
995 s = splbio();
996
997 /*
998 * If we had an error invoking the request, or an error in processing
999 * the request before we have returned, we process it as an error in
1000 * transfer. Note that such an I/O error is not indicated immediately,
1001 * but is returned using the aio_error mechanism. In this case,
1002 * aio_suspend will return immediately.
1003 */
1004 if (bp->b_error || (bp->b_flags & B_ERROR)) {
1005 struct aiocb *job = aiocbe->uuaiocb;
1006
1007 aiocbe->uaiocb._aiocb_private.status = 0;
1008 suword(&job->_aiocb_private.status, 0);
1009 aiocbe->uaiocb._aiocb_private.error = bp->b_error;
1010 suword(&job->_aiocb_private.error, bp->b_error);
1011
1012 ki->kaio_buffer_finished_count++;
1013
1014 if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
1015 aiocbe->jobstate = JOBST_JOBBFINISHED;
1016 aiocbe->jobflags |= AIOCBLIST_DONE;
1017 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
1018 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
1019 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
1020 notify = 1;
1021 }
1022 }
1023 splx(s);
1024 if (notify)
1025 KNOTE(&aiocbe->klist, 0);
1026 return 0;
1027
1028doerror:
1029 ki->kaio_buffer_count--;
1030 if (lj)
1031 lj->lioj_buffer_count--;
1032 aiocbe->bp = NULL;
1033 relpbuf(bp, NULL);
1034 return error;
1035}
1036
1037/*
1038 * This waits/tests physio completion.
1039 */
1040static int
1041aio_fphysio(struct aiocblist *iocb)
1042{
1043 int s;
1044 struct buf *bp;
1045 int error;
1046
1047 bp = iocb->bp;
1048
1049 s = splbio();
1050 while ((bp->b_flags & B_DONE) == 0) {
1051 if (tsleep(bp, PRIBIO, "physstr", aiod_timeout)) {
1052 if ((bp->b_flags & B_DONE) == 0) {
1053 splx(s);
1054 return EINPROGRESS;
1055 } else
1056 break;
1057 }
1058 }
1059 splx(s);
1060
1061 /* Release mapping into kernel space. */
1062 vunmapbuf(bp);
1063 iocb->bp = 0;
1064
1065 error = 0;
1066
1067 /* Check for an error. */
1068 if (bp->b_flags & B_ERROR)
1069 error = bp->b_error;
1070
1071 relpbuf(bp, NULL);
1072 return (error);
1073}
1074#endif /* VFS_AIO */
1075
1076/*
1077 * Wake up aio requests that may be serviceable now.
1078 */
1079void
1080aio_swake(struct socket *so, struct sockbuf *sb)
1081{
1082#ifndef VFS_AIO
1083 return;
1084#else
1085 struct aiocblist *cb,*cbn;
1086 struct proc *p;
1087 struct kaioinfo *ki = NULL;
1088 int opcode, wakecount = 0;
1089 struct aioproclist *aiop;
1090
1091 if (sb == &so->so_snd) {
1092 opcode = LIO_WRITE;
1093 so->so_snd.sb_flags &= ~SB_AIO;
1094 } else {
1095 opcode = LIO_READ;
1096 so->so_rcv.sb_flags &= ~SB_AIO;
1097 }
1098
1099 for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
1100 cbn = TAILQ_NEXT(cb, list);
1101 if (opcode == cb->uaiocb.aio_lio_opcode) {
1102 p = cb->userproc;
1103 ki = p->p_aioinfo;
1104 TAILQ_REMOVE(&so->so_aiojobq, cb, list);
1105 TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
1106 TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
1107 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
1108 wakecount++;
1109 if (cb->jobstate != JOBST_JOBQGLOBAL)
1110 panic("invalid queue value");
1111 }
1112 }
1113
1114 while (wakecount--) {
1115 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) {
1116 TAILQ_REMOVE(&aio_freeproc, aiop, list);
1117 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1118 aiop->aioprocflags &= ~AIOP_FREE;
1119 wakeup(aiop->aioproc);
1120 }
1121 }
1122#endif /* VFS_AIO */
1123}
1124
1125#ifdef VFS_AIO
1126/*
1127 * Queue a new AIO request. The choice between the threaded method and the
1128 * direct physio (VCHR) method is made in this code.
1129 */
1130static int
1131_aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type)
1132{
1133 struct filedesc *fdp;
1134 struct file *fp;
1135 unsigned int fd;
1136 struct socket *so;
1137 int s;
1138 int error;
1139 int opcode, user_opcode;
1140 struct aiocblist *aiocbe;
1141 struct aioproclist *aiop;
1142 struct kaioinfo *ki;
1143 struct kevent kev;
1144 struct kqueue *kq;
1145 struct file *kq_fp;
1146
1147 if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL)
1148 TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
1149 else
1150 aiocbe = zalloc (aiocb_zone);
1151
1152 aiocbe->inputcharge = 0;
1153 aiocbe->outputcharge = 0;
1154 callout_handle_init(&aiocbe->timeouthandle);
1155 SLIST_INIT(&aiocbe->klist);
1156
1157 suword(&job->_aiocb_private.status, -1);
1158 suword(&job->_aiocb_private.error, 0);
1159 suword(&job->_aiocb_private.kernelinfo, -1);
1160
1161 error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb));
1162 if (error) {
1163 suword(&job->_aiocb_private.error, error);
1164 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1165 return error;
1166 }
1167 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1168 !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
1169 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1170 return EINVAL;
1171 }
1172
1173 /* Save userspace address of the job info. */
1174 aiocbe->uuaiocb = job;
1175
1176 /* Get the opcode. */
1177 user_opcode = aiocbe->uaiocb.aio_lio_opcode;
1178 if (type != LIO_NOP)
1179 aiocbe->uaiocb.aio_lio_opcode = type;
1180 opcode = aiocbe->uaiocb.aio_lio_opcode;
1181
1182 /* Get the fd info for process. */
1183 fdp = p->p_fd;
1184
1185 /*
1186 * Range check file descriptor.
1187 */
1188 fd = aiocbe->uaiocb.aio_fildes;
1189 if (fd >= fdp->fd_nfiles) {
1190 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1191 if (type == 0)
1192 suword(&job->_aiocb_private.error, EBADF);
1193 return EBADF;
1194 }
1195
1196 fp = aiocbe->fd_file = fdp->fd_ofiles[fd];
1197 if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) ==
1198 0))) {
1199 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1200 if (type == 0)
1201 suword(&job->_aiocb_private.error, EBADF);
1202 return EBADF;
1203 }
1204 fhold(fp);
1205
1206 if (aiocbe->uaiocb.aio_offset == -1LL) {
1207 error = EINVAL;
1208 goto aqueue_fail;
1209 }
1210 error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
1211 if (error) {
1212 error = EINVAL;
1213 goto aqueue_fail;
1214 }
1215 aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
1216 if (jobrefid == LONG_MAX)
1217 jobrefid = 1;
1218 else
1219 jobrefid++;
1220
1221 if (opcode == LIO_NOP) {
1222 fdrop(fp, p);
1223 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1224 if (type == 0) {
1225 suword(&job->_aiocb_private.error, 0);
1226 suword(&job->_aiocb_private.status, 0);
1227 suword(&job->_aiocb_private.kernelinfo, 0);
1228 }
1229 return 0;
1230 }
1231 if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
1232 if (type == 0)
1233 suword(&job->_aiocb_private.status, 0);
1234 error = EINVAL;
1235 goto aqueue_fail;
1236 }
1237
1238 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
1239 kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
1240 kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr;
1241 }
1242 else {
1243 /*
1244 * This method for requesting kevent-based notification won't
1245 * work on the alpha, since we're passing in a pointer
1246 * via aio_lio_opcode, which is an int. Use the SIGEV_KEVENT-
1247 * based method instead.
1248 */
1249 if (user_opcode == LIO_NOP || user_opcode == LIO_READ ||
1250 user_opcode == LIO_WRITE)
1251 goto no_kqueue;
1252
1253 error = copyin((struct kevent *)(uintptr_t)user_opcode,
1254 &kev, sizeof(kev));
1255 if (error)
1256 goto aqueue_fail;
1257 }
1258 if ((u_int)kev.ident >= fdp->fd_nfiles ||
1259 (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL ||
1260 (kq_fp->f_type != DTYPE_KQUEUE)) {
1261 error = EBADF;
1262 goto aqueue_fail;
1263 }
1264 kq = (struct kqueue *)kq_fp->f_data;
1265 kev.ident = (uintptr_t)aiocbe->uuaiocb;
1266 kev.filter = EVFILT_AIO;
1267 kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
1268 kev.data = (intptr_t)aiocbe;
1269 error = kqueue_register(kq, &kev, p);
1270aqueue_fail:
1271 if (error) {
1272 fdrop(fp, p);
1273 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1274 if (type == 0)
1275 suword(&job->_aiocb_private.error, error);
1276 goto done;
1277 }
1278no_kqueue:
1279
1280 suword(&job->_aiocb_private.error, EINPROGRESS);
1281 aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
1282 aiocbe->userproc = p;
1283 aiocbe->jobflags = 0;
1284 aiocbe->lio = lj;
1285 ki = p->p_aioinfo;
1286
1287 if (fp->f_type == DTYPE_SOCKET) {
1288 /*
1289 * Alternate queueing for socket ops: Reach down into the
1290 * descriptor to get the socket data. Then check to see if the
1291 * socket is ready to be read or written (based on the requested
1292 * operation).
1293 *
1294 * If it is not ready for io, then queue the aiocbe on the
1295 * socket, and set the flags so we get a call when sbnotify()
1296 * happens.
1297 */
1298 so = (struct socket *)fp->f_data;
1299 s = splnet();
1300 if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
1301 LIO_WRITE) && (!sowriteable(so)))) {
1302 TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
1303 TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
1304 if (opcode == LIO_READ)
1305 so->so_rcv.sb_flags |= SB_AIO;
1306 else
1307 so->so_snd.sb_flags |= SB_AIO;
1308 aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
1309 ki->kaio_queue_count++;
1310 num_queue_count++;
1311 splx(s);
1312 error = 0;
1313 goto done;
1314 }
1315 splx(s);
1316 }
1317
1318 if ((error = aio_qphysio(p, aiocbe)) == 0)
1319 goto done;
1320 if (error > 0) {
1321 suword(&job->_aiocb_private.status, 0);
1322 aiocbe->uaiocb._aiocb_private.error = error;
1323 suword(&job->_aiocb_private.error, error);
1324 goto done;
1325 }
1326
1327 /* No buffer for daemon I/O. */
1328 aiocbe->bp = NULL;
1329
1330 ki->kaio_queue_count++;
1331 if (lj)
1332 lj->lioj_queue_count++;
1333 s = splnet();
1334 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1335 TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
1336 splx(s);
1337 aiocbe->jobstate = JOBST_JOBQGLOBAL;
1338
1339 num_queue_count++;
1340 error = 0;
1341
1342 /*
1343 * If we don't have a free AIO process, and we are below our quota, then
1344 * start one. Otherwise, depend on the subsequent I/O completions to
1345	 * pick up this job. If we don't successfully create the new process
1346 * (thread) due to resource issues, we return an error for now (EAGAIN),
1347 * which is likely not the correct thing to do.
1348 */
1349 s = splnet();
1350retryproc:
1351 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1352 TAILQ_REMOVE(&aio_freeproc, aiop, list);
1353 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1354 aiop->aioprocflags &= ~AIOP_FREE;
1355 wakeup(aiop->aioproc);
1356 } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1357 ((ki->kaio_active_count + num_aio_resv_start) <
1358 ki->kaio_maxactive_count)) {
1359 num_aio_resv_start++;
1360 if ((error = aio_newproc()) == 0) {
1361 num_aio_resv_start--;
1362 goto retryproc;
1363 }
1364 num_aio_resv_start--;
1365 }
1366 splx(s);
1367done:
1368 return error;
1369}
1370
1371/*
1372 * This routine queues an AIO request, checking for quotas.
1373 */
1374static int
1375aio_aqueue(struct proc *p, struct aiocb *job, int type)
1376{
1377 struct kaioinfo *ki;
1378
1379 if (p->p_aioinfo == NULL)
1380 aio_init_aioinfo(p);
1381
1382 if (num_queue_count >= max_queue_count)
1383 return EAGAIN;
1384
1385 ki = p->p_aioinfo;
1386 if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
1387 return EAGAIN;
1388
1389 return _aio_aqueue(p, job, NULL, type);
1390}
1391#endif /* VFS_AIO */
1392
1393/*
1394 * Support the aio_return system call; as a side effect, kernel resources
1395 * are released.
1396 */
1397int
1398aio_return(struct proc *p, struct aio_return_args *uap)
1399{
1400#ifndef VFS_AIO
1401 return ENOSYS;
1402#else
1403 int s;
1404 long jobref;
1405 struct aiocblist *cb, *ncb;
1406 struct aiocb *ujob;
1407 struct kaioinfo *ki;
1408
1409 ki = p->p_aioinfo;
1410 if (ki == NULL)
1411 return EINVAL;
1412
1413 ujob = uap->aiocbp;
1414
1415 jobref = fuword(&ujob->_aiocb_private.kernelinfo);
1416 if (jobref == -1 || jobref == 0)
1417 return EINVAL;
1418
1419 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1420 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
1421 jobref) {
1422 if (ujob == cb->uuaiocb) {
1423 p->p_retval[0] =
1424 cb->uaiocb._aiocb_private.status;
1425 } else
1426 p->p_retval[0] = EFAULT;
1427 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1428 p->p_stats->p_ru.ru_oublock +=
1429 cb->outputcharge;
1430 cb->outputcharge = 0;
1431 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1432 p->p_stats->p_ru.ru_inblock += cb->inputcharge;
1433 cb->inputcharge = 0;
1434 }
1435 aio_free_entry(cb);
1436 return 0;
1437 }
1438 }
1439 s = splbio();
1440 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
1441 ncb = TAILQ_NEXT(cb, plist);
1442 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
1443 == jobref) {
1444 splx(s);
1445 if (ujob == cb->uuaiocb) {
1446 p->p_retval[0] =
1447 cb->uaiocb._aiocb_private.status;
1448 } else
1449 p->p_retval[0] = EFAULT;
1450 aio_free_entry(cb);
1451 return 0;
1452 }
1453 }
1454 splx(s);
1455
1456 return (EINVAL);
1457#endif /* VFS_AIO */
1458}
1459
1460/*
1461 * Allow a process to wakeup when any of the I/O requests are completed.
1462 */
1463int
1464aio_suspend(struct proc *p, struct aio_suspend_args *uap)
1465{
1466#ifndef VFS_AIO
1467 return ENOSYS;
1468#else
1469 struct timeval atv;
1470 struct timespec ts;
1471 struct aiocb *const *cbptr, *cbp;
1472 struct kaioinfo *ki;
1473 struct aiocblist *cb;
1474 int i;
1475 int njoblist;
1476 int error, s, timo;
1477 long *ijoblist;
1478 struct aiocb **ujoblist;
1479
1480 if (uap->nent > AIO_LISTIO_MAX)
1481 return EINVAL;
1482
1483 timo = 0;
1484 if (uap->timeout) {
1485 /* Get timespec struct. */
1486 if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
1487 return error;
1488
1489 if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
1490 return (EINVAL);
1491
1492 TIMESPEC_TO_TIMEVAL(&atv, &ts);
1493 if (itimerfix(&atv))
1494 return (EINVAL);
1495 timo = tvtohz(&atv);
1496 }
1497
1498 ki = p->p_aioinfo;
1499 if (ki == NULL)
1500 return EAGAIN;
1501
1502 njoblist = 0;
1503 ijoblist = zalloc(aiol_zone);
1504 ujoblist = zalloc(aiol_zone);
1505 cbptr = uap->aiocbp;
1506
1507 for (i = 0; i < uap->nent; i++) {
1508 cbp = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
1509 if (cbp == 0)
1510 continue;
1511 ujoblist[njoblist] = cbp;
1512 ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
1513 njoblist++;
1514 }
1515
1516 if (njoblist == 0) {
1517 zfree(aiol_zone, ijoblist);
1518 zfree(aiol_zone, ujoblist);
1519 return 0;
1520 }
1521
1522 error = 0;
1523 for (;;) {
1524 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1525 for (i = 0; i < njoblist; i++) {
1526 if (((intptr_t)
1527 cb->uaiocb._aiocb_private.kernelinfo) ==
1528 ijoblist[i]) {
1529 if (ujoblist[i] != cb->uuaiocb)
1530 error = EINVAL;
1531 zfree(aiol_zone, ijoblist);
1532 zfree(aiol_zone, ujoblist);
1533 return error;
1534 }
1535 }
1536 }
1537
1538 s = splbio();
1539 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
1540 TAILQ_NEXT(cb, plist)) {
1541 for (i = 0; i < njoblist; i++) {
1542 if (((intptr_t)
1543 cb->uaiocb._aiocb_private.kernelinfo) ==
1544 ijoblist[i]) {
1545 splx(s);
1546 if (ujoblist[i] != cb->uuaiocb)
1547 error = EINVAL;
1548 zfree(aiol_zone, ijoblist);
1549 zfree(aiol_zone, ujoblist);
1550 return error;
1551 }
1552 }
1553 }
1554
1555 ki->kaio_flags |= KAIO_WAKEUP;
1556 error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo);
1557 splx(s);
1558
1559 if (error == ERESTART || error == EINTR) {
1560 zfree(aiol_zone, ijoblist);
1561 zfree(aiol_zone, ujoblist);
1562 return EINTR;
1563 } else if (error == EWOULDBLOCK) {
1564 zfree(aiol_zone, ijoblist);
1565 zfree(aiol_zone, ujoblist);
1566 return EAGAIN;
1567 }
1568 }
1569
1570/* NOTREACHED */
1571 return EINVAL;
1572#endif /* VFS_AIO */
1573}
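/*
 * Example (userland sketch): wait up to one second for any of a set of
 * previously submitted requests to complete.  cb0 and cb1 are assumed to
 * be aiocbs already queued with aio_read()/aio_write().
 *
 *	const struct aiocb *list[2] = { &cb0, &cb1 };
 *	struct timespec ts = { 1, 0 };
 *
 *	if (aio_suspend(list, 2, &ts) == -1 && errno == EAGAIN)
 *		printf("still in progress after one second\n");
 */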
1574
1575/*
1576 * aio_cancel cancels any non-physio aio operations not currently in
1577 * progress.
1578 */
1579int
1580aio_cancel(struct proc *p, struct aio_cancel_args *uap)
1581{
1582#ifndef VFS_AIO
1583 return ENOSYS;
1584#else
1585 struct kaioinfo *ki;
1586 struct aiocblist *cbe, *cbn;
1587 struct file *fp;
1588 struct filedesc *fdp;
1589 struct socket *so;
1590 struct proc *po;
1591 int s,error;
1592 int cancelled=0;
1593 int notcancelled=0;
1594 struct vnode *vp;
1595
1596 fdp = p->p_fd;
1597 if ((u_int)uap->fd >= fdp->fd_nfiles ||
1598 (fp = fdp->fd_ofiles[uap->fd]) == NULL)
1599 return (EBADF);
1600
1601 if (fp->f_type == DTYPE_VNODE) {
1602 vp = (struct vnode *)fp->f_data;
1603
1604 if (vn_isdisk(vp,&error)) {
1605 p->p_retval[0] = AIO_NOTCANCELED;
1606 return 0;
1607 }
1608 } else if (fp->f_type == DTYPE_SOCKET) {
1609 so = (struct socket *)fp->f_data;
1610
1611 s = splnet();
1612
1613 for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
1614 cbn = TAILQ_NEXT(cbe, list);
1615 if ((uap->aiocbp == NULL) ||
1616 (uap->aiocbp == cbe->uuaiocb) ) {
1617 po = cbe->userproc;
1618 ki = po->p_aioinfo;
1619 TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
1620 TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
1621 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist);
1622 if (ki->kaio_flags & KAIO_WAKEUP) {
1623 wakeup(po);
1624 }
1625 cbe->jobstate = JOBST_JOBFINISHED;
1626 cbe->uaiocb._aiocb_private.status=-1;
1627 cbe->uaiocb._aiocb_private.error=ECANCELED;
1628 cancelled++;
1629/* XXX cancelled, knote? */
1630 if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1631 SIGEV_SIGNAL)
1632 psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1633 if (uap->aiocbp)
1634 break;
1635 }
1636 }
1637 splx(s);
1638
1639 if ((cancelled) && (uap->aiocbp)) {
1640 p->p_retval[0] = AIO_CANCELED;
1641 return 0;
1642 }
1643 }
1644 ki=p->p_aioinfo;
1645 if (ki == NULL)
1646 goto done;
1647 s = splnet();
1648
1649 for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
1650 cbn = TAILQ_NEXT(cbe, plist);
1651
1652 if ((uap->fd == cbe->uaiocb.aio_fildes) &&
1653 ((uap->aiocbp == NULL ) ||
1654 (uap->aiocbp == cbe->uuaiocb))) {
1655
1656 if (cbe->jobstate == JOBST_JOBQGLOBAL) {
1657 TAILQ_REMOVE(&aio_jobs, cbe, list);
1658 TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
1659 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
1660 plist);
1661 cancelled++;
1662 ki->kaio_queue_finished_count++;
1663 cbe->jobstate = JOBST_JOBFINISHED;
1664 cbe->uaiocb._aiocb_private.status = -1;
1665 cbe->uaiocb._aiocb_private.error = ECANCELED;
1666/* XXX cancelled, knote? */
1667 if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1668 SIGEV_SIGNAL)
1669 psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1670 } else {
1671 notcancelled++;
1672 }
1673 }
1674 }
1675 splx(s);
1676done:
1677 if (notcancelled) {
1678 p->p_retval[0] = AIO_NOTCANCELED;
1679 return 0;
1680 }
1681 if (cancelled) {
1682 p->p_retval[0] = AIO_CANCELED;
1683 return 0;
1684 }
1685 p->p_retval[0] = AIO_ALLDONE;
1686
1687 return 0;
1688#endif /* VFS_AIO */
1689}
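/*
 * Example (userland sketch): try to cancel every outstanding request on a
 * descriptor and interpret the three possible results returned above.
 *
 *	int r = aio_cancel(fd, NULL);
 *
 *	if (r == AIO_CANCELED)
 *		printf("all requests cancelled\n");
 *	else if (r == AIO_NOTCANCELED)
 *		printf("at least one request is still in progress\n");
 *	else if (r == AIO_ALLDONE)
 *		printf("nothing was outstanding\n");
 */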
1690
1691/*
1692 * aio_error is implemented at the kernel level for compatibility purposes only.
1693 * For a user mode async implementation, it would be best to do it in a userland
1694 * subroutine.
1695 */
1696int
1697aio_error(struct proc *p, struct aio_error_args *uap)
1698{
1699#ifndef VFS_AIO
1700 return ENOSYS;
1701#else
1702 int s;
1703 struct aiocblist *cb;
1704 struct kaioinfo *ki;
1705 long jobref;
1706
1707 ki = p->p_aioinfo;
1708 if (ki == NULL)
1709 return EINVAL;
1710
1711 jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
1712 if ((jobref == -1) || (jobref == 0))
1713 return EINVAL;
1714
1715 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1716 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1717 jobref) {
1718 p->p_retval[0] = cb->uaiocb._aiocb_private.error;
1719 return 0;
1720 }
1721 }
1722
1723 s = splnet();
1724
1725 for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
1726 plist)) {
1727 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1728 jobref) {
1729 p->p_retval[0] = EINPROGRESS;
1730 splx(s);
1731 return 0;
1732 }
1733 }
1734
1735 for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
1736 plist)) {
1737 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1738 jobref) {
1739 p->p_retval[0] = EINPROGRESS;
1740 splx(s);
1741 return 0;
1742 }
1743 }
1744 splx(s);
1745
1746 s = splbio();
1747 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
1748 plist)) {
1749 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1750 jobref) {
1751 p->p_retval[0] = cb->uaiocb._aiocb_private.error;
1752 splx(s);
1753 return 0;
1754 }
1755 }
1756
1757 for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
1758 plist)) {
1759 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1760 jobref) {
1761 p->p_retval[0] = EINPROGRESS;
1762 splx(s);
1763 return 0;
1764 }
1765 }
1766 splx(s);
1767
1768#if (0)
1769 /*
1770 * Hack for lio.
1771 */
1772 status = fuword(&uap->aiocbp->_aiocb_private.status);
1773 if (status == -1)
1774 return fuword(&uap->aiocbp->_aiocb_private.error);
1775#endif
1776 return EINVAL;
1777#endif /* VFS_AIO */
1778}
1779
1780/* syscall - asynchronous read from a file (REALTIME) */
1781int
1782aio_read(struct proc *p, struct aio_read_args *uap)
1783{
1784#ifndef VFS_AIO
1785 return ENOSYS;
1786#else
1787 return aio_aqueue(p, uap->aiocbp, LIO_READ);
1788#endif /* VFS_AIO */
1789}
1790
1791/* syscall - asynchronous write to a file (REALTIME) */
1792int
1793aio_write(struct proc *p, struct aio_write_args *uap)
1794{
1795#ifndef VFS_AIO
1796 return ENOSYS;
1797#else
1798 return aio_aqueue(p, uap->aiocbp, LIO_WRITE);
1799#endif /* VFS_AIO */
1800}
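/*
 * Example (userland sketch): request signal-based completion notification
 * for a write.  fd, buf and len are assumed to exist, and a SIGUSR1
 * handler is assumed to be installed; this implementation simply posts
 * the signal with psignal() (see aio_daemon()/process_signal()).
 *
 *	struct aiocb cb;
 *
 *	memset(&cb, 0, sizeof(cb));
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = len;
 *	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
 *	cb.aio_sigevent.sigev_signo = SIGUSR1;
 *	if (aio_write(&cb) != 0)
 *		err(1, "aio_write");
 */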
1801
1802/* syscall - XXX undocumented */
1803int
1804lio_listio(struct proc *p, struct lio_listio_args *uap)
1805{
1806#ifndef VFS_AIO
1807 return ENOSYS;
1808#else
1809 int nent, nentqueued;
1810 struct aiocb *iocb, * const *cbptr;
1811 struct aiocblist *cb;
1812 struct kaioinfo *ki;
1813 struct aio_liojob *lj;
1814 int error, runningcode;
1815 int nerror;
1816 int i;
1817 int s;
1818
1819 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
1820 return EINVAL;
1821
1822 nent = uap->nent;
1823 if (nent > AIO_LISTIO_MAX)
1824 return EINVAL;
1825
1826 if (p->p_aioinfo == NULL)
1827 aio_init_aioinfo(p);
1828
1829 if ((nent + num_queue_count) > max_queue_count)
1830 return EAGAIN;
1831
1832 ki = p->p_aioinfo;
1833 if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
1834 return EAGAIN;
1835
1836 lj = zalloc(aiolio_zone);
1837 if (!lj)
1838 return EAGAIN;
1839
1840 lj->lioj_flags = 0;
1841 lj->lioj_buffer_count = 0;
1842 lj->lioj_buffer_finished_count = 0;
1843 lj->lioj_queue_count = 0;
1844 lj->lioj_queue_finished_count = 0;
1845 lj->lioj_ki = ki;
1846
1847 /*
1848 * Setup signal.
1849 */
1850 if (uap->sig && (uap->mode == LIO_NOWAIT)) {
1851 error = copyin(uap->sig, &lj->lioj_signal,
1852 sizeof(lj->lioj_signal));
1853 if (error) {
1854 zfree(aiolio_zone, lj);
1855 return error;
1856 }
1857 if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
1858 zfree(aiolio_zone, lj);
1859 return EINVAL;
1860 }
1861 lj->lioj_flags |= LIOJ_SIGNAL;
1862 lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
1863 } else
1864 lj->lioj_flags &= ~LIOJ_SIGNAL;
1865
1866 TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
1867 /*
1868 * Get pointers to the list of I/O requests.
1869 */
1870 nerror = 0;
1871 nentqueued = 0;
1872 cbptr = uap->acb_list;
1873 for (i = 0; i < uap->nent; i++) {
1874 iocb = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
1875 if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) {
1876 error = _aio_aqueue(p, iocb, lj, 0);
1877 if (error == 0)
1878 nentqueued++;
1879 else
1880 nerror++;
1881 }
1882 }
1883
1884 /*
1885 * If we haven't queued any, then just return error.
1886 */
1887 if (nentqueued == 0)
1888 return 0;
1889
1890 /*
1891 * Calculate the appropriate error return.
1892 */
1893 runningcode = 0;
1894 if (nerror)
1895 runningcode = EIO;
1896
1897 if (uap->mode == LIO_WAIT) {
1898 int command, found, jobref;
1899
1900 for (;;) {
1901 found = 0;
1902 for (i = 0; i < uap->nent; i++) {
1903 /*
1904 * Fetch address of the control buf pointer in
1905 * user space.
1906 */
1907 iocb = (struct aiocb *)
1908 (intptr_t)fuword(&cbptr[i]);
1909 if (((intptr_t)iocb == -1) || ((intptr_t)iocb
1910 == 0))
1911 continue;
1912
1913 /*
1914 * Fetch the associated command from user space.
1915 */
1916 command = fuword(&iocb->aio_lio_opcode);
1917 if (command == LIO_NOP) {
1918 found++;
1919 continue;
1920 }
1921
1922 jobref = fuword(&iocb->_aiocb_private.kernelinfo);
1923
1924 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1925 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
1926 == jobref) {
1927 if (cb->uaiocb.aio_lio_opcode
1928 == LIO_WRITE) {
1929 p->p_stats->p_ru.ru_oublock
1930 +=
1931 cb->outputcharge;
1932 cb->outputcharge = 0;
1933 } else if (cb->uaiocb.aio_lio_opcode
1934 == LIO_READ) {
1935 p->p_stats->p_ru.ru_inblock
1936 += cb->inputcharge;
1937 cb->inputcharge = 0;
1938 }
1939 found++;
1940 break;
1941 }
1942 }
1943
1944 s = splbio();
1945 TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) {
1946 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
1947 == jobref) {
1948 found++;
1949 break;
1950 }
1951 }
1952 splx(s);
1953 }
1954
1955 /*
1956 * If all I/Os have been disposed of, then we can
1957 * return.
1958 */
1959 if (found == nentqueued)
1960 return runningcode;
1961
1962 ki->kaio_flags |= KAIO_WAKEUP;
1963 error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0);
1964
1965 if (error == EINTR)
1966 return EINTR;
1967 else if (error == EWOULDBLOCK)
1968 return EAGAIN;
1969 }
1970 }
1971
1972 return runningcode;
1973#endif /* VFS_AIO */
1974}
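/*
 * Example (userland sketch): submit two reads as one batch and block
 * until both have finished.  fd, buf0 and buf1 are assumed to exist.
 *
 *	struct aiocb a, b;
 *	struct aiocb *batch[2] = { &a, &b };
 *
 *	memset(&a, 0, sizeof(a));
 *	memset(&b, 0, sizeof(b));
 *	a.aio_fildes = b.aio_fildes = fd;
 *	a.aio_lio_opcode = b.aio_lio_opcode = LIO_READ;
 *	a.aio_buf = buf0;	a.aio_nbytes = 4096;	a.aio_offset = 0;
 *	b.aio_buf = buf1;	b.aio_nbytes = 4096;	b.aio_offset = 4096;
 *	if (lio_listio(LIO_WAIT, batch, 2, NULL) == -1)
 *		err(1, "lio_listio");
 */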
1975
1976#ifdef VFS_AIO
1977/*
1978 * This is a weird hack so that we can post a signal. It is safe to do so from
1979 * a timeout routine, but *not* from an interrupt routine.
1980 */
1981static void
1982process_signal(void *aioj)
1983{
1984 struct aiocblist *aiocbe = aioj;
1985 struct aio_liojob *lj = aiocbe->lio;
1986 struct aiocb *cb = &aiocbe->uaiocb;
1987
1988 if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) &&
1989 (lj->lioj_queue_count == lj->lioj_queue_finished_count)) {
1990 psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
1991 lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
1992 }
1993
1994 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL)
1995 psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo);
1996}
1997
1998/*
1999 * Interrupt handler for physio; it performs the necessary process wakeups
2000 * and signals.
2001 */
2002static void
2003aio_physwakeup(struct buf *bp)
2004{
2005 struct aiocblist *aiocbe;
2006 struct proc *p;
2007 struct kaioinfo *ki;
2008 struct aio_liojob *lj;
2009
2010 wakeup(bp);
2011
2012 aiocbe = (struct aiocblist *)bp->b_spc;
2013 if (aiocbe) {
2014 p = bp->b_caller1;
2015
2016 aiocbe->jobstate = JOBST_JOBBFINISHED;
2017 aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
2018 aiocbe->uaiocb._aiocb_private.error = 0;
2019 aiocbe->jobflags |= AIOCBLIST_DONE;
2020
2021 if (bp->b_flags & B_ERROR)
2022 aiocbe->uaiocb._aiocb_private.error = bp->b_error;
2023
2024 lj = aiocbe->lio;
2025 if (lj) {
2026 lj->lioj_buffer_finished_count++;
2027
2028 /*
2029 * wakeup/signal if all of the interrupt jobs are done.
2030 */
2031 if (lj->lioj_buffer_finished_count ==
2032 lj->lioj_buffer_count) {
2033 /*
2034 * Post a signal if it is called for.
2035 */
2036 if ((lj->lioj_flags &
2037 (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
2038 LIOJ_SIGNAL) {
2039 lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2040 aiocbe->timeouthandle =
2041 timeout(process_signal,
2042 aiocbe, 0);
2043 }
2044 }
2045 }
2046
2047 ki = p->p_aioinfo;
2048 if (ki) {
2049 ki->kaio_buffer_finished_count++;
2050 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
2051 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
2052 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
2053
2054 KNOTE(&aiocbe->klist, 0);
2055 /* Do the wakeup. */
2056 if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
2057 ki->kaio_flags &= ~KAIO_WAKEUP;
2058 wakeup(p);
2059 }
2060 }
2061
2062 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL)
2063 aiocbe->timeouthandle =
2064 timeout(process_signal, aiocbe, 0);
2065 }
2066}
2067#endif /* VFS_AIO */
2068
2069/* syscall - wait for the next completion of an aio request */
2070int
2071aio_waitcomplete(struct proc *p, struct aio_waitcomplete_args *uap)
2072{
2073#ifndef VFS_AIO
2074 return ENOSYS;
2075#else
2076 struct timeval atv;
2077 struct timespec ts;
2078 struct kaioinfo *ki;
2079 struct aiocblist *cb = NULL;
2080 int error, s, timo;
2081
2082 suword(uap->aiocbp, (int)NULL);
2083
2084 timo = 0;
2085 if (uap->timeout) {
2086 /* Get timespec struct. */
2087 error = copyin(uap->timeout, &ts, sizeof(ts));
2088 if (error)
2089 return error;
2090
2091 if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
2092 return (EINVAL);
2093
2094 TIMESPEC_TO_TIMEVAL(&atv, &ts);
2095 if (itimerfix(&atv))
2096 return (EINVAL);
2097 timo = tvtohz(&atv);
2098 }
2099
2100 ki = p->p_aioinfo;
2101 if (ki == NULL)
2102 return EAGAIN;
2103
2104 for (;;) {
2105 if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) {
2106 suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
2107 p->p_retval[0] = cb->uaiocb._aiocb_private.status;
2108 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
2109 p->p_stats->p_ru.ru_oublock +=
2110 cb->outputcharge;
2111 cb->outputcharge = 0;
2112 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
2113 p->p_stats->p_ru.ru_inblock += cb->inputcharge;
2114 cb->inputcharge = 0;
2115 }
2116 aio_free_entry(cb);
2117 return cb->uaiocb._aiocb_private.error;
2118 }
2119
2120 s = splbio();
2121 if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0 ) {
2122 splx(s);
2123 suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
2124 p->p_retval[0] = cb->uaiocb._aiocb_private.status;
2125 aio_free_entry(cb);
2126 return cb->uaiocb._aiocb_private.error;
2127 }
2128
2129 ki->kaio_flags |= KAIO_WAKEUP;
2130 error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo);
2131 splx(s);
2132
2133 if (error == ERESTART)
2134 return EINTR;
2135 else if (error < 0)
2136 return error;
2137 else if (error == EINTR)
2138 return EINTR;
2139 else if (error == EWOULDBLOCK)
2140 return EAGAIN;
2141 }
2142#endif /* VFS_AIO */
2143}
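/*
 * Example (userland sketch): aio_waitcomplete() is the FreeBSD-style
 * "give me whichever request finishes next" interface.  A NULL timeout
 * blocks until something completes; the completed aiocb pointer is
 * returned through the first argument.
 *
 *	struct aiocb *done;
 *	ssize_t n;
 *
 *	n = aio_waitcomplete(&done, NULL);
 *	if (n == -1)
 *		err(1, "aio_waitcomplete");
 *	printf("request %p transferred %zd bytes\n", (void *)done, n);
 */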
2144
2145#ifndef VFS_AIO
2146static int
2147filt_aioattach(struct knote *kn)
2148{
2149
2150 return (ENXIO);
2151}
2152
2153struct filterops aio_filtops =
2154 { 0, filt_aioattach, NULL, NULL };
2155
2156#else
2157/* kqueue attach function */
2158static int
2159filt_aioattach(struct knote *kn)
2160{
2161 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
2162
2163 /*
2164 * The aiocbe pointer must be validated before using it, so
2165 * registration is restricted to the kernel; the user cannot
2166 * set EV_FLAG1.
2167 */
2168 if ((kn->kn_flags & EV_FLAG1) == 0)
2169 return (EPERM);
2170 kn->kn_flags &= ~EV_FLAG1;
2171
2172 SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext);
2173
2174 return (0);
2175}
2176
2177/* kqueue detach function */
2178static void
2179filt_aiodetach(struct knote *kn)
2180{
2181 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
2182
2183 SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext);
2184}
2185
2186/* kqueue filter function */
2187/*ARGSUSED*/
2188static int
2189filt_aio(struct knote *kn, long hint)
2190{
2191 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
2192
2193 kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
2194 if (aiocbe->jobstate != JOBST_JOBFINISHED &&
2195 aiocbe->jobstate != JOBST_JOBBFINISHED)
2196 return (0);
2197 kn->kn_flags |= EV_EOF;
2198 return (1);
2199}
2200
2201struct filterops aio_filtops =
2202 { 0, filt_aioattach, filt_aiodetach, filt_aio };
2203#endif /* VFS_AIO */
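/*
 * Example (userland sketch): kqueue-based completion notification, the
 * path exercised by the EVFILT_AIO filter above.  kq is assumed to be a
 * descriptor obtained from kqueue() and fd an open file.  On completion
 * the event's ident is the userland aiocb pointer and udata is the value
 * supplied in sigev_value.
 *
 *	struct aiocb cb;
 *	struct kevent ev;
 *
 *	memset(&cb, 0, sizeof(cb));
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *	cb.aio_sigevent.sigev_notify_kqueue = kq;
 *	cb.aio_sigevent.sigev_value.sigval_ptr = &cb;
 *	if (aio_read(&cb) != 0)
 *		err(1, "aio_read");
 *	if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1)
 *		printf("aiocb %p done, error %ld\n",
 *		    (void *)ev.ident, (long)ev.data);
 */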