| Commit | Line | Data |
|---|---|---|
| 984263bc MD |
1 | /* |
| 2 | * Copyright (c) 1997 John S. Dyson. All rights reserved. | |
| 3 | * | |
| 4 | * Redistribution and use in source and binary forms, with or without | |
| 5 | * modification, are permitted provided that the following conditions | |
| 6 | * are met: | |
| 7 | * 1. Redistributions of source code must retain the above copyright | |
| 8 | * notice, this list of conditions and the following disclaimer. | |
| 9 | * 2. John S. Dyson's name may not be used to endorse or promote products | |
| 10 | * derived from this software without specific prior written permission. | |
| 11 | * | |
| 12 | * DISCLAIMER: This code isn't warranted to do anything useful. Anything | |
| 13 | * bad that happens because of using this software isn't the responsibility | |
| 14 | * of the author. This software is distributed AS-IS. | |
| 15 | * | |
| 16 | * $FreeBSD: src/sys/kern/vfs_aio.c,v 1.70.2.28 2003/05/29 06:15:35 alc Exp $ | |
| c34665ce | 17 | * $DragonFly: src/sys/kern/vfs_aio.c,v 1.42 2007/07/20 17:21:52 dillon Exp $ |
| 984263bc MD |
18 | */ |
| 19 | ||
| 20 | /* | |
| 21 | * This file contains support for the POSIX 1003.1B AIO/LIO facility. | |
| 22 | */ | |
| 23 | ||
| 24 | #include <sys/param.h> | |
| 25 | #include <sys/systm.h> | |
| 26 | #include <sys/buf.h> | |
| 27 | #include <sys/sysproto.h> | |
| 28 | #include <sys/filedesc.h> | |
| 29 | #include <sys/kernel.h> | |
| 30 | #include <sys/fcntl.h> | |
| 31 | #include <sys/file.h> | |
| 32 | #include <sys/lock.h> | |
| 33 | #include <sys/unistd.h> | |
| 34 | #include <sys/proc.h> | |
| 35 | #include <sys/resourcevar.h> | |
| 36 | #include <sys/signalvar.h> | |
| 37 | #include <sys/protosw.h> | |
| 38 | #include <sys/socketvar.h> | |
| 39 | #include <sys/sysctl.h> | |
| 40 | #include <sys/vnode.h> | |
| 41 | #include <sys/conf.h> | |
| 42 | #include <sys/event.h> | |
| 5a26d050 | 43 | #include <sys/objcache.h> |
| 984263bc MD |
44 | |
| 45 | #include <vm/vm.h> | |
| 46 | #include <vm/vm_extern.h> | |
| 47 | #include <vm/pmap.h> | |
| 48 | #include <vm/vm_map.h> | |
| 984263bc | 49 | #include <sys/aio.h> |
| 684a93c4 | 50 | |
| 7b95be2a MD |
51 | #include <sys/file2.h> |
| 52 | #include <sys/buf2.h> | |
| ba39e2e0 | 53 | #include <sys/sysref2.h> |
| 831f78e5 | 54 | #include <sys/thread2.h> |
| 684a93c4 | 55 | #include <sys/mplock2.h> |
| 984263bc MD |
56 | |
| 57 | #include <machine/limits.h> | |
| 58 | #include "opt_vfs_aio.h" | |
| 59 | ||
| 60 | #ifdef VFS_AIO | |
| 61 | ||
| 62 | /* | |
| 63 | * Counter for allocating reference ids to new jobs. Wrapped to 1 on | |
| 64 | * overflow. | |
| 65 | */ | |
| 66 | static long jobrefid; | |
| 67 | ||
| 68 | #define JOBST_NULL 0x0 /* entry is free; freeing it again panics */ | |
| 69 | #define JOBST_JOBQGLOBAL 0x2 /* queued on aio_jobs and the per-process job queue */ | |
| 70 | #define JOBST_JOBRUNNING 0x3 /* currently being processed by an AIO daemon */ | |
| 71 | #define JOBST_JOBFINISHED 0x4 /* daemon I/O done; on the per-process jobdone queue */ | |
| 72 | #define JOBST_JOBQBUF 0x5 /* buffer (physio) job that still needs aio_fphysio() finish-up */ | |
| 73 | #define JOBST_JOBBFINISHED 0x6 /* buffer job done; on the per-process bufdone queue */ | |
| 74 | ||
| 75 | #ifndef MAX_AIO_PER_PROC | |
| 76 | #define MAX_AIO_PER_PROC 32 | |
| 77 | #endif | |
| 78 | ||
| 79 | #ifndef MAX_AIO_QUEUE_PER_PROC | |
| 80 | #define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */ | |
| 81 | #endif | |
| 82 | ||
| 83 | #ifndef MAX_AIO_PROCS | |
| 84 | #define MAX_AIO_PROCS 32 | |
| 85 | #endif | |
| 86 | ||
| 87 | #ifndef MAX_AIO_QUEUE | |
| 88 | #define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */ | |
| 89 | #endif | |
| 90 | ||
| 91 | #ifndef TARGET_AIO_PROCS | |
| 92 | #define TARGET_AIO_PROCS 4 | |
| 93 | #endif | |
| 94 | ||
| 95 | #ifndef MAX_BUF_AIO | |
| 96 | #define MAX_BUF_AIO 16 | |
| 97 | #endif | |
| 98 | ||
| 99 | #ifndef AIOD_TIMEOUT_DEFAULT | |
| 100 | #define AIOD_TIMEOUT_DEFAULT (10 * hz) | |
| 101 | #endif | |
| 102 | ||
| 103 | #ifndef AIOD_LIFETIME_DEFAULT | |
| 104 | #define AIOD_LIFETIME_DEFAULT (30 * hz) | |
| 105 | #endif | |
| 106 | ||
| 107 | SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management"); | |
| 108 | ||
| 109 | static int max_aio_procs = MAX_AIO_PROCS; | |
| 110 | SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, | |
| 111 | CTLFLAG_RW, &max_aio_procs, 0, | |
| 112 | "Maximum number of kernel threads to use for handling async IO"); | |
| 113 | ||
| 114 | static int num_aio_procs = 0; | |
| 115 | SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, | |
| 116 | CTLFLAG_RD, &num_aio_procs, 0, | |
| 117 | "Number of presently active kernel threads for async IO"); | |
| 118 | ||
| 119 | /* | |
| 120 | * The code will adjust the actual number of AIO processes towards this | |
| 121 | * number when it gets a chance. | |
| 122 | */ | |
| 123 | static int target_aio_procs = TARGET_AIO_PROCS; | |
| 124 | SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs, | |
| 125 | 0, "Preferred number of ready kernel threads for async IO"); | |
| 126 | ||
| 127 | static int max_queue_count = MAX_AIO_QUEUE; | |
| 128 | SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0, | |
| 129 | "Maximum number of aio requests to queue, globally"); | |
| 130 | ||
| 131 | static int num_queue_count = 0; | |
| 132 | SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0, | |
| 133 | "Number of queued aio requests"); | |
| 134 | ||
| 135 | static int num_buf_aio = 0; | |
| 136 | SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0, | |
| 137 | "Number of aio requests presently handled by the buf subsystem"); | |
| 138 | ||
| 139 | /* Number of async I/O threads in the process of being started */ | |
| 140 | /* XXX This should be local to _aio_aqueue() */ | |
| 141 | static int num_aio_resv_start = 0; | |
| 142 | ||
| 143 | static int aiod_timeout; | |
| 144 | SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0, | |
| 145 | "Timeout value for synchronous aio operations"); | |
| 146 | ||
| 147 | static int aiod_lifetime; | |
| 148 | SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0, | |
| 149 | "Maximum lifetime for idle aiod"); | |
| 150 | ||
| 151 | static int max_aio_per_proc = MAX_AIO_PER_PROC; | |
| 152 | SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc, | |
| 153 | 0, "Maximum active aio requests per process (stored in the process)"); | |
| 154 | ||
| 155 | static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; | |
| 156 | SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW, | |
| 157 | &max_aio_queue_per_proc, 0, | |
| 158 | "Maximum queued aio requests per process (stored in the process)"); | |
| 159 | ||
| 160 | static int max_buf_aio = MAX_BUF_AIO; | |
| 161 | SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0, | |
| 162 | "Maximum buf aio requests per process (stored in the process)"); | |
| 163 | ||
| 164 | /* | |
| 165 | * AIO process info | |
| 166 | */ | |
| 167 | #define AIOP_FREE 0x1 /* proc on free queue */ | |
| 168 | #define AIOP_SCHED 0x2 /* proc explicitly scheduled */ | |
| 169 | ||
| 170 | struct aioproclist { | |
| 171 | int aioprocflags; /* AIOP_* flags (AIOP_FREE, AIOP_SCHED) */ | |
| 172 | TAILQ_ENTRY(aioproclist) list; /* linkage on aio_freeproc or aio_activeproc */ | |
| 173 | struct proc *aioproc; /* process running this AIO daemon */ | |
| 174 | }; | |
| 175 | ||
| 176 | /* | |
| 177 | * data-structure for lio signal management | |
| 178 | */ | |
| 179 | struct aio_liojob { | |
| 180 | int lioj_flags; /* LIOJ_* flags */ | |
| 181 | int lioj_buffer_count; /* buffer-path jobs still attached to this lio */ | |
| 182 | int lioj_buffer_finished_count; /* attached buffer-path jobs that have completed */ | |
| 183 | int lioj_queue_count; /* daemon-path jobs still attached to this lio */ | |
| 184 | int lioj_queue_finished_count; /* attached daemon-path jobs that have completed */ | |
| 185 | struct sigevent lioj_signal; /* signal on all I/O done */ | |
| 186 | TAILQ_ENTRY(aio_liojob) lioj_list; /* linkage on kaioinfo.kaio_liojoblist */ | |
| 187 | struct kaioinfo *lioj_ki; /* presumably the owning per-process block -- TODO confirm */ | |
| 188 | }; | |
| 189 | #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ | |
| 190 | #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ | |
| 191 | ||
| 192 | /* | |
| 193 | * per process aio data structure | |
| 194 | */ | |
| 195 | struct kaioinfo { | |
| 196 | int kaio_flags; /* per process KAIO_* flags */ | |
| 197 | int kaio_maxactive_count; /* maximum number of AIOs run concurrently by daemons */ | |
| 198 | int kaio_active_count; /* number of AIOs currently being run by daemons */ | |
| 199 | int kaio_qallowed_count; /* maximum size of AIO queue */ | |
| 200 | int kaio_queue_count; /* size of AIO queue */ | |
| 201 | int kaio_ballowed_count; /* maximum number of buffers */ | |
| 202 | int kaio_queue_finished_count; /* number of daemon jobs finished */ | |
| 203 | int kaio_buffer_count; /* number of physio buffers */ | |
| 204 | int kaio_buffer_finished_count; /* count of buffer I/O done */ | |
| 205 | struct proc *kaio_p; /* process that uses this kaio block */ | |
| 206 | TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */ | |
| 207 | TAILQ_HEAD(,aiocblist) kaio_jobqueue; /* job queue for process */ | |
| 208 | TAILQ_HEAD(,aiocblist) kaio_jobdone; /* done queue for process */ | |
| 209 | TAILQ_HEAD(,aiocblist) kaio_bufqueue; /* buffer job queue for process */ | |
| 210 | TAILQ_HEAD(,aiocblist) kaio_bufdone; /* buffer done queue for process */ | |
| 211 | TAILQ_HEAD(,aiocblist) kaio_sockqueue; /* queue for aios waiting on sockets */ | |
| 212 | }; | |
| 213 | ||
| 214 | #define KAIO_RUNDOWN 0x1 /* process is being run down */ | |
| 215 | #define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */ | |
| 216 | ||
| 217 | static TAILQ_HEAD(,aioproclist) aio_freeproc, aio_activeproc; | |
| 218 | static TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */ | |
| 219 | static TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */ | |
| 220 | static TAILQ_HEAD(,aiocblist) aio_freejobs; /* Pool of free jobs */ | |
| 221 | ||
| 222 | static void aio_init_aioinfo(struct proc *p); | |
| 223 | static void aio_onceonly(void *); | |
| 224 | static int aio_free_entry(struct aiocblist *aiocbe); | |
| 225 | static void aio_process(struct aiocblist *aiocbe); | |
| 226 | static int aio_newproc(void); | |
| 41c20dac | 227 | static int aio_aqueue(struct aiocb *job, int type); |
| 81b5c339 | 228 | static void aio_physwakeup(struct bio *bio); |
| 984263bc MD |
229 | static int aio_fphysio(struct aiocblist *aiocbe); |
| 230 | static int aio_qphysio(struct proc *p, struct aiocblist *iocb); | |
| 11aec7f4 | 231 | static void aio_daemon(void *uproc, struct trapframe *frame); |
| 984263bc MD |
232 | static void process_signal(void *aioj); |
| 233 | ||
| 234 | SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL); | |
| 235 | ||
| 236 | /* | |
| 237 | * Zones for: | |
| 238 | * kaio Per process async io info | |
| 239 | * aiop async io thread data | |
| 240 | * aiocb async io jobs | |
| 241 | * aiol list io job pointer - internal to aio_suspend XXX | |
| 242 | * aiolio list io jobs | |
| 243 | */ | |
| 5a26d050 SG |
244 | static struct objcache *kaio_oc, *aiop_oc, *aiocb_oc, *aiol_oc, *aiolio_oc; |
| 245 | ||
| 246 | static MALLOC_DEFINE(M_AIO, "AIO", "AIO"); | |
| 247 | static MALLOC_DEFINE(M_AIOP, "AIO proc", "AIO process"); | |
| 248 | static MALLOC_DEFINE(M_AIOCB, "AIO cb", "AIO cb"); | |
| 249 | static MALLOC_DEFINE(M_AIOL, "AIO list io", "AIO list io"); | |
| 250 | static MALLOC_DEFINE(M_AIOLIO, "AIO list io job", "AIO list io job"); | |
| 984263bc MD |
251 | |
| 252 | /* | |
| 253 | * Startup initialization | |
| 254 | */ | |
| 255 | static void | |
| 256 | aio_onceonly(void *na) | |
| 257 | { | |
| 258 | TAILQ_INIT(&aio_freeproc); | |
| 259 | TAILQ_INIT(&aio_activeproc); | |
| 260 | TAILQ_INIT(&aio_jobs); | |
| 261 | TAILQ_INIT(&aio_bufjobs); | |
| 262 | TAILQ_INIT(&aio_freejobs); | |
| 5a26d050 SG |
263 | kaio_oc = objcache_create_simple(M_AIO, sizeof(struct kaioinfo)); |
| 264 | aiop_oc = objcache_create_simple(M_AIOP, sizeof(struct aioproclist)); | |
| 265 | aiocb_oc = objcache_create_simple(M_AIOCB, sizeof(struct aiocblist)); | |
| 266 | aiol_oc = objcache_create_simple(M_AIOL, AIO_LISTIO_MAX*sizeof(intptr_t)); | |
| 267 | aiolio_oc = objcache_create_simple(M_AIOLIO, sizeof(struct aio_liojob)); | |
| 984263bc MD |
268 | aiod_timeout = AIOD_TIMEOUT_DEFAULT; |
| 269 | aiod_lifetime = AIOD_LIFETIME_DEFAULT; | |
| 270 | jobrefid = 1; | |
| 271 | } | |
| 272 | ||
| 273 | /* | |
| 274 | * Init the per-process aioinfo structure. The aioinfo limits are set | |
| 275 | * per-process for user limit (resource) management. | |
| 276 | */ | |
| 277 | static void | |
| 278 | aio_init_aioinfo(struct proc *p) | |
| 279 | { | |
| 280 | struct kaioinfo *ki; | |
| 281 | if (p->p_aioinfo == NULL) { | |
| 5a26d050 | 282 | ki = objcache_get(kaio_oc, M_WAITOK); |
| 984263bc MD |
283 | p->p_aioinfo = ki; |
| 284 | ki->kaio_flags = 0; | |
| 285 | ki->kaio_maxactive_count = max_aio_per_proc; | |
| 286 | ki->kaio_active_count = 0; | |
| 287 | ki->kaio_qallowed_count = max_aio_queue_per_proc; | |
| 288 | ki->kaio_queue_count = 0; | |
| 289 | ki->kaio_ballowed_count = max_buf_aio; | |
| 290 | ki->kaio_buffer_count = 0; | |
| 291 | ki->kaio_buffer_finished_count = 0; | |
| 292 | ki->kaio_p = p; | |
| 293 | TAILQ_INIT(&ki->kaio_jobdone); | |
| 294 | TAILQ_INIT(&ki->kaio_jobqueue); | |
| 295 | TAILQ_INIT(&ki->kaio_bufdone); | |
| 296 | TAILQ_INIT(&ki->kaio_bufqueue); | |
| 297 | TAILQ_INIT(&ki->kaio_liojoblist); | |
| 298 | TAILQ_INIT(&ki->kaio_sockqueue); | |
| 299 | } | |
| 300 | ||
| 301 | while (num_aio_procs < target_aio_procs) | |
| 302 | aio_newproc(); | |
| 303 | } | |
| 304 | ||
| 305 | /* | |
| 306 | * Free a job entry. Wait for completion if it is currently active, but don't | |
| 307 | * delay forever. If we delay, we return a flag that says that we have to | |
| 308 | * restart the queue scan. | |
| 309 | */ | |
| 310 | static int | |
| 311 | aio_free_entry(struct aiocblist *aiocbe) | |
| 312 | { | |
| 313 | struct kaioinfo *ki; | |
| 314 | struct aio_liojob *lj; | |
| 315 | struct proc *p; | |
| 316 | int error; | |
| 984263bc MD |
317 | |
| 318 | if (aiocbe->jobstate == JOBST_NULL) | |
| 319 | panic("aio_free_entry: freeing already free job"); | |
| 320 | ||
| 321 | p = aiocbe->userproc; | |
| 322 | ki = p->p_aioinfo; | |
| 323 | lj = aiocbe->lio; | |
| 324 | if (ki == NULL) | |
| 325 | panic("aio_free_entry: missing p->p_aioinfo"); | |
| 326 | ||
| 327 | while (aiocbe->jobstate == JOBST_JOBRUNNING) { | |
| 328 | aiocbe->jobflags |= AIOCBLIST_RUNDOWN; | |
| 377d4740 | 329 | tsleep(aiocbe, 0, "jobwai", 0); |
| 984263bc MD |
330 | } |
| 331 | if (aiocbe->bp == NULL) { | |
| 332 | if (ki->kaio_queue_count <= 0) | |
| 333 | panic("aio_free_entry: process queue size <= 0"); | |
| 334 | if (num_queue_count <= 0) | |
| 335 | panic("aio_free_entry: system wide queue size <= 0"); | |
| 336 | ||
| 337 | if (lj) { | |
| 338 | lj->lioj_queue_count--; | |
| 339 | if (aiocbe->jobflags & AIOCBLIST_DONE) | |
| 340 | lj->lioj_queue_finished_count--; | |
| 341 | } | |
| 342 | ki->kaio_queue_count--; | |
| 343 | if (aiocbe->jobflags & AIOCBLIST_DONE) | |
| 344 | ki->kaio_queue_finished_count--; | |
| 345 | num_queue_count--; | |
| 346 | } else { | |
| 347 | if (lj) { | |
| 348 | lj->lioj_buffer_count--; | |
| 349 | if (aiocbe->jobflags & AIOCBLIST_DONE) | |
| 350 | lj->lioj_buffer_finished_count--; | |
| 351 | } | |
| 352 | if (aiocbe->jobflags & AIOCBLIST_DONE) | |
| 353 | ki->kaio_buffer_finished_count--; | |
| 354 | ki->kaio_buffer_count--; | |
| 355 | num_buf_aio--; | |
| 356 | } | |
| 357 | ||
| 358 | /* aiocbe is going away, we need to destroy any knotes */ | |
| 08f2f1bb | 359 | /* XXX lwp knote wants a thread, but only cares about the process */ |
| 5b22f1a7 | 360 | knote_empty(&aiocbe->klist); |
| 984263bc MD |
361 | |
| 362 | if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN) | |
| 363 | && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) { | |
| 364 | ki->kaio_flags &= ~KAIO_WAKEUP; | |
| 365 | wakeup(p); | |
| 366 | } | |
| 367 | ||
| 368 | if (aiocbe->jobstate == JOBST_JOBQBUF) { | |
| 369 | if ((error = aio_fphysio(aiocbe)) != 0) | |
| 370 | return error; | |
| 371 | if (aiocbe->jobstate != JOBST_JOBBFINISHED) | |
| 372 | panic("aio_free_entry: invalid physio finish-up state"); | |
| e43a034f | 373 | crit_enter(); |
| 984263bc | 374 | TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); |
| e43a034f | 375 | crit_exit(); |
| 984263bc | 376 | } else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) { |
| e43a034f | 377 | crit_enter(); |
| 984263bc MD |
378 | TAILQ_REMOVE(&aio_jobs, aiocbe, list); |
| 379 | TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist); | |
| e43a034f | 380 | crit_exit(); |
| 984263bc MD |
381 | } else if (aiocbe->jobstate == JOBST_JOBFINISHED) |
| 382 | TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist); | |
| 383 | else if (aiocbe->jobstate == JOBST_JOBBFINISHED) { | |
| e43a034f | 384 | crit_enter(); |
| 984263bc | 385 | TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); |
| e43a034f | 386 | crit_exit(); |
| 984263bc MD |
387 | if (aiocbe->bp) { |
| 388 | vunmapbuf(aiocbe->bp); | |
| 389 | relpbuf(aiocbe->bp, NULL); | |
| 390 | aiocbe->bp = NULL; | |
| 391 | } | |
| 392 | } | |
| 393 | if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) { | |
| 394 | TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); | |
| 5a26d050 | 395 | objcache_put(aiolio_oc, lj); |
| 984263bc MD |
396 | } |
| 397 | aiocbe->jobstate = JOBST_NULL; | |
| a5eb27b6 | 398 | callout_stop(&aiocbe->timeout); |
| 9f87144f | 399 | fdrop(aiocbe->fd_file); |
| 984263bc MD |
400 | TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); |
| 401 | return 0; | |
| 402 | } | |
| 403 | #endif /* VFS_AIO */ | |
| 404 | ||
/*
 * Rundown the jobs for a given process.
 *
 * Waits (bounded by aiod_timeout per sleep) for jobs that are actively
 * being processed, moves jobs parked on sockets back onto the normal job
 * queues, then frees every job on every per-process queue.  Because
 * freeing can sleep and jobs can migrate between queues meanwhile, the
 * whole sweep is repeated until all four queues are empty.  Finally the
 * idle list-I/O descriptors and the per-process kaioinfo are released.
 *
 * Does nothing when the kernel is built without VFS_AIO or when the
 * process has no kaioinfo.
 */
void
aio_proc_rundown(struct proc *p)
{
#ifndef VFS_AIO
	return;
#else
	struct kaioinfo *ki;
	struct aio_liojob *lj, *ljn;
	struct aiocblist *aiocbe, *aiocbn;
	struct file *fp;
	struct socket *so;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return;

	/*
	 * Request a wakeup on the next significant event.  KAIO_WAKEUP is
	 * the kaio_flags bit for this; LIOJ_SIGNAL_POSTED has the same
	 * numeric value but belongs to lioj_flags and must not be used here.
	 */
	ki->kaio_flags |= KAIO_WAKEUP;
	while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
	    ki->kaio_buffer_finished_count)) {
		ki->kaio_flags |= KAIO_RUNDOWN;
		/* Give up waiting if the sleep times out or is interrupted. */
		if (tsleep(p, 0, "kaiowt", aiod_timeout))
			break;
	}

	/*
	 * Move any aio ops that are waiting on socket I/O to the normal job
	 * queues so they are cleaned up with any others.
	 */
	crit_enter();
	for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe != NULL;
	    aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		fp = aiocbe->fd_file;
		if (fp != NULL) {
			so = (struct socket *)fp->f_data;
			TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
			/* Last AIO on this socket: stop flagging its buffers. */
			if (TAILQ_EMPTY(&so->so_aiojobq)) {
				atomic_clear_int(&so->so_snd.ssb_flags,
				    SSB_AIO);
				atomic_clear_int(&so->so_rcv.ssb_flags,
				    SSB_AIO);
			}
		}
		TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
		TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
		TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
	}
	crit_exit();

	/*
	 * In each sweep below a non-zero return from aio_free_entry() means
	 * the entry was not freed and the scan has to start over.
	 */
restart1:
	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe != NULL;
	    aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart1;
	}

restart2:
	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe != NULL;
	    aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart2;
	}

	/* Wait for every in-flight buffer job to leave kaio_bufqueue. */
	crit_enter();
	while (TAILQ_FIRST(&ki->kaio_bufqueue) != NULL) {
		ki->kaio_flags |= KAIO_WAKEUP;
		tsleep(p, 0, "aioprn", 0);
	}
	crit_exit();

restart4:
	crit_enter();
	for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe != NULL;
	    aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe)) {
			crit_exit();
			goto restart4;
		}
	}
	crit_exit();

	/*
	 * If we've slept, jobs might have moved from one queue to another.
	 * Retry rundown if we didn't manage to empty the queues.
	 */
	if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL ||
	    TAILQ_FIRST(&ki->kaio_jobqueue) != NULL ||
	    TAILQ_FIRST(&ki->kaio_bufqueue) != NULL ||
	    TAILQ_FIRST(&ki->kaio_bufdone) != NULL)
		goto restart1;

	/* Release list-I/O descriptors that no longer have jobs attached. */
	for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj != NULL; lj = ljn) {
		ljn = TAILQ_NEXT(lj, lioj_list);
		if ((lj->lioj_buffer_count == 0) &&
		    (lj->lioj_queue_count == 0)) {
			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
			objcache_put(aiolio_oc, lj);
		} else {
#ifdef DIAGNOSTIC
			kprintf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
			    "QF:%d\n", lj->lioj_buffer_count,
			    lj->lioj_buffer_finished_count,
			    lj->lioj_queue_count,
			    lj->lioj_queue_finished_count);
#endif
		}
	}

	/*
	 * Detach the kaioinfo from the process before handing it back to
	 * the cache so p_aioinfo never refers to a released object.
	 */
	p->p_aioinfo = NULL;
	objcache_put(kaio_oc, ki);
#endif /* VFS_AIO */
}
| 524 | ||
| 525 | #ifdef VFS_AIO | |
| 526 | /* | |
| 527 | * Select a job to run (called by an AIO daemon). | |
| 528 | */ | |
| 529 | static struct aiocblist * | |
| 530 | aio_selectjob(struct aioproclist *aiop) | |
| 531 | { | |
| 984263bc MD |
532 | struct aiocblist *aiocbe; |
| 533 | struct kaioinfo *ki; | |
| 534 | struct proc *userp; | |
| 535 | ||
| e43a034f | 536 | crit_enter(); |
| 984263bc MD |
537 | for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe = |
| 538 | TAILQ_NEXT(aiocbe, list)) { | |
| 539 | userp = aiocbe->userproc; | |
| 540 | ki = userp->p_aioinfo; | |
| 541 | ||
| 542 | if (ki->kaio_active_count < ki->kaio_maxactive_count) { | |
| 543 | TAILQ_REMOVE(&aio_jobs, aiocbe, list); | |
| e43a034f | 544 | crit_exit(); |
| 984263bc MD |
545 | return aiocbe; |
| 546 | } | |
| 547 | } | |
| e43a034f | 548 | crit_exit(); |
| 984263bc MD |
549 | |
| 550 | return NULL; | |
| 551 | } | |
| 552 | ||
| 553 | /* | |
| 554 | * The AIO processing activity. This is the code that does the I/O request for | |
| 555 | * the non-physio version of the operations. The normal vn operations are used, | |
| 556 | * and this code should work in all instances for every type of file, including | |
| 557 | * pipes, sockets, fifos, and regular files. | |
| 558 | */ | |
| 559 | static void | |
| 560 | aio_process(struct aiocblist *aiocbe) | |
| 561 | { | |
| 7b95be2a | 562 | struct thread *mytd; |
| 984263bc MD |
563 | struct aiocb *cb; |
| 564 | struct file *fp; | |
| 565 | struct uio auio; | |
| 566 | struct iovec aiov; | |
| 567 | int cnt; | |
| 568 | int error; | |
| 569 | int oublock_st, oublock_end; | |
| 570 | int inblock_st, inblock_end; | |
| 571 | ||
| 7b95be2a | 572 | mytd = curthread; |
| 984263bc MD |
573 | cb = &aiocbe->uaiocb; |
| 574 | fp = aiocbe->fd_file; | |
| 575 | ||
| 576 | aiov.iov_base = (void *)(uintptr_t)cb->aio_buf; | |
| 577 | aiov.iov_len = cb->aio_nbytes; | |
| 578 | ||
| 579 | auio.uio_iov = &aiov; | |
| 580 | auio.uio_iovcnt = 1; | |
| 581 | auio.uio_offset = cb->aio_offset; | |
| 582 | auio.uio_resid = cb->aio_nbytes; | |
| 583 | cnt = cb->aio_nbytes; | |
| 584 | auio.uio_segflg = UIO_USERSPACE; | |
| 7b95be2a | 585 | auio.uio_td = mytd; |
| 984263bc | 586 | |
| fde7ac71 SS |
587 | inblock_st = mytd->td_lwp->lwp_ru.ru_inblock; |
| 588 | oublock_st = mytd->td_lwp->lwp_ru.ru_oublock; | |
| 984263bc MD |
589 | /* |
| 590 | * _aio_aqueue() acquires a reference to the file that is | |
| 591 | * released in aio_free_entry(). | |
| 592 | */ | |
| 593 | if (cb->aio_lio_opcode == LIO_READ) { | |
| 594 | auio.uio_rw = UIO_READ; | |
| 9ba76b73 | 595 | error = fo_read(fp, &auio, fp->f_cred, O_FOFFSET); |
| 984263bc MD |
596 | } else { |
| 597 | auio.uio_rw = UIO_WRITE; | |
| 9ba76b73 | 598 | error = fo_write(fp, &auio, fp->f_cred, O_FOFFSET); |
| 984263bc | 599 | } |
| fde7ac71 SS |
600 | inblock_end = mytd->td_lwp->lwp_ru.ru_inblock; |
| 601 | oublock_end = mytd->td_lwp->lwp_ru.ru_oublock; | |
| 984263bc MD |
602 | |
| 603 | aiocbe->inputcharge = inblock_end - inblock_st; | |
| 604 | aiocbe->outputcharge = oublock_end - oublock_st; | |
| 605 | ||
| 606 | if ((error) && (auio.uio_resid != cnt)) { | |
| 607 | if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) | |
| 608 | error = 0; | |
| 609 | if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) | |
| 84204577 | 610 | ksignal(aiocbe->userproc, SIGPIPE); |
| 984263bc MD |
611 | } |
| 612 | ||
| 613 | cnt -= auio.uio_resid; | |
| 614 | cb->_aiocb_private.error = error; | |
| 615 | cb->_aiocb_private.status = cnt; | |
| 616 | } | |
| 617 | ||
| 618 | /* | |
| 619 | * The AIO daemon, most of the actual work is done in aio_process, | |
| 620 | * but the setup (and address space mgmt) is done in this routine. | |
| 621 | */ | |
| 622 | static void | |
| 91bd9c1e | 623 | aio_daemon(void *uproc, struct trapframe *frame) |
| 984263bc | 624 | { |
| 984263bc MD |
625 | struct aio_liojob *lj; |
| 626 | struct aiocb *cb; | |
| 627 | struct aiocblist *aiocbe; | |
| 628 | struct aioproclist *aiop; | |
| 629 | struct kaioinfo *ki; | |
| 287ebb09 MD |
630 | struct proc *mycp, *userp; |
| 631 | struct vmspace *curvm; | |
| 632 | struct lwp *mylwp; | |
| e9a372eb | 633 | struct ucred *cr; |
| 984263bc | 634 | |
| 2b0bd8aa MD |
635 | /* |
| 636 | * mplock not held on entry but we aren't mpsafe yet. | |
| 637 | */ | |
| 638 | get_mplock(); | |
| 639 | ||
| 287ebb09 MD |
640 | mylwp = curthread->td_lwp; |
| 641 | mycp = mylwp->lwp_proc; | |
| 984263bc MD |
642 | |
| 643 | if (mycp->p_textvp) { | |
| 644 | vrele(mycp->p_textvp); | |
| 645 | mycp->p_textvp = NULL; | |
| 646 | } | |
| 647 | ||
| 648 | /* | |
| 649 | * Allocate and ready the aio control info. There is one aiop structure | |
| 650 | * per daemon. | |
| 651 | */ | |
| 5a26d050 | 652 | aiop = objcache_get(aiop_oc, M_WAITOK); |
| 984263bc MD |
653 | aiop->aioproc = mycp; |
| 654 | aiop->aioprocflags |= AIOP_FREE; | |
| 655 | ||
| e43a034f | 656 | crit_enter(); |
| 984263bc MD |
657 | |
| 658 | /* | |
| 659 | * Place thread (lightweight process) onto the AIO free thread list. | |
| 660 | */ | |
| 661 | if (TAILQ_EMPTY(&aio_freeproc)) | |
| 662 | wakeup(&aio_freeproc); | |
| 663 | TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); | |
| 664 | ||
| e43a034f | 665 | crit_exit(); |
| 984263bc MD |
666 | |
| 667 | /* Make up a name for the daemon. */ | |
| 668 | strcpy(mycp->p_comm, "aiod"); | |
| 669 | ||
| 670 | /* | |
| 671 | * Get rid of our current filedescriptors. AIOD's don't need any | |
| 672 | * filedescriptors, except as temporarily inherited from the client. | |
| 673 | * Credentials are also cloned, and made equivalent to "root". | |
| 674 | */ | |
| 0a4a9c77 | 675 | fdfree(mycp, NULL); |
| e9a372eb MD |
676 | cr = cratom(&mycp->p_ucred); |
| 677 | cr->cr_uid = 0; | |
| 6eedb489 | 678 | uireplace(&cr->cr_uidinfo, uifind(0)); |
| e9a372eb MD |
679 | cr->cr_ngroups = 1; |
| 680 | cr->cr_groups[0] = 1; | |
| 984263bc MD |
681 | |
| 682 | /* The daemon resides in its own pgrp. */ | |
| 683 | enterpgrp(mycp, mycp->p_pid, 1); | |
| 684 | ||
| 685 | /* Mark special process type. */ | |
| 686 | mycp->p_flag |= P_SYSTEM | P_KTHREADP; | |
| 687 | ||
| 688 | /* | |
| 689 | * Wakeup parent process. (Parent sleeps to keep from blasting away | |
| 690 | * and creating too many daemons.) | |
| 691 | */ | |
| 692 | wakeup(mycp); | |
| 287ebb09 | 693 | curvm = NULL; |
| 984263bc MD |
694 | |
| 695 | for (;;) { | |
| 696 | /* | |
| 984263bc MD |
697 | * Take daemon off of free queue |
| 698 | */ | |
| 699 | if (aiop->aioprocflags & AIOP_FREE) { | |
| e43a034f | 700 | crit_enter(); |
| 984263bc MD |
701 | TAILQ_REMOVE(&aio_freeproc, aiop, list); |
| 702 | TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); | |
| 703 | aiop->aioprocflags &= ~AIOP_FREE; | |
| e43a034f | 704 | crit_exit(); |
| 984263bc MD |
705 | } |
| 706 | aiop->aioprocflags &= ~AIOP_SCHED; | |
| 707 | ||
| 708 | /* | |
| 709 | * Check for jobs. | |
| 710 | */ | |
| 711 | while ((aiocbe = aio_selectjob(aiop)) != NULL) { | |
| 712 | cb = &aiocbe->uaiocb; | |
| 713 | userp = aiocbe->userproc; | |
| 714 | ||
| 715 | aiocbe->jobstate = JOBST_JOBRUNNING; | |
| 716 | ||
| 717 | /* | |
| 718 | * Connect to process address space for user program. | |
| 719 | */ | |
| 287ebb09 MD |
720 | if (curvm != userp->p_vmspace) { |
| 721 | pmap_setlwpvm(mylwp, userp->p_vmspace); | |
| 722 | if (curvm) | |
| 723 | sysref_put(&curvm->vm_sysref); | |
| 724 | curvm = userp->p_vmspace; | |
| 725 | sysref_get(&curvm->vm_sysref); | |
| 984263bc MD |
726 | } |
| 727 | ||
| 728 | ki = userp->p_aioinfo; | |
| 729 | lj = aiocbe->lio; | |
| 730 | ||
| 731 | /* Account for currently active jobs. */ | |
| 732 | ki->kaio_active_count++; | |
| 733 | ||
| 734 | /* Do the I/O function. */ | |
| 735 | aio_process(aiocbe); | |
| 736 | ||
| 737 | /* Decrement the active job count. */ | |
| 738 | ki->kaio_active_count--; | |
| 739 | ||
| 740 | /* | |
| 741 | * Increment the completion count for wakeup/signal | |
| 742 | * comparisons. | |
| 743 | */ | |
| 744 | aiocbe->jobflags |= AIOCBLIST_DONE; | |
| 745 | ki->kaio_queue_finished_count++; | |
| 746 | if (lj) | |
| 747 | lj->lioj_queue_finished_count++; | |
| 748 | if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags | |
| 749 | & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) { | |
| 750 | ki->kaio_flags &= ~KAIO_WAKEUP; | |
| 751 | wakeup(userp); | |
| 752 | } | |
| 753 | ||
| e43a034f | 754 | crit_enter(); |
| 984263bc MD |
755 | if (lj && (lj->lioj_flags & |
| 756 | (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) { | |
| 757 | if ((lj->lioj_queue_finished_count == | |
| 758 | lj->lioj_queue_count) && | |
| 759 | (lj->lioj_buffer_finished_count == | |
| 760 | lj->lioj_buffer_count)) { | |
| 84204577 | 761 | ksignal(userp, |
| 984263bc MD |
762 | lj->lioj_signal.sigev_signo); |
| 763 | lj->lioj_flags |= | |
| 764 | LIOJ_SIGNAL_POSTED; | |
| 765 | } | |
| 766 | } | |
| e43a034f | 767 | crit_exit(); |
| 984263bc MD |
768 | |
| 769 | aiocbe->jobstate = JOBST_JOBFINISHED; | |
| 770 | ||
| e43a034f | 771 | crit_enter(); |
| 984263bc MD |
772 | TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist); |
| 773 | TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, plist); | |
| e43a034f | 774 | crit_exit(); |
| 984263bc MD |
775 | KNOTE(&aiocbe->klist, 0); |
| 776 | ||
| 777 | if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) { | |
| 778 | wakeup(aiocbe); | |
| 779 | aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN; | |
| 780 | } | |
| 781 | ||
| 782 | if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) { | |
| 84204577 | 783 | ksignal(userp, cb->aio_sigevent.sigev_signo); |
| 984263bc MD |
784 | } |
| 785 | } | |
| 786 | ||
| 787 | /* | |
| 788 | * Disconnect from user address space. | |
| 789 | */ | |
| 287ebb09 | 790 | if (curvm) { |
| e3161323 | 791 | /* swap our original address space back in */ |
| 287ebb09 MD |
792 | pmap_setlwpvm(mylwp, mycp->p_vmspace); |
| 793 | sysref_put(&curvm->vm_sysref); | |
| 794 | curvm = NULL; | |
| 984263bc MD |
795 | } |
| 796 | ||
| 797 | /* | |
| 798 | * If we are the first to be put onto the free queue, wakeup | |
| 799 | * anyone waiting for a daemon. | |
| 800 | */ | |
| e43a034f | 801 | crit_enter(); |
| 984263bc MD |
802 | TAILQ_REMOVE(&aio_activeproc, aiop, list); |
| 803 | if (TAILQ_EMPTY(&aio_freeproc)) | |
| 804 | wakeup(&aio_freeproc); | |
| 805 | TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); | |
| 806 | aiop->aioprocflags |= AIOP_FREE; | |
| e43a034f | 807 | crit_exit(); |
| 984263bc MD |
808 | |
| 809 | /* | |
| 810 | * If daemon is inactive for a long time, allow it to exit, | |
| 811 | * thereby freeing resources. | |
| 812 | */ | |
| 813 | if (((aiop->aioprocflags & AIOP_SCHED) == 0) && tsleep(mycp, | |
| 377d4740 | 814 | 0, "aiordy", aiod_lifetime)) { |
| e43a034f | 815 | crit_enter(); |
| 984263bc MD |
816 | if (TAILQ_EMPTY(&aio_jobs)) { |
| 817 | if ((aiop->aioprocflags & AIOP_FREE) && | |
| 818 | (num_aio_procs > target_aio_procs)) { | |
| 819 | TAILQ_REMOVE(&aio_freeproc, aiop, list); | |
| e43a034f | 820 | crit_exit(); |
| 5a26d050 | 821 | objcache_put(aiop_oc, aiop); |
| 984263bc MD |
822 | num_aio_procs--; |
| 823 | #ifdef DIAGNOSTIC | |
| e3161323 | 824 | if (mycp->p_vmspace->vm_sysref.refcnt <= 1) { |
| 6ea70f76 | 825 | kprintf("AIOD: bad vm refcnt for" |
| 984263bc | 826 | " exiting daemon: %d\n", |
| e3161323 | 827 | mycp->p_vmspace->vm_sysref.refcnt); |
| 984263bc MD |
828 | } |
| 829 | #endif | |
| 7b95be2a | 830 | exit1(0); |
| 984263bc MD |
831 | } |
| 832 | } | |
| e43a034f | 833 | crit_exit(); |
| 984263bc MD |
834 | } |
| 835 | } | |
| 836 | } | |
| 837 | ||
| 838 | /* | |
| 839 | * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The | |
| 840 | * AIO daemon modifies its environment itself. | |
| 841 | */ | |
| 842 | static int | |
| c972a82f | 843 | aio_newproc(void) |
| 984263bc MD |
844 | { |
| 845 | int error; | |
| bb3cd951 | 846 | struct lwp *lp, *nlp; |
| 553ea3c8 | 847 | struct proc *np; |
| 984263bc | 848 | |
| 08f2f1bb | 849 | lp = &lwp0; |
| 553ea3c8 | 850 | error = fork1(lp, RFPROC|RFMEM|RFNOWAIT, &np); |
| 984263bc MD |
851 | if (error) |
| 852 | return error; | |
| 08f2f1bb | 853 | nlp = ONLY_LWP_IN_PROC(np); |
| bb3cd951 | 854 | cpu_set_fork_handler(nlp, aio_daemon, curproc); |
| 553ea3c8 | 855 | start_forked_proc(lp, np); |
| 984263bc MD |
856 | |
| 857 | /* | |
| 858 | * Wait until daemon is started, but continue on just in case to | |
| 859 | * handle error conditions. | |
| 860 | */ | |
| 377d4740 | 861 | error = tsleep(np, 0, "aiosta", aiod_timeout); |
| 984263bc MD |
862 | num_aio_procs++; |
| 863 | ||
| 864 | return error; | |
| 865 | } | |
| 866 | ||
| 867 | /* | |
| 868 | * Try the high-performance, low-overhead physio method for eligible | |
| 869 | * VCHR devices. This method doesn't use an aio helper thread, and | |
| 870 | * thus has very low overhead. | |
| 871 | * | |
| 872 | * Assumes that the caller, _aio_aqueue(), has incremented the file | |
| 873 | * structure's reference count, preventing its deallocation for the | |
| 874 | * duration of this call. | |
| 875 | */ | |
| 876 | static int | |
| 877 | aio_qphysio(struct proc *p, struct aiocblist *aiocbe) | |
| 878 | { | |
| 879 | int error; | |
| 880 | struct aiocb *cb; | |
| 881 | struct file *fp; | |
| 882 | struct buf *bp; | |
| 883 | struct vnode *vp; | |
| 884 | struct kaioinfo *ki; | |
| 885 | struct aio_liojob *lj; | |
| 984263bc MD |
886 | int notify; |
| 887 | ||
| 888 | cb = &aiocbe->uaiocb; | |
| 889 | fp = aiocbe->fd_file; | |
| 890 | ||
| 891 | if (fp->f_type != DTYPE_VNODE) | |
| 892 | return (-1); | |
| 893 | ||
| 894 | vp = (struct vnode *)fp->f_data; | |
| 895 | ||
| 896 | /* | |
| 897 | * If its not a disk, we don't want to return a positive error. | |
| 898 | * It causes the aio code to not fall through to try the thread | |
| 899 | * way when you're talking to a regular file. | |
| 900 | */ | |
| 901 | if (!vn_isdisk(vp, &error)) { | |
| 902 | if (error == ENOTBLK) | |
| 903 | return (-1); | |
| 904 | else | |
| 905 | return (error); | |
| 906 | } | |
| 907 | ||
| 908 | if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys) | |
| 909 | return (-1); | |
| 910 | ||
| 911 | if (cb->aio_nbytes > | |
| 912 | MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK)) | |
| 913 | return (-1); | |
| 914 | ||
| 915 | ki = p->p_aioinfo; | |
| 916 | if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) | |
| 917 | return (-1); | |
| 918 | ||
| 919 | ki->kaio_buffer_count++; | |
| 920 | ||
| 921 | lj = aiocbe->lio; | |
| 922 | if (lj) | |
| 923 | lj->lioj_buffer_count++; | |
| 924 | ||
| 925 | /* Create and build a buffer header for a transfer. */ | |
| 9a82e536 | 926 | bp = getpbuf_kva(NULL); |
| 984263bc MD |
927 | BUF_KERNPROC(bp); |
| 928 | ||
| 929 | /* | |
| 930 | * Get a copy of the kva from the physical buffer. | |
| 931 | */ | |
| 81b5c339 | 932 | bp->b_bio1.bio_caller_info1.ptr = p; |
| 984263bc MD |
933 | error = 0; |
| 934 | ||
| 10f3fee5 MD |
935 | bp->b_cmd = (cb->aio_lio_opcode == LIO_WRITE) ? |
| 936 | BUF_CMD_WRITE : BUF_CMD_READ; | |
| 81b5c339 | 937 | bp->b_bio1.bio_done = aio_physwakeup; |
| ae8e83e6 | 938 | bp->b_bio1.bio_flags |= BIO_SYNC; |
| 54078292 | 939 | bp->b_bio1.bio_offset = cb->aio_offset; |
| 984263bc MD |
940 | |
| 941 | /* Bring buffer into kernel space. */ | |
| 3591bbc6 | 942 | if (vmapbuf(bp, __DEVOLATILE(char *, cb->aio_buf), cb->aio_nbytes) < 0) { |
| 984263bc MD |
943 | error = EFAULT; |
| 944 | goto doerror; | |
| 945 | } | |
| 946 | ||
| e43a034f MD |
947 | crit_enter(); |
| 948 | ||
| 984263bc | 949 | aiocbe->bp = bp; |
| 81b5c339 | 950 | bp->b_bio1.bio_caller_info2.ptr = aiocbe; |
| 984263bc MD |
951 | TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list); |
| 952 | TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist); | |
| 953 | aiocbe->jobstate = JOBST_JOBQBUF; | |
| 954 | cb->_aiocb_private.status = cb->aio_nbytes; | |
| 955 | num_buf_aio++; | |
| 956 | bp->b_error = 0; | |
| 957 | ||
| e43a034f | 958 | crit_exit(); |
| 984263bc | 959 | |
| c34665ce MD |
960 | /* |
| 961 | * Perform the transfer. vn_strategy must be used even though we | |
| 962 | * know we have a device in order to deal with requests which exceed | |
| 963 | * device DMA limitations. | |
| 964 | */ | |
| 965 | vn_strategy(vp, &bp->b_bio1); | |
| 984263bc MD |
966 | |
| 967 | notify = 0; | |
| e43a034f | 968 | crit_enter(); |
| 984263bc | 969 | |
| ae8e83e6 | 970 | #if 0 |
| 984263bc MD |
971 | /* |
| 972 | * If we had an error invoking the request, or an error in processing | |
| 973 | * the request before we have returned, we process it as an error in | |
| 974 | * transfer. Note that such an I/O error is not indicated immediately, | |
| 975 | * but is returned using the aio_error mechanism. In this case, | |
| 976 | * aio_suspend will return immediately. | |
| 977 | */ | |
| 978 | if (bp->b_error || (bp->b_flags & B_ERROR)) { | |
| 979 | struct aiocb *job = aiocbe->uuaiocb; | |
| 980 | ||
| 981 | aiocbe->uaiocb._aiocb_private.status = 0; | |
| 982 | suword(&job->_aiocb_private.status, 0); | |
| 983 | aiocbe->uaiocb._aiocb_private.error = bp->b_error; | |
| 984 | suword(&job->_aiocb_private.error, bp->b_error); | |
| 985 | ||
| 986 | ki->kaio_buffer_finished_count++; | |
| 987 | ||
| 988 | if (aiocbe->jobstate != JOBST_JOBBFINISHED) { | |
| 989 | aiocbe->jobstate = JOBST_JOBBFINISHED; | |
| 990 | aiocbe->jobflags |= AIOCBLIST_DONE; | |
| 991 | TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); | |
| 992 | TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); | |
| 993 | TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); | |
| 994 | notify = 1; | |
| 995 | } | |
| 996 | } | |
| ae8e83e6 | 997 | #endif |
| e43a034f | 998 | crit_exit(); |
| 984263bc MD |
999 | if (notify) |
| 1000 | KNOTE(&aiocbe->klist, 0); | |
| 1001 | return 0; | |
| 1002 | ||
| 1003 | doerror: | |
| 1004 | ki->kaio_buffer_count--; | |
| 1005 | if (lj) | |
| 1006 | lj->lioj_buffer_count--; | |
| 1007 | aiocbe->bp = NULL; | |
| 1008 | relpbuf(bp, NULL); | |
| 1009 | return error; | |
| 1010 | } | |
| 1011 | ||
| 1012 | /* | |
| 1013 | * This waits/tests physio completion. | |
| 1014 | */ | |
| 1015 | static int | |
| 1016 | aio_fphysio(struct aiocblist *iocb) | |
| 1017 | { | |
| 984263bc MD |
1018 | struct buf *bp; |
| 1019 | int error; | |
| 1020 | ||
| 1021 | bp = iocb->bp; | |
| 1022 | ||
| ae8e83e6 | 1023 | error = biowait_timeout(&bp->b_bio1, "physstr", aiod_timeout); |
| 1a493ad9 SW |
1024 | if (error == EWOULDBLOCK) |
| 1025 | return EINPROGRESS; | |
| 984263bc MD |
1026 | |
| 1027 | /* Release mapping into kernel space. */ | |
| 1028 | vunmapbuf(bp); | |
| 1029 | iocb->bp = 0; | |
| 1030 | ||
| 1031 | error = 0; | |
| 1032 | ||
| 1033 | /* Check for an error. */ | |
| 1034 | if (bp->b_flags & B_ERROR) | |
| 1035 | error = bp->b_error; | |
| 1036 | ||
| 1037 | relpbuf(bp, NULL); | |
| 1038 | return (error); | |
| 1039 | } | |
| 1040 | #endif /* VFS_AIO */ | |
| 1041 | ||
/*
 * Wake up aio requests that may be serviceable now.
 *
 * ssb selects the direction: the socket's send buffer maps to LIO_WRITE
 * jobs, anything else to LIO_READ jobs.  SSB_AIO is cleared on the
 * corresponding buffer, every job of that direction parked on the
 * socket is moved onto the global and per-process run queues, and up to
 * one free daemon per moved job is woken.
 *
 * Compiles to a no-op when VFS_AIO is not configured.
 */
void
aio_swake(struct socket *so, struct signalsockbuf *ssb)
{
#ifndef VFS_AIO
	return;
#else
	struct aiocblist *job, *next;
	struct aioproclist *aiop;
	struct kaioinfo *ki;
	int opcode;
	int nmoved = 0;

	if (ssb == &so->so_snd) {
		opcode = LIO_WRITE;
		atomic_clear_int(&so->so_snd.ssb_flags, SSB_AIO);
	} else {
		opcode = LIO_READ;
		atomic_clear_int(&so->so_rcv.ssb_flags, SSB_AIO);
	}

	/*
	 * Jobs are unlinked while we walk the queue, so the successor is
	 * fetched before the current entry is touched.
	 */
	for (job = TAILQ_FIRST(&so->so_aiojobq); job != NULL; job = next) {
		next = TAILQ_NEXT(job, list);
		if (job->uaiocb.aio_lio_opcode != opcode)
			continue;

		ki = job->userproc->p_aioinfo;
		TAILQ_REMOVE(&so->so_aiojobq, job, list);
		TAILQ_REMOVE(&ki->kaio_sockqueue, job, plist);
		TAILQ_INSERT_TAIL(&aio_jobs, job, list);
		TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist);
		nmoved++;
		if (job->jobstate != JOBST_JOBQGLOBAL)
			panic("invalid queue value");
	}

	/* Activate one free daemon, if there is one, per job moved. */
	for (; nmoved > 0; nmoved--) {
		aiop = TAILQ_FIRST(&aio_freeproc);
		if (aiop == NULL)
			continue;
		TAILQ_REMOVE(&aio_freeproc, aiop, list);
		TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
		aiop->aioprocflags &= ~AIOP_FREE;
		wakeup(aiop->aioproc);
	}
#endif /* VFS_AIO */
}
| 1090 | ||
| 1091 | #ifdef VFS_AIO | |
| 1092 | /* | |
| 1093 | * Queue a new AIO request. Choosing either the threaded or direct physio VCHR | |
| 1094 | * technique is done in this code. | |
| 1095 | */ | |
| 1096 | static int | |
| 41c20dac | 1097 | _aio_aqueue(struct aiocb *job, struct aio_liojob *lj, int type) |
| 984263bc | 1098 | { |
| 7b95be2a | 1099 | struct proc *p = curproc; |
| 984263bc MD |
1100 | struct file *fp; |
| 1101 | unsigned int fd; | |
| 1102 | struct socket *so; | |
| 984263bc MD |
1103 | int error; |
| 1104 | int opcode, user_opcode; | |
| 1105 | struct aiocblist *aiocbe; | |
| 1106 | struct aioproclist *aiop; | |
| 1107 | struct kaioinfo *ki; | |
| 1108 | struct kevent kev; | |
| 1109 | struct kqueue *kq; | |
| 1110 | struct file *kq_fp; | |
| 3919ced0 | 1111 | int fflags; |
| 984263bc MD |
1112 | |
| 1113 | if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL) | |
| 1114 | TAILQ_REMOVE(&aio_freejobs, aiocbe, list); | |
| 1115 | else | |
| 5a26d050 | 1116 | aiocbe = objcache_get(aiocb_oc, M_WAITOK); |
| 984263bc MD |
1117 | |
| 1118 | aiocbe->inputcharge = 0; | |
| 1119 | aiocbe->outputcharge = 0; | |
| a5eb27b6 | 1120 | callout_init(&aiocbe->timeout); |
| 984263bc MD |
1121 | SLIST_INIT(&aiocbe->klist); |
| 1122 | ||
| 1123 | suword(&job->_aiocb_private.status, -1); | |
| 1124 | suword(&job->_aiocb_private.error, 0); | |
| 1125 | suword(&job->_aiocb_private.kernelinfo, -1); | |
| 1126 | ||
| 1127 | error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb)); | |
| 1128 | if (error) { | |
| 1129 | suword(&job->_aiocb_private.error, error); | |
| 1130 | TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); | |
| 1131 | return error; | |
| 1132 | } | |
| 1133 | if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL && | |
| 1134 | !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) { | |
| 1135 | TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); | |
| 1136 | return EINVAL; | |
| 1137 | } | |
| 1138 | ||
| 1139 | /* Save userspace address of the job info. */ | |
| 1140 | aiocbe->uuaiocb = job; | |
| 1141 | ||
| 1142 | /* Get the opcode. */ | |
| 1143 | user_opcode = aiocbe->uaiocb.aio_lio_opcode; | |
| 1144 | if (type != LIO_NOP) | |
| 1145 | aiocbe->uaiocb.aio_lio_opcode = type; | |
| 1146 | opcode = aiocbe->uaiocb.aio_lio_opcode; | |
| 1147 | ||
| 984263bc MD |
1148 | /* |
| 1149 | * Range check file descriptor. | |
| 1150 | */ | |
| 3919ced0 | 1151 | fflags = (opcode == LIO_WRITE) ? FWRITE : FREAD; |
| 984263bc | 1152 | fd = aiocbe->uaiocb.aio_fildes; |
| 3919ced0 MD |
1153 | fp = holdfp(p->p_fd, fd, fflags); |
| 1154 | if (fp == NULL) { | |
| 984263bc MD |
1155 | TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); |
| 1156 | if (type == 0) | |
| 1157 | suword(&job->_aiocb_private.error, EBADF); | |
| 1158 | return EBADF; | |
| 1159 | } | |
| 1160 | ||
| 3919ced0 | 1161 | aiocbe->fd_file = fp; |
| 984263bc MD |
1162 | |
| 1163 | if (aiocbe->uaiocb.aio_offset == -1LL) { | |
| 1164 | error = EINVAL; | |
| 1165 | goto aqueue_fail; | |
| 1166 | } | |
| 1167 | error = suword(&job->_aiocb_private.kernelinfo, jobrefid); | |
| 1168 | if (error) { | |
| 1169 | error = EINVAL; | |
| 1170 | goto aqueue_fail; | |
| 1171 | } | |
| 1172 | aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid; | |
| 1173 | if (jobrefid == LONG_MAX) | |
| 1174 | jobrefid = 1; | |
| 1175 | else | |
| 1176 | jobrefid++; | |
| 1177 | ||
| 1178 | if (opcode == LIO_NOP) { | |
| 9f87144f | 1179 | fdrop(fp); |
| 984263bc MD |
1180 | TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); |
| 1181 | if (type == 0) { | |
| 1182 | suword(&job->_aiocb_private.error, 0); | |
| 1183 | suword(&job->_aiocb_private.status, 0); | |
| 1184 | suword(&job->_aiocb_private.kernelinfo, 0); | |
| 1185 | } | |
| 1186 | return 0; | |
| 1187 | } | |
| 1188 | if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) { | |
| 1189 | if (type == 0) | |
| 1190 | suword(&job->_aiocb_private.status, 0); | |
| 1191 | error = EINVAL; | |
| 1192 | goto aqueue_fail; | |
| 1193 | } | |
| 1194 | ||
| 1195 | if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) { | |
| 1196 | kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue; | |
| 36934016 | 1197 | kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr; |
| 984263bc MD |
1198 | } |
| 1199 | else { | |
| 1200 | /* | |
| 1201 | * This method for requesting kevent-based notification won't | |
| 1202 | * work on the alpha, since we're passing in a pointer | |
| 1203 | * via aio_lio_opcode, which is an int. Use the SIGEV_KEVENT- | |
| 1204 | * based method instead. | |
| 1205 | */ | |
| 1206 | if (user_opcode == LIO_NOP || user_opcode == LIO_READ || | |
| 1207 | user_opcode == LIO_WRITE) | |
| 1208 | goto no_kqueue; | |
| 1209 | ||
| 1210 | error = copyin((struct kevent *)(uintptr_t)user_opcode, | |
| 1211 | &kev, sizeof(kev)); | |
| 1212 | if (error) | |
| 1213 | goto aqueue_fail; | |
| 1214 | } | |
| 3919ced0 MD |
1215 | kq_fp = holdfp(p->p_fd, (int)kev.ident, -1); |
| 1216 | if (kq_fp == NULL || kq_fp->f_type != DTYPE_KQUEUE) { | |
| 1217 | if (kq_fp) { | |
| 1218 | fdrop(kq_fp); | |
| 1219 | kq_fp = NULL; | |
| 1220 | } | |
| 984263bc MD |
1221 | error = EBADF; |
| 1222 | goto aqueue_fail; | |
| 1223 | } | |
| 1224 | kq = (struct kqueue *)kq_fp->f_data; | |
| 1225 | kev.ident = (uintptr_t)aiocbe->uuaiocb; | |
| 1226 | kev.filter = EVFILT_AIO; | |
| 1227 | kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; | |
| 1228 | kev.data = (intptr_t)aiocbe; | |
| ccafe911 | 1229 | error = kqueue_register(kq, &kev); |
| 3919ced0 | 1230 | fdrop(kq_fp); |
| 984263bc MD |
1231 | aqueue_fail: |
| 1232 | if (error) { | |
| 9f87144f | 1233 | fdrop(fp); |
| 984263bc MD |
1234 | TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); |
| 1235 | if (type == 0) | |
| 1236 | suword(&job->_aiocb_private.error, error); | |
| 1237 | goto done; | |
| 1238 | } | |
| 1239 | no_kqueue: | |
| 1240 | ||
| 1241 | suword(&job->_aiocb_private.error, EINPROGRESS); | |
| 1242 | aiocbe->uaiocb._aiocb_private.error = EINPROGRESS; | |
| 1243 | aiocbe->userproc = p; | |
| 1244 | aiocbe->jobflags = 0; | |
| 1245 | aiocbe->lio = lj; | |
| 1246 | ki = p->p_aioinfo; | |
| 1247 | ||
| 1248 | if (fp->f_type == DTYPE_SOCKET) { | |
| 1249 | /* | |
| 1250 | * Alternate queueing for socket ops: Reach down into the | |
| 1251 | * descriptor to get the socket data. Then check to see if the | |
| 1252 | * socket is ready to be read or written (based on the requested | |
| 1253 | * operation). | |
| 1254 | * | |
| 1255 | * If it is not ready for io, then queue the aiocbe on the | |
| 6d49aa6f | 1256 | * socket, and set the flags so we get a call when ssb_notify() |
| 984263bc MD |
1257 | * happens. |
| 1258 | */ | |
| 1259 | so = (struct socket *)fp->f_data; | |
| e43a034f | 1260 | crit_enter(); |
| 984263bc MD |
1261 | if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode == |
| 1262 | LIO_WRITE) && (!sowriteable(so)))) { | |
| 1263 | TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list); | |
| 1264 | TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist); | |
| 1265 | if (opcode == LIO_READ) | |
| 14343ad3 | 1266 | atomic_set_int(&so->so_rcv.ssb_flags, SSB_AIO); |
| 984263bc | 1267 | else |
| 14343ad3 | 1268 | atomic_set_int(&so->so_snd.ssb_flags, SSB_AIO); |
| 984263bc MD |
1269 | aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */ |
| 1270 | ki->kaio_queue_count++; | |
| 1271 | num_queue_count++; | |
| e43a034f | 1272 | crit_exit(); |
| 984263bc MD |
1273 | error = 0; |
| 1274 | goto done; | |
| 1275 | } | |
| e43a034f | 1276 | crit_exit(); |
| 984263bc MD |
1277 | } |
| 1278 | ||
| 1279 | if ((error = aio_qphysio(p, aiocbe)) == 0) | |
| 1280 | goto done; | |
| 1281 | if (error > 0) { | |
| 1282 | suword(&job->_aiocb_private.status, 0); | |
| 1283 | aiocbe->uaiocb._aiocb_private.error = error; | |
| 1284 | suword(&job->_aiocb_private.error, error); | |
| 1285 | goto done; | |
| 1286 | } | |
| 1287 | ||
| 1288 | /* No buffer for daemon I/O. */ | |
| 1289 | aiocbe->bp = NULL; | |
| 1290 | ||
| 1291 | ki->kaio_queue_count++; | |
| 1292 | if (lj) | |
| 1293 | lj->lioj_queue_count++; | |
| e43a034f | 1294 | crit_enter(); |
| 984263bc MD |
1295 | TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist); |
| 1296 | TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list); | |
| e43a034f | 1297 | crit_exit(); |
| 984263bc MD |
1298 | aiocbe->jobstate = JOBST_JOBQGLOBAL; |
| 1299 | ||
| 1300 | num_queue_count++; | |
| 1301 | error = 0; | |
| 1302 | ||
| 1303 | /* | |
| 1304 | * If we don't have a free AIO process, and we are below our quota, then | |
| 1305 | * start one. Otherwise, depend on the subsequent I/O completions to | |
| d0d91865 | 1306 | * pick-up this job. If we don't successfully create the new process |
| 984263bc MD |
1307 | * (thread) due to resource issues, we return an error for now (EAGAIN), |
| 1308 | * which is likely not the correct thing to do. | |
| 1309 | */ | |
| e43a034f | 1310 | crit_enter(); |
| 984263bc MD |
1311 | retryproc: |
| 1312 | if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { | |
| 1313 | TAILQ_REMOVE(&aio_freeproc, aiop, list); | |
| 1314 | TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); | |
| 1315 | aiop->aioprocflags &= ~AIOP_FREE; | |
| 1316 | wakeup(aiop->aioproc); | |
| 1317 | } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) && | |
| 1318 | ((ki->kaio_active_count + num_aio_resv_start) < | |
| 1319 | ki->kaio_maxactive_count)) { | |
| 1320 | num_aio_resv_start++; | |
| 1321 | if ((error = aio_newproc()) == 0) { | |
| 1322 | num_aio_resv_start--; | |
| 1323 | goto retryproc; | |
| 1324 | } | |
| 1325 | num_aio_resv_start--; | |
| 1326 | } | |
| e43a034f | 1327 | crit_exit(); |
| 984263bc MD |
1328 | done: |
| 1329 | return error; | |
| 1330 | } | |
| 1331 | ||
| 1332 | /* | |
| 1333 | * This routine queues an AIO request, checking for quotas. | |
| 1334 | */ | |
| 1335 | static int | |
| 41c20dac | 1336 | aio_aqueue(struct aiocb *job, int type) |
| 984263bc | 1337 | { |
| 7b95be2a | 1338 | struct proc *p = curproc; |
| 984263bc MD |
1339 | struct kaioinfo *ki; |
| 1340 | ||
| 1341 | if (p->p_aioinfo == NULL) | |
| 1342 | aio_init_aioinfo(p); | |
| 1343 | ||
| 1344 | if (num_queue_count >= max_queue_count) | |
| 1345 | return EAGAIN; | |
| 1346 | ||
| 1347 | ki = p->p_aioinfo; | |
| 1348 | if (ki->kaio_queue_count >= ki->kaio_qallowed_count) | |
| 1349 | return EAGAIN; | |
| 1350 | ||
| 41c20dac | 1351 | return _aio_aqueue(job, NULL, type); |
| 984263bc MD |
1352 | } |
| 1353 | #endif /* VFS_AIO */ | |
| 1354 | ||
| 1355 | /* | |
| 1356 | * Support the aio_return system call, as a side-effect, kernel resources are | |
| 1357 | * released. | |
| 3919ced0 MD |
1358 | * |
| 1359 | * MPALMOSTSAFE | |
| 984263bc MD |
1360 | */ |
| 1361 | int | |
| 753fd850 | 1362 | sys_aio_return(struct aio_return_args *uap) |
| 984263bc MD |
1363 | { |
| 1364 | #ifndef VFS_AIO | |
| 3919ced0 | 1365 | return (ENOSYS); |
| 984263bc | 1366 | #else |
| 41c20dac | 1367 | struct proc *p = curproc; |
| fde7ac71 | 1368 | struct lwp *lp = curthread->td_lwp; |
| 984263bc MD |
1369 | long jobref; |
| 1370 | struct aiocblist *cb, *ncb; | |
| 1371 | struct aiocb *ujob; | |
| 1372 | struct kaioinfo *ki; | |
| 3919ced0 | 1373 | int error; |
| 984263bc MD |
1374 | |
| 1375 | ki = p->p_aioinfo; | |
| 1376 | if (ki == NULL) | |
| 1377 | return EINVAL; | |
| 1378 | ||
| 1379 | ujob = uap->aiocbp; | |
| 1380 | ||
| 1381 | jobref = fuword(&ujob->_aiocb_private.kernelinfo); | |
| 1382 | if (jobref == -1 || jobref == 0) | |
| 1383 | return EINVAL; | |
| 1384 | ||
| 3919ced0 | 1385 | get_mplock(); |
| 984263bc MD |
1386 | TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { |
| 1387 | if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == | |
| 1388 | jobref) { | |
| 1389 | if (ujob == cb->uuaiocb) { | |
| c7114eea | 1390 | uap->sysmsg_result = |
| 984263bc | 1391 | cb->uaiocb._aiocb_private.status; |
| 3919ced0 | 1392 | } else { |
| c7114eea | 1393 | uap->sysmsg_result = EFAULT; |
| 3919ced0 | 1394 | } |
| 984263bc | 1395 | if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { |
| fde7ac71 | 1396 | lp->lwp_ru.ru_oublock += cb->outputcharge; |
| 984263bc MD |
1397 | cb->outputcharge = 0; |
| 1398 | } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { | |
| fde7ac71 | 1399 | lp->lwp_ru.ru_inblock += cb->inputcharge; |
| 984263bc MD |
1400 | cb->inputcharge = 0; |
| 1401 | } | |
| 1402 | aio_free_entry(cb); | |
| 3919ced0 MD |
1403 | error = 0; |
| 1404 | goto done; | |
| 984263bc MD |
1405 | } |
| 1406 | } | |
| e43a034f | 1407 | crit_enter(); |
| 984263bc MD |
1408 | for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) { |
| 1409 | ncb = TAILQ_NEXT(cb, plist); | |
| 1410 | if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) | |
| 1411 | == jobref) { | |
| e43a034f | 1412 | crit_exit(); |
| 984263bc | 1413 | if (ujob == cb->uuaiocb) { |
| c7114eea | 1414 | uap->sysmsg_result = |
| 984263bc | 1415 | cb->uaiocb._aiocb_private.status; |
| 3919ced0 | 1416 | } else { |
| c7114eea | 1417 | uap->sysmsg_result = EFAULT; |
| 3919ced0 | 1418 | } |
| 984263bc | 1419 | aio_free_entry(cb); |
| 3919ced0 MD |
1420 | error = 0; |
| 1421 | goto done; | |
| 984263bc MD |
1422 | } |
| 1423 | } | |
| e43a034f | 1424 | crit_exit(); |
| 3919ced0 MD |
1425 | error = EINVAL; |
| 1426 | done: | |
| 1427 | rel_mplock(); | |
| 1428 | return (error); | |
| 984263bc MD |
1429 | #endif /* VFS_AIO */ |
| 1430 | } | |
| 1431 | ||
| 1432 | /* | |
| 1433 | * Allow a process to wakeup when any of the I/O requests are completed. | |
| 3919ced0 MD |
1434 | * |
| 1435 | * MPALMOSTSAFE | |
| 984263bc MD |
1436 | */ |
| 1437 | int | |
| 753fd850 | 1438 | sys_aio_suspend(struct aio_suspend_args *uap) |
| 984263bc MD |
1439 | { |
| 1440 | #ifndef VFS_AIO | |
| 1441 | return ENOSYS; | |
| 1442 | #else | |
| 41c20dac | 1443 | struct proc *p = curproc; |
| 984263bc MD |
1444 | struct timeval atv; |
| 1445 | struct timespec ts; | |
| 1446 | struct aiocb *const *cbptr, *cbp; | |
| 1447 | struct kaioinfo *ki; | |
| 1448 | struct aiocblist *cb; | |
| 1449 | int i; | |
| 1450 | int njoblist; | |
| 831f78e5 | 1451 | int error, timo; |
| 984263bc MD |
1452 | long *ijoblist; |
| 1453 | struct aiocb **ujoblist; | |
| 1454 | ||
| 3919ced0 | 1455 | if ((u_int)uap->nent > AIO_LISTIO_MAX) |
| 984263bc MD |
1456 | return EINVAL; |
| 1457 | ||
| 1458 | timo = 0; | |
| 1459 | if (uap->timeout) { | |
| 1460 | /* Get timespec struct. */ | |
| 1461 | if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) | |
| 1462 | return error; | |
| 1463 | ||
| 1464 | if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) | |
| 1465 | return (EINVAL); | |
| 1466 | ||
| 1467 | TIMESPEC_TO_TIMEVAL(&atv, &ts); | |
| 1468 | if (itimerfix(&atv)) | |
| 1469 | return (EINVAL); | |
| a94976ad | 1470 | timo = tvtohz_high(&atv); |
| 984263bc MD |
1471 | } |
| 1472 | ||
| 1473 | ki = p->p_aioinfo; | |
| 1474 | if (ki == NULL) | |
| 1475 | return EAGAIN; | |
| 1476 | ||
| 3919ced0 MD |
1477 | get_mplock(); |
| 1478 | ||
| 984263bc | 1479 | njoblist = 0; |
| 5a26d050 SG |
1480 | ijoblist = objcache_get(aiol_oc, M_WAITOK); |
| 1481 | ujoblist = objcache_get(aiol_oc, M_WAITOK); | |
| 984263bc MD |
1482 | cbptr = uap->aiocbp; |
| 1483 | ||
| 1484 | for (i = 0; i < uap->nent; i++) { | |
| 1485 | cbp = (struct aiocb *)(intptr_t)fuword(&cbptr[i]); | |
| 1486 | if (cbp == 0) | |
| 1487 | continue; | |
| 1488 | ujoblist[njoblist] = cbp; | |
| 1489 | ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo); | |
| 1490 | njoblist++; | |
| 1491 | } | |
| 1492 | ||
| 1493 | if (njoblist == 0) { | |
| 5a26d050 SG |
1494 | objcache_put(aiol_oc, ijoblist); |
| 1495 | objcache_put(aiol_oc, ujoblist); | |
| 3919ced0 MD |
1496 | error = 0; |
| 1497 | goto done; | |
| 984263bc MD |
1498 | } |
| 1499 | ||
| 1500 | error = 0; | |
| 1501 | for (;;) { | |
| 1502 | TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { | |
| 1503 | for (i = 0; i < njoblist; i++) { | |
| 1504 | if (((intptr_t) | |
| 1505 | cb->uaiocb._aiocb_private.kernelinfo) == | |
| 1506 | ijoblist[i]) { | |
| 1507 | if (ujoblist[i] != cb->uuaiocb) | |
| 1508 | error = EINVAL; | |
| 5a26d050 SG |
1509 | objcache_put(aiol_oc, ijoblist); |
| 1510 | objcache_put(aiol_oc, ujoblist); | |
| 3919ced0 | 1511 | goto done; |
| 984263bc MD |
1512 | } |
| 1513 | } | |
| 1514 | } | |
| 1515 | ||
| e43a034f | 1516 | crit_enter(); |
| 984263bc MD |
1517 | for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = |
| 1518 | TAILQ_NEXT(cb, plist)) { | |
| 1519 | for (i = 0; i < njoblist; i++) { | |
| 1520 | if (((intptr_t) | |
| 1521 | cb->uaiocb._aiocb_private.kernelinfo) == | |
| 1522 | ijoblist[i]) { | |
| e43a034f | 1523 | crit_exit(); |
| 984263bc MD |
1524 | if (ujoblist[i] != cb->uuaiocb) |
| 1525 | error = EINVAL; | |
| 5a26d050 SG |
1526 | objcache_put(aiol_oc, ijoblist); |
| 1527 | objcache_put(aiol_oc, ujoblist); | |
| 3919ced0 | 1528 | goto done; |
| 984263bc MD |
1529 | } |
| 1530 | } | |
| 1531 | } | |
| 1532 | ||
| 1533 | ki->kaio_flags |= KAIO_WAKEUP; | |
| 377d4740 | 1534 | error = tsleep(p, PCATCH, "aiospn", timo); |
| e43a034f | 1535 | crit_exit(); |
| 984263bc MD |
1536 | |
| 1537 | if (error == ERESTART || error == EINTR) { | |
| 5a26d050 SG |
1538 | objcache_put(aiol_oc, ijoblist); |
| 1539 | objcache_put(aiol_oc, ujoblist); | |
| 3919ced0 MD |
1540 | error = EINTR; |
| 1541 | goto done; | |
| 984263bc | 1542 | } else if (error == EWOULDBLOCK) { |
| 5a26d050 SG |
1543 | objcache_put(aiol_oc, ijoblist); |
| 1544 | objcache_put(aiol_oc, ujoblist); | |
| 3919ced0 MD |
1545 | error = EAGAIN; |
| 1546 | goto done; | |
| 984263bc MD |
1547 | } |
| 1548 | } | |
| 1549 | ||
| 1550 | /* NOTREACHED */ | |
| 3919ced0 MD |
1551 | error = EINVAL; |
| 1552 | done: | |
| 1553 | rel_mplock(); | |
| 1554 | return (error); | |
| 984263bc MD |
1555 | #endif /* VFS_AIO */ |
| 1556 | } | |
| 1557 | ||
| 1558 | /* | |
| 1559 | * aio_cancel cancels any non-physio aio operations not currently in | |
| 1560 | * progress. | |
| 3919ced0 MD |
1561 | * |
| 1562 | * MPALMOSTSAFE | |
| 984263bc MD |
1563 | */ |
| 1564 | int | |
| 753fd850 | 1565 | sys_aio_cancel(struct aio_cancel_args *uap) |
| 984263bc MD |
1566 | { |
| 1567 | #ifndef VFS_AIO | |
| 1568 | return ENOSYS; | |
| 1569 | #else | |
| 41c20dac | 1570 | struct proc *p = curproc; |
| 984263bc MD |
1571 | struct kaioinfo *ki; |
| 1572 | struct aiocblist *cbe, *cbn; | |
| 1573 | struct file *fp; | |
| 984263bc MD |
1574 | struct socket *so; |
| 1575 | struct proc *po; | |
| e43a034f | 1576 | int error; |
| 984263bc MD |
1577 | int cancelled=0; |
| 1578 | int notcancelled=0; | |
| 1579 | struct vnode *vp; | |
| 1580 | ||
| 3919ced0 MD |
1581 | fp = holdfp(p->p_fd, uap->fd, -1); |
| 1582 | if (fp == NULL) | |
| 984263bc MD |
1583 | return (EBADF); |
| 1584 | ||
| 3919ced0 MD |
1585 | get_mplock(); |
| 1586 | ||
| 984263bc MD |
1587 | if (fp->f_type == DTYPE_VNODE) { |
| 1588 | vp = (struct vnode *)fp->f_data; | |
| 1589 | ||
| 1590 | if (vn_isdisk(vp,&error)) { | |
| c7114eea | 1591 | uap->sysmsg_result = AIO_NOTCANCELED; |
| 3919ced0 MD |
1592 | error = 0; |
| 1593 | goto done2; | |
| 984263bc MD |
1594 | } |
| 1595 | } else if (fp->f_type == DTYPE_SOCKET) { | |
| 1596 | so = (struct socket *)fp->f_data; | |
| 1597 | ||
| e43a034f | 1598 | crit_enter(); |
| 984263bc MD |
1599 | |
| 1600 | for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) { | |
| 1601 | cbn = TAILQ_NEXT(cbe, list); | |
| 1602 | if ((uap->aiocbp == NULL) || | |
| 1603 | (uap->aiocbp == cbe->uuaiocb) ) { | |
| 1604 | po = cbe->userproc; | |
| 1605 | ki = po->p_aioinfo; | |
| 1606 | TAILQ_REMOVE(&so->so_aiojobq, cbe, list); | |
| 1607 | TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist); | |
| 1608 | TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist); | |
| 1609 | if (ki->kaio_flags & KAIO_WAKEUP) { | |
| 1610 | wakeup(po); | |
| 1611 | } | |
| 1612 | cbe->jobstate = JOBST_JOBFINISHED; | |
| 1613 | cbe->uaiocb._aiocb_private.status=-1; | |
| 1614 | cbe->uaiocb._aiocb_private.error=ECANCELED; | |
| 1615 | cancelled++; | |
| 1616 | /* XXX cancelled, knote? */ | |
| 1617 | if (cbe->uaiocb.aio_sigevent.sigev_notify == | |
| 1618 | SIGEV_SIGNAL) | |
| 84204577 | 1619 | ksignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo); |
| 984263bc MD |
1620 | if (uap->aiocbp) |
| 1621 | break; | |
| 1622 | } | |
| 1623 | } | |
| e43a034f | 1624 | crit_exit(); |
| 984263bc MD |
1625 | |
| 1626 | if ((cancelled) && (uap->aiocbp)) { | |
| c7114eea | 1627 | uap->sysmsg_result = AIO_CANCELED; |
| 3919ced0 MD |
1628 | error = 0; |
| 1629 | goto done2; | |
| 984263bc MD |
1630 | } |
| 1631 | } | |
| 1632 | ki=p->p_aioinfo; | |
| 1633 | if (ki == NULL) | |
| 1634 | goto done; | |
| e43a034f | 1635 | crit_enter(); |
| 984263bc MD |
1636 | |
| 1637 | for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) { | |
| 1638 | cbn = TAILQ_NEXT(cbe, plist); | |
| 1639 | ||
| 1640 | if ((uap->fd == cbe->uaiocb.aio_fildes) && | |
| 1641 | ((uap->aiocbp == NULL ) || | |
| 1642 | (uap->aiocbp == cbe->uuaiocb))) { | |
| 1643 | ||
| 1644 | if (cbe->jobstate == JOBST_JOBQGLOBAL) { | |
| 1645 | TAILQ_REMOVE(&aio_jobs, cbe, list); | |
| 1646 | TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist); | |
| 1647 | TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, | |
| 1648 | plist); | |
| 1649 | cancelled++; | |
| 1650 | ki->kaio_queue_finished_count++; | |
| 1651 | cbe->jobstate = JOBST_JOBFINISHED; | |
| 1652 | cbe->uaiocb._aiocb_private.status = -1; | |
| 1653 | cbe->uaiocb._aiocb_private.error = ECANCELED; | |
| 1654 | /* XXX cancelled, knote? */ | |
| 1655 | if (cbe->uaiocb.aio_sigevent.sigev_notify == | |
| 1656 | SIGEV_SIGNAL) | |
| 84204577 | 1657 | ksignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo); |
| 984263bc MD |
1658 | } else { |
| 1659 | notcancelled++; | |
| 1660 | } | |
| 1661 | } | |
| 1662 | } | |
| e43a034f | 1663 | crit_exit(); |
| 984263bc | 1664 | done: |
| 3919ced0 | 1665 | if (notcancelled) |
| c7114eea | 1666 | uap->sysmsg_result = AIO_NOTCANCELED; |
| 3919ced0 | 1667 | else if (cancelled) |
| c7114eea | 1668 | uap->sysmsg_result = AIO_CANCELED; |
| 3919ced0 MD |
1669 | else |
| 1670 | uap->sysmsg_result = AIO_ALLDONE; | |
| 1671 | error = 0; | |
| 1672 | done2: | |
| 1673 | rel_mplock(); | |
| 1674 | fdrop(fp); | |
| 1675 | return error; | |
| 984263bc MD |
1676 | #endif /* VFS_AIO */ |
| 1677 | } | |
| 1678 | ||
| 1679 | /* | |
| 1680 | * aio_error is implemented in the kernel level for compatibility purposes only. | |
| 1681 | * For a user mode async implementation, it would be best to do it in a userland | |
| 1682 | * subroutine. | |
| 3919ced0 MD |
1683 | * |
| 1684 | * MPALMOSTSAFE | |
| 984263bc MD |
1685 | */ |
| 1686 | int | |
| 753fd850 | 1687 | sys_aio_error(struct aio_error_args *uap) |
| 984263bc MD |
1688 | { |
| 1689 | #ifndef VFS_AIO | |
| 1690 | return ENOSYS; | |
| 1691 | #else | |
| 41c20dac | 1692 | struct proc *p = curproc; |
| 984263bc MD |
1693 | struct aiocblist *cb; |
| 1694 | struct kaioinfo *ki; | |
| 1695 | long jobref; | |
| 3919ced0 | 1696 | int error; |
| 984263bc MD |
1697 | |
| 1698 | ki = p->p_aioinfo; | |
| 1699 | if (ki == NULL) | |
| 1700 | return EINVAL; | |
| 1701 | ||
| 1702 | jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo); | |
| 1703 | if ((jobref == -1) || (jobref == 0)) | |
| 1704 | return EINVAL; | |
| 1705 | ||
| 3919ced0 MD |
1706 | get_mplock(); |
| 1707 | error = 0; | |
| 1708 | ||
| 984263bc MD |
1709 | TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { |
| 1710 | if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == | |
| 1711 | jobref) { | |
| c7114eea | 1712 | uap->sysmsg_result = cb->uaiocb._aiocb_private.error; |
| 3919ced0 | 1713 | goto done; |
| 984263bc MD |
1714 | } |
| 1715 | } | |
| 1716 | ||
| e43a034f | 1717 | crit_enter(); |
| 984263bc MD |
1718 | |
| 1719 | for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb, | |
| 1720 | plist)) { | |
| 1721 | if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == | |
| 1722 | jobref) { | |
| c7114eea | 1723 | uap->sysmsg_result = EINPROGRESS; |
| e43a034f | 1724 | crit_exit(); |
| 3919ced0 | 1725 | goto done; |
| 984263bc MD |
1726 | } |
| 1727 | } | |
| 1728 | ||
| 1729 | for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb, | |
| 1730 | plist)) { | |
| 1731 | if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == | |
| 1732 | jobref) { | |
| c7114eea | 1733 | uap->sysmsg_result = EINPROGRESS; |
| e43a034f | 1734 | crit_exit(); |
| 3919ced0 | 1735 | goto done; |
| 984263bc MD |
1736 | } |
| 1737 | } | |
| e43a034f | 1738 | crit_exit(); |
| 984263bc | 1739 | |
| e43a034f | 1740 | crit_enter(); |
| 984263bc MD |
1741 | for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb, |
| 1742 | plist)) { | |
| 1743 | if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == | |
| 1744 | jobref) { | |
| c7114eea | 1745 | uap->sysmsg_result = cb->uaiocb._aiocb_private.error; |
| e43a034f | 1746 | crit_exit(); |
| 3919ced0 | 1747 | goto done; |
| 984263bc MD |
1748 | } |
| 1749 | } | |
| 1750 | ||
| 1751 | for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb, | |
| 1752 | plist)) { | |
| 1753 | if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == | |
| 1754 | jobref) { | |
| c7114eea | 1755 | uap->sysmsg_result = EINPROGRESS; |
| e43a034f | 1756 | crit_exit(); |
| 3919ced0 | 1757 | goto done; |
| 984263bc MD |
1758 | } |
| 1759 | } | |
| e43a034f | 1760 | crit_exit(); |
| 3919ced0 MD |
1761 | error = EINVAL; |
| 1762 | done: | |
| 1763 | rel_mplock(); | |
| 1764 | return (error); | |
| 984263bc MD |
1765 | #endif /* VFS_AIO */ |
| 1766 | } | |
| 1767 | ||
| 3919ced0 MD |
1768 | /* |
| 1769 | * syscall - asynchronous read from a file (REALTIME) | |
| 1770 | * | |
| 1771 | * MPALMOSTSAFE | |
| 1772 | */ | |
| 984263bc | 1773 | int |
| 753fd850 | 1774 | sys_aio_read(struct aio_read_args *uap) |
| 984263bc MD |
1775 | { |
| 1776 | #ifndef VFS_AIO | |
| 1777 | return ENOSYS; | |
| 1778 | #else | |
| 3919ced0 MD |
1779 | int error; |
| 1780 | ||
| 1781 | get_mplock(); | |
| 1782 | error = aio_aqueue(uap->aiocbp, LIO_READ); | |
| 1783 | rel_mplock(); | |
| 1784 | return (error); | |
| 984263bc MD |
1785 | #endif /* VFS_AIO */ |
| 1786 | } | |
| 1787 | ||
| 3919ced0 MD |
1788 | /* |
| 1789 | * syscall - asynchronous write to a file (REALTIME) | |
| 1790 | * | |
| 1791 | * MPALMOSTSAFE | |
| 1792 | */ | |
| 984263bc | 1793 | int |
| 753fd850 | 1794 | sys_aio_write(struct aio_write_args *uap) |
| 984263bc MD |
1795 | { |
| 1796 | #ifndef VFS_AIO | |
| 1797 | return ENOSYS; | |
| 1798 | #else | |
| 3919ced0 MD |
1799 | int error; |
| 1800 | ||
| 1801 | get_mplock(); | |
| 1802 | error = aio_aqueue(uap->aiocbp, LIO_WRITE); | |
| 1803 | rel_mplock(); | |
| 1804 | return (error); | |
| 984263bc MD |
1805 | #endif /* VFS_AIO */ |
| 1806 | } | |
| 1807 | ||
| 3919ced0 MD |
1808 | /* |
| 1809 | * syscall - XXX undocumented | |
| 1810 | * | |
| 1811 | * MPALMOSTSAFE | |
| 1812 | */ | |
| 984263bc | 1813 | int |
| 753fd850 | 1814 | sys_lio_listio(struct lio_listio_args *uap) |
| 984263bc MD |
1815 | { |
| 1816 | #ifndef VFS_AIO | |
| 1817 | return ENOSYS; | |
| 1818 | #else | |
| 41c20dac | 1819 | struct proc *p = curproc; |
| fde7ac71 | 1820 | struct lwp *lp = curthread->td_lwp; |
| 984263bc MD |
1821 | int nent, nentqueued; |
| 1822 | struct aiocb *iocb, * const *cbptr; | |
| 1823 | struct aiocblist *cb; | |
| 1824 | struct kaioinfo *ki; | |
| 1825 | struct aio_liojob *lj; | |
| 1826 | int error, runningcode; | |
| 1827 | int nerror; | |
| 1828 | int i; | |
| 984263bc MD |
1829 | |
| 1830 | if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) | |
| 1831 | return EINVAL; | |
| 1832 | ||
| 1833 | nent = uap->nent; | |
| 1834 | if (nent > AIO_LISTIO_MAX) | |
| 1835 | return EINVAL; | |
| 1836 | ||
| 3919ced0 MD |
1837 | get_mplock(); |
| 1838 | ||
| 984263bc MD |
1839 | if (p->p_aioinfo == NULL) |
| 1840 | aio_init_aioinfo(p); | |
| 1841 | ||
| 3919ced0 MD |
1842 | if ((nent + num_queue_count) > max_queue_count) { |
| 1843 | error = EAGAIN; | |
| 1844 | goto done; | |
| 1845 | } | |
| 984263bc MD |
1846 | |
| 1847 | ki = p->p_aioinfo; | |
| 3919ced0 MD |
1848 | if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) { |
| 1849 | error = EAGAIN; | |
| 1850 | goto done; | |
| 1851 | } | |
| 984263bc | 1852 | |
| 5a26d050 | 1853 | lj = objcache_get(aiolio_oc); |
| 3919ced0 MD |
1854 | if (lj == NULL) { |
| 1855 | error = EAGAIN; | |
| 1856 | goto done; | |
| 1857 | } | |
| 984263bc MD |
1858 | |
| 1859 | lj->lioj_flags = 0; | |
| 1860 | lj->lioj_buffer_count = 0; | |
| 1861 | lj->lioj_buffer_finished_count = 0; | |
| 1862 | lj->lioj_queue_count = 0; | |
| 1863 | lj->lioj_queue_finished_count = 0; | |
| 1864 | lj->lioj_ki = ki; | |
| 1865 | ||
| 1866 | /* | |
| 1867 | * Setup signal. | |
| 1868 | */ | |
| 1869 | if (uap->sig && (uap->mode == LIO_NOWAIT)) { | |
| 1870 | error = copyin(uap->sig, &lj->lioj_signal, | |
| 1871 | sizeof(lj->lioj_signal)); | |
| 1872 | if (error) { | |
| 5a26d050 | 1873 | objcache_put(aiolio_oc, lj); |
| 3919ced0 | 1874 | goto done; |
| 984263bc MD |
1875 | } |
| 1876 | if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { | |
| 5a26d050 | 1877 | objcache_put(aiolio_oc, lj); |
| 3919ced0 MD |
1878 | error = EINVAL; |
| 1879 | goto done; | |
| 984263bc MD |
1880 | } |
| 1881 | lj->lioj_flags |= LIOJ_SIGNAL; | |
| 1882 | lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED; | |
| 1883 | } else | |
| 1884 | lj->lioj_flags &= ~LIOJ_SIGNAL; | |
| 1885 | ||
| 1886 | TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); | |
| 1887 | /* | |
| 1888 | * Get pointers to the list of I/O requests. | |
| 1889 | */ | |
| 1890 | nerror = 0; | |
| 1891 | nentqueued = 0; | |
| 1892 | cbptr = uap->acb_list; | |
| 1893 | for (i = 0; i < uap->nent; i++) { | |
| 1894 | iocb = (struct aiocb *)(intptr_t)fuword(&cbptr[i]); | |
| 1895 | if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) { | |
| 41c20dac | 1896 | error = _aio_aqueue(iocb, lj, 0); |
| 984263bc MD |
1897 | if (error == 0) |
| 1898 | nentqueued++; | |
| 1899 | else | |
| 1900 | nerror++; | |
| 1901 | } | |
| 1902 | } | |
| 1903 | ||
| 1904 | /* | |
| 1905 | * If we haven't queued any, then just return error. | |
| 1906 | */ | |
| 3919ced0 MD |
1907 | if (nentqueued == 0) { |
| 1908 | error = 0; | |
| 1909 | goto done; | |
| 1910 | } | |
| 984263bc MD |
1911 | |
| 1912 | /* | |
| 1913 | * Calculate the appropriate error return. | |
| 1914 | */ | |
| 1915 | runningcode = 0; | |
| 1916 | if (nerror) | |
| 1917 | runningcode = EIO; | |
| 1918 | ||
| 1919 | if (uap->mode == LIO_WAIT) { | |
| 1920 | int command, found, jobref; | |
| 1921 | ||
| 1922 | for (;;) { | |
| 1923 | found = 0; | |
| 1924 | for (i = 0; i < uap->nent; i++) { | |
| 1925 | /* | |
| 1926 | * Fetch address of the control buf pointer in | |
| 1927 | * user space. | |
| 1928 | */ | |
| 1929 | iocb = (struct aiocb *) | |
| 1930 | (intptr_t)fuword(&cbptr[i]); | |
| 1931 | if (((intptr_t)iocb == -1) || ((intptr_t)iocb | |
| 1932 | == 0)) | |
| 1933 | continue; | |
| 1934 | ||
| 1935 | /* | |
| 1936 | * Fetch the associated command from user space. | |
| 1937 | */ | |
| 1938 | command = fuword(&iocb->aio_lio_opcode); | |
| 1939 | if (command == LIO_NOP) { | |
| 1940 | found++; | |
| 1941 | continue; | |
| 1942 | } | |
| 1943 | ||
| 1944 | jobref = fuword(&iocb->_aiocb_private.kernelinfo); | |
| 1945 | ||
| 1946 | TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { | |
| 1947 | if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) | |
| 1948 | == jobref) { | |
| 1949 | if (cb->uaiocb.aio_lio_opcode | |
| 1950 | == LIO_WRITE) { | |
| fde7ac71 | 1951 | lp->lwp_ru.ru_oublock += |
| 984263bc MD |
1952 | cb->outputcharge; |
| 1953 | cb->outputcharge = 0; | |
| 1954 | } else if (cb->uaiocb.aio_lio_opcode | |
| 1955 | == LIO_READ) { | |
| fde7ac71 SS |
1956 | lp->lwp_ru.ru_inblock += |
| 1957 | cb->inputcharge; | |
| 984263bc MD |
1958 | cb->inputcharge = 0; |
| 1959 | } | |
| 1960 | found++; | |
| 1961 | break; | |
| 1962 | } | |
| 1963 | } | |
| 1964 | ||
| e43a034f | 1965 | crit_enter(); |
| 984263bc MD |
1966 | TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) { |
| 1967 | if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) | |
| 1968 | == jobref) { | |
| 1969 | found++; | |
| 1970 | break; | |
| 1971 | } | |
| 1972 | } | |
| e43a034f | 1973 | crit_exit(); |
| 984263bc MD |
1974 | } |
| 1975 | ||
| 1976 | /* | |
| 1977 | * If all I/Os have been disposed of, then we can | |
| 1978 | * return. | |
| 1979 | */ | |
| 3919ced0 MD |
1980 | if (found == nentqueued) { |
| 1981 | error = runningcode; | |
| 1982 | goto done; | |
| 1983 | } | |
| 984263bc MD |
1984 | |
| 1985 | ki->kaio_flags |= KAIO_WAKEUP; | |
| 377d4740 | 1986 | error = tsleep(p, PCATCH, "aiospn", 0); |
| 984263bc | 1987 | |
| 3919ced0 MD |
1988 | if (error == EINTR) { |
| 1989 | goto done; | |
| 1990 | } else if (error == EWOULDBLOCK) { | |
| 1991 | error = EAGAIN; | |
| 1992 | goto done; | |
| 1993 | } | |
| 984263bc MD |
1994 | } |
| 1995 | } | |
| 1996 | ||
| 3919ced0 MD |
1997 | error = runningcode; |
| 1998 | done: | |
| 1999 | rel_mplock(); | |
| 2000 | return (error); | |
| 984263bc MD |
2001 | #endif /* VFS_AIO */ |
| 2002 | } | |
| 2003 | ||
| 2004 | #ifdef VFS_AIO | |
| 2005 | /* | |
| 2006 | * This is a weird hack so that we can post a signal. It is safe to do so from | |
| 2007 | * a timeout routine, but *not* from an interrupt routine. | |
| 2008 | */ | |
| 2009 | static void | |
| 2010 | process_signal(void *aioj) | |
| 2011 | { | |
| 2012 | struct aiocblist *aiocbe = aioj; | |
| 2013 | struct aio_liojob *lj = aiocbe->lio; | |
| 2014 | struct aiocb *cb = &aiocbe->uaiocb; | |
| 2015 | ||
| 2016 | if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) && | |
| 2017 | (lj->lioj_queue_count == lj->lioj_queue_finished_count)) { | |
| 84204577 | 2018 | ksignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo); |
| 984263bc MD |
2019 | lj->lioj_flags |= LIOJ_SIGNAL_POSTED; |
| 2020 | } | |
| 2021 | ||
| 2022 | if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) | |
| 84204577 | 2023 | ksignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo); |
| 984263bc MD |
2024 | } |
| 2025 | ||
| 2026 | /* | |
| 2027 | * Interrupt handler for physio, performs the necessary process wakeups, and | |
| 2028 | * signals. | |
| 2029 | */ | |
| 2030 | static void | |
| 81b5c339 | 2031 | aio_physwakeup(struct bio *bio) |
| 984263bc | 2032 | { |
| 81b5c339 | 2033 | struct buf *bp = bio->bio_buf; |
| 984263bc MD |
2034 | struct aiocblist *aiocbe; |
| 2035 | struct proc *p; | |
| 2036 | struct kaioinfo *ki; | |
| 2037 | struct aio_liojob *lj; | |
| 2038 | ||
| 81b5c339 | 2039 | aiocbe = bio->bio_caller_info2.ptr; |
| 77912481 | 2040 | get_mplock(); |
| 984263bc | 2041 | |
| 984263bc | 2042 | if (aiocbe) { |
| 81b5c339 | 2043 | p = bio->bio_caller_info1.ptr; |
| 984263bc MD |
2044 | |
| 2045 | aiocbe->jobstate = JOBST_JOBBFINISHED; | |
| 2046 | aiocbe->uaiocb._aiocb_private.status -= bp->b_resid; | |
| 2047 | aiocbe->uaiocb._aiocb_private.error = 0; | |
| 2048 | aiocbe->jobflags |= AIOCBLIST_DONE; | |
| 2049 | ||
| 2050 | if (bp->b_flags & B_ERROR) | |
| 2051 | aiocbe->uaiocb._aiocb_private.error = bp->b_error; | |
| 2052 | ||
| 2053 | lj = aiocbe->lio; | |
| 2054 | if (lj) { | |
| 2055 | lj->lioj_buffer_finished_count++; | |
| 2056 | ||
| 2057 | /* | |
| 2058 | * wakeup/signal if all of the interrupt jobs are done. | |
| 2059 | */ | |
| 2060 | if (lj->lioj_buffer_finished_count == | |
| 2061 | lj->lioj_buffer_count) { | |
| 2062 | /* | |
| 2063 | * Post a signal if it is called for. | |
| 2064 | */ | |
| 2065 | if ((lj->lioj_flags & | |
| 2066 | (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == | |
| 2067 | LIOJ_SIGNAL) { | |
| 2068 | lj->lioj_flags |= LIOJ_SIGNAL_POSTED; | |
| a5eb27b6 MD |
2069 | callout_reset(&aiocbe->timeout, 0, |
| 2070 | process_signal, aiocbe); | |
| 984263bc MD |
2071 | } |
| 2072 | } | |
| 2073 | } | |
| 2074 | ||
| 2075 | ki = p->p_aioinfo; | |
| 2076 | if (ki) { | |
| 2077 | ki->kaio_buffer_finished_count++; | |
| 2078 | TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); | |
| 2079 | TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); | |
| 2080 | TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); | |
| 2081 | ||
| 2082 | KNOTE(&aiocbe->klist, 0); | |
| 2083 | /* Do the wakeup. */ | |
| 2084 | if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) { | |
| 2085 | ki->kaio_flags &= ~KAIO_WAKEUP; | |
| 2086 | wakeup(p); | |
| 2087 | } | |
| 2088 | } | |
| 2089 | ||
| a5eb27b6 MD |
2090 | if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) { |
| 2091 | callout_reset(&aiocbe->timeout, 0, | |
| 2092 | process_signal, aiocbe); | |
| 2093 | } | |
| 984263bc | 2094 | } |
| ae8e83e6 | 2095 | biodone_sync(bio); |
| 77912481 | 2096 | rel_mplock(); |
| 984263bc MD |
2097 | } |
| 2098 | #endif /* VFS_AIO */ | |
| 2099 | ||
| 3919ced0 MD |
2100 | /* |
| 2101 | * syscall - wait for the next completion of an aio request | |
| 2102 | * | |
| 2103 | * MPALMOSTSAFE | |
| 2104 | */ | |
| 984263bc | 2105 | int |
| 753fd850 | 2106 | sys_aio_waitcomplete(struct aio_waitcomplete_args *uap) |
| 984263bc MD |
2107 | { |
| 2108 | #ifndef VFS_AIO | |
| 2109 | return ENOSYS; | |
| 2110 | #else | |
| 41c20dac | 2111 | struct proc *p = curproc; |
| fde7ac71 | 2112 | struct lwp *lp = curthread->td_lwp; |
| 984263bc MD |
2113 | struct timeval atv; |
| 2114 | struct timespec ts; | |
| 2115 | struct kaioinfo *ki; | |
| 2116 | struct aiocblist *cb = NULL; | |
| 831f78e5 | 2117 | int error, timo; |
| 984263bc MD |
2118 | |
| 2119 | suword(uap->aiocbp, (int)NULL); | |
| 2120 | ||
| 2121 | timo = 0; | |
| 2122 | if (uap->timeout) { | |
| 2123 | /* Get timespec struct. */ | |
| 2124 | error = copyin(uap->timeout, &ts, sizeof(ts)); | |
| 2125 | if (error) | |
| 2126 | return error; | |
| 2127 | ||
| 2128 | if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000)) | |
| 2129 | return (EINVAL); | |
| 2130 | ||
| 2131 | TIMESPEC_TO_TIMEVAL(&atv, &ts); | |
| 2132 | if (itimerfix(&atv)) | |
| 2133 | return (EINVAL); | |
| a94976ad | 2134 | timo = tvtohz_high(&atv); |
| 984263bc MD |
2135 | } |
| 2136 | ||
| 2137 | ki = p->p_aioinfo; | |
| 2138 | if (ki == NULL) | |
| 2139 | return EAGAIN; | |
| 2140 | ||
| 3919ced0 MD |
2141 | get_mplock(); |
| 2142 | ||
| 984263bc MD |
2143 | for (;;) { |
| 2144 | if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) { | |
| 2145 | suword(uap->aiocbp, (uintptr_t)cb->uuaiocb); | |
| c7114eea | 2146 | uap->sysmsg_result = cb->uaiocb._aiocb_private.status; |
| 984263bc | 2147 | if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { |
| fde7ac71 | 2148 | lp->lwp_ru.ru_oublock += |
| 984263bc MD |
2149 | cb->outputcharge; |
| 2150 | cb->outputcharge = 0; | |
| 2151 | } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { | |
| fde7ac71 | 2152 | lp->lwp_ru.ru_inblock += cb->inputcharge; |
| 984263bc MD |
2153 | cb->inputcharge = 0; |
| 2154 | } | |
| 2155 | aio_free_entry(cb); | |
| 3919ced0 MD |
2156 | error = cb->uaiocb._aiocb_private.error; |
| 2157 | break; | |
| 984263bc MD |
2158 | } |
| 2159 | ||
| e43a034f | 2160 | crit_enter(); |
| 984263bc | 2161 | if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0 ) { |
| e43a034f | 2162 | crit_exit(); |
| 984263bc | 2163 | suword(uap->aiocbp, (uintptr_t)cb->uuaiocb); |
| c7114eea | 2164 | uap->sysmsg_result = cb->uaiocb._aiocb_private.status; |
| 984263bc | 2165 | aio_free_entry(cb); |
| 3919ced0 MD |
2166 | error = cb->uaiocb._aiocb_private.error; |
| 2167 | break; | |
| 984263bc MD |
2168 | } |
| 2169 | ||
| 2170 | ki->kaio_flags |= KAIO_WAKEUP; | |
| 377d4740 | 2171 | error = tsleep(p, PCATCH, "aiowc", timo); |
| e43a034f | 2172 | crit_exit(); |
| 984263bc | 2173 | |
| 3919ced0 MD |
2174 | if (error == ERESTART) { |
| 2175 | error = EINTR; | |
| 2176 | break; | |
| 2177 | } | |
| 2178 | if (error < 0) | |
| 2179 | break; | |
| 2180 | if (error == EINTR) | |
| 2181 | break; | |
| 2182 | if (error == EWOULDBLOCK) { | |
| 2183 | error = EAGAIN; | |
| 2184 | break; | |
| 2185 | } | |
| 984263bc | 2186 | } |
| 3919ced0 MD |
2187 | rel_mplock(); |
| 2188 | return (error); | |
| 984263bc MD |
2189 | #endif /* VFS_AIO */ |
| 2190 | } | |
| 2191 | ||
| 2192 | #ifndef VFS_AIO | |
| 2193 | static int | |
| 2194 | filt_aioattach(struct knote *kn) | |
| 2195 | { | |
| 2196 | ||
| 2197 | return (ENXIO); | |
| 2198 | } | |
| 2199 | ||
| 2200 | struct filterops aio_filtops = | |
| 2201 | { 0, filt_aioattach, NULL, NULL }; | |
| 2202 | ||
| 2203 | #else | |
| 2204 | /* kqueue attach function */ | |
| 2205 | static int | |
| 2206 | filt_aioattach(struct knote *kn) | |
| 2207 | { | |
| 2208 | struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata; | |
| 2209 | ||
| 2210 | /* | |
| 2211 | * The aiocbe pointer must be validated before using it, so | |
| 2212 | * registration is restricted to the kernel; the user cannot | |
| 2213 | * set EV_FLAG1. | |
| 2214 | */ | |
| 2215 | if ((kn->kn_flags & EV_FLAG1) == 0) | |
| 2216 | return (EPERM); | |
| 2217 | kn->kn_flags &= ~EV_FLAG1; | |
| 2218 | ||
| 5b22f1a7 | 2219 | knote_insert(&aiocbe->klist, kn); |
| 984263bc MD |
2220 | |
| 2221 | return (0); | |
| 2222 | } | |
| 2223 | ||
| 2224 | /* kqueue detach function */ | |
| 2225 | static void | |
| 2226 | filt_aiodetach(struct knote *kn) | |
| 2227 | { | |
| 2228 | struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata; | |
| 2229 | ||
| 5b22f1a7 | 2230 | knote_remove(&aiocbe->klist, kn); |
| 984263bc MD |
2231 | } |
| 2232 | ||
| 2233 | /* kqueue filter function */ | |
| 2234 | /*ARGSUSED*/ | |
| 2235 | static int | |
| 2236 | filt_aio(struct knote *kn, long hint) | |
| 2237 | { | |
| 2238 | struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata; | |
| 2239 | ||
| 2240 | kn->kn_data = aiocbe->uaiocb._aiocb_private.error; | |
| 2241 | if (aiocbe->jobstate != JOBST_JOBFINISHED && | |
| 2242 | aiocbe->jobstate != JOBST_JOBBFINISHED) | |
| 2243 | return (0); | |
| 2244 | kn->kn_flags |= EV_EOF; | |
| 2245 | return (1); | |
| 2246 | } | |
| 2247 | ||
| 2248 | struct filterops aio_filtops = | |
| 2249 | { 0, filt_aioattach, filt_aiodetach, filt_aio }; | |
| 2250 | #endif /* VFS_AIO */ |