/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $FreeBSD: src/sys/kern/sys_pipe.c,v 1.60.2.13 2002/08/05 15:05:15 des Exp $
 * $DragonFly: src/sys/kern/sys_pipe.c,v 1.50 2008/09/09 04:06:13 dillon Exp $
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */
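
/*
 * Example (illustrative sketch, not part of the original file): the
 * classic userland usage this implementation serves.  A parent creates
 * a pipe with pipe(2), forks, and streams data from child to parent:
 *
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fds[2];
 *		char buf[16];
 *
 *		if (pipe(fds) < 0)
 *			return (1);
 *		if (fork() == 0) {
 *			close(fds[0]);
 *			write(fds[1], "hello", 5);
 *			_exit(0);
 *		}
 *		close(fds[1]);
 *		read(fds[0], buf, sizeof(buf));
 *		return (0);
 *	}
 */
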
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/select.h>
#include <sys/signalvar.h>
#include <sys/sysproto.h>
#include <sys/pipe.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/event.h>
#include <sys/globaldata.h>
#include <sys/module.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/socket.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_zone.h>

#include <sys/file2.h>

#include <machine/cpufunc.h>

/*
 * interfaces to the outside world
 */
static int pipe_read (struct file *fp, struct uio *uio,
		struct ucred *cred, int flags);
static int pipe_write (struct file *fp, struct uio *uio,
		struct ucred *cred, int flags);
static int pipe_close (struct file *fp);
static int pipe_shutdown (struct file *fp, int how);
static int pipe_poll (struct file *fp, int events, struct ucred *cred);
static int pipe_kqfilter (struct file *fp, struct knote *kn);
static int pipe_stat (struct file *fp, struct stat *sb, struct ucred *cred);
static int pipe_ioctl (struct file *fp, u_long cmd, caddr_t data, struct ucred *cred);

static struct fileops pipeops = {
	.fo_read = pipe_read,
	.fo_write = pipe_write,
	.fo_ioctl = pipe_ioctl,
	.fo_poll = pipe_poll,
	.fo_kqfilter = pipe_kqfilter,
	.fo_stat = pipe_stat,
	.fo_close = pipe_close,
	.fo_shutdown = pipe_shutdown
};

static void	filt_pipedetach(struct knote *kn);
static int	filt_piperead(struct knote *kn, long hint);
static int	filt_pipewrite(struct knote *kn, long hint);

static struct filterops pipe_rfiltops =
	{ 1, NULL, filt_pipedetach, filt_piperead };
static struct filterops pipe_wfiltops =
	{ 1, NULL, filt_pipedetach, filt_pipewrite };

MALLOC_DEFINE(M_PIPE, "pipe", "pipe structures");

/*
 * Default pipe buffer size(s), this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

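/*
 * Worked example (added; the PIPE_SIZE value is an assumption): with
 * the usual PIPE_SIZE of 16384 bytes these thresholds come to ~5461
 * and ~10922 bytes.  The drain hysteresis actually applied in
 * pipe_read() below uses the simpler half-buffer test
 * (pipe_buffer.size >> 1): the reader only aggressively wakes a
 * blocked writer once the FIFO has drained below roughly half, which
 * keeps small synchronous transfers within a warm region of the
 * buffer.
 */
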
/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES	64
#define PIPEQ_MAX_CACHE 16      /* per-cpu pipe structure cache */

static int pipe_maxbig = LIMITBIGPIPES;
static int pipe_maxcache = PIPEQ_MAX_CACHE;
static int pipe_bigcount;
static int pipe_nbig;
static int pipe_bcache_alloc;
static int pipe_bkmem_alloc;
static int pipe_rblocked_count;
static int pipe_wblocked_count;

SYSCTL_NODE(_kern, OID_AUTO, pipe, CTLFLAG_RW, 0, "Pipe operation");
SYSCTL_INT(_kern_pipe, OID_AUTO, nbig,
        CTLFLAG_RD, &pipe_nbig, 0, "number of big pipes allocated");
SYSCTL_INT(_kern_pipe, OID_AUTO, bigcount,
        CTLFLAG_RW, &pipe_bigcount, 0, "number of times pipe expanded");
SYSCTL_INT(_kern_pipe, OID_AUTO, rblocked,
        CTLFLAG_RW, &pipe_rblocked_count, 0, "number of times the reader blocked");
SYSCTL_INT(_kern_pipe, OID_AUTO, wblocked,
        CTLFLAG_RW, &pipe_wblocked_count, 0, "number of times the writer blocked");
SYSCTL_INT(_kern_pipe, OID_AUTO, maxcache,
        CTLFLAG_RW, &pipe_maxcache, 0, "max pipes cached per-cpu");
SYSCTL_INT(_kern_pipe, OID_AUTO, maxbig,
        CTLFLAG_RW, &pipe_maxbig, 0, "max number of big pipes");
#ifdef SMP
static int pipe_delay = 5000;	/* 5uS default */
SYSCTL_INT(_kern_pipe, OID_AUTO, delay,
        CTLFLAG_RW, &pipe_delay, 0, "SMP delay optimization in ns");
static int pipe_mpsafe = 1;
SYSCTL_INT(_kern_pipe, OID_AUTO, mpsafe,
        CTLFLAG_RW, &pipe_mpsafe, 0, "");
#endif
#if !defined(NO_PIPE_SYSCTL_STATS)
SYSCTL_INT(_kern_pipe, OID_AUTO, bcache_alloc,
        CTLFLAG_RW, &pipe_bcache_alloc, 0, "pipe buffer from pcpu cache");
SYSCTL_INT(_kern_pipe, OID_AUTO, bkmem_alloc,
        CTLFLAG_RW, &pipe_bkmem_alloc, 0, "pipe buffer from kmem");
#endif

static void pipeclose (struct pipe *cpipe);
static void pipe_free_kmem (struct pipe *cpipe);
static int pipe_create (struct pipe **cpipep);
static __inline void pipeselwakeup (struct pipe *cpipe);
static int pipespace (struct pipe *cpipe, int size);

static __inline void
pipeselwakeup(struct pipe *cpipe)
{
	if (cpipe->pipe_state & PIPE_SEL) {
		get_mplock();
		cpipe->pipe_state &= ~PIPE_SEL;
		selwakeup(&cpipe->pipe_sel);
		rel_mplock();
	}
	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) {
		get_mplock();
		pgsigio(cpipe->pipe_sigio, SIGIO, 0);
		rel_mplock();
	}
	if (SLIST_FIRST(&cpipe->pipe_sel.si_note)) {
		get_mplock();
		KNOTE(&cpipe->pipe_sel.si_note, 0);
		rel_mplock();
	}
}

/*
 * These routines are called before and after a UIO.  The UIO
 * may block, causing our held tokens to be lost temporarily.
 *
 * We use these routines to serialize reads against other reads
 * and writes against other writes.
 *
 * The read token is held on entry so *ipp does not race.
 */
static __inline int
pipe_start_uio(struct pipe *cpipe, int *ipp)
{
	int error;

	while (*ipp) {
		*ipp = -1;
		error = tsleep(ipp, PCATCH, "pipexx", 0);
		if (error)
			return (error);
	}
	*ipp = 1;
	return (0);
}

static __inline void
pipe_end_uio(struct pipe *cpipe, int *ipp)
{
	if (*ipp < 0) {
		*ipp = 0;
		wakeup(ipp);
	} else {
		KKASSERT(*ipp > 0);
		*ipp = 0;
	}
}
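
/*
 * Sketch (added for clarity): *ipp is a tiny tri-state lock.  0 means
 * idle, 1 means a uio is in progress, and -1 means a uio is in
 * progress with at least one waiter.  A caller brackets its copy loop
 * like this:
 *
 *	error = pipe_start_uio(rpipe, &rpipe->pipe_rip);
 *	if (error == 0) {
 *		... uiomove() loop, may block and lose tokens ...
 *		pipe_end_uio(rpipe, &rpipe->pipe_rip);
 *	}
 */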

static __inline void
pipe_get_mplock(int *save)
{
#ifdef SMP
	if (pipe_mpsafe == 0) {
		get_mplock();
		*save = 1;
	} else
#endif
	{
		*save = 0;
	}
}

static __inline void
pipe_rel_mplock(int *save)
{
#ifdef SMP
	if (*save)
		rel_mplock();
#endif
}
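
/*
 * Usage note (added): these helpers let the same code run MP-safe or
 * under the MP lock depending on the kern.pipe.mpsafe sysctl.  Callers
 * pair them around an entire operation:
 *
 *	int mpsave;
 *
 *	pipe_get_mplock(&mpsave);	(takes BGL only if mpsafe is off)
 *	... do the work ...
 *	pipe_rel_mplock(&mpsave);	(drops BGL only if it was taken)
 */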

/*
 * The pipe system call for the DTYPE_PIPE type of pipes
 *
 * pipe_args(int dummy)
 */

/* ARGSUSED */
int
sys_pipe(struct pipe_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *rf, *wf;
	struct pipe *rpipe, *wpipe;
	int fd1, fd2, error;

	KKASSERT(p);

	rpipe = wpipe = NULL;
	if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (ENFILE);
	}

	error = falloc(p, &rf, &fd1);
	if (error) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (error);
	}
	uap->sysmsg_fds[0] = fd1;

	/*
	 * Warning: once we've gotten past allocation of the fd for the
	 * read-side, we can only drop the read side via fdrop() in order
	 * to avoid races against processes which manage to dup() the read
	 * side while we are blocked trying to allocate the write side.
	 */
	rf->f_type = DTYPE_PIPE;
	rf->f_flag = FREAD | FWRITE;
	rf->f_ops = &pipeops;
	rf->f_data = rpipe;
	error = falloc(p, &wf, &fd2);
	if (error) {
		fsetfd(p, NULL, fd1);
		fdrop(rf);
		/* rpipe has been closed by fdrop(). */
		pipeclose(wpipe);
		return (error);
	}
	wf->f_type = DTYPE_PIPE;
	wf->f_flag = FREAD | FWRITE;
	wf->f_ops = &pipeops;
	wf->f_data = wpipe;
	uap->sysmsg_fds[1] = fd2;

	rpipe->pipe_slock = kmalloc(sizeof(struct lock),
				    M_PIPE, M_WAITOK|M_ZERO);
	wpipe->pipe_slock = rpipe->pipe_slock;
	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;
	lockinit(rpipe->pipe_slock, "pipecl", 0, 0);

	/*
	 * Once activated the peer relationship remains valid until
	 * both sides are closed.
	 */
	fsetfd(p, rf, fd1);
	fsetfd(p, wf, fd2);
	fdrop(rf);
	fdrop(wf);

	return (0);
}
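
/*
 * Note (added): the two descriptors are returned to userland through
 * uap->sysmsg_fds[], i.e. in the syscall's return values rather than
 * being copied out through a user pointer; the libc pipe(2) stub
 * stores them into the caller's array.  A hypothetical caller:
 *
 *	int fds[2];
 *
 *	if (pipe(fds) == 0) {
 *		... fds[0] is conventionally the read end,
 *		... fds[1] the write end
 *	}
 */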

/*
 * Allocate kva for pipe circular buffer, the space is pageable.
 * This routine will 'realloc' the size of a pipe safely; if it
 * fails it will retain the old buffer and return ENOMEM.
 */
static int
pipespace(struct pipe *cpipe, int size)
{
	struct vm_object *object;
	caddr_t buffer;
	int npages, error;

	npages = round_page(size) / PAGE_SIZE;
	object = cpipe->pipe_buffer.object;

	/*
	 * [re]create the object if necessary and reserve space for it
	 * in the kernel_map.  The object and memory are pageable.  On
	 * success, free the old resources before assigning the new
	 * ones.
	 */
	if (object == NULL || object->size != npages) {
		get_mplock();
		object = vm_object_allocate(OBJT_DEFAULT, npages);
		buffer = (caddr_t)vm_map_min(&kernel_map);

		error = vm_map_find(&kernel_map, object, 0,
				    (vm_offset_t *)&buffer, size,
				    1,
				    VM_MAPTYPE_NORMAL,
				    VM_PROT_ALL, VM_PROT_ALL,
				    0);

		if (error != KERN_SUCCESS) {
			vm_object_deallocate(object);
			rel_mplock();
			return (ENOMEM);
		}
		pipe_free_kmem(cpipe);
		rel_mplock();
		cpipe->pipe_buffer.object = object;
		cpipe->pipe_buffer.buffer = buffer;
		cpipe->pipe_buffer.size = size;
		++pipe_bkmem_alloc;
	} else {
		++pipe_bcache_alloc;
	}
	cpipe->pipe_buffer.rindex = 0;
	cpipe->pipe_buffer.windex = 0;
	return (0);
}
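
/*
 * Design note (added): the sizes used here (PIPE_SIZE and
 * BIG_PIPE_SIZE) are powers of 2, and pipe_buffer.rindex/windex are
 * free-running unsigned counters.  The FIFO math therefore needs no
 * bounds checks:
 *
 *	u_int avail  = windex - rindex;		(bytes in the FIFO)
 *	u_int offset = rindex & (size - 1);	(position in the ring)
 *
 * Unsigned wraparound of the counters is harmless because only the
 * difference and the masked value are ever used.
 */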

/*
 * Initialize and allocate VM and memory for pipe, pulling the pipe from
 * our per-cpu cache if possible.  For now make sure it is sized for the
 * smaller PIPE_SIZE default.
 */
static int
pipe_create(struct pipe **cpipep)
{
	globaldata_t gd = mycpu;
	struct pipe *cpipe;
	int error;

	if ((cpipe = gd->gd_pipeq) != NULL) {
		gd->gd_pipeq = cpipe->pipe_peer;
		--gd->gd_pipeqcount;
		cpipe->pipe_peer = NULL;
		cpipe->pipe_wantwcnt = 0;
	} else {
		cpipe = kmalloc(sizeof(struct pipe), M_PIPE, M_WAITOK|M_ZERO);
	}
	*cpipep = cpipe;
	if ((error = pipespace(cpipe, PIPE_SIZE)) != 0)
		return (error);
	vfs_timestamp(&cpipe->pipe_ctime);
	cpipe->pipe_atime = cpipe->pipe_ctime;
	cpipe->pipe_mtime = cpipe->pipe_ctime;
	lwkt_token_init(&cpipe->pipe_rlock);
	lwkt_token_init(&cpipe->pipe_wlock);
	return (0);
}

/*
 * MPALMOSTSAFE (acquires mplock)
 */
static int
pipe_read(struct file *fp, struct uio *uio, struct ucred *cred, int fflags)
{
	struct pipe *rpipe;
	int error;
	int orig_resid;
	int nread = 0;
	int nbio;
	u_int size;	/* total bytes available */
	u_int nsize;	/* total bytes to read */
	u_int rindex;	/* contiguous bytes available */
	int notify_writer;
	lwkt_tokref rlock;
	lwkt_tokref wlock;
	int mpsave;

	/*
	 * Degenerate case
	 */
	orig_resid = uio->uio_resid;
	if (orig_resid == 0)
		return(0);

	/*
	 * Setup locks, calculate nbio
	 */
	pipe_get_mplock(&mpsave);
	rpipe = (struct pipe *)fp->f_data;
	lwkt_gettoken(&rlock, &rpipe->pipe_rlock);

	if (fflags & O_FBLOCKING)
		nbio = 0;
	else if (fflags & O_FNONBLOCKING)
		nbio = 1;
	else if (fp->f_flag & O_NONBLOCK)
		nbio = 1;
	else
		nbio = 0;

	/*
	 * Reads are serialized.  Note however that pipe_buffer.buffer and
	 * pipe_buffer.size can change out from under us when the number
	 * of bytes in the buffer are zero due to the write-side doing a
	 * pipespace().
	 */
	error = pipe_start_uio(rpipe, &rpipe->pipe_rip);
	if (error) {
		pipe_rel_mplock(&mpsave);
		lwkt_reltoken(&rlock);
		return (error);
	}
	notify_writer = 0;
	while (uio->uio_resid) {
		size = rpipe->pipe_buffer.windex - rpipe->pipe_buffer.rindex;
		cpu_lfence();
		if (size) {
			rindex = rpipe->pipe_buffer.rindex &
				 (rpipe->pipe_buffer.size - 1);
			nsize = size;
			if (nsize > rpipe->pipe_buffer.size - rindex)
				nsize = rpipe->pipe_buffer.size - rindex;
			if (nsize > (u_int)uio->uio_resid)
				nsize = (u_int)uio->uio_resid;

			error = uiomove(&rpipe->pipe_buffer.buffer[rindex],
					nsize, uio);
			if (error)
				break;
			cpu_mfence();
			rpipe->pipe_buffer.rindex += nsize;
			nread += nsize;

			/*
			 * If the FIFO is still over half full just continue
			 * and do not try to notify the writer yet.
			 */
			if (size - nsize >= (rpipe->pipe_buffer.size >> 1)) {
				notify_writer = 0;
				continue;
			}

			/*
			 * When the FIFO is less than half full notify any
			 * waiting writer.  WANTW can be checked while
			 * holding just the rlock.
			 */
			notify_writer = 1;
			if ((rpipe->pipe_state & PIPE_WANTW) == 0)
				continue;
		}

		/*
		 * If the "write-side" was blocked we wake it up.  This code
		 * is reached either when the buffer is completely emptied
		 * or if it becomes more than half-empty.
		 *
		 * Pipe_state can only be modified if both the rlock and
		 * wlock are held.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			lwkt_gettoken(&wlock, &rpipe->pipe_wlock);
			if (rpipe->pipe_state & PIPE_WANTW) {
				notify_writer = 0;
				rpipe->pipe_state &= ~PIPE_WANTW;
				lwkt_reltoken(&wlock);
				wakeup(rpipe);
			} else {
				lwkt_reltoken(&wlock);
			}
		}

		/*
		 * Pick up our copy loop again if the writer sent data to
		 * us while we were messing around.
		 *
		 * On a SMP box poll up to pipe_delay nanoseconds for new
		 * data.  Typically a value of 2000 to 4000 is sufficient
		 * to eradicate most IPIs/tsleeps/wakeups when a pipe
		 * is used for synchronous communications with small packets,
		 * and 8000 or so (8uS) will pipeline large buffer xfers
		 * between cpus over a pipe.
		 *
		 * For synchronous communications a hit means doing a
		 * full Awrite-Bread-Bwrite-Aread cycle in less than 2uS,
		 * whereas a miss requiring a tsleep/wakeup sequence
		 * will take 7uS or more.
		 */
		if (rpipe->pipe_buffer.windex != rpipe->pipe_buffer.rindex)
			continue;

#if defined(SMP) && defined(_RDTSC_SUPPORTED_)
		if (pipe_delay) {
			int64_t tsc_target;
			int good = 0;

			tsc_target = tsc_get_target(pipe_delay);
			while (tsc_test_target(tsc_target) == 0) {
				if (rpipe->pipe_buffer.windex !=
				    rpipe->pipe_buffer.rindex) {
					good = 1;
					break;
				}
			}
			if (good)
				continue;
		}
#endif

		/*
		 * Detect EOF condition, do not set error.
		 */
		if (rpipe->pipe_state & PIPE_REOF)
			break;

		/*
		 * Break if some data was read, or if this was a non-blocking
		 * read.
		 */
		if (nread > 0)
			break;

		if (nbio) {
			error = EAGAIN;
			break;
		}

		/*
		 * Last chance, interlock with WANTR.
		 */
		lwkt_gettoken(&wlock, &rpipe->pipe_wlock);
		size = rpipe->pipe_buffer.windex - rpipe->pipe_buffer.rindex;
		if (size) {
			lwkt_reltoken(&wlock);
			continue;
		}

		/*
		 * If there is no more to read in the pipe, reset its
		 * pointers to the beginning.  This improves cache hit
		 * stats.
		 *
		 * We need both locks to modify both pointers, and there
		 * must also not be a write in progress or the uiomove()
		 * in the write might block and temporarily release
		 * its wlock, then reacquire and update windex.  We are
		 * only serialized against reads, not writes.
		 *
		 * XXX should we even bother resetting the indices?  It
		 * might actually be more cache efficient not to.
		 */
		if (rpipe->pipe_buffer.rindex == rpipe->pipe_buffer.windex &&
		    rpipe->pipe_wip == 0) {
			rpipe->pipe_buffer.rindex = 0;
			rpipe->pipe_buffer.windex = 0;
		}

		/*
		 * Wait for more data.
		 *
		 * Pipe_state can only be set if both the rlock and wlock
		 * are held.
		 */
		rpipe->pipe_state |= PIPE_WANTR;
		tsleep_interlock(rpipe, PCATCH);
		lwkt_reltoken(&wlock);
		error = tsleep(rpipe, PCATCH | PINTERLOCKED, "piperd", 0);
		++pipe_rblocked_count;
		if (error)
			break;
	}
	pipe_end_uio(rpipe, &rpipe->pipe_rip);

	/*
	 * Update last access time
	 */
	if (error == 0 && nread)
		vfs_timestamp(&rpipe->pipe_atime);

	/*
	 * If we drained the FIFO more than half way then handle
	 * write blocking hysteresis.
	 *
	 * Note that PIPE_WANTW cannot be set by the writer without
	 * it holding both rlock and wlock, so we can test it
	 * while holding just rlock.
	 */
	if (notify_writer) {
		if (rpipe->pipe_state & PIPE_WANTW) {
			lwkt_gettoken(&wlock, &rpipe->pipe_wlock);
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				lwkt_reltoken(&wlock);
				wakeup(rpipe);
			} else {
				lwkt_reltoken(&wlock);
			}
		}
	}
	size = rpipe->pipe_buffer.windex - rpipe->pipe_buffer.rindex;
	lwkt_reltoken(&rlock);

	/*
	 * If enough space is available in the buffer, wake up any
	 * select/poll writers.
	 */
	if ((rpipe->pipe_buffer.size - size) >= PIPE_BUF)
		pipeselwakeup(rpipe);
	pipe_rel_mplock(&mpsave);
	return (error);
}
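
/*
 * Example (added, illustrative): the nbio path above fails a read with
 * EAGAIN when nothing is buffered.  A typical userland consumer:
 *
 *	#include <fcntl.h>
 *	#include <errno.h>
 *	#include <unistd.h>
 *
 *	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK);
 *	for (;;) {
 *		ssize_t n = read(fd, buf, sizeof(buf));
 *		if (n > 0)
 *			consume(buf, n);	(hypothetical helper)
 *		else if (n == 0)
 *			break;			(EOF: writer closed, PIPE_REOF)
 *		else if (errno == EAGAIN)
 *			wait_for_data(fd);	(hypothetical, e.g. poll(2))
 *		else
 *			break;			(hard error)
 *	}
 */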

/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
pipe_write(struct file *fp, struct uio *uio, struct ucred *cred, int fflags)
{
	int error;
	int orig_resid;
	int nbio;
	struct pipe *wpipe, *rpipe;
	lwkt_tokref rlock;
	lwkt_tokref wlock;
	u_int windex;
	u_int space;
	u_int wcount;
	int mpsave;

	pipe_get_mplock(&mpsave);

	/*
	 * Writes go to the peer.  The peer will always exist.
	 */
	rpipe = (struct pipe *) fp->f_data;
	wpipe = rpipe->pipe_peer;
	lwkt_gettoken(&wlock, &wpipe->pipe_wlock);
	if (wpipe->pipe_state & PIPE_WEOF) {
		pipe_rel_mplock(&mpsave);
		lwkt_reltoken(&wlock);
		return (EPIPE);
	}

	/*
	 * Degenerate case (EPIPE takes prec)
	 */
	if (uio->uio_resid == 0) {
		pipe_rel_mplock(&mpsave);
		lwkt_reltoken(&wlock);
		return(0);
	}

	/*
	 * Writes are serialized (start_uio must be called with wlock)
	 */
	error = pipe_start_uio(wpipe, &wpipe->pipe_wip);
	if (error) {
		pipe_rel_mplock(&mpsave);
		lwkt_reltoken(&wlock);
		return (error);
	}

	if (fflags & O_FBLOCKING)
		nbio = 0;
	else if (fflags & O_FNONBLOCKING)
		nbio = 1;
	else if (fp->f_flag & O_NONBLOCK)
		nbio = 1;
	else
		nbio = 0;

	/*
	 * If it is advantageous to resize the pipe buffer, do
	 * so.  We are write-serialized so we can block safely.
	 */
	if ((wpipe->pipe_buffer.size <= PIPE_SIZE) &&
	    (pipe_nbig < pipe_maxbig) &&
	    wpipe->pipe_wantwcnt > 4 &&
	    (wpipe->pipe_buffer.rindex == wpipe->pipe_buffer.windex)) {
		/*
		 * Recheck after lock.
		 */
		lwkt_gettoken(&rlock, &wpipe->pipe_rlock);
		if ((wpipe->pipe_buffer.size <= PIPE_SIZE) &&
		    (pipe_nbig < pipe_maxbig) &&
		    (wpipe->pipe_buffer.rindex == wpipe->pipe_buffer.windex)) {
			atomic_add_int(&pipe_nbig, 1);
			if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
				++pipe_bigcount;
			else
				atomic_subtract_int(&pipe_nbig, 1);
		}
		lwkt_reltoken(&rlock);
	}

	orig_resid = uio->uio_resid;
	wcount = 0;

	while (uio->uio_resid) {
		if (wpipe->pipe_state & PIPE_WEOF) {
			error = EPIPE;
			break;
		}

		windex = wpipe->pipe_buffer.windex &
			 (wpipe->pipe_buffer.size - 1);
		space = wpipe->pipe_buffer.size -
			(wpipe->pipe_buffer.windex - wpipe->pipe_buffer.rindex);
		cpu_lfence();

		/* Writes of size <= PIPE_BUF must be atomic. */
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;

		/*
		 * Write to fill, read size handles write hysteresis.  Also
		 * additional restrictions can cause select-based non-blocking
		 * writes to spin.
		 */
		if (space > 0) {
			u_int segsize;

			/*
			 * Transfer size is minimum of uio transfer
			 * and free space in pipe buffer.
			 *
			 * Limit each uiocopy to no more than PIPE_SIZE
			 * so we can keep the gravy train going on a
			 * SMP box.  This doubles the performance for
			 * write sizes > 16K.  Otherwise large writes
			 * wind up doing an inefficient synchronous
			 * ping-pong.
			 */
			if (space > (u_int)uio->uio_resid)
				space = (u_int)uio->uio_resid;
			if (space > PIPE_SIZE)
				space = PIPE_SIZE;

			/*
			 * First segment to transfer is minimum of
			 * transfer size and contiguous space in
			 * pipe buffer.  If first segment to transfer
			 * is less than the transfer size, we've got
			 * a wraparound in the buffer.
			 */
			segsize = wpipe->pipe_buffer.size - windex;
			if (segsize > space)
				segsize = space;

#ifdef SMP
			/*
			 * If this is the first loop and the reader is
			 * blocked, do a preemptive wakeup of the reader.
			 *
			 * On SMP the IPI latency plus the wlock interlock
			 * on the reader side is the fastest way to get the
			 * reader going.  (The scheduler will hard loop on
			 * lock tokens).
			 *
			 * NOTE: We can't clear WANTR here without acquiring
			 * the rlock, which we don't want to do here!
			 */
			if ((wpipe->pipe_state & PIPE_WANTR) && pipe_mpsafe > 1)
				wakeup(wpipe);
#endif

			/*
			 * Transfer segment, which may include a wrap-around.
			 * Update windex to account for both all in one go
			 * so the reader can read() the data atomically.
			 */
			error = uiomove(&wpipe->pipe_buffer.buffer[windex],
					segsize, uio);
			if (error == 0 && segsize < space) {
				segsize = space - segsize;
				error = uiomove(&wpipe->pipe_buffer.buffer[0],
						segsize, uio);
			}
			if (error)
				break;
			cpu_mfence();
			wpipe->pipe_buffer.windex += space;
			wcount += space;
			continue;
		}

		/*
		 * We need both the rlock and the wlock to interlock against
		 * the EOF, WANTW, and size checks, and to modify pipe_state.
		 *
		 * These are token locks so we do not have to worry about
		 * deadlocks.
		 */
		lwkt_gettoken(&rlock, &wpipe->pipe_rlock);

		/*
		 * If the "read-side" has been blocked, wake it up now
		 * and yield to let it drain synchronously rather
		 * than block.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}

		/*
		 * don't block on non-blocking I/O
		 */
		if (nbio) {
			lwkt_reltoken(&rlock);
			error = EAGAIN;
			break;
		}

		/*
		 * re-test whether we have to block in the writer after
		 * acquiring both locks, in case the reader opened up
		 * some space.
		 */
		space = wpipe->pipe_buffer.size -
			(wpipe->pipe_buffer.windex - wpipe->pipe_buffer.rindex);
		cpu_lfence();
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;

		/*
		 * We have no more space and have something to offer,
		 * wake up select/poll.
		 */
		if (space == 0) {
			wpipe->pipe_state |= PIPE_WANTW;
			++wpipe->pipe_wantwcnt;
			pipeselwakeup(wpipe);
			if (wpipe->pipe_state & PIPE_WANTW)
				error = tsleep(wpipe, PCATCH, "pipewr", 0);
			++pipe_wblocked_count;
		}
		lwkt_reltoken(&rlock);

		/*
		 * Break out if we errored or the read side wants us to go
		 * away.
		 */
		if (error)
			break;
		if (wpipe->pipe_state & PIPE_WEOF) {
			error = EPIPE;
			break;
		}
	}
	pipe_end_uio(wpipe, &wpipe->pipe_wip);

	/*
	 * If we have put any characters in the buffer, we wake up
	 * the reader.
	 *
	 * Both rlock and wlock are required to be able to modify pipe_state.
	 */
	if (wpipe->pipe_buffer.windex != wpipe->pipe_buffer.rindex) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			lwkt_gettoken(&rlock, &wpipe->pipe_rlock);
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				lwkt_reltoken(&rlock);
				wakeup(wpipe);
			} else {
				lwkt_reltoken(&rlock);
			}
		}
	}

	/*
	 * Don't return EPIPE if I/O was successful
	 */
	if ((wpipe->pipe_buffer.rindex == wpipe->pipe_buffer.windex) &&
	    (uio->uio_resid == 0) &&
	    (error == EPIPE)) {
		error = 0;
	}

	if (error == 0)
		vfs_timestamp(&wpipe->pipe_mtime);

	/*
	 * We have something to offer,
	 * wake up select/poll.
	 */
	space = wpipe->pipe_buffer.windex - wpipe->pipe_buffer.rindex;
	lwkt_reltoken(&wlock);
	if (space)
		pipeselwakeup(wpipe);
	pipe_rel_mplock(&mpsave);
	return (error);
}
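
/*
 * Example (added, illustrative): the "space = 0" clamp above is what
 * implements the POSIX PIPE_BUF guarantee.  Several writers may
 * interleave records through one pipe without locking as long as each
 * record is no larger than PIPE_BUF; the record is either copied in
 * contiguously or the writer blocks until it fits in one piece:
 *
 *	#include <limits.h>
 *	#include <unistd.h>
 *
 *	char rec[PIPE_BUF];
 *	size_t len = format_record(rec);	(hypothetical, <= PIPE_BUF)
 *
 *	write(logfd, rec, len);			(atomic wrt other writers)
 */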

/*
 * MPALMOSTSAFE - acquires mplock
 *
 * We implement a very minimal set of ioctls for compatibility with sockets.
 */
int
pipe_ioctl(struct file *fp, u_long cmd, caddr_t data, struct ucred *cred)
{
	struct pipe *mpipe;
	lwkt_tokref rlock;
	lwkt_tokref wlock;
	int error;
	int mpsave;

	pipe_get_mplock(&mpsave);
	mpipe = (struct pipe *)fp->f_data;

	lwkt_gettoken(&rlock, &mpipe->pipe_rlock);
	lwkt_gettoken(&wlock, &mpipe->pipe_wlock);

	switch (cmd) {
	case FIOASYNC:
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		error = 0;
		break;
	case FIONREAD:
		*(int *)data = mpipe->pipe_buffer.windex -
				mpipe->pipe_buffer.rindex;
		error = 0;
		break;
	case FIOSETOWN:
		get_mplock();
		error = fsetown(*(int *)data, &mpipe->pipe_sigio);
		rel_mplock();
		break;
	case FIOGETOWN:
		*(int *)data = fgetown(mpipe->pipe_sigio);
		error = 0;
		break;
	case TIOCSPGRP:
		/* This is deprecated, FIOSETOWN should be used instead. */
		get_mplock();
		error = fsetown(-(*(int *)data), &mpipe->pipe_sigio);
		rel_mplock();
		break;
	case TIOCGPGRP:
		/* This is deprecated, FIOGETOWN should be used instead. */
		*(int *)data = -fgetown(mpipe->pipe_sigio);
		error = 0;
		break;
	default:
		error = ENOTTY;
		break;
	}
	lwkt_reltoken(&rlock);
	lwkt_reltoken(&wlock);
	pipe_rel_mplock(&mpsave);

	return (error);
}
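
/*
 * Example (added, illustrative): FIONREAD reports windex - rindex, the
 * count of unread bytes, so a userland peek looks like:
 *
 *	#include <sys/ioctl.h>
 *	#include <sys/filio.h>
 *
 *	int pending;
 *
 *	if (ioctl(fd, FIONREAD, &pending) == 0 && pending > 0)
 *		;	(at least 'pending' bytes readable without blocking)
 */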

/*
 * MPALMOSTSAFE - acquires mplock
 */
int
pipe_poll(struct file *fp, int events, struct ucred *cred)
{
	struct pipe *rpipe;
	struct pipe *wpipe;
	int revents = 0;
	u_int space;
	int mpsave;

	pipe_get_mplock(&mpsave);
	rpipe = (struct pipe *)fp->f_data;
	wpipe = rpipe->pipe_peer;
	if (events & (POLLIN | POLLRDNORM)) {
		if ((rpipe->pipe_buffer.windex != rpipe->pipe_buffer.rindex) ||
		    (rpipe->pipe_state & PIPE_REOF)) {
			revents |= events & (POLLIN | POLLRDNORM);
		}
	}

	if (events & (POLLOUT | POLLWRNORM)) {
		if (wpipe == NULL || (wpipe->pipe_state & PIPE_WEOF)) {
			revents |= events & (POLLOUT | POLLWRNORM);
		} else {
			space = wpipe->pipe_buffer.windex -
				wpipe->pipe_buffer.rindex;
			space = wpipe->pipe_buffer.size - space;
			if (space >= PIPE_BUF)
				revents |= events & (POLLOUT | POLLWRNORM);
		}
	}

	if ((rpipe->pipe_state & PIPE_REOF) ||
	    (wpipe == NULL) ||
	    (wpipe->pipe_state & PIPE_WEOF))
		revents |= POLLHUP;

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(curthread, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(curthread, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}
	pipe_rel_mplock(&mpsave);
	return (revents);
}
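
/*
 * Example (added, illustrative): per the rules above, the write side of
 * a pipe only polls writable when at least PIPE_BUF bytes are free:
 *
 *	#include <poll.h>
 *
 *	struct pollfd pfd = { .fd = wfd, .events = POLLOUT };
 *
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLOUT))
 *		;	(a write of up to PIPE_BUF bytes will not block)
 */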

/*
 * MPSAFE
 */
static int
pipe_stat(struct file *fp, struct stat *ub, struct ucred *cred)
{
	struct pipe *pipe;
	int mpsave;

	pipe_get_mplock(&mpsave);
	pipe = (struct pipe *)fp->f_data;

	bzero((caddr_t)ub, sizeof(*ub));
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.windex - pipe->pipe_buffer.rindex;
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atimespec = pipe->pipe_atime;
	ub->st_mtimespec = pipe->pipe_mtime;
	ub->st_ctimespec = pipe->pipe_ctime;
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_uid, st_gid, st_rdev,
	 * st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	pipe_rel_mplock(&mpsave);
	return (0);
}
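
/*
 * Example (added, illustrative): fstat(2) on a pipe consequently shows
 * the buffered byte count in st_size:
 *
 *	#include <sys/stat.h>
 *
 *	struct stat st;
 *
 *	if (fstat(fd, &st) == 0 && S_ISFIFO(st.st_mode))
 *		;	(st_size = unread bytes, st_blksize = FIFO size)
 */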

/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
pipe_close(struct file *fp)
{
	struct pipe *cpipe;

	get_mplock();
	cpipe = (struct pipe *)fp->f_data;
	fp->f_ops = &badfileops;
	fp->f_data = NULL;
	funsetown(cpipe->pipe_sigio);
	pipeclose(cpipe);
	rel_mplock();
	return (0);
}

/*
 * Shutdown one or both directions of a full-duplex pipe.
 *
 * MPALMOSTSAFE - acquires mplock
 */
static int
pipe_shutdown(struct file *fp, int how)
{
	struct pipe *rpipe;
	struct pipe *wpipe;
	int error = EPIPE;
	lwkt_tokref rpipe_rlock;
	lwkt_tokref rpipe_wlock;
	lwkt_tokref wpipe_rlock;
	lwkt_tokref wpipe_wlock;
	int mpsave;

	pipe_get_mplock(&mpsave);
	rpipe = (struct pipe *)fp->f_data;
	wpipe = rpipe->pipe_peer;

	/*
	 * We modify pipe_state on both pipes, which means we need
	 * all four tokens!
	 */
	lwkt_gettoken(&rpipe_rlock, &rpipe->pipe_rlock);
	lwkt_gettoken(&rpipe_wlock, &rpipe->pipe_wlock);
	lwkt_gettoken(&wpipe_rlock, &wpipe->pipe_rlock);
	lwkt_gettoken(&wpipe_wlock, &wpipe->pipe_wlock);

	switch(how) {
	case SHUT_RDWR:
	case SHUT_RD:
		rpipe->pipe_state |= PIPE_REOF;		/* my reads */
		rpipe->pipe_state |= PIPE_WEOF;		/* peer writes */
		if (rpipe->pipe_state & PIPE_WANTR) {
			rpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(rpipe);
		}
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
		error = 0;
		if (how == SHUT_RD)
			break;
		/* fall through */
	case SHUT_WR:
		wpipe->pipe_state |= PIPE_REOF;		/* peer reads */
		wpipe->pipe_state |= PIPE_WEOF;		/* my writes */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		if (wpipe->pipe_state & PIPE_WANTW) {
			wpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(wpipe);
		}
		error = 0;
		break;
	}
	pipeselwakeup(rpipe);
	pipeselwakeup(wpipe);

	lwkt_reltoken(&rpipe_rlock);
	lwkt_reltoken(&rpipe_wlock);
	lwkt_reltoken(&wpipe_rlock);
	lwkt_reltoken(&wpipe_wlock);

	pipe_rel_mplock(&mpsave);
	return (error);
}

static void
pipe_free_kmem(struct pipe *cpipe)
{
	if (cpipe->pipe_buffer.buffer != NULL) {
		if (cpipe->pipe_buffer.size > PIPE_SIZE)
			atomic_subtract_int(&pipe_nbig, 1);
		kmem_free(&kernel_map,
			(vm_offset_t)cpipe->pipe_buffer.buffer,
			cpipe->pipe_buffer.size);
		cpipe->pipe_buffer.buffer = NULL;
		cpipe->pipe_buffer.object = NULL;
	}
}

/*
 * Close the pipe.  The slock must be held to interlock against simultaneous
 * closes.  The rlock and wlock must be held to adjust the pipe_state.
 */
static void
pipeclose(struct pipe *cpipe)
{
	globaldata_t gd;
	struct pipe *ppipe;
	lwkt_tokref cpipe_rlock;
	lwkt_tokref cpipe_wlock;
	lwkt_tokref ppipe_rlock;
	lwkt_tokref ppipe_wlock;

	if (cpipe == NULL)
		return;

	/*
	 * The slock may not have been allocated yet (close during
	 * initialization)
	 *
	 * We need both the read and write tokens to modify pipe_state.
	 */
	if (cpipe->pipe_slock)
		lockmgr(cpipe->pipe_slock, LK_EXCLUSIVE);
	lwkt_gettoken(&cpipe_rlock, &cpipe->pipe_rlock);
	lwkt_gettoken(&cpipe_wlock, &cpipe->pipe_wlock);

	/*
	 * Set our state, wakeup anyone waiting in select, and
	 * wakeup anyone blocked on our pipe.
	 */
	cpipe->pipe_state |= PIPE_CLOSED | PIPE_REOF | PIPE_WEOF;
	pipeselwakeup(cpipe);
	if (cpipe->pipe_state & (PIPE_WANTR | PIPE_WANTW)) {
		cpipe->pipe_state &= ~(PIPE_WANTR | PIPE_WANTW);
		wakeup(cpipe);
	}

	/*
	 * Disconnect from peer.
	 */
	if ((ppipe = cpipe->pipe_peer) != NULL) {
		lwkt_gettoken(&ppipe_rlock, &ppipe->pipe_rlock);
		lwkt_gettoken(&ppipe_wlock, &ppipe->pipe_wlock);
		ppipe->pipe_state |= PIPE_REOF | PIPE_WEOF;
		pipeselwakeup(ppipe);
		if (ppipe->pipe_state & (PIPE_WANTR | PIPE_WANTW)) {
			ppipe->pipe_state &= ~(PIPE_WANTR | PIPE_WANTW);
			wakeup(ppipe);
		}
		if (SLIST_FIRST(&ppipe->pipe_sel.si_note)) {
			get_mplock();
			KNOTE(&ppipe->pipe_sel.si_note, 0);
			rel_mplock();
		}
		lwkt_reltoken(&ppipe_rlock);
		lwkt_reltoken(&ppipe_wlock);
	}

	/*
	 * If the peer is also closed we can free resources for both
	 * sides, otherwise we leave our side intact to deal with any
	 * races (since we only have the slock).
	 */
	if (ppipe && (ppipe->pipe_state & PIPE_CLOSED)) {
		cpipe->pipe_peer = NULL;
		ppipe->pipe_peer = NULL;
		ppipe->pipe_slock = NULL;	/* we will free the slock */
		pipeclose(ppipe);
		ppipe = NULL;
	}

	lwkt_reltoken(&cpipe_rlock);
	lwkt_reltoken(&cpipe_wlock);
	if (cpipe->pipe_slock)
		lockmgr(cpipe->pipe_slock, LK_RELEASE);

	/*
	 * If we disassociated from our peer we can free resources
	 */
	if (ppipe == NULL) {
		gd = mycpu;
		if (cpipe->pipe_slock) {
			kfree(cpipe->pipe_slock, M_PIPE);
			cpipe->pipe_slock = NULL;
		}
		if (gd->gd_pipeqcount >= pipe_maxcache ||
		    cpipe->pipe_buffer.size != PIPE_SIZE
		) {
			pipe_free_kmem(cpipe);
			kfree(cpipe, M_PIPE);
		} else {
			cpipe->pipe_state = 0;
			cpipe->pipe_peer = gd->gd_pipeq;
			gd->gd_pipeq = cpipe;
			++gd->gd_pipeqcount;
		}
	}
}

/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *cpipe;

	get_mplock();
	cpipe = (struct pipe *)kn->kn_fp->f_data;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &pipe_wfiltops;
		cpipe = cpipe->pipe_peer;
		if (cpipe == NULL) {
			/* other end of pipe has been closed */
			rel_mplock();
			return (EPIPE);
		}
		break;
	default:
		rel_mplock();
		return (1);
	}
	kn->kn_hook = (caddr_t)cpipe;

	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
	rel_mplock();
	return (0);
}
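
/*
 * Example (added, illustrative): registering for read events via
 * kqueue(2); filt_piperead() below supplies kn_data and EV_EOF:
 *
 *	#include <sys/event.h>
 *
 *	struct kevent kev, ret;
 *	int kq = kqueue();
 *
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *	if (kevent(kq, NULL, 0, &ret, 1, NULL) == 1)
 *		;	(ret.data = readable bytes; EV_EOF on writer close)
 */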

static void
filt_pipedetach(struct knote *kn)
{
	struct pipe *cpipe = (struct pipe *)kn->kn_hook;

	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
}

/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;

	kn->kn_data = rpipe->pipe_buffer.windex - rpipe->pipe_buffer.rindex;

	/* XXX RACE */
	if (rpipe->pipe_state & PIPE_REOF) {
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	return (kn->kn_data > 0);
}

/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;
	u_int32_t space;

	/* XXX RACE */
	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_WEOF)) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	space = wpipe->pipe_buffer.windex -
		wpipe->pipe_buffer.rindex;
	space = wpipe->pipe_buffer.size - space;
	kn->kn_data = space;
	return (kn->kn_data >= PIPE_BUF);
}