kern: fix integer underflow in exec_shell_imgact.
[dragonfly.git] / sys / kern / sys_pipe.c
1 /*
2  * Copyright (c) 1996 John S. Dyson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice immediately at the beginning of the file, without modification,
10  *    this list of conditions, and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. Absolutely no warranty of function or purpose is made by the author
15  *    John S. Dyson.
16  * 4. Modifications may be freely made to this file if the above conditions
17  *    are met.
18  *
19  * $FreeBSD: src/sys/kern/sys_pipe.c,v 1.60.2.13 2002/08/05 15:05:15 des Exp $
20  */
21
22 /*
23  * This file contains a high-performance replacement for the socket-based
24  * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
25  * all features of sockets, but does do everything that pipes normally
26  * do.
27  */
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/kernel.h>
31 #include <sys/proc.h>
32 #include <sys/fcntl.h>
33 #include <sys/file.h>
34 #include <sys/filedesc.h>
35 #include <sys/filio.h>
36 #include <sys/ttycom.h>
37 #include <sys/stat.h>
38 #include <sys/signalvar.h>
39 #include <sys/sysproto.h>
40 #include <sys/pipe.h>
41 #include <sys/vnode.h>
42 #include <sys/uio.h>
43 #include <sys/event.h>
44 #include <sys/globaldata.h>
45 #include <sys/module.h>
46 #include <sys/malloc.h>
47 #include <sys/sysctl.h>
48 #include <sys/socket.h>
49
50 #include <vm/vm.h>
51 #include <vm/vm_param.h>
52 #include <sys/lock.h>
53 #include <vm/vm_object.h>
54 #include <vm/vm_kern.h>
55 #include <vm/vm_extern.h>
56 #include <vm/pmap.h>
57 #include <vm/vm_map.h>
58 #include <vm/vm_page.h>
59 #include <vm/vm_zone.h>
60
61 #include <sys/file2.h>
62 #include <sys/signal2.h>
63
64 #include <machine/cpufunc.h>
65
66 /*
67  * interfaces to the outside world
68  */
69 static int pipe_read (struct file *fp, struct uio *uio, 
70                 struct ucred *cred, int flags);
71 static int pipe_write (struct file *fp, struct uio *uio, 
72                 struct ucred *cred, int flags);
73 static int pipe_close (struct file *fp);
74 static int pipe_shutdown (struct file *fp, int how);
75 static int pipe_kqfilter (struct file *fp, struct knote *kn);
76 static int pipe_stat (struct file *fp, struct stat *sb, struct ucred *cred);
77 static int pipe_ioctl (struct file *fp, u_long cmd, caddr_t data,
78                 struct ucred *cred, struct sysmsg *msg);
79
80 static struct fileops pipeops = {
81         .fo_read = pipe_read, 
82         .fo_write = pipe_write,
83         .fo_ioctl = pipe_ioctl,
84         .fo_kqfilter = pipe_kqfilter,
85         .fo_stat = pipe_stat,
86         .fo_close = pipe_close,
87         .fo_shutdown = pipe_shutdown
88 };
89
90 static void     filt_pipedetach(struct knote *kn);
91 static int      filt_piperead(struct knote *kn, long hint);
92 static int      filt_pipewrite(struct knote *kn, long hint);
93
94 static struct filterops pipe_rfiltops =
95         { FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_pipedetach, filt_piperead };
96 static struct filterops pipe_wfiltops =
97         { FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_pipedetach, filt_pipewrite };
98
99 MALLOC_DEFINE(M_PIPE, "pipe", "pipe structures");
100
101 /*
102  * Default pipe buffer size(s), this can be kind-of large now because pipe
103  * space is pageable.  The pipe code will try to maintain locality of
104  * reference for performance reasons, so small amounts of outstanding I/O
105  * will not wipe the cache.
106  */
107 #define MINPIPESIZE (PIPE_SIZE/3)
108 #define MAXPIPESIZE (2*PIPE_SIZE/3)
109
110 /*
111  * Limit the number of "big" pipes
112  */
113 #define LIMITBIGPIPES   64
114 #define PIPEQ_MAX_CACHE 16      /* per-cpu pipe structure cache */
115
116 static int pipe_maxbig = LIMITBIGPIPES;
117 static int pipe_maxcache = PIPEQ_MAX_CACHE;
118 static int pipe_bigcount;
119 static int pipe_nbig;
120 static int pipe_bcache_alloc;
121 static int pipe_bkmem_alloc;
122 static int pipe_rblocked_count;
123 static int pipe_wblocked_count;
124
125 SYSCTL_NODE(_kern, OID_AUTO, pipe, CTLFLAG_RW, 0, "Pipe operation");
126 SYSCTL_INT(_kern_pipe, OID_AUTO, nbig,
127         CTLFLAG_RD, &pipe_nbig, 0, "number of big pipes allocated");
128 SYSCTL_INT(_kern_pipe, OID_AUTO, bigcount,
129         CTLFLAG_RW, &pipe_bigcount, 0, "number of times pipe expanded");
130 SYSCTL_INT(_kern_pipe, OID_AUTO, rblocked,
131         CTLFLAG_RW, &pipe_rblocked_count, 0, "number of times pipe expanded");
132 SYSCTL_INT(_kern_pipe, OID_AUTO, wblocked,
133         CTLFLAG_RW, &pipe_wblocked_count, 0, "number of times pipe expanded");
134 SYSCTL_INT(_kern_pipe, OID_AUTO, maxcache,
135         CTLFLAG_RW, &pipe_maxcache, 0, "max pipes cached per-cpu");
136 SYSCTL_INT(_kern_pipe, OID_AUTO, maxbig,
137         CTLFLAG_RW, &pipe_maxbig, 0, "max number of big pipes");
138 static int pipe_delay = 5000;   /* 5uS default */
139 SYSCTL_INT(_kern_pipe, OID_AUTO, delay,
140         CTLFLAG_RW, &pipe_delay, 0, "SMP delay optimization in ns");
141 #if !defined(NO_PIPE_SYSCTL_STATS)
142 SYSCTL_INT(_kern_pipe, OID_AUTO, bcache_alloc,
143         CTLFLAG_RW, &pipe_bcache_alloc, 0, "pipe buffer from pcpu cache");
144 SYSCTL_INT(_kern_pipe, OID_AUTO, bkmem_alloc,
145         CTLFLAG_RW, &pipe_bkmem_alloc, 0, "pipe buffer from kmem");
146 #endif
147
148 /*
149  * Auto-size pipe cache to reduce kmem allocations and frees.
150  */
151 static
152 void
153 pipeinit(void *dummy)
154 {
155         size_t mbytes = kmem_lim_size();
156
157         if (pipe_maxbig == LIMITBIGPIPES) {
158                 if (mbytes >= 7 * 1024)
159                         pipe_maxbig *= 2;
160                 if (mbytes >= 15 * 1024)
161                         pipe_maxbig *= 2;
162         }
163         if (pipe_maxcache == PIPEQ_MAX_CACHE) {
164                 if (mbytes >= 7 * 1024)
165                         pipe_maxcache *= 2;
166                 if (mbytes >= 15 * 1024)
167                         pipe_maxcache *= 2;
168         }
169 }
170 SYSINIT(kmem, SI_BOOT2_MACHDEP, SI_ORDER_ANY, pipeinit, NULL)
171
172 static void pipeclose (struct pipe *cpipe);
173 static void pipe_free_kmem (struct pipe *cpipe);
174 static int pipe_create (struct pipe **cpipep);
175 static int pipespace (struct pipe *cpipe, int size);
176
177 static __inline void
178 pipewakeup(struct pipe *cpipe, int dosigio)
179 {
180         if (dosigio && (cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) {
181                 lwkt_gettoken(&sigio_token);
182                 pgsigio(cpipe->pipe_sigio, SIGIO, 0);
183                 lwkt_reltoken(&sigio_token);
184         }
185         KNOTE(&cpipe->pipe_kq.ki_note, 0);
186 }
187
188 /*
189  * These routines are called before and after a UIO.  The UIO
190  * may block, causing our held tokens to be lost temporarily.
191  *
192  * We use these routines to serialize reads against other reads
193  * and writes against other writes.
194  *
195  * The read token is held on entry so *ipp does not race.
196  */
197 static __inline int
198 pipe_start_uio(struct pipe *cpipe, int *ipp)
199 {
200         int error;
201
202         while (*ipp) {
203                 *ipp = -1;
204                 error = tsleep(ipp, PCATCH, "pipexx", 0);
205                 if (error)
206                         return (error);
207         }
208         *ipp = 1;
209         return (0);
210 }
211
/*
 * Release the in-progress marker claimed by pipe_start_uio().
 *
 * A negative *ipp means a waiter went to sleep on ipp and needs a
 * wakeup; otherwise the marker must be positive (asserted).  Either way
 * the marker is reset to 0.
 */
static __inline void
pipe_end_uio(struct pipe *cpipe, int *ipp)
{
        int have_waiters = (*ipp < 0);

        if (!have_waiters)
                KKASSERT(*ipp > 0);
        *ipp = 0;
        if (have_waiters)
                wakeup(ipp);
}
223
224 /*
225  * The pipe system call for the DTYPE_PIPE type of pipes
226  *
227  * pipe_args(int dummy)
228  *
229  * MPSAFE
230  */
231 int
232 sys_pipe(struct pipe_args *uap)
233 {
234         struct thread *td = curthread;
235         struct filedesc *fdp = td->td_proc->p_fd;
236         struct file *rf, *wf;
237         struct pipe *rpipe, *wpipe;
238         int fd1, fd2, error;
239
240         rpipe = wpipe = NULL;
241         if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
242                 pipeclose(rpipe); 
243                 pipeclose(wpipe); 
244                 return (ENFILE);
245         }
246         
247         error = falloc(td->td_lwp, &rf, &fd1);
248         if (error) {
249                 pipeclose(rpipe);
250                 pipeclose(wpipe);
251                 return (error);
252         }
253         uap->sysmsg_fds[0] = fd1;
254
255         /*
256          * Warning: once we've gotten past allocation of the fd for the
257          * read-side, we can only drop the read side via fdrop() in order
258          * to avoid races against processes which manage to dup() the read
259          * side while we are blocked trying to allocate the write side.
260          */
261         rf->f_type = DTYPE_PIPE;
262         rf->f_flag = FREAD | FWRITE;
263         rf->f_ops = &pipeops;
264         rf->f_data = rpipe;
265         error = falloc(td->td_lwp, &wf, &fd2);
266         if (error) {
267                 fsetfd(fdp, NULL, fd1);
268                 fdrop(rf);
269                 /* rpipe has been closed by fdrop(). */
270                 pipeclose(wpipe);
271                 return (error);
272         }
273         wf->f_type = DTYPE_PIPE;
274         wf->f_flag = FREAD | FWRITE;
275         wf->f_ops = &pipeops;
276         wf->f_data = wpipe;
277         uap->sysmsg_fds[1] = fd2;
278
279         rpipe->pipe_slock = kmalloc(sizeof(struct lock),
280                                     M_PIPE, M_WAITOK|M_ZERO);
281         wpipe->pipe_slock = rpipe->pipe_slock;
282         rpipe->pipe_peer = wpipe;
283         wpipe->pipe_peer = rpipe;
284         lockinit(rpipe->pipe_slock, "pipecl", 0, 0);
285
286         /*
287          * Once activated the peer relationship remains valid until
288          * both sides are closed.
289          */
290         fsetfd(fdp, rf, fd1);
291         fsetfd(fdp, wf, fd2);
292         fdrop(rf);
293         fdrop(wf);
294
295         return (0);
296 }
297
298 /*
299  * Allocate kva for pipe circular buffer, the space is pageable
300  * This routine will 'realloc' the size of a pipe safely, if it fails
301  * it will retain the old buffer.
302  * If it fails it will return ENOMEM.
303  */
304 static int
305 pipespace(struct pipe *cpipe, int size)
306 {
307         struct vm_object *object;
308         caddr_t buffer;
309         int npages, error;
310
311         npages = round_page(size) / PAGE_SIZE;
312         object = cpipe->pipe_buffer.object;
313
314         /*
315          * [re]create the object if necessary and reserve space for it
316          * in the kernel_map.  The object and memory are pageable.  On
317          * success, free the old resources before assigning the new
318          * ones.
319          */
320         if (object == NULL || object->size != npages) {
321                 object = vm_object_allocate(OBJT_DEFAULT, npages);
322                 buffer = (caddr_t)vm_map_min(&kernel_map);
323
324                 error = vm_map_find(&kernel_map, object, 0,
325                                     (vm_offset_t *)&buffer,
326                                     size, PAGE_SIZE,
327                                     1, VM_MAPTYPE_NORMAL,
328                                     VM_PROT_ALL, VM_PROT_ALL,
329                                     0);
330
331                 if (error != KERN_SUCCESS) {
332                         vm_object_deallocate(object);
333                         return (ENOMEM);
334                 }
335                 pipe_free_kmem(cpipe);
336                 cpipe->pipe_buffer.object = object;
337                 cpipe->pipe_buffer.buffer = buffer;
338                 cpipe->pipe_buffer.size = size;
339                 ++pipe_bkmem_alloc;
340         } else {
341                 ++pipe_bcache_alloc;
342         }
343         cpipe->pipe_buffer.rindex = 0;
344         cpipe->pipe_buffer.windex = 0;
345         return (0);
346 }
347
348 /*
349  * Initialize and allocate VM and memory for pipe, pulling the pipe from
350  * our per-cpu cache if possible.  For now make sure it is sized for the
351  * smaller PIPE_SIZE default.
352  */
353 static int
354 pipe_create(struct pipe **cpipep)
355 {
356         globaldata_t gd = mycpu;
357         struct pipe *cpipe;
358         int error;
359
360         if ((cpipe = gd->gd_pipeq) != NULL) {
361                 gd->gd_pipeq = cpipe->pipe_peer;
362                 --gd->gd_pipeqcount;
363                 cpipe->pipe_peer = NULL;
364                 cpipe->pipe_wantwcnt = 0;
365         } else {
366                 cpipe = kmalloc(sizeof(struct pipe), M_PIPE, M_WAITOK|M_ZERO);
367         }
368         *cpipep = cpipe;
369         if ((error = pipespace(cpipe, PIPE_SIZE)) != 0)
370                 return (error);
371         vfs_timestamp(&cpipe->pipe_ctime);
372         cpipe->pipe_atime = cpipe->pipe_ctime;
373         cpipe->pipe_mtime = cpipe->pipe_ctime;
374         lwkt_token_init(&cpipe->pipe_rlock, "piper");
375         lwkt_token_init(&cpipe->pipe_wlock, "pipew");
376         return (0);
377 }
378
/*
 * Read entry point for pipes: copy data out of the pipe attached to fp
 * (fp->f_data) into the caller's uio.
 *
 * fp     - file whose f_data is the pipe being read.
 * uio    - destination; copying continues while uio_resid is non-zero
 *          and none of the break conditions in the loop fire.
 * cred   - not referenced by this function.
 * fflags - O_FBLOCKING / O_FNONBLOCKING take precedence over the
 *          O_NONBLOCK bit in fp->f_flag when selecting blocking mode.
 *
 * Returns 0 on success (including EOF, short reads and a zero-length
 * request), EAGAIN when the pipe is empty in non-blocking mode, EINTR
 * when a signal is pending during a large (> 10MB) read, or the error
 * from pipe_start_uio(), uiomove() or tsleep().
 */
379 static int
380 pipe_read(struct file *fp, struct uio *uio, struct ucred *cred, int fflags)
381 {
382         struct pipe *rpipe;
383         struct pipe *wpipe;
384         int error;
385         size_t nread = 0;
386         int nbio;
387         u_int size;     /* total bytes available */
388         u_int nsize;    /* bytes to copy in this pass */
389         u_int rindex;   /* masked read offset into the buffer */
390         int notify_writer;
391         int bigread;
392         int bigcount;
393
394         atomic_set_int(&curthread->td_mpflags, TDF_MP_BATCH_DEMARC);
395
396         if (uio->uio_resid == 0)
397                 return(0);
398
399         /*
400          * Setup locks, calculate nbio
401          */
402         rpipe = (struct pipe *)fp->f_data;
403         wpipe = rpipe->pipe_peer;
404         lwkt_gettoken(&rpipe->pipe_rlock);
405
406         if (fflags & O_FBLOCKING)
407                 nbio = 0;
408         else if (fflags & O_FNONBLOCKING)
409                 nbio = 1;
410         else if (fp->f_flag & O_NONBLOCK)
411                 nbio = 1;
412         else
413                 nbio = 0;
414
415         /*
416          * Reads are serialized.  Note however that pipe_buffer.buffer and
417          * pipe_buffer.size can change out from under us when the number
418          * of bytes in the buffer are zero due to the write-side doing a
419          * pipespace().
420          */
421         error = pipe_start_uio(rpipe, &rpipe->pipe_rip);
422         if (error) {
423                 lwkt_reltoken(&rpipe->pipe_rlock);
424                 return (error);
425         }
426         notify_writer = 0;
427
428         bigread = (uio->uio_resid > 10 * 1024 * 1024);
429         bigcount = 10;
430
431         while (uio->uio_resid) {
432                 /*
433                  * Don't hog the cpu.  For big reads, yield every 10th
                 * pass and bail out with EINTR if a signal is pending.
434                  */
435                 if (bigread && --bigcount == 0) {
436                         lwkt_user_yield();
437                         bigcount = 10;
438                         if (CURSIG(curthread->td_lwp)) {
439                                 error = EINTR;
440                                 break;
441                         }
442                 }
443
444                 size = rpipe->pipe_buffer.windex - rpipe->pipe_buffer.rindex;
445                 cpu_lfence();
446                 if (size) {
                        /*
                         * Copy at most up to the end of the buffer in
                         * one pass.  The mask below assumes
                         * pipe_buffer.size is a power of two.
                         */
447                         rindex = rpipe->pipe_buffer.rindex &
448                                  (rpipe->pipe_buffer.size - 1);
449                         nsize = size;
450                         if (nsize > rpipe->pipe_buffer.size - rindex)
451                                 nsize = rpipe->pipe_buffer.size - rindex;
452                         nsize = szmin(nsize, uio->uio_resid);
453
454                         error = uiomove(&rpipe->pipe_buffer.buffer[rindex],
455                                         nsize, uio);
456                         if (error)
457                                 break;
458                         cpu_mfence();
459                         rpipe->pipe_buffer.rindex += nsize;
460                         nread += nsize;
461
462                         /*
463                          * If the FIFO is still over half full just continue
464                          * and do not try to notify the writer yet.
465                          */
466                         if (size - nsize >= (rpipe->pipe_buffer.size >> 1)) {
467                                 notify_writer = 0;
468                                 continue;
469                         }
470
471                         /*
472                          * When the FIFO is less than half full notify any
473                          * waiting writer.  WANTW can be checked while
474                          * holding just the rlock.
475                          */
476                         notify_writer = 1;
477                         if ((rpipe->pipe_state & PIPE_WANTW) == 0)
478                                 continue;
479                 }
480
481                 /*
482                  * If the "write-side" was blocked we wake it up.  This code
483                  * is reached either when the buffer is completely emptied
484                  * or if it becomes more than half-empty.
485                  *
486                  * Pipe_state can only be modified if both the rlock and
487                  * wlock are held.
488                  */
489                 if (rpipe->pipe_state & PIPE_WANTW) {
490                         lwkt_gettoken(&rpipe->pipe_wlock);
491                         if (rpipe->pipe_state & PIPE_WANTW) {
492                                 rpipe->pipe_state &= ~PIPE_WANTW;
493                                 lwkt_reltoken(&rpipe->pipe_wlock);
494                                 wakeup(rpipe);
495                         } else {
496                                 lwkt_reltoken(&rpipe->pipe_wlock);
497                         }
498                 }
499
500                 /*
501                  * Pick up our copy loop again if the writer sent data to
502                  * us while we were messing around.
503                  *
504                  * On a SMP box poll up to pipe_delay nanoseconds for new
505                  * data.  Typically a value of 2000 to 4000 is sufficient
506                  * to eradicate most IPIs/tsleeps/wakeups when a pipe
507                  * is used for synchronous communications with small packets,
508                  * and 8000 or so (8uS) will pipeline large buffer xfers
509                  * between cpus over a pipe.
510                  *
511                  * For synchronous communications a hit means doing a
512                  * full Awrite-Bread-Bwrite-Aread cycle in less than 2uS,
513                  * whereas a miss requiring a tsleep/wakeup sequence
514                  * will take 7uS or more.
515                  */
516                 if (rpipe->pipe_buffer.windex != rpipe->pipe_buffer.rindex)
517                         continue;
518
519 #ifdef _RDTSC_SUPPORTED_
520                 if (pipe_delay) {
521                         int64_t tsc_target;
522                         int good = 0;
523
524                         tsc_target = tsc_get_target(pipe_delay);
525                         while (tsc_test_target(tsc_target) == 0) {
526                                 if (rpipe->pipe_buffer.windex !=
527                                     rpipe->pipe_buffer.rindex) {
528                                         good = 1;
529                                         break;
530                                 }
531                         }
532                         if (good)
533                                 continue;
534                 }
535 #endif
536
537                 /*
538                  * Detect EOF condition, do not set error.
539                  */
540                 if (rpipe->pipe_state & PIPE_REOF)
541                         break;
542
543                 /*
544                  * Break if some data was read, or if this was a non-blocking
545                  * read.
546                  */
547                 if (nread > 0)
548                         break;
549
550                 if (nbio) {
551                         error = EAGAIN;
552                         break;
553                 }
554
555                 /*
556                  * Last chance, interlock with WANTR.
557                  */
558                 lwkt_gettoken(&rpipe->pipe_wlock);
559                 size = rpipe->pipe_buffer.windex - rpipe->pipe_buffer.rindex;
560                 if (size) {
561                         lwkt_reltoken(&rpipe->pipe_wlock);
562                         continue;
563                 }
564
565                 /*
566                  * Retest EOF - acquiring a new token can temporarily release
567                  * tokens already held.
568                  */
569                 if (rpipe->pipe_state & PIPE_REOF) {
570                         lwkt_reltoken(&rpipe->pipe_wlock);
571                         break;
572                 }
573
574                 /*
575                  * If there is no more to read in the pipe, reset its
576                  * pointers to the beginning.  This improves cache hit
577                  * stats.
578                  *
579                  * We need both locks to modify both pointers, and there
580                  * must also not be a write in progress or the uiomove()
581                  * in the write might block and temporarily release
582                  * its wlock, then reacquire and update windex.  We are
583                  * only serialized against reads, not writes.
584                  *
585                  * XXX should we even bother resetting the indices?  It
586                  *     might actually be more cache efficient not to.
587                  */
588                 if (rpipe->pipe_buffer.rindex == rpipe->pipe_buffer.windex &&
589                     rpipe->pipe_wip == 0) {
590                         rpipe->pipe_buffer.rindex = 0;
591                         rpipe->pipe_buffer.windex = 0;
592                 }
593
594                 /*
595                  * Wait for more data.
596                  *
597                  * Pipe_state can only be set if both the rlock and wlock
598                  * are held.
599                  */
600                 rpipe->pipe_state |= PIPE_WANTR;
601                 tsleep_interlock(rpipe, PCATCH);
602                 lwkt_reltoken(&rpipe->pipe_wlock);
603                 error = tsleep(rpipe, PCATCH | PINTERLOCKED, "piperd", 0);
604                 ++pipe_rblocked_count;
605                 if (error)
606                         break;
607         }
608         pipe_end_uio(rpipe, &rpipe->pipe_rip);
609
610         /*
611          * Update last access time
612          */
613         if (error == 0 && nread)
614                 vfs_timestamp(&rpipe->pipe_atime);
615
616         /*
617          * If we drained the FIFO more than half way then handle
618          * write blocking hysteresis.
619          *
620          * Note that PIPE_WANTW cannot be set by the writer without
621          * it holding both rlock and wlock, so we can test it
622          * while holding just rlock.
623          */
624         if (notify_writer) {
625                 /*
626                  * Synchronous blocking is done on the pipe involved
627                  */
628                 if (rpipe->pipe_state & PIPE_WANTW) {
629                         lwkt_gettoken(&rpipe->pipe_wlock);
630                         if (rpipe->pipe_state & PIPE_WANTW) {
631                                 rpipe->pipe_state &= ~PIPE_WANTW;
632                                 lwkt_reltoken(&rpipe->pipe_wlock);
633                                 wakeup(rpipe);
634                         } else {
635                                 lwkt_reltoken(&rpipe->pipe_wlock);
636                         }
637                 }
638
639                 /*
640                  * But we may also have to deal with a kqueue which is
641                  * stored on the same pipe as its descriptor, so a
642                  * EVFILT_WRITE event waiting for our side to drain will
643                  * be on the other side.
644                  */
645                 lwkt_gettoken(&wpipe->pipe_wlock);
646                 pipewakeup(wpipe, 0);
647                 lwkt_reltoken(&wpipe->pipe_wlock);
648         }
649         /*size = rpipe->pipe_buffer.windex - rpipe->pipe_buffer.rindex;*/
650         lwkt_reltoken(&rpipe->pipe_rlock);
651
652         return (error);
653 }
654
655 static int
656 pipe_write(struct file *fp, struct uio *uio, struct ucred *cred, int fflags)
657 {
658         int error;
659         int orig_resid;
660         int nbio;
661         struct pipe *wpipe;
662         struct pipe *rpipe;
663         u_int windex;
664         u_int space;
665         u_int wcount;
666         int bigwrite;
667         int bigcount;
668
669         /*
670          * Writes go to the peer.  The peer will always exist.
671          */
672         rpipe = (struct pipe *) fp->f_data;
673         wpipe = rpipe->pipe_peer;
674         lwkt_gettoken(&wpipe->pipe_wlock);
675         if (wpipe->pipe_state & PIPE_WEOF) {
676                 lwkt_reltoken(&wpipe->pipe_wlock);
677                 return (EPIPE);
678         }
679
680         /*
681          * Degenerate case (EPIPE takes prec)
682          */
683         if (uio->uio_resid == 0) {
684                 lwkt_reltoken(&wpipe->pipe_wlock);
685                 return(0);
686         }
687
688         /*
689          * Writes are serialized (start_uio must be called with wlock)
690          */
691         error = pipe_start_uio(wpipe, &wpipe->pipe_wip);
692         if (error) {
693                 lwkt_reltoken(&wpipe->pipe_wlock);
694                 return (error);
695         }
696
697         if (fflags & O_FBLOCKING)
698                 nbio = 0;
699         else if (fflags & O_FNONBLOCKING)
700                 nbio = 1;
701         else if (fp->f_flag & O_NONBLOCK)
702                 nbio = 1;
703         else
704                 nbio = 0;
705
706         /*
707          * If it is advantageous to resize the pipe buffer, do
708          * so.  We are write-serialized so we can block safely.
709          */
710         if ((wpipe->pipe_buffer.size <= PIPE_SIZE) &&
711             (pipe_nbig < pipe_maxbig) &&
712             wpipe->pipe_wantwcnt > 4 &&
713             (wpipe->pipe_buffer.rindex == wpipe->pipe_buffer.windex)) {
714                 /* 
715                  * Recheck after lock.
716                  */
717                 lwkt_gettoken(&wpipe->pipe_rlock);
718                 if ((wpipe->pipe_buffer.size <= PIPE_SIZE) &&
719                     (pipe_nbig < pipe_maxbig) &&
720                     (wpipe->pipe_buffer.rindex == wpipe->pipe_buffer.windex)) {
721                         atomic_add_int(&pipe_nbig, 1);
722                         if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
723                                 ++pipe_bigcount;
724                         else
725                                 atomic_subtract_int(&pipe_nbig, 1);
726                 }
727                 lwkt_reltoken(&wpipe->pipe_rlock);
728         }
729
730         orig_resid = uio->uio_resid;
731         wcount = 0;
732
733         bigwrite = (uio->uio_resid > 10 * 1024 * 1024);
734         bigcount = 10;
735
736         while (uio->uio_resid) {
737                 if (wpipe->pipe_state & PIPE_WEOF) {
738                         error = EPIPE;
739                         break;
740                 }
741
742                 /*
743                  * Don't hog the cpu.
744                  */
745                 if (bigwrite && --bigcount == 0) {
746                         lwkt_user_yield();
747                         bigcount = 10;
748                         if (CURSIG(curthread->td_lwp)) {
749                                 error = EINTR;
750                                 break;
751                         }
752                 }
753
754                 windex = wpipe->pipe_buffer.windex &
755                          (wpipe->pipe_buffer.size - 1);
756                 space = wpipe->pipe_buffer.size -
757                         (wpipe->pipe_buffer.windex - wpipe->pipe_buffer.rindex);
758                 cpu_lfence();
759
760                 /* Writes of size <= PIPE_BUF must be atomic. */
761                 if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
762                         space = 0;
763
764                 /* 
765                  * Write to fill, read size handles write hysteresis.  Also
766                  * additional restrictions can cause select-based non-blocking
767                  * writes to spin.
768                  */
769                 if (space > 0) {
770                         u_int segsize;
771
772                         /*
773                          * Transfer size is minimum of uio transfer
774                          * and free space in pipe buffer.
775                          *
776                          * Limit each uiocopy to no more then PIPE_SIZE
777                          * so we can keep the gravy train going on a
778                          * SMP box.  This doubles the performance for
779                          * write sizes > 16K.  Otherwise large writes
780                          * wind up doing an inefficient synchronous
781                          * ping-pong.
782                          */
783                         space = szmin(space, uio->uio_resid);
784                         if (space > PIPE_SIZE)
785                                 space = PIPE_SIZE;
786
787                         /*
788                          * First segment to transfer is minimum of
789                          * transfer size and contiguous space in
790                          * pipe buffer.  If first segment to transfer
791                          * is less than the transfer size, we've got
792                          * a wraparound in the buffer.
793                          */
794                         segsize = wpipe->pipe_buffer.size - windex;
795                         if (segsize > space)
796                                 segsize = space;
797
798                         /*
799                          * If this is the first loop and the reader is
800                          * blocked, do a preemptive wakeup of the reader.
801                          *
802                          * On SMP the IPI latency plus the wlock interlock
803                          * on the reader side is the fastest way to get the
804                          * reader going.  (The scheduler will hard loop on
805                          * lock tokens).
806                          *
807                          * NOTE: We can't clear WANTR here without acquiring
808                          * the rlock, which we don't want to do here!
809                          */
810                         if ((wpipe->pipe_state & PIPE_WANTR))
811                                 wakeup(wpipe);
812
813                         /*
814                          * Transfer segment, which may include a wrap-around.
815                          * Update windex to account for both all in one go
816                          * so the reader can read() the data atomically.
817                          */
818                         error = uiomove(&wpipe->pipe_buffer.buffer[windex],
819                                         segsize, uio);
820                         if (error == 0 && segsize < space) {
821                                 segsize = space - segsize;
822                                 error = uiomove(&wpipe->pipe_buffer.buffer[0],
823                                                 segsize, uio);
824                         }
825                         if (error)
826                                 break;
827                         cpu_mfence();
828                         wpipe->pipe_buffer.windex += space;
829                         wcount += space;
830                         continue;
831                 }
832
833                 /*
834                  * We need both the rlock and the wlock to interlock against
835                  * the EOF, WANTW, and size checks, and to modify pipe_state.
836                  *
837                  * These are token locks so we do not have to worry about
838                  * deadlocks.
839                  */
840                 lwkt_gettoken(&wpipe->pipe_rlock);
841
842                 /*
843                  * If the "read-side" has been blocked, wake it up now
844                  * and yield to let it drain synchronously rather
845                  * then block.
846                  */
847                 if (wpipe->pipe_state & PIPE_WANTR) {
848                         wpipe->pipe_state &= ~PIPE_WANTR;
849                         wakeup(wpipe);
850                 }
851
852                 /*
853                  * don't block on non-blocking I/O
854                  */
855                 if (nbio) {
856                         lwkt_reltoken(&wpipe->pipe_rlock);
857                         error = EAGAIN;
858                         break;
859                 }
860
861                 /*
862                  * re-test whether we have to block in the writer after
863                  * acquiring both locks, in case the reader opened up
864                  * some space.
865                  */
866                 space = wpipe->pipe_buffer.size -
867                         (wpipe->pipe_buffer.windex - wpipe->pipe_buffer.rindex);
868                 cpu_lfence();
869                 if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
870                         space = 0;
871
872                 /*
873                  * Retest EOF - acquiring a new token can temporarily release
874                  * tokens already held.
875                  */
876                 if (wpipe->pipe_state & PIPE_WEOF) {
877                         lwkt_reltoken(&wpipe->pipe_rlock);
878                         error = EPIPE;
879                         break;
880                 }
881
882                 /*
883                  * We have no more space and have something to offer,
884                  * wake up select/poll/kq.
885                  */
886                 if (space == 0) {
887                         wpipe->pipe_state |= PIPE_WANTW;
888                         ++wpipe->pipe_wantwcnt;
889                         pipewakeup(wpipe, 1);
890                         if (wpipe->pipe_state & PIPE_WANTW)
891                                 error = tsleep(wpipe, PCATCH, "pipewr", 0);
892                         ++pipe_wblocked_count;
893                 }
894                 lwkt_reltoken(&wpipe->pipe_rlock);
895
896                 /*
897                  * Break out if we errored or the read side wants us to go
898                  * away.
899                  */
900                 if (error)
901                         break;
902                 if (wpipe->pipe_state & PIPE_WEOF) {
903                         error = EPIPE;
904                         break;
905                 }
906         }
907         pipe_end_uio(wpipe, &wpipe->pipe_wip);
908
909         /*
910          * If we have put any characters in the buffer, we wake up
911          * the reader.
912          *
913          * Both rlock and wlock are required to be able to modify pipe_state.
914          */
915         if (wpipe->pipe_buffer.windex != wpipe->pipe_buffer.rindex) {
916                 if (wpipe->pipe_state & PIPE_WANTR) {
917                         lwkt_gettoken(&wpipe->pipe_rlock);
918                         if (wpipe->pipe_state & PIPE_WANTR) {
919                                 wpipe->pipe_state &= ~PIPE_WANTR;
920                                 lwkt_reltoken(&wpipe->pipe_rlock);
921                                 wakeup(wpipe);
922                         } else {
923                                 lwkt_reltoken(&wpipe->pipe_rlock);
924                         }
925                 }
926                 lwkt_gettoken(&wpipe->pipe_rlock);
927                 pipewakeup(wpipe, 1);
928                 lwkt_reltoken(&wpipe->pipe_rlock);
929         }
930
931         /*
932          * Don't return EPIPE if I/O was successful
933          */
934         if ((wpipe->pipe_buffer.rindex == wpipe->pipe_buffer.windex) &&
935             (uio->uio_resid == 0) &&
936             (error == EPIPE)) {
937                 error = 0;
938         }
939
940         if (error == 0)
941                 vfs_timestamp(&wpipe->pipe_mtime);
942
943         /*
944          * We have something to offer,
945          * wake up select/poll/kq.
946          */
947         /*space = wpipe->pipe_buffer.windex - wpipe->pipe_buffer.rindex;*/
948         lwkt_reltoken(&wpipe->pipe_wlock);
949         return (error);
950 }
951
952 /*
953  * we implement a very minimal set of ioctls for compatibility with sockets.
954  */
955 int
956 pipe_ioctl(struct file *fp, u_long cmd, caddr_t data,
957            struct ucred *cred, struct sysmsg *msg)
958 {
959         struct pipe *mpipe;
960         int error;
961
962         mpipe = (struct pipe *)fp->f_data;
963
964         lwkt_gettoken(&mpipe->pipe_rlock);
965         lwkt_gettoken(&mpipe->pipe_wlock);
966
967         switch (cmd) {
968         case FIOASYNC:
969                 if (*(int *)data) {
970                         mpipe->pipe_state |= PIPE_ASYNC;
971                 } else {
972                         mpipe->pipe_state &= ~PIPE_ASYNC;
973                 }
974                 error = 0;
975                 break;
976         case FIONREAD:
977                 *(int *)data = mpipe->pipe_buffer.windex -
978                                 mpipe->pipe_buffer.rindex;
979                 error = 0;
980                 break;
981         case FIOSETOWN:
982                 error = fsetown(*(int *)data, &mpipe->pipe_sigio);
983                 break;
984         case FIOGETOWN:
985                 *(int *)data = fgetown(&mpipe->pipe_sigio);
986                 error = 0;
987                 break;
988         case TIOCSPGRP:
989                 /* This is deprecated, FIOSETOWN should be used instead. */
990                 error = fsetown(-(*(int *)data), &mpipe->pipe_sigio);
991                 break;
992
993         case TIOCGPGRP:
994                 /* This is deprecated, FIOGETOWN should be used instead. */
995                 *(int *)data = -fgetown(&mpipe->pipe_sigio);
996                 error = 0;
997                 break;
998         default:
999                 error = ENOTTY;
1000                 break;
1001         }
1002         lwkt_reltoken(&mpipe->pipe_wlock);
1003         lwkt_reltoken(&mpipe->pipe_rlock);
1004
1005         return (error);
1006 }
1007
1008 /*
1009  * MPSAFE
1010  */
1011 static int
1012 pipe_stat(struct file *fp, struct stat *ub, struct ucred *cred)
1013 {
1014         struct pipe *pipe;
1015
1016         pipe = (struct pipe *)fp->f_data;
1017
1018         bzero((caddr_t)ub, sizeof(*ub));
1019         ub->st_mode = S_IFIFO;
1020         ub->st_blksize = pipe->pipe_buffer.size;
1021         ub->st_size = pipe->pipe_buffer.windex - pipe->pipe_buffer.rindex;
1022         ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
1023         ub->st_atimespec = pipe->pipe_atime;
1024         ub->st_mtimespec = pipe->pipe_mtime;
1025         ub->st_ctimespec = pipe->pipe_ctime;
1026         /*
1027          * Left as 0: st_dev, st_ino, st_nlink, st_uid, st_gid, st_rdev,
1028          * st_flags, st_gen.
1029          * XXX (st_dev, st_ino) should be unique.
1030          */
1031         return (0);
1032 }
1033
1034 static int
1035 pipe_close(struct file *fp)
1036 {
1037         struct pipe *cpipe;
1038
1039         cpipe = (struct pipe *)fp->f_data;
1040         fp->f_ops = &badfileops;
1041         fp->f_data = NULL;
1042         funsetown(&cpipe->pipe_sigio);
1043         pipeclose(cpipe);
1044         return (0);
1045 }
1046
1047 /*
1048  * Shutdown one or both directions of a full-duplex pipe.
1049  */
1050 static int
1051 pipe_shutdown(struct file *fp, int how)
1052 {
1053         struct pipe *rpipe;
1054         struct pipe *wpipe;
1055         int error = EPIPE;
1056
1057         rpipe = (struct pipe *)fp->f_data;
1058         wpipe = rpipe->pipe_peer;
1059
1060         /*
1061          * We modify pipe_state on both pipes, which means we need
1062          * all four tokens!
1063          */
1064         lwkt_gettoken(&rpipe->pipe_rlock);
1065         lwkt_gettoken(&rpipe->pipe_wlock);
1066         lwkt_gettoken(&wpipe->pipe_rlock);
1067         lwkt_gettoken(&wpipe->pipe_wlock);
1068
1069         switch(how) {
1070         case SHUT_RDWR:
1071         case SHUT_RD:
1072                 rpipe->pipe_state |= PIPE_REOF;         /* my reads */
1073                 rpipe->pipe_state |= PIPE_WEOF;         /* peer writes */
1074                 if (rpipe->pipe_state & PIPE_WANTR) {
1075                         rpipe->pipe_state &= ~PIPE_WANTR;
1076                         wakeup(rpipe);
1077                 }
1078                 if (rpipe->pipe_state & PIPE_WANTW) {
1079                         rpipe->pipe_state &= ~PIPE_WANTW;
1080                         wakeup(rpipe);
1081                 }
1082                 error = 0;
1083                 if (how == SHUT_RD)
1084                         break;
1085                 /* fall through */
1086         case SHUT_WR:
1087                 wpipe->pipe_state |= PIPE_REOF;         /* peer reads */
1088                 wpipe->pipe_state |= PIPE_WEOF;         /* my writes */
1089                 if (wpipe->pipe_state & PIPE_WANTR) {
1090                         wpipe->pipe_state &= ~PIPE_WANTR;
1091                         wakeup(wpipe);
1092                 }
1093                 if (wpipe->pipe_state & PIPE_WANTW) {
1094                         wpipe->pipe_state &= ~PIPE_WANTW;
1095                         wakeup(wpipe);
1096                 }
1097                 error = 0;
1098                 break;
1099         }
1100         pipewakeup(rpipe, 1);
1101         pipewakeup(wpipe, 1);
1102
1103         lwkt_reltoken(&wpipe->pipe_wlock);
1104         lwkt_reltoken(&wpipe->pipe_rlock);
1105         lwkt_reltoken(&rpipe->pipe_wlock);
1106         lwkt_reltoken(&rpipe->pipe_rlock);
1107
1108         return (error);
1109 }
1110
1111 static void
1112 pipe_free_kmem(struct pipe *cpipe)
1113 {
1114         if (cpipe->pipe_buffer.buffer != NULL) {
1115                 if (cpipe->pipe_buffer.size > PIPE_SIZE)
1116                         atomic_subtract_int(&pipe_nbig, 1);
1117                 kmem_free(&kernel_map,
1118                         (vm_offset_t)cpipe->pipe_buffer.buffer,
1119                         cpipe->pipe_buffer.size);
1120                 cpipe->pipe_buffer.buffer = NULL;
1121                 cpipe->pipe_buffer.object = NULL;
1122         }
1123 }
1124
1125 /*
1126  * Close the pipe.  The slock must be held to interlock against simultanious
1127  * closes.  The rlock and wlock must be held to adjust the pipe_state.
1128  */
1129 static void
1130 pipeclose(struct pipe *cpipe)
1131 {
1132         globaldata_t gd;
1133         struct pipe *ppipe;
1134
1135         if (cpipe == NULL)
1136                 return;
1137
1138         /*
1139          * The slock may not have been allocated yet (close during
1140          * initialization)
1141          *
1142          * We need both the read and write tokens to modify pipe_state.
1143          */
1144         if (cpipe->pipe_slock)
1145                 lockmgr(cpipe->pipe_slock, LK_EXCLUSIVE);
1146         lwkt_gettoken(&cpipe->pipe_rlock);
1147         lwkt_gettoken(&cpipe->pipe_wlock);
1148
1149         /*
1150          * Set our state, wakeup anyone waiting in select/poll/kq, and
1151          * wakeup anyone blocked on our pipe.
1152          */
1153         cpipe->pipe_state |= PIPE_CLOSED | PIPE_REOF | PIPE_WEOF;
1154         pipewakeup(cpipe, 1);
1155         if (cpipe->pipe_state & (PIPE_WANTR | PIPE_WANTW)) {
1156                 cpipe->pipe_state &= ~(PIPE_WANTR | PIPE_WANTW);
1157                 wakeup(cpipe);
1158         }
1159
1160         /*
1161          * Disconnect from peer.
1162          */
1163         if ((ppipe = cpipe->pipe_peer) != NULL) {
1164                 lwkt_gettoken(&ppipe->pipe_rlock);
1165                 lwkt_gettoken(&ppipe->pipe_wlock);
1166                 ppipe->pipe_state |= PIPE_REOF | PIPE_WEOF;
1167                 pipewakeup(ppipe, 1);
1168                 if (ppipe->pipe_state & (PIPE_WANTR | PIPE_WANTW)) {
1169                         ppipe->pipe_state &= ~(PIPE_WANTR | PIPE_WANTW);
1170                         wakeup(ppipe);
1171                 }
1172                 if (SLIST_FIRST(&ppipe->pipe_kq.ki_note))
1173                         KNOTE(&ppipe->pipe_kq.ki_note, 0);
1174                 lwkt_reltoken(&ppipe->pipe_wlock);
1175                 lwkt_reltoken(&ppipe->pipe_rlock);
1176         }
1177
1178         /*
1179          * If the peer is also closed we can free resources for both
1180          * sides, otherwise we leave our side intact to deal with any
1181          * races (since we only have the slock).
1182          */
1183         if (ppipe && (ppipe->pipe_state & PIPE_CLOSED)) {
1184                 cpipe->pipe_peer = NULL;
1185                 ppipe->pipe_peer = NULL;
1186                 ppipe->pipe_slock = NULL;       /* we will free the slock */
1187                 pipeclose(ppipe);
1188                 ppipe = NULL;
1189         }
1190
1191         lwkt_reltoken(&cpipe->pipe_wlock);
1192         lwkt_reltoken(&cpipe->pipe_rlock);
1193         if (cpipe->pipe_slock)
1194                 lockmgr(cpipe->pipe_slock, LK_RELEASE);
1195
1196         /*
1197          * If we disassociated from our peer we can free resources
1198          */
1199         if (ppipe == NULL) {
1200                 gd = mycpu;
1201                 if (cpipe->pipe_slock) {
1202                         kfree(cpipe->pipe_slock, M_PIPE);
1203                         cpipe->pipe_slock = NULL;
1204                 }
1205                 if (gd->gd_pipeqcount >= pipe_maxcache ||
1206                     cpipe->pipe_buffer.size != PIPE_SIZE
1207                 ) {
1208                         pipe_free_kmem(cpipe);
1209                         kfree(cpipe, M_PIPE);
1210                 } else {
1211                         cpipe->pipe_state = 0;
1212                         cpipe->pipe_peer = gd->gd_pipeq;
1213                         gd->gd_pipeq = cpipe;
1214                         ++gd->gd_pipeqcount;
1215                 }
1216         }
1217 }
1218
1219 static int
1220 pipe_kqfilter(struct file *fp, struct knote *kn)
1221 {
1222         struct pipe *cpipe;
1223
1224         cpipe = (struct pipe *)kn->kn_fp->f_data;
1225
1226         switch (kn->kn_filter) {
1227         case EVFILT_READ:
1228                 kn->kn_fop = &pipe_rfiltops;
1229                 break;
1230         case EVFILT_WRITE:
1231                 kn->kn_fop = &pipe_wfiltops;
1232                 if (cpipe->pipe_peer == NULL) {
1233                         /* other end of pipe has been closed */
1234                         return (EPIPE);
1235                 }
1236                 break;
1237         default:
1238                 return (EOPNOTSUPP);
1239         }
1240         kn->kn_hook = (caddr_t)cpipe;
1241
1242         knote_insert(&cpipe->pipe_kq.ki_note, kn);
1243
1244         return (0);
1245 }
1246
1247 static void
1248 filt_pipedetach(struct knote *kn)
1249 {
1250         struct pipe *cpipe = (struct pipe *)kn->kn_hook;
1251
1252         knote_remove(&cpipe->pipe_kq.ki_note, kn);
1253 }
1254
1255 /*ARGSUSED*/
1256 static int
1257 filt_piperead(struct knote *kn, long hint)
1258 {
1259         struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1260         int ready = 0;
1261
1262         lwkt_gettoken(&rpipe->pipe_rlock);
1263         lwkt_gettoken(&rpipe->pipe_wlock);
1264
1265         kn->kn_data = rpipe->pipe_buffer.windex - rpipe->pipe_buffer.rindex;
1266
1267         if (rpipe->pipe_state & PIPE_REOF) {
1268                 /*
1269                  * Only set NODATA if all data has been exhausted
1270                  */
1271                 if (kn->kn_data == 0)
1272                         kn->kn_flags |= EV_NODATA;
1273                 kn->kn_flags |= EV_EOF; 
1274                 ready = 1;
1275         }
1276
1277         lwkt_reltoken(&rpipe->pipe_wlock);
1278         lwkt_reltoken(&rpipe->pipe_rlock);
1279
1280         if (!ready)
1281                 ready = kn->kn_data > 0;
1282
1283         return (ready);
1284 }
1285
1286 /*ARGSUSED*/
1287 static int
1288 filt_pipewrite(struct knote *kn, long hint)
1289 {
1290         struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1291         struct pipe *wpipe = rpipe->pipe_peer;
1292         int ready = 0;
1293
1294         kn->kn_data = 0;
1295         if (wpipe == NULL) {
1296                 kn->kn_flags |= (EV_EOF | EV_NODATA);
1297                 return (1);
1298         }
1299
1300         lwkt_gettoken(&wpipe->pipe_rlock);
1301         lwkt_gettoken(&wpipe->pipe_wlock);
1302
1303         if (wpipe->pipe_state & PIPE_WEOF) {
1304                 kn->kn_flags |= (EV_EOF | EV_NODATA);
1305                 ready = 1;
1306         }
1307
1308         if (!ready)
1309                 kn->kn_data = wpipe->pipe_buffer.size -
1310                               (wpipe->pipe_buffer.windex -
1311                                wpipe->pipe_buffer.rindex);
1312
1313         lwkt_reltoken(&wpipe->pipe_wlock);
1314         lwkt_reltoken(&wpipe->pipe_rlock);
1315
1316         if (!ready)
1317                 ready = kn->kn_data >= PIPE_BUF;
1318
1319         return (ready);
1320 }