FreeBSD/Linux Kernel Cross Reference
sys/kern/sys_pipe.c
1 /* $NetBSD: sys_pipe.c,v 1.64.2.1 2005/09/14 20:35:05 tron Exp $ */
2
3 /*-
4 * Copyright (c) 2003 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Paul Kranenburg.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the NetBSD
21 * Foundation, Inc. and its contributors.
22 * 4. Neither the name of The NetBSD Foundation nor the names of its
23 * contributors may be used to endorse or promote products derived
24 * from this software without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 */
38
39 /*
40 * Copyright (c) 1996 John S. Dyson
41 * All rights reserved.
42 *
43 * Redistribution and use in source and binary forms, with or without
44 * modification, are permitted provided that the following conditions
45 * are met:
46 * 1. Redistributions of source code must retain the above copyright
47 * notice immediately at the beginning of the file, without modification,
48 * this list of conditions, and the following disclaimer.
49 * 2. Redistributions in binary form must reproduce the above copyright
50 * notice, this list of conditions and the following disclaimer in the
51 * documentation and/or other materials provided with the distribution.
52 * 3. Absolutely no warranty of function or purpose is made by the author
53 * John S. Dyson.
54 * 4. Modifications may be freely made to this file if the above conditions
55 * are met.
56 *
57 * $FreeBSD: src/sys/kern/sys_pipe.c,v 1.95 2002/03/09 22:06:31 alfred Exp $
58 */
59
60 /*
61 * This file contains a high-performance replacement for the socket-based
62 * pipes scheme originally used in FreeBSD/4.4Lite. It does not support
63 * all features of sockets, but does do everything that pipes normally
64 * do.
65 *
66 * Adaption for NetBSD UVM, including uvm_loan() based direct write, was
67 * written by Jaromir Dolecek.
68 */
69
70 /*
71 * This code has two modes of operation, a small write mode and a large
72 * write mode. The small write mode acts like conventional pipes with
73 * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the
74 * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT
75 * and PIPE_SIZE in size it is mapped read-only into the kernel address space
76 * using the UVM page loan facility from where the receiving process can copy
77 * the data directly from the pages in the sending process.
78 *
79 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
80 * happen for small transfers so that the system will not spend all of
81 * its time context switching. PIPE_SIZE is constrained by the
82 * amount of kernel virtual memory.
83 */
84
85 #include <sys/cdefs.h>
86 __KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.64.2.1 2005/09/14 20:35:05 tron Exp $");
87
88 #include <sys/param.h>
89 #include <sys/systm.h>
90 #include <sys/proc.h>
91 #include <sys/fcntl.h>
92 #include <sys/file.h>
93 #include <sys/filedesc.h>
94 #include <sys/filio.h>
95 #include <sys/kernel.h>
96 #include <sys/lock.h>
97 #include <sys/ttycom.h>
98 #include <sys/stat.h>
99 #include <sys/malloc.h>
100 #include <sys/poll.h>
101 #include <sys/signalvar.h>
102 #include <sys/vnode.h>
103 #include <sys/uio.h>
104 #include <sys/lock.h>
105 #include <sys/select.h>
106 #include <sys/mount.h>
107 #include <sys/sa.h>
108 #include <sys/syscallargs.h>
109 #include <uvm/uvm.h>
110 #include <sys/sysctl.h>
111 #include <sys/kernel.h>
112
113 #include <sys/pipe.h>
114
/*
 * Stamp the timeval pointed to by the argument with the kernel's `time'
 * variable rather than calling microtime(9), which is slow.  The read of
 * `time' is deliberately not wrapped in splclock(9): pipe timestamps do
 * not need a strictly atomic snapshot.
 */
#define PIPE_TIMESTAMP(tv_ptr)	(*(tv_ptr) = time)
121
122
123 /*
124 * Use this define if you want to disable *fancy* VM things. Expect an
125 * approx 30% decrease in transfer rate.
126 */
127 /* #define PIPE_NODIRECT */
128
129 /*
130 * interfaces to the outside world
131 */
132 static int pipe_read(struct file *fp, off_t *offset, struct uio *uio,
133 struct ucred *cred, int flags);
134 static int pipe_write(struct file *fp, off_t *offset, struct uio *uio,
135 struct ucred *cred, int flags);
136 static int pipe_close(struct file *fp, struct proc *p);
137 static int pipe_poll(struct file *fp, int events, struct proc *p);
138 static int pipe_kqfilter(struct file *fp, struct knote *kn);
139 static int pipe_stat(struct file *fp, struct stat *sb, struct proc *p);
140 static int pipe_ioctl(struct file *fp, u_long cmd, void *data,
141 struct proc *p);
142
143 static const struct fileops pipeops = {
144 pipe_read, pipe_write, pipe_ioctl, fnullop_fcntl, pipe_poll,
145 pipe_stat, pipe_close, pipe_kqfilter
146 };
147
148 /*
149 * Default pipe buffer size(s), this can be kind-of large now because pipe
150 * space is pageable. The pipe code will try to maintain locality of
151 * reference for performance reasons, so small amounts of outstanding I/O
152 * will not wipe the cache.
153 */
154 #define MINPIPESIZE (PIPE_SIZE/3)
155 #define MAXPIPESIZE (2*PIPE_SIZE/3)
156
157 /*
158 * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
159 * is there so that on large systems, we don't exhaust it.
160 */
161 #define MAXPIPEKVA (8*1024*1024)
162 static int maxpipekva = MAXPIPEKVA;
163
164 /*
165 * Limit for direct transfers, we cannot, of course limit
166 * the amount of kva for pipes in general though.
167 */
168 #define LIMITPIPEKVA (16*1024*1024)
169 static int limitpipekva = LIMITPIPEKVA;
170
171 /*
172 * Limit the number of "big" pipes
173 */
174 #define LIMITBIGPIPES 32
175 static int maxbigpipes = LIMITBIGPIPES;
176 static int nbigpipe = 0;
177
178 /*
179 * Amount of KVA consumed by pipe buffers.
180 */
181 static int amountpipekva = 0;
182
183 MALLOC_DEFINE(M_PIPE, "pipe", "Pipe structures");
184
185 static void pipeclose(struct file *fp, struct pipe *pipe);
186 static void pipe_free_kmem(struct pipe *pipe);
187 static int pipe_create(struct pipe **pipep, int allockva);
188 static int pipelock(struct pipe *pipe, int catch);
189 static __inline void pipeunlock(struct pipe *pipe);
190 static void pipeselwakeup(struct pipe *pipe, struct pipe *sigp, int code);
191 #ifndef PIPE_NODIRECT
192 static int pipe_direct_write(struct file *fp, struct pipe *wpipe,
193 struct uio *uio);
194 #endif
195 static int pipespace(struct pipe *pipe, int size);
196
197 #ifndef PIPE_NODIRECT
198 static int pipe_loan_alloc(struct pipe *, int);
199 static void pipe_loan_free(struct pipe *);
200 #endif /* PIPE_NODIRECT */
201
202 static POOL_INIT(pipe_pool, sizeof(struct pipe), 0, 0, 0, "pipepl",
203 &pool_allocator_nointr);
204
205 /*
206 * The pipe system call for the DTYPE_PIPE type of pipes
207 */
208
209 /* ARGSUSED */
210 int
211 sys_pipe(l, v, retval)
212 struct lwp *l;
213 void *v;
214 register_t *retval;
215 {
216 struct file *rf, *wf;
217 struct pipe *rpipe, *wpipe;
218 int fd, error;
219 struct proc *p;
220
221 p = l->l_proc;
222 rpipe = wpipe = NULL;
223 if (pipe_create(&rpipe, 1) || pipe_create(&wpipe, 0)) {
224 pipeclose(NULL, rpipe);
225 pipeclose(NULL, wpipe);
226 return (ENFILE);
227 }
228
229 /*
230 * Note: the file structure returned from falloc() is marked
231 * as 'larval' initially. Unless we mark it as 'mature' by
232 * FILE_SET_MATURE(), any attempt to do anything with it would
233 * return EBADF, including e.g. dup(2) or close(2). This avoids
234 * file descriptor races if we block in the second falloc().
235 */
236
237 error = falloc(p, &rf, &fd);
238 if (error)
239 goto free2;
240 retval[0] = fd;
241 rf->f_flag = FREAD;
242 rf->f_type = DTYPE_PIPE;
243 rf->f_data = (caddr_t)rpipe;
244 rf->f_ops = &pipeops;
245
246 error = falloc(p, &wf, &fd);
247 if (error)
248 goto free3;
249 retval[1] = fd;
250 wf->f_flag = FWRITE;
251 wf->f_type = DTYPE_PIPE;
252 wf->f_data = (caddr_t)wpipe;
253 wf->f_ops = &pipeops;
254
255 rpipe->pipe_peer = wpipe;
256 wpipe->pipe_peer = rpipe;
257
258 FILE_SET_MATURE(rf);
259 FILE_SET_MATURE(wf);
260 FILE_UNUSE(rf, p);
261 FILE_UNUSE(wf, p);
262 return (0);
263 free3:
264 FILE_UNUSE(rf, p);
265 ffree(rf);
266 fdremove(p->p_fd, retval[0]);
267 free2:
268 pipeclose(NULL, wpipe);
269 pipeclose(NULL, rpipe);
270
271 return (error);
272 }
273
274 /*
275 * Allocate kva for pipe circular buffer, the space is pageable
276 * This routine will 'realloc' the size of a pipe safely, if it fails
277 * it will retain the old buffer.
278 * If it fails it will return ENOMEM.
279 */
280 static int
281 pipespace(pipe, size)
282 struct pipe *pipe;
283 int size;
284 {
285 caddr_t buffer;
286 /*
287 * Allocate pageable virtual address space. Physical memory is
288 * allocated on demand.
289 */
290 buffer = (caddr_t) uvm_km_valloc(kernel_map, round_page(size));
291 if (buffer == NULL)
292 return (ENOMEM);
293
294 /* free old resources if we're resizing */
295 pipe_free_kmem(pipe);
296 pipe->pipe_buffer.buffer = buffer;
297 pipe->pipe_buffer.size = size;
298 pipe->pipe_buffer.in = 0;
299 pipe->pipe_buffer.out = 0;
300 pipe->pipe_buffer.cnt = 0;
301 amountpipekva += pipe->pipe_buffer.size;
302 return (0);
303 }
304
305 /*
306 * Initialize and allocate VM and memory for pipe.
307 */
308 static int
309 pipe_create(pipep, allockva)
310 struct pipe **pipep;
311 int allockva;
312 {
313 struct pipe *pipe;
314 int error;
315
316 pipe = *pipep = pool_get(&pipe_pool, PR_WAITOK);
317
318 /* Initialize */
319 memset(pipe, 0, sizeof(struct pipe));
320 pipe->pipe_state = PIPE_SIGNALR;
321
322 PIPE_TIMESTAMP(&pipe->pipe_ctime);
323 pipe->pipe_atime = pipe->pipe_ctime;
324 pipe->pipe_mtime = pipe->pipe_ctime;
325 simple_lock_init(&pipe->pipe_slock);
326 lockinit(&pipe->pipe_lock, PSOCK | PCATCH, "pipelk", 0, 0);
327
328 if (allockva && (error = pipespace(pipe, PIPE_SIZE)))
329 return (error);
330
331 return (0);
332 }
333
334
335 /*
336 * Lock a pipe for I/O, blocking other access
337 * Called with pipe spin lock held.
338 * Return with pipe spin lock released on success.
339 */
340 static int
341 pipelock(pipe, catch)
342 struct pipe *pipe;
343 int catch;
344 {
345 int error;
346
347 LOCK_ASSERT(simple_lock_held(&pipe->pipe_slock));
348
349 while (1) {
350 error = lockmgr(&pipe->pipe_lock, LK_EXCLUSIVE | LK_INTERLOCK,
351 &pipe->pipe_slock);
352 if (error == 0)
353 break;
354
355 simple_lock(&pipe->pipe_slock);
356 if (catch || (error != EINTR && error != ERESTART))
357 break;
358 /*
359 * XXX XXX XXX
360 * The pipe lock is initialised with PCATCH on and we cannot
361 * override this in a lockmgr() call. Thus a pending signal
362 * will cause lockmgr() to return with EINTR or ERESTART.
363 * We cannot simply re-enter lockmgr() at this point since
364 * the pending signals have not yet been posted and would
365 * cause an immediate EINTR/ERESTART return again.
366 * As a workaround we pause for a while here, giving the lock
367 * a chance to drain, before trying again.
368 * XXX XXX XXX
369 *
370 * NOTE: Consider dropping PCATCH from this lock; in practice
371 * it is never held for long enough periods for having it
372 * interruptable at the start of pipe_read/pipe_write to be
373 * beneficial.
374 */
375 (void) ltsleep(&lbolt, PSOCK, "rstrtpipelock", hz,
376 &pipe->pipe_slock);
377 }
378 return (error);
379 }
380
381 /*
382 * unlock a pipe I/O lock
383 */
384 static __inline void
385 pipeunlock(pipe)
386 struct pipe *pipe;
387 {
388
389 lockmgr(&pipe->pipe_lock, LK_RELEASE, NULL);
390 }
391
392 /*
393 * Select/poll wakup. This also sends SIGIO to peer connected to
394 * 'sigpipe' side of pipe.
395 */
396 static void
397 pipeselwakeup(selp, sigp, code)
398 struct pipe *selp, *sigp;
399 int code;
400 {
401 int band;
402
403 selnotify(&selp->pipe_sel, NOTE_SUBMIT);
404
405 if (sigp == NULL || (sigp->pipe_state & PIPE_ASYNC) == 0)
406 return;
407
408 switch (code) {
409 case POLL_IN:
410 band = POLLIN|POLLRDNORM;
411 break;
412 case POLL_OUT:
413 band = POLLOUT|POLLWRNORM;
414 break;
415 case POLL_HUP:
416 band = POLLHUP;
417 break;
418 #if POLL_HUP != POLL_ERR
419 case POLL_ERR:
420 band = POLLERR;
421 break;
422 #endif
423 default:
424 band = 0;
425 #ifdef DIAGNOSTIC
426 printf("bad siginfo code %d in pipe notification.\n", code);
427 #endif
428 break;
429 }
430
431 fownsignal(sigp->pipe_pgid, SIGIO, code, band, selp);
432 }
433
434 /* ARGSUSED */
435 static int
436 pipe_read(fp, offset, uio, cred, flags)
437 struct file *fp;
438 off_t *offset;
439 struct uio *uio;
440 struct ucred *cred;
441 int flags;
442 {
443 struct pipe *rpipe = (struct pipe *) fp->f_data;
444 struct pipebuf *bp = &rpipe->pipe_buffer;
445 int error;
446 size_t nread = 0;
447 size_t size;
448 size_t ocnt;
449
450 PIPE_LOCK(rpipe);
451 ++rpipe->pipe_busy;
452 ocnt = bp->cnt;
453
454 again:
455 error = pipelock(rpipe, 1);
456 if (error)
457 goto unlocked_error;
458
459 while (uio->uio_resid) {
460 /*
461 * normal pipe buffer receive
462 */
463 if (bp->cnt > 0) {
464 size = bp->size - bp->out;
465 if (size > bp->cnt)
466 size = bp->cnt;
467 if (size > uio->uio_resid)
468 size = uio->uio_resid;
469
470 error = uiomove(&bp->buffer[bp->out], size, uio);
471 if (error)
472 break;
473
474 bp->out += size;
475 if (bp->out >= bp->size)
476 bp->out = 0;
477
478 bp->cnt -= size;
479
480 /*
481 * If there is no more to read in the pipe, reset
482 * its pointers to the beginning. This improves
483 * cache hit stats.
484 */
485 if (bp->cnt == 0) {
486 bp->in = 0;
487 bp->out = 0;
488 }
489 nread += size;
490 #ifndef PIPE_NODIRECT
491 } else if ((rpipe->pipe_state & PIPE_DIRECTR) != 0) {
492 /*
493 * Direct copy, bypassing a kernel buffer.
494 */
495 caddr_t va;
496
497 KASSERT(rpipe->pipe_state & PIPE_DIRECTW);
498
499 size = rpipe->pipe_map.cnt;
500 if (size > uio->uio_resid)
501 size = uio->uio_resid;
502
503 va = (caddr_t) rpipe->pipe_map.kva +
504 rpipe->pipe_map.pos;
505 error = uiomove(va, size, uio);
506 if (error)
507 break;
508 nread += size;
509 rpipe->pipe_map.pos += size;
510 rpipe->pipe_map.cnt -= size;
511 if (rpipe->pipe_map.cnt == 0) {
512 PIPE_LOCK(rpipe);
513 rpipe->pipe_state &= ~PIPE_DIRECTR;
514 wakeup(rpipe);
515 PIPE_UNLOCK(rpipe);
516 }
517 #endif
518 } else {
519 /*
520 * Break if some data was read.
521 */
522 if (nread > 0)
523 break;
524
525 PIPE_LOCK(rpipe);
526
527 /*
528 * detect EOF condition
529 * read returns 0 on EOF, no need to set error
530 */
531 if (rpipe->pipe_state & PIPE_EOF) {
532 PIPE_UNLOCK(rpipe);
533 break;
534 }
535
536 /*
537 * don't block on non-blocking I/O
538 */
539 if (fp->f_flag & FNONBLOCK) {
540 PIPE_UNLOCK(rpipe);
541 error = EAGAIN;
542 break;
543 }
544
545 /*
546 * Unlock the pipe buffer for our remaining processing.
547 * We will either break out with an error or we will
548 * sleep and relock to loop.
549 */
550 pipeunlock(rpipe);
551
552 /*
553 * The PIPE_DIRECTR flag is not under the control
554 * of the long-term lock (see pipe_direct_write()),
555 * so re-check now while holding the spin lock.
556 */
557 if ((rpipe->pipe_state & PIPE_DIRECTR) != 0)
558 goto again;
559
560 /*
561 * We want to read more, wake up select/poll.
562 */
563 pipeselwakeup(rpipe, rpipe->pipe_peer, POLL_IN);
564
565 /*
566 * If the "write-side" is blocked, wake it up now.
567 */
568 if (rpipe->pipe_state & PIPE_WANTW) {
569 rpipe->pipe_state &= ~PIPE_WANTW;
570 wakeup(rpipe);
571 }
572
573 /* Now wait until the pipe is filled */
574 rpipe->pipe_state |= PIPE_WANTR;
575 error = ltsleep(rpipe, PSOCK | PCATCH,
576 "piperd", 0, &rpipe->pipe_slock);
577 if (error != 0)
578 goto unlocked_error;
579 goto again;
580 }
581 }
582
583 if (error == 0)
584 PIPE_TIMESTAMP(&rpipe->pipe_atime);
585
586 PIPE_LOCK(rpipe);
587 pipeunlock(rpipe);
588
589 unlocked_error:
590 --rpipe->pipe_busy;
591
592 /*
593 * PIPE_WANTCLOSE processing only makes sense if pipe_busy is 0.
594 */
595 if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANTCLOSE)) {
596 rpipe->pipe_state &= ~(PIPE_WANTCLOSE|PIPE_WANTW);
597 wakeup(rpipe);
598 } else if (bp->cnt < MINPIPESIZE) {
599 /*
600 * Handle write blocking hysteresis.
601 */
602 if (rpipe->pipe_state & PIPE_WANTW) {
603 rpipe->pipe_state &= ~PIPE_WANTW;
604 wakeup(rpipe);
605 }
606 }
607
608 /*
609 * If anything was read off the buffer, signal to the writer it's
610 * possible to write more data. Also send signal if we are here for the
611 * first time after last write.
612 */
613 if ((bp->size - bp->cnt) >= PIPE_BUF
614 && (ocnt != bp->cnt || (rpipe->pipe_state & PIPE_SIGNALR))) {
615 pipeselwakeup(rpipe, rpipe->pipe_peer, POLL_OUT);
616 rpipe->pipe_state &= ~PIPE_SIGNALR;
617 }
618
619 PIPE_UNLOCK(rpipe);
620 return (error);
621 }
622
623 #ifndef PIPE_NODIRECT
624 /*
625 * Allocate structure for loan transfer.
626 */
627 static int
628 pipe_loan_alloc(wpipe, npages)
629 struct pipe *wpipe;
630 int npages;
631 {
632 vsize_t len;
633
634 len = (vsize_t)npages << PAGE_SHIFT;
635 wpipe->pipe_map.kva = uvm_km_valloc_wait(kernel_map, len);
636 if (wpipe->pipe_map.kva == 0)
637 return (ENOMEM);
638
639 amountpipekva += len;
640 wpipe->pipe_map.npages = npages;
641 wpipe->pipe_map.pgs = malloc(npages * sizeof(struct vm_page *), M_PIPE,
642 M_WAITOK);
643 return (0);
644 }
645
646 /*
647 * Free resources allocated for loan transfer.
648 */
649 static void
650 pipe_loan_free(wpipe)
651 struct pipe *wpipe;
652 {
653 vsize_t len;
654
655 len = (vsize_t)wpipe->pipe_map.npages << PAGE_SHIFT;
656 uvm_km_free(kernel_map, wpipe->pipe_map.kva, len);
657 wpipe->pipe_map.kva = 0;
658 amountpipekva -= len;
659 free(wpipe->pipe_map.pgs, M_PIPE);
660 wpipe->pipe_map.pgs = NULL;
661 }
662
663 /*
664 * NetBSD direct write, using uvm_loan() mechanism.
665 * This implements the pipe buffer write mechanism. Note that only
666 * a direct write OR a normal pipe write can be pending at any given time.
667 * If there are any characters in the pipe buffer, the direct write will
668 * be deferred until the receiving process grabs all of the bytes from
669 * the pipe buffer. Then the direct mapping write is set-up.
670 *
671 * Called with the long-term pipe lock held.
672 */
673 static int
674 pipe_direct_write(fp, wpipe, uio)
675 struct file *fp;
676 struct pipe *wpipe;
677 struct uio *uio;
678 {
679 int error, npages, j;
680 struct vm_page **pgs;
681 vaddr_t bbase, kva, base, bend;
682 vsize_t blen, bcnt;
683 voff_t bpos;
684
685 KASSERT(wpipe->pipe_map.cnt == 0);
686
687 /*
688 * Handle first PIPE_CHUNK_SIZE bytes of buffer. Deal with buffers
689 * not aligned to PAGE_SIZE.
690 */
691 bbase = (vaddr_t)uio->uio_iov->iov_base;
692 base = trunc_page(bbase);
693 bend = round_page(bbase + uio->uio_iov->iov_len);
694 blen = bend - base;
695 bpos = bbase - base;
696
697 if (blen > PIPE_DIRECT_CHUNK) {
698 blen = PIPE_DIRECT_CHUNK;
699 bend = base + blen;
700 bcnt = PIPE_DIRECT_CHUNK - bpos;
701 } else {
702 bcnt = uio->uio_iov->iov_len;
703 }
704 npages = blen >> PAGE_SHIFT;
705
706 /*
707 * Free the old kva if we need more pages than we have
708 * allocated.
709 */
710 if (wpipe->pipe_map.kva != 0 && npages > wpipe->pipe_map.npages)
711 pipe_loan_free(wpipe);
712
713 /* Allocate new kva. */
714 if (wpipe->pipe_map.kva == 0) {
715 error = pipe_loan_alloc(wpipe, npages);
716 if (error)
717 return (error);
718 }
719
720 /* Loan the write buffer memory from writer process */
721 pgs = wpipe->pipe_map.pgs;
722 error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, base, blen,
723 pgs, UVM_LOAN_TOPAGE);
724 if (error) {
725 pipe_loan_free(wpipe);
726 return (ENOMEM); /* so that caller fallback to ordinary write */
727 }
728
729 /* Enter the loaned pages to kva */
730 kva = wpipe->pipe_map.kva;
731 for (j = 0; j < npages; j++, kva += PAGE_SIZE) {
732 pmap_kenter_pa(kva, VM_PAGE_TO_PHYS(pgs[j]), VM_PROT_READ);
733 }
734 pmap_update(pmap_kernel());
735
736 /* Now we can put the pipe in direct write mode */
737 wpipe->pipe_map.pos = bpos;
738 wpipe->pipe_map.cnt = bcnt;
739 wpipe->pipe_state |= PIPE_DIRECTW;
740
741 /*
742 * But before we can let someone do a direct read,
743 * we have to wait until the pipe is drained.
744 */
745
746 /* Relase the pipe lock while we wait */
747 PIPE_LOCK(wpipe);
748 pipeunlock(wpipe);
749
750 while (error == 0 && wpipe->pipe_buffer.cnt > 0) {
751 if (wpipe->pipe_state & PIPE_WANTR) {
752 wpipe->pipe_state &= ~PIPE_WANTR;
753 wakeup(wpipe);
754 }
755
756 wpipe->pipe_state |= PIPE_WANTW;
757 error = ltsleep(wpipe, PSOCK | PCATCH, "pipdwc", 0,
758 &wpipe->pipe_slock);
759 if (error == 0 && wpipe->pipe_state & PIPE_EOF)
760 error = EPIPE;
761 }
762
763 /* Pipe is drained; next read will off the direct buffer */
764 wpipe->pipe_state |= PIPE_DIRECTR;
765
766 /* Wait until the reader is done */
767 while (error == 0 && (wpipe->pipe_state & PIPE_DIRECTR)) {
768 if (wpipe->pipe_state & PIPE_WANTR) {
769 wpipe->pipe_state &= ~PIPE_WANTR;
770 wakeup(wpipe);
771 }
772 pipeselwakeup(wpipe, wpipe, POLL_IN);
773 error = ltsleep(wpipe, PSOCK | PCATCH, "pipdwt", 0,
774 &wpipe->pipe_slock);
775 if (error == 0 && wpipe->pipe_state & PIPE_EOF)
776 error = EPIPE;
777 }
778
779 /* Take pipe out of direct write mode */
780 wpipe->pipe_state &= ~(PIPE_DIRECTW | PIPE_DIRECTR);
781
782 /* Acquire the pipe lock and cleanup */
783 (void)pipelock(wpipe, 0);
784 if (pgs != NULL) {
785 pmap_kremove(wpipe->pipe_map.kva, blen);
786 uvm_unloan(pgs, npages, UVM_LOAN_TOPAGE);
787 }
788 if (error || amountpipekva > maxpipekva)
789 pipe_loan_free(wpipe);
790
791 if (error) {
792 pipeselwakeup(wpipe, wpipe, POLL_ERR);
793
794 /*
795 * If nothing was read from what we offered, return error
796 * straight on. Otherwise update uio resid first. Caller
797 * will deal with the error condition, returning short
798 * write, error, or restarting the write(2) as appropriate.
799 */
800 if (wpipe->pipe_map.cnt == bcnt) {
801 wpipe->pipe_map.cnt = 0;
802 wakeup(wpipe);
803 return (error);
804 }
805
806 bcnt -= wpipe->pipe_map.cnt;
807 }
808
809 uio->uio_resid -= bcnt;
810 /* uio_offset not updated, not set/used for write(2) */
811 uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + bcnt;
812 uio->uio_iov->iov_len -= bcnt;
813 if (uio->uio_iov->iov_len == 0) {
814 uio->uio_iov++;
815 uio->uio_iovcnt--;
816 }
817
818 wpipe->pipe_map.cnt = 0;
819 return (error);
820 }
821 #endif /* !PIPE_NODIRECT */
822
823 static int
824 pipe_write(fp, offset, uio, cred, flags)
825 struct file *fp;
826 off_t *offset;
827 struct uio *uio;
828 struct ucred *cred;
829 int flags;
830 {
831 struct pipe *wpipe, *rpipe;
832 struct pipebuf *bp;
833 int error;
834
835 /* We want to write to our peer */
836 rpipe = (struct pipe *) fp->f_data;
837
838 retry:
839 error = 0;
840 PIPE_LOCK(rpipe);
841 wpipe = rpipe->pipe_peer;
842
843 /*
844 * Detect loss of pipe read side, issue SIGPIPE if lost.
845 */
846 if (wpipe == NULL)
847 error = EPIPE;
848 else if (simple_lock_try(&wpipe->pipe_slock) == 0) {
849 /* Deal with race for peer */
850 PIPE_UNLOCK(rpipe);
851 goto retry;
852 } else if ((wpipe->pipe_state & PIPE_EOF) != 0) {
853 PIPE_UNLOCK(wpipe);
854 error = EPIPE;
855 }
856
857 PIPE_UNLOCK(rpipe);
858 if (error != 0)
859 return (error);
860
861 ++wpipe->pipe_busy;
862
863 /* Aquire the long-term pipe lock */
864 if ((error = pipelock(wpipe,1)) != 0) {
865 --wpipe->pipe_busy;
866 if (wpipe->pipe_busy == 0
867 && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
868 wpipe->pipe_state &= ~(PIPE_WANTCLOSE | PIPE_WANTR);
869 wakeup(wpipe);
870 }
871 PIPE_UNLOCK(wpipe);
872 return (error);
873 }
874
875 bp = &wpipe->pipe_buffer;
876
877 /*
878 * If it is advantageous to resize the pipe buffer, do so.
879 */
880 if ((uio->uio_resid > PIPE_SIZE) &&
881 (nbigpipe < maxbigpipes) &&
882 #ifndef PIPE_NODIRECT
883 (wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
884 #endif
885 (bp->size <= PIPE_SIZE) && (bp->cnt == 0)) {
886
887 if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
888 nbigpipe++;
889 }
890
891 while (uio->uio_resid) {
892 size_t space;
893
894 #ifndef PIPE_NODIRECT
895 /*
896 * Pipe buffered writes cannot be coincidental with
897 * direct writes. Also, only one direct write can be
898 * in progress at any one time. We wait until the currently
899 * executing direct write is completed before continuing.
900 *
901 * We break out if a signal occurs or the reader goes away.
902 */
903 while (error == 0 && wpipe->pipe_state & PIPE_DIRECTW) {
904 PIPE_LOCK(wpipe);
905 if (wpipe->pipe_state & PIPE_WANTR) {
906 wpipe->pipe_state &= ~PIPE_WANTR;
907 wakeup(wpipe);
908 }
909 pipeunlock(wpipe);
910 error = ltsleep(wpipe, PSOCK | PCATCH,
911 "pipbww", 0, &wpipe->pipe_slock);
912
913 (void)pipelock(wpipe, 0);
914 if (wpipe->pipe_state & PIPE_EOF)
915 error = EPIPE;
916 }
917 if (error)
918 break;
919
920 /*
921 * If the transfer is large, we can gain performance if
922 * we do process-to-process copies directly.
923 * If the write is non-blocking, we don't use the
924 * direct write mechanism.
925 *
926 * The direct write mechanism will detect the reader going
927 * away on us.
928 */
929 if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
930 (fp->f_flag & FNONBLOCK) == 0 &&
931 (wpipe->pipe_map.kva || (amountpipekva < limitpipekva))) {
932 error = pipe_direct_write(fp, wpipe, uio);
933
934 /*
935 * Break out if error occurred, unless it's ENOMEM.
936 * ENOMEM means we failed to allocate some resources
937 * for direct write, so we just fallback to ordinary
938 * write. If the direct write was successful,
939 * process rest of data via ordinary write.
940 */
941 if (error == 0)
942 continue;
943
944 if (error != ENOMEM)
945 break;
946 }
947 #endif /* PIPE_NODIRECT */
948
949 space = bp->size - bp->cnt;
950
951 /* Writes of size <= PIPE_BUF must be atomic. */
952 if ((space < uio->uio_resid) && (uio->uio_resid <= PIPE_BUF))
953 space = 0;
954
955 if (space > 0) {
956 int size; /* Transfer size */
957 int segsize; /* first segment to transfer */
958
959 /*
960 * Transfer size is minimum of uio transfer
961 * and free space in pipe buffer.
962 */
963 if (space > uio->uio_resid)
964 size = uio->uio_resid;
965 else
966 size = space;
967 /*
968 * First segment to transfer is minimum of
969 * transfer size and contiguous space in
970 * pipe buffer. If first segment to transfer
971 * is less than the transfer size, we've got
972 * a wraparound in the buffer.
973 */
974 segsize = bp->size - bp->in;
975 if (segsize > size)
976 segsize = size;
977
978 /* Transfer first segment */
979 error = uiomove(&bp->buffer[bp->in], segsize, uio);
980
981 if (error == 0 && segsize < size) {
982 /*
983 * Transfer remaining part now, to
984 * support atomic writes. Wraparound
985 * happened.
986 */
987 #ifdef DEBUG
988 if (bp->in + segsize != bp->size)
989 panic("Expected pipe buffer wraparound disappeared");
990 #endif
991
992 error = uiomove(&bp->buffer[0],
993 size - segsize, uio);
994 }
995 if (error)
996 break;
997
998 bp->in += size;
999 if (bp->in >= bp->size) {
1000 #ifdef DEBUG
1001 if (bp->in != size - segsize + bp->size)
1002 panic("Expected wraparound bad");
1003 #endif
1004 bp->in = size - segsize;
1005 }
1006
1007 bp->cnt += size;
1008 #ifdef DEBUG
1009 if (bp->cnt > bp->size)
1010 panic("Pipe buffer overflow");
1011 #endif
1012 } else {
1013 /*
1014 * If the "read-side" has been blocked, wake it up now.
1015 */
1016 PIPE_LOCK(wpipe);
1017 if (wpipe->pipe_state & PIPE_WANTR) {
1018 wpipe->pipe_state &= ~PIPE_WANTR;
1019 wakeup(wpipe);
1020 }
1021 PIPE_UNLOCK(wpipe);
1022
1023 /*
1024 * don't block on non-blocking I/O
1025 */
1026 if (fp->f_flag & FNONBLOCK) {
1027 error = EAGAIN;
1028 break;
1029 }
1030
1031 /*
1032 * We have no more space and have something to offer,
1033 * wake up select/poll.
1034 */
1035 if (bp->cnt)
1036 pipeselwakeup(wpipe, wpipe, POLL_OUT);
1037
1038 PIPE_LOCK(wpipe);
1039 pipeunlock(wpipe);
1040 wpipe->pipe_state |= PIPE_WANTW;
1041 error = ltsleep(wpipe, PSOCK | PCATCH, "pipewr", 0,
1042 &wpipe->pipe_slock);
1043 (void)pipelock(wpipe, 0);
1044 if (error != 0)
1045 break;
1046 /*
1047 * If read side wants to go away, we just issue a signal
1048 * to ourselves.
1049 */
1050 if (wpipe->pipe_state & PIPE_EOF) {
1051 error = EPIPE;
1052 break;
1053 }
1054 }
1055 }
1056
1057 PIPE_LOCK(wpipe);
1058 --wpipe->pipe_busy;
1059 if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
1060 wpipe->pipe_state &= ~(PIPE_WANTCLOSE | PIPE_WANTR);
1061 wakeup(wpipe);
1062 } else if (bp->cnt > 0) {
1063 /*
1064 * If we have put any characters in the buffer, we wake up
1065 * the reader.
1066 */
1067 if (wpipe->pipe_state & PIPE_WANTR) {
1068 wpipe->pipe_state &= ~PIPE_WANTR;
1069 wakeup(wpipe);
1070 }
1071 }
1072
1073 /*
1074 * Don't return EPIPE if I/O was successful
1075 */
1076 if (error == EPIPE && bp->cnt == 0 && uio->uio_resid == 0)
1077 error = 0;
1078
1079 if (error == 0)
1080 PIPE_TIMESTAMP(&wpipe->pipe_mtime);
1081
1082 /*
1083 * We have something to offer, wake up select/poll.
1084 * wpipe->pipe_map.cnt is always 0 in this point (direct write
1085 * is only done synchronously), so check only wpipe->pipe_buffer.cnt
1086 */
1087 if (bp->cnt)
1088 pipeselwakeup(wpipe, wpipe, POLL_OUT);
1089
1090 /*
1091 * Arrange for next read(2) to do a signal.
1092 */
1093 wpipe->pipe_state |= PIPE_SIGNALR;
1094
1095 pipeunlock(wpipe);
1096 PIPE_UNLOCK(wpipe);
1097 return (error);
1098 }
1099
1100 /*
1101 * we implement a very minimal set of ioctls for compatibility with sockets.
1102 */
1103 int
1104 pipe_ioctl(fp, cmd, data, p)
1105 struct file *fp;
1106 u_long cmd;
1107 void *data;
1108 struct proc *p;
1109 {
1110 struct pipe *pipe = (struct pipe *)fp->f_data;
1111
1112 switch (cmd) {
1113
1114 case FIONBIO:
1115 return (0);
1116
1117 case FIOASYNC:
1118 PIPE_LOCK(pipe);
1119 if (*(int *)data) {
1120 pipe->pipe_state |= PIPE_ASYNC;
1121 } else {
1122 pipe->pipe_state &= ~PIPE_ASYNC;
1123 }
1124 PIPE_UNLOCK(pipe);
1125 return (0);
1126
1127 case FIONREAD:
1128 PIPE_LOCK(pipe);
1129 #ifndef PIPE_NODIRECT
1130 if (pipe->pipe_state & PIPE_DIRECTW)
1131 *(int *)data = pipe->pipe_map.cnt;
1132 else
1133 #endif
1134 *(int *)data = pipe->pipe_buffer.cnt;
1135 PIPE_UNLOCK(pipe);
1136 return (0);
1137
1138 case FIONWRITE:
1139 /* Look at other side */
1140 pipe = pipe->pipe_peer;
1141 PIPE_LOCK(pipe);
1142 #ifndef PIPE_NODIRECT
1143 if (pipe->pipe_state & PIPE_DIRECTW)
1144 *(int *)data = pipe->pipe_map.cnt;
1145 else
1146 #endif
1147 *(int *)data = pipe->pipe_buffer.cnt;
1148 PIPE_UNLOCK(pipe);
1149 return (0);
1150
1151 case FIONSPACE:
1152 /* Look at other side */
1153 pipe = pipe->pipe_peer;
1154 PIPE_LOCK(pipe);
1155 #ifndef PIPE_NODIRECT
1156 /*
1157 * If we're in direct-mode, we don't really have a
1158 * send queue, and any other write will block. Thus
1159 * zero seems like the best answer.
1160 */
1161 if (pipe->pipe_state & PIPE_DIRECTW)
1162 *(int *)data = 0;
1163 else
1164 #endif
1165 *(int *)data = pipe->pipe_buffer.size -
1166 pipe->pipe_buffer.cnt;
1167 PIPE_UNLOCK(pipe);
1168 return (0);
1169
1170 case TIOCSPGRP:
1171 case FIOSETOWN:
1172 return fsetown(p, &pipe->pipe_pgid, cmd, data);
1173
1174 case TIOCGPGRP:
1175 case FIOGETOWN:
1176 return fgetown(p, pipe->pipe_pgid, cmd, data);
1177
1178 }
1179 return (EPASSTHROUGH);
1180 }
1181
1182 int
1183 pipe_poll(fp, events, td)
1184 struct file *fp;
1185 int events;
1186 struct proc *td;
1187 {
1188 struct pipe *rpipe = (struct pipe *)fp->f_data;
1189 struct pipe *wpipe;
1190 int eof = 0;
1191 int revents = 0;
1192
1193 retry:
1194 PIPE_LOCK(rpipe);
1195 wpipe = rpipe->pipe_peer;
1196 if (wpipe != NULL && simple_lock_try(&wpipe->pipe_slock) == 0) {
1197 /* Deal with race for peer */
1198 PIPE_UNLOCK(rpipe);
1199 goto retry;
1200 }
1201
1202 if (events & (POLLIN | POLLRDNORM))
1203 if ((rpipe->pipe_buffer.cnt > 0) ||
1204 #ifndef PIPE_NODIRECT
1205 (rpipe->pipe_state & PIPE_DIRECTR) ||
1206 #endif
1207 (rpipe->pipe_state & PIPE_EOF))
1208 revents |= events & (POLLIN | POLLRDNORM);
1209
1210 eof |= (rpipe->pipe_state & PIPE_EOF);
1211 PIPE_UNLOCK(rpipe);
1212
1213 if (wpipe == NULL)
1214 revents |= events & (POLLOUT | POLLWRNORM);
1215 else {
1216 if (events & (POLLOUT | POLLWRNORM))
1217 if ((wpipe->pipe_state & PIPE_EOF) || (
1218 #ifndef PIPE_NODIRECT
1219 (wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
1220 #endif
1221 (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
1222 revents |= events & (POLLOUT | POLLWRNORM);
1223
1224 eof |= (wpipe->pipe_state & PIPE_EOF);
1225 PIPE_UNLOCK(wpipe);
1226 }
1227
1228 if (wpipe == NULL || eof)
1229 revents |= POLLHUP;
1230
1231 if (revents == 0) {
1232 if (events & (POLLIN | POLLRDNORM))
1233 selrecord(td, &rpipe->pipe_sel);
1234
1235 if (events & (POLLOUT | POLLWRNORM))
1236 selrecord(td, &wpipe->pipe_sel);
1237 }
1238
1239 return (revents);
1240 }
1241
1242 static int
1243 pipe_stat(fp, ub, td)
1244 struct file *fp;
1245 struct stat *ub;
1246 struct proc *td;
1247 {
1248 struct pipe *pipe = (struct pipe *)fp->f_data;
1249
1250 memset((caddr_t)ub, 0, sizeof(*ub));
1251 ub->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
1252 ub->st_blksize = pipe->pipe_buffer.size;
1253 if (ub->st_blksize == 0 && pipe->pipe_peer)
1254 ub->st_blksize = pipe->pipe_peer->pipe_buffer.size;
1255 ub->st_size = pipe->pipe_buffer.cnt;
1256 ub->st_blocks = (ub->st_size) ? 1 : 0;
1257 TIMEVAL_TO_TIMESPEC(&pipe->pipe_atime, &ub->st_atimespec);
1258 TIMEVAL_TO_TIMESPEC(&pipe->pipe_mtime, &ub->st_mtimespec);
1259 TIMEVAL_TO_TIMESPEC(&pipe->pipe_ctime, &ub->st_ctimespec);
1260 ub->st_uid = fp->f_cred->cr_uid;
1261 ub->st_gid = fp->f_cred->cr_gid;
1262 /*
1263 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
1264 * XXX (st_dev, st_ino) should be unique.
1265 */
1266 return (0);
1267 }
1268
1269 /* ARGSUSED */
1270 static int
1271 pipe_close(fp, td)
1272 struct file *fp;
1273 struct proc *td;
1274 {
1275 struct pipe *pipe = (struct pipe *)fp->f_data;
1276
1277 fp->f_data = NULL;
1278 pipeclose(fp, pipe);
1279 return (0);
1280 }
1281
1282 static void
1283 pipe_free_kmem(pipe)
1284 struct pipe *pipe;
1285 {
1286
1287 if (pipe->pipe_buffer.buffer != NULL) {
1288 if (pipe->pipe_buffer.size > PIPE_SIZE)
1289 --nbigpipe;
1290 amountpipekva -= pipe->pipe_buffer.size;
1291 uvm_km_free(kernel_map,
1292 (vaddr_t)pipe->pipe_buffer.buffer,
1293 pipe->pipe_buffer.size);
1294 pipe->pipe_buffer.buffer = NULL;
1295 }
1296 #ifndef PIPE_NODIRECT
1297 if (pipe->pipe_map.kva != 0) {
1298 pipe_loan_free(pipe);
1299 pipe->pipe_map.cnt = 0;
1300 pipe->pipe_map.kva = 0;
1301 pipe->pipe_map.pos = 0;
1302 pipe->pipe_map.npages = 0;
1303 }
1304 #endif /* !PIPE_NODIRECT */
1305 }
1306
1307 /*
1308 * shutdown the pipe
1309 */
1310 static void
1311 pipeclose(fp, pipe)
1312 struct file *fp;
1313 struct pipe *pipe;
1314 {
1315 struct pipe *ppipe;
1316
1317 if (pipe == NULL)
1318 return;
1319
1320 retry:
1321 PIPE_LOCK(pipe);
1322
1323 pipeselwakeup(pipe, pipe, POLL_HUP);
1324
1325 /*
1326 * If the other side is blocked, wake it up saying that
1327 * we want to close it down.
1328 */
1329 pipe->pipe_state |= PIPE_EOF;
1330 while (pipe->pipe_busy) {
1331 wakeup(pipe);
1332 pipe->pipe_state |= PIPE_WANTCLOSE;
1333 ltsleep(pipe, PSOCK, "pipecl", 0, &pipe->pipe_slock);
1334 }
1335
1336 /*
1337 * Disconnect from peer
1338 */
1339 if ((ppipe = pipe->pipe_peer) != NULL) {
1340 /* Deal with race for peer */
1341 if (simple_lock_try(&ppipe->pipe_slock) == 0) {
1342 PIPE_UNLOCK(pipe);
1343 goto retry;
1344 }
1345 pipeselwakeup(ppipe, ppipe, POLL_HUP);
1346
1347 ppipe->pipe_state |= PIPE_EOF;
1348 wakeup(ppipe);
1349 ppipe->pipe_peer = NULL;
1350 PIPE_UNLOCK(ppipe);
1351 }
1352
1353 (void)lockmgr(&pipe->pipe_lock, LK_DRAIN | LK_INTERLOCK,
1354 &pipe->pipe_slock);
1355
1356 /*
1357 * free resources
1358 */
1359 pipe_free_kmem(pipe);
1360 pool_put(&pipe_pool, pipe);
1361 }
1362
1363 static void
1364 filt_pipedetach(struct knote *kn)
1365 {
1366 struct pipe *pipe = (struct pipe *)kn->kn_fp->f_data;
1367
1368 switch(kn->kn_filter) {
1369 case EVFILT_WRITE:
1370 /* need the peer structure, not our own */
1371 pipe = pipe->pipe_peer;
1372 /* XXXSMP: race for peer */
1373
1374 /* if reader end already closed, just return */
1375 if (pipe == NULL)
1376 return;
1377
1378 break;
1379 default:
1380 /* nothing to do */
1381 break;
1382 }
1383
1384 #ifdef DIAGNOSTIC
1385 if (kn->kn_hook != pipe)
1386 panic("filt_pipedetach: inconsistent knote");
1387 #endif
1388
1389 PIPE_LOCK(pipe);
1390 SLIST_REMOVE(&pipe->pipe_sel.sel_klist, kn, knote, kn_selnext);
1391 PIPE_UNLOCK(pipe);
1392 }
1393
1394 /*ARGSUSED*/
1395 static int
1396 filt_piperead(struct knote *kn, long hint)
1397 {
1398 struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1399 struct pipe *wpipe = rpipe->pipe_peer;
1400
1401 if ((hint & NOTE_SUBMIT) == 0)
1402 PIPE_LOCK(rpipe);
1403 kn->kn_data = rpipe->pipe_buffer.cnt;
1404 if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
1405 kn->kn_data = rpipe->pipe_map.cnt;
1406
1407 /* XXXSMP: race for peer */
1408 if ((rpipe->pipe_state & PIPE_EOF) ||
1409 (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1410 kn->kn_flags |= EV_EOF;
1411 if ((hint & NOTE_SUBMIT) == 0)
1412 PIPE_UNLOCK(rpipe);
1413 return (1);
1414 }
1415 if ((hint & NOTE_SUBMIT) == 0)
1416 PIPE_UNLOCK(rpipe);
1417 return (kn->kn_data > 0);
1418 }
1419
1420 /*ARGSUSED*/
1421 static int
1422 filt_pipewrite(struct knote *kn, long hint)
1423 {
1424 struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1425 struct pipe *wpipe = rpipe->pipe_peer;
1426
1427 if ((hint & NOTE_SUBMIT) == 0)
1428 PIPE_LOCK(rpipe);
1429 /* XXXSMP: race for peer */
1430 if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1431 kn->kn_data = 0;
1432 kn->kn_flags |= EV_EOF;
1433 if ((hint & NOTE_SUBMIT) == 0)
1434 PIPE_UNLOCK(rpipe);
1435 return (1);
1436 }
1437 kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1438 if (wpipe->pipe_state & PIPE_DIRECTW)
1439 kn->kn_data = 0;
1440
1441 if ((hint & NOTE_SUBMIT) == 0)
1442 PIPE_UNLOCK(rpipe);
1443 return (kn->kn_data >= PIPE_BUF);
1444 }
1445
1446 static const struct filterops pipe_rfiltops =
1447 { 1, NULL, filt_pipedetach, filt_piperead };
1448 static const struct filterops pipe_wfiltops =
1449 { 1, NULL, filt_pipedetach, filt_pipewrite };
1450
1451 /*ARGSUSED*/
1452 static int
1453 pipe_kqfilter(struct file *fp, struct knote *kn)
1454 {
1455 struct pipe *pipe;
1456
1457 pipe = (struct pipe *)kn->kn_fp->f_data;
1458 switch (kn->kn_filter) {
1459 case EVFILT_READ:
1460 kn->kn_fop = &pipe_rfiltops;
1461 break;
1462 case EVFILT_WRITE:
1463 kn->kn_fop = &pipe_wfiltops;
1464 /* XXXSMP: race for peer */
1465 pipe = pipe->pipe_peer;
1466 if (pipe == NULL) {
1467 /* other end of pipe has been closed */
1468 return (EBADF);
1469 }
1470 break;
1471 default:
1472 return (1);
1473 }
1474 kn->kn_hook = pipe;
1475
1476 PIPE_LOCK(pipe);
1477 SLIST_INSERT_HEAD(&pipe->pipe_sel.sel_klist, kn, kn_selnext);
1478 PIPE_UNLOCK(pipe);
1479 return (0);
1480 }
1481
1482 /*
1483 * Handle pipe sysctls.
1484 */
1485 SYSCTL_SETUP(sysctl_kern_pipe_setup, "sysctl kern.pipe subtree setup")
1486 {
1487
1488 sysctl_createv(clog, 0, NULL, NULL,
1489 CTLFLAG_PERMANENT,
1490 CTLTYPE_NODE, "kern", NULL,
1491 NULL, 0, NULL, 0,
1492 CTL_KERN, CTL_EOL);
1493 sysctl_createv(clog, 0, NULL, NULL,
1494 CTLFLAG_PERMANENT,
1495 CTLTYPE_NODE, "pipe",
1496 SYSCTL_DESCR("Pipe settings"),
1497 NULL, 0, NULL, 0,
1498 CTL_KERN, KERN_PIPE, CTL_EOL);
1499
1500 sysctl_createv(clog, 0, NULL, NULL,
1501 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1502 CTLTYPE_INT, "maxkvasz",
1503 SYSCTL_DESCR("Maximum amount of kernel memory to be "
1504 "used for pipes"),
1505 NULL, 0, &maxpipekva, 0,
1506 CTL_KERN, KERN_PIPE, KERN_PIPE_MAXKVASZ, CTL_EOL);
1507 sysctl_createv(clog, 0, NULL, NULL,
1508 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1509 CTLTYPE_INT, "maxloankvasz",
1510 SYSCTL_DESCR("Limit for direct transfers via page loan"),
1511 NULL, 0, &limitpipekva, 0,
1512 CTL_KERN, KERN_PIPE, KERN_PIPE_LIMITKVA, CTL_EOL);
1513 sysctl_createv(clog, 0, NULL, NULL,
1514 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1515 CTLTYPE_INT, "maxbigpipes",
1516 SYSCTL_DESCR("Maximum number of \"big\" pipes"),
1517 NULL, 0, &maxbigpipes, 0,
1518 CTL_KERN, KERN_PIPE, KERN_PIPE_MAXBIGPIPES, CTL_EOL);
1519 sysctl_createv(clog, 0, NULL, NULL,
1520 CTLFLAG_PERMANENT,
1521 CTLTYPE_INT, "nbigpipes",
1522 SYSCTL_DESCR("Number of \"big\" pipes"),
1523 NULL, 0, &nbigpipe, 0,
1524 CTL_KERN, KERN_PIPE, KERN_PIPE_NBIGPIPES, CTL_EOL);
1525 sysctl_createv(clog, 0, NULL, NULL,
1526 CTLFLAG_PERMANENT,
1527 CTLTYPE_INT, "kvasize",
1528 SYSCTL_DESCR("Amount of kernel memory consumed by pipe "
1529 "buffers"),
1530 NULL, 0, &amountpipekva, 0,
1531 CTL_KERN, KERN_PIPE, KERN_PIPE_KVASIZE, CTL_EOL);
1532 }
Cache object: 1fc336c870c8f94e69a9e6f1c176e364
|