1 /*-
2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
35 */
36
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD: releng/6.4/sys/kern/sys_generic.c 167225 2007-03-05 11:17:31Z bms $");
39
40 #include "opt_compat.h"
41 #include "opt_ktrace.h"
42
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/sysproto.h>
46 #include <sys/filedesc.h>
47 #include <sys/filio.h>
48 #include <sys/fcntl.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/signalvar.h>
52 #include <sys/socketvar.h>
53 #include <sys/uio.h>
54 #include <sys/kernel.h>
55 #include <sys/limits.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/resourcevar.h>
59 #include <sys/selinfo.h>
60 #include <sys/sleepqueue.h>
61 #include <sys/syscallsubr.h>
62 #include <sys/sysctl.h>
63 #include <sys/sysent.h>
64 #include <sys/vnode.h>
65 #include <sys/bio.h>
66 #include <sys/buf.h>
67 #include <sys/condvar.h>
68 #ifdef KTRACE
69 #include <sys/ktrace.h>
70 #endif
71 #include <vm/vm.h>
72 #include <vm/vm_page.h>
73
/*
 * Malloc types: ioctl argument staging buffers, select() bit vectors,
 * and large iovec arrays copied in from user space.  M_IOV is not
 * static, so other kernel files may allocate from it as well.
 */
static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

/* Forward declarations for the static helpers defined below. */
static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	dofileread(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static void	doselwakeup(struct selinfo *, int);
85
86 /*
87 * Read system call.
88 */
89 #ifndef _SYS_SYSPROTO_H_
90 struct read_args {
91 int fd;
92 void *buf;
93 size_t nbyte;
94 };
95 #endif
96 /*
97 * MPSAFE
98 */
99 int
100 read(td, uap)
101 struct thread *td;
102 struct read_args *uap;
103 {
104 struct uio auio;
105 struct iovec aiov;
106 int error;
107
108 if (uap->nbyte > INT_MAX)
109 return (EINVAL);
110 aiov.iov_base = uap->buf;
111 aiov.iov_len = uap->nbyte;
112 auio.uio_iov = &aiov;
113 auio.uio_iovcnt = 1;
114 auio.uio_resid = uap->nbyte;
115 auio.uio_segflg = UIO_USERSPACE;
116 error = kern_readv(td, uap->fd, &auio);
117 return(error);
118 }
119
120 /*
121 * Positioned read system call
122 */
123 #ifndef _SYS_SYSPROTO_H_
124 struct pread_args {
125 int fd;
126 void *buf;
127 size_t nbyte;
128 int pad;
129 off_t offset;
130 };
131 #endif
132 /*
133 * MPSAFE
134 */
135 int
136 pread(td, uap)
137 struct thread *td;
138 struct pread_args *uap;
139 {
140 struct uio auio;
141 struct iovec aiov;
142 int error;
143
144 if (uap->nbyte > INT_MAX)
145 return (EINVAL);
146 aiov.iov_base = uap->buf;
147 aiov.iov_len = uap->nbyte;
148 auio.uio_iov = &aiov;
149 auio.uio_iovcnt = 1;
150 auio.uio_resid = uap->nbyte;
151 auio.uio_segflg = UIO_USERSPACE;
152 error = kern_preadv(td, uap->fd, &auio, uap->offset);
153 return(error);
154 }
155
156 /*
157 * Scatter read system call.
158 */
159 #ifndef _SYS_SYSPROTO_H_
160 struct readv_args {
161 int fd;
162 struct iovec *iovp;
163 u_int iovcnt;
164 };
165 #endif
166 /*
167 * MPSAFE
168 */
169 int
170 readv(struct thread *td, struct readv_args *uap)
171 {
172 struct uio *auio;
173 int error;
174
175 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
176 if (error)
177 return (error);
178 error = kern_readv(td, uap->fd, auio);
179 free(auio, M_IOV);
180 return (error);
181 }
182
183 int
184 kern_readv(struct thread *td, int fd, struct uio *auio)
185 {
186 struct file *fp;
187 int error;
188
189 error = fget_read(td, fd, &fp);
190 if (error)
191 return (error);
192 error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
193 fdrop(fp, td);
194 return (error);
195 }
196
197 /*
198 * Scatter positioned read system call.
199 */
200 #ifndef _SYS_SYSPROTO_H_
201 struct preadv_args {
202 int fd;
203 struct iovec *iovp;
204 u_int iovcnt;
205 off_t offset;
206 };
207 #endif
208 /*
209 * MPSAFE
210 */
211 int
212 preadv(struct thread *td, struct preadv_args *uap)
213 {
214 struct uio *auio;
215 int error;
216
217 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
218 if (error)
219 return (error);
220 error = kern_preadv(td, uap->fd, auio, uap->offset);
221 free(auio, M_IOV);
222 return (error);
223 }
224
225 int
226 kern_preadv(td, fd, auio, offset)
227 struct thread *td;
228 int fd;
229 struct uio *auio;
230 off_t offset;
231 {
232 struct file *fp;
233 int error;
234
235 error = fget_read(td, fd, &fp);
236 if (error)
237 return (error);
238 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
239 error = ESPIPE;
240 else if (offset < 0 && fp->f_vnode->v_type != VCHR)
241 error = EINVAL;
242 else
243 error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
244 fdrop(fp, td);
245 return (error);
246 }
247
/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 * On success td->td_retval[0] holds the number of bytes read.
 */
static int
dofileread(td, fd, fp, auio, offset, flags)
	struct thread *td;
	int fd;
	struct file *fp;
	struct uio *auio;
	off_t offset;
	int flags;
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	/* Finish zero length reads right here */
	if (auio->uio_resid == 0) {
		td->td_retval[0] = 0;
		return(0);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_offset = offset;
	auio->uio_td = td;
#ifdef KTRACE
	/* Snapshot the uio before fo_read() consumes it, for ktrace logging. */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
		/*
		 * If the read was interrupted after some data was
		 * transferred, report the partial count, not the error.
		 */
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio->uio_resid;		/* bytes actually transferred */
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
295
296 /*
297 * Write system call
298 */
299 #ifndef _SYS_SYSPROTO_H_
300 struct write_args {
301 int fd;
302 const void *buf;
303 size_t nbyte;
304 };
305 #endif
306 /*
307 * MPSAFE
308 */
309 int
310 write(td, uap)
311 struct thread *td;
312 struct write_args *uap;
313 {
314 struct uio auio;
315 struct iovec aiov;
316 int error;
317
318 if (uap->nbyte > INT_MAX)
319 return (EINVAL);
320 aiov.iov_base = (void *)(uintptr_t)uap->buf;
321 aiov.iov_len = uap->nbyte;
322 auio.uio_iov = &aiov;
323 auio.uio_iovcnt = 1;
324 auio.uio_resid = uap->nbyte;
325 auio.uio_segflg = UIO_USERSPACE;
326 error = kern_writev(td, uap->fd, &auio);
327 return(error);
328 }
329
330 /*
331 * Positioned write system call
332 */
333 #ifndef _SYS_SYSPROTO_H_
334 struct pwrite_args {
335 int fd;
336 const void *buf;
337 size_t nbyte;
338 int pad;
339 off_t offset;
340 };
341 #endif
342 /*
343 * MPSAFE
344 */
345 int
346 pwrite(td, uap)
347 struct thread *td;
348 struct pwrite_args *uap;
349 {
350 struct uio auio;
351 struct iovec aiov;
352 int error;
353
354 if (uap->nbyte > INT_MAX)
355 return (EINVAL);
356 aiov.iov_base = (void *)(uintptr_t)uap->buf;
357 aiov.iov_len = uap->nbyte;
358 auio.uio_iov = &aiov;
359 auio.uio_iovcnt = 1;
360 auio.uio_resid = uap->nbyte;
361 auio.uio_segflg = UIO_USERSPACE;
362 error = kern_pwritev(td, uap->fd, &auio, uap->offset);
363 return(error);
364 }
365
366 /*
367 * Gather write system call
368 */
369 #ifndef _SYS_SYSPROTO_H_
370 struct writev_args {
371 int fd;
372 struct iovec *iovp;
373 u_int iovcnt;
374 };
375 #endif
376 /*
377 * MPSAFE
378 */
379 int
380 writev(struct thread *td, struct writev_args *uap)
381 {
382 struct uio *auio;
383 int error;
384
385 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
386 if (error)
387 return (error);
388 error = kern_writev(td, uap->fd, auio);
389 free(auio, M_IOV);
390 return (error);
391 }
392
393 int
394 kern_writev(struct thread *td, int fd, struct uio *auio)
395 {
396 struct file *fp;
397 int error;
398
399 error = fget_write(td, fd, &fp);
400 if (error)
401 return (EBADF); /* XXX this can't be right */
402 error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
403 fdrop(fp, td);
404 return (error);
405 }
406
407 /*
408 * Gather positioned write system call
409 */
410 #ifndef _SYS_SYSPROTO_H_
411 struct pwritev_args {
412 int fd;
413 struct iovec *iovp;
414 u_int iovcnt;
415 off_t offset;
416 };
417 #endif
418 /*
419 * MPSAFE
420 */
421 int
422 pwritev(struct thread *td, struct pwritev_args *uap)
423 {
424 struct uio *auio;
425 int error;
426
427 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
428 if (error)
429 return (error);
430 error = kern_pwritev(td, uap->fd, auio, uap->offset);
431 free(auio, M_IOV);
432 return (error);
433 }
434
435 int
436 kern_pwritev(td, fd, auio, offset)
437 struct thread *td;
438 struct uio *auio;
439 int fd;
440 off_t offset;
441 {
442 struct file *fp;
443 int error;
444
445 error = fget_write(td, fd, &fp);
446 if (error)
447 return (EBADF); /* XXX this can't be right */
448 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
449 error = ESPIPE;
450 else if (offset < 0 && fp->f_vnode->v_type != VCHR)
451 error = EINVAL;
452 else
453 error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
454 fdrop(fp, td);
455 return (error);
456 }
457
/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 * On success td->td_retval[0] holds the number of bytes written.
 */
static int
dofilewrite(td, fd, fp, auio, offset, flags)
	struct thread *td;
	int fd;
	struct file *fp;
	struct uio *auio;
	off_t offset;
	int flags;
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	auio->uio_rw = UIO_WRITE;
	auio->uio_td = td;
	auio->uio_offset = offset;
#ifdef KTRACE
	/* Snapshot the uio before fo_write() consumes it, for ktrace logging. */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	/* Throttle vnode writes when the buffer cache is under pressure. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
		/*
		 * If the write was interrupted after some data was
		 * transferred, report the partial count, not the error.
		 */
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio->uio_resid;		/* bytes actually transferred */
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
508
/*
 * Ioctl system call.  Decodes the command word, stages the argument
 * to/from user space as the IOC_IN/IOC_OUT/IOC_VOID encoding dictates,
 * and dispatches to the file's fo_ioctl method.
 */
#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/*
 * MPSAFE
 */
/* ARGSUSED */
int
ioctl(struct thread *td, struct ioctl_args *uap)
{
	struct file *fp;
	struct filedesc *fdp;
	u_long com;
	int arg, error;
	u_int size;
	caddr_t data, memp;
	int tmp;

	/*
	 * Detect 32-bit commands that were sign-extended to 64 bits by
	 * sloppy callers; warn and mask back down to 32 bits.
	 */
	if (uap->com > 0xffffffff) {
		printf(
		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
		    td->td_proc->p_pid, td->td_proc->p_comm, uap->com);
		uap->com &= 0xffffffff;
	}
	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	/* ioctl requires a descriptor open for reading or writing. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	/* Close-on-exec flag twiddling is handled here, not by fo_ioctl. */
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK_FAST(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK_FAST(fdp);
		fdrop(fp, td);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK_FAST(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK_FAST(fdp);
		fdrop(fp, td);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) ||
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
	    ((com & IOC_OUT) && size == 0) ||
#else
	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
#endif
	    ((com & IOC_VOID) && size > 0 && size != sizeof(int))) {
		fdrop(fp, td);
		return (ENOTTY);
	}

	/*
	 * Pick the staging area: a malloc'd buffer for real data,
	 * the local 'arg' for int-sized IOC_VOID commands, or the
	 * address of uap->data itself for size-zero commands.
	 */
	if (size > 0) {
		if (!(com & IOC_VOID)) {
			memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
			data = memp;
		} else {
			/* Integer argument. */
			memp = NULL;
			arg = (intptr_t)uap->data;
			data = (void *)&arg;
		}
	} else {
		memp = NULL;
		data = (void *)&uap->data;
	}
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error) {
			free(memp, M_IOCTLOPS);
			fdrop(fp, td);
			return (error);
		}
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	}

	/* FIONBIO/FIOASYNC update f_flag here and also reach fo_ioctl. */
	if (com == FIONBIO) {
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
	} else if (com == FIOASYNC) {
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
	}

	error = fo_ioctl(fp, com, data, td->td_ucred, td);

	/* Copy results back out for IOC_OUT commands that succeeded. */
	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

	if (memp != NULL)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
	return (error);
}
636
/*
 * sellock and selwait are initialized in selectinit() via SYSINIT.
 */
struct mtx sellock;	/* Serializes select/poll selinfo bookkeeping. */
struct cv selwait;	/* Selecting/polling threads sleep on this CV. */
u_int nselcoll;	/* Select collisions since boot */
SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
644
645 /*
646 * Select system call.
647 */
648 #ifndef _SYS_SYSPROTO_H_
649 struct select_args {
650 int nd;
651 fd_set *in, *ou, *ex;
652 struct timeval *tv;
653 };
654 #endif
655 /*
656 * MPSAFE
657 */
658 int
659 select(td, uap)
660 register struct thread *td;
661 register struct select_args *uap;
662 {
663 struct timeval tv, *tvp;
664 int error;
665
666 if (uap->tv != NULL) {
667 error = copyin(uap->tv, &tv, sizeof(tv));
668 if (error)
669 return (error);
670 tvp = &tv;
671 } else
672 tvp = NULL;
673
674 return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
675 }
676
/*
 * In-kernel select(): copy in up to three fd_sets, repeatedly scan the
 * descriptors until something is ready, the timeout expires, or a
 * signal arrives, then copy the result sets back out.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;

	FILEDESC_LOCK_FAST(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK_FAST(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.  The input halves live in the upper half of the
	 * buffer, the (zeroed) output halves in the lower half.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/* Convert the relative timeout into an absolute uptime deadline. */
	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision counter before dropping sellock to scan. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	/* Recompute the remaining time; give up if the deadline passed. */
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}
834
/*
 * Scan the descriptors named in the input bit vectors, polling each
 * for the event class of its set (read/write/except).  Sets the
 * corresponding bit in obits for every ready descriptor and leaves
 * the ready count in td->td_retval[0].
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				/* A named fd that is not open is an error. */
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
876
/*
 * Poll system call: copy in the pollfd array, repeatedly scan it until
 * an event fires, the timeout expires, or a signal arrives, then copy
 * the revents back out.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	struct pollfd *bits;
	struct pollfd smallbits[32];	/* avoids malloc for typical calls */
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	/* Convert the millisecond timeout to an absolute uptime deadline. */
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision counter before dropping sellock to scan. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	/* Recompute the remaining time; give up if the deadline passed. */
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	return (error);
}
1007
/*
 * Poll each pollfd entry once, filling in its revents field.  Leaves
 * the number of entries with nonzero revents in td->td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			/* fd beyond the open file table: invalid. */
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fds are ignored per poll(2) semantics. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
1047
1048 /*
1049 * OpenBSD poll system call.
1050 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
1051 */
1052 #ifndef _SYS_SYSPROTO_H_
1053 struct openbsd_poll_args {
1054 struct pollfd *fds;
1055 u_int nfds;
1056 int timeout;
1057 };
1058 #endif
1059 /*
1060 * MPSAFE
1061 */
1062 int
1063 openbsd_poll(td, uap)
1064 register struct thread *td;
1065 register struct openbsd_poll_args *uap;
1066 {
1067 return (poll(td, (struct poll_args *)uap));
1068 }
1069
1070 /*
1071 * Remove the references to the thread from all of the objects
1072 * we were polling.
1073 *
1074 * This code assumes that the underlying owner of the selinfo
1075 * structure will hold sellock before it changes it, and that
1076 * it will unlink itself from our list if it goes away.
1077 */
1078 void
1079 clear_selinfo_list(td)
1080 struct thread *td;
1081 {
1082 struct selinfo *si;
1083
1084 mtx_assert(&sellock, MA_OWNED);
1085 TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1086 si->si_thread = NULL;
1087 TAILQ_INIT(&td->td_selq);
1088 }
1089
1090 /*
1091 * Record a select request.
1092 */
1093 void
1094 selrecord(selector, sip)
1095 struct thread *selector;
1096 struct selinfo *sip;
1097 {
1098
1099 mtx_lock(&sellock);
1100 /*
1101 * If the selinfo's thread pointer is NULL then take ownership of it.
1102 *
1103 * If the thread pointer is not NULL and it points to another
1104 * thread, then we have a collision.
1105 *
1106 * If the thread pointer is not NULL and points back to us then leave
1107 * it alone as we've already added pointed it at us and added it to
1108 * our list.
1109 */
1110 if (sip->si_thread == NULL) {
1111 sip->si_thread = selector;
1112 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1113 } else if (sip->si_thread != selector) {
1114 sip->si_flags |= SI_COLL;
1115 }
1116
1117 mtx_unlock(&sellock);
1118 }
1119
/* Wake up a selecting thread (no priority adjustment). */
void
selwakeup(struct selinfo *sip)
{

	doselwakeup(sip, -1);
}
1127
/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(struct selinfo *sip, int pri)
{

	doselwakeup(sip, pri);
}
1136
/*
 * Do a wakeup when a selectable event occurs.  pri is the priority
 * passed through to the condition variable broadcast (-1 from
 * selwakeup()).
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		/* Collided selects: wake everyone and count the collision. */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		/* No owning thread; nothing more to wake. */
		mtx_unlock(&sellock);
		return;
	}
	/* Detach the selinfo from its owner and wake that thread. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	sleepq_remove(td, &selwait);
	mtx_unlock(&sellock);
}
1166
1167 static void selectinit(void *);
1168 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1169
1170 /* ARGSUSED*/
1171 static void
1172 selectinit(dummy)
1173 void *dummy;
1174 {
1175 cv_init(&selwait, "select");
1176 mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1177 }
Cache object: ba090c292afcd3a3c1b914108622267d
|