1 /*
2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
39 */
40
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD: releng/5.2/sys/kern/sys_generic.c 122352 2003-11-09 09:17:26Z tanimura $");
43
44 #include "opt_ktrace.h"
45
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/sysproto.h>
49 #include <sys/filedesc.h>
50 #include <sys/filio.h>
51 #include <sys/fcntl.h>
52 #include <sys/file.h>
53 #include <sys/proc.h>
54 #include <sys/signalvar.h>
55 #include <sys/socketvar.h>
56 #include <sys/uio.h>
57 #include <sys/kernel.h>
58 #include <sys/limits.h>
59 #include <sys/malloc.h>
60 #include <sys/poll.h>
61 #include <sys/resourcevar.h>
62 #include <sys/selinfo.h>
63 #include <sys/syscallsubr.h>
64 #include <sys/sysctl.h>
65 #include <sys/sysent.h>
66 #include <sys/bio.h>
67 #include <sys/buf.h>
68 #include <sys/condvar.h>
69 #ifdef KTRACE
70 #include <sys/ktrace.h>
71 #endif
72 #include <vm/vm.h>
73 #include <vm/vm_page.h>
74
75 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
76 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
77 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
78
79 static int pollscan(struct thread *, struct pollfd *, u_int);
80 static int selscan(struct thread *, fd_mask **, fd_mask **, int);
81 static int dofileread(struct thread *, struct file *, int, void *,
82 size_t, off_t, int);
83 static int dofilewrite(struct thread *, struct file *, int,
84 const void *, size_t, off_t, int);
85 static void doselwakeup(struct selinfo *, int);
86
87 /*
88 * Read system call.
89 */
90 #ifndef _SYS_SYSPROTO_H_
91 struct read_args {
92 int fd;
93 void *buf;
94 size_t nbyte;
95 };
96 #endif
97 /*
98 * MPSAFE
99 */
100 int
101 read(td, uap)
102 struct thread *td;
103 struct read_args *uap;
104 {
105 struct file *fp;
106 int error;
107
108 if ((error = fget_read(td, uap->fd, &fp)) == 0) {
109 error = dofileread(td, fp, uap->fd, uap->buf,
110 uap->nbyte, (off_t)-1, 0);
111 fdrop(fp, td);
112 }
113 return(error);
114 }
115
116 /*
117 * Pread system call
118 */
119 #ifndef _SYS_SYSPROTO_H_
120 struct pread_args {
121 int fd;
122 void *buf;
123 size_t nbyte;
124 int pad;
125 off_t offset;
126 };
127 #endif
128 /*
129 * MPSAFE
130 */
131 int
132 pread(td, uap)
133 struct thread *td;
134 struct pread_args *uap;
135 {
136 struct file *fp;
137 int error;
138
139 if ((error = fget_read(td, uap->fd, &fp)) != 0)
140 return (error);
141 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) {
142 error = ESPIPE;
143 } else {
144 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
145 uap->offset, FOF_OFFSET);
146 }
147 fdrop(fp, td);
148 return(error);
149 }
150
151 /*
152 * Code common for read and pread
153 */
154 static int
155 dofileread(td, fp, fd, buf, nbyte, offset, flags)
156 struct thread *td;
157 struct file *fp;
158 int fd, flags;
159 void *buf;
160 size_t nbyte;
161 off_t offset;
162 {
163 struct uio auio;
164 struct iovec aiov;
165 long cnt, error = 0;
166 #ifdef KTRACE
167 struct iovec ktriov;
168 struct uio ktruio;
169 int didktr = 0;
170 #endif
171
172 aiov.iov_base = buf;
173 aiov.iov_len = nbyte;
174 auio.uio_iov = &aiov;
175 auio.uio_iovcnt = 1;
176 auio.uio_offset = offset;
177 if (nbyte > INT_MAX)
178 return (EINVAL);
179 auio.uio_resid = nbyte;
180 auio.uio_rw = UIO_READ;
181 auio.uio_segflg = UIO_USERSPACE;
182 auio.uio_td = td;
183 #ifdef KTRACE
184 /*
185 * if tracing, save a copy of iovec
186 */
187 if (KTRPOINT(td, KTR_GENIO)) {
188 ktriov = aiov;
189 ktruio = auio;
190 didktr = 1;
191 }
192 #endif
193 cnt = nbyte;
194
195 if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
196 if (auio.uio_resid != cnt && (error == ERESTART ||
197 error == EINTR || error == EWOULDBLOCK))
198 error = 0;
199 }
200 cnt -= auio.uio_resid;
201 #ifdef KTRACE
202 if (didktr && error == 0) {
203 ktruio.uio_iov = &ktriov;
204 ktruio.uio_resid = cnt;
205 ktrgenio(fd, UIO_READ, &ktruio, error);
206 }
207 #endif
208 td->td_retval[0] = cnt;
209 return (error);
210 }
211
212 /*
213 * Scatter read system call.
214 */
215 #ifndef _SYS_SYSPROTO_H_
216 struct readv_args {
217 int fd;
218 struct iovec *iovp;
219 u_int iovcnt;
220 };
221 #endif
222 /*
223 * MPSAFE
224 */
225 int
226 readv(td, uap)
227 struct thread *td;
228 struct readv_args *uap;
229 {
230 struct file *fp;
231 struct uio auio;
232 struct iovec *iov;
233 struct iovec *needfree;
234 struct iovec aiov[UIO_SMALLIOV];
235 long i, cnt;
236 int error;
237 u_int iovlen;
238 #ifdef KTRACE
239 struct iovec *ktriov = NULL;
240 struct uio ktruio;
241 #endif
242
243 if ((error = fget_read(td, uap->fd, &fp)) != 0)
244 return (error);
245 needfree = NULL;
246 /* note: can't use iovlen until iovcnt is validated */
247 iovlen = uap->iovcnt * sizeof (struct iovec);
248 if (uap->iovcnt > UIO_SMALLIOV) {
249 if (uap->iovcnt > UIO_MAXIOV) {
250 error = EINVAL;
251 goto done;
252 }
253 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
254 needfree = iov;
255 } else
256 iov = aiov;
257 auio.uio_iov = iov;
258 auio.uio_iovcnt = uap->iovcnt;
259 auio.uio_rw = UIO_READ;
260 auio.uio_segflg = UIO_USERSPACE;
261 auio.uio_td = td;
262 auio.uio_offset = -1;
263 if ((error = copyin(uap->iovp, iov, iovlen)))
264 goto done;
265 auio.uio_resid = 0;
266 for (i = 0; i < uap->iovcnt; i++) {
267 if (iov->iov_len > INT_MAX - auio.uio_resid) {
268 error = EINVAL;
269 goto done;
270 }
271 auio.uio_resid += iov->iov_len;
272 iov++;
273 }
274 #ifdef KTRACE
275 /*
276 * if tracing, save a copy of iovec
277 */
278 if (KTRPOINT(td, KTR_GENIO)) {
279 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
280 bcopy(auio.uio_iov, ktriov, iovlen);
281 ktruio = auio;
282 }
283 #endif
284 cnt = auio.uio_resid;
285 if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) {
286 if (auio.uio_resid != cnt && (error == ERESTART ||
287 error == EINTR || error == EWOULDBLOCK))
288 error = 0;
289 }
290 cnt -= auio.uio_resid;
291 #ifdef KTRACE
292 if (ktriov != NULL) {
293 if (error == 0) {
294 ktruio.uio_iov = ktriov;
295 ktruio.uio_resid = cnt;
296 ktrgenio(uap->fd, UIO_READ, &ktruio, error);
297 }
298 FREE(ktriov, M_TEMP);
299 }
300 #endif
301 td->td_retval[0] = cnt;
302 done:
303 fdrop(fp, td);
304 if (needfree)
305 FREE(needfree, M_IOV);
306 return (error);
307 }
308
309 /*
310 * Write system call
311 */
312 #ifndef _SYS_SYSPROTO_H_
313 struct write_args {
314 int fd;
315 const void *buf;
316 size_t nbyte;
317 };
318 #endif
319 /*
320 * MPSAFE
321 */
322 int
323 write(td, uap)
324 struct thread *td;
325 struct write_args *uap;
326 {
327 struct file *fp;
328 int error;
329
330 if ((error = fget_write(td, uap->fd, &fp)) == 0) {
331 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
332 (off_t)-1, 0);
333 fdrop(fp, td);
334 } else {
335 error = EBADF; /* XXX this can't be right */
336 }
337 return(error);
338 }
339
340 /*
341 * Pwrite system call
342 */
343 #ifndef _SYS_SYSPROTO_H_
344 struct pwrite_args {
345 int fd;
346 const void *buf;
347 size_t nbyte;
348 int pad;
349 off_t offset;
350 };
351 #endif
352 /*
353 * MPSAFE
354 */
355 int
356 pwrite(td, uap)
357 struct thread *td;
358 struct pwrite_args *uap;
359 {
360 struct file *fp;
361 int error;
362
363 if ((error = fget_write(td, uap->fd, &fp)) == 0) {
364 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) {
365 error = ESPIPE;
366 } else {
367 error = dofilewrite(td, fp, uap->fd, uap->buf,
368 uap->nbyte, uap->offset, FOF_OFFSET);
369 }
370 fdrop(fp, td);
371 } else {
372 error = EBADF; /* this can't be right */
373 }
374 return(error);
375 }
376
/*
 * Code common for write and pwrite.
 *
 * Builds a single-segment userspace uio around "buf"/"nbyte" and hands it
 * to the descriptor's fo_write op.  write() passes offset -1 and flags 0;
 * pwrite() passes an explicit offset with FOF_OFFSET set.  On success
 * td_retval[0] carries the byte count moved.
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Cast away const; the uio machinery takes a non-const pointer. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* The transfer size must fit in the int return value. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio; fo_write() advances
	 * the live copies as it transfers data.
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
		/*
		 * If interrupted after some data moved, report the short
		 * transfer as success instead of the error.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
441
442 /*
443 * Gather write system call
444 */
445 #ifndef _SYS_SYSPROTO_H_
446 struct writev_args {
447 int fd;
448 struct iovec *iovp;
449 u_int iovcnt;
450 };
451 #endif
452 /*
453 * MPSAFE
454 */
455 int
456 writev(td, uap)
457 struct thread *td;
458 register struct writev_args *uap;
459 {
460 struct file *fp;
461 struct uio auio;
462 register struct iovec *iov;
463 struct iovec *needfree;
464 struct iovec aiov[UIO_SMALLIOV];
465 long i, cnt, error = 0;
466 u_int iovlen;
467 #ifdef KTRACE
468 struct iovec *ktriov = NULL;
469 struct uio ktruio;
470 #endif
471
472 if ((error = fget_write(td, uap->fd, &fp)) != 0)
473 return (EBADF);
474 needfree = NULL;
475 /* note: can't use iovlen until iovcnt is validated */
476 iovlen = uap->iovcnt * sizeof (struct iovec);
477 if (uap->iovcnt > UIO_SMALLIOV) {
478 if (uap->iovcnt > UIO_MAXIOV) {
479 error = EINVAL;
480 goto done;
481 }
482 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
483 needfree = iov;
484 } else
485 iov = aiov;
486 auio.uio_iov = iov;
487 auio.uio_iovcnt = uap->iovcnt;
488 auio.uio_rw = UIO_WRITE;
489 auio.uio_segflg = UIO_USERSPACE;
490 auio.uio_td = td;
491 auio.uio_offset = -1;
492 if ((error = copyin(uap->iovp, iov, iovlen)))
493 goto done;
494 auio.uio_resid = 0;
495 for (i = 0; i < uap->iovcnt; i++) {
496 if (iov->iov_len > INT_MAX - auio.uio_resid) {
497 error = EINVAL;
498 goto done;
499 }
500 auio.uio_resid += iov->iov_len;
501 iov++;
502 }
503 #ifdef KTRACE
504 /*
505 * if tracing, save a copy of iovec and uio
506 */
507 if (KTRPOINT(td, KTR_GENIO)) {
508 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
509 bcopy(auio.uio_iov, ktriov, iovlen);
510 ktruio = auio;
511 }
512 #endif
513 cnt = auio.uio_resid;
514 if (fp->f_type == DTYPE_VNODE)
515 bwillwrite();
516 if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) {
517 if (auio.uio_resid != cnt && (error == ERESTART ||
518 error == EINTR || error == EWOULDBLOCK))
519 error = 0;
520 if (error == EPIPE) {
521 PROC_LOCK(td->td_proc);
522 psignal(td->td_proc, SIGPIPE);
523 PROC_UNLOCK(td->td_proc);
524 }
525 }
526 cnt -= auio.uio_resid;
527 #ifdef KTRACE
528 if (ktriov != NULL) {
529 if (error == 0) {
530 ktruio.uio_iov = ktriov;
531 ktruio.uio_resid = cnt;
532 ktrgenio(uap->fd, UIO_WRITE, &ktruio, error);
533 }
534 FREE(ktriov, M_TEMP);
535 }
536 #endif
537 td->td_retval[0] = cnt;
538 done:
539 fdrop(fp, td);
540 if (needfree)
541 FREE(needfree, M_IOV);
542 return (error);
543 }
544
545 /*
546 * Ioctl system call
547 */
548 #ifndef _SYS_SYSPROTO_H_
549 struct ioctl_args {
550 int fd;
551 u_long com;
552 caddr_t data;
553 };
554 #endif
555 /*
556 * MPSAFE
557 */
558 /* ARGSUSED */
559 int
560 ioctl(td, uap)
561 struct thread *td;
562 register struct ioctl_args *uap;
563 {
564 struct file *fp;
565 register struct filedesc *fdp;
566 register u_long com;
567 int error = 0;
568 register u_int size;
569 caddr_t data, memp;
570 int tmp;
571 #define STK_PARAMS 128
572 union {
573 char stkbuf[STK_PARAMS];
574 long align;
575 } ubuf;
576
577 if ((error = fget(td, uap->fd, &fp)) != 0)
578 return (error);
579 mtx_lock(&Giant);
580 if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
581 fdrop(fp, td);
582 mtx_unlock(&Giant);
583 return (EBADF);
584 }
585 fdp = td->td_proc->p_fd;
586 switch (com = uap->com) {
587 case FIONCLEX:
588 FILEDESC_LOCK(fdp);
589 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
590 FILEDESC_UNLOCK(fdp);
591 fdrop(fp, td);
592 mtx_unlock(&Giant);
593 return (0);
594 case FIOCLEX:
595 FILEDESC_LOCK(fdp);
596 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
597 FILEDESC_UNLOCK(fdp);
598 fdrop(fp, td);
599 mtx_unlock(&Giant);
600 return (0);
601 }
602
603 /*
604 * Interpret high order word to find amount of data to be
605 * copied to/from the user's address space.
606 */
607 size = IOCPARM_LEN(com);
608 if (size > IOCPARM_MAX) {
609 fdrop(fp, td);
610 mtx_unlock(&Giant);
611 return (ENOTTY);
612 }
613
614 memp = NULL;
615 if (size > sizeof (ubuf.stkbuf)) {
616 memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
617 data = memp;
618 } else {
619 data = ubuf.stkbuf;
620 }
621 if (com&IOC_IN) {
622 if (size) {
623 error = copyin(uap->data, data, (u_int)size);
624 if (error) {
625 if (memp)
626 free(memp, M_IOCTLOPS);
627 fdrop(fp, td);
628 goto done;
629 }
630 } else {
631 *(caddr_t *)data = uap->data;
632 }
633 } else if ((com&IOC_OUT) && size) {
634 /*
635 * Zero the buffer so the user always
636 * gets back something deterministic.
637 */
638 bzero(data, size);
639 } else if (com&IOC_VOID) {
640 *(caddr_t *)data = uap->data;
641 }
642
643 switch (com) {
644
645 case FIONBIO:
646 FILE_LOCK(fp);
647 if ((tmp = *(int *)data))
648 fp->f_flag |= FNONBLOCK;
649 else
650 fp->f_flag &= ~FNONBLOCK;
651 FILE_UNLOCK(fp);
652 error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
653 break;
654
655 case FIOASYNC:
656 FILE_LOCK(fp);
657 if ((tmp = *(int *)data))
658 fp->f_flag |= FASYNC;
659 else
660 fp->f_flag &= ~FASYNC;
661 FILE_UNLOCK(fp);
662 error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
663 break;
664
665 default:
666 error = fo_ioctl(fp, com, data, td->td_ucred, td);
667 /*
668 * Copy any data to user, size was
669 * already set and checked above.
670 */
671 if (error == 0 && (com&IOC_OUT) && size)
672 error = copyout(data, uap->data, (u_int)size);
673 break;
674 }
675 if (memp)
676 free(memp, M_IOCTLOPS);
677 fdrop(fp, td);
678 done:
679 mtx_unlock(&Giant);
680 return (error);
681 }
682
683 /*
684 * sellock and selwait are initialized in selectinit() via SYSINIT.
685 */
686 struct mtx sellock;
687 struct cv selwait;
688 u_int nselcoll; /* Select collisions since boot */
689 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
690
691 /*
692 * Select system call.
693 */
694 #ifndef _SYS_SYSPROTO_H_
695 struct select_args {
696 int nd;
697 fd_set *in, *ou, *ex;
698 struct timeval *tv;
699 };
700 #endif
701 /*
702 * MPSAFE
703 */
704 int
705 select(td, uap)
706 register struct thread *td;
707 register struct select_args *uap;
708 {
709 struct timeval tv, *tvp;
710 int error;
711
712 if (uap->tv != NULL) {
713 error = copyin(uap->tv, &tv, sizeof(tv));
714 if (error)
715 return (error);
716 tvp = &tv;
717 } else
718 tvp = NULL;
719
720 return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
721 }
722
/*
 * Guts of select(): copy in the requested fd_sets, repeatedly scan the
 * descriptors with selscan(), and sleep on selwait until an event, the
 * timeout, or a signal.  Any non-NULL fd_set is copied back out with the
 * ready bits on success.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	/* Each requested set needs space for an input and an output copy. */
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);	/* clear the output halves */

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		/* Convert the relative timeout into an absolute deadline. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;	/* zero deadline == wait forever */
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count before dropping sellock to scan. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;	/* deadline already passed */
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Cap the sleep at 24 hours to keep tvtohz() in range. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;	/* woken: rescan for the triggering event */

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;	/* timeout expired: report zero ready fds */
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}
881
/*
 * One polling pass for kern_select(): for each bit set in the input
 * fd_mask arrays, poll the descriptor for the corresponding condition
 * and set the matching bit in the output arrays.  The count of ready
 * descriptors is returned via td_retval[0]; EBADF if a set bit names a
 * closed descriptor.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;		/* ready-descriptor count */
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	/* The descriptor table is locked across the whole scan. */
	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;	/* caller didn't ask about this set */
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
923
924 /*
925 * Poll system call.
926 */
927 #ifndef _SYS_SYSPROTO_H_
928 struct poll_args {
929 struct pollfd *fds;
930 u_int nfds;
931 int timeout;
932 };
933 #endif
934 /*
935 * MPSAFE
936 */
937 int
938 poll(td, uap)
939 struct thread *td;
940 struct poll_args *uap;
941 {
942 caddr_t bits;
943 char smallbits[32 * sizeof(struct pollfd)];
944 struct timeval atv, rtv, ttv;
945 int error = 0, timo;
946 u_int ncoll, nfds;
947 size_t ni;
948
949 nfds = uap->nfds;
950
951 mtx_lock(&Giant);
952 /*
953 * This is kinda bogus. We have fd limits, but that is not
954 * really related to the size of the pollfd array. Make sure
955 * we let the process use at least FD_SETSIZE entries and at
956 * least enough for the current limits. We want to be reasonably
957 * safe, but not overly restrictive.
958 */
959 if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
960 (nfds > FD_SETSIZE)) {
961 error = EINVAL;
962 goto done2;
963 }
964 ni = nfds * sizeof(struct pollfd);
965 if (ni > sizeof(smallbits))
966 bits = malloc(ni, M_TEMP, M_WAITOK);
967 else
968 bits = smallbits;
969 error = copyin(uap->fds, bits, ni);
970 if (error)
971 goto done_nosellock;
972 if (uap->timeout != INFTIM) {
973 atv.tv_sec = uap->timeout / 1000;
974 atv.tv_usec = (uap->timeout % 1000) * 1000;
975 if (itimerfix(&atv)) {
976 error = EINVAL;
977 goto done_nosellock;
978 }
979 getmicrouptime(&rtv);
980 timevaladd(&atv, &rtv);
981 } else {
982 atv.tv_sec = 0;
983 atv.tv_usec = 0;
984 }
985 timo = 0;
986 TAILQ_INIT(&td->td_selq);
987 mtx_lock(&sellock);
988 retry:
989 ncoll = nselcoll;
990 mtx_lock_spin(&sched_lock);
991 td->td_flags |= TDF_SELECT;
992 mtx_unlock_spin(&sched_lock);
993 mtx_unlock(&sellock);
994
995 error = pollscan(td, (struct pollfd *)bits, nfds);
996 mtx_lock(&sellock);
997 if (error || td->td_retval[0])
998 goto done;
999 if (atv.tv_sec || atv.tv_usec) {
1000 getmicrouptime(&rtv);
1001 if (timevalcmp(&rtv, &atv, >=))
1002 goto done;
1003 ttv = atv;
1004 timevalsub(&ttv, &rtv);
1005 timo = ttv.tv_sec > 24 * 60 * 60 ?
1006 24 * 60 * 60 * hz : tvtohz(&ttv);
1007 }
1008 /*
1009 * An event of interest may occur while we do not hold
1010 * sellock, so check TDF_SELECT and the number of collisions
1011 * and rescan the file descriptors if necessary.
1012 */
1013 mtx_lock_spin(&sched_lock);
1014 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
1015 mtx_unlock_spin(&sched_lock);
1016 goto retry;
1017 }
1018 mtx_unlock_spin(&sched_lock);
1019
1020 if (timo > 0)
1021 error = cv_timedwait_sig(&selwait, &sellock, timo);
1022 else
1023 error = cv_wait_sig(&selwait, &sellock);
1024
1025 if (error == 0)
1026 goto retry;
1027
1028 done:
1029 clear_selinfo_list(td);
1030 mtx_lock_spin(&sched_lock);
1031 td->td_flags &= ~TDF_SELECT;
1032 mtx_unlock_spin(&sched_lock);
1033 mtx_unlock(&sellock);
1034
1035 done_nosellock:
1036 /* poll is not restarted after signals... */
1037 if (error == ERESTART)
1038 error = EINTR;
1039 if (error == EWOULDBLOCK)
1040 error = 0;
1041 if (error == 0) {
1042 error = copyout(bits, uap->fds, ni);
1043 if (error)
1044 goto out;
1045 }
1046 out:
1047 if (ni > sizeof(smallbits))
1048 free(bits, M_TEMP);
1049 done2:
1050 mtx_unlock(&Giant);
1051 return (error);
1052 }
1053
/*
 * One polling pass for poll(): fill in revents for every entry of the
 * kernel copy of the pollfd array.  Out-of-range or closed descriptors
 * get POLLNVAL; negative descriptors are ignored per POSIX.  The count
 * of entries with non-zero revents is returned via td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;		/* count of entries with events */

	/* The descriptor table is locked across the whole scan. */
	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fd: skip this entry, report nothing. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
1093
1094 /*
1095 * OpenBSD poll system call.
1096 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
1097 */
1098 #ifndef _SYS_SYSPROTO_H_
1099 struct openbsd_poll_args {
1100 struct pollfd *fds;
1101 u_int nfds;
1102 int timeout;
1103 };
1104 #endif
1105 /*
1106 * MPSAFE
1107 */
1108 int
1109 openbsd_poll(td, uap)
1110 register struct thread *td;
1111 register struct openbsd_poll_args *uap;
1112 {
1113 return (poll(td, (struct poll_args *)uap));
1114 }
1115
1116 /*
1117 * Remove the references to the thread from all of the objects
1118 * we were polling.
1119 *
1120 * This code assumes that the underlying owner of the selinfo
1121 * structure will hold sellock before it changes it, and that
1122 * it will unlink itself from our list if it goes away.
1123 */
1124 void
1125 clear_selinfo_list(td)
1126 struct thread *td;
1127 {
1128 struct selinfo *si;
1129
1130 mtx_assert(&sellock, MA_OWNED);
1131 TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1132 si->si_thread = NULL;
1133 TAILQ_INIT(&td->td_selq);
1134 }
1135
1136 /*
1137 * Record a select request.
1138 */
1139 void
1140 selrecord(selector, sip)
1141 struct thread *selector;
1142 struct selinfo *sip;
1143 {
1144
1145 mtx_lock(&sellock);
1146 /*
1147 * If the selinfo's thread pointer is NULL then take ownership of it.
1148 *
1149 * If the thread pointer is not NULL and it points to another
1150 * thread, then we have a collision.
1151 *
1152 * If the thread pointer is not NULL and points back to us then leave
1153 * it alone as we've already added pointed it at us and added it to
1154 * our list.
1155 */
1156 if (sip->si_thread == NULL) {
1157 sip->si_thread = selector;
1158 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1159 } else if (sip->si_thread != selector) {
1160 sip->si_flags |= SI_COLL;
1161 }
1162
1163 mtx_unlock(&sellock);
1164 }
1165
/*
 * Wake up a selecting thread.  The -1 priority falls outside the
 * PRI_MIN..PRI_MAX range checked by doselwakeup(), so the woken
 * thread's priority is left unchanged.
 */
void
selwakeup(sip)
	struct selinfo *sip;
{
	doselwakeup(sip, -1);
}
1173
/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(sip, pri)
	struct selinfo *sip;
	int pri;
{
	doselwakeup(sip, pri);
}
1182
1183 /*
1184 * Do a wakeup when a selectable event occurs.
1185 */
1186 static void
1187 doselwakeup(sip, pri)
1188 struct selinfo *sip;
1189 int pri;
1190 {
1191 struct thread *td;
1192
1193 mtx_lock(&sellock);
1194 td = sip->si_thread;
1195 if ((sip->si_flags & SI_COLL) != 0) {
1196 nselcoll++;
1197 sip->si_flags &= ~SI_COLL;
1198 cv_broadcastpri(&selwait, pri);
1199 }
1200 if (td == NULL) {
1201 mtx_unlock(&sellock);
1202 return;
1203 }
1204 TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
1205 sip->si_thread = NULL;
1206 mtx_lock_spin(&sched_lock);
1207 if (td->td_wchan == &selwait) {
1208 cv_waitq_remove(td);
1209 TD_CLR_SLEEPING(td);
1210 if (pri >= PRI_MIN && pri <= PRI_MAX && td->td_priority > pri)
1211 td->td_priority = pri;
1212 setrunnable(td);
1213 } else
1214 td->td_flags &= ~TDF_SELECT;
1215 mtx_unlock_spin(&sched_lock);
1216 mtx_unlock(&sellock);
1217 }
1218
static void selectinit(void *);
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/*
 * Boot-time initialization of the select/poll synchronization
 * primitives; run via SYSINIT during lock setup.
 */
/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}