1 /*
2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
39 * $FreeBSD: releng/5.0/sys/kern/sys_generic.c 108086 2002-12-19 09:40:13Z alfred $
40 */
41
42 #include "opt_ktrace.h"
43
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/signalvar.h>
53 #include <sys/socketvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/resourcevar.h>
59 #include <sys/selinfo.h>
60 #include <sys/syscallsubr.h>
61 #include <sys/sysctl.h>
62 #include <sys/sysent.h>
63 #include <sys/bio.h>
64 #include <sys/buf.h>
65 #include <sys/condvar.h>
66 #ifdef KTRACE
67 #include <sys/ktrace.h>
68 #endif
69 #include <vm/vm.h>
70 #include <vm/vm_page.h>
71
72 #include <machine/limits.h>
73
/* Malloc types for the transient buffers this file allocates. */
static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");	/* shared with other I/O code */

/* Forward declarations for the local helpers defined below. */
static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	dofileread(struct thread *, struct file *, int, void *,
		    size_t, off_t, int);
static int	dofilewrite(struct thread *, struct file *, int,
		    const void *, size_t, off_t, int);
84
85 /*
86 * Read system call.
87 */
88 #ifndef _SYS_SYSPROTO_H_
89 struct read_args {
90 int fd;
91 void *buf;
92 size_t nbyte;
93 };
94 #endif
95 /*
96 * MPSAFE
97 */
98 int
99 read(td, uap)
100 struct thread *td;
101 struct read_args *uap;
102 {
103 struct file *fp;
104 int error;
105
106 if ((error = fget_read(td, uap->fd, &fp)) == 0) {
107 error = dofileread(td, fp, uap->fd, uap->buf,
108 uap->nbyte, (off_t)-1, 0);
109 fdrop(fp, td);
110 }
111 return(error);
112 }
113
114 /*
115 * Pread system call
116 */
117 #ifndef _SYS_SYSPROTO_H_
118 struct pread_args {
119 int fd;
120 void *buf;
121 size_t nbyte;
122 int pad;
123 off_t offset;
124 };
125 #endif
126 /*
127 * MPSAFE
128 */
129 int
130 pread(td, uap)
131 struct thread *td;
132 struct pread_args *uap;
133 {
134 struct file *fp;
135 int error;
136
137 if ((error = fget_read(td, uap->fd, &fp)) != 0)
138 return (error);
139 if (fp->f_type != DTYPE_VNODE) {
140 error = ESPIPE;
141 } else {
142 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
143 uap->offset, FOF_OFFSET);
144 }
145 fdrop(fp, td);
146 return(error);
147 }
148
149 /*
150 * Code common for read and pread
151 */
/*
 * Code common for read and pread: wrap the caller's single buffer in a
 * uio and dispatch it to the file's fo_read method.
 *
 *	td	calling thread
 *	fp	referenced file (caller holds the reference and drops it)
 *	fd	descriptor number, used only for the ktrace record
 *	buf	userspace destination buffer
 *	nbyte	requested byte count; must fit in an int
 *	offset	explicit offset for positioned I/O, or -1
 *	flags	FOF_OFFSET for positioned I/O, else 0
 *
 * Returns an errno; the count of bytes actually read is stored in
 * td->td_retval[0].
 */
static int
dofileread(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	aiov.iov_base = buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* uio_resid is int-sized; reject requests that would overflow it. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec (fo_read consumes auio/aiov,
	 * so the trace record is built from these saved copies)
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;

	if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
		/*
		 * If the transfer was interrupted after moving some data,
		 * report the partial count rather than the error.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	/* cnt becomes the number of bytes actually transferred. */
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_READ, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
209
210 /*
211 * Scatter read system call.
212 */
213 #ifndef _SYS_SYSPROTO_H_
214 struct readv_args {
215 int fd;
216 struct iovec *iovp;
217 u_int iovcnt;
218 };
219 #endif
220 /*
221 * MPSAFE
222 */
/*
 * Scatter read system call: read from uap->fd into the uap->iovcnt
 * buffers described by the userspace iovec array uap->iovp.  The total
 * byte count read is returned in td->td_retval[0].
 */
int
readv(td, uap)
	struct thread *td;
	struct readv_args *uap;
{
	struct file *fp;
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;		/* non-NULL if iov was malloc'ed */
	struct iovec aiov[UIO_SMALLIOV];	/* stack copy for small requests */
	long i, cnt;
	int error;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	if ((error = fget_read(td, uap->fd, &fp)) != 0)
		return (error);
	needfree = NULL;
	/* note: can't use iovlen until iovcnt is validated */
	iovlen = uap->iovcnt * sizeof (struct iovec);
	if (uap->iovcnt > UIO_SMALLIOV) {
		if (uap->iovcnt > UIO_MAXIOV) {
			error = EINVAL;
			goto done;
		}
		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
		needfree = iov;
	} else
		iov = aiov;
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
	auio.uio_offset = -1;		/* use the file's current offset */
	if ((error = copyin(uap->iovp, iov, iovlen)))
		goto done;
	auio.uio_resid = 0;
	/* Total the request, rejecting sums that would overflow an int. */
	for (i = 0; i < uap->iovcnt; i++) {
		if (iov->iov_len > INT_MAX - auio.uio_resid) {
			error = EINVAL;
			goto done;
		}
		auio.uio_resid += iov->iov_len;
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy(auio.uio_iov, ktriov, iovlen);
		ktruio = auio;
	}
#endif
	cnt = auio.uio_resid;
	if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) {
		/* Partial transfers mask interruption-style errors. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = cnt;
			ktrgenio(uap->fd, UIO_READ, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	td->td_retval[0] = cnt;
done:
	fdrop(fp, td);
	if (needfree)
		FREE(needfree, M_IOV);
	return (error);
}
306
307 /*
308 * Write system call
309 */
310 #ifndef _SYS_SYSPROTO_H_
311 struct write_args {
312 int fd;
313 const void *buf;
314 size_t nbyte;
315 };
316 #endif
317 /*
318 * MPSAFE
319 */
320 int
321 write(td, uap)
322 struct thread *td;
323 struct write_args *uap;
324 {
325 struct file *fp;
326 int error;
327
328 if ((error = fget_write(td, uap->fd, &fp)) == 0) {
329 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
330 (off_t)-1, 0);
331 fdrop(fp, td);
332 } else {
333 error = EBADF; /* XXX this can't be right */
334 }
335 return(error);
336 }
337
338 /*
339 * Pwrite system call
340 */
341 #ifndef _SYS_SYSPROTO_H_
342 struct pwrite_args {
343 int fd;
344 const void *buf;
345 size_t nbyte;
346 int pad;
347 off_t offset;
348 };
349 #endif
350 /*
351 * MPSAFE
352 */
353 int
354 pwrite(td, uap)
355 struct thread *td;
356 struct pwrite_args *uap;
357 {
358 struct file *fp;
359 int error;
360
361 if ((error = fget_write(td, uap->fd, &fp)) == 0) {
362 if (fp->f_type == DTYPE_VNODE) {
363 error = dofilewrite(td, fp, uap->fd, uap->buf,
364 uap->nbyte, uap->offset, FOF_OFFSET);
365 } else {
366 error = ESPIPE;
367 }
368 fdrop(fp, td);
369 } else {
370 error = EBADF; /* this can't be right */
371 }
372 return(error);
373 }
374
/*
 * Code common for write and pwrite: wrap the caller's buffer in a uio
 * and dispatch it to the file's fo_write method.  Arguments mirror
 * dofileread(); the count of bytes written is stored in
 * td->td_retval[0].
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Cast away const: the uio machinery takes a non-const base. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* uio_resid is int-sized; reject requests that would overflow it. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	/* Throttle writers when the buffer cache is under pressure. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
		/* Partial transfers mask interruption-style errors. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
439
440 /*
441 * Gather write system call
442 */
443 #ifndef _SYS_SYSPROTO_H_
444 struct writev_args {
445 int fd;
446 struct iovec *iovp;
447 u_int iovcnt;
448 };
449 #endif
450 /*
451 * MPSAFE
452 */
453 int
454 writev(td, uap)
455 struct thread *td;
456 register struct writev_args *uap;
457 {
458 struct file *fp;
459 struct uio auio;
460 register struct iovec *iov;
461 struct iovec *needfree;
462 struct iovec aiov[UIO_SMALLIOV];
463 long i, cnt, error = 0;
464 u_int iovlen;
465 #ifdef KTRACE
466 struct iovec *ktriov = NULL;
467 struct uio ktruio;
468 #endif
469
470 mtx_lock(&Giant);
471 if ((error = fget_write(td, uap->fd, &fp)) != 0) {
472 error = EBADF;
473 goto done2;
474 }
475 /* note: can't use iovlen until iovcnt is validated */
476 iovlen = uap->iovcnt * sizeof (struct iovec);
477 if (uap->iovcnt > UIO_SMALLIOV) {
478 if (uap->iovcnt > UIO_MAXIOV) {
479 needfree = NULL;
480 error = EINVAL;
481 goto done;
482 }
483 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
484 needfree = iov;
485 } else {
486 iov = aiov;
487 needfree = NULL;
488 }
489 auio.uio_iov = iov;
490 auio.uio_iovcnt = uap->iovcnt;
491 auio.uio_rw = UIO_WRITE;
492 auio.uio_segflg = UIO_USERSPACE;
493 auio.uio_td = td;
494 auio.uio_offset = -1;
495 if ((error = copyin(uap->iovp, iov, iovlen)))
496 goto done;
497 auio.uio_resid = 0;
498 for (i = 0; i < uap->iovcnt; i++) {
499 if (iov->iov_len > INT_MAX - auio.uio_resid) {
500 error = EINVAL;
501 goto done;
502 }
503 auio.uio_resid += iov->iov_len;
504 iov++;
505 }
506 #ifdef KTRACE
507 /*
508 * if tracing, save a copy of iovec and uio
509 */
510 if (KTRPOINT(td, KTR_GENIO)) {
511 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
512 bcopy(auio.uio_iov, ktriov, iovlen);
513 ktruio = auio;
514 }
515 #endif
516 cnt = auio.uio_resid;
517 if (fp->f_type == DTYPE_VNODE)
518 bwillwrite();
519 if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) {
520 if (auio.uio_resid != cnt && (error == ERESTART ||
521 error == EINTR || error == EWOULDBLOCK))
522 error = 0;
523 if (error == EPIPE) {
524 PROC_LOCK(td->td_proc);
525 psignal(td->td_proc, SIGPIPE);
526 PROC_UNLOCK(td->td_proc);
527 }
528 }
529 cnt -= auio.uio_resid;
530 #ifdef KTRACE
531 if (ktriov != NULL) {
532 if (error == 0) {
533 ktruio.uio_iov = ktriov;
534 ktruio.uio_resid = cnt;
535 ktrgenio(uap->fd, UIO_WRITE, &ktruio, error);
536 }
537 FREE(ktriov, M_TEMP);
538 }
539 #endif
540 td->td_retval[0] = cnt;
541 done:
542 fdrop(fp, td);
543 if (needfree)
544 FREE(needfree, M_IOV);
545 done2:
546 mtx_unlock(&Giant);
547 return (error);
548 }
549
550 /*
551 * Ioctl system call
552 */
553 #ifndef _SYS_SYSPROTO_H_
554 struct ioctl_args {
555 int fd;
556 u_long com;
557 caddr_t data;
558 };
559 #endif
560 /*
561 * MPSAFE
562 */
563 /* ARGSUSED */
/*
 * Ioctl system call: dispatch device/file control requests.  The
 * FIONCLEX/FIOCLEX, FIONBIO and FIOASYNC requests are handled (or
 * pre-processed) here; everything else goes straight to fo_ioctl().
 * Argument data up to IOCPARM_MAX bytes is shuttled between userspace
 * and a kernel buffer according to the IOC_IN/IOC_OUT/IOC_VOID bits
 * encoded in the command word.
 */
/* ARGSUSED */
int
ioctl(td, uap)
	struct thread *td;
	register struct ioctl_args *uap;
{
	struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	int error = 0;
	register u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	/* Small-argument buffer on the stack; union forces alignment. */
	union {
	    char stkbuf[STK_PARAMS];
	    long align;
	} ubuf;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	mtx_lock(&Giant);
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	switch (com = uap->com) {
	case FIONCLEX:
		/* Clear the close-on-exec flag on the descriptor. */
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	case FIOCLEX:
		/* Set the close-on-exec flag on the descriptor. */
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (ENOTTY);
	}

	/* Use the stack buffer when the argument fits, else malloc. */
	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com&IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				goto done;
			}
		} else {
			/* Zero-size IOC_IN: pass the pointer itself. */
			*(caddr_t *)data = uap->data;
		}
	} else if ((com&IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com&IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		/* Keep f_flag's FNONBLOCK in sync, then notify the file. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		break;

	case FIOASYNC:
		/* Keep f_flag's FASYNC in sync, then notify the file. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td->td_ucred, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
done:
	mtx_unlock(&Giant);
	return (error);
}
687
688 /*
689 * sellock and selwait are initialized in selectinit() via SYSINIT.
690 */
691 struct mtx sellock;
692 struct cv selwait;
693 u_int nselcoll; /* Select collisions since boot */
694 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
695
696 /*
697 * Select system call.
698 */
699 #ifndef _SYS_SYSPROTO_H_
700 struct select_args {
701 int nd;
702 fd_set *in, *ou, *ex;
703 struct timeval *tv;
704 };
705 #endif
706 /*
707 * MPSAFE
708 */
709 int
710 select(td, uap)
711 register struct thread *td;
712 register struct select_args *uap;
713 {
714 struct timeval tv, *tvp;
715 int error;
716
717 if (uap->tv != NULL) {
718 error = copyin(uap->tv, &tv, sizeof(tv));
719 if (error)
720 return (error);
721 tvp = &tv;
722 } else
723 tvp = NULL;
724
725 return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
726 }
727
/*
 * Common select() backend.  Copies in the non-NULL descriptor sets,
 * repeatedly scans them with selscan() and sleeps on the selwait
 * condition variable until an event, a timeout, or a signal, then
 * copies the result bits back out.  td->td_retval[0] holds the number
 * of ready descriptors.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;	/* input copy + output bits */
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);	/* clear the output halves */

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		/* Convert the relative timeout to an absolute deadline. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count before dropping sellock. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;	/* deadline already passed */
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to 24 hours to avoid tvtohz() overflow. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;	/* woken by selwakeup(); rescan */

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;	/* timeout is a normal return for select */
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}
886
/*
 * Scan the three input bit vectors (read/write/except), polling each
 * set descriptor via fo_poll().  Ready descriptors are marked in the
 * corresponding output vector; the ready count is returned in
 * td->td_retval[0].  Returns EBADF if a set bit names a closed
 * descriptor, else 0.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
928
929 /*
930 * Poll system call.
931 */
932 #ifndef _SYS_SYSPROTO_H_
933 struct poll_args {
934 struct pollfd *fds;
935 u_int nfds;
936 int timeout;
937 };
938 #endif
939 /*
940 * MPSAFE
941 */
/*
 * Poll system call: copy in the pollfd array, repeatedly scan it with
 * pollscan() and sleep on selwait until an event, timeout, or signal,
 * then copy the revents back out.  Uses the same sellock/TDF_SELECT
 * retry protocol as kern_select().
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];	/* avoids malloc for small nfds */
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
	    (nfds > FD_SETSIZE)) {
		error = EINVAL;
		goto done2;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	if (uap->timeout != INFTIM) {
		/* Convert ms timeout to an absolute uptime deadline. */
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count before dropping sellock. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, (struct pollfd *)bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;	/* deadline already passed */
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to 24 hours to avoid tvtohz() overflow. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;	/* woken by selwakeup(); rescan */

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;	/* timeout is a normal return for poll */
	if (error == 0) {
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}
1058
/*
 * Scan an array of pollfd entries, filling in revents for each via
 * fo_poll().  Entries naming invalid descriptors get POLLNVAL (and
 * count as "ready", per poll() semantics); negative fds are skipped.
 * The count of entries with non-zero revents is returned in
 * td->td_retval[0]; the function itself always returns 0.
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fds are ignored by convention. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
1098
1099 /*
1100 * OpenBSD poll system call.
1101 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
1102 */
1103 #ifndef _SYS_SYSPROTO_H_
1104 struct openbsd_poll_args {
1105 struct pollfd *fds;
1106 u_int nfds;
1107 int timeout;
1108 };
1109 #endif
1110 /*
1111 * MPSAFE
1112 */
/*
 * OpenBSD-compatible entry point: the argument structures are
 * layout-compatible, so simply forward to the native poll().
 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	return (poll(td, (struct poll_args *)uap));
}
1120
1121 /*
1122 * Remove the references to the thread from all of the objects
1123 * we were polling.
1124 *
1125 * This code assumes that the underlying owner of the selinfo
1126 * structure will hold sellock before it changes it, and that
1127 * it will unlink itself from our list if it goes away.
1128 */
/*
 * Detach the thread from every selinfo it registered with during the
 * last scan (via selrecord()), then reset its list to empty.  Caller
 * must hold sellock, as asserted below.
 */
void
clear_selinfo_list(td)
	struct thread *td;
{
	struct selinfo *si;

	mtx_assert(&sellock, MA_OWNED);
	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
		si->si_thread = NULL;
	TAILQ_INIT(&td->td_selq);
}
1140
1141 /*ARGSUSED*/
1142 int
1143 seltrue(dev, events, td)
1144 dev_t dev;
1145 int events;
1146 struct thread *td;
1147 {
1148
1149 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1150 }
1151
1152 /*
1153 * Record a select request.
1154 */
/*
 * Record a select request: called by fo_poll backends to register the
 * polling thread with the object's selinfo so a later selwakeup() can
 * find it.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{

	mtx_lock(&sellock);
	/*
	 * If the selinfo's thread pointer is NULL then take ownership of it.
	 *
	 * If the thread pointer is not NULL and it points to another
	 * thread, then we have a collision.
	 *
	 * If the thread pointer is not NULL and points back to us then leave
	 * it alone as we've already added pointed it at us and added it to
	 * our list.
	 */
	if (sip->si_thread == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (sip->si_thread != selector) {
		/* selwakeup() broadcasts when SI_COLL is set. */
		sip->si_flags |= SI_COLL;
	}

	mtx_unlock(&sellock);
}
1181
1182 /*
1183 * Do a wakeup when a selectable event occurs.
1184 */
/*
 * Do a wakeup when a selectable event occurs: wake the thread recorded
 * in the selinfo (if any) and broadcast to all waiters when a
 * collision was recorded, bumping the collision counter so in-flight
 * scans know to retry.
 */
void
selwakeup(sip)
	struct selinfo *sip;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		/* Multiple threads were selecting; wake them all. */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcast(&selwait);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	/* Unlink this selinfo from the owning thread's list. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	if (td->td_wchan == &selwait) {
		/* Thread is asleep on selwait: pull it off and run it. */
		cv_waitq_remove(td);
		TD_CLR_SLEEPING(td);
		setrunnable(td);
	} else
		/* Thread is mid-scan: clearing TDF_SELECT forces a retry. */
		td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);
}
1214
static void selectinit(void *);
/* Run early (SI_SUB_LOCK) so sellock/selwait exist before first use. */
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/*
 * One-time initialization of the global select condition variable and
 * its associated mutex.
 */
/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}
Cache object: 8b00ecfd95f004ec0c535763e49abc47
|