1 /*
2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
39 * $FreeBSD: releng/5.1/sys/kern/sys_generic.c 114216 2003-04-29 13:36:06Z kan $
40 */
41
42 #include "opt_ktrace.h"
43
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/signalvar.h>
53 #include <sys/socketvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/limits.h>
57 #include <sys/malloc.h>
58 #include <sys/poll.h>
59 #include <sys/resourcevar.h>
60 #include <sys/selinfo.h>
61 #include <sys/syscallsubr.h>
62 #include <sys/sysctl.h>
63 #include <sys/sysent.h>
64 #include <sys/bio.h>
65 #include <sys/buf.h>
66 #include <sys/condvar.h>
67 #ifdef KTRACE
68 #include <sys/ktrace.h>
69 #endif
70 #include <vm/vm.h>
71 #include <vm/vm_page.h>
72
/* Heap type for ioctl(2) argument buffers larger than the stack union. */
static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
/* Heap type for select(2) bit vectors larger than the on-stack buffer. */
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
/* Heap type for readv/writev iovec arrays larger than UIO_SMALLIOV. */
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	dofileread(struct thread *, struct file *, int, void *,
		    size_t, off_t, int);
static int	dofilewrite(struct thread *, struct file *, int,
		    const void *, size_t, off_t, int);
83
84 /*
85 * Read system call.
86 */
87 #ifndef _SYS_SYSPROTO_H_
88 struct read_args {
89 int fd;
90 void *buf;
91 size_t nbyte;
92 };
93 #endif
94 /*
95 * MPSAFE
96 */
97 int
98 read(td, uap)
99 struct thread *td;
100 struct read_args *uap;
101 {
102 struct file *fp;
103 int error;
104
105 if ((error = fget_read(td, uap->fd, &fp)) == 0) {
106 error = dofileread(td, fp, uap->fd, uap->buf,
107 uap->nbyte, (off_t)-1, 0);
108 fdrop(fp, td);
109 }
110 return(error);
111 }
112
113 /*
114 * Pread system call
115 */
116 #ifndef _SYS_SYSPROTO_H_
117 struct pread_args {
118 int fd;
119 void *buf;
120 size_t nbyte;
121 int pad;
122 off_t offset;
123 };
124 #endif
125 /*
126 * MPSAFE
127 */
128 int
129 pread(td, uap)
130 struct thread *td;
131 struct pread_args *uap;
132 {
133 struct file *fp;
134 int error;
135
136 if ((error = fget_read(td, uap->fd, &fp)) != 0)
137 return (error);
138 if (fp->f_type != DTYPE_VNODE) {
139 error = ESPIPE;
140 } else {
141 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
142 uap->offset, FOF_OFFSET);
143 }
144 fdrop(fp, td);
145 return(error);
146 }
147
/*
 * Code common for read and pread.
 *
 * Build a single-element uio describing the user buffer and hand it to
 * the file's fo_read method.  "offset"/"flags" are (-1, 0) for plain
 * read(2) or (user offset, FOF_OFFSET) for pread(2).  The byte count
 * transferred is returned in td->td_retval[0].
 */
static int
dofileread(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	aiov.iov_base = buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* A transfer larger than INT_MAX cannot be represented in cnt. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec: fo_read() advances the
	 * iov/uio in place, so the copies must be taken beforehand.
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;

	if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
		/*
		 * If some data was transferred before the interruption,
		 * report the short read as success rather than the error.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	/* cnt becomes the number of bytes actually read. */
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_READ, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
208
/*
 * Scatter read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
};
#endif
/*
 * MPSAFE
 *
 * Copy the user's iovec array into the kernel (on the stack for up to
 * UIO_SMALLIOV entries, otherwise heap-allocated), validate the total
 * length, and perform one fo_read() over the whole scatter list.
 */
int
readv(td, uap)
	struct thread *td;
	struct readv_args *uap;
{
	struct file *fp;
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;		/* non-NULL iff iov was malloc'ed */
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt;
	int error;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	if ((error = fget_read(td, uap->fd, &fp)) != 0)
		return (error);
	needfree = NULL;
	/* note: can't use iovlen until iovcnt is validated */
	iovlen = uap->iovcnt * sizeof (struct iovec);
	if (uap->iovcnt > UIO_SMALLIOV) {
		if (uap->iovcnt > UIO_MAXIOV) {
			error = EINVAL;
			goto done;
		}
		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
		needfree = iov;
	} else
		iov = aiov;
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
	auio.uio_offset = -1;		/* read at the current file offset */
	if ((error = copyin(uap->iovp, iov, iovlen)))
		goto done;
	/*
	 * Sum the segment lengths, rejecting any total that would
	 * overflow the INT_MAX transfer limit.
	 */
	auio.uio_resid = 0;
	for (i = 0; i < uap->iovcnt; i++) {
		if (iov->iov_len > INT_MAX - auio.uio_resid) {
			error = EINVAL;
			goto done;
		}
		auio.uio_resid += iov->iov_len;
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec (fo_read consumes the
	 * original in place)
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy(auio.uio_iov, ktriov, iovlen);
		ktruio = auio;
	}
#endif
	cnt = auio.uio_resid;
	if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) {
		/* Partial transfer before interruption counts as success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = cnt;
			ktrgenio(uap->fd, UIO_READ, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	td->td_retval[0] = cnt;
done:
	fdrop(fp, td);
	if (needfree)
		FREE(needfree, M_IOV);
	return (error);
}
305
306 /*
307 * Write system call
308 */
309 #ifndef _SYS_SYSPROTO_H_
310 struct write_args {
311 int fd;
312 const void *buf;
313 size_t nbyte;
314 };
315 #endif
316 /*
317 * MPSAFE
318 */
319 int
320 write(td, uap)
321 struct thread *td;
322 struct write_args *uap;
323 {
324 struct file *fp;
325 int error;
326
327 if ((error = fget_write(td, uap->fd, &fp)) == 0) {
328 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
329 (off_t)-1, 0);
330 fdrop(fp, td);
331 } else {
332 error = EBADF; /* XXX this can't be right */
333 }
334 return(error);
335 }
336
337 /*
338 * Pwrite system call
339 */
340 #ifndef _SYS_SYSPROTO_H_
341 struct pwrite_args {
342 int fd;
343 const void *buf;
344 size_t nbyte;
345 int pad;
346 off_t offset;
347 };
348 #endif
349 /*
350 * MPSAFE
351 */
352 int
353 pwrite(td, uap)
354 struct thread *td;
355 struct pwrite_args *uap;
356 {
357 struct file *fp;
358 int error;
359
360 if ((error = fget_write(td, uap->fd, &fp)) == 0) {
361 if (fp->f_type == DTYPE_VNODE) {
362 error = dofilewrite(td, fp, uap->fd, uap->buf,
363 uap->nbyte, uap->offset, FOF_OFFSET);
364 } else {
365 error = ESPIPE;
366 }
367 fdrop(fp, td);
368 } else {
369 error = EBADF; /* this can't be right */
370 }
371 return(error);
372 }
373
/*
 * Code common for write and pwrite.
 *
 * Build a single-element uio over the user buffer and pass it to the
 * file's fo_write method.  "offset"/"flags" are (-1, 0) for write(2)
 * or (user offset, FOF_OFFSET) for pwrite(2).  Bytes written are
 * returned in td->td_retval[0].
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Cast away const: the uio layer never writes through iov_base here. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* A transfer larger than INT_MAX cannot be represented in cnt. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio (fo_write consumes
	 * them in place)
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	/* Throttle against buffer-cache exhaustion before vnode writes. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
		/* Partial transfer before interruption counts as success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
438
439 /*
440 * Gather write system call
441 */
442 #ifndef _SYS_SYSPROTO_H_
443 struct writev_args {
444 int fd;
445 struct iovec *iovp;
446 u_int iovcnt;
447 };
448 #endif
449 /*
450 * MPSAFE
451 */
452 int
453 writev(td, uap)
454 struct thread *td;
455 register struct writev_args *uap;
456 {
457 struct file *fp;
458 struct uio auio;
459 register struct iovec *iov;
460 struct iovec *needfree;
461 struct iovec aiov[UIO_SMALLIOV];
462 long i, cnt, error = 0;
463 u_int iovlen;
464 #ifdef KTRACE
465 struct iovec *ktriov = NULL;
466 struct uio ktruio;
467 #endif
468
469 mtx_lock(&Giant);
470 if ((error = fget_write(td, uap->fd, &fp)) != 0) {
471 error = EBADF;
472 goto done2;
473 }
474 /* note: can't use iovlen until iovcnt is validated */
475 iovlen = uap->iovcnt * sizeof (struct iovec);
476 if (uap->iovcnt > UIO_SMALLIOV) {
477 if (uap->iovcnt > UIO_MAXIOV) {
478 needfree = NULL;
479 error = EINVAL;
480 goto done;
481 }
482 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
483 needfree = iov;
484 } else {
485 iov = aiov;
486 needfree = NULL;
487 }
488 auio.uio_iov = iov;
489 auio.uio_iovcnt = uap->iovcnt;
490 auio.uio_rw = UIO_WRITE;
491 auio.uio_segflg = UIO_USERSPACE;
492 auio.uio_td = td;
493 auio.uio_offset = -1;
494 if ((error = copyin(uap->iovp, iov, iovlen)))
495 goto done;
496 auio.uio_resid = 0;
497 for (i = 0; i < uap->iovcnt; i++) {
498 if (iov->iov_len > INT_MAX - auio.uio_resid) {
499 error = EINVAL;
500 goto done;
501 }
502 auio.uio_resid += iov->iov_len;
503 iov++;
504 }
505 #ifdef KTRACE
506 /*
507 * if tracing, save a copy of iovec and uio
508 */
509 if (KTRPOINT(td, KTR_GENIO)) {
510 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
511 bcopy(auio.uio_iov, ktriov, iovlen);
512 ktruio = auio;
513 }
514 #endif
515 cnt = auio.uio_resid;
516 if (fp->f_type == DTYPE_VNODE)
517 bwillwrite();
518 if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) {
519 if (auio.uio_resid != cnt && (error == ERESTART ||
520 error == EINTR || error == EWOULDBLOCK))
521 error = 0;
522 if (error == EPIPE) {
523 PROC_LOCK(td->td_proc);
524 psignal(td->td_proc, SIGPIPE);
525 PROC_UNLOCK(td->td_proc);
526 }
527 }
528 cnt -= auio.uio_resid;
529 #ifdef KTRACE
530 if (ktriov != NULL) {
531 if (error == 0) {
532 ktruio.uio_iov = ktriov;
533 ktruio.uio_resid = cnt;
534 ktrgenio(uap->fd, UIO_WRITE, &ktruio, error);
535 }
536 FREE(ktriov, M_TEMP);
537 }
538 #endif
539 td->td_retval[0] = cnt;
540 done:
541 fdrop(fp, td);
542 if (needfree)
543 FREE(needfree, M_IOV);
544 done2:
545 mtx_unlock(&Giant);
546 return (error);
547 }
548
/*
 * Ioctl system call
 */
#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/*
 * MPSAFE
 *
 * Decode the ioctl command word, stage argument data between user and
 * kernel space as the IOC_IN/IOC_OUT/IOC_VOID bits direct, and dispatch
 * to the file's fo_ioctl method.  FIONCLEX/FIOCLEX are handled entirely
 * in the descriptor table, and FIONBIO/FIOASYNC additionally update the
 * struct file flags before being passed down.
 */
/* ARGSUSED */
int
ioctl(td, uap)
	struct thread *td;
	register struct ioctl_args *uap;
{
	struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	int error = 0;
	register u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	/* Small argument buffer on the stack, aligned via the union. */
	union {
	    char stkbuf[STK_PARAMS];
	    long align;
	} ubuf;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	mtx_lock(&Giant);
	/* The descriptor must be open for reading or writing. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	/*
	 * Fast paths: close-on-exec flag manipulation never reaches the
	 * file's ioctl method.
	 */
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (ENOTTY);
	}

	/* Use the stack buffer when the argument fits, else allocate. */
	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com&IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				goto done;
			}
		} else {
			/* Zero-size IOC_IN: pass the pointer itself. */
			*(caddr_t *)data = uap->data;
		}
	} else if ((com&IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com&IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		/* Mirror the request in f_flag, then inform the backend. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		break;

	case FIOASYNC:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td->td_ucred, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
done:
	mtx_unlock(&Giant);
	return (error);
}
686
/*
 * sellock and selwait are initialized in selectinit() via SYSINIT.
 *
 * sellock serializes updates to selinfo si_thread linkage and to each
 * thread's td_selq list; selwait is the condition variable that
 * select(2)/poll(2) sleep on.  nselcoll counts select "collisions"
 * (see selrecord()/selwakeup()), which force waiters to rescan.
 */
struct mtx	sellock;
struct cv	selwait;
u_int	nselcoll;	/* Select collisions since boot */
SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
694
695 /*
696 * Select system call.
697 */
698 #ifndef _SYS_SYSPROTO_H_
699 struct select_args {
700 int nd;
701 fd_set *in, *ou, *ex;
702 struct timeval *tv;
703 };
704 #endif
705 /*
706 * MPSAFE
707 */
708 int
709 select(td, uap)
710 register struct thread *td;
711 register struct select_args *uap;
712 {
713 struct timeval tv, *tvp;
714 int error;
715
716 if (uap->tv != NULL) {
717 error = copyin(uap->tv, &tv, sizeof(tv));
718 if (error)
719 return (error);
720 tvp = &tv;
721 } else
722 tvp = NULL;
723
724 return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
725 }
726
/*
 * Common backend for select(2).  Copies the caller's fd_sets into
 * kernel bit vectors, polls the descriptors via selscan(), and sleeps
 * on selwait until an event, timeout, or signal.  Output bits are
 * copied back to user space on success.  tvp == NULL blocks forever.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	/* Clamp nd to the size of the descriptor table. */
	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.  Each present set needs
	 * two copies: an input half and an output half.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.  Output halves occupy the first nbufbytes/2 bytes,
	 * input halves the second.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/* Convert the relative timeout to an absolute uptime deadline. */
	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count; TDF_SELECT marks us scanning. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Cap the sleep at 24 hours to keep tvtohz() in range. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	/* Detach from every selinfo we registered with during the scan. */
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}
885
/*
 * Scan the three input bit vectors (read/write/except) and poll each
 * marked descriptor via fo_poll(), setting the corresponding bit in
 * the output vectors for every ready descriptor.  Runs with the
 * filedesc table locked.  The ready count goes in td->td_retval[0].
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				/* A bit set on a closed descriptor is EBADF. */
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
927
/*
 * Poll system call.
 *
 * Copies the user's pollfd array in, scans it via pollscan(), and
 * sleeps on selwait until an event, timeout, or signal, using the same
 * TDF_SELECT/nselcoll retry protocol as select(2).  The revents fields
 * are copied back to user space on success.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
	    (nfds > FD_SETSIZE)) {
		error = EINVAL;
		goto done2;
	}
	/* Use the stack buffer when the array fits, else allocate. */
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	/* Convert the millisecond timeout to an absolute uptime deadline. */
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count; TDF_SELECT marks us scanning. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, (struct pollfd *)bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Cap the sleep at 24 hours to keep tvtohz() in range. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	/* Detach from every selinfo we registered with during the scan. */
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, uap->fds, ni);
		/* NB: the goto is redundant (falls through to "out" anyway). */
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}
1057
1058 static int
1059 pollscan(td, fds, nfd)
1060 struct thread *td;
1061 struct pollfd *fds;
1062 u_int nfd;
1063 {
1064 register struct filedesc *fdp = td->td_proc->p_fd;
1065 int i;
1066 struct file *fp;
1067 int n = 0;
1068
1069 FILEDESC_LOCK(fdp);
1070 for (i = 0; i < nfd; i++, fds++) {
1071 if (fds->fd >= fdp->fd_nfiles) {
1072 fds->revents = POLLNVAL;
1073 n++;
1074 } else if (fds->fd < 0) {
1075 fds->revents = 0;
1076 } else {
1077 fp = fdp->fd_ofiles[fds->fd];
1078 if (fp == NULL) {
1079 fds->revents = POLLNVAL;
1080 n++;
1081 } else {
1082 /*
1083 * Note: backend also returns POLLHUP and
1084 * POLLERR if appropriate.
1085 */
1086 fds->revents = fo_poll(fp, fds->events,
1087 td->td_ucred, td);
1088 if (fds->revents != 0)
1089 n++;
1090 }
1091 }
1092 }
1093 FILEDESC_UNLOCK(fdp);
1094 td->td_retval[0] = n;
1095 return (0);
1096 }
1097
1098 /*
1099 * OpenBSD poll system call.
1100 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
1101 */
1102 #ifndef _SYS_SYSPROTO_H_
1103 struct openbsd_poll_args {
1104 struct pollfd *fds;
1105 u_int nfds;
1106 int timeout;
1107 };
1108 #endif
1109 /*
1110 * MPSAFE
1111 */
1112 int
1113 openbsd_poll(td, uap)
1114 register struct thread *td;
1115 register struct openbsd_poll_args *uap;
1116 {
1117 return (poll(td, (struct poll_args *)uap));
1118 }
1119
1120 /*
1121 * Remove the references to the thread from all of the objects
1122 * we were polling.
1123 *
1124 * This code assumes that the underlying owner of the selinfo
1125 * structure will hold sellock before it changes it, and that
1126 * it will unlink itself from our list if it goes away.
1127 */
1128 void
1129 clear_selinfo_list(td)
1130 struct thread *td;
1131 {
1132 struct selinfo *si;
1133
1134 mtx_assert(&sellock, MA_OWNED);
1135 TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1136 si->si_thread = NULL;
1137 TAILQ_INIT(&td->td_selq);
1138 }
1139
1140 /*ARGSUSED*/
1141 int
1142 seltrue(dev, events, td)
1143 dev_t dev;
1144 int events;
1145 struct thread *td;
1146 {
1147
1148 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1149 }
1150
/*
 * Record a select request.
 *
 * Called by fo_poll backends to register "selector" as waiting on the
 * object described by sip, so a later selwakeup() can find it.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{

	mtx_lock(&sellock);
	/*
	 * If the selinfo's thread pointer is NULL then take ownership of it.
	 *
	 * If the thread pointer is not NULL and it points to another
	 * thread, then we have a collision.
	 *
	 * If the thread pointer is not NULL and points back to us then leave
	 * it alone as we've already added pointed it at us and added it to
	 * our list.
	 */
	if (sip->si_thread == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (sip->si_thread != selector) {
		/* Collision: selwakeup() will broadcast and bump nselcoll. */
		sip->si_flags |= SI_COLL;
	}

	mtx_unlock(&sellock);
}
1180
/*
 * Do a wakeup when a selectable event occurs.
 *
 * If several threads collided on this selinfo, broadcast on selwait and
 * bump the collision counter so they all rescan.  Otherwise wake the
 * single owning thread (or just clear its TDF_SELECT flag if it has not
 * gone to sleep yet) after unlinking the selinfo from its queue.
 */
void
selwakeup(sip)
	struct selinfo *sip;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcast(&selwait);
	}
	if (td == NULL) {
		/* Nobody owns this selinfo; nothing more to do. */
		mtx_unlock(&sellock);
		return;
	}
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	if (td->td_wchan == &selwait) {
		/* The owner is asleep on selwait: pull it off and run it. */
		cv_waitq_remove(td);
		TD_CLR_SLEEPING(td);
		setrunnable(td);
	} else
		/* Not asleep yet: clearing TDF_SELECT forces a rescan. */
		td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);
}
1213
static void selectinit(void *);
/* Initialize the select machinery early, before any lock users. */
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	/* Set up the condition variable and mutex used by select/poll. */
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}