1 /*-
2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
35 */
36
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD: releng/5.4/sys/kern/sys_generic.c 144848 2005-04-10 00:53:36Z kensmith $");
39
40 #include "opt_ktrace.h"
41
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/sysproto.h>
45 #include <sys/filedesc.h>
46 #include <sys/filio.h>
47 #include <sys/fcntl.h>
48 #include <sys/file.h>
49 #include <sys/proc.h>
50 #include <sys/signalvar.h>
51 #include <sys/socketvar.h>
52 #include <sys/uio.h>
53 #include <sys/kernel.h>
54 #include <sys/limits.h>
55 #include <sys/malloc.h>
56 #include <sys/poll.h>
57 #include <sys/resourcevar.h>
58 #include <sys/selinfo.h>
59 #include <sys/sleepqueue.h>
60 #include <sys/syscallsubr.h>
61 #include <sys/sysctl.h>
62 #include <sys/sysent.h>
63 #include <sys/vnode.h>
64 #include <sys/bio.h>
65 #include <sys/buf.h>
66 #include <sys/condvar.h>
67 #ifdef KTRACE
68 #include <sys/ktrace.h>
69 #endif
70 #include <vm/vm.h>
71 #include <vm/vm_page.h>
72
73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
75 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
76
77 static int pollscan(struct thread *, struct pollfd *, u_int);
78 static int selscan(struct thread *, fd_mask **, fd_mask **, int);
79 static int dofileread(struct thread *, struct file *, int, void *,
80 size_t, off_t, int);
81 static int dofilewrite(struct thread *, struct file *, int,
82 const void *, size_t, off_t, int);
83 static void doselwakeup(struct selinfo *, int);
84
85 /*
86 * Read system call.
87 */
88 #ifndef _SYS_SYSPROTO_H_
89 struct read_args {
90 int fd;
91 void *buf;
92 size_t nbyte;
93 };
94 #endif
95 /*
96 * MPSAFE
97 */
98 int
99 read(td, uap)
100 struct thread *td;
101 struct read_args *uap;
102 {
103 struct file *fp;
104 int error;
105
106 if ((error = fget_read(td, uap->fd, &fp)) == 0) {
107 error = dofileread(td, fp, uap->fd, uap->buf,
108 uap->nbyte, (off_t)-1, 0);
109 fdrop(fp, td);
110 }
111 return(error);
112 }
113
114 /*
115 * Pread system call
116 */
117 #ifndef _SYS_SYSPROTO_H_
118 struct pread_args {
119 int fd;
120 void *buf;
121 size_t nbyte;
122 int pad;
123 off_t offset;
124 };
125 #endif
126 /*
127 * MPSAFE
128 */
129 int
130 pread(td, uap)
131 struct thread *td;
132 struct pread_args *uap;
133 {
134 struct file *fp;
135 int error;
136
137 if ((error = fget_read(td, uap->fd, &fp)) != 0)
138 return (error);
139 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
140 error = ESPIPE;
141 else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
142 error = EINVAL;
143 else {
144 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
145 uap->offset, FOF_OFFSET);
146 }
147 fdrop(fp, td);
148 return(error);
149 }
150
151 /*
152 * Code common for read and pread
153 */
/*
 * Common code for read() and pread(): perform a single-buffer read
 * via fo_read() and report the byte count through td_retval[0].
 */
static int
dofileread(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	/* Build a one-segment uio describing the user's buffer. */
	aiov.iov_base = buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* Per-call transfers are capped at INT_MAX bytes. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * Snapshot the uio before fo_read() consumes it so ktrace can
	 * log the transfer afterwards.
	 */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	cnt = nbyte;

	if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
		/*
		 * If some data moved before the interruption, report the
		 * partial transfer as success.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	/* cnt now holds the number of bytes actually read. */
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
202
203 /*
204 * Scatter read system call.
205 */
206 #ifndef _SYS_SYSPROTO_H_
207 struct readv_args {
208 int fd;
209 struct iovec *iovp;
210 u_int iovcnt;
211 };
212 #endif
213 /*
214 * MPSAFE
215 */
/*
 * readv(2): scatter read into multiple user buffers.  MPSAFE.
 */
int
readv(struct thread *td, struct readv_args *uap)
{
	struct file *fp;
	struct uio *auio = NULL;
	long cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	error = fget_read(td, uap->fd, &fp);
	if (error)
		return (error);
	/* copyinuio() allocates the uio; freed below with M_IOV. */
	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error) {
		fdrop(fp, td);
		return (error);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_td = td;
#ifdef KTRACE
	/* Clone the uio up front so ktrace can log the transfer later. */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if ((error = fo_read(fp, auio, td->td_ucred, 0, td))) {
		/* A partial transfer interrupted by a signal still succeeds. */
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	/* cnt now holds the number of bytes actually read. */
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(uap->fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	free(auio, M_IOV);
	fdrop(fp, td);
	return (error);
}
259
260 /*
261 * Write system call
262 */
263 #ifndef _SYS_SYSPROTO_H_
264 struct write_args {
265 int fd;
266 const void *buf;
267 size_t nbyte;
268 };
269 #endif
270 /*
271 * MPSAFE
272 */
273 int
274 write(td, uap)
275 struct thread *td;
276 struct write_args *uap;
277 {
278 struct file *fp;
279 int error;
280
281 if ((error = fget_write(td, uap->fd, &fp)) == 0) {
282 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
283 (off_t)-1, 0);
284 fdrop(fp, td);
285 } else {
286 error = EBADF; /* XXX this can't be right */
287 }
288 return(error);
289 }
290
291 /*
292 * Pwrite system call
293 */
294 #ifndef _SYS_SYSPROTO_H_
295 struct pwrite_args {
296 int fd;
297 const void *buf;
298 size_t nbyte;
299 int pad;
300 off_t offset;
301 };
302 #endif
303 /*
304 * MPSAFE
305 */
306 int
307 pwrite(td, uap)
308 struct thread *td;
309 struct pwrite_args *uap;
310 {
311 struct file *fp;
312 int error;
313
314 if ((error = fget_write(td, uap->fd, &fp)) == 0) {
315 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
316 error = ESPIPE;
317 else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
318 error = EINVAL;
319 else {
320 error = dofilewrite(td, fp, uap->fd, uap->buf,
321 uap->nbyte, uap->offset, FOF_OFFSET);
322 }
323 fdrop(fp, td);
324 } else {
325 error = EBADF; /* this can't be right */
326 }
327 return(error);
328 }
329
/*
 * Common code for write() and pwrite(): perform a single-buffer write
 * via fo_write(), handle partial transfers and SIGPIPE, and report the
 * byte count through td_retval[0].
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	/* Build a one-segment uio over the (const) user buffer. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* Per-call transfers are capped at INT_MAX bytes. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/* Snapshot the uio before fo_write() consumes it. */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	cnt = nbyte;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
		/* An interrupted partial transfer is reported as success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	/* cnt now holds the number of bytes actually written. */
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
385
386 /*
387 * Gather write system call
388 */
389 #ifndef _SYS_SYSPROTO_H_
390 struct writev_args {
391 int fd;
392 struct iovec *iovp;
393 u_int iovcnt;
394 };
395 #endif
396 /*
397 * MPSAFE
398 */
399 int
400 writev(struct thread *td, struct writev_args *uap)
401 {
402 struct file *fp;
403 struct uio *auio = NULL;
404 long cnt;
405 int error;
406 #ifdef KTRACE
407 struct uio *ktruio = NULL;
408 #endif
409
410 error = fget_write(td, uap->fd, &fp);
411 if (error)
412 return (EBADF);
413 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
414 if (error) {
415 fdrop(fp, td);
416 return (error);
417 }
418 auio->uio_rw = UIO_WRITE;
419 auio->uio_td = td;
420 #ifdef KTRACE
421 if (KTRPOINT(td, KTR_GENIO))
422 ktruio = cloneuio(auio);
423 #endif
424 cnt = auio->uio_resid;
425 if (fp->f_type == DTYPE_VNODE)
426 bwillwrite();
427 if ((error = fo_write(fp, auio, td->td_ucred, 0, td))) {
428 if (auio->uio_resid != cnt && (error == ERESTART ||
429 error == EINTR || error == EWOULDBLOCK))
430 error = 0;
431 if (error == EPIPE) {
432 PROC_LOCK(td->td_proc);
433 psignal(td->td_proc, SIGPIPE);
434 PROC_UNLOCK(td->td_proc);
435 }
436 }
437 cnt -= auio->uio_resid;
438 #ifdef KTRACE
439 if (ktruio != NULL) {
440 ktruio->uio_resid = cnt;
441 ktrgenio(uap->fd, UIO_WRITE, ktruio, error);
442 }
443 #endif
444 td->td_retval[0] = cnt;
445 fdrop(fp, td);
446 free(auio, M_IOV);
447 return (error);
448 }
449
450 /*
451 * Ioctl system call
452 */
453 #ifndef _SYS_SYSPROTO_H_
454 struct ioctl_args {
455 int fd;
456 u_long com;
457 caddr_t data;
458 };
459 #endif
460 /*
461 * MPSAFE
462 */
463 /* ARGSUSED */
/*
 * ioctl(2): dispatch a control request on a descriptor, bouncing the
 * argument data through a kernel buffer as encoded in the command word.
 */
int
ioctl(struct thread *td, struct ioctl_args *uap)
{
	struct file *fp;
	struct filedesc *fdp;
	u_long com;
	int error = 0;
	u_int size;
	caddr_t data, memp;
	int tmp;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	/* ioctl requires a descriptor open for reading or writing. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	/*
	 * FIONCLEX/FIOCLEX act on the descriptor-table entry, not the
	 * file itself, so handle them here and return early.
	 */
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK_FAST(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK_FAST(fdp);
		fdrop(fp, td);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK_FAST(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK_FAST(fdp);
		fdrop(fp, td);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0)) {
		fdrop(fp, td);
		return (ENOTTY);
	}

	if (size > 0) {
		/* Kernel bounce buffer for the in/out argument data. */
		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		memp = NULL;
		/* IOC_VOID: pass the argument word itself by address. */
		data = (void *)&uap->data;
	}
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error) {
			free(memp, M_IOCTLOPS);
			fdrop(fp, td);
			return (error);
		}
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	}

	/*
	 * FIONBIO/FIOASYNC update f_flag here, then pass the new value
	 * (via tmp) down to the backend so it can react as well.
	 */
	if (com == FIONBIO) {
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
	} else if (com == FIOASYNC) {
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
	}

	error = fo_ioctl(fp, com, data, td->td_ucred, td);

	/* Copy out the result for successful IOC_OUT requests. */
	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

	if (memp != NULL)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
	return (error);
}
558
559 /*
560 * sellock and selwait are initialized in selectinit() via SYSINIT.
561 */
562 struct mtx sellock;
563 struct cv selwait;
564 u_int nselcoll; /* Select collisions since boot */
565 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
566
567 /*
568 * Select system call.
569 */
570 #ifndef _SYS_SYSPROTO_H_
571 struct select_args {
572 int nd;
573 fd_set *in, *ou, *ex;
574 struct timeval *tv;
575 };
576 #endif
577 /*
578 * MPSAFE
579 */
580 int
581 select(td, uap)
582 register struct thread *td;
583 register struct select_args *uap;
584 {
585 struct timeval tv, *tvp;
586 int error;
587
588 if (uap->tv != NULL) {
589 error = copyin(uap->tv, &tv, sizeof(tv));
590 if (error)
591 return (error);
592 tvp = &tv;
593 } else
594 tvp = NULL;
595
596 return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
597 }
598
/*
 * Back end for select(2): build in/out bit vectors, scan the selected
 * descriptors, and sleep on selwait until an event, timeout, or signal.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;

	FILEDESC_LOCK_FAST(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK_FAST(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	/* Output halves (the first nbufbytes/2) start out all-clear. */
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/* Convert a relative timeout into an absolute uptime deadline. */
	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* atv == 0 means wait indefinitely. */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Remember the collision count; a change forces a rescan. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to one day's worth of ticks. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}
756
/*
 * Scan the select bit vectors: poll every selected descriptor and set
 * its bit in the output vector when it is ready.  Returns EBADF for an
 * invalid descriptor; the ready count goes in td_retval[0].
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;	/* number of ready descriptors found */
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	/* msk 0/1/2 correspond to the read/write/except sets. */
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
798
799 /*
800 * Poll system call.
801 */
802 #ifndef _SYS_SYSPROTO_H_
803 struct poll_args {
804 struct pollfd *fds;
805 u_int nfds;
806 int timeout;
807 };
808 #endif
809 /*
810 * MPSAFE
811 */
/*
 * poll(2) syscall: copy in the pollfd array, scan it, and sleep on
 * selwait until an event, timeout, or signal.  MPSAFE.
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	struct pollfd *bits;
	struct pollfd smallbits[32];	/* avoids malloc for small arrays */
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	/* Convert the millisecond timeout to an absolute uptime deadline. */
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* atv == 0 means wait indefinitely. */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Remember the collision count; a change forces a rescan. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to one day's worth of ticks. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* Copy the revents back out to the user's array. */
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	return (error);
}
929
/*
 * Scan a pollfd array: poll each descriptor, fill in revents, and
 * report the number of entries with events via td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;	/* number of entries with non-zero revents */

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			/* Descriptor beyond the table: flag it. */
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative descriptors are skipped. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
969
970 /*
971 * OpenBSD poll system call.
972 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
973 */
974 #ifndef _SYS_SYSPROTO_H_
975 struct openbsd_poll_args {
976 struct pollfd *fds;
977 u_int nfds;
978 int timeout;
979 };
980 #endif
981 /*
982 * MPSAFE
983 */
int
openbsd_poll(struct thread *td, struct openbsd_poll_args *uap)
{

	/* The argument layouts match, so simply forward to poll(). */
	return (poll(td, (struct poll_args *)uap));
}
991
992 /*
993 * Remove the references to the thread from all of the objects
994 * we were polling.
995 *
996 * This code assumes that the underlying owner of the selinfo
997 * structure will hold sellock before it changes it, and that
998 * it will unlink itself from our list if it goes away.
999 */
1000 void
1001 clear_selinfo_list(td)
1002 struct thread *td;
1003 {
1004 struct selinfo *si;
1005
1006 mtx_assert(&sellock, MA_OWNED);
1007 TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1008 si->si_thread = NULL;
1009 TAILQ_INIT(&td->td_selq);
1010 }
1011
1012 /*
1013 * Record a select request.
1014 */
1015 void
1016 selrecord(selector, sip)
1017 struct thread *selector;
1018 struct selinfo *sip;
1019 {
1020
1021 mtx_lock(&sellock);
1022 /*
1023 * If the selinfo's thread pointer is NULL then take ownership of it.
1024 *
1025 * If the thread pointer is not NULL and it points to another
1026 * thread, then we have a collision.
1027 *
1028 * If the thread pointer is not NULL and points back to us then leave
1029 * it alone as we've already added pointed it at us and added it to
1030 * our list.
1031 */
1032 if (sip->si_thread == NULL) {
1033 sip->si_thread = selector;
1034 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1035 } else if (sip->si_thread != selector) {
1036 sip->si_flags |= SI_COLL;
1037 }
1038
1039 mtx_unlock(&sellock);
1040 }
1041
1042 /* Wake up a selecting thread. */
1043 void
1044 selwakeup(sip)
1045 struct selinfo *sip;
1046 {
1047 doselwakeup(sip, -1);
1048 }
1049
1050 /* Wake up a selecting thread, and set its priority. */
1051 void
1052 selwakeuppri(sip, pri)
1053 struct selinfo *sip;
1054 int pri;
1055 {
1056 doselwakeup(sip, pri);
1057 }
1058
1059 /*
1060 * Do a wakeup when a selectable event occurs.
1061 */
/*
 * Do a wakeup when a selectable event occurs.  pri < 0 means leave the
 * woken thread's priority alone.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		/*
		 * Multiple threads were interested in this selinfo;
		 * count the collision and broadcast so they all rescan.
		 */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		/* Nobody is currently selecting on this selinfo. */
		mtx_unlock(&sellock);
		return;
	}
	/* Detach the selinfo from the owning thread and wake it. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	sleepq_remove(td, &selwait);
	mtx_unlock(&sellock);
}
1088
1089 static void selectinit(void *);
1090 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1091
1092 /* ARGSUSED*/
1093 static void
1094 selectinit(dummy)
1095 void *dummy;
1096 {
1097 cv_init(&selwait, "select");
1098 mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1099 }
Cache object: 26e5dd8747862191ba638fd9e6c25d79
|