1 /*
2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
35 */
36
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD: releng/5.3/sys/kern/sys_generic.c 136588 2004-10-16 08:43:07Z cvs2svn $");
39
40 #include "opt_ktrace.h"
41
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/sysproto.h>
45 #include <sys/filedesc.h>
46 #include <sys/filio.h>
47 #include <sys/fcntl.h>
48 #include <sys/file.h>
49 #include <sys/proc.h>
50 #include <sys/signalvar.h>
51 #include <sys/socketvar.h>
52 #include <sys/uio.h>
53 #include <sys/kernel.h>
54 #include <sys/limits.h>
55 #include <sys/malloc.h>
56 #include <sys/poll.h>
57 #include <sys/resourcevar.h>
58 #include <sys/selinfo.h>
59 #include <sys/sleepqueue.h>
60 #include <sys/syscallsubr.h>
61 #include <sys/sysctl.h>
62 #include <sys/sysent.h>
63 #include <sys/vnode.h>
64 #include <sys/bio.h>
65 #include <sys/buf.h>
66 #include <sys/condvar.h>
67 #ifdef KTRACE
68 #include <sys/ktrace.h>
69 #endif
70 #include <vm/vm.h>
71 #include <vm/vm_page.h>
72
/* Malloc types for ioctl argument buffers, select bitmaps and large iovecs. */
static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

/* Forward declarations for file-local helpers defined below. */
static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	dofileread(struct thread *, struct file *, int, void *,
		    size_t, off_t, int);
static int	dofilewrite(struct thread *, struct file *, int,
		    const void *, size_t, off_t, int);
static void	doselwakeup(struct selinfo *, int);
84
85 /*
86 * Read system call.
87 */
88 #ifndef _SYS_SYSPROTO_H_
89 struct read_args {
90 int fd;
91 void *buf;
92 size_t nbyte;
93 };
94 #endif
95 /*
96 * MPSAFE
97 */
98 int
99 read(td, uap)
100 struct thread *td;
101 struct read_args *uap;
102 {
103 struct file *fp;
104 int error;
105
106 if ((error = fget_read(td, uap->fd, &fp)) == 0) {
107 error = dofileread(td, fp, uap->fd, uap->buf,
108 uap->nbyte, (off_t)-1, 0);
109 fdrop(fp, td);
110 }
111 return(error);
112 }
113
114 /*
115 * Pread system call
116 */
117 #ifndef _SYS_SYSPROTO_H_
118 struct pread_args {
119 int fd;
120 void *buf;
121 size_t nbyte;
122 int pad;
123 off_t offset;
124 };
125 #endif
126 /*
127 * MPSAFE
128 */
129 int
130 pread(td, uap)
131 struct thread *td;
132 struct pread_args *uap;
133 {
134 struct file *fp;
135 int error;
136
137 if ((error = fget_read(td, uap->fd, &fp)) != 0)
138 return (error);
139 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
140 error = ESPIPE;
141 else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
142 error = EINVAL;
143 else {
144 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
145 uap->offset, FOF_OFFSET);
146 }
147 fdrop(fp, td);
148 return(error);
149 }
150
151 /*
152 * Code common for read and pread
153 */
154 static int
155 dofileread(td, fp, fd, buf, nbyte, offset, flags)
156 struct thread *td;
157 struct file *fp;
158 int fd, flags;
159 void *buf;
160 size_t nbyte;
161 off_t offset;
162 {
163 struct uio auio;
164 struct iovec aiov;
165 long cnt, error = 0;
166 #ifdef KTRACE
167 struct uio *ktruio = NULL;
168 #endif
169
170 aiov.iov_base = buf;
171 aiov.iov_len = nbyte;
172 auio.uio_iov = &aiov;
173 auio.uio_iovcnt = 1;
174 auio.uio_offset = offset;
175 if (nbyte > INT_MAX)
176 return (EINVAL);
177 auio.uio_resid = nbyte;
178 auio.uio_rw = UIO_READ;
179 auio.uio_segflg = UIO_USERSPACE;
180 auio.uio_td = td;
181 #ifdef KTRACE
182 if (KTRPOINT(td, KTR_GENIO))
183 ktruio = cloneuio(&auio);
184 #endif
185 cnt = nbyte;
186
187 if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
188 if (auio.uio_resid != cnt && (error == ERESTART ||
189 error == EINTR || error == EWOULDBLOCK))
190 error = 0;
191 }
192 cnt -= auio.uio_resid;
193 #ifdef KTRACE
194 if (ktruio != NULL) {
195 ktruio->uio_resid = cnt;
196 ktrgenio(fd, UIO_READ, ktruio, error);
197 }
198 #endif
199 td->td_retval[0] = cnt;
200 return (error);
201 }
202
203 /*
204 * Scatter read system call.
205 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
};
#endif
/*
 * MPSAFE
 */
/*
 * readv() system call: scatter read.  Copies the iovec array in from
 * userspace, builds a uio and passes it to the file's fo_read method.
 * On success td->td_retval[0] holds the byte count transferred.
 */
int
readv(struct thread *td, struct readv_args *uap)
{
	struct file *fp;
	struct uio *auio = NULL;	/* allocated by copyinuio() */
	long cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	error = fget_read(td, uap->fd, &fp);
	if (error)
		return (error);
	/* copyinuio() validates iovcnt and allocates auio from M_IOV. */
	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error) {
		fdrop(fp, td);
		return (error);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_td = td;
#ifdef KTRACE
	/* Snapshot the uio now; fo_read() advances it as it transfers. */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if ((error = fo_read(fp, auio, td->td_ucred, 0, td))) {
		/* A partial transfer masks a restartable error. */
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(uap->fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	free(auio, M_IOV);
	fdrop(fp, td);
	return (error);
}
259
260 /*
261 * Write system call
262 */
263 #ifndef _SYS_SYSPROTO_H_
264 struct write_args {
265 int fd;
266 const void *buf;
267 size_t nbyte;
268 };
269 #endif
270 /*
271 * MPSAFE
272 */
273 int
274 write(td, uap)
275 struct thread *td;
276 struct write_args *uap;
277 {
278 struct file *fp;
279 int error;
280
281 if ((error = fget_write(td, uap->fd, &fp)) == 0) {
282 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
283 (off_t)-1, 0);
284 fdrop(fp, td);
285 } else {
286 error = EBADF; /* XXX this can't be right */
287 }
288 return(error);
289 }
290
291 /*
292 * Pwrite system call
293 */
294 #ifndef _SYS_SYSPROTO_H_
295 struct pwrite_args {
296 int fd;
297 const void *buf;
298 size_t nbyte;
299 int pad;
300 off_t offset;
301 };
302 #endif
303 /*
304 * MPSAFE
305 */
306 int
307 pwrite(td, uap)
308 struct thread *td;
309 struct pwrite_args *uap;
310 {
311 struct file *fp;
312 int error;
313
314 if ((error = fget_write(td, uap->fd, &fp)) == 0) {
315 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
316 error = ESPIPE;
317 else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
318 error = EINVAL;
319 else {
320 error = dofilewrite(td, fp, uap->fd, uap->buf,
321 uap->nbyte, uap->offset, FOF_OFFSET);
322 }
323 fdrop(fp, td);
324 } else {
325 error = EBADF; /* this can't be right */
326 }
327 return(error);
328 }
329
330 static int
331 dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
332 struct thread *td;
333 struct file *fp;
334 int fd, flags;
335 const void *buf;
336 size_t nbyte;
337 off_t offset;
338 {
339 struct uio auio;
340 struct iovec aiov;
341 long cnt, error = 0;
342 #ifdef KTRACE
343 struct uio *ktruio = NULL;
344 #endif
345
346 aiov.iov_base = (void *)(uintptr_t)buf;
347 aiov.iov_len = nbyte;
348 auio.uio_iov = &aiov;
349 auio.uio_iovcnt = 1;
350 auio.uio_offset = offset;
351 if (nbyte > INT_MAX)
352 return (EINVAL);
353 auio.uio_resid = nbyte;
354 auio.uio_rw = UIO_WRITE;
355 auio.uio_segflg = UIO_USERSPACE;
356 auio.uio_td = td;
357 #ifdef KTRACE
358 if (KTRPOINT(td, KTR_GENIO))
359 ktruio = cloneuio(&auio);
360 #endif
361 cnt = nbyte;
362 if (fp->f_type == DTYPE_VNODE)
363 bwillwrite();
364 if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
365 if (auio.uio_resid != cnt && (error == ERESTART ||
366 error == EINTR || error == EWOULDBLOCK))
367 error = 0;
368 /* Socket layer is responsible for issuing SIGPIPE. */
369 if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
370 PROC_LOCK(td->td_proc);
371 psignal(td->td_proc, SIGPIPE);
372 PROC_UNLOCK(td->td_proc);
373 }
374 }
375 cnt -= auio.uio_resid;
376 #ifdef KTRACE
377 if (ktruio != NULL) {
378 ktruio->uio_resid = cnt;
379 ktrgenio(fd, UIO_WRITE, ktruio, error);
380 }
381 #endif
382 td->td_retval[0] = cnt;
383 return (error);
384 }
385
386 /*
387 * Gather write system call
388 */
389 #ifndef _SYS_SYSPROTO_H_
390 struct writev_args {
391 int fd;
392 struct iovec *iovp;
393 u_int iovcnt;
394 };
395 #endif
396 /*
397 * MPSAFE
398 */
399 int
400 writev(struct thread *td, struct writev_args *uap)
401 {
402 struct file *fp;
403 struct uio *auio = NULL;
404 long cnt;
405 int error;
406 #ifdef KTRACE
407 struct uio *ktruio = NULL;
408 #endif
409
410 error = fget_write(td, uap->fd, &fp);
411 if (error)
412 return (EBADF);
413 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
414 if (error) {
415 fdrop(fp, td);
416 return (error);
417 }
418 auio->uio_rw = UIO_WRITE;
419 auio->uio_td = td;
420 #ifdef KTRACE
421 if (KTRPOINT(td, KTR_GENIO))
422 ktruio = cloneuio(auio);
423 #endif
424 cnt = auio->uio_resid;
425 if (fp->f_type == DTYPE_VNODE)
426 bwillwrite();
427 if ((error = fo_write(fp, auio, td->td_ucred, 0, td))) {
428 if (auio->uio_resid != cnt && (error == ERESTART ||
429 error == EINTR || error == EWOULDBLOCK))
430 error = 0;
431 if (error == EPIPE) {
432 PROC_LOCK(td->td_proc);
433 psignal(td->td_proc, SIGPIPE);
434 PROC_UNLOCK(td->td_proc);
435 }
436 }
437 cnt -= auio->uio_resid;
438 #ifdef KTRACE
439 if (ktruio != NULL) {
440 ktruio->uio_resid = cnt;
441 ktrgenio(uap->fd, UIO_WRITE, ktruio, error);
442 }
443 #endif
444 td->td_retval[0] = cnt;
445 fdrop(fp, td);
446 free(auio, M_IOV);
447 return (error);
448 }
449
450 /*
451 * Ioctl system call
452 */
#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/*
 * MPSAFE
 */
/* ARGSUSED */
/*
 * ioctl() system call: generic file/device control.  The close-on-exec
 * requests and the common FIONBIO/FIOASYNC requests are handled here;
 * everything else is forwarded to the file's fo_ioctl method after the
 * argument data (encoded in the high bits of com) is copied in/out.
 */
int
ioctl(td, uap)
	struct thread *td;
	register struct ioctl_args *uap;
{
	struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	int error = 0;
	register u_int size;
	caddr_t data, memp;
	int tmp;
	/* Requests with small argument structures avoid malloc(). */
#define STK_PARAMS	128
	union {
	    char stkbuf[STK_PARAMS];
	    long align;		/* forces suitable buffer alignment */
	} ubuf;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	/* fo_ioctl backends are not yet Giant-free. */
	mtx_lock(&Giant);
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	switch (com = uap->com) {
	case FIONCLEX:
		/* Clear close-on-exec; purely a descriptor-table op. */
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	case FIOCLEX:
		/* Set close-on-exec; purely a descriptor-table op. */
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (ENOTTY);
	}

	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		/* Argument too large for the stack buffer. */
		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com&IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				goto done;
			}
		} else {
			/* Zero-length IOC_IN: pass the pointer itself. */
			*(caddr_t *)data = uap->data;
		}
	} else if ((com&IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com&IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		/* Keep f_flag in sync before notifying the backend. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		break;

	case FIOASYNC:
		/* Keep f_flag in sync before notifying the backend. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td->td_ucred, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
done:
	mtx_unlock(&Giant);
	return (error);
}
587
588 /*
589 * sellock and selwait are initialized in selectinit() via SYSINIT.
590 */
591 struct mtx sellock;
592 struct cv selwait;
593 u_int nselcoll; /* Select collisions since boot */
594 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
595
596 /*
597 * Select system call.
598 */
599 #ifndef _SYS_SYSPROTO_H_
600 struct select_args {
601 int nd;
602 fd_set *in, *ou, *ex;
603 struct timeval *tv;
604 };
605 #endif
606 /*
607 * MPSAFE
608 */
609 int
610 select(td, uap)
611 register struct thread *td;
612 register struct select_args *uap;
613 {
614 struct timeval tv, *tvp;
615 int error;
616
617 if (uap->tv != NULL) {
618 error = copyin(uap->tv, &tv, sizeof(tv));
619 if (error)
620 return (error);
621 tvp = &tv;
622 } else
623 tvp = NULL;
624
625 return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
626 }
627
/*
 * Guts of select().  nd is the number of descriptors to examine;
 * fd_in/fd_ou/fd_ex may each be NULL.  Scans the sets, and if nothing
 * is ready sleeps on selwait until an event, timeout or signal.  On
 * success the ready count is in td->td_retval[0] and the output sets
 * are copied back to userspace.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	/*
	 * XXX: kern_select() currently requires that we acquire Giant
	 * even if none of the file descriptors we poll requires Giant.
	 */
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.  Input halves occupy the upper half of the buffer,
	 * output halves the lower half; obits[x] is valid only when the
	 * corresponding user set is non-NULL (putbits guards on that).
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (tvp != NULL) {
		/* Convert the relative timeout to an absolute uptime. */
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* An all-zero atv means "wait forever". */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp very long sleeps to one day per wait. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}
790
/*
 * Scan the three input bit vectors (read/write/except), polling each
 * referenced file and setting the corresponding bit in the matching
 * output vector when the file is ready.  Runs with the filedesc lock
 * held across the scan.  The ready count goes in td->td_retval[0];
 * returns EBADF if a set bit names a closed descriptor.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				/* A set bit on a closed fd is an error. */
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
832
833 /*
834 * Poll system call.
835 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
/*
 * poll() system call: copy in the pollfd array, scan it, and if
 * nothing is ready sleep on selwait until an event, the millisecond
 * timeout, or a signal.  The ready count goes in td->td_retval[0].
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	struct pollfd *bits;
	struct pollfd smallbits[32];	/* avoids malloc() for small sets */
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * XXX: poll() currently requires that we acquire Giant even if
	 * none of the file descriptors we poll requires Giant.
	 */
	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	if (uap->timeout != INFTIM) {
		/* Convert the millisecond timeout to an absolute uptime. */
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* An all-zero atv means "wait forever". */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp very long sleeps to one day per wait. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* Copy revents back to userspace. */
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}
969
970 static int
971 pollscan(td, fds, nfd)
972 struct thread *td;
973 struct pollfd *fds;
974 u_int nfd;
975 {
976 register struct filedesc *fdp = td->td_proc->p_fd;
977 int i;
978 struct file *fp;
979 int n = 0;
980
981 FILEDESC_LOCK(fdp);
982 for (i = 0; i < nfd; i++, fds++) {
983 if (fds->fd >= fdp->fd_nfiles) {
984 fds->revents = POLLNVAL;
985 n++;
986 } else if (fds->fd < 0) {
987 fds->revents = 0;
988 } else {
989 fp = fdp->fd_ofiles[fds->fd];
990 if (fp == NULL) {
991 fds->revents = POLLNVAL;
992 n++;
993 } else {
994 /*
995 * Note: backend also returns POLLHUP and
996 * POLLERR if appropriate.
997 */
998 fds->revents = fo_poll(fp, fds->events,
999 td->td_ucred, td);
1000 if (fds->revents != 0)
1001 n++;
1002 }
1003 }
1004 }
1005 FILEDESC_UNLOCK(fdp);
1006 td->td_retval[0] = n;
1007 return (0);
1008 }
1009
1010 /*
1011 * OpenBSD poll system call.
1012 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
1013 */
1014 #ifndef _SYS_SYSPROTO_H_
1015 struct openbsd_poll_args {
1016 struct pollfd *fds;
1017 u_int nfds;
1018 int timeout;
1019 };
1020 #endif
1021 /*
1022 * MPSAFE
1023 */
1024 int
1025 openbsd_poll(td, uap)
1026 register struct thread *td;
1027 register struct openbsd_poll_args *uap;
1028 {
1029 return (poll(td, (struct poll_args *)uap));
1030 }
1031
1032 /*
1033 * Remove the references to the thread from all of the objects
1034 * we were polling.
1035 *
1036 * This code assumes that the underlying owner of the selinfo
1037 * structure will hold sellock before it changes it, and that
1038 * it will unlink itself from our list if it goes away.
1039 */
1040 void
1041 clear_selinfo_list(td)
1042 struct thread *td;
1043 {
1044 struct selinfo *si;
1045
1046 mtx_assert(&sellock, MA_OWNED);
1047 TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1048 si->si_thread = NULL;
1049 TAILQ_INIT(&td->td_selq);
1050 }
1051
1052 /*
1053 * Record a select request.
1054 */
1055 void
1056 selrecord(selector, sip)
1057 struct thread *selector;
1058 struct selinfo *sip;
1059 {
1060
1061 mtx_lock(&sellock);
1062 /*
1063 * If the selinfo's thread pointer is NULL then take ownership of it.
1064 *
1065 * If the thread pointer is not NULL and it points to another
1066 * thread, then we have a collision.
1067 *
1068 * If the thread pointer is not NULL and points back to us then leave
1069 * it alone as we've already added pointed it at us and added it to
1070 * our list.
1071 */
1072 if (sip->si_thread == NULL) {
1073 sip->si_thread = selector;
1074 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1075 } else if (sip->si_thread != selector) {
1076 sip->si_flags |= SI_COLL;
1077 }
1078
1079 mtx_unlock(&sellock);
1080 }
1081
/* Wake up a selecting thread. */
void
selwakeup(struct selinfo *sip)
{

	doselwakeup(sip, -1);
}
1089
/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(struct selinfo *sip, int pri)
{

	doselwakeup(sip, pri);
}
1098
1099 /*
1100 * Do a wakeup when a selectable event occurs.
1101 */
1102 static void
1103 doselwakeup(sip, pri)
1104 struct selinfo *sip;
1105 int pri;
1106 {
1107 struct thread *td;
1108
1109 mtx_lock(&sellock);
1110 td = sip->si_thread;
1111 if ((sip->si_flags & SI_COLL) != 0) {
1112 nselcoll++;
1113 sip->si_flags &= ~SI_COLL;
1114 cv_broadcastpri(&selwait, pri);
1115 }
1116 if (td == NULL) {
1117 mtx_unlock(&sellock);
1118 return;
1119 }
1120 TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
1121 sip->si_thread = NULL;
1122 mtx_lock_spin(&sched_lock);
1123 td->td_flags &= ~TDF_SELECT;
1124 mtx_unlock_spin(&sched_lock);
1125 sleepq_remove(td, &selwait);
1126 mtx_unlock(&sellock);
1127 }
1128
1129 static void selectinit(void *);
1130 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1131
1132 /* ARGSUSED*/
1133 static void
1134 selectinit(dummy)
1135 void *dummy;
1136 {
1137 cv_init(&selwait, "select");
1138 mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1139 }