1 /*-
2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
35 */
36
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39
40 #include "opt_ktrace.h"
41
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/sysproto.h>
45 #include <sys/filedesc.h>
46 #include <sys/filio.h>
47 #include <sys/fcntl.h>
48 #include <sys/file.h>
49 #include <sys/proc.h>
50 #include <sys/signalvar.h>
51 #include <sys/socketvar.h>
52 #include <sys/uio.h>
53 #include <sys/kernel.h>
54 #include <sys/limits.h>
55 #include <sys/malloc.h>
56 #include <sys/poll.h>
57 #include <sys/resourcevar.h>
58 #include <sys/selinfo.h>
59 #include <sys/sleepqueue.h>
60 #include <sys/syscallsubr.h>
61 #include <sys/sysctl.h>
62 #include <sys/sysent.h>
63 #include <sys/vnode.h>
64 #include <sys/bio.h>
65 #include <sys/buf.h>
66 #include <sys/condvar.h>
67 #ifdef KTRACE
68 #include <sys/ktrace.h>
69 #endif
70 #include <vm/vm.h>
71 #include <vm/vm_page.h>
72
73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
75 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
76
77 static int pollscan(struct thread *, struct pollfd *, u_int);
78 static int selscan(struct thread *, fd_mask **, fd_mask **, int);
79 static int dofileread(struct thread *, int, struct file *, struct uio *,
80 off_t, int);
81 static int dofilewrite(struct thread *, int, struct file *, struct uio *,
82 off_t, int);
83 static void doselwakeup(struct selinfo *, int);
84
85 /*
86 * Read system call.
87 */
88 #ifndef _SYS_SYSPROTO_H_
89 struct read_args {
90 int fd;
91 void *buf;
92 size_t nbyte;
93 };
94 #endif
95 /*
96 * MPSAFE
97 */
98 int
99 read(td, uap)
100 struct thread *td;
101 struct read_args *uap;
102 {
103 struct uio auio;
104 struct iovec aiov;
105 int error;
106
107 if (uap->nbyte > INT_MAX)
108 return (EINVAL);
109 aiov.iov_base = uap->buf;
110 aiov.iov_len = uap->nbyte;
111 auio.uio_iov = &aiov;
112 auio.uio_iovcnt = 1;
113 auio.uio_resid = uap->nbyte;
114 auio.uio_segflg = UIO_USERSPACE;
115 error = kern_readv(td, uap->fd, &auio);
116 return(error);
117 }
118
119 /*
120 * Positioned read system call
121 */
122 #ifndef _SYS_SYSPROTO_H_
123 struct pread_args {
124 int fd;
125 void *buf;
126 size_t nbyte;
127 int pad;
128 off_t offset;
129 };
130 #endif
131 /*
132 * MPSAFE
133 */
134 int
135 pread(td, uap)
136 struct thread *td;
137 struct pread_args *uap;
138 {
139 struct uio auio;
140 struct iovec aiov;
141 int error;
142
143 if (uap->nbyte > INT_MAX)
144 return (EINVAL);
145 aiov.iov_base = uap->buf;
146 aiov.iov_len = uap->nbyte;
147 auio.uio_iov = &aiov;
148 auio.uio_iovcnt = 1;
149 auio.uio_resid = uap->nbyte;
150 auio.uio_segflg = UIO_USERSPACE;
151 error = kern_preadv(td, uap->fd, &auio, uap->offset);
152 return(error);
153 }
154
155 /*
156 * Scatter read system call.
157 */
158 #ifndef _SYS_SYSPROTO_H_
159 struct readv_args {
160 int fd;
161 struct iovec *iovp;
162 u_int iovcnt;
163 };
164 #endif
165 /*
166 * MPSAFE
167 */
168 int
169 readv(struct thread *td, struct readv_args *uap)
170 {
171 struct uio *auio;
172 int error;
173
174 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
175 if (error)
176 return (error);
177 error = kern_readv(td, uap->fd, auio);
178 free(auio, M_IOV);
179 return (error);
180 }
181
182 int
183 kern_readv(struct thread *td, int fd, struct uio *auio)
184 {
185 struct file *fp;
186 int error;
187
188 error = fget_read(td, fd, &fp);
189 if (error)
190 return (error);
191 error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
192 fdrop(fp, td);
193 return (error);
194 }
195
196 /*
197 * Scatter positioned read system call.
198 */
199 #ifndef _SYS_SYSPROTO_H_
200 struct preadv_args {
201 int fd;
202 struct iovec *iovp;
203 u_int iovcnt;
204 off_t offset;
205 };
206 #endif
207 /*
208 * MPSAFE
209 */
210 int
211 preadv(struct thread *td, struct preadv_args *uap)
212 {
213 struct uio *auio;
214 int error;
215
216 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
217 if (error)
218 return (error);
219 error = kern_preadv(td, uap->fd, auio, uap->offset);
220 free(auio, M_IOV);
221 return (error);
222 }
223
224 int
225 kern_preadv(td, fd, auio, offset)
226 struct thread *td;
227 int fd;
228 struct uio *auio;
229 off_t offset;
230 {
231 struct file *fp;
232 int error;
233
234 error = fget_read(td, fd, &fp);
235 if (error)
236 return (error);
237 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
238 error = ESPIPE;
239 else if (offset < 0 && fp->f_vnode->v_type != VCHR)
240 error = EINVAL;
241 else
242 error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
243 fdrop(fp, td);
244 return (error);
245 }
246
/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 *
 * On return td->td_retval[0] holds the number of bytes transferred.
 * The caller owns the reference on 'fp' and drops it afterwards;
 * 'auio' is consumed (uio_resid is decremented by the amount read).
 */
static int
dofileread(td, fd, fp, auio, offset, flags)
	struct thread *td;
	int fd;			/* descriptor number; used only for ktrace */
	struct file *fp;
	struct uio *auio;
	off_t offset;		/* -1 unless FOF_OFFSET is set in flags */
	int flags;
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	/* Finish zero length reads right here */
	if (auio->uio_resid == 0) {
		td->td_retval[0] = 0;
		return(0);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_offset = offset;
	auio->uio_td = td;
#ifdef KTRACE
	/*
	 * Clone the uio before fo_read() consumes it so the ktrace
	 * record can describe the original request.
	 */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
		/*
		 * If some data was transferred before the interruption
		 * or would-block condition, report the short read as
		 * success rather than the error.
		 */
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio->uio_resid;		/* bytes actually transferred */
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
294
295 /*
296 * Write system call
297 */
298 #ifndef _SYS_SYSPROTO_H_
299 struct write_args {
300 int fd;
301 const void *buf;
302 size_t nbyte;
303 };
304 #endif
305 /*
306 * MPSAFE
307 */
308 int
309 write(td, uap)
310 struct thread *td;
311 struct write_args *uap;
312 {
313 struct uio auio;
314 struct iovec aiov;
315 int error;
316
317 if (uap->nbyte > INT_MAX)
318 return (EINVAL);
319 aiov.iov_base = (void *)(uintptr_t)uap->buf;
320 aiov.iov_len = uap->nbyte;
321 auio.uio_iov = &aiov;
322 auio.uio_iovcnt = 1;
323 auio.uio_resid = uap->nbyte;
324 auio.uio_segflg = UIO_USERSPACE;
325 error = kern_writev(td, uap->fd, &auio);
326 return(error);
327 }
328
329 /*
330 * Positioned write system call
331 */
332 #ifndef _SYS_SYSPROTO_H_
333 struct pwrite_args {
334 int fd;
335 const void *buf;
336 size_t nbyte;
337 int pad;
338 off_t offset;
339 };
340 #endif
341 /*
342 * MPSAFE
343 */
344 int
345 pwrite(td, uap)
346 struct thread *td;
347 struct pwrite_args *uap;
348 {
349 struct uio auio;
350 struct iovec aiov;
351 int error;
352
353 if (uap->nbyte > INT_MAX)
354 return (EINVAL);
355 aiov.iov_base = (void *)(uintptr_t)uap->buf;
356 aiov.iov_len = uap->nbyte;
357 auio.uio_iov = &aiov;
358 auio.uio_iovcnt = 1;
359 auio.uio_resid = uap->nbyte;
360 auio.uio_segflg = UIO_USERSPACE;
361 error = kern_pwritev(td, uap->fd, &auio, uap->offset);
362 return(error);
363 }
364
365 /*
366 * Gather write system call
367 */
368 #ifndef _SYS_SYSPROTO_H_
369 struct writev_args {
370 int fd;
371 struct iovec *iovp;
372 u_int iovcnt;
373 };
374 #endif
375 /*
376 * MPSAFE
377 */
378 int
379 writev(struct thread *td, struct writev_args *uap)
380 {
381 struct uio *auio;
382 int error;
383
384 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
385 if (error)
386 return (error);
387 error = kern_writev(td, uap->fd, auio);
388 free(auio, M_IOV);
389 return (error);
390 }
391
392 int
393 kern_writev(struct thread *td, int fd, struct uio *auio)
394 {
395 struct file *fp;
396 int error;
397
398 error = fget_write(td, fd, &fp);
399 if (error)
400 return (EBADF); /* XXX this can't be right */
401 error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
402 fdrop(fp, td);
403 return (error);
404 }
405
406 /*
407 * Gather positioned write system call
408 */
409 #ifndef _SYS_SYSPROTO_H_
410 struct pwritev_args {
411 int fd;
412 struct iovec *iovp;
413 u_int iovcnt;
414 off_t offset;
415 };
416 #endif
417 /*
418 * MPSAFE
419 */
420 int
421 pwritev(struct thread *td, struct pwritev_args *uap)
422 {
423 struct uio *auio;
424 int error;
425
426 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
427 if (error)
428 return (error);
429 error = kern_pwritev(td, uap->fd, auio, uap->offset);
430 free(auio, M_IOV);
431 return (error);
432 }
433
434 int
435 kern_pwritev(td, fd, auio, offset)
436 struct thread *td;
437 struct uio *auio;
438 int fd;
439 off_t offset;
440 {
441 struct file *fp;
442 int error;
443
444 error = fget_write(td, fd, &fp);
445 if (error)
446 return (EBADF); /* XXX this can't be right */
447 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
448 error = ESPIPE;
449 else if (offset < 0 && fp->f_vnode->v_type != VCHR)
450 error = EINVAL;
451 else
452 error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
453 fdrop(fp, td);
454 return (error);
455 }
456
/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 *
 * On return td->td_retval[0] holds the number of bytes written.  A
 * write failing with EPIPE additionally posts SIGPIPE to the process.
 */
static int
dofilewrite(td, fd, fp, auio, offset, flags)
	struct thread *td;
	int fd;			/* descriptor number; used only for ktrace */
	struct file *fp;
	struct uio *auio;
	off_t offset;		/* -1 unless FOF_OFFSET is set in flags */
	int flags;
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	auio->uio_rw = UIO_WRITE;
	auio->uio_td = td;
	auio->uio_offset = offset;
#ifdef KTRACE
	/*
	 * Clone the uio before fo_write() consumes it so the ktrace
	 * record can describe the original request.
	 */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	/* NOTE(review): bwillwrite() looks like buffer-cache write
	 * throttling for vnode-backed files -- confirm in vfs_bio. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
		/*
		 * If some data was transferred before the interruption
		 * or would-block condition, report the short write as
		 * success rather than the error.
		 */
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio->uio_resid;		/* bytes actually transferred */
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
507
508 /*
509 * Ioctl system call
510 */
511 #ifndef _SYS_SYSPROTO_H_
512 struct ioctl_args {
513 int fd;
514 u_long com;
515 caddr_t data;
516 };
517 #endif
518 /*
519 * MPSAFE
520 */
521 /* ARGSUSED */
int
ioctl(struct thread *td, struct ioctl_args *uap)
{
	struct file *fp;
	struct filedesc *fdp;
	u_long com;
	int error = 0;
	u_int size;
	caddr_t data, memp;	/* memp: kernel bounce buffer (or NULL) */
	int tmp;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	/* The descriptor must be open for reading or writing. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	/*
	 * FIONCLEX/FIOCLEX manipulate the close-on-exec flag, which
	 * lives in the descriptor table rather than in the file, so
	 * they are handled here without calling fo_ioctl().
	 */
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK_FAST(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK_FAST(fdp);
		fdrop(fp, td);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK_FAST(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK_FAST(fdp);
		fdrop(fp, td);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0)) {
		fdrop(fp, td);
		return (ENOTTY);
	}

	if (size > 0) {
		/* Bounce the argument through a kernel buffer. */
		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		memp = NULL;
		/* No data block: hand down the argument word itself. */
		data = (void *)&uap->data;
	}
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error) {
			free(memp, M_IOCTLOPS);
			fdrop(fp, td);
			return (error);
		}
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	}

	/*
	 * FIONBIO/FIOASYNC are applied to f_flag here and then still
	 * passed down via fo_ioctl() so the backing object sees them.
	 */
	if (com == FIONBIO) {
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
	} else if (com == FIOASYNC) {
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
	}

	error = fo_ioctl(fp, com, data, td->td_ucred, td);

	/* Copy the result back out for _IOR/_IOWR commands. */
	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

	if (memp != NULL)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
	return (error);
}
616
617 /*
618 * sellock and selwait are initialized in selectinit() via SYSINIT.
619 */
620 struct mtx sellock;
621 struct cv selwait;
622 u_int nselcoll; /* Select collisions since boot */
623 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
624
625 /*
626 * Select system call.
627 */
628 #ifndef _SYS_SYSPROTO_H_
629 struct select_args {
630 int nd;
631 fd_set *in, *ou, *ex;
632 struct timeval *tv;
633 };
634 #endif
635 /*
636 * MPSAFE
637 */
638 int
639 select(td, uap)
640 register struct thread *td;
641 register struct select_args *uap;
642 {
643 struct timeval tv, *tvp;
644 int error;
645
646 if (uap->tv != NULL) {
647 error = copyin(uap->tv, &tv, sizeof(tv));
648 if (error)
649 return (error);
650 tvp = &tv;
651 } else
652 tvp = NULL;
653
654 return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
655 }
656
/*
 * kern_select(): back end shared by select(2).  fd_in/fd_ou/fd_ex are
 * user-space fd_set pointers, any of which may be NULL; tvp is an
 * already-copied-in relative timeout, or NULL to wait indefinitely.
 * On success td->td_retval[0] holds the number of ready descriptors.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;

	FILEDESC_LOCK_FAST(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK_FAST(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;	/* input copy + output copy */
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do { \
		if (name == NULL) \
			ibits[x] = NULL; \
		else { \
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \
			obits[x] = sbp; \
			sbp += ncpbytes / sizeof *sbp; \
			error = copyin(name, ibits[x], ncpbytes); \
			if (error != 0) \
				goto done_nosellock; \
		} \
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);	/* clear the output halves */

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		/* Convert the relative timeout to an absolute uptime. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* Zero atv means "block without timeout" below. */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/*
	 * Snapshot the collision counter and advertise TDF_SELECT;
	 * both are rechecked after the scan to catch events that
	 * arrive while sellock is dropped.
	 */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;	/* deadline already passed */
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the tick conversion to 24 hours. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	/* Woken without error/timeout/signal: rescan for the event. */
	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}
814
/*
 * selscan(): poll every descriptor set in the three input bit vectors
 * and mark the ready ones in the output vectors.  The filedesc lock is
 * held for the duration of the scan; the count of ready descriptors is
 * returned via td->td_retval[0].  EBADF if a set bit names a closed fd.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;	/* number of ready descriptors found */
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {		/* in, out, except */
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
856
857 /*
858 * Poll system call.
859 */
860 #ifndef _SYS_SYSPROTO_H_
861 struct poll_args {
862 struct pollfd *fds;
863 u_int nfds;
864 int timeout;
865 };
866 #endif
867 /*
868 * MPSAFE
869 */
/*
 * poll(2): copy the pollfd array in, scan it, and sleep on the shared
 * select/poll condition variable until an event, the timeout, or a
 * signal; revents are copied back to user space on success.
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	struct pollfd *bits;
	struct pollfd smallbits[32];	/* avoids malloc for typical sizes */
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	if (uap->timeout != INFTIM) {
		/* Convert the millisecond timeout to an absolute uptime. */
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* INFTIM: zero atv means "block without timeout" below. */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/*
	 * Snapshot the collision counter and advertise TDF_SELECT;
	 * both are rechecked after the scan to catch events that
	 * arrive while sellock is dropped.
	 */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;	/* deadline already passed */
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the tick conversion to 24 hours. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	/* Woken without error/timeout/signal: rescan for the event. */
	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	return (error);
}
987
/*
 * pollscan(): evaluate each pollfd entry against its file and fill in
 * revents.  Out-of-range or closed descriptors yield POLLNVAL (and
 * count as ready, so poll() returns immediately); negative fds are
 * skipped.  The number of entries with nonzero revents is returned
 * via td->td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;	/* entries with nonzero revents */

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			/* Beyond the descriptor table: flag and count it. */
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fds are ignored, not errors. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
1027
1028 /*
1029 * OpenBSD poll system call.
1030 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
1031 */
1032 #ifndef _SYS_SYSPROTO_H_
1033 struct openbsd_poll_args {
1034 struct pollfd *fds;
1035 u_int nfds;
1036 int timeout;
1037 };
1038 #endif
1039 /*
1040 * MPSAFE
1041 */
/*
 * OpenBSD-compatible poll entry point; the argument layout matches
 * poll(2) exactly, so simply forward the call.
 */
int
openbsd_poll(struct thread *td, struct openbsd_poll_args *uap)
{

	return (poll(td, (struct poll_args *)uap));
}
1049
1050 /*
1051 * Remove the references to the thread from all of the objects
1052 * we were polling.
1053 *
1054 * This code assumes that the underlying owner of the selinfo
1055 * structure will hold sellock before it changes it, and that
1056 * it will unlink itself from our list if it goes away.
1057 */
1058 void
1059 clear_selinfo_list(td)
1060 struct thread *td;
1061 {
1062 struct selinfo *si;
1063
1064 mtx_assert(&sellock, MA_OWNED);
1065 TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1066 si->si_thread = NULL;
1067 TAILQ_INIT(&td->td_selq);
1068 }
1069
1070 /*
1071 * Record a select request.
1072 */
1073 void
1074 selrecord(selector, sip)
1075 struct thread *selector;
1076 struct selinfo *sip;
1077 {
1078
1079 mtx_lock(&sellock);
1080 /*
1081 * If the selinfo's thread pointer is NULL then take ownership of it.
1082 *
1083 * If the thread pointer is not NULL and it points to another
1084 * thread, then we have a collision.
1085 *
1086 * If the thread pointer is not NULL and points back to us then leave
1087 * it alone as we've already added pointed it at us and added it to
1088 * our list.
1089 */
1090 if (sip->si_thread == NULL) {
1091 sip->si_thread = selector;
1092 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1093 } else if (sip->si_thread != selector) {
1094 sip->si_flags |= SI_COLL;
1095 }
1096
1097 mtx_unlock(&sellock);
1098 }
1099
1100 /* Wake up a selecting thread. */
/* Wake up a selecting thread without adjusting its priority. */
void
selwakeup(struct selinfo *sip)
{

	doselwakeup(sip, -1);
}
1107
1108 /* Wake up a selecting thread, and set its priority. */
/* Wake up a selecting thread, passing 'pri' through to the wakeup. */
void
selwakeuppri(struct selinfo *sip, int pri)
{

	doselwakeup(sip, pri);
}
1116
/*
 * Do a wakeup when a selectable event occurs.
 *
 * 'pri' is handed to cv_broadcastpri(); selwakeup() passes -1.  If the
 * selinfo was flagged SI_COLL, multiple threads were selecting on it,
 * so every select waiter is broadcast; the single owning thread
 * (si_thread), if any, is additionally detached and pulled off the
 * sleep queue.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		/* Collision: more than one selector; wake them all. */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	/* Detach the selinfo from its owning thread. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	/*
	 * Clearing TDF_SELECT makes the selector rescan its
	 * descriptors if it raced with this wakeup (see the retry
	 * check in kern_select()/poll()).
	 */
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	sleepq_remove(td, &selwait);
	mtx_unlock(&sellock);
}
1146
1147 static void selectinit(void *);
1148 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1149
1150 /* ARGSUSED*/
1151 static void
1152 selectinit(dummy)
1153 void *dummy;
1154 {
1155 cv_init(&selwait, "select");
1156 mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1157 }
Cache object: 4887d6a62a5471d9ddbee5f746b9d5a8
|