1 /*-
2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
35 */
36
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39
40 #include "opt_compat.h"
41 #include "opt_ktrace.h"
42
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/sysproto.h>
46 #include <sys/filedesc.h>
47 #include <sys/filio.h>
48 #include <sys/fcntl.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/signalvar.h>
52 #include <sys/socketvar.h>
53 #include <sys/uio.h>
54 #include <sys/kernel.h>
55 #include <sys/limits.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/resourcevar.h>
59 #include <sys/selinfo.h>
60 #include <sys/sleepqueue.h>
61 #include <sys/syscallsubr.h>
62 #include <sys/sysctl.h>
63 #include <sys/sysent.h>
64 #include <sys/vnode.h>
65 #include <sys/bio.h>
66 #include <sys/buf.h>
67 #include <sys/condvar.h>
68 #ifdef KTRACE
69 #include <sys/ktrace.h>
70 #endif
71
72 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
73 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
74 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
75
76 static int pollscan(struct thread *, struct pollfd *, u_int);
77 static int selscan(struct thread *, fd_mask **, fd_mask **, int);
78 static int dofileread(struct thread *, int, struct file *, struct uio *,
79 off_t, int);
80 static int dofilewrite(struct thread *, int, struct file *, struct uio *,
81 off_t, int);
82 static void doselwakeup(struct selinfo *, int);
83
84 #ifndef _SYS_SYSPROTO_H_
85 struct read_args {
86 int fd;
87 void *buf;
88 size_t nbyte;
89 };
90 #endif
91 int
92 read(td, uap)
93 struct thread *td;
94 struct read_args *uap;
95 {
96 struct uio auio;
97 struct iovec aiov;
98 int error;
99
100 if (uap->nbyte > INT_MAX)
101 return (EINVAL);
102 aiov.iov_base = uap->buf;
103 aiov.iov_len = uap->nbyte;
104 auio.uio_iov = &aiov;
105 auio.uio_iovcnt = 1;
106 auio.uio_resid = uap->nbyte;
107 auio.uio_segflg = UIO_USERSPACE;
108 error = kern_readv(td, uap->fd, &auio);
109 return(error);
110 }
111
112 /*
113 * Positioned read system call
114 */
115 #ifndef _SYS_SYSPROTO_H_
116 struct pread_args {
117 int fd;
118 void *buf;
119 size_t nbyte;
120 int pad;
121 off_t offset;
122 };
123 #endif
124 int
125 pread(td, uap)
126 struct thread *td;
127 struct pread_args *uap;
128 {
129 struct uio auio;
130 struct iovec aiov;
131 int error;
132
133 if (uap->nbyte > INT_MAX)
134 return (EINVAL);
135 aiov.iov_base = uap->buf;
136 aiov.iov_len = uap->nbyte;
137 auio.uio_iov = &aiov;
138 auio.uio_iovcnt = 1;
139 auio.uio_resid = uap->nbyte;
140 auio.uio_segflg = UIO_USERSPACE;
141 error = kern_preadv(td, uap->fd, &auio, uap->offset);
142 return(error);
143 }
144
145 int
146 freebsd6_pread(td, uap)
147 struct thread *td;
148 struct freebsd6_pread_args *uap;
149 {
150 struct pread_args oargs;
151
152 oargs.fd = uap->fd;
153 oargs.buf = uap->buf;
154 oargs.nbyte = uap->nbyte;
155 oargs.offset = uap->offset;
156 return (pread(td, &oargs));
157 }
158
159 /*
160 * Scatter read system call.
161 */
162 #ifndef _SYS_SYSPROTO_H_
163 struct readv_args {
164 int fd;
165 struct iovec *iovp;
166 u_int iovcnt;
167 };
168 #endif
169 int
170 readv(struct thread *td, struct readv_args *uap)
171 {
172 struct uio *auio;
173 int error;
174
175 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
176 if (error)
177 return (error);
178 error = kern_readv(td, uap->fd, auio);
179 free(auio, M_IOV);
180 return (error);
181 }
182
183 int
184 kern_readv(struct thread *td, int fd, struct uio *auio)
185 {
186 struct file *fp;
187 int error;
188
189 error = fget_read(td, fd, &fp);
190 if (error)
191 return (error);
192 error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
193 fdrop(fp, td);
194 return (error);
195 }
196
197 /*
198 * Scatter positioned read system call.
199 */
200 #ifndef _SYS_SYSPROTO_H_
201 struct preadv_args {
202 int fd;
203 struct iovec *iovp;
204 u_int iovcnt;
205 off_t offset;
206 };
207 #endif
208 int
209 preadv(struct thread *td, struct preadv_args *uap)
210 {
211 struct uio *auio;
212 int error;
213
214 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
215 if (error)
216 return (error);
217 error = kern_preadv(td, uap->fd, auio, uap->offset);
218 free(auio, M_IOV);
219 return (error);
220 }
221
222 int
223 kern_preadv(td, fd, auio, offset)
224 struct thread *td;
225 int fd;
226 struct uio *auio;
227 off_t offset;
228 {
229 struct file *fp;
230 int error;
231
232 error = fget_read(td, fd, &fp);
233 if (error)
234 return (error);
235 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
236 error = ESPIPE;
237 else if (offset < 0 && fp->f_vnode->v_type != VCHR)
238 error = EINVAL;
239 else
240 error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
241 fdrop(fp, td);
242 return (error);
243 }
244
245 /*
246 * Common code for readv and preadv that reads data in
247 * from a file using the passed in uio, offset, and flags.
248 */
/*
 * Common code for read/readv/pread/preadv: perform the transfer
 * described by auio from fp.  With FOF_OFFSET in flags the supplied
 * offset is used; otherwise the file's own position applies.  The
 * byte count actually read is returned via td->td_retval[0].
 */
static int
dofileread(td, fd, fp, auio, offset, flags)
	struct thread *td;
	int fd;
	struct file *fp;
	struct uio *auio;
	off_t offset;
	int flags;
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	/* Finish zero length reads right here */
	if (auio->uio_resid == 0) {
		td->td_retval[0] = 0;
		return(0);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_offset = offset;
	auio->uio_td = td;
#ifdef KTRACE
	/* Clone the uio before fo_read() consumes it, for the trace record. */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
		/*
		 * A read interrupted after transferring some data is
		 * reported to the caller as a successful short read.
		 */
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio->uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
292
293 #ifndef _SYS_SYSPROTO_H_
294 struct write_args {
295 int fd;
296 const void *buf;
297 size_t nbyte;
298 };
299 #endif
300 int
301 write(td, uap)
302 struct thread *td;
303 struct write_args *uap;
304 {
305 struct uio auio;
306 struct iovec aiov;
307 int error;
308
309 if (uap->nbyte > INT_MAX)
310 return (EINVAL);
311 aiov.iov_base = (void *)(uintptr_t)uap->buf;
312 aiov.iov_len = uap->nbyte;
313 auio.uio_iov = &aiov;
314 auio.uio_iovcnt = 1;
315 auio.uio_resid = uap->nbyte;
316 auio.uio_segflg = UIO_USERSPACE;
317 error = kern_writev(td, uap->fd, &auio);
318 return(error);
319 }
320
321 /*
322 * Positioned write system call.
323 */
324 #ifndef _SYS_SYSPROTO_H_
325 struct pwrite_args {
326 int fd;
327 const void *buf;
328 size_t nbyte;
329 int pad;
330 off_t offset;
331 };
332 #endif
333 int
334 pwrite(td, uap)
335 struct thread *td;
336 struct pwrite_args *uap;
337 {
338 struct uio auio;
339 struct iovec aiov;
340 int error;
341
342 if (uap->nbyte > INT_MAX)
343 return (EINVAL);
344 aiov.iov_base = (void *)(uintptr_t)uap->buf;
345 aiov.iov_len = uap->nbyte;
346 auio.uio_iov = &aiov;
347 auio.uio_iovcnt = 1;
348 auio.uio_resid = uap->nbyte;
349 auio.uio_segflg = UIO_USERSPACE;
350 error = kern_pwritev(td, uap->fd, &auio, uap->offset);
351 return(error);
352 }
353
354 int
355 freebsd6_pwrite(td, uap)
356 struct thread *td;
357 struct freebsd6_pwrite_args *uap;
358 {
359 struct pwrite_args oargs;
360
361 oargs.fd = uap->fd;
362 oargs.buf = uap->buf;
363 oargs.nbyte = uap->nbyte;
364 oargs.offset = uap->offset;
365 return (pwrite(td, &oargs));
366 }
367
368 /*
369 * Gather write system call.
370 */
371 #ifndef _SYS_SYSPROTO_H_
372 struct writev_args {
373 int fd;
374 struct iovec *iovp;
375 u_int iovcnt;
376 };
377 #endif
378 int
379 writev(struct thread *td, struct writev_args *uap)
380 {
381 struct uio *auio;
382 int error;
383
384 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
385 if (error)
386 return (error);
387 error = kern_writev(td, uap->fd, auio);
388 free(auio, M_IOV);
389 return (error);
390 }
391
392 int
393 kern_writev(struct thread *td, int fd, struct uio *auio)
394 {
395 struct file *fp;
396 int error;
397
398 error = fget_write(td, fd, &fp);
399 if (error)
400 return (error);
401 error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
402 fdrop(fp, td);
403 return (error);
404 }
405
406 /*
407 * Gather positioned write system call.
408 */
409 #ifndef _SYS_SYSPROTO_H_
410 struct pwritev_args {
411 int fd;
412 struct iovec *iovp;
413 u_int iovcnt;
414 off_t offset;
415 };
416 #endif
417 int
418 pwritev(struct thread *td, struct pwritev_args *uap)
419 {
420 struct uio *auio;
421 int error;
422
423 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
424 if (error)
425 return (error);
426 error = kern_pwritev(td, uap->fd, auio, uap->offset);
427 free(auio, M_IOV);
428 return (error);
429 }
430
431 int
432 kern_pwritev(td, fd, auio, offset)
433 struct thread *td;
434 struct uio *auio;
435 int fd;
436 off_t offset;
437 {
438 struct file *fp;
439 int error;
440
441 error = fget_write(td, fd, &fp);
442 if (error)
443 return (error);
444 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
445 error = ESPIPE;
446 else if (offset < 0 && fp->f_vnode->v_type != VCHR)
447 error = EINVAL;
448 else
449 error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
450 fdrop(fp, td);
451 return (error);
452 }
453
454 /*
455 * Common code for writev and pwritev that writes data to
456 * a file using the passed in uio, offset, and flags.
457 */
/*
 * Common code for write/writev/pwrite/pwritev: perform the transfer
 * described by auio to fp.  With FOF_OFFSET in flags the supplied
 * offset is used; otherwise the file's own position applies.  The
 * byte count actually written is returned via td->td_retval[0].
 */
static int
dofilewrite(td, fd, fp, auio, offset, flags)
	struct thread *td;
	int fd;
	struct file *fp;
	struct uio *auio;
	off_t offset;
	int flags;
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	auio->uio_rw = UIO_WRITE;
	auio->uio_td = td;
	auio->uio_offset = offset;
#ifdef KTRACE
	/* Clone the uio before fo_write() consumes it, for the trace record. */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
		/*
		 * A write interrupted after transferring some data is
		 * reported to the caller as a successful short write.
		 */
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio->uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
504
505 #ifndef _SYS_SYSPROTO_H_
506 struct ioctl_args {
507 int fd;
508 u_long com;
509 caddr_t data;
510 };
511 #endif
512 /* ARGSUSED */
/*
 * ioctl(2) system call: decode the command word, stage the argument
 * data in a kernel buffer as directed by the IOC_IN/IOC_OUT/IOC_VOID
 * encoding, and dispatch to kern_ioctl().
 */
int
ioctl(struct thread *td, struct ioctl_args *uap)
{
	u_long com;
	int arg, error;
	u_int size;
	caddr_t data;

	/*
	 * Commands sign-extended above 32 bits (seen from some compat
	 * ABIs) are warned about and masked back down.
	 */
	if (uap->com > 0xffffffff) {
		printf(
		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
		    td->td_proc->p_pid, td->td_proc->p_comm, uap->com);
		uap->com &= 0xffffffff;
	}
	com = uap->com;

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) ||
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
	    ((com & IOC_OUT) && size == 0) ||
#else
	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
#endif
	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
		return (ENOTTY);

	if (size > 0) {
		if (!(com & IOC_VOID))
			data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		else {
			/* Integer argument. */
			arg = (intptr_t)uap->data;
			data = (void *)&arg;
			/* size reset so no kernel buffer is freed below. */
			size = 0;
		}
	} else
		/* IOC_VOID with no size: pass the pointer value itself. */
		data = (void *)&uap->data;
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error) {
			/* size > 0 here implies data was malloc'ed above. */
			if (size > 0)
				free(data, M_IOCTLOPS);
			return (error);
		}
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	}

	error = kern_ioctl(td, uap->fd, com, data);

	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

	if (size > 0)
		free(data, M_IOCTLOPS);
	return (error);
}
579
/*
 * Kernel ioctl dispatcher: handle the generic fd-level commands
 * (close-on-exec and FNONBLOCK/FASYNC flag updates) here, and pass
 * everything else to the file's fo_ioctl method.  data points at a
 * kernel buffer prepared by the caller.
 */
int
kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
{
	struct file *fp;
	struct filedesc *fdp;
	int error;
	int tmp;

	if ((error = fget(td, fd, &fp)) != 0)
		return (error);
	/* ioctl is only permitted on descriptors open for read or write. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	switch (com) {
	case FIONCLEX:
		/* Clear the close-on-exec flag on this descriptor. */
		FILEDESC_XLOCK(fdp);
		fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
		FILEDESC_XUNLOCK(fdp);
		goto out;
	case FIOCLEX:
		/* Set the close-on-exec flag on this descriptor. */
		FILEDESC_XLOCK(fdp);
		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
		FILEDESC_XUNLOCK(fdp);
		goto out;
	case FIONBIO:
		/*
		 * Update FNONBLOCK, then fall through to fo_ioctl with
		 * the snapshotted value so the backend sees it too.
		 */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
		break;
	case FIOASYNC:
		/* Same pattern as FIONBIO, for the FASYNC flag. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
		break;
	}

	error = fo_ioctl(fp, com, data, td->td_ucred, td);
out:
	fdrop(fp, td);
	return (error);
}
631
632 /*
633 * sellock and selwait are initialized in selectinit() via SYSINIT.
634 */
635 struct mtx sellock;
636 struct cv selwait;
637 u_int nselcoll; /* Select collisions since boot */
638 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
639
640 #ifndef _SYS_SYSPROTO_H_
641 struct select_args {
642 int nd;
643 fd_set *in, *ou, *ex;
644 struct timeval *tv;
645 };
646 #endif
647 int
648 select(td, uap)
649 register struct thread *td;
650 register struct select_args *uap;
651 {
652 struct timeval tv, *tvp;
653 int error;
654
655 if (uap->tv != NULL) {
656 error = copyin(uap->tv, &tv, sizeof(tv));
657 if (error)
658 return (error);
659 tvp = &tv;
660 } else
661 tvp = NULL;
662
663 return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
664 }
665
/*
 * Kernel select: scan up to nd descriptors for readiness in the
 * three (each optional) fd_sets, sleeping on selwait until an event,
 * a timeout, or a signal.  On success the sets are rewritten in
 * place with the ready descriptors and td_retval[0] holds the count.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;

	FILEDESC_SLOCK(fdp);
	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_SUNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	/* The output halves (first nbufbytes/2) start out all-clear. */
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/* Convert the relative timeout to an absolute uptime deadline. */
	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* Zero deadline means "wait forever" below. */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count to detect missed wakeups. */
	ncoll = nselcoll;
	thread_lock(td);
	td->td_flags |= TDF_SELECT;
	thread_unlock(td);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to one day's worth of ticks. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	thread_lock(td);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		thread_unlock(td);
		goto retry;
	}
	thread_unlock(td);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	/* Spurious wakeup (error == 0): rescan from the top. */
	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	thread_lock(td);
	td->td_flags &= ~TDF_SELECT;
	thread_unlock(td);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}
822
/*
 * Poll every descriptor whose bit is set in the input sets and record
 * the ready ones in the output sets.  Returns EBADF if a set bit does
 * not name an open file; the ready count goes in td->td_retval[0].
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_SLOCK(fdp);
	/* msk indexes the read/write/except sets in that order. */
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_SUNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_SUNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
864
865 #ifndef _SYS_SYSPROTO_H_
866 struct poll_args {
867 struct pollfd *fds;
868 u_int nfds;
869 int timeout;
870 };
871 #endif
/*
 * poll(2) system call: copy in the pollfd array, scan it with
 * pollscan(), and sleep on selwait until an event, timeout, or
 * signal, using the same sellock/TDF_SELECT retry protocol as
 * kern_select().  The array (with revents filled in) is copied back
 * on success; td_retval[0] holds the ready count.
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	/* Use the on-stack array for small requests, malloc otherwise. */
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	/* Convert the millisecond timeout to an absolute uptime deadline. */
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* Zero deadline means "wait forever" below. */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count to detect missed wakeups. */
	ncoll = nselcoll;
	thread_lock(td);
	td->td_flags |= TDF_SELECT;
	thread_unlock(td);
	mtx_unlock(&sellock);

	error = pollscan(td, bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to one day's worth of ticks. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	thread_lock(td);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		thread_unlock(td);
		goto retry;
	}
	thread_unlock(td);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	/* Spurious wakeup (error == 0): rescan from the top. */
	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	thread_lock(td);
	td->td_flags &= ~TDF_SELECT;
	thread_unlock(td);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	return (error);
}
989
/*
 * Poll each entry of the pollfd array once, filling in revents.
 * Out-of-range or closed descriptors get POLLNVAL (and count as
 * "ready", per poll(2)); negative fds are ignored.  The ready count
 * goes in td->td_retval[0]; the return value is always 0.
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_SLOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fd: skip, report no events. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_SUNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
1029
1030 /*
1031 * OpenBSD poll system call.
1032 *
1033 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
1034 */
1035 #ifndef _SYS_SYSPROTO_H_
1036 struct openbsd_poll_args {
1037 struct pollfd *fds;
1038 u_int nfds;
1039 int timeout;
1040 };
1041 #endif
/*
 * OpenBSD-compatible poll: the argument layouts match, so simply
 * forward to the native poll() implementation.
 */
int
openbsd_poll(struct thread *td, struct openbsd_poll_args *uap)
{
	return (poll(td, (struct poll_args *)uap));
}
1049
1050 /*
1051 * Remove the references to the thread from all of the objects we were
1052 * polling.
1053 *
1054 * This code assumes that the underlying owner of the selinfo structure will
1055 * hold sellock before it changes it, and that it will unlink itself from our
1056 * list if it goes away.
1057 */
1058 void
1059 clear_selinfo_list(td)
1060 struct thread *td;
1061 {
1062 struct selinfo *si;
1063
1064 mtx_assert(&sellock, MA_OWNED);
1065 TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1066 si->si_thread = NULL;
1067 TAILQ_INIT(&td->td_selq);
1068 }
1069
1070 /*
1071 * Record a select request.
1072 */
1073 void
1074 selrecord(selector, sip)
1075 struct thread *selector;
1076 struct selinfo *sip;
1077 {
1078
1079 mtx_lock(&sellock);
1080 /*
1081 * If the selinfo's thread pointer is NULL then take ownership of it.
1082 *
1083 * If the thread pointer is not NULL and it points to another
1084 * thread, then we have a collision.
1085 *
1086 * If the thread pointer is not NULL and points back to us then leave
1087 * it alone as we've already added pointed it at us and added it to
1088 * our list.
1089 */
1090 if (sip->si_thread == NULL) {
1091 sip->si_thread = selector;
1092 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1093 } else if (sip->si_thread != selector) {
1094 sip->si_flags |= SI_COLL;
1095 }
1096
1097 mtx_unlock(&sellock);
1098 }
1099
1100 /* Wake up a selecting thread. */
1101 void
1102 selwakeup(sip)
1103 struct selinfo *sip;
1104 {
1105 doselwakeup(sip, -1);
1106 }
1107
1108 /* Wake up a selecting thread, and set its priority. */
1109 void
1110 selwakeuppri(sip, pri)
1111 struct selinfo *sip;
1112 int pri;
1113 {
1114 doselwakeup(sip, pri);
1115 }
1116
1117 /*
1118 * Do a wakeup when a selectable event occurs.
1119 */
/*
 * Do a wakeup when a selectable event occurs: broadcast to all
 * waiters on a collision, then wake the owning thread (if any),
 * unlinking sip from its select list first.  pri < 0 means no
 * priority adjustment.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	/* Collision: several threads selected on sip; wake them all. */
	if ((sip->si_flags & SI_COLL) != 0) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	/* Unlink from the owner's select list before waking it. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	thread_lock(td);
	td->td_flags &= ~TDF_SELECT;
	thread_unlock(td);
	sleepq_remove(td, &selwait);
	mtx_unlock(&sellock);
}
1146
1147 static void selectinit(void *);
1148 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1149
1150 /* ARGSUSED*/
1151 static void
1152 selectinit(dummy)
1153 void *dummy;
1154 {
1155 cv_init(&selwait, "select");
1156 mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1157 }
/* Cache object: 55b3daab76e5221405819bed80100915 */