1 /*-
2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
35 */
36
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD: releng/7.4/sys/kern/sys_generic.c 210086 2010-07-14 21:52:23Z jhb $");
39
40 #include "opt_compat.h"
41 #include "opt_ktrace.h"
42
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/sysproto.h>
46 #include <sys/filedesc.h>
47 #include <sys/filio.h>
48 #include <sys/fcntl.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/signalvar.h>
52 #include <sys/socketvar.h>
53 #include <sys/uio.h>
54 #include <sys/kernel.h>
55 #include <sys/limits.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/resourcevar.h>
59 #include <sys/selinfo.h>
60 #include <sys/sleepqueue.h>
61 #include <sys/syscallsubr.h>
62 #include <sys/sysctl.h>
63 #include <sys/sysent.h>
64 #include <sys/vnode.h>
65 #include <sys/bio.h>
66 #include <sys/buf.h>
67 #include <sys/condvar.h>
68 #ifdef KTRACE
69 #include <sys/ktrace.h>
70 #endif
71
/* Malloc types for the transient buffers used by ioctl(), select() and iov. */
static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

/* Forward declarations for the file-local helpers defined below. */
static int	pollout(struct pollfd *, struct pollfd *, u_int);
static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	dofileread(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static void	doselwakeup(struct selinfo *, int);
84
85 #ifndef _SYS_SYSPROTO_H_
86 struct read_args {
87 int fd;
88 void *buf;
89 size_t nbyte;
90 };
91 #endif
92 int
93 read(td, uap)
94 struct thread *td;
95 struct read_args *uap;
96 {
97 struct uio auio;
98 struct iovec aiov;
99 int error;
100
101 if (uap->nbyte > INT_MAX)
102 return (EINVAL);
103 aiov.iov_base = uap->buf;
104 aiov.iov_len = uap->nbyte;
105 auio.uio_iov = &aiov;
106 auio.uio_iovcnt = 1;
107 auio.uio_resid = uap->nbyte;
108 auio.uio_segflg = UIO_USERSPACE;
109 error = kern_readv(td, uap->fd, &auio);
110 return(error);
111 }
112
113 /*
114 * Positioned read system call
115 */
116 #ifndef _SYS_SYSPROTO_H_
117 struct pread_args {
118 int fd;
119 void *buf;
120 size_t nbyte;
121 int pad;
122 off_t offset;
123 };
124 #endif
125 int
126 pread(td, uap)
127 struct thread *td;
128 struct pread_args *uap;
129 {
130 struct uio auio;
131 struct iovec aiov;
132 int error;
133
134 if (uap->nbyte > INT_MAX)
135 return (EINVAL);
136 aiov.iov_base = uap->buf;
137 aiov.iov_len = uap->nbyte;
138 auio.uio_iov = &aiov;
139 auio.uio_iovcnt = 1;
140 auio.uio_resid = uap->nbyte;
141 auio.uio_segflg = UIO_USERSPACE;
142 error = kern_preadv(td, uap->fd, &auio, uap->offset);
143 return(error);
144 }
145
146 int
147 freebsd6_pread(td, uap)
148 struct thread *td;
149 struct freebsd6_pread_args *uap;
150 {
151 struct pread_args oargs;
152
153 oargs.fd = uap->fd;
154 oargs.buf = uap->buf;
155 oargs.nbyte = uap->nbyte;
156 oargs.offset = uap->offset;
157 return (pread(td, &oargs));
158 }
159
160 /*
161 * Scatter read system call.
162 */
163 #ifndef _SYS_SYSPROTO_H_
164 struct readv_args {
165 int fd;
166 struct iovec *iovp;
167 u_int iovcnt;
168 };
169 #endif
170 int
171 readv(struct thread *td, struct readv_args *uap)
172 {
173 struct uio *auio;
174 int error;
175
176 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
177 if (error)
178 return (error);
179 error = kern_readv(td, uap->fd, auio);
180 free(auio, M_IOV);
181 return (error);
182 }
183
184 int
185 kern_readv(struct thread *td, int fd, struct uio *auio)
186 {
187 struct file *fp;
188 int error;
189
190 error = fget_read(td, fd, &fp);
191 if (error)
192 return (error);
193 error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
194 fdrop(fp, td);
195 return (error);
196 }
197
198 /*
199 * Scatter positioned read system call.
200 */
201 #ifndef _SYS_SYSPROTO_H_
202 struct preadv_args {
203 int fd;
204 struct iovec *iovp;
205 u_int iovcnt;
206 off_t offset;
207 };
208 #endif
209 int
210 preadv(struct thread *td, struct preadv_args *uap)
211 {
212 struct uio *auio;
213 int error;
214
215 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
216 if (error)
217 return (error);
218 error = kern_preadv(td, uap->fd, auio, uap->offset);
219 free(auio, M_IOV);
220 return (error);
221 }
222
223 int
224 kern_preadv(td, fd, auio, offset)
225 struct thread *td;
226 int fd;
227 struct uio *auio;
228 off_t offset;
229 {
230 struct file *fp;
231 int error;
232
233 error = fget_read(td, fd, &fp);
234 if (error)
235 return (error);
236 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
237 error = ESPIPE;
238 else if (offset < 0 && fp->f_vnode->v_type != VCHR)
239 error = EINVAL;
240 else
241 error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
242 fdrop(fp, td);
243 return (error);
244 }
245
246 /*
247 * Common code for readv and preadv that reads data in
248 * from a file using the passed in uio, offset, and flags.
249 */
250 static int
251 dofileread(td, fd, fp, auio, offset, flags)
252 struct thread *td;
253 int fd;
254 struct file *fp;
255 struct uio *auio;
256 off_t offset;
257 int flags;
258 {
259 ssize_t cnt;
260 int error;
261 #ifdef KTRACE
262 struct uio *ktruio = NULL;
263 #endif
264
265 /* Finish zero length reads right here */
266 if (auio->uio_resid == 0) {
267 td->td_retval[0] = 0;
268 return(0);
269 }
270 auio->uio_rw = UIO_READ;
271 auio->uio_offset = offset;
272 auio->uio_td = td;
273 #ifdef KTRACE
274 if (KTRPOINT(td, KTR_GENIO))
275 ktruio = cloneuio(auio);
276 #endif
277 cnt = auio->uio_resid;
278 if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
279 if (auio->uio_resid != cnt && (error == ERESTART ||
280 error == EINTR || error == EWOULDBLOCK))
281 error = 0;
282 }
283 cnt -= auio->uio_resid;
284 #ifdef KTRACE
285 if (ktruio != NULL) {
286 ktruio->uio_resid = cnt;
287 ktrgenio(fd, UIO_READ, ktruio, error);
288 }
289 #endif
290 td->td_retval[0] = cnt;
291 return (error);
292 }
293
294 #ifndef _SYS_SYSPROTO_H_
295 struct write_args {
296 int fd;
297 const void *buf;
298 size_t nbyte;
299 };
300 #endif
301 int
302 write(td, uap)
303 struct thread *td;
304 struct write_args *uap;
305 {
306 struct uio auio;
307 struct iovec aiov;
308 int error;
309
310 if (uap->nbyte > INT_MAX)
311 return (EINVAL);
312 aiov.iov_base = (void *)(uintptr_t)uap->buf;
313 aiov.iov_len = uap->nbyte;
314 auio.uio_iov = &aiov;
315 auio.uio_iovcnt = 1;
316 auio.uio_resid = uap->nbyte;
317 auio.uio_segflg = UIO_USERSPACE;
318 error = kern_writev(td, uap->fd, &auio);
319 return(error);
320 }
321
322 /*
323 * Positioned write system call.
324 */
325 #ifndef _SYS_SYSPROTO_H_
326 struct pwrite_args {
327 int fd;
328 const void *buf;
329 size_t nbyte;
330 int pad;
331 off_t offset;
332 };
333 #endif
334 int
335 pwrite(td, uap)
336 struct thread *td;
337 struct pwrite_args *uap;
338 {
339 struct uio auio;
340 struct iovec aiov;
341 int error;
342
343 if (uap->nbyte > INT_MAX)
344 return (EINVAL);
345 aiov.iov_base = (void *)(uintptr_t)uap->buf;
346 aiov.iov_len = uap->nbyte;
347 auio.uio_iov = &aiov;
348 auio.uio_iovcnt = 1;
349 auio.uio_resid = uap->nbyte;
350 auio.uio_segflg = UIO_USERSPACE;
351 error = kern_pwritev(td, uap->fd, &auio, uap->offset);
352 return(error);
353 }
354
355 int
356 freebsd6_pwrite(td, uap)
357 struct thread *td;
358 struct freebsd6_pwrite_args *uap;
359 {
360 struct pwrite_args oargs;
361
362 oargs.fd = uap->fd;
363 oargs.buf = uap->buf;
364 oargs.nbyte = uap->nbyte;
365 oargs.offset = uap->offset;
366 return (pwrite(td, &oargs));
367 }
368
369 /*
370 * Gather write system call.
371 */
372 #ifndef _SYS_SYSPROTO_H_
373 struct writev_args {
374 int fd;
375 struct iovec *iovp;
376 u_int iovcnt;
377 };
378 #endif
379 int
380 writev(struct thread *td, struct writev_args *uap)
381 {
382 struct uio *auio;
383 int error;
384
385 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
386 if (error)
387 return (error);
388 error = kern_writev(td, uap->fd, auio);
389 free(auio, M_IOV);
390 return (error);
391 }
392
393 int
394 kern_writev(struct thread *td, int fd, struct uio *auio)
395 {
396 struct file *fp;
397 int error;
398
399 error = fget_write(td, fd, &fp);
400 if (error)
401 return (error);
402 error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
403 fdrop(fp, td);
404 return (error);
405 }
406
407 /*
408 * Gather positioned write system call.
409 */
410 #ifndef _SYS_SYSPROTO_H_
411 struct pwritev_args {
412 int fd;
413 struct iovec *iovp;
414 u_int iovcnt;
415 off_t offset;
416 };
417 #endif
418 int
419 pwritev(struct thread *td, struct pwritev_args *uap)
420 {
421 struct uio *auio;
422 int error;
423
424 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
425 if (error)
426 return (error);
427 error = kern_pwritev(td, uap->fd, auio, uap->offset);
428 free(auio, M_IOV);
429 return (error);
430 }
431
432 int
433 kern_pwritev(td, fd, auio, offset)
434 struct thread *td;
435 struct uio *auio;
436 int fd;
437 off_t offset;
438 {
439 struct file *fp;
440 int error;
441
442 error = fget_write(td, fd, &fp);
443 if (error)
444 return (error);
445 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
446 error = ESPIPE;
447 else if (offset < 0 && fp->f_vnode->v_type != VCHR)
448 error = EINVAL;
449 else
450 error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
451 fdrop(fp, td);
452 return (error);
453 }
454
455 /*
456 * Common code for writev and pwritev that writes data to
457 * a file using the passed in uio, offset, and flags.
458 */
459 static int
460 dofilewrite(td, fd, fp, auio, offset, flags)
461 struct thread *td;
462 int fd;
463 struct file *fp;
464 struct uio *auio;
465 off_t offset;
466 int flags;
467 {
468 ssize_t cnt;
469 int error;
470 #ifdef KTRACE
471 struct uio *ktruio = NULL;
472 #endif
473
474 auio->uio_rw = UIO_WRITE;
475 auio->uio_td = td;
476 auio->uio_offset = offset;
477 #ifdef KTRACE
478 if (KTRPOINT(td, KTR_GENIO))
479 ktruio = cloneuio(auio);
480 #endif
481 cnt = auio->uio_resid;
482 if (fp->f_type == DTYPE_VNODE)
483 bwillwrite();
484 if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
485 if (auio->uio_resid != cnt && (error == ERESTART ||
486 error == EINTR || error == EWOULDBLOCK))
487 error = 0;
488 /* Socket layer is responsible for issuing SIGPIPE. */
489 if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
490 PROC_LOCK(td->td_proc);
491 tdsignal(td->td_proc, td, SIGPIPE, NULL);
492 PROC_UNLOCK(td->td_proc);
493 }
494 }
495 cnt -= auio->uio_resid;
496 #ifdef KTRACE
497 if (ktruio != NULL) {
498 ktruio->uio_resid = cnt;
499 ktrgenio(fd, UIO_WRITE, ktruio, error);
500 }
501 #endif
502 td->td_retval[0] = cnt;
503 return (error);
504 }
505
#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/*
 * ioctl(2): decode the command word, stage the in/out parameter buffer,
 * and hand the request to kern_ioctl().
 */
/* ARGSUSED */
int
ioctl(struct thread *td, struct ioctl_args *uap)
{
	u_long com;
	int arg, error;
	u_int size;
	caddr_t data;

	/*
	 * Some 32-bit ABIs sign-extend the command word; warn once and
	 * truncate it to the low 32 bits.
	 */
	if (uap->com > 0xffffffff) {
		printf(
		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
		    td->td_proc->p_pid, td->td_proc->p_comm, uap->com);
		uap->com &= 0xffffffff;
	}
	com = uap->com;

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) ||
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
	    ((com & IOC_OUT) && size == 0) ||
#else
	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
#endif
	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
		return (ENOTTY);

	/*
	 * After this block, "size > 0" means data points at a heap buffer
	 * that must be freed on every exit path; otherwise data aliases a
	 * stack location (&arg or &uap->data).
	 */
	if (size > 0) {
		if (!(com & IOC_VOID))
			data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		else {
			/* Integer argument. */
			arg = (intptr_t)uap->data;
			data = (void *)&arg;
			size = 0;
		}
	} else
		data = (void *)&uap->data;
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error) {
			if (size > 0)
				free(data, M_IOCTLOPS);
			return (error);
		}
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	}

	error = kern_ioctl(td, uap->fd, com, data);

	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

	if (size > 0)
		free(data, M_IOCTLOPS);
	return (error);
}
580
/*
 * Common ioctl back end: handle the descriptor-level commands
 * (FIONCLEX/FIOCLEX/FIONBIO/FIOASYNC) directly and pass everything
 * else to the file's fo_ioctl method.
 */
int
kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
{
	struct file *fp;
	struct filedesc *fdp;
	int error;
	int tmp;

	if ((error = fget(td, fd, &fp)) != 0)
		return (error);
	/* ioctl is only meaningful on descriptors open for read or write. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	switch (com) {
	case FIONCLEX:
		FILEDESC_XLOCK(fdp);
		fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
		FILEDESC_XUNLOCK(fdp);
		goto out;
	case FIOCLEX:
		FILEDESC_XLOCK(fdp);
		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
		FILEDESC_XUNLOCK(fdp);
		goto out;
	case FIONBIO:
		/* Update f_flag here, then let fo_ioctl see the new value. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
		break;
	case FIOASYNC:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
		break;
	}

	error = fo_ioctl(fp, com, data, td->td_ucred, td);
out:
	fdrop(fp, td);
	return (error);
}
632
633 /*
634 * sellock and selwait are initialized in selectinit() via SYSINIT.
635 */
636 struct mtx sellock;
637 struct cv selwait;
638 u_int nselcoll; /* Select collisions since boot */
639 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
640
641 int
642 poll_no_poll(int events)
643 {
644 /*
645 * Return true for read/write. If the user asked for something
646 * special, return POLLNVAL, so that clients have a way of
647 * determining reliably whether or not the extended
648 * functionality is present without hard-coding knowledge
649 * of specific filesystem implementations.
650 */
651 if (events & ~POLLSTANDARD)
652 return (POLLNVAL);
653
654 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
655 }
656
657 #ifndef _SYS_SYSPROTO_H_
658 struct select_args {
659 int nd;
660 fd_set *in, *ou, *ex;
661 struct timeval *tv;
662 };
663 #endif
664 int
665 select(td, uap)
666 register struct thread *td;
667 register struct select_args *uap;
668 {
669 struct timeval tv, *tvp;
670 int error;
671
672 if (uap->tv != NULL) {
673 error = copyin(uap->tv, &tv, sizeof(tv));
674 if (error)
675 return (error);
676 tvp = &tv;
677 } else
678 tvp = NULL;
679
680 return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
681 }
682
/*
 * Common select() back end.  Copies in the descriptor bit masks, scans
 * them under the filedesc lock, and sleeps on selwait (rescanning on
 * collisions) until an event, timeout, or signal occurs.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;

	/* Clamp nd to the current size of the descriptor table. */
	FILEDESC_SLOCK(fdp);
	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_SUNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do { \
		if (name == NULL) \
			ibits[x] = NULL; \
		else { \
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \
			obits[x] = sbp; \
			sbp += ncpbytes / sizeof *sbp; \
			error = copyin(name, ibits[x], ncpbytes); \
			if (error != 0) \
				goto done_nosellock; \
		} \
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		/* Convert the relative timeout into an absolute uptime. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision counter before dropping sellock. */
	ncoll = nselcoll;
	thread_lock(td);
	td->td_flags |= TDF_SELECT;
	thread_unlock(td);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Cap a single sleep at 24 hours to keep tvtohz() sane. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	thread_lock(td);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		thread_unlock(td);
		goto retry;
	}
	thread_unlock(td);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	thread_lock(td);
	td->td_flags &= ~TDF_SELECT;
	thread_unlock(td);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}
839
/*
 * Scan the descriptors named by the select input bit masks, polling
 * each referenced file and setting the corresponding output bit when
 * it is ready.  The ready count is returned via td_retval[0].
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_SLOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				/* A bit set on a closed descriptor is EBADF. */
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_SUNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_SUNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
881
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * poll(2): copy in the pollfd array, scan it (sleeping on selwait and
 * rescanning on collisions, as in kern_select()), and copy the revents
 * results back out.
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	/* Use the on-stack array when the request fits. */
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	if (uap->timeout != INFTIM) {
		/* Convert the millisecond timeout into an absolute uptime. */
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision counter before dropping sellock. */
	ncoll = nselcoll;
	thread_lock(td);
	td->td_flags |= TDF_SELECT;
	thread_unlock(td);
	mtx_unlock(&sellock);

	error = pollscan(td, bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Cap a single sleep at 24 hours to keep tvtohz() sane. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	thread_lock(td);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		thread_unlock(td);
		goto retry;
	}
	thread_unlock(td);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	thread_lock(td);
	td->td_flags &= ~TDF_SELECT;
	thread_unlock(td);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = pollout(bits, uap->fds, nfds);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	return (error);
}
1006
1007 static int
1008 pollout(fds, ufds, nfd)
1009 struct pollfd *fds;
1010 struct pollfd *ufds;
1011 u_int nfd;
1012 {
1013 int error = 0;
1014 u_int i = 0;
1015
1016 for (i = 0; i < nfd; i++) {
1017 error = copyout(&fds->revents, &ufds->revents,
1018 sizeof(ufds->revents));
1019 if (error)
1020 return (error);
1021 fds++;
1022 ufds++;
1023 }
1024 return (0);
1025 }
1026
/*
 * Poll each entry of the fds array, recording the returned events in
 * revents.  The count of entries with pending events is returned via
 * td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_SLOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			/* Out-of-range descriptor: flag it and count it. */
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative descriptors are ignored per POSIX. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_SUNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
1066
1067 /*
1068 * OpenBSD poll system call.
1069 *
1070 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
1071 */
1072 #ifndef _SYS_SYSPROTO_H_
1073 struct openbsd_poll_args {
1074 struct pollfd *fds;
1075 u_int nfds;
1076 int timeout;
1077 };
1078 #endif
1079 int
1080 openbsd_poll(td, uap)
1081 register struct thread *td;
1082 register struct openbsd_poll_args *uap;
1083 {
1084 return (poll(td, (struct poll_args *)uap));
1085 }
1086
1087 /*
1088 * Remove the references to the thread from all of the objects we were
1089 * polling.
1090 *
1091 * This code assumes that the underlying owner of the selinfo structure will
1092 * hold sellock before it changes it, and that it will unlink itself from our
1093 * list if it goes away.
1094 */
1095 void
1096 clear_selinfo_list(td)
1097 struct thread *td;
1098 {
1099 struct selinfo *si;
1100
1101 mtx_assert(&sellock, MA_OWNED);
1102 TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1103 si->si_thread = NULL;
1104 TAILQ_INIT(&td->td_selq);
1105 }
1106
1107 /*
1108 * Record a select request.
1109 */
1110 void
1111 selrecord(selector, sip)
1112 struct thread *selector;
1113 struct selinfo *sip;
1114 {
1115
1116 mtx_lock(&sellock);
1117 /*
1118 * If the selinfo's thread pointer is NULL then take ownership of it.
1119 *
1120 * If the thread pointer is not NULL and it points to another
1121 * thread, then we have a collision.
1122 *
1123 * If the thread pointer is not NULL and points back to us then leave
1124 * it alone as we've already added pointed it at us and added it to
1125 * our list.
1126 */
1127 if (sip->si_thread == NULL) {
1128 sip->si_thread = selector;
1129 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1130 } else if (sip->si_thread != selector) {
1131 sip->si_flags |= SI_COLL;
1132 }
1133
1134 mtx_unlock(&sellock);
1135 }
1136
/* Wake up a selecting thread without adjusting its priority. */
void
selwakeup(struct selinfo *sip)
{

	doselwakeup(sip, -1);
}
1144
/* Wake up a selecting thread, raising it to the given priority. */
void
selwakeuppri(struct selinfo *sip, int pri)
{

	doselwakeup(sip, pri);
}
1153
1154 /*
1155 * Do a wakeup when a selectable event occurs.
1156 */
1157 static void
1158 doselwakeup(sip, pri)
1159 struct selinfo *sip;
1160 int pri;
1161 {
1162 struct thread *td;
1163
1164 mtx_lock(&sellock);
1165 td = sip->si_thread;
1166 if ((sip->si_flags & SI_COLL) != 0) {
1167 nselcoll++;
1168 sip->si_flags &= ~SI_COLL;
1169 cv_broadcastpri(&selwait, pri);
1170 }
1171 if (td == NULL) {
1172 mtx_unlock(&sellock);
1173 return;
1174 }
1175 TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
1176 sip->si_thread = NULL;
1177 thread_lock(td);
1178 td->td_flags &= ~TDF_SELECT;
1179 thread_unlock(td);
1180 sleepq_remove(td, &selwait);
1181 mtx_unlock(&sellock);
1182 }
1183
1184 static void selectinit(void *);
1185 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1186
1187 /* ARGSUSED*/
1188 static void
1189 selectinit(dummy)
1190 void *dummy;
1191 {
1192 cv_init(&selwait, "select");
1193 mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1194 }
/* Cache object: 1934a32ca9904823488b3cb57561acf9 */