1 /* $OpenBSD: sys_generic.c,v 1.151 2022/12/27 20:13:03 patrick Exp $ */
2 /* $NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $ */
3
4 /*
5 * Copyright (c) 1996 Theo de Raadt
6 * Copyright (c) 1982, 1986, 1989, 1993
7 * The Regents of the University of California. All rights reserved.
8 * (c) UNIX System Laboratories, Inc.
9 * All or some portions of this file are derived from material licensed
10 * to the University of California by American Telephone and Telegraph
11 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12 * the permission of UNIX System Laboratories, Inc.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 * 3. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
39 */
40
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/filedesc.h>
44 #include <sys/ioctl.h>
45 #include <sys/fcntl.h>
46 #include <sys/vnode.h>
47 #include <sys/file.h>
48 #include <sys/proc.h>
49 #include <sys/resourcevar.h>
50 #include <sys/socketvar.h>
51 #include <sys/signalvar.h>
52 #include <sys/uio.h>
53 #include <sys/time.h>
54 #include <sys/malloc.h>
55 #include <sys/poll.h>
56 #include <sys/eventvar.h>
57 #ifdef KTRACE
58 #include <sys/ktrace.h>
59 #endif
60 #include <sys/pledge.h>
61
62 #include <sys/mount.h>
63 #include <sys/syscallargs.h>
64
65 /*
66 * Debug values:
67 * 1 - print implementation errors, things that should not happen.
68 * 2 - print ppoll(2) information, somewhat verbose
69 * 3 - print pselect(2) and ppoll(2) information, very verbose
70 */
71 int kqpoll_debug = 0;
72 #define DPRINTFN(v, x...) if (kqpoll_debug > v) { \
73 printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid); \
74 printf(x); \
75 }
76
77 int pselregister(struct proc *, fd_set **, fd_set **, int, int *, int *);
78 int pselcollect(struct proc *, struct kevent *, fd_set **, int *);
79 void ppollregister(struct proc *, struct pollfd *, int, int *, int *);
80 int ppollcollect(struct proc *, struct kevent *, struct pollfd *, u_int);
81
82 int pollout(struct pollfd *, struct pollfd *, u_int);
83 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
84 struct timespec *, const sigset_t *, register_t *);
85 int doppoll(struct proc *, struct pollfd *, u_int, struct timespec *,
86 const sigset_t *, register_t *);
87
/*
 * Copy an iovec array in from userland and validate it.
 *
 * Arrays of up to UIO_SMALLIOV entries use the caller-provided `aiov'
 * stack buffer; larger arrays (up to IOV_MAX) are heap allocated.  The
 * chosen storage is published through `iovp' before any fallible step so
 * the caller can always release it with iovec_free(), even on failure.
 * The total transfer size in bytes is returned through `residp' if it is
 * non-NULL.
 *
 * Returns 0 on success, EINVAL for a bad count or an over-long transfer,
 * or a copyin(9) error.
 */
int
iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec *aiov,
    unsigned int iovcnt, size_t *residp)
{
#ifdef KTRACE
	struct proc *p = curproc;
#endif
	struct iovec *iov;
	int error, i;
	size_t resid = 0;

	if (iovcnt > UIO_SMALLIOV) {
		if (iovcnt > IOV_MAX)
			return (EINVAL);
		iov = mallocarray(iovcnt, sizeof(*iov), M_IOV, M_WAITOK);
	} else if (iovcnt > 0) {
		iov = aiov;
	} else {
		return (EINVAL);
	}
	/* Publish the storage now so the caller can free it on error. */
	*iovp = iov;

	if ((error = copyin(uiov, iov, iovcnt * sizeof(*iov))))
		return (error);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktriovec(p, iov, iovcnt);
#endif

	for (i = 0; i < iovcnt; i++) {
		resid += iov->iov_len;
		/*
		 * Writes return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.  Note that the addition is
		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
		 */
		if (iov->iov_len > SSIZE_MAX || resid > SSIZE_MAX)
			return (EINVAL);
		iov++;
	}

	if (residp != NULL)
		*residp = resid;

	return (0);
}
136
137 void
138 iovec_free(struct iovec *iov, unsigned int iovcnt)
139 {
140 if (iovcnt > UIO_SMALLIOV)
141 free(iov, M_IOV, iovcnt * sizeof(*iov));
142 }
143
144 /*
145 * Read system call.
146 */
147 int
148 sys_read(struct proc *p, void *v, register_t *retval)
149 {
150 struct sys_read_args /* {
151 syscallarg(int) fd;
152 syscallarg(void *) buf;
153 syscallarg(size_t) nbyte;
154 } */ *uap = v;
155 struct iovec iov;
156 struct uio auio;
157
158 iov.iov_base = SCARG(uap, buf);
159 iov.iov_len = SCARG(uap, nbyte);
160 if (iov.iov_len > SSIZE_MAX)
161 return (EINVAL);
162
163 auio.uio_iov = &iov;
164 auio.uio_iovcnt = 1;
165 auio.uio_resid = iov.iov_len;
166
167 return (dofilereadv(p, SCARG(uap, fd), &auio, 0, retval));
168 }
169
170 /*
171 * Scatter read system call.
172 */
173 int
174 sys_readv(struct proc *p, void *v, register_t *retval)
175 {
176 struct sys_readv_args /* {
177 syscallarg(int) fd;
178 syscallarg(const struct iovec *) iovp;
179 syscallarg(int) iovcnt;
180 } */ *uap = v;
181 struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
182 int error, iovcnt = SCARG(uap, iovcnt);
183 struct uio auio;
184 size_t resid;
185
186 error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
187 if (error)
188 goto done;
189
190 auio.uio_iov = iov;
191 auio.uio_iovcnt = iovcnt;
192 auio.uio_resid = resid;
193
194 error = dofilereadv(p, SCARG(uap, fd), &auio, 0, retval);
195 done:
196 iovec_free(iov, iovcnt);
197 return (error);
198 }
199
/*
 * Common read path: perform a (possibly positioned, FO_POSITION in
 * `flags') read on descriptor `fd' as described by `uio'.  On success
 * the number of bytes transferred is stored in `retval'.
 * Returns 0 or an errno value.
 */
int
dofilereadv(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
		return (EBADF);

	/* Checks for positioned read. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		/* Explicit offsets make no sense on FIFOs and ttys. */
		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_READ;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec: fo_read consumes the array,
	 * so snapshot it before the transfer.
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_read)(fp, uio, flags);
	if (error) {
		/* An interrupted partial read is reported as a success. */
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	/* Bytes actually transferred. */
	cnt -= uio->uio_resid;

	/* Update per-file transfer statistics. */
	mtx_enter(&fp->f_mtx);
	fp->f_rxfer++;
	fp->f_rbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
done:
	FRELE(fp, p);
	return (error);
}
271
272 /*
273 * Write system call
274 */
275 int
276 sys_write(struct proc *p, void *v, register_t *retval)
277 {
278 struct sys_write_args /* {
279 syscallarg(int) fd;
280 syscallarg(const void *) buf;
281 syscallarg(size_t) nbyte;
282 } */ *uap = v;
283 struct iovec iov;
284 struct uio auio;
285
286 iov.iov_base = (void *)SCARG(uap, buf);
287 iov.iov_len = SCARG(uap, nbyte);
288 if (iov.iov_len > SSIZE_MAX)
289 return (EINVAL);
290
291 auio.uio_iov = &iov;
292 auio.uio_iovcnt = 1;
293 auio.uio_resid = iov.iov_len;
294
295 return (dofilewritev(p, SCARG(uap, fd), &auio, 0, retval));
296 }
297
298 /*
299 * Gather write system call
300 */
301 int
302 sys_writev(struct proc *p, void *v, register_t *retval)
303 {
304 struct sys_writev_args /* {
305 syscallarg(int) fd;
306 syscallarg(const struct iovec *) iovp;
307 syscallarg(int) iovcnt;
308 } */ *uap = v;
309 struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
310 int error, iovcnt = SCARG(uap, iovcnt);
311 struct uio auio;
312 size_t resid;
313
314 error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
315 if (error)
316 goto done;
317
318 auio.uio_iov = iov;
319 auio.uio_iovcnt = iovcnt;
320 auio.uio_resid = resid;
321
322 error = dofilewritev(p, SCARG(uap, fd), &auio, 0, retval);
323 done:
324 iovec_free(iov, iovcnt);
325 return (error);
326 }
327
/*
 * Common write path: perform a (possibly positioned, FO_POSITION in
 * `flags') write on descriptor `fd' as described by `uio'.  On success
 * the number of bytes transferred is stored in `retval'.  Delivers
 * SIGPIPE to the writing thread on EPIPE, per POSIX.
 * Returns 0 or an errno value.
 */
int
dofilewritev(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
		return (EBADF);

	/* Checks for positioned write. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		/* Explicit offsets make no sense on FIFOs and ttys. */
		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_WRITE;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec: fo_write consumes the array,
	 * so snapshot it before the transfer.
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_write)(fp, uio, flags);
	if (error) {
		/* An interrupted partial write is reported as a success. */
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			KERNEL_LOCK();
			ptsignal(p, SIGPIPE, STHREAD);
			KERNEL_UNLOCK();
		}
	}
	/* Bytes actually transferred. */
	cnt -= uio->uio_resid;

	/* Update per-file transfer statistics. */
	mtx_enter(&fp->f_mtx);
	fp->f_wxfer++;
	fp->f_wbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
done:
	FRELE(fp, p);
	return (error);
}
404
405 /*
406 * Ioctl system call
407 */
408 int
409 sys_ioctl(struct proc *p, void *v, register_t *retval)
410 {
411 struct sys_ioctl_args /* {
412 syscallarg(int) fd;
413 syscallarg(u_long) com;
414 syscallarg(void *) data;
415 } */ *uap = v;
416 struct file *fp;
417 struct filedesc *fdp = p->p_fd;
418 u_long com = SCARG(uap, com);
419 int error = 0;
420 u_int size = 0;
421 caddr_t data, memp = NULL;
422 int tmp;
423 #define STK_PARAMS 128
424 long long stkbuf[STK_PARAMS / sizeof(long long)];
425
426 if ((fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE)) == NULL)
427 return (EBADF);
428
429 if (fp->f_type == DTYPE_SOCKET) {
430 struct socket *so = fp->f_data;
431
432 if (so->so_state & SS_DNS) {
433 error = EINVAL;
434 goto out;
435 }
436 }
437
438 error = pledge_ioctl(p, com, fp);
439 if (error)
440 goto out;
441
442 switch (com) {
443 case FIONCLEX:
444 case FIOCLEX:
445 fdplock(fdp);
446 if (com == FIONCLEX)
447 fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
448 else
449 fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
450 fdpunlock(fdp);
451 goto out;
452 }
453
454 /*
455 * Interpret high order word to find amount of data to be
456 * copied to/from the user's address space.
457 */
458 size = IOCPARM_LEN(com);
459 if (size > IOCPARM_MAX) {
460 error = ENOTTY;
461 goto out;
462 }
463 if (size > sizeof (stkbuf)) {
464 memp = malloc(size, M_IOCTLOPS, M_WAITOK);
465 data = memp;
466 } else
467 data = (caddr_t)stkbuf;
468 if (com&IOC_IN) {
469 if (size) {
470 error = copyin(SCARG(uap, data), data, size);
471 if (error) {
472 goto out;
473 }
474 } else
475 *(caddr_t *)data = SCARG(uap, data);
476 } else if ((com&IOC_OUT) && size)
477 /*
478 * Zero the buffer so the user always
479 * gets back something deterministic.
480 */
481 memset(data, 0, size);
482 else if (com&IOC_VOID)
483 *(caddr_t *)data = SCARG(uap, data);
484
485 switch (com) {
486
487 case FIONBIO:
488 if ((tmp = *(int *)data) != 0)
489 atomic_setbits_int(&fp->f_flag, FNONBLOCK);
490 else
491 atomic_clearbits_int(&fp->f_flag, FNONBLOCK);
492 error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
493 break;
494
495 case FIOASYNC:
496 if ((tmp = *(int *)data) != 0)
497 atomic_setbits_int(&fp->f_flag, FASYNC);
498 else
499 atomic_clearbits_int(&fp->f_flag, FASYNC);
500 error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
501 break;
502
503 default:
504 error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
505 break;
506 }
507 /*
508 * Copy any data to user, size was
509 * already set and checked above.
510 */
511 if (error == 0 && (com&IOC_OUT) && size)
512 error = copyout(data, SCARG(uap, data), size);
513 out:
514 FRELE(fp, p);
515 free(memp, M_IOCTLOPS, size);
516 return (error);
517 }
518
519 /*
520 * Select system call.
521 */
522 int
523 sys_select(struct proc *p, void *v, register_t *retval)
524 {
525 struct sys_select_args /* {
526 syscallarg(int) nd;
527 syscallarg(fd_set *) in;
528 syscallarg(fd_set *) ou;
529 syscallarg(fd_set *) ex;
530 syscallarg(struct timeval *) tv;
531 } */ *uap = v;
532
533 struct timespec ts, *tsp = NULL;
534 int error;
535
536 if (SCARG(uap, tv) != NULL) {
537 struct timeval tv;
538 if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
539 return (error);
540 #ifdef KTRACE
541 if (KTRPOINT(p, KTR_STRUCT))
542 ktrreltimeval(p, &tv);
543 #endif
544 if (tv.tv_sec < 0 || !timerisvalid(&tv))
545 return (EINVAL);
546 TIMEVAL_TO_TIMESPEC(&tv, &ts);
547 tsp = &ts;
548 }
549
550 return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
551 SCARG(uap, ex), tsp, NULL, retval));
552 }
553
/*
 * pselect(2): like select(2) but takes a timespec timeout and an
 * optional signal mask that is installed for the duration of the wait.
 */
int
sys_pselect(struct proc *p, void *v, register_t *retval)
{
	struct sys_pselect_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;
	int error;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		/* Negative or malformed timeouts are rejected. */
		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
			return (EINVAL);
		tsp = &ts;
	}
	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, ssp, retval));
}
590
/*
 * Common select(2)/pselect(2) implementation on top of the per-thread
 * kqueue: copy in the three fd_sets, register them as kevents, scan for
 * ready events, and copy the result sets back out.
 *
 * `timeout' of NULL means wait forever; `sigmask' (if non-NULL) is
 * installed for the duration of the wait via dosigsuspend().
 */
int
dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	struct kqueue_scan_state scan;
	struct timespec zerots = {};
	fd_mask bits[6];
	fd_set *pibits[3], *pobits[3];
	int error, ncollected = 0, nevents = 0;
	u_int ni;

	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	/* Bytes needed per fd_set for `nd' descriptors. */
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni > sizeof(bits[0])) {
		caddr_t mbits;

		/*
		 * One zeroed allocation holds all six sets: three input
		 * (in/ou/ex) followed by three output sets.
		 */
		mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
		pibits[0] = (fd_set *)&mbits[ni * 0];
		pibits[1] = (fd_set *)&mbits[ni * 1];
		pibits[2] = (fd_set *)&mbits[ni * 2];
		pobits[0] = (fd_set *)&mbits[ni * 3];
		pobits[1] = (fd_set *)&mbits[ni * 4];
		pobits[2] = (fd_set *)&mbits[ni * 5];
	} else {
		/* Small sets fit in the on-stack array. */
		memset(bits, 0, sizeof(bits));
		pibits[0] = (fd_set *)&bits[0];
		pibits[1] = (fd_set *)&bits[1];
		pibits[2] = (fd_set *)&bits[2];
		pobits[0] = (fd_set *)&bits[3];
		pobits[1] = (fd_set *)&bits[4];
		pobits[2] = (fd_set *)&bits[5];
	}

	kqpoll_init(nd);

#define	getbits(name, x) \
	if (name && (error = copyin(name, pibits[x], ni))) \
		goto done;
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
#ifdef KTRACE
	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
		if (in) ktrfdset(p, pibits[0], ni);
		if (ou) ktrfdset(p, pibits[1], ni);
		if (ex) ktrfdset(p, pibits[2], ni);
	}
#endif

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

	/* Register kqueue events */
	error = pselregister(p, pibits, pobits, nd, &nevents, &ncollected);
	if (error != 0)
		goto done;

	/*
	 * The poll/select family of syscalls has been designed to
	 * block when file descriptors are not available, even if
	 * there's nothing to wait for.
	 */
	if (nevents == 0 && ncollected == 0) {
		uint64_t nsecs = INFSLP;

		if (timeout != NULL) {
			if (!timespecisset(timeout))
				goto done;
			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
		}
		error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqsel", nsecs);
		/* select is not restarted after signals... */
		if (error == ERESTART)
			error = EINTR;
		if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* Do not block if registering found pending events. */
	if (ncollected > 0)
		timeout = &zerots;

	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
	kqueue_scan_setup(&scan, p->p_kq);
	while (nevents > 0) {
		struct kevent kev[KQ_NEVENTS];
		int i, ready, count;

		/* Maximum number of events per iteration */
		count = MIN(nitems(kev), nevents);
		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);

		/* Convert back events that are ready. */
		for (i = 0; i < ready && error == 0; i++)
			error = pselcollect(p, &kev[i], pobits, &ncollected);
		/*
		 * Stop if there was an error or if we had enough
		 * space to collect all events that were ready.
		 */
		if (error || ready < count)
			break;

		nevents -= ready;
	}
	kqueue_scan_finish(&scan);
	*retval = ncollected;
done:
#define	putbits(name, x) \
	if (name && (error2 = copyout(pobits[x], name, ni))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
#ifdef KTRACE
		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
			if (in) ktrfdset(p, pobits[0], ni);
			if (ou) ktrfdset(p, pobits[1], ni);
			if (ex) ktrfdset(p, pobits[2], ni);
		}
#endif
	}

	/* Release the combined allocation if the sets did not fit on stack. */
	if (pibits[0] != (fd_set *)&bits[0])
		free(pibits[0], M_TEMP, 6 * ni);

	kqpoll_done(nd);

	return (error);
}
731
732 /*
733 * Convert fd_set into kqueue events and register them on the
734 * per-thread queue.
735 */
736 int
737 pselregister(struct proc *p, fd_set *pibits[3], fd_set *pobits[3], int nfd,
738 int *nregistered, int *ncollected)
739 {
740 static const int evf[] = { EVFILT_READ, EVFILT_WRITE, EVFILT_EXCEPT };
741 static const int evff[] = { 0, 0, NOTE_OOB };
742 int msk, i, j, fd, nevents = 0, error = 0;
743 struct kevent kev;
744 fd_mask bits;
745
746 for (msk = 0; msk < 3; msk++) {
747 for (i = 0; i < nfd; i += NFDBITS) {
748 bits = pibits[msk]->fds_bits[i / NFDBITS];
749 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
750 bits &= ~(1 << j);
751
752 DPRINTFN(2, "select fd %d mask %d serial %lu\n",
753 fd, msk, p->p_kq_serial);
754 EV_SET(&kev, fd, evf[msk],
755 EV_ADD|EV_ENABLE|__EV_SELECT,
756 evff[msk], 0, (void *)(p->p_kq_serial));
757 error = kqueue_register(p->p_kq, &kev, 0, p);
758 switch (error) {
759 case 0:
760 nevents++;
761 /* FALLTHROUGH */
762 case EOPNOTSUPP:/* No underlying kqfilter */
763 case EINVAL: /* Unimplemented filter */
764 case EPERM: /* Specific to FIFO and
765 * __EV_SELECT */
766 error = 0;
767 break;
768 case EPIPE: /* Specific to pipes */
769 KASSERT(kev.filter == EVFILT_WRITE);
770 FD_SET(kev.ident, pobits[1]);
771 (*ncollected)++;
772 error = 0;
773 break;
774 case ENXIO: /* Device has been detached */
775 default:
776 goto bad;
777 }
778 }
779 }
780 }
781
782 *nregistered = nevents;
783 return (0);
784 bad:
785 DPRINTFN(0, "select fd %u filt %d error %d\n", (int)kev.ident,
786 kev.filter, error);
787 return (error);
788 }
789
790 /*
791 * Convert given kqueue event into corresponding select(2) bit.
792 */
793 int
794 pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3],
795 int *ncollected)
796 {
797 if ((unsigned long)kevp->udata != p->p_kq_serial) {
798 panic("%s: spurious kevp %p fd %d udata 0x%lx serial 0x%lx",
799 __func__, kevp, (int)kevp->ident,
800 (unsigned long)kevp->udata, p->p_kq_serial);
801 }
802
803 if (kevp->flags & EV_ERROR) {
804 DPRINTFN(2, "select fd %d filt %d error %d\n",
805 (int)kevp->ident, kevp->filter, (int)kevp->data);
806 return (kevp->data);
807 }
808
809 switch (kevp->filter) {
810 case EVFILT_READ:
811 FD_SET(kevp->ident, pobits[0]);
812 break;
813 case EVFILT_WRITE:
814 FD_SET(kevp->ident, pobits[1]);
815 break;
816 case EVFILT_EXCEPT:
817 FD_SET(kevp->ident, pobits[2]);
818 break;
819 default:
820 KASSERT(0);
821 }
822 (*ncollected)++;
823
824 DPRINTFN(2, "select fd %d filt %d\n", (int)kevp->ident, kevp->filter);
825 return (0);
826 }
827
828 /*
829 * Do a wakeup when a selectable event occurs.
830 */
831 void
832 selwakeup(struct selinfo *sip)
833 {
834 KERNEL_LOCK();
835 KNOTE(&sip->si_note, NOTE_SUBMIT);
836 KERNEL_UNLOCK();
837 }
838
839 /*
840 * Only copyout the revents field.
841 */
842 int
843 pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
844 {
845 int error = 0;
846 u_int i = 0;
847
848 while (!error && i++ < nfds) {
849 error = copyout(&pl->revents, &upl->revents,
850 sizeof(upl->revents));
851 pl++;
852 upl++;
853 }
854
855 return (error);
856 }
857
858 /*
859 * We are using the same mechanism as select only we encode/decode args
860 * differently.
861 */
862 int
863 sys_poll(struct proc *p, void *v, register_t *retval)
864 {
865 struct sys_poll_args /* {
866 syscallarg(struct pollfd *) fds;
867 syscallarg(u_int) nfds;
868 syscallarg(int) timeout;
869 } */ *uap = v;
870
871 struct timespec ts, *tsp = NULL;
872 int msec = SCARG(uap, timeout);
873
874 if (msec != INFTIM) {
875 if (msec < 0)
876 return (EINVAL);
877 ts.tv_sec = msec / 1000;
878 ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
879 tsp = &ts;
880 }
881
882 return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
883 retval));
884 }
885
/*
 * ppoll(2): like poll(2) but takes a timespec timeout and an optional
 * signal mask that is installed for the duration of the wait.
 */
int
sys_ppoll(struct proc *p, void *v, register_t *retval)
{
	struct sys_ppoll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;

	int error;
	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		/* Negative or malformed timeouts are rejected. */
		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
			return (EINVAL);
		tsp = &ts;
	}

	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
	    retval));
}
921
/*
 * Common poll(2)/ppoll(2) implementation on top of the per-thread
 * kqueue: copy in the pollfd array, register it as kevents, scan for
 * ready events, and copy the revents fields back out.
 *
 * `timeout' of NULL means wait forever; `sigmask' (if non-NULL) is
 * installed for the duration of the wait via dosigsuspend().
 */
int
doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	struct kqueue_scan_state scan;
	struct timespec zerots = {};
	struct pollfd pfds[4], *pl = pfds;
	int error, ncollected = 0, nevents = 0;
	size_t sz;

	/* Standards say no more than MAX_OPEN; this is possibly better. */
	if (nfds > min((int)lim_cur(RLIMIT_NOFILE), maxfiles))
		return (EINVAL);

	/* optimize for the default case, of a small nfds value */
	if (nfds > nitems(pfds)) {
		pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
		    M_WAITOK | M_CANFAIL);
		if (pl == NULL)
			return (EINVAL);
	}

	kqpoll_init(nfds);

	sz = nfds * sizeof(*pl);

	if ((error = copyin(fds, pl, sz)) != 0)
		goto bad;

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

	/* Register kqueue events */
	ppollregister(p, pl, nfds, &nevents, &ncollected);

	/*
	 * The poll/select family of syscalls has been designed to
	 * block when file descriptors are not available, even if
	 * there's nothing to wait for.
	 */
	if (nevents == 0 && ncollected == 0) {
		uint64_t nsecs = INFSLP;

		if (timeout != NULL) {
			if (!timespecisset(timeout))
				goto done;
			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
		}

		error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqpoll", nsecs);
		/* poll is not restarted after signals... */
		if (error == ERESTART)
			error = EINTR;
		if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* Do not block if registering found pending events. */
	if (ncollected > 0)
		timeout = &zerots;

	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
	kqueue_scan_setup(&scan, p->p_kq);
	while (nevents > 0) {
		struct kevent kev[KQ_NEVENTS];
		int i, ready, count;

		/* Maximum number of events per iteration */
		count = MIN(nitems(kev), nevents);
		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);

		/* Convert back events that are ready. */
		for (i = 0; i < ready; i++)
			ncollected += ppollcollect(p, &kev[i], pl, nfds);

		/*
		 * Stop if there was an error or if we had enough
		 * place to collect all events that were ready.
		 */
		if (error || ready < count)
			break;

		nevents -= ready;
	}
	kqueue_scan_finish(&scan);
	*retval = ncollected;
done:
	/*
	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
	 * ignored (since the whole point is to see what would block).
	 */
	switch (error) {
	case EINTR:
		/* Report collected revents even when interrupted. */
		error = pollout(pl, fds, nfds);
		if (error == 0)
			error = EINTR;
		break;
	case EWOULDBLOCK:
	case 0:
		error = pollout(pl, fds, nfds);
		break;
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktrpollfd(p, pl, nfds);
#endif /* KTRACE */
bad:
	if (pl != pfds)
		free(pl, M_TEMP, sz);

	kqpoll_done(nfds);

	return (error);
}
1036
/*
 * Register the `nkev' kevents prepared for one pollfd entry on the
 * per-thread kqueue, mapping registration errors to the appropriate
 * revents bits (POLLNVAL/POLLHUP/POLLERR) on `pl'.
 *
 * `pollid' distinguishes multiple pollfd entries naming the same fd.
 * Returns the number of events successfully registered.
 */
int
ppollregister_evts(struct proc *p, struct kevent *kevp, int nkev,
    struct pollfd *pl, unsigned int pollid)
{
	int i, error, nevents = 0;

	KASSERT(pl->revents == 0);

	for (i = 0; i < nkev; i++, kevp++) {
again:
		error = kqueue_register(p->p_kq, kevp, pollid, p);
		switch (error) {
		case 0:
			nevents++;
			break;
		case EOPNOTSUPP:/* No underlying kqfilter */
		case EINVAL:	/* Unimplemented filter */
			break;
		case EBADF:	/* Bad file descriptor */
			pl->revents |= POLLNVAL;
			break;
		case EPERM:	/* Specific to FIFO */
			KASSERT(kevp->filter == EVFILT_WRITE);
			if (nkev == 1) {
				/*
				 * If this is the only filter make sure
				 * POLLHUP is passed to userland.
				 */
				kevp->filter = EVFILT_EXCEPT;
				goto again;
			}
			break;
		case EPIPE:	/* Specific to pipes */
			KASSERT(kevp->filter == EVFILT_WRITE);
			pl->revents |= POLLHUP;
			break;
		default:
			DPRINTFN(0, "poll err %lu fd %d revents %02x serial"
			    " %lu filt %d ERROR=%d\n",
			    ((unsigned long)kevp->udata - p->p_kq_serial),
			    pl->fd, pl->revents, p->p_kq_serial, kevp->filter,
			    error);
			/* FALLTHROUGH */
		case ENXIO:	/* Device has been detached */
			pl->revents |= POLLERR;
			break;
		}
	}

	return (nevents);
}
1088
1089 /*
1090 * Convert pollfd into kqueue events and register them on the
1091 * per-thread queue.
1092 *
1093 * At most 3 events can correspond to a single pollfd.
1094 */
1095 void
1096 ppollregister(struct proc *p, struct pollfd *pl, int nfds, int *nregistered,
1097 int *ncollected)
1098 {
1099 int i, nkev, nevt, forcehup;
1100 struct kevent kev[3], *kevp;
1101
1102 for (i = 0; i < nfds; i++) {
1103 pl[i].events &= ~POLL_NOHUP;
1104 pl[i].revents = 0;
1105
1106 if (pl[i].fd < 0)
1107 continue;
1108
1109 /*
1110 * POLLHUP checking is implicit in the event filters.
1111 * However, the checking must be even if no events are
1112 * requested.
1113 */
1114 forcehup = ((pl[i].events & ~POLLHUP) == 0);
1115
1116 DPRINTFN(1, "poll set %d/%d fd %d events %02x serial %lu\n",
1117 i+1, nfds, pl[i].fd, pl[i].events, p->p_kq_serial);
1118
1119 nevt = 0;
1120 nkev = 0;
1121 kevp = kev;
1122 if (pl[i].events & (POLLIN | POLLRDNORM)) {
1123 EV_SET(kevp, pl[i].fd, EVFILT_READ,
1124 EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
1125 (void *)(p->p_kq_serial + i));
1126 nkev++;
1127 kevp++;
1128 }
1129 if (pl[i].events & (POLLOUT | POLLWRNORM)) {
1130 EV_SET(kevp, pl[i].fd, EVFILT_WRITE,
1131 EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
1132 (void *)(p->p_kq_serial + i));
1133 nkev++;
1134 kevp++;
1135 }
1136 if ((pl[i].events & (POLLPRI | POLLRDBAND)) || forcehup) {
1137 int evff = forcehup ? 0 : NOTE_OOB;
1138
1139 EV_SET(kevp, pl[i].fd, EVFILT_EXCEPT,
1140 EV_ADD|EV_ENABLE|__EV_POLL, evff, 0,
1141 (void *)(p->p_kq_serial + i));
1142 nkev++;
1143 kevp++;
1144 }
1145
1146 if (nkev == 0)
1147 continue;
1148
1149 *nregistered += ppollregister_evts(p, kev, nkev, &pl[i], i);
1150
1151 if (pl[i].revents != 0)
1152 (*ncollected)++;
1153 }
1154
1155 DPRINTFN(1, "poll registered = %d, collected = %d\n", *nregistered,
1156 *ncollected);
1157 }
1158
1159 /*
1160 * Convert given kqueue event into corresponding poll(2) revents bit.
1161 */
1162 int
1163 ppollcollect(struct proc *p, struct kevent *kevp, struct pollfd *pl, u_int nfds)
1164 {
1165 static struct timeval poll_errintvl = { 5, 0 };
1166 static struct timeval poll_lasterr;
1167 int already_seen;
1168 unsigned long i;
1169
1170 /* Extract poll array index */
1171 i = (unsigned long)kevp->udata - p->p_kq_serial;
1172
1173 if (i >= nfds) {
1174 panic("%s: spurious kevp %p nfds %u udata 0x%lx serial 0x%lx",
1175 __func__, kevp, nfds,
1176 (unsigned long)kevp->udata, p->p_kq_serial);
1177 }
1178 if ((int)kevp->ident != pl[i].fd) {
1179 panic("%s: kevp %p %lu/%d mismatch fd %d!=%d serial 0x%lx",
1180 __func__, kevp, i + 1, nfds, (int)kevp->ident, pl[i].fd,
1181 p->p_kq_serial);
1182 }
1183
1184 /*
1185 * A given descriptor may already have generated an error
1186 * against another filter during kqueue_register().
1187 *
1188 * Make sure to set the appropriate flags but do not
1189 * increment `*retval' more than once.
1190 */
1191 already_seen = (pl[i].revents != 0);
1192
1193 /* POLLNVAL preempts other events. */
1194 if ((kevp->flags & EV_ERROR) && kevp->data == EBADF) {
1195 pl[i].revents = POLLNVAL;
1196 goto done;
1197 } else if (pl[i].revents & POLLNVAL) {
1198 goto done;
1199 }
1200
1201 switch (kevp->filter) {
1202 case EVFILT_READ:
1203 if (kevp->flags & __EV_HUP)
1204 pl[i].revents |= POLLHUP;
1205 if (pl[i].events & (POLLIN | POLLRDNORM))
1206 pl[i].revents |= pl[i].events & (POLLIN | POLLRDNORM);
1207 break;
1208 case EVFILT_WRITE:
1209 /* POLLHUP and POLLOUT/POLLWRNORM are mutually exclusive */
1210 if (kevp->flags & __EV_HUP) {
1211 pl[i].revents |= POLLHUP;
1212 } else if (pl[i].events & (POLLOUT | POLLWRNORM)) {
1213 pl[i].revents |= pl[i].events & (POLLOUT | POLLWRNORM);
1214 }
1215 break;
1216 case EVFILT_EXCEPT:
1217 if (kevp->flags & __EV_HUP) {
1218 if (pl[i].events != 0 && pl[i].events != POLLOUT)
1219 DPRINTFN(0, "weird events %x\n", pl[i].events);
1220 pl[i].revents |= POLLHUP;
1221 break;
1222 }
1223 if (pl[i].events & (POLLPRI | POLLRDBAND))
1224 pl[i].revents |= pl[i].events & (POLLPRI | POLLRDBAND);
1225 break;
1226 default:
1227 KASSERT(0);
1228 }
1229
1230 done:
1231 DPRINTFN(1, "poll get %lu/%d fd %d revents %02x serial %lu filt %d\n",
1232 i+1, nfds, pl[i].fd, pl[i].revents, (unsigned long)kevp->udata,
1233 kevp->filter);
1234
1235 /*
1236 * Make noise about unclaimed events as they might indicate a bug
1237 * and can result in spurious-looking wakeups of poll(2).
1238 *
1239 * Live-locking within the system call should not happen because
1240 * the scan loop in doppoll() has an upper limit for the number
1241 * of events to process.
1242 */
1243 if (pl[i].revents == 0 && ratecheck(&poll_lasterr, &poll_errintvl)) {
1244 printf("%s[%d]: poll index %lu fd %d events 0x%x "
1245 "filter %d/0x%x unclaimed\n",
1246 p->p_p->ps_comm, p->p_tid, i, pl[i].fd,
1247 pl[i].events, kevp->filter, kevp->flags);
1248 }
1249
1250 if (!already_seen && (pl[i].revents != 0))
1251 return (1);
1252
1253 return (0);
1254 }
1255
1256 /*
1257 * utrace system call
1258 */
1259 int
1260 sys_utrace(struct proc *curp, void *v, register_t *retval)
1261 {
1262 #ifdef KTRACE
1263 struct sys_utrace_args /* {
1264 syscallarg(const char *) label;
1265 syscallarg(const void *) addr;
1266 syscallarg(size_t) len;
1267 } */ *uap = v;
1268
1269 return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
1270 SCARG(uap, len)));
1271 #else
1272 return (0);
1273 #endif
1274 }