1 /*-
2 * Copyright (c) 2007 Roman Divacky
3 * Copyright (c) 2014 Dmitry Chagin
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD: releng/11.1/sys/compat/linux/linux_event.c 316302 2017-03-30 20:14:43Z dchagin $");
30
31 #include "opt_compat.h"
32
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/imgact.h>
36 #include <sys/kernel.h>
37 #include <sys/limits.h>
38 #include <sys/lock.h>
39 #include <sys/mutex.h>
40 #include <sys/callout.h>
41 #include <sys/capsicum.h>
42 #include <sys/types.h>
43 #include <sys/user.h>
44 #include <sys/file.h>
45 #include <sys/filedesc.h>
46 #include <sys/filio.h>
47 #include <sys/errno.h>
48 #include <sys/event.h>
49 #include <sys/poll.h>
50 #include <sys/proc.h>
51 #include <sys/selinfo.h>
52 #include <sys/sx.h>
53 #include <sys/syscallsubr.h>
54 #include <sys/timespec.h>
55
56 #ifdef COMPAT_LINUX32
57 #include <machine/../linux32/linux.h>
58 #include <machine/../linux32/linux32_proto.h>
59 #else
60 #include <machine/../linux/linux.h>
61 #include <machine/../linux/linux_proto.h>
62 #endif
63
64 #include <compat/linux/linux_emul.h>
65 #include <compat/linux/linux_event.h>
66 #include <compat/linux/linux_file.h>
67 #include <compat/linux/linux_timer.h>
68 #include <compat/linux/linux_util.h>
69
/*
 * epoll defines 'struct epoll_event' with the field 'data' as 64 bits
 * on all architectures. But on 32 bit architectures BSD 'struct kevent' only
 * has a 32 bit opaque pointer as its 'udata' field. So we can't pass the
 * epoll-supplied data verbatim. Therefore we allocate a 64-bit memory block
 * to hold the user-supplied data for every file descriptor.
 */
77
/* Linux epoll carries 64 bits of opaque user data on all architectures. */
typedef uint64_t epoll_udata_t;

/*
 * Per-process table of user-supplied epoll data, indexed directly by
 * file descriptor number.  Grown on demand by epoll_fd_install().
 */
struct epoll_emuldata {
	uint32_t fdc;			/* epoll udata max index */
	epoll_udata_t udata[1];		/* epoll user data vector */
};

/* Initial max index preallocated for the udata vector. */
#define	EPOLL_DEF_SZ		16
/* Bytes needed for a vector whose largest valid index is 'fdn'. */
#define	EPOLL_SIZE(fdn)			\
	(sizeof(struct epoll_emuldata)+(fdn) * sizeof(epoll_udata_t))

/*
 * Linux ABI layout of struct epoll_event; packed on amd64 so the 64-bit
 * 'data' member is not padded out to an 8-byte boundary.
 */
struct epoll_event {
	uint32_t	events;
	epoll_udata_t	data;
}
#if defined(__amd64__)
__attribute__((packed))
#endif
;

/* Upper bound on the 'maxevents' argument accepted by epoll_wait(). */
#define	LINUX_MAX_EVENTS	(INT_MAX / sizeof(struct epoll_event))
99
/* Forward declarations for the epoll emulation helpers below. */
static void	epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata);
static int	epoll_to_kevent(struct thread *td, struct file *epfp,
		    int fd, struct epoll_event *l_event, int *kev_flags,
		    struct kevent *kevent, int *nkevents);
static void	kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event);
static int	epoll_kev_copyout(void *arg, struct kevent *kevp, int count);
static int	epoll_kev_copyin(void *arg, struct kevent *kevp, int count);
static int	epoll_delete_event(struct thread *td, struct file *epfp,
		    int fd, int filter);
static int	epoll_delete_all_events(struct thread *td, struct file *epfp,
		    int fd);

/* Cursor state for the epoll_kev_copyin() kevent callback. */
struct epoll_copyin_args {
	struct kevent *changelist;	/* next pre-built kevent to hand over */
};

/* Cursor/result state for the epoll_kev_copyout() kevent callback. */
struct epoll_copyout_args {
	struct epoll_event *leventlist;	/* user buffer write position */
	struct proc *p;			/* process owning the udata vector */
	uint32_t count;			/* events copied out so far */
	int error;			/* first copyout error, if any */
};
122
/* eventfd */
/* Linux eventfd counter type: an unsigned 64-bit value. */
typedef uint64_t eventfd_t;

static fo_rdwr_t	eventfd_read;
static fo_rdwr_t	eventfd_write;
static fo_ioctl_t	eventfd_ioctl;
static fo_poll_t	eventfd_poll;
static fo_kqfilter_t	eventfd_kqfilter;
static fo_stat_t	eventfd_stat;
static fo_close_t	eventfd_close;
static fo_fill_kinfo_t	eventfd_fill_kinfo;

/* File operations backing a Linux eventfd descriptor. */
static struct fileops eventfdops = {
	.fo_read = eventfd_read,
	.fo_write = eventfd_write,
	.fo_truncate = invfo_truncate,
	.fo_ioctl = eventfd_ioctl,
	.fo_poll = eventfd_poll,
	.fo_kqfilter = eventfd_kqfilter,
	.fo_stat = eventfd_stat,
	.fo_close = eventfd_close,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = eventfd_fill_kinfo,
	.fo_flags = DFLAG_PASSABLE
};

static void	filt_eventfddetach(struct knote *kn);
static int	filt_eventfdread(struct knote *kn, long hint);
static int	filt_eventfdwrite(struct knote *kn, long hint);

/* kqueue read/write filters so an eventfd can itself be kevent'ed. */
static struct filterops eventfd_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_eventfddetach,
	.f_event = filt_eventfdread
};
static struct filterops eventfd_wfiltops = {
	.f_isfd = 1,
	.f_detach = filt_eventfddetach,
	.f_event = filt_eventfdwrite
};
165
/* timerfd */
/* Linux timerfd expiration counter type: an unsigned 64-bit value. */
typedef uint64_t timerfd_t;

static fo_rdwr_t	timerfd_read;
static fo_poll_t	timerfd_poll;
static fo_kqfilter_t	timerfd_kqfilter;
static fo_stat_t	timerfd_stat;
static fo_close_t	timerfd_close;
static fo_fill_kinfo_t	timerfd_fill_kinfo;

/*
 * File operations backing a Linux timerfd descriptor.  The ioctl
 * handler is shared with eventfd (it only toggles FNONBLOCK).
 */
static struct fileops timerfdops = {
	.fo_read = timerfd_read,
	.fo_write = invfo_rdwr,
	.fo_truncate = invfo_truncate,
	.fo_ioctl = eventfd_ioctl,
	.fo_poll = timerfd_poll,
	.fo_kqfilter = timerfd_kqfilter,
	.fo_stat = timerfd_stat,
	.fo_close = timerfd_close,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = timerfd_fill_kinfo,
	.fo_flags = DFLAG_PASSABLE
};

static void	filt_timerfddetach(struct knote *kn);
static int	filt_timerfdread(struct knote *kn, long hint);

/* kqueue read filter for timerfd; writes are invalid (invfo_rdwr). */
static struct filterops timerfd_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_timerfddetach,
	.f_event = filt_timerfdread
};

/* Per-descriptor eventfd state; efd_lock protects all fields. */
struct eventfd {
	eventfd_t	efd_count;	/* current counter value */
	uint32_t	efd_flags;	/* LINUX_EFD_* creation flags */
	struct selinfo	efd_sel;	/* poll/kevent plumbing */
	struct mtx	efd_lock;
};

/* Per-descriptor timerfd state; tfd_lock protects all fields. */
struct timerfd {
	clockid_t	tfd_clockid;	/* CLOCK_REALTIME or CLOCK_MONOTONIC */
	struct itimerspec tfd_time;	/* absolute next expiry + interval */
	struct callout	tfd_callout;	/* fires linux_timerfd_expire() */
	timerfd_t	tfd_count;	/* expirations since last read */
	bool		tfd_canceled;	/* timer disarmed via settime(0) */
	struct selinfo	tfd_sel;	/* poll/kevent plumbing */
	struct mtx	tfd_lock;
};

static int	eventfd_create(struct thread *td, uint32_t initval, int flags);
static void	linux_timerfd_expire(void *);
static void	linux_timerfd_curval(struct timerfd *, struct itimerspec *);
221
222
/*
 * Remember the epoll user data for descriptor 'fd' in the per-process
 * emulation data, growing the vector as needed.  The vector is indexed
 * directly by fd; EPOLL_SIZE(fd) sizes it for indices 0..fd inclusive.
 * Sleeps are safe here: LINUX_PEM_XLOCK is a sleepable lock.
 */
static void
epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata)
{
	struct linux_pemuldata *pem;
	struct epoll_emuldata *emd;
	struct proc *p;

	p = td->td_proc;

	pem = pem_find(p);
	KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));

	LINUX_PEM_XLOCK(pem);
	if (pem->epoll == NULL) {
		/* First use: allocate a vector large enough for 'fd'. */
		emd = malloc(EPOLL_SIZE(fd), M_EPOLL, M_WAITOK);
		emd->fdc = fd;
		pem->epoll = emd;
	} else {
		emd = pem->epoll;
		if (fd > emd->fdc) {
			/* Grow the vector to cover the new max index. */
			emd = realloc(emd, EPOLL_SIZE(fd), M_EPOLL, M_WAITOK);
			emd->fdc = fd;
			pem->epoll = emd;
		}
	}
	emd->udata[fd] = udata;
	LINUX_PEM_XUNLOCK(pem);
}
251
252 static int
253 epoll_create_common(struct thread *td, int flags)
254 {
255 int error;
256
257 error = kern_kqueue(td, flags, NULL);
258 if (error != 0)
259 return (error);
260
261 epoll_fd_install(td, EPOLL_DEF_SZ, 0);
262
263 return (0);
264 }
265
266 int
267 linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
268 {
269
270 /*
271 * args->size is unused. Linux just tests it
272 * and then forgets it as well.
273 */
274 if (args->size <= 0)
275 return (EINVAL);
276
277 return (epoll_create_common(td, 0));
278 }
279
280 int
281 linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
282 {
283 int flags;
284
285 if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0)
286 return (EINVAL);
287
288 flags = 0;
289 if ((args->flags & LINUX_O_CLOEXEC) != 0)
290 flags |= O_CLOEXEC;
291
292 return (epoll_create_common(td, flags));
293 }
294
/*
 * Structure converting function from epoll to kevent.
 *
 * Builds up to two kevents (EVFILT_READ and/or EVFILT_WRITE) in
 * 'kevent' for descriptor 'fd' out of the Linux event mask, bumping
 * *nkevents for each.  Returns EINVAL (logging once per process) if
 * the mask contains event bits we do not support.
 */
static int
epoll_to_kevent(struct thread *td, struct file *epfp,
    int fd, struct epoll_event *l_event, int *kev_flags,
    struct kevent *kevent, int *nkevents)
{
	uint32_t levents = l_event->events;
	struct linux_pemuldata *pem;
	struct proc *p;

	/*
	 * Flags related to how the event is registered.
	 * NOTE(review): EV_ERROR and EV_EOF are normally output flags of
	 * kqueue; passing them in on registration looks questionable --
	 * confirm against kqueue_register() semantics.
	 */
	if ((levents & LINUX_EPOLLONESHOT) != 0)
		*kev_flags |= EV_ONESHOT;
	if ((levents & LINUX_EPOLLET) != 0)
		*kev_flags |= EV_CLEAR;
	if ((levents & LINUX_EPOLLERR) != 0)
		*kev_flags |= EV_ERROR;
	if ((levents & LINUX_EPOLLRDHUP) != 0)
		*kev_flags |= EV_EOF;

	/* flags related to what event is registered */
	if ((levents & LINUX_EPOLL_EVRD) != 0) {
		EV_SET(kevent++, fd, EVFILT_READ, *kev_flags, 0, 0, 0);
		++(*nkevents);
	}
	if ((levents & LINUX_EPOLL_EVWR) != 0) {
		EV_SET(kevent++, fd, EVFILT_WRITE, *kev_flags, 0, 0, 0);
		++(*nkevents);
	}

	/* Reject unsupported bits, warning only once per process. */
	if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) {
		p = td->td_proc;

		pem = pem_find(p);
		KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
		KASSERT(pem->epoll != NULL, ("epoll proc epolldata not found.\n"));

		LINUX_PEM_XLOCK(pem);
		if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) {
			pem->flags |= LINUX_XUNSUP_EPOLL;
			LINUX_PEM_XUNLOCK(pem);
			linux_msg(td, "epoll_ctl unsupported flags: 0x%x\n",
			    levents);
		} else
			LINUX_PEM_XUNLOCK(pem);
		return (EINVAL);
	}

	return (0);
}
345
346 /*
347 * Structure converting function from kevent to epoll. In a case
348 * this is called on error in registration we store the error in
349 * event->data and pick it up later in linux_epoll_ctl().
350 */
351 static void
352 kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
353 {
354
355 if ((kevent->flags & EV_ERROR) != 0) {
356 l_event->events = LINUX_EPOLLERR;
357 return;
358 }
359
360 /* XXX EPOLLPRI, EPOLLHUP */
361 switch (kevent->filter) {
362 case EVFILT_READ:
363 l_event->events = LINUX_EPOLLIN;
364 if ((kevent->flags & EV_EOF) != 0)
365 l_event->events |= LINUX_EPOLLRDHUP;
366 break;
367 case EVFILT_WRITE:
368 l_event->events = LINUX_EPOLLOUT;
369 break;
370 }
371 }
372
/*
 * Copyout callback used by kevent.  This converts kevent events to
 * epoll events, attaches the 64-bit user data stashed by
 * epoll_fd_install(), and copies them out to userspace.  It is also
 * called on error when registering a filter.
 */
static int
epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
{
	struct epoll_copyout_args *args;
	struct linux_pemuldata *pem;
	struct epoll_emuldata *emd;
	struct epoll_event *eep;
	int error, fd, i;

	args = (struct epoll_copyout_args*) arg;
	/* M_ZERO so unhandled filters yield an all-zero events mask. */
	eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO);

	pem = pem_find(args->p);
	KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
	LINUX_PEM_SLOCK(pem);
	emd = pem->epoll;
	KASSERT(emd != NULL, ("epoll proc epolldata not found.\n"));

	for (i = 0; i < count; i++) {
		kevent_to_epoll(&kevp[i], &eep[i]);

		/* kevent ident is the fd; look up its saved user data. */
		fd = kevp[i].ident;
		KASSERT(fd <= emd->fdc, ("epoll user data vector"
		    " is too small.\n"));
		eep[i].data = emd->udata[fd];
	}
	LINUX_PEM_SUNLOCK(pem);

	error = copyout(eep, args->leventlist, count * sizeof(*eep));
	if (error == 0) {
		args->leventlist += count;
		args->count += count;
	} else if (args->error == 0)
		args->error = error;	/* remember the first failure only */

	free(eep, M_EPOLL);
	return (error);
}
417
418 /*
419 * Copyin callback used by kevent. This copies already
420 * converted filters from kernel memory to the kevent
421 * internal kernel memory. Hence the memcpy instead of
422 * copyin.
423 */
424 static int
425 epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
426 {
427 struct epoll_copyin_args *args;
428
429 args = (struct epoll_copyin_args*) arg;
430
431 memcpy(kevp, args->changelist, count * sizeof(*kevp));
432 args->changelist += count;
433
434 return (0);
435 }
436
/*
 * Load epoll filter, convert it to kevent filter
 * and load it into kevent subsystem.
 *
 * ADD registers new read/write kevents, MOD deletes then re-adds them,
 * DEL unregisters both filters.  The epoll user data travels separately
 * via epoll_fd_install() since 'udata' is pointer-sized in kevent.
 */
int
linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
{
	struct file *epfp, *fp;
	struct epoll_copyin_args ciargs;
	struct kevent kev[2];
	struct kevent_copyops k_ops = { &ciargs,
					NULL,
					epoll_kev_copyin};
	struct epoll_event le;
	cap_rights_t rights;
	int kev_flags;
	int nchanges = 0;
	int error;

	/* ADD and MOD carry an event structure; DEL does not. */
	if (args->op != LINUX_EPOLL_CTL_DEL) {
		error = copyin(args->event, &le, sizeof(le));
		if (error != 0)
			return (error);
	}

	error = fget(td, args->epfd,
	    cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &epfp);
	if (error != 0)
		return (error);
	if (epfp->f_type != DTYPE_KQUEUE) {
		error = EINVAL;
		goto leave1;
	}

	/* Protect user data vector from incorrectly supplied fd. */
	error = fget(td, args->fd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp);
	if (error != 0)
		goto leave1;

	/* Linux disallows an epoll instance spying on itself. */
	if (epfp == fp) {
		error = EINVAL;
		goto leave0;
	}

	ciargs.changelist = kev;

	if (args->op != LINUX_EPOLL_CTL_DEL) {
		kev_flags = EV_ADD | EV_ENABLE;
		error = epoll_to_kevent(td, epfp, args->fd, &le,
		    &kev_flags, kev, &nchanges);
		if (error != 0)
			goto leave0;
	}

	switch (args->op) {
	case LINUX_EPOLL_CTL_MOD:
		/* MOD is emulated as delete-then-re-add below. */
		error = epoll_delete_all_events(td, epfp, args->fd);
		if (error != 0)
			goto leave0;
		break;

	case LINUX_EPOLL_CTL_ADD:
		/*
		 * kqueue_register() return ENOENT if event does not exists
		 * and the EV_ADD flag is not set.  Probe with EV_ADD
		 * cleared so an already-registered fd yields EEXIST, as
		 * Linux requires.
		 * NOTE(review): any non-ENOENT result (including success
		 * or e.g. EBADF) is folded into EEXIST, and only kev[0]
		 * is probed even when two filters were built -- confirm
		 * this matches the intended Linux semantics.
		 */
		kev[0].flags &= ~EV_ADD;
		error = kqfd_register(args->epfd, &kev[0], td, 1);
		if (error != ENOENT) {
			error = EEXIST;
			goto leave0;
		}
		error = 0;
		kev[0].flags |= EV_ADD;
		break;

	case LINUX_EPOLL_CTL_DEL:
		/* CTL_DEL means unregister this fd with this epoll */
		error = epoll_delete_all_events(td, epfp, args->fd);
		goto leave0;

	default:
		error = EINVAL;
		goto leave0;
	}

	/* Save the 64-bit user data before handing changes to kqueue. */
	epoll_fd_install(td, args->fd, le.data);

	error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL);

leave0:
	fdrop(fp, td);

leave1:
	fdrop(epfp, td);
	return (error);
}
535
536 /*
537 * Wait for a filter to be triggered on the epoll file descriptor.
538 */
539 static int
540 linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events,
541 int maxevents, int timeout, sigset_t *uset)
542 {
543 struct epoll_copyout_args coargs;
544 struct kevent_copyops k_ops = { &coargs,
545 epoll_kev_copyout,
546 NULL};
547 struct timespec ts, *tsp;
548 cap_rights_t rights;
549 struct file *epfp;
550 sigset_t omask;
551 int error;
552
553 if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS)
554 return (EINVAL);
555
556 error = fget(td, epfd,
557 cap_rights_init(&rights, CAP_KQUEUE_EVENT), &epfp);
558 if (error != 0)
559 return (error);
560 if (epfp->f_type != DTYPE_KQUEUE) {
561 error = EINVAL;
562 goto leave1;
563 }
564 if (uset != NULL) {
565 error = kern_sigprocmask(td, SIG_SETMASK, uset,
566 &omask, 0);
567 if (error != 0)
568 goto leave1;
569 td->td_pflags |= TDP_OLDMASK;
570 /*
571 * Make sure that ast() is called on return to
572 * usermode and TDP_OLDMASK is cleared, restoring old
573 * sigmask.
574 */
575 thread_lock(td);
576 td->td_flags |= TDF_ASTPENDING;
577 thread_unlock(td);
578 }
579
580
581 coargs.leventlist = events;
582 coargs.p = td->td_proc;
583 coargs.count = 0;
584 coargs.error = 0;
585
586 if (timeout != -1) {
587 if (timeout < 0) {
588 error = EINVAL;
589 goto leave0;
590 }
591 /* Convert from milliseconds to timespec. */
592 ts.tv_sec = timeout / 1000;
593 ts.tv_nsec = (timeout % 1000) * 1000000;
594 tsp = &ts;
595 } else {
596 tsp = NULL;
597 }
598
599 error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp);
600 if (error == 0 && coargs.error != 0)
601 error = coargs.error;
602
603 /*
604 * kern_kevent might return ENOMEM which is not expected from epoll_wait.
605 * Maybe we should translate that but I don't think it matters at all.
606 */
607 if (error == 0)
608 td->td_retval[0] = coargs.count;
609
610 leave0:
611 if (uset != NULL)
612 error = kern_sigprocmask(td, SIG_SETMASK, &omask,
613 NULL, 0);
614 leave1:
615 fdrop(epfp, td);
616 return (error);
617 }
618
/* epoll_wait(2): epoll_pwait() without a temporary signal mask. */
int
linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
{

	return (linux_epoll_wait_common(td, args->epfd, args->events,
	    args->maxevents, args->timeout, NULL));
}
626
627 int
628 linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args)
629 {
630 sigset_t mask, *pmask;
631 l_sigset_t lmask;
632 int error;
633
634 if (args->mask != NULL) {
635 if (args->sigsetsize != sizeof(l_sigset_t))
636 return (EINVAL);
637 error = copyin(args->mask, &lmask, sizeof(l_sigset_t));
638 if (error != 0)
639 return (error);
640 linux_to_bsd_sigset(&lmask, &mask);
641 pmask = &mask;
642 } else
643 pmask = NULL;
644 return (linux_epoll_wait_common(td, args->epfd, args->events,
645 args->maxevents, args->timeout, pmask));
646 }
647
648 static int
649 epoll_delete_event(struct thread *td, struct file *epfp, int fd, int filter)
650 {
651 struct epoll_copyin_args ciargs;
652 struct kevent kev;
653 struct kevent_copyops k_ops = { &ciargs,
654 NULL,
655 epoll_kev_copyin};
656
657 ciargs.changelist = &kev;
658 EV_SET(&kev, fd, filter, EV_DELETE | EV_DISABLE, 0, 0, 0);
659
660 return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL));
661 }
662
663 static int
664 epoll_delete_all_events(struct thread *td, struct file *epfp, int fd)
665 {
666 int error1, error2;
667
668 error1 = epoll_delete_event(td, epfp, fd, EVFILT_READ);
669 error2 = epoll_delete_event(td, epfp, fd, EVFILT_WRITE);
670
671 /* return 0 if at least one result positive */
672 return (error1 == 0 ? 0 : error2);
673 }
674
675 static int
676 eventfd_create(struct thread *td, uint32_t initval, int flags)
677 {
678 struct filedesc *fdp;
679 struct eventfd *efd;
680 struct file *fp;
681 int fflags, fd, error;
682
683 fflags = 0;
684 if ((flags & LINUX_O_CLOEXEC) != 0)
685 fflags |= O_CLOEXEC;
686
687 fdp = td->td_proc->p_fd;
688 error = falloc(td, &fp, &fd, fflags);
689 if (error != 0)
690 return (error);
691
692 efd = malloc(sizeof(*efd), M_EPOLL, M_WAITOK | M_ZERO);
693 efd->efd_flags = flags;
694 efd->efd_count = initval;
695 mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF);
696
697 knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock);
698
699 fflags = FREAD | FWRITE;
700 if ((flags & LINUX_O_NONBLOCK) != 0)
701 fflags |= FNONBLOCK;
702
703 finit(fp, fflags, DTYPE_LINUXEFD, efd, &eventfdops);
704 fdrop(fp, td);
705
706 td->td_retval[0] = fd;
707 return (error);
708 }
709
/* eventfd(2): the legacy entry point, equivalent to eventfd2(..., 0). */
int
linux_eventfd(struct thread *td, struct linux_eventfd_args *args)
{

	return (eventfd_create(td, args->initval, 0));
}
716
717 int
718 linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args)
719 {
720
721 if ((args->flags & ~(LINUX_O_CLOEXEC|LINUX_O_NONBLOCK|LINUX_EFD_SEMAPHORE)) != 0)
722 return (EINVAL);
723
724 return (eventfd_create(td, args->initval, args->flags));
725 }
726
/*
 * fo_close for eventfd: drain pollers, destroy the knote list and the
 * lock, and free the eventfd state.  File reference counting ensures
 * we are the last user when this runs.
 */
static int
eventfd_close(struct file *fp, struct thread *td)
{
	struct eventfd *efd;

	efd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
		return (EINVAL);

	seldrain(&efd->efd_sel);
	knlist_destroy(&efd->efd_sel.si_note);

	/* Neuter the file so any late callers fail cleanly. */
	fp->f_ops = &badfileops;
	mtx_destroy(&efd->efd_lock);
	free(efd, M_EPOLL);

	return (0);
}
745
/*
 * fo_read for eventfd: a read returns the 8-byte counter and resets it
 * (or, in semaphore mode, returns 1 and decrements it).  Blocks while
 * the counter is zero unless the descriptor is non-blocking.
 */
static int
eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct eventfd *efd;
	eventfd_t count;
	int error;

	efd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
		return (EINVAL);

	/* Linux requires reads of at least 8 bytes. */
	if (uio->uio_resid < sizeof(eventfd_t))
		return (EINVAL);

	error = 0;
	mtx_lock(&efd->efd_lock);
retry:
	if (efd->efd_count == 0) {
		if ((fp->f_flag & FNONBLOCK) != 0) {
			mtx_unlock(&efd->efd_lock);
			return (EAGAIN);
		}
		/* Wait for a writer; PCATCH lets signals interrupt us. */
		error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "lefdrd", 0);
		if (error == 0)
			goto retry;
	}
	if (error == 0) {
		if ((efd->efd_flags & LINUX_EFD_SEMAPHORE) != 0) {
			/* Semaphore mode: consume a single unit. */
			count = 1;
			--efd->efd_count;
		} else {
			/* Normal mode: return and clear the whole counter. */
			count = efd->efd_count;
			efd->efd_count = 0;
		}
		/* Notify writers/pollers that there is room again. */
		KNOTE_LOCKED(&efd->efd_sel.si_note, 0);
		selwakeup(&efd->efd_sel);
		wakeup(&efd->efd_count);
		mtx_unlock(&efd->efd_lock);
		error = uiomove(&count, sizeof(eventfd_t), uio);
	} else
		mtx_unlock(&efd->efd_lock);

	return (error);
}
791
/*
 * fo_write for eventfd: add the written 8-byte value to the counter.
 * A value of UINT64_MAX is invalid; if the addition would overflow the
 * counter past UINT64_MAX-1 the write blocks (or fails with EAGAIN)
 * until a reader makes room.
 */
static int
eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct eventfd *efd;
	eventfd_t count;
	int error;

	efd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
		return (EINVAL);

	if (uio->uio_resid < sizeof(eventfd_t))
		return (EINVAL);

	error = uiomove(&count, sizeof(eventfd_t), uio);
	if (error != 0)
		return (error);
	if (count == UINT64_MAX)
		return (EINVAL);

	mtx_lock(&efd->efd_lock);
retry:
	if (UINT64_MAX - efd->efd_count <= count) {
		if ((fp->f_flag & FNONBLOCK) != 0) {
			mtx_unlock(&efd->efd_lock);
			/* Do not return the number of bytes written. */
			uio->uio_resid += sizeof(eventfd_t);
			return (EAGAIN);
		}
		/* Wait for a reader to drain the counter. */
		error = mtx_sleep(&efd->efd_count, &efd->efd_lock,
		    PCATCH, "lefdwr", 0);
		if (error == 0)
			goto retry;
	}
	if (error == 0) {
		efd->efd_count += count;
		/* Counter became non-zero: wake readers and pollers. */
		KNOTE_LOCKED(&efd->efd_sel.si_note, 0);
		selwakeup(&efd->efd_sel);
		wakeup(&efd->efd_count);
	}
	mtx_unlock(&efd->efd_lock);

	return (error);
}
837
/*
 * fo_poll for eventfd: readable while the counter is non-zero,
 * writable while at least a write of 1 could complete without
 * blocking.
 */
static int
eventfd_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
	struct eventfd *efd;
	int revents = 0;

	efd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
		return (POLLERR);

	mtx_lock(&efd->efd_lock);
	if ((events & (POLLIN|POLLRDNORM)) && efd->efd_count > 0)
		revents |= events & (POLLIN|POLLRDNORM);
	if ((events & (POLLOUT|POLLWRNORM)) && UINT64_MAX - 1 > efd->efd_count)
		revents |= events & (POLLOUT|POLLWRNORM);
	if (revents == 0)
		selrecord(td, &efd->efd_sel);
	mtx_unlock(&efd->efd_lock);

	return (revents);
}
860
/*ARGSUSED*/
/*
 * fo_kqfilter for eventfd: attach a read or write knote; any other
 * filter is rejected.  The knlist is protected by efd_lock, hence the
 * locked (1) form of knlist_add().
 */
static int
eventfd_kqfilter(struct file *fp, struct knote *kn)
{
	struct eventfd *efd;

	efd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
		return (EINVAL);

	mtx_lock(&efd->efd_lock);
	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &eventfd_rfiltops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &eventfd_wfiltops;
		break;
	default:
		mtx_unlock(&efd->efd_lock);
		return (EINVAL);
	}

	kn->kn_hook = efd;
	knlist_add(&efd->efd_sel.si_note, kn, 1);
	mtx_unlock(&efd->efd_lock);

	return (0);
}
890
891 static void
892 filt_eventfddetach(struct knote *kn)
893 {
894 struct eventfd *efd = kn->kn_hook;
895
896 mtx_lock(&efd->efd_lock);
897 knlist_remove(&efd->efd_sel.si_note, kn, 1);
898 mtx_unlock(&efd->efd_lock);
899 }
900
901 /*ARGSUSED*/
902 static int
903 filt_eventfdread(struct knote *kn, long hint)
904 {
905 struct eventfd *efd = kn->kn_hook;
906 int ret;
907
908 mtx_assert(&efd->efd_lock, MA_OWNED);
909 ret = (efd->efd_count > 0);
910
911 return (ret);
912 }
913
914 /*ARGSUSED*/
915 static int
916 filt_eventfdwrite(struct knote *kn, long hint)
917 {
918 struct eventfd *efd = kn->kn_hook;
919 int ret;
920
921 mtx_assert(&efd->efd_lock, MA_OWNED);
922 ret = (UINT64_MAX - 1 > efd->efd_count);
923
924 return (ret);
925 }
926
/*ARGSUSED*/
/*
 * fo_ioctl shared by eventfd and timerfd: FIONBIO toggles FNONBLOCK,
 * FIOASYNC is accepted as a no-op, anything else is ENXIO.
 */
static int
eventfd_ioctl(struct file *fp, u_long cmd, void *data,
    struct ucred *active_cred, struct thread *td)
{

	if (fp->f_data == NULL || (fp->f_type != DTYPE_LINUXEFD &&
	    fp->f_type != DTYPE_LINUXTFD))
		return (EINVAL);

	switch (cmd)
	{
	case FIONBIO:
		if ((*(int *)data))
			atomic_set_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clear_int(&fp->f_flag, FNONBLOCK);
		/* FALLTHROUGH: FIONBIO also returns success. */
	case FIOASYNC:
		return (0);
	default:
		return (ENXIO);
	}
}
950
/*ARGSUSED*/
/* fstat(2) is not supported on an eventfd descriptor. */
static int
eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
    struct thread *td)
{

	return (ENXIO);
}
959
/*ARGSUSED*/
/* Report eventfd descriptors as "unknown" to kinfo consumers. */
static int
eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
{

	kif->kf_type = KF_TYPE_UNKNOWN;
	return (0);
}
968
969 int
970 linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args)
971 {
972 struct filedesc *fdp;
973 struct timerfd *tfd;
974 struct file *fp;
975 clockid_t clockid;
976 int fflags, fd, error;
977
978 if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0)
979 return (EINVAL);
980
981 error = linux_to_native_clockid(&clockid, args->clockid);
982 if (error != 0)
983 return (error);
984 if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
985 return (EINVAL);
986
987 fflags = 0;
988 if ((args->flags & LINUX_TFD_CLOEXEC) != 0)
989 fflags |= O_CLOEXEC;
990
991 fdp = td->td_proc->p_fd;
992 error = falloc(td, &fp, &fd, fflags);
993 if (error != 0)
994 return (error);
995
996 tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO);
997 tfd->tfd_clockid = clockid;
998 mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF);
999
1000 callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0);
1001 knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock);
1002
1003 fflags = FREAD;
1004 if ((args->flags & LINUX_O_NONBLOCK) != 0)
1005 fflags |= FNONBLOCK;
1006
1007 finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops);
1008 fdrop(fp, td);
1009
1010 td->td_retval[0] = fd;
1011 return (error);
1012 }
1013
/*
 * fo_close for timerfd: disarm the timer, drain the callout and any
 * pollers, then free the timerfd state.
 */
static int
timerfd_close(struct file *fp, struct thread *td)
{
	struct timerfd *tfd;

	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
		return (EINVAL);

	/* Clear the schedule so a racing expire callback won't rearm. */
	timespecclear(&tfd->tfd_time.it_value);
	timespecclear(&tfd->tfd_time.it_interval);

	mtx_lock(&tfd->tfd_lock);
	callout_drain(&tfd->tfd_callout);
	mtx_unlock(&tfd->tfd_lock);

	seldrain(&tfd->tfd_sel);
	knlist_destroy(&tfd->tfd_sel.si_note);

	/* Neuter the file so any late callers fail cleanly. */
	fp->f_ops = &badfileops;
	mtx_destroy(&tfd->tfd_lock);
	free(tfd, M_EPOLL);

	return (0);
}
1039
/*
 * fo_read for timerfd: return the 8-byte count of expirations since
 * the last read and reset it.  Blocks while the count is zero unless
 * the descriptor is non-blocking; returns ECANCELED when the timer was
 * disarmed via settime (tfd_canceled).
 */
static int
timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct timerfd *tfd;
	timerfd_t count;
	int error;

	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
		return (EINVAL);

	/* Linux requires reads of at least 8 bytes. */
	if (uio->uio_resid < sizeof(timerfd_t))
		return (EINVAL);

	error = 0;
	mtx_lock(&tfd->tfd_lock);
retry:
	if (tfd->tfd_canceled) {
		tfd->tfd_count = 0;
		mtx_unlock(&tfd->tfd_lock);
		return (ECANCELED);
	}
	if (tfd->tfd_count == 0) {
		if ((fp->f_flag & FNONBLOCK) != 0) {
			mtx_unlock(&tfd->tfd_lock);
			return (EAGAIN);
		}
		/* Wait for an expiration; PCATCH allows signal delivery. */
		error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0);
		if (error == 0)
			goto retry;
	}
	if (error == 0) {
		count = tfd->tfd_count;
		tfd->tfd_count = 0;
		mtx_unlock(&tfd->tfd_lock);
		error = uiomove(&count, sizeof(timerfd_t), uio);
	} else
		mtx_unlock(&tfd->tfd_lock);

	return (error);
}
1082
/*
 * fo_poll for timerfd: readable while at least one expiration is
 * pending; never writable.
 */
static int
timerfd_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
	struct timerfd *tfd;
	int revents = 0;

	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
		return (POLLERR);

	mtx_lock(&tfd->tfd_lock);
	if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0)
		revents |= events & (POLLIN|POLLRDNORM);
	if (revents == 0)
		selrecord(td, &tfd->tfd_sel);
	mtx_unlock(&tfd->tfd_lock);

	return (revents);
}
1103
/*ARGSUSED*/
/*
 * fo_kqfilter for timerfd: only EVFILT_READ attaches.
 * NOTE(review): knlist_add() is called with islocked=0 (taking the
 * lock internally), unlike eventfd_kqfilter() which holds the lock and
 * passes 1 -- confirm the asymmetry is intentional.
 */
static int
timerfd_kqfilter(struct file *fp, struct knote *kn)
{
	struct timerfd *tfd;

	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
		return (EINVAL);

	if (kn->kn_filter == EVFILT_READ)
		kn->kn_fop = &timerfd_rfiltops;
	else
		return (EINVAL);

	kn->kn_hook = tfd;
	knlist_add(&tfd->tfd_sel.si_note, kn, 0);

	return (0);
}
1124
1125 static void
1126 filt_timerfddetach(struct knote *kn)
1127 {
1128 struct timerfd *tfd = kn->kn_hook;
1129
1130 mtx_lock(&tfd->tfd_lock);
1131 knlist_remove(&tfd->tfd_sel.si_note, kn, 1);
1132 mtx_unlock(&tfd->tfd_lock);
1133 }
1134
1135 /*ARGSUSED*/
1136 static int
1137 filt_timerfdread(struct knote *kn, long hint)
1138 {
1139 struct timerfd *tfd = kn->kn_hook;
1140
1141 return (tfd->tfd_count > 0);
1142 }
1143
/*ARGSUSED*/
/* fstat(2) is not supported on a timerfd descriptor. */
static int
timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
    struct thread *td)
{

	return (ENXIO);
}
1152
/*ARGSUSED*/
/* Report timerfd descriptors as "unknown" to kinfo consumers. */
static int
timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
{

	kif->kf_type = KF_TYPE_UNKNOWN;
	return (0);
}
1161
1162 static void
1163 linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts)
1164 {
1165
1166 if (tfd->tfd_clockid == CLOCK_REALTIME)
1167 getnanotime(ts);
1168 else /* CLOCK_MONOTONIC */
1169 getnanouptime(ts);
1170 }
1171
/*
 * Compute the timer's current value for timerfd_gettime(): convert the
 * stored absolute expiry back to a time remaining.  If the timer is
 * armed but already past due, report 1ns remaining rather than zero
 * (zero would read as "disarmed").  Caller holds tfd_lock.
 */
static void
linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots)
{
	struct timespec cts;

	linux_timerfd_clocktime(tfd, &cts);
	*ots = tfd->tfd_time;
	if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) {
		/* Two-argument kernel macro: it_value -= cts. */
		timespecsub(&ots->it_value, &cts);
		if (ots->it_value.tv_sec < 0 ||
		    (ots->it_value.tv_sec == 0 &&
		     ots->it_value.tv_nsec == 0)) {
			ots->it_value.tv_sec  = 0;
			ots->it_value.tv_nsec = 1;
		}
	}
}
1189
/*
 * timerfd_gettime(2): copy the remaining time and interval of the
 * timer out to userland in Linux itimerspec format.
 */
int
linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args)
{
	cap_rights_t rights;
	struct l_itimerspec lots;
	struct itimerspec ots;
	struct timerfd *tfd;
	struct file *fp;
	int error;

	error = fget(td, args->fd, cap_rights_init(&rights, CAP_READ), &fp);
	if (error != 0)
		return (error);
	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
		error = EINVAL;
		goto out;
	}

	mtx_lock(&tfd->tfd_lock);
	linux_timerfd_curval(tfd, &ots);
	mtx_unlock(&tfd->tfd_lock);

	error = native_to_linux_itimerspec(&lots, &ots);
	if (error == 0)
		error = copyout(&lots, args->old_value, sizeof(lots));

out:
	fdrop(fp, td);
	return (error);
}
1221
/*
 * timerfd_settime(2): (re)arm or disarm the timer.  The stored
 * tfd_time.it_value is kept as an absolute time on the timerfd's
 * clock; the callout is scheduled with the relative delta.  When
 * requested, the previous timer value is returned via old_value.
 */
int
linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args)
{
	struct l_itimerspec lots;
	struct itimerspec nts, ots;
	struct timespec cts, ts;
	cap_rights_t rights;
	struct timerfd *tfd;
	struct timeval tv;
	struct file *fp;
	int error;

	if ((args->flags & ~LINUX_TFD_SETTIME_FLAGS) != 0)
		return (EINVAL);

	error = copyin(args->new_value, &lots, sizeof(lots));
	if (error != 0)
		return (error);
	error = linux_to_native_itimerspec(&nts, &lots);
	if (error != 0)
		return (error);

	error = fget(td, args->fd, cap_rights_init(&rights, CAP_WRITE), &fp);
	if (error != 0)
		return (error);
	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
		error = EINVAL;
		goto out;
	}

	mtx_lock(&tfd->tfd_lock);
	/* A zero it_value disarms; an interval without a value is moot. */
	if (!timespecisset(&nts.it_value))
		timespecclear(&nts.it_interval);
	if (args->old_value != NULL)
		linux_timerfd_curval(tfd, &ots);

	tfd->tfd_time = nts;
	if (timespecisset(&nts.it_value)) {
		linux_timerfd_clocktime(tfd, &cts);
		ts = nts.it_value;
		if ((args->flags & LINUX_TFD_TIMER_ABSTIME) == 0) {
			/*
			 * Relative arm: store the absolute deadline
			 * (it_value += now); 'ts' stays relative for
			 * the callout.  Two-argument kernel macros.
			 */
			timespecadd(&tfd->tfd_time.it_value, &cts);
		} else {
			/* Absolute arm: make 'ts' relative (ts -= now). */
			timespecsub(&ts, &cts);
		}
		TIMESPEC_TO_TIMEVAL(&tv, &ts);
		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
		    linux_timerfd_expire, tfd);
		tfd->tfd_canceled = false;
	} else {
		/* Disarm: mark canceled so pending readers see it. */
		tfd->tfd_canceled = true;
		callout_stop(&tfd->tfd_callout);
	}
	mtx_unlock(&tfd->tfd_lock);

	if (args->old_value != NULL) {
		error = native_to_linux_itimerspec(&lots, &ots);
		if (error == 0)
			error = copyout(&lots, args->old_value, sizeof(lots));
	}

out:
	fdrop(fp, td);
	return (error);
}
1288
/*
 * Callout handler: fires when the timerfd's deadline passes.  Runs
 * with tfd_lock held (callout_init_mtx).  Bumps the expiration count,
 * wakes readers/pollers, and re-schedules itself for periodic timers.
 * If the clock has not actually reached the deadline yet (e.g. the
 * callout fired early in tick terms), just rearm for the remainder.
 */
static void
linux_timerfd_expire(void *arg)
{
	struct timespec cts, ts;
	struct timeval tv;
	struct timerfd *tfd;

	tfd = (struct timerfd *)arg;

	linux_timerfd_clocktime(tfd, &cts);
	if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) {
		if (timespecisset(&tfd->tfd_time.it_interval))
			/* Periodic: advance the absolute deadline. */
			timespecadd(&tfd->tfd_time.it_value,
			    &tfd->tfd_time.it_interval);
		else
			/* single shot timer */
			timespecclear(&tfd->tfd_time.it_value);
		if (timespecisset(&tfd->tfd_time.it_value)) {
			ts = tfd->tfd_time.it_value;
			timespecsub(&ts, &cts);
			TIMESPEC_TO_TIMEVAL(&tv, &ts);
			callout_reset(&tfd->tfd_callout, tvtohz(&tv),
			    linux_timerfd_expire, tfd);
		}
		tfd->tfd_count++;
		/* Notify readers, pollers, and kevent listeners. */
		KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0);
		selwakeup(&tfd->tfd_sel);
		wakeup(&tfd->tfd_count);
	} else if (timespecisset(&tfd->tfd_time.it_value)) {
		/* Premature wakeup: rearm for the remaining time. */
		ts = tfd->tfd_time.it_value;
		timespecsub(&ts, &cts);
		TIMESPEC_TO_TIMEVAL(&tv, &ts);
		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
		    linux_timerfd_expire, tfd);
	}
}