FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_event.c
1 /*-
2 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
3 * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/kernel.h>
34 #include <sys/lock.h>
35 #include <sys/mutex.h>
36 #include <sys/proc.h>
37 #include <sys/malloc.h>
38 #include <sys/unistd.h>
39 #include <sys/file.h>
40 #include <sys/filedesc.h>
41 #include <sys/filio.h>
42 #include <sys/fcntl.h>
43 #include <sys/kthread.h>
44 #include <sys/selinfo.h>
45 #include <sys/queue.h>
46 #include <sys/event.h>
47 #include <sys/eventvar.h>
48 #include <sys/poll.h>
49 #include <sys/protosw.h>
50 #include <sys/sigio.h>
51 #include <sys/signalvar.h>
52 #include <sys/socket.h>
53 #include <sys/socketvar.h>
54 #include <sys/stat.h>
55 #include <sys/sysctl.h>
56 #include <sys/sysproto.h>
57 #include <sys/syscallsubr.h>
58 #include <sys/taskqueue.h>
59 #include <sys/uio.h>
60
61 #include <vm/uma.h>
62
63 MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
64 /*
65 * This lock is used if multiple kq locks are required. This possibly
66 * should be made into a per proc lock.
67 */
68 static struct mtx kq_global;
69 MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
70 #define KQ_GLOBAL_LOCK(lck, haslck) do { \
71 if (!haslck) \
72 mtx_lock(lck); \
73 haslck = 1; \
74 } while (0)
75 #define KQ_GLOBAL_UNLOCK(lck, haslck) do { \
76 if (haslck) \
77 mtx_unlock(lck); \
78 haslck = 0; \
79 } while (0)
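/*
 * Callers keep a local "haslck" flag, initialized to 0, so that
 * KQ_GLOBAL_LOCK can be taken lazily, only once the global ordering
 * lock turns out to be needed (e.g. when one kqueue watches another),
 * while KQ_GLOBAL_UNLOCK may be called unconditionally on every exit
 * path; the flag makes both macros safe to invoke repeatedly.
 */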
80
81 TASKQUEUE_DEFINE_THREAD(kqueue);
82
83 static int kevent_copyout(void *arg, struct kevent *kevp, int count);
84 static int kevent_copyin(void *arg, struct kevent *kevp, int count);
85 static int kqueue_aquire(struct file *fp, struct kqueue **kqp);
86 static void kqueue_release(struct kqueue *kq, int locked);
87 static int kqueue_expand(struct kqueue *kq, struct filterops *fops,
88 uintptr_t ident, int waitok);
89 static void kqueue_task(void *arg, int pending);
90 static int kqueue_scan(struct kqueue *kq, int maxevents,
91 struct kevent_copyops *k_ops,
92 const struct timespec *timeout,
93 struct kevent *keva, struct thread *td);
94 static void kqueue_wakeup(struct kqueue *kq);
95 static struct filterops *kqueue_fo_find(int filt);
96 static void kqueue_fo_release(int filt);
97
98 static fo_rdwr_t kqueue_read;
99 static fo_rdwr_t kqueue_write;
100 static fo_ioctl_t kqueue_ioctl;
101 static fo_poll_t kqueue_poll;
102 static fo_kqfilter_t kqueue_kqfilter;
103 static fo_stat_t kqueue_stat;
104 static fo_close_t kqueue_close;
105
106 static struct fileops kqueueops = {
107 .fo_read = kqueue_read,
108 .fo_write = kqueue_write,
109 .fo_ioctl = kqueue_ioctl,
110 .fo_poll = kqueue_poll,
111 .fo_kqfilter = kqueue_kqfilter,
112 .fo_stat = kqueue_stat,
113 .fo_close = kqueue_close,
114 };
115
116 static int knote_attach(struct knote *kn, struct kqueue *kq);
117 static void knote_drop(struct knote *kn, struct thread *td);
118 static void knote_enqueue(struct knote *kn);
119 static void knote_dequeue(struct knote *kn);
120 static void knote_init(void);
121 static struct knote *knote_alloc(int waitok);
122 static void knote_free(struct knote *kn);
123
124 static void filt_kqdetach(struct knote *kn);
125 static int filt_kqueue(struct knote *kn, long hint);
126 static int filt_procattach(struct knote *kn);
127 static void filt_procdetach(struct knote *kn);
128 static int filt_proc(struct knote *kn, long hint);
129 static int filt_fileattach(struct knote *kn);
130 static void filt_timerexpire(void *knx);
131 static int filt_timerattach(struct knote *kn);
132 static void filt_timerdetach(struct knote *kn);
133 static int filt_timer(struct knote *kn, long hint);
134
135 static struct filterops file_filtops =
136 { 1, filt_fileattach, NULL, NULL };
137 static struct filterops kqread_filtops =
138 { 1, NULL, filt_kqdetach, filt_kqueue };
139 /* XXX - move to kern_proc.c? */
140 static struct filterops proc_filtops =
141 { 0, filt_procattach, filt_procdetach, filt_proc };
142 static struct filterops timer_filtops =
143 { 0, filt_timerattach, filt_timerdetach, filt_timer };
144
145 static uma_zone_t knote_zone;
146 static int kq_ncallouts = 0;
147 static int kq_calloutmax = (4 * 1024);
148 SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
149 &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
150
151 /* XXX - ensure not KN_INFLUX?? */
152 #define KNOTE_ACTIVATE(kn, islock) do { \
153 if ((islock)) \
154 mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \
155 else \
156 KQ_LOCK((kn)->kn_kq); \
157 (kn)->kn_status |= KN_ACTIVE; \
158 if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \
159 knote_enqueue((kn)); \
160 if (!(islock)) \
161 KQ_UNLOCK((kn)->kn_kq); \
162 } while(0)
163 #define KQ_LOCK(kq) do { \
164 mtx_lock(&(kq)->kq_lock); \
165 } while (0)
166 #define KQ_FLUX_WAKEUP(kq) do { \
167 if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \
168 (kq)->kq_state &= ~KQ_FLUXWAIT; \
169 wakeup((kq)); \
170 } \
171 } while (0)
172 #define KQ_UNLOCK_FLUX(kq) do { \
173 KQ_FLUX_WAKEUP(kq); \
174 mtx_unlock(&(kq)->kq_lock); \
175 } while (0)
176 #define KQ_UNLOCK(kq) do { \
177 mtx_unlock(&(kq)->kq_lock); \
178 } while (0)
179 #define KQ_OWNED(kq) do { \
180 mtx_assert(&(kq)->kq_lock, MA_OWNED); \
181 } while (0)
182 #define KQ_NOTOWNED(kq) do { \
183 mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \
184 } while (0)
185 #define KN_LIST_LOCK(kn) do { \
186 if (kn->kn_knlist != NULL) \
187 mtx_lock(kn->kn_knlist->kl_lock); \
188 } while (0)
189 #define KN_LIST_UNLOCK(kn) do { \
190 if (kn->kn_knlist != NULL) \
191 mtx_unlock(kn->kn_knlist->kl_lock); \
192 } while (0)
193
194 #define KN_HASHSIZE 64 /* XXX should be tunable */
195 #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
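/*
 * For example, a non-fd ident of 300 (0x12c) with the default 64-bucket
 * hash (mask 0x3f) lands in bucket (0x12c ^ 0x1) & 0x3f == 45.
 */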
196
197 static int
198 filt_nullattach(struct knote *kn)
199 {
200
201 return (ENXIO);
202 };
203
204 struct filterops null_filtops =
205 { 0, filt_nullattach, NULL, NULL };
206
207 /* XXX - make SYSINIT to add these, and move into respective modules. */
208 extern struct filterops sig_filtops;
209 extern struct filterops fs_filtops;
210
211 /*
212  * Table for all system-defined filters.
213 */
214 static struct mtx filterops_lock;
215 MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
216 MTX_DEF);
217 static struct {
218 struct filterops *for_fop;
219 int for_refcnt;
220 } sysfilt_ops[EVFILT_SYSCOUNT] = {
221 { &file_filtops }, /* EVFILT_READ */
222 { &file_filtops }, /* EVFILT_WRITE */
223 { &null_filtops }, /* EVFILT_AIO */
224 { &file_filtops }, /* EVFILT_VNODE */
225 { &proc_filtops }, /* EVFILT_PROC */
226 { &sig_filtops }, /* EVFILT_SIGNAL */
227 { &timer_filtops }, /* EVFILT_TIMER */
228 { &file_filtops }, /* EVFILT_NETDEV */
229 { &fs_filtops }, /* EVFILT_FS */
230 };
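/*
 * Filter identifiers are small negative numbers (EVFILT_READ is -1,
 * EVFILT_WRITE is -2, and so on), so sysfilt_ops above, and the helpers
 * below that manage it, index the table with ~filt: ~(-1) == 0,
 * ~(-2) == 1, etc.
 */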
231
232 /*
233 * Simple redirection for all cdevsw style objects to call their fo_kqfilter
234 * method.
235 */
236 static int
237 filt_fileattach(struct knote *kn)
238 {
239
240 return (fo_kqfilter(kn->kn_fp, kn));
241 }
242
243 /*ARGSUSED*/
244 static int
245 kqueue_kqfilter(struct file *fp, struct knote *kn)
246 {
247 struct kqueue *kq = kn->kn_fp->f_data;
248
249 if (kn->kn_filter != EVFILT_READ)
250 return (EINVAL);
251
252 kn->kn_status |= KN_KQUEUE;
253 kn->kn_fop = &kqread_filtops;
254 knlist_add(&kq->kq_sel.si_note, kn, 0);
255
256 return (0);
257 }
258
259 static void
260 filt_kqdetach(struct knote *kn)
261 {
262 struct kqueue *kq = kn->kn_fp->f_data;
263
264 knlist_remove(&kq->kq_sel.si_note, kn, 0);
265 }
266
267 /*ARGSUSED*/
268 static int
269 filt_kqueue(struct knote *kn, long hint)
270 {
271 struct kqueue *kq = kn->kn_fp->f_data;
272
273 kn->kn_data = kq->kq_count;
274 return (kn->kn_data > 0);
275 }
276
277 /* XXX - move to kern_proc.c? */
278 static int
279 filt_procattach(struct knote *kn)
280 {
281 struct proc *p;
282 int immediate;
283 int error;
284
285 immediate = 0;
286 p = pfind(kn->kn_id);
287 if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
288 p = zpfind(kn->kn_id);
289 immediate = 1;
290 } else if (p != NULL && (p->p_flag & P_WEXIT)) {
291 immediate = 1;
292 }
293
294 if (p == NULL)
295 return (ESRCH);
296 if ((error = p_cansee(curthread, p)))
297 return (error);
298
299 kn->kn_ptr.p_proc = p;
300 kn->kn_flags |= EV_CLEAR; /* automatically set */
301
302 /*
303 * internal flag indicating registration done by kernel
304 */
305 if (kn->kn_flags & EV_FLAG1) {
306 kn->kn_data = kn->kn_sdata; /* ppid */
307 kn->kn_fflags = NOTE_CHILD;
308 kn->kn_flags &= ~EV_FLAG1;
309 }
310
311 if (immediate == 0)
312 knlist_add(&p->p_klist, kn, 1);
313
314 /*
315 * Immediately activate any exit notes if the target process is a
316 * zombie. This is necessary to handle the case where the target
317 * process, e.g. a child, dies before the kevent is registered.
318 */
319 if (immediate && filt_proc(kn, NOTE_EXIT))
320 KNOTE_ACTIVATE(kn, 0);
321
322 PROC_UNLOCK(p);
323
324 return (0);
325 }
326
327 /*
328 * The knote may be attached to a different process, which may exit,
329 * leaving nothing for the knote to be attached to. So when the process
330 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
331 * it will be deleted when read out. However, as part of the knote deletion,
332 * this routine is called, so a check is needed to avoid actually performing
333 * a detach, because the original process does not exist any more.
334 */
335 /* XXX - move to kern_proc.c? */
336 static void
337 filt_procdetach(struct knote *kn)
338 {
339 struct proc *p;
340
341 p = kn->kn_ptr.p_proc;
342 knlist_remove(&p->p_klist, kn, 0);
343 kn->kn_ptr.p_proc = NULL;
344 }
345
346 /* XXX - move to kern_proc.c? */
347 static int
348 filt_proc(struct knote *kn, long hint)
349 {
350 struct proc *p = kn->kn_ptr.p_proc;
351 u_int event;
352
353 /*
354 * mask off extra data
355 */
356 event = (u_int)hint & NOTE_PCTRLMASK;
357
358 /*
359 * if the user is interested in this event, record it.
360 */
361 if (kn->kn_sfflags & event)
362 kn->kn_fflags |= event;
363
364 /*
365 * process is gone, so flag the event as finished.
366 */
367 if (event == NOTE_EXIT) {
368 if (!(kn->kn_status & KN_DETACHED))
369 knlist_remove_inevent(&p->p_klist, kn);
370 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
371 kn->kn_ptr.p_proc = NULL;
372 return (1);
373 }
374
375 /*
376 * process forked, and user wants to track the new process,
377 * so attach a new knote to it, and immediately report an
378 * event with the parent's pid.
379 */
380 if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
381 struct kevent kev;
382 int error;
383
384 /*
385 * register knote with new process.
386 */
387 kev.ident = hint & NOTE_PDATAMASK; /* pid */
388 kev.filter = kn->kn_filter;
389 kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
390 kev.fflags = kn->kn_sfflags;
391 kev.data = kn->kn_id; /* parent */
392 kev.udata = kn->kn_kevent.udata; /* preserve udata */
393 error = kqueue_register(kn->kn_kq, &kev, NULL, 0);
394 if (error)
395 kn->kn_fflags |= NOTE_TRACKERR;
396 }
397
398 return (kn->kn_fflags != 0);
399 }
400
401 static int
402 timertoticks(intptr_t data)
403 {
404 struct timeval tv;
405 int tticks;
406
407 tv.tv_sec = data / 1000;
408 tv.tv_usec = (data % 1000) * 1000;
409 tticks = tvtohz(&tv);
410
411 return tticks;
412 }
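/*
 * For example, kn_sdata == 2500 (ms) becomes tv = { 2, 500000 }, which
 * tvtohz() converts to roughly 2.5 * hz ticks (plus a tick of rounding),
 * independent of the configured hz.
 */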
413
414 /* XXX - move to kern_timeout.c? */
415 static void
416 filt_timerexpire(void *knx)
417 {
418 struct knote *kn = knx;
419 struct callout *calloutp;
420
421 kn->kn_data++;
422 KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */
423
424 if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) {
425 calloutp = (struct callout *)kn->kn_hook;
426 callout_reset(calloutp, timertoticks(kn->kn_sdata),
427 filt_timerexpire, kn);
428 }
429 }
430
431 /*
432 * data contains amount of time to sleep, in milliseconds
433 */
434 /* XXX - move to kern_timeout.c? */
435 static int
436 filt_timerattach(struct knote *kn)
437 {
438 struct callout *calloutp;
439
440 atomic_add_int(&kq_ncallouts, 1);
441
442 if (kq_ncallouts >= kq_calloutmax) {
443 atomic_add_int(&kq_ncallouts, -1);
444 return (ENOMEM);
445 }
446
447 kn->kn_flags |= EV_CLEAR; /* automatically set */
448 kn->kn_status &= ~KN_DETACHED; /* knlist_add usually sets it */
449 MALLOC(calloutp, struct callout *, sizeof(*calloutp),
450 M_KQUEUE, M_WAITOK);
451 callout_init(calloutp, CALLOUT_MPSAFE);
452 kn->kn_hook = calloutp;
453 callout_reset(calloutp, timertoticks(kn->kn_sdata), filt_timerexpire,
454 kn);
455
456 return (0);
457 }
458
459 /* XXX - move to kern_timeout.c? */
460 static void
461 filt_timerdetach(struct knote *kn)
462 {
463 struct callout *calloutp;
464
465 calloutp = (struct callout *)kn->kn_hook;
466 callout_drain(calloutp);
467 FREE(calloutp, M_KQUEUE);
468 atomic_add_int(&kq_ncallouts, -1);
469 kn->kn_status |= KN_DETACHED; /* knlist_remove usually clears it */
470 }
471
472 /* XXX - move to kern_timeout.c? */
473 static int
474 filt_timer(struct knote *kn, long hint)
475 {
476
477 return (kn->kn_data != 0);
478 }
479
480 /*
481 * MPSAFE
482 */
483 int
484 kqueue(struct thread *td, struct kqueue_args *uap)
485 {
486 struct filedesc *fdp;
487 struct kqueue *kq;
488 struct file *fp;
489 int fd, error;
490
491 fdp = td->td_proc->p_fd;
492 error = falloc(td, &fp, &fd);
493 if (error)
494 goto done2;
495
496 	/* An extra reference on `fp' has been held for us by falloc(). */
497 kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
498 mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
499 TAILQ_INIT(&kq->kq_head);
500 kq->kq_fdp = fdp;
501 knlist_init(&kq->kq_sel.si_note, &kq->kq_lock);
502 TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
503
504 FILEDESC_LOCK_FAST(fdp);
505 SLIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
506 FILEDESC_UNLOCK_FAST(fdp);
507
508 FILE_LOCK(fp);
509 fp->f_flag = FREAD | FWRITE;
510 fp->f_type = DTYPE_KQUEUE;
511 fp->f_ops = &kqueueops;
512 fp->f_data = kq;
513 FILE_UNLOCK(fp);
514 fdrop(fp, td);
515
516 td->td_retval[0] = fd;
517 done2:
518 return (error);
519 }
520
521 #ifndef _SYS_SYSPROTO_H_
522 struct kevent_args {
523 int fd;
524 const struct kevent *changelist;
525 int nchanges;
526 struct kevent *eventlist;
527 int nevents;
528 const struct timespec *timeout;
529 };
530 #endif
531 /*
532 * MPSAFE
533 */
534 int
535 kevent(struct thread *td, struct kevent_args *uap)
536 {
537 struct timespec ts, *tsp;
538 struct kevent_copyops k_ops = { uap,
539 kevent_copyout,
540 kevent_copyin};
541 int error;
542
543 if (uap->timeout != NULL) {
544 error = copyin(uap->timeout, &ts, sizeof(ts));
545 if (error)
546 return (error);
547 tsp = &ts;
548 } else
549 tsp = NULL;
550
551 return (kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
552 &k_ops, tsp));
553 }
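/*
 * A minimal userland sketch of the interface implemented above; the
 * descriptor "fd" and buffer "buf" are illustrative only and error
 * handling is omitted:
 *
 *	struct kevent change, ev;
 *	int kq, n;
 *
 *	kq = kqueue();
 *	EV_SET(&change, fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL);
 *	n = kevent(kq, &change, 1, NULL, 0, NULL);   - register the change
 *	n = kevent(kq, NULL, 0, &ev, 1, NULL);       - wait for one event
 *	if (n == 1 && ev.filter == EVFILT_READ)
 *		read(fd, buf, ev.data);              - ev.data: bytes readable
 */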
554
555 /*
556 * Copy 'count' items into the destination list pointed to by uap->eventlist.
557 */
558 static int
559 kevent_copyout(void *arg, struct kevent *kevp, int count)
560 {
561 struct kevent_args *uap;
562 int error;
563
564 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
565 uap = (struct kevent_args *)arg;
566
567 error = copyout(kevp, uap->eventlist, count * sizeof *kevp);
568 if (error == 0)
569 uap->eventlist += count;
570 return (error);
571 }
572
573 /*
574 * Copy 'count' items from the list pointed to by uap->changelist.
575 */
576 static int
577 kevent_copyin(void *arg, struct kevent *kevp, int count)
578 {
579 struct kevent_args *uap;
580 int error;
581
582 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
583 uap = (struct kevent_args *)arg;
584
585 error = copyin(uap->changelist, kevp, count * sizeof *kevp);
586 if (error == 0)
587 uap->changelist += count;
588 return (error);
589 }
590
591 int
592 kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
593 struct kevent_copyops *k_ops, const struct timespec *timeout)
594 {
595 struct kevent keva[KQ_NEVENTS];
596 struct kevent *kevp, *changes;
597 struct kqueue *kq;
598 struct file *fp;
599 int i, n, nerrors, error;
600
601 if ((error = fget(td, fd, &fp)) != 0)
602 return (error);
603 if ((error = kqueue_aquire(fp, &kq)) != 0)
604 goto done_norel;
605
606 nerrors = 0;
607
608 while (nchanges > 0) {
609 n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
610 error = k_ops->k_copyin(k_ops->arg, keva, n);
611 if (error)
612 goto done;
613 changes = keva;
614 for (i = 0; i < n; i++) {
615 kevp = &changes[i];
616 kevp->flags &= ~EV_SYSFLAGS;
617 error = kqueue_register(kq, kevp, td, 1);
618 if (error) {
619 if (nevents != 0) {
620 kevp->flags = EV_ERROR;
621 kevp->data = error;
622 (void) k_ops->k_copyout(k_ops->arg,
623 kevp, 1);
624 nevents--;
625 nerrors++;
626 } else {
627 goto done;
628 }
629 }
630 }
631 nchanges -= n;
632 }
633 if (nerrors) {
634 td->td_retval[0] = nerrors;
635 error = 0;
636 goto done;
637 }
638
639 error = kqueue_scan(kq, nevents, k_ops, timeout, keva, td);
640 done:
641 kqueue_release(kq, 0);
642 done_norel:
643 if (fp != NULL)
644 fdrop(fp, td);
645 return (error);
646 }
647
648 int
649 kqueue_add_filteropts(int filt, struct filterops *filtops)
650 {
651 	int error = 0;
652
653 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
654 printf(
655 "trying to add a filterop that is out of range: %d is beyond %d\n",
656 ~filt, EVFILT_SYSCOUNT);
657 return EINVAL;
658 }
659 mtx_lock(&filterops_lock);
660 if (sysfilt_ops[~filt].for_fop != &null_filtops &&
661 sysfilt_ops[~filt].for_fop != NULL)
662 error = EEXIST;
663 else {
664 sysfilt_ops[~filt].for_fop = filtops;
665 sysfilt_ops[~filt].for_refcnt = 0;
666 }
667 mtx_unlock(&filterops_lock);
668
669 	return (error);
670 }
671
672 int
673 kqueue_del_filteropts(int filt)
674 {
675 int error;
676
677 error = 0;
678 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
679 return EINVAL;
680
681 mtx_lock(&filterops_lock);
682 if (sysfilt_ops[~filt].for_fop == &null_filtops ||
683 sysfilt_ops[~filt].for_fop == NULL)
684 error = EINVAL;
685 else if (sysfilt_ops[~filt].for_refcnt != 0)
686 error = EBUSY;
687 else {
688 sysfilt_ops[~filt].for_fop = &null_filtops;
689 sysfilt_ops[~filt].for_refcnt = 0;
690 }
691 mtx_unlock(&filterops_lock);
692
693 return error;
694 }
695
696 static struct filterops *
697 kqueue_fo_find(int filt)
698 {
699
700 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
701 return NULL;
702
703 mtx_lock(&filterops_lock);
704 sysfilt_ops[~filt].for_refcnt++;
705 if (sysfilt_ops[~filt].for_fop == NULL)
706 sysfilt_ops[~filt].for_fop = &null_filtops;
707 mtx_unlock(&filterops_lock);
708
709 return sysfilt_ops[~filt].for_fop;
710 }
711
712 static void
713 kqueue_fo_release(int filt)
714 {
715
716 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
717 return;
718
719 mtx_lock(&filterops_lock);
720 KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
721 ("filter object refcount not valid on release"));
722 sysfilt_ops[~filt].for_refcnt--;
723 mtx_unlock(&filterops_lock);
724 }
725
726 /*
727 * A ref to kq (obtained via kqueue_aquire) should be held. waitok will
728  * influence whether memory allocation should wait.  Make sure it is 0 if you
729 * hold any mutexes.
730 */
731 int
732 kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok)
733 {
734 struct filedesc *fdp;
735 struct filterops *fops;
736 struct file *fp;
737 struct knote *kn, *tkn;
738 int error, filt, event;
739 int haskqglobal;
740 int fd;
741
742 fdp = NULL;
743 fp = NULL;
744 kn = NULL;
745 error = 0;
746 haskqglobal = 0;
747
748 filt = kev->filter;
749 fops = kqueue_fo_find(filt);
750 if (fops == NULL)
751 return EINVAL;
752
753 tkn = knote_alloc(waitok); /* prevent waiting with locks */
754
755 findkn:
756 if (fops->f_isfd) {
757 KASSERT(td != NULL, ("td is NULL"));
758 fdp = td->td_proc->p_fd;
759 FILEDESC_LOCK(fdp);
760 /* validate descriptor */
761 fd = kev->ident;
762 if (fd < 0 || fd >= fdp->fd_nfiles ||
763 (fp = fdp->fd_ofiles[fd]) == NULL) {
764 FILEDESC_UNLOCK(fdp);
765 error = EBADF;
766 goto done;
767 }
768 fhold(fp);
769
770 if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
771 kev->ident, 0) != 0) {
772 /* unlock and try again */
773 FILEDESC_UNLOCK(fdp);
774 fdrop(fp, td);
775 fp = NULL;
776 error = kqueue_expand(kq, fops, kev->ident, waitok);
777 if (error)
778 goto done;
779 goto findkn;
780 }
781
782 if (fp->f_type == DTYPE_KQUEUE) {
783 /*
784 			 * if we add some intelligence about what we are doing,
785 * we should be able to support events on ourselves.
786 * We need to know when we are doing this to prevent
787 * getting both the knlist lock and the kq lock since
788 * they are the same thing.
789 */
790 if (fp->f_data == kq) {
791 FILEDESC_UNLOCK(fdp);
792 error = EINVAL;
793 goto done_noglobal;
794 }
795
796 KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
797 }
798
799 FILEDESC_UNLOCK(fdp);
800 KQ_LOCK(kq);
801 if (kev->ident < kq->kq_knlistsize) {
802 SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
803 if (kev->filter == kn->kn_filter)
804 break;
805 }
806 } else {
807 if ((kev->flags & EV_ADD) == EV_ADD)
808 kqueue_expand(kq, fops, kev->ident, waitok);
809
810 KQ_LOCK(kq);
811 if (kq->kq_knhashmask != 0) {
812 struct klist *list;
813
814 list = &kq->kq_knhash[
815 KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
816 SLIST_FOREACH(kn, list, kn_link)
817 if (kev->ident == kn->kn_id &&
818 kev->filter == kn->kn_filter)
819 break;
820 }
821 }
822
823 	/* knote is in the process of changing, wait for it to stabilize. */
824 if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
825 if (fp != NULL) {
826 fdrop(fp, td);
827 fp = NULL;
828 }
829 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
830 kq->kq_state |= KQ_FLUXWAIT;
831 msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
832 goto findkn;
833 }
834
835 if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
836 KQ_UNLOCK(kq);
837 error = ENOENT;
838 goto done;
839 }
840
841 /*
842 * kn now contains the matching knote, or NULL if no match
843 */
844 if (kev->flags & EV_ADD) {
845 if (kn == NULL) {
846 kn = tkn;
847 tkn = NULL;
848 if (kn == NULL) {
849 error = ENOMEM;
850 goto done;
851 }
852 kn->kn_fp = fp;
853 kn->kn_kq = kq;
854 kn->kn_fop = fops;
855 /*
856 * apply reference counts to knote structure, and
857 * do not release it at the end of this routine.
858 */
859 fops = NULL;
860 fp = NULL;
861
862 kn->kn_sfflags = kev->fflags;
863 kn->kn_sdata = kev->data;
864 kev->fflags = 0;
865 kev->data = 0;
866 kn->kn_kevent = *kev;
867 kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
868 EV_ENABLE | EV_DISABLE);
869 kn->kn_status = KN_INFLUX|KN_DETACHED;
870
871 error = knote_attach(kn, kq);
872 KQ_UNLOCK(kq);
873 if (error != 0) {
874 tkn = kn;
875 goto done;
876 }
877
878 if ((error = kn->kn_fop->f_attach(kn)) != 0) {
879 knote_drop(kn, td);
880 goto done;
881 }
882 KN_LIST_LOCK(kn);
883 } else {
884 /*
885 * The user may change some filter values after the
886 * initial EV_ADD, but doing so will not reset any
887 * filter which has already been triggered.
888 */
889 kn->kn_status |= KN_INFLUX;
890 KQ_UNLOCK(kq);
891 KN_LIST_LOCK(kn);
892 kn->kn_sfflags = kev->fflags;
893 kn->kn_sdata = kev->data;
894 kn->kn_kevent.udata = kev->udata;
895 }
896
897 /*
898 * We can get here with kn->kn_knlist == NULL.
899 * This can happen when the initial attach event decides that
900 		 * the event is "completed" already, e.g. filt_procattach
901 * is called on a zombie process. It will call filt_proc
902 * which will remove it from the list, and NULL kn_knlist.
903 */
904 event = kn->kn_fop->f_event(kn, 0);
905 KQ_LOCK(kq);
906 if (event)
907 KNOTE_ACTIVATE(kn, 1);
908 kn->kn_status &= ~KN_INFLUX;
909 KN_LIST_UNLOCK(kn);
910 } else if (kev->flags & EV_DELETE) {
911 kn->kn_status |= KN_INFLUX;
912 KQ_UNLOCK(kq);
913 if (!(kn->kn_status & KN_DETACHED))
914 kn->kn_fop->f_detach(kn);
915 knote_drop(kn, td);
916 goto done;
917 }
918
919 if ((kev->flags & EV_DISABLE) &&
920 ((kn->kn_status & KN_DISABLED) == 0)) {
921 kn->kn_status |= KN_DISABLED;
922 }
923
924 if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
925 kn->kn_status &= ~KN_DISABLED;
926 if ((kn->kn_status & KN_ACTIVE) &&
927 ((kn->kn_status & KN_QUEUED) == 0))
928 knote_enqueue(kn);
929 }
930 KQ_UNLOCK_FLUX(kq);
931
932 done:
933 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
934 done_noglobal:
935 if (fp != NULL)
936 fdrop(fp, td);
937 if (tkn != NULL)
938 knote_free(tkn);
939 if (fops != NULL)
940 kqueue_fo_release(filt);
941 return (error);
942 }
943
944 static int
945 kqueue_aquire(struct file *fp, struct kqueue **kqp)
946 {
947 int error;
948 struct kqueue *kq;
949
950 error = 0;
951
952 FILE_LOCK(fp);
953 do {
954 kq = fp->f_data;
955 if (fp->f_type != DTYPE_KQUEUE || kq == NULL) {
956 error = EBADF;
957 break;
958 }
959 *kqp = kq;
960 KQ_LOCK(kq);
961 if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
962 KQ_UNLOCK(kq);
963 error = EBADF;
964 break;
965 }
966 kq->kq_refcnt++;
967 KQ_UNLOCK(kq);
968 } while (0);
969 FILE_UNLOCK(fp);
970
971 return error;
972 }
973
974 static void
975 kqueue_release(struct kqueue *kq, int locked)
976 {
977 if (locked)
978 KQ_OWNED(kq);
979 else
980 KQ_LOCK(kq);
981 kq->kq_refcnt--;
982 if (kq->kq_refcnt == 1)
983 wakeup(&kq->kq_refcnt);
984 if (!locked)
985 KQ_UNLOCK(kq);
986 }
987
988 static void
989 kqueue_schedtask(struct kqueue *kq)
990 {
991
992 KQ_OWNED(kq);
993 KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
994 ("scheduling kqueue task while draining"));
995
996 if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
997 taskqueue_enqueue(taskqueue_kqueue, &kq->kq_task);
998 kq->kq_state |= KQ_TASKSCHED;
999 }
1000 }
1001
1002 /*
1003 * Expand the kq to make sure we have storage for fops/ident pair.
1004 *
1005 * Return 0 on success (or no work necessary), return errno on failure.
1006 *
1007 * Not calling hashinit w/ waitok (proper malloc flag) should be safe.
1008 * If kqueue_register is called from a non-fd context, there usually/should
1009 * be no locks held.
1010 */
1011 static int
1012 kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident,
1013 int waitok)
1014 {
1015 struct klist *list, *tmp_knhash;
1016 u_long tmp_knhashmask;
1017 int size;
1018 int fd;
1019 int mflag = waitok ? M_WAITOK : M_NOWAIT;
1020
1021 KQ_NOTOWNED(kq);
1022
1023 if (fops->f_isfd) {
1024 fd = ident;
1025 if (kq->kq_knlistsize <= fd) {
1026 size = kq->kq_knlistsize;
1027 while (size <= fd)
1028 size += KQEXTENT;
1029 MALLOC(list, struct klist *,
1030 size * sizeof list, M_KQUEUE, mflag);
1031 if (list == NULL)
1032 return ENOMEM;
1033 KQ_LOCK(kq);
1034 if (kq->kq_knlistsize > fd) {
1035 FREE(list, M_KQUEUE);
1036 list = NULL;
1037 } else {
1038 if (kq->kq_knlist != NULL) {
1039 bcopy(kq->kq_knlist, list,
1040 kq->kq_knlistsize * sizeof list);
1041 FREE(kq->kq_knlist, M_KQUEUE);
1042 kq->kq_knlist = NULL;
1043 }
1044 bzero((caddr_t)list +
1045 kq->kq_knlistsize * sizeof list,
1046 (size - kq->kq_knlistsize) * sizeof list);
1047 kq->kq_knlistsize = size;
1048 kq->kq_knlist = list;
1049 }
1050 KQ_UNLOCK(kq);
1051 }
1052 } else {
1053 if (kq->kq_knhashmask == 0) {
1054 tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
1055 &tmp_knhashmask);
1056 if (tmp_knhash == NULL)
1057 return ENOMEM;
1058 KQ_LOCK(kq);
1059 if (kq->kq_knhashmask == 0) {
1060 kq->kq_knhash = tmp_knhash;
1061 kq->kq_knhashmask = tmp_knhashmask;
1062 } else {
1063 free(tmp_knhash, M_KQUEUE);
1064 }
1065 KQ_UNLOCK(kq);
1066 }
1067 }
1068
1069 KQ_NOTOWNED(kq);
1070 return 0;
1071 }
1072
1073 static void
1074 kqueue_task(void *arg, int pending)
1075 {
1076 struct kqueue *kq;
1077 int haskqglobal;
1078
1079 haskqglobal = 0;
1080 kq = arg;
1081
1082 KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1083 KQ_LOCK(kq);
1084
1085 KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
1086
1087 kq->kq_state &= ~KQ_TASKSCHED;
1088 if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
1089 wakeup(&kq->kq_state);
1090 }
1091 KQ_UNLOCK(kq);
1092 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1093 }
1094
1095 /*
1096 * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
1097 * We treat KN_MARKER knotes as if they are INFLUX.
1098 */
1099 static int
1100 kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
1101 const struct timespec *tsp, struct kevent *keva, struct thread *td)
1102 {
1103 struct kevent *kevp;
1104 struct timeval atv, rtv, ttv;
1105 struct knote *kn, *marker;
1106 int count, timeout, nkev, error;
1107 int haskqglobal;
1108
1109 count = maxevents;
1110 nkev = 0;
1111 error = 0;
1112 haskqglobal = 0;
1113
1114 if (maxevents == 0)
1115 goto done_nl;
1116
1117 if (tsp != NULL) {
1118 TIMESPEC_TO_TIMEVAL(&atv, tsp);
1119 if (itimerfix(&atv)) {
1120 error = EINVAL;
1121 goto done_nl;
1122 }
1123 if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
1124 timeout = -1;
1125 else
1126 timeout = atv.tv_sec > 24 * 60 * 60 ?
1127 24 * 60 * 60 * hz : tvtohz(&atv);
1128 getmicrouptime(&rtv);
1129 timevaladd(&atv, &rtv);
1130 } else {
1131 atv.tv_sec = 0;
1132 atv.tv_usec = 0;
1133 timeout = 0;
1134 }
1135 marker = knote_alloc(1);
1136 if (marker == NULL) {
1137 error = ENOMEM;
1138 goto done_nl;
1139 }
1140 marker->kn_status = KN_MARKER;
1141 KQ_LOCK(kq);
1142 goto start;
1143
1144 retry:
1145 if (atv.tv_sec || atv.tv_usec) {
1146 getmicrouptime(&rtv);
1147 if (timevalcmp(&rtv, &atv, >=))
1148 goto done;
1149 ttv = atv;
1150 timevalsub(&ttv, &rtv);
1151 timeout = ttv.tv_sec > 24 * 60 * 60 ?
1152 24 * 60 * 60 * hz : tvtohz(&ttv);
1153 }
1154
1155 start:
1156 kevp = keva;
1157 if (kq->kq_count == 0) {
1158 if (timeout < 0) {
1159 error = EWOULDBLOCK;
1160 } else {
1161 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1162 kq->kq_state |= KQ_SLEEP;
1163 error = msleep(kq, &kq->kq_lock, PSOCK | PCATCH,
1164 "kqread", timeout);
1165 }
1166 if (error == 0)
1167 goto retry;
1168 /* don't restart after signals... */
1169 if (error == ERESTART)
1170 error = EINTR;
1171 else if (error == EWOULDBLOCK)
1172 error = 0;
1173 goto done;
1174 }
1175
1176 TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
1177 while (count) {
1178 KQ_OWNED(kq);
1179 kn = TAILQ_FIRST(&kq->kq_head);
1180
1181 if ((kn->kn_status == KN_MARKER && kn != marker) ||
1182 (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
1183 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1184 kq->kq_state |= KQ_FLUXWAIT;
1185 error = msleep(kq, &kq->kq_lock, PSOCK,
1186 "kqflxwt", 0);
1187 continue;
1188 }
1189
1190 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1191 if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
1192 kn->kn_status &= ~KN_QUEUED;
1193 kq->kq_count--;
1194 continue;
1195 }
1196 if (kn == marker) {
1197 KQ_FLUX_WAKEUP(kq);
1198 if (count == maxevents)
1199 goto retry;
1200 goto done;
1201 }
1202 KASSERT((kn->kn_status & KN_INFLUX) == 0,
1203 		    ("KN_INFLUX set when not supposed to be"));
1204
1205 if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
1206 kn->kn_status &= ~KN_QUEUED;
1207 kn->kn_status |= KN_INFLUX;
1208 kq->kq_count--;
1209 KQ_UNLOCK(kq);
1210 /*
1211 * We don't need to lock the list since we've marked
1212 * it _INFLUX.
1213 */
1214 *kevp = kn->kn_kevent;
1215 if (!(kn->kn_status & KN_DETACHED))
1216 kn->kn_fop->f_detach(kn);
1217 knote_drop(kn, td);
1218 KQ_LOCK(kq);
1219 kn = NULL;
1220 } else {
1221 kn->kn_status |= KN_INFLUX;
1222 KQ_UNLOCK(kq);
1223 if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
1224 KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1225 KN_LIST_LOCK(kn);
1226 if (kn->kn_fop->f_event(kn, 0) == 0) {
1227 KN_LIST_UNLOCK(kn);
1228 KQ_LOCK(kq);
1229 kn->kn_status &=
1230 ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX);
1231 kq->kq_count--;
1232 continue;
1233 }
1234 *kevp = kn->kn_kevent;
1235 KQ_LOCK(kq);
1236 if (kn->kn_flags & EV_CLEAR) {
1237 kn->kn_data = 0;
1238 kn->kn_fflags = 0;
1239 kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
1240 kq->kq_count--;
1241 } else
1242 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1243 KN_LIST_UNLOCK(kn);
1244 kn->kn_status &= ~(KN_INFLUX);
1245 }
1246
1247 /* we are returning a copy to the user */
1248 kevp++;
1249 nkev++;
1250 count--;
1251
1252 if (nkev == KQ_NEVENTS) {
1253 KQ_UNLOCK_FLUX(kq);
1254 error = k_ops->k_copyout(k_ops->arg, keva, nkev);
1255 nkev = 0;
1256 kevp = keva;
1257 KQ_LOCK(kq);
1258 if (error)
1259 break;
1260 }
1261 }
1262 TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
1263 done:
1264 KQ_OWNED(kq);
1265 KQ_UNLOCK_FLUX(kq);
1266 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1267 knote_free(marker);
1268 done_nl:
1269 KQ_NOTOWNED(kq);
1270 if (nkev != 0)
1271 error = k_ops->k_copyout(k_ops->arg, keva, nkev);
1272 td->td_retval[0] = maxevents - count;
1273 return (error);
1274 }
1275
1276 /*
1277 * XXX
1278 * This could be expanded to call kqueue_scan, if desired.
1279 */
1280 /*ARGSUSED*/
1281 static int
1282 kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
1283 int flags, struct thread *td)
1284 {
1285 return (ENXIO);
1286 }
1287
1288 /*ARGSUSED*/
1289 static int
1290 kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
1291 int flags, struct thread *td)
1292 {
1293 return (ENXIO);
1294 }
1295
1296 /*ARGSUSED*/
1297 static int
1298 kqueue_ioctl(struct file *fp, u_long cmd, void *data,
1299 struct ucred *active_cred, struct thread *td)
1300 {
1301 /*
1302 * Enabling sigio causes two major problems:
1303 * 1) infinite recursion:
1304 	 * Synopsis: kevent is being used to track signals and has FIOASYNC
1305 * set. On receipt of a signal this will cause a kqueue to recurse
1306 * into itself over and over. Sending the sigio causes the kqueue
1307 * to become ready, which in turn posts sigio again, forever.
1308 * Solution: this can be solved by setting a flag in the kqueue that
1309 * we have a SIGIO in progress.
1310 * 2) locking problems:
1311 	 * Synopsis: Kqueue is a leaf subsystem, but adding signalling puts
1312 * us above the proc and pgrp locks.
1313 * Solution: Post a signal using an async mechanism, being sure to
1314 * record a generation count in the delivery so that we do not deliver
1315 * a signal to the wrong process.
1316 *
1317 * Note, these two mechanisms are somewhat mutually exclusive!
1318 */
1319 #if 0
1320 struct kqueue *kq;
1321
1322 kq = fp->f_data;
1323 switch (cmd) {
1324 case FIOASYNC:
1325 if (*(int *)data) {
1326 kq->kq_state |= KQ_ASYNC;
1327 } else {
1328 kq->kq_state &= ~KQ_ASYNC;
1329 }
1330 return (0);
1331
1332 case FIOSETOWN:
1333 return (fsetown(*(int *)data, &kq->kq_sigio));
1334
1335 case FIOGETOWN:
1336 *(int *)data = fgetown(&kq->kq_sigio);
1337 return (0);
1338 }
1339 #endif
1340
1341 return (ENOTTY);
1342 }
1343
1344 /*ARGSUSED*/
1345 static int
1346 kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
1347 struct thread *td)
1348 {
1349 struct kqueue *kq;
1350 int revents = 0;
1351 int error;
1352
1353 if ((error = kqueue_aquire(fp, &kq)))
1354 return POLLERR;
1355
1356 KQ_LOCK(kq);
1357 if (events & (POLLIN | POLLRDNORM)) {
1358 if (kq->kq_count) {
1359 revents |= events & (POLLIN | POLLRDNORM);
1360 } else {
1361 selrecord(td, &kq->kq_sel);
1362 kq->kq_state |= KQ_SEL;
1363 }
1364 }
1365 kqueue_release(kq, 1);
1366 KQ_UNLOCK(kq);
1367 return (revents);
1368 }
1369
1370 /*ARGSUSED*/
1371 static int
1372 kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
1373 struct thread *td)
1374 {
1375
1376 bzero((void *)st, sizeof *st);
1377 /*
1378 * We no longer return kq_count because the unlocked value is useless.
1379 * If you spent all this time getting the count, why not spend your
1380 * syscall better by calling kevent?
1381 *
1382 * XXX - This is needed for libc_r.
1383 */
1384 st->st_mode = S_IFIFO;
1385 return (0);
1386 }
1387
1388 /*ARGSUSED*/
1389 static int
1390 kqueue_close(struct file *fp, struct thread *td)
1391 {
1392 struct kqueue *kq = fp->f_data;
1393 struct filedesc *fdp;
1394 struct knote *kn;
1395 int i;
1396 int error;
1397
1398 if ((error = kqueue_aquire(fp, &kq)))
1399 return error;
1400
1401 KQ_LOCK(kq);
1402
1403 KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
1404 ("kqueue already closing"));
1405 kq->kq_state |= KQ_CLOSING;
1406 if (kq->kq_refcnt > 1)
1407 msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);
1408
1409 KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
1410 fdp = kq->kq_fdp;
1411
1412 KASSERT(knlist_empty(&kq->kq_sel.si_note),
1413 ("kqueue's knlist not empty"));
1414
1415 for (i = 0; i < kq->kq_knlistsize; i++) {
1416 while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
1417 KASSERT((kn->kn_status & KN_INFLUX) == 0,
1418 			    ("KN_INFLUX set when not supposed to be"));
1419 kn->kn_status |= KN_INFLUX;
1420 KQ_UNLOCK(kq);
1421 if (!(kn->kn_status & KN_DETACHED))
1422 kn->kn_fop->f_detach(kn);
1423 knote_drop(kn, td);
1424 KQ_LOCK(kq);
1425 }
1426 }
1427 if (kq->kq_knhashmask != 0) {
1428 for (i = 0; i <= kq->kq_knhashmask; i++) {
1429 while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
1430 KASSERT((kn->kn_status & KN_INFLUX) == 0,
1431 				    ("KN_INFLUX set when not supposed to be"));
1432 kn->kn_status |= KN_INFLUX;
1433 KQ_UNLOCK(kq);
1434 if (!(kn->kn_status & KN_DETACHED))
1435 kn->kn_fop->f_detach(kn);
1436 knote_drop(kn, td);
1437 KQ_LOCK(kq);
1438 }
1439 }
1440 }
1441
1442 if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
1443 kq->kq_state |= KQ_TASKDRAIN;
1444 msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
1445 }
1446
1447 if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
1448 kq->kq_state &= ~KQ_SEL;
1449 selwakeuppri(&kq->kq_sel, PSOCK);
1450 }
1451
1452 KQ_UNLOCK(kq);
1453
1454 FILEDESC_LOCK_FAST(fdp);
1455 SLIST_REMOVE(&fdp->fd_kqlist, kq, kqueue, kq_list);
1456 FILEDESC_UNLOCK_FAST(fdp);
1457
1458 knlist_destroy(&kq->kq_sel.si_note);
1459 mtx_destroy(&kq->kq_lock);
1460 kq->kq_fdp = NULL;
1461
1462 if (kq->kq_knhash != NULL)
1463 free(kq->kq_knhash, M_KQUEUE);
1464 if (kq->kq_knlist != NULL)
1465 free(kq->kq_knlist, M_KQUEUE);
1466
1467 funsetown(&kq->kq_sigio);
1468 free(kq, M_KQUEUE);
1469 fp->f_data = NULL;
1470
1471 return (0);
1472 }
1473
1474 static void
1475 kqueue_wakeup(struct kqueue *kq)
1476 {
1477 KQ_OWNED(kq);
1478
1479 if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
1480 kq->kq_state &= ~KQ_SLEEP;
1481 wakeup(kq);
1482 }
1483 if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
1484 kq->kq_state &= ~KQ_SEL;
1485 selwakeuppri(&kq->kq_sel, PSOCK);
1486 }
1487 if (!knlist_empty(&kq->kq_sel.si_note))
1488 kqueue_schedtask(kq);
1489 if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
1490 pgsigio(&kq->kq_sigio, SIGIO, 0);
1491 }
1492 }
1493
1494 /*
1495 * Walk down a list of knotes, activating them if their event has triggered.
1496 *
1497 * There is a possibility to optimize in the case of one kq watching another.
1498 * Instead of scheduling a task to wake it up, you could pass enough state
1499 * down the chain to make up the parent kqueue. Make this code functional
1500 * first.
1501 */
1502 void
1503 knote(struct knlist *list, long hint, int islocked)
1504 {
1505 struct kqueue *kq;
1506 struct knote *kn;
1507
1508 if (list == NULL)
1509 return;
1510
1511 mtx_assert(list->kl_lock, islocked ? MA_OWNED : MA_NOTOWNED);
1512 if (!islocked)
1513 mtx_lock(list->kl_lock);
1514 /*
1515 * If we unlock the list lock (and set KN_INFLUX), we can eliminate
1516 * the kqueue scheduling, but this will introduce four
1517 	 * lock/unlock's for each knote to test.  If we do, continue to use
1518 	 * SLIST_FOREACH; SLIST_FOREACH_SAFE is not safe in our case, as it
1519 	 * is only safe when you want to remove the current item, which we
1520 	 * are not doing.
1521 */
1522 SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
1523 kq = kn->kn_kq;
1524 if ((kn->kn_status & KN_INFLUX) != KN_INFLUX) {
1525 KQ_LOCK(kq);
1526 if ((kn->kn_status & KN_INFLUX) != KN_INFLUX) {
1527 kn->kn_status |= KN_HASKQLOCK;
1528 if (kn->kn_fop->f_event(kn, hint))
1529 KNOTE_ACTIVATE(kn, 1);
1530 kn->kn_status &= ~KN_HASKQLOCK;
1531 }
1532 KQ_UNLOCK(kq);
1533 }
1534 kq = NULL;
1535 }
1536 if (!islocked)
1537 mtx_unlock(list->kl_lock);
1538 }
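/*
 * Event sources reach the loop above through knote() directly or via the
 * KNOTE()/KNOTE_LOCKED()/KNOTE_UNLOCKED() macros in <sys/event.h>; for
 * instance a driver that just produced data might call (names illustrative)
 *
 *	knote(&sc->sc_rsel.si_note, 0, 0);
 *
 * where the final 0 states that the caller does not hold the knlist lock.
 */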
1539
1540 /*
1541 * add a knote to a knlist
1542 */
1543 void
1544 knlist_add(struct knlist *knl, struct knote *kn, int islocked)
1545 {
1546 mtx_assert(knl->kl_lock, islocked ? MA_OWNED : MA_NOTOWNED);
1547 KQ_NOTOWNED(kn->kn_kq);
1548 KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) ==
1549 (KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED"));
1550 if (!islocked)
1551 mtx_lock(knl->kl_lock);
1552 SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
1553 if (!islocked)
1554 mtx_unlock(knl->kl_lock);
1555 KQ_LOCK(kn->kn_kq);
1556 kn->kn_knlist = knl;
1557 kn->kn_status &= ~KN_DETACHED;
1558 KQ_UNLOCK(kn->kn_kq);
1559 }
1560
1561 static void
1562 knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked)
1563 {
1564 KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked"));
1565 mtx_assert(knl->kl_lock, knlislocked ? MA_OWNED : MA_NOTOWNED);
1566 mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
1567 if (!kqislocked)
1568 KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX,
1569 ("knlist_remove called w/o knote being KN_INFLUX or already removed"));
1570 if (!knlislocked)
1571 mtx_lock(knl->kl_lock);
1572 SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
1573 kn->kn_knlist = NULL;
1574 if (!knlislocked)
1575 mtx_unlock(knl->kl_lock);
1576 if (!kqislocked)
1577 KQ_LOCK(kn->kn_kq);
1578 kn->kn_status |= KN_DETACHED;
1579 if (!kqislocked)
1580 KQ_UNLOCK(kn->kn_kq);
1581 }
1582
1583 /*
1584 * remove all knotes from a specified klist
1585 */
1586 void
1587 knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
1588 {
1589
1590 knlist_remove_kq(knl, kn, islocked, 0);
1591 }
1592
1593 /*
1594 * remove knote from a specified klist while in f_event handler.
1595 */
1596 void
1597 knlist_remove_inevent(struct knlist *knl, struct knote *kn)
1598 {
1599
1600 knlist_remove_kq(knl, kn, 1,
1601 (kn->kn_status & KN_HASKQLOCK) == KN_HASKQLOCK);
1602 }
1603
1604 int
1605 knlist_empty(struct knlist *knl)
1606 {
1607
1608 mtx_assert(knl->kl_lock, MA_OWNED);
1609 return SLIST_EMPTY(&knl->kl_list);
1610 }
1611
1612 static struct mtx knlist_lock;
1613 MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
1614 MTX_DEF);
1615
1616 void
1617 knlist_init(struct knlist *knl, struct mtx *mtx)
1618 {
1619
1620 if (mtx == NULL)
1621 knl->kl_lock = &knlist_lock;
1622 else
1623 knl->kl_lock = mtx;
1624
1625 SLIST_INIT(&knl->kl_list);
1626 }
1627
1628 void
1629 knlist_destroy(struct knlist *knl)
1630 {
1631
1632 #ifdef INVARIANTS
1633 /*
1634 * if we run across this error, we need to find the offending
1635 * driver and have it call knlist_clear.
1636 */
1637 if (!SLIST_EMPTY(&knl->kl_list))
1638 printf("WARNING: destroying knlist w/ knotes on it!\n");
1639 #endif
1640
1641 knl->kl_lock = NULL;
1642 SLIST_INIT(&knl->kl_list);
1643 }
1644
1645 /*
1646 * Even if we are locked, we may need to drop the lock to allow any influx
1647 * knotes time to "settle".
1648 */
1649 void
1650 knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
1651 {
1652 struct knote *kn;
1653 struct kqueue *kq;
1654
1655 if (islocked)
1656 mtx_assert(knl->kl_lock, MA_OWNED);
1657 else {
1658 mtx_assert(knl->kl_lock, MA_NOTOWNED);
1659 again:	/* need to reacquire lock since we have dropped it */
1660 mtx_lock(knl->kl_lock);
1661 }
1662
1663 SLIST_FOREACH(kn, &knl->kl_list, kn_selnext) {
1664 kq = kn->kn_kq;
1665 KQ_LOCK(kq);
1666 if ((kn->kn_status & KN_INFLUX)) {
1667 KQ_UNLOCK(kq);
1668 continue;
1669 }
1670 knlist_remove_kq(knl, kn, 1, 1);
1671 if (killkn) {
1672 kn->kn_status |= KN_INFLUX | KN_DETACHED;
1673 KQ_UNLOCK(kq);
1674 knote_drop(kn, td);
1675 } else {
1676 /* Make sure cleared knotes disappear soon */
1677 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
1678 KQ_UNLOCK(kq);
1679 }
1680 kq = NULL;
1681 }
1682
1683 if (!SLIST_EMPTY(&knl->kl_list)) {
1684 /* there are still KN_INFLUX remaining */
1685 kn = SLIST_FIRST(&knl->kl_list);
1686 kq = kn->kn_kq;
1687 KQ_LOCK(kq);
1688 KASSERT(kn->kn_status & KN_INFLUX,
1689 ("knote removed w/o list lock"));
1690 mtx_unlock(knl->kl_lock);
1691 kq->kq_state |= KQ_FLUXWAIT;
1692 msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
1693 kq = NULL;
1694 goto again;
1695 }
1696
1697 if (islocked)
1698 mtx_assert(knl->kl_lock, MA_OWNED);
1699 else {
1700 mtx_unlock(knl->kl_lock);
1701 mtx_assert(knl->kl_lock, MA_NOTOWNED);
1702 }
1703 }
1704
1705 /*
1706 * remove all knotes referencing a specified fd
1707 * must be called with FILEDESC lock. This prevents a race where a new fd
1708 * comes along and occupies the entry and we attach a knote to the fd.
1709 */
1710 void
1711 knote_fdclose(struct thread *td, int fd)
1712 {
1713 struct filedesc *fdp = td->td_proc->p_fd;
1714 struct kqueue *kq;
1715 struct knote *kn;
1716 int influx;
1717
1718 FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
1719
1720 /*
1721 * We shouldn't have to worry about new kevents appearing on fd
1722 * since filedesc is locked.
1723 */
1724 SLIST_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
1725 KQ_LOCK(kq);
1726
1727 again:
1728 influx = 0;
1729 while (kq->kq_knlistsize > fd &&
1730 (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
1731 if (kn->kn_status & KN_INFLUX) {
1732 /* someone else might be waiting on our knote */
1733 if (influx)
1734 wakeup(kq);
1735 kq->kq_state |= KQ_FLUXWAIT;
1736 msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
1737 goto again;
1738 }
1739 kn->kn_status |= KN_INFLUX;
1740 KQ_UNLOCK(kq);
1741 if (!(kn->kn_status & KN_DETACHED))
1742 kn->kn_fop->f_detach(kn);
1743 knote_drop(kn, td);
1744 influx = 1;
1745 KQ_LOCK(kq);
1746 }
1747 KQ_UNLOCK_FLUX(kq);
1748 }
1749 }
1750
1751 static int
1752 knote_attach(struct knote *kn, struct kqueue *kq)
1753 {
1754 struct klist *list;
1755
1756 KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX"));
1757 KQ_OWNED(kq);
1758
1759 if (kn->kn_fop->f_isfd) {
1760 if (kn->kn_id >= kq->kq_knlistsize)
1761 return ENOMEM;
1762 list = &kq->kq_knlist[kn->kn_id];
1763 } else {
1764 if (kq->kq_knhash == NULL)
1765 return ENOMEM;
1766 list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
1767 }
1768
1769 SLIST_INSERT_HEAD(list, kn, kn_link);
1770
1771 return 0;
1772 }
1773
1774 /*
1775  * knote must already have been detached using the f_detach method.
1776  * no lock needs to be held; it is assumed that the KN_INFLUX flag is set
1777 * to prevent other removal.
1778 */
1779 static void
1780 knote_drop(struct knote *kn, struct thread *td)
1781 {
1782 struct kqueue *kq;
1783 struct klist *list;
1784
1785 kq = kn->kn_kq;
1786
1787 KQ_NOTOWNED(kq);
1788 KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX,
1789 ("knote_drop called without KN_INFLUX set in kn_status"));
1790
1791 KQ_LOCK(kq);
1792 if (kn->kn_fop->f_isfd)
1793 list = &kq->kq_knlist[kn->kn_id];
1794 else
1795 list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
1796
1797 SLIST_REMOVE(list, kn, knote, kn_link);
1798 if (kn->kn_status & KN_QUEUED)
1799 knote_dequeue(kn);
1800 KQ_UNLOCK_FLUX(kq);
1801
1802 if (kn->kn_fop->f_isfd) {
1803 fdrop(kn->kn_fp, td);
1804 kn->kn_fp = NULL;
1805 }
1806 kqueue_fo_release(kn->kn_kevent.filter);
1807 kn->kn_fop = NULL;
1808 knote_free(kn);
1809 }
1810
1811 static void
1812 knote_enqueue(struct knote *kn)
1813 {
1814 struct kqueue *kq = kn->kn_kq;
1815
1816 KQ_OWNED(kn->kn_kq);
1817 KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
1818
1819 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1820 kn->kn_status |= KN_QUEUED;
1821 kq->kq_count++;
1822 kqueue_wakeup(kq);
1823 }
1824
1825 static void
1826 knote_dequeue(struct knote *kn)
1827 {
1828 struct kqueue *kq = kn->kn_kq;
1829
1830 KQ_OWNED(kn->kn_kq);
1831 KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
1832
1833 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1834 kn->kn_status &= ~KN_QUEUED;
1835 kq->kq_count--;
1836 }
1837
1838 static void
1839 knote_init(void)
1840 {
1841
1842 knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
1843 NULL, NULL, UMA_ALIGN_PTR, 0);
1844 }
1845 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)
1846
1847 static struct knote *
1848 knote_alloc(int waitok)
1849 {
1850 return ((struct knote *)uma_zalloc(knote_zone,
1851 (waitok ? M_WAITOK : M_NOWAIT)|M_ZERO));
1852 }
1853
1854 static void
1855 knote_free(struct knote *kn)
1856 {
1857 if (kn != NULL)
1858 uma_zfree(knote_zone, kn);
1859 }