kern_event.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /*      $NetBSD: kern_event.c,v 1.146 2022/07/24 19:23:44 riastradh Exp $       */
    2 
    3 /*-
    4  * Copyright (c) 2008, 2009, 2021 The NetBSD Foundation, Inc.
    5  * All rights reserved.
    6  *
    7  * This code is derived from software contributed to The NetBSD Foundation
    8  * by Andrew Doran.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  *
   19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
   23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   29  * POSSIBILITY OF SUCH DAMAGE.
   30  */
   31 
   32 /*-
   33  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
   34  * Copyright (c) 2009 Apple, Inc
   35  * All rights reserved.
   36  *
   37  * Redistribution and use in source and binary forms, with or without
   38  * modification, are permitted provided that the following conditions
   39  * are met:
   40  * 1. Redistributions of source code must retain the above copyright
   41  *    notice, this list of conditions and the following disclaimer.
   42  * 2. Redistributions in binary form must reproduce the above copyright
   43  *    notice, this list of conditions and the following disclaimer in the
   44  *    documentation and/or other materials provided with the distribution.
   45  *
   46  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   47  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   48  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   49  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   50  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   51  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   52  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   53  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   54  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   55  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   56  * SUCH DAMAGE.
   57  *
   58  * FreeBSD: src/sys/kern/kern_event.c,v 1.27 2001/07/05 17:10:44 rwatson Exp
   59  */
   60 
   61 #ifdef _KERNEL_OPT
   62 #include "opt_ddb.h"
   63 #endif /* _KERNEL_OPT */
   64 
   65 #include <sys/cdefs.h>
   66 __KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.146 2022/07/24 19:23:44 riastradh Exp $");
   67 
   68 #include <sys/param.h>
   69 #include <sys/systm.h>
   70 #include <sys/kernel.h>
   71 #include <sys/wait.h>
   72 #include <sys/proc.h>
   73 #include <sys/file.h>
   74 #include <sys/select.h>
   75 #include <sys/queue.h>
   76 #include <sys/event.h>
   77 #include <sys/eventvar.h>
   78 #include <sys/poll.h>
   79 #include <sys/kmem.h>
   80 #include <sys/stat.h>
   81 #include <sys/filedesc.h>
   82 #include <sys/syscallargs.h>
   83 #include <sys/kauth.h>
   84 #include <sys/conf.h>
   85 #include <sys/atomic.h>
   86 
   87 static int      kqueue_scan(file_t *, size_t, struct kevent *,
   88                             const struct timespec *, register_t *,
   89                             const struct kevent_ops *, struct kevent *,
   90                             size_t);
   91 static int      kqueue_ioctl(file_t *, u_long, void *);
   92 static int      kqueue_fcntl(file_t *, u_int, void *);
   93 static int      kqueue_poll(file_t *, int);
   94 static int      kqueue_kqfilter(file_t *, struct knote *);
   95 static int      kqueue_stat(file_t *, struct stat *);
   96 static int      kqueue_close(file_t *);
   97 static void     kqueue_restart(file_t *);
   98 static int      kqueue_register(struct kqueue *, struct kevent *);
   99 static void     kqueue_doclose(struct kqueue *, struct klist *, int);
  100 
  101 static void     knote_detach(struct knote *, filedesc_t *fdp, bool);
  102 static void     knote_enqueue(struct knote *);
  103 static void     knote_activate(struct knote *);
  104 static void     knote_activate_locked(struct knote *);
  105 static void     knote_deactivate_locked(struct knote *);
  106 
  107 static void     filt_kqdetach(struct knote *);
  108 static int      filt_kqueue(struct knote *, long hint);
  109 static int      filt_procattach(struct knote *);
  110 static void     filt_procdetach(struct knote *);
  111 static int      filt_proc(struct knote *, long hint);
  112 static int      filt_fileattach(struct knote *);
  113 static void     filt_timerexpire(void *x);
  114 static int      filt_timerattach(struct knote *);
  115 static void     filt_timerdetach(struct knote *);
  116 static int      filt_timer(struct knote *, long hint);
  117 static int      filt_timertouch(struct knote *, struct kevent *, long type);
  118 static int      filt_userattach(struct knote *);
  119 static void     filt_userdetach(struct knote *);
  120 static int      filt_user(struct knote *, long hint);
  121 static int      filt_usertouch(struct knote *, struct kevent *, long type);
  122 
  123 /*
  124  * Private knote state that should never be exposed outside
  125  * of kern_event.c
  126  *
  127  * Field locking:
  128  *
  129  * q    kn_kq->kq_lock
  130  */
  131 struct knote_impl {
  132         struct knote    ki_knote;
  133         unsigned int    ki_influx;      /* q: in-flux counter */
  134         kmutex_t        ki_foplock;     /* for kn_filterops */
  135 };
  136 
  137 #define KIMPL_TO_KNOTE(kip)     (&(kip)->ki_knote)
  138 #define KNOTE_TO_KIMPL(knp)     container_of((knp), struct knote_impl, ki_knote)
  139 
  140 static inline struct knote *
  141 knote_alloc(bool sleepok)
  142 {
  143         struct knote_impl *ki;
  144 
  145         ki = kmem_zalloc(sizeof(*ki), sleepok ? KM_SLEEP : KM_NOSLEEP);
  146         mutex_init(&ki->ki_foplock, MUTEX_DEFAULT, IPL_NONE);
  147 
  148         return KIMPL_TO_KNOTE(ki);
  149 }
  150 
  151 static inline void
  152 knote_free(struct knote *kn)
  153 {
  154         struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
  155 
  156         mutex_destroy(&ki->ki_foplock);
  157         kmem_free(ki, sizeof(*ki));
  158 }
  159 
  160 static inline void
  161 knote_foplock_enter(struct knote *kn)
  162 {
  163         mutex_enter(&KNOTE_TO_KIMPL(kn)->ki_foplock);
  164 }
  165 
  166 static inline void
  167 knote_foplock_exit(struct knote *kn)
  168 {
  169         mutex_exit(&KNOTE_TO_KIMPL(kn)->ki_foplock);
  170 }
  171 
  172 static inline bool __diagused
  173 knote_foplock_owned(struct knote *kn)
  174 {
  175         return mutex_owned(&KNOTE_TO_KIMPL(kn)->ki_foplock);
  176 }
  177 
  178 static const struct fileops kqueueops = {
  179         .fo_name = "kqueue",
  180         .fo_read = (void *)enxio,
  181         .fo_write = (void *)enxio,
  182         .fo_ioctl = kqueue_ioctl,
  183         .fo_fcntl = kqueue_fcntl,
  184         .fo_poll = kqueue_poll,
  185         .fo_stat = kqueue_stat,
  186         .fo_close = kqueue_close,
  187         .fo_kqfilter = kqueue_kqfilter,
  188         .fo_restart = kqueue_restart,
  189 };
  190 
  191 static void
  192 filt_nopdetach(struct knote *kn __unused)
  193 {
  194 }
  195 
  196 static int
  197 filt_nopevent(struct knote *kn __unused, long hint __unused)
  198 {
  199         return 0;
  200 }
  201 
  202 static const struct filterops nop_fd_filtops = {
  203         .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
  204         .f_attach = NULL,
  205         .f_detach = filt_nopdetach,
  206         .f_event = filt_nopevent,
  207 };
  208 
  209 static const struct filterops nop_filtops = {
  210         .f_flags = FILTEROP_MPSAFE,
  211         .f_attach = NULL,
  212         .f_detach = filt_nopdetach,
  213         .f_event = filt_nopevent,
  214 };
  215 
  216 static const struct filterops kqread_filtops = {
  217         .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
  218         .f_attach = NULL,
  219         .f_detach = filt_kqdetach,
  220         .f_event = filt_kqueue,
  221 };
  222 
  223 static const struct filterops proc_filtops = {
  224         .f_flags = FILTEROP_MPSAFE,
  225         .f_attach = filt_procattach,
  226         .f_detach = filt_procdetach,
  227         .f_event = filt_proc,
  228 };
  229 
  230 /*
  231  * file_filtops is not marked MPSAFE because it's going to call
  232  * fileops::fo_kqfilter(), which might not be.  That function,
  233  * however, will override the knote's filterops, and thus will
  234  * inherit the MPSAFE-ness of the back-end at that time.
  235  */
  236 static const struct filterops file_filtops = {
  237         .f_flags = FILTEROP_ISFD,
  238         .f_attach = filt_fileattach,
  239         .f_detach = NULL,
  240         .f_event = NULL,
  241 };
  242 
  243 static const struct filterops timer_filtops = {
  244         .f_flags = FILTEROP_MPSAFE,
  245         .f_attach = filt_timerattach,
  246         .f_detach = filt_timerdetach,
  247         .f_event = filt_timer,
  248         .f_touch = filt_timertouch,
  249 };
  250 
  251 static const struct filterops user_filtops = {
  252         .f_flags = FILTEROP_MPSAFE,
  253         .f_attach = filt_userattach,
  254         .f_detach = filt_userdetach,
  255         .f_event = filt_user,
  256         .f_touch = filt_usertouch,
  257 };
  258 
  259 static u_int    kq_ncallouts = 0;
  260 static int      kq_calloutmax = (4 * 1024);
  261 
  262 #define KN_HASHSIZE             64              /* XXX should be tunable */
  263 #define KN_HASH(val, mask)      (((val) ^ (val >> 8)) & (mask))
  264 
  265 extern const struct filterops fs_filtops;       /* vfs_syscalls.c */
  266 extern const struct filterops sig_filtops;      /* kern_sig.c */
  267 
  268 /*
  269  * Table for for all system-defined filters.
  270  * These should be listed in the numeric order of the EVFILT_* defines.
  271  * If filtops is NULL, the filter isn't implemented in NetBSD.
  272  * End of list is when name is NULL.
  273  *
  274  * Note that 'refcnt' is meaningless for built-in filters.
  275  */
  276 struct kfilter {
  277         const char      *name;          /* name of filter */
  278         uint32_t        filter;         /* id of filter */
  279         unsigned        refcnt;         /* reference count */
  280         const struct filterops *filtops;/* operations for filter */
  281         size_t          namelen;        /* length of name string */
  282 };
  283 
  284 /* System defined filters */
  285 static struct kfilter sys_kfilters[] = {
  286         { "EVFILT_READ",        EVFILT_READ,    0, &file_filtops, 0 },
  287         { "EVFILT_WRITE",       EVFILT_WRITE,   0, &file_filtops, 0, },
  288         { "EVFILT_AIO",         EVFILT_AIO,     0, NULL, 0 },
  289         { "EVFILT_VNODE",       EVFILT_VNODE,   0, &file_filtops, 0 },
  290         { "EVFILT_PROC",        EVFILT_PROC,    0, &proc_filtops, 0 },
  291         { "EVFILT_SIGNAL",      EVFILT_SIGNAL,  0, &sig_filtops, 0 },
  292         { "EVFILT_TIMER",       EVFILT_TIMER,   0, &timer_filtops, 0 },
  293         { "EVFILT_FS",          EVFILT_FS,      0, &fs_filtops, 0 },
  294         { "EVFILT_USER",        EVFILT_USER,    0, &user_filtops, 0 },
  295         { "EVFILT_EMPTY",       EVFILT_EMPTY,   0, &file_filtops, 0 },
  296         { NULL,                 0,              0, NULL, 0 },
  297 };
  298 
  299 /* User defined kfilters */
  300 static struct kfilter   *user_kfilters;         /* array */
  301 static int              user_kfilterc;          /* current offset */
  302 static int              user_kfiltermaxc;       /* max size so far */
  303 static size_t           user_kfiltersz;         /* size of allocated memory */
  304 
  305 /*
  306  * Global Locks.
  307  *
  308  * Lock order:
  309  *
  310  *      kqueue_filter_lock
  311  *      -> kn_kq->kq_fdp->fd_lock
  312  *      -> knote foplock (if taken)
  313  *      -> object lock (e.g., device driver lock, &c.)
  314  *      -> kn_kq->kq_lock
  315  *
  316  * Locking rules.  ==> indicates the lock is acquired by the backing
  317  * object, locks prior are acquired before calling filter ops:
  318  *
  319  *      f_attach: fdp->fd_lock -> knote foplock ->
  320  *        (maybe) KERNEL_LOCK ==> backing object lock
  321  *
  322  *      f_detach: fdp->fd_lock -> knote foplock ->
  323  *         (maybe) KERNEL_LOCK ==> backing object lock
  324  *
  325  *      f_event via kevent: fdp->fd_lock -> knote foplock ->
  326  *         (maybe) KERNEL_LOCK ==> backing object lock
  327  *         N.B. NOTE_SUBMIT will never be set in the "hint" argument
  328  *         in this case.
  329  *
  330  *      f_event via knote (via backing object: Whatever caller guarantees.
  331  *      Typically:
  332  *              f_event(NOTE_SUBMIT): caller has already acquired backing
  333  *                  object lock.
  334  *              f_event(!NOTE_SUBMIT): caller has not acquired backing object,
  335  *                  lock or has possibly acquired KERNEL_LOCK.  Backing object
  336  *                  lock may or may not be acquired as-needed.
  337  *      N.B. the knote foplock will **not** be acquired in this case.  The
  338  *      caller guarantees that klist_fini() will not be called concurrently
  339  *      with knote().
  340  *
  341  *      f_touch: fdp->fd_lock -> kn_kq->kq_lock (spin lock)
  342  *          N.B. knote foplock is **not** acquired in this case and
  343  *          the caller must guarantee that klist_fini() will never
  344  *          be called.  kevent_register() restricts filters that
  345  *          provide f_touch to known-safe cases.
  346  *
  347  *      klist_fini(): Caller must guarantee that no more knotes can
  348  *          be attached to the klist, and must **not** hold the backing
  349  *          object's lock; klist_fini() itself will acquire the foplock
  350  *          of each knote on the klist.
  351  *
  352  * Locking rules when detaching knotes:
  353  *
  354  * There are some situations where knote submission may require dropping
  355  * locks (see knote_proc_fork()).  In order to support this, it's possible
  356  * to mark a knote as being 'in-flux'.  Such a knote is guaranteed not to
  357  * be detached while it remains in-flux.  Because it will not be detached,
  358  * locks can be dropped so e.g. memory can be allocated, locks on other
  359  * data structures can be acquired, etc.  During this time, any attempt to
  360  * detach an in-flux knote must wait until the knote is no longer in-flux.
  361  * When this happens, the knote is marked for death (KN_WILLDETACH) and the
  362  * LWP who gets to finish the detach operation is recorded in the knote's
  363  * 'udata' field (which is no longer required for its original purpose once
  364  * a knote is so marked).  Code paths that lead to knote_detach() must ensure
  365  * that their LWP is the one tasked with its final demise after waiting for
  366  * the in-flux status of the knote to clear.  Note that once a knote is
  367  * marked KN_WILLDETACH, no code paths may put it into an in-flux state.
  368  *
  369  * Once the special circumstances have been handled, the locks are re-
  370  * acquired in the proper order (object lock -> kq_lock), the knote taken
  371  * out of flux, and any waiters are notified.  Because waiters must have
  372  * also dropped *their* locks in order to safely block, they must re-
  373  * validate all of their assumptions; see knote_detach_quiesce().  See also
  374  * the kqueue_register() (EV_ADD, EV_DELETE) and kqueue_scan() (EV_ONESHOT)
  375  * cases.
  376  *
  377  * When kqueue_scan() encounters an in-flux knote, the situation is
  378  * treated like another LWP's list marker.
  379  *
  380  * LISTEN WELL: It is important to not hold knotes in flux for an
  381  * extended period of time! In-flux knotes effectively block any
  382  * progress of the kqueue_scan() operation.  Any code paths that place
  383  * knotes in-flux should be careful to not block for indefinite periods
  384  * of time, such as for memory allocation (i.e. KM_NOSLEEP is OK, but
  385  * KM_SLEEP is not).
  386  */
  387 static krwlock_t        kqueue_filter_lock;     /* lock on filter lists */
  388 
  389 #define KQ_FLUX_WAIT(kq)        (void)cv_wait(&kq->kq_cv, &kq->kq_lock)
  390 #define KQ_FLUX_WAKEUP(kq)      cv_broadcast(&kq->kq_cv)
  391 
  392 static inline bool
  393 kn_in_flux(struct knote *kn)
  394 {
  395         KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
  396         return KNOTE_TO_KIMPL(kn)->ki_influx != 0;
  397 }
  398 
  399 static inline bool
  400 kn_enter_flux(struct knote *kn)
  401 {
  402         KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
  403 
  404         if (kn->kn_status & KN_WILLDETACH) {
  405                 return false;
  406         }
  407 
  408         struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
  409         KASSERT(ki->ki_influx < UINT_MAX);
  410         ki->ki_influx++;
  411 
  412         return true;
  413 }
  414 
  415 static inline bool
  416 kn_leave_flux(struct knote *kn)
  417 {
  418         KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
  419 
  420         struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
  421         KASSERT(ki->ki_influx > 0);
  422         ki->ki_influx--;
  423         return ki->ki_influx == 0;
  424 }
  425 
  426 static void
  427 kn_wait_flux(struct knote *kn, bool can_loop)
  428 {
  429         struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
  430         bool loop;
  431 
  432         KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
  433 
  434         /*
  435          * It may not be safe for us to touch the knote again after
  436          * dropping the kq_lock.  The caller has let us know in
  437          * 'can_loop'.
  438          */
  439         for (loop = true; loop && ki->ki_influx != 0; loop = can_loop) {
  440                 KQ_FLUX_WAIT(kn->kn_kq);
  441         }
  442 }
  443 
  444 #define KNOTE_WILLDETACH(kn)                                            \
  445 do {                                                                    \
  446         (kn)->kn_status |= KN_WILLDETACH;                               \
  447         (kn)->kn_kevent.udata = curlwp;                                 \
  448 } while (/*CONSTCOND*/0)
  449 
  450 /*
  451  * Wait until the specified knote is in a quiescent state and
  452  * safe to detach.  Returns true if we potentially blocked (and
  453  * thus dropped our locks).
  454  */
  455 static bool
  456 knote_detach_quiesce(struct knote *kn)
  457 {
  458         struct kqueue *kq = kn->kn_kq;
  459         filedesc_t *fdp = kq->kq_fdp;
  460 
  461         KASSERT(mutex_owned(&fdp->fd_lock));
  462 
  463         mutex_spin_enter(&kq->kq_lock);
  464         /*
  465          * There are two cases where we might see KN_WILLDETACH here:
  466          *
  467          * 1. Someone else has already started detaching the knote but
  468          *    had to wait for it to settle first.
  469          *
  470          * 2. We had to wait for it to settle, and had to come back
  471          *    around after re-acquiring the locks.
  472          *
  473          * When KN_WILLDETACH is set, we also set the LWP that claimed
  474          * the prize of finishing the detach in the 'udata' field of the
  475          * knote (which will never be used again for its usual purpose
  476          * once the note is in this state).  If it doesn't point to us,
  477          * we must drop the locks and let them in to finish the job.
  478          *
  479          * Otherwise, once we have claimed the knote for ourselves, we
  480          * can finish waiting for it to settle.  The is the only scenario
  481          * where touching a detaching knote is safe after dropping the
  482          * locks.
  483          */
  484         if ((kn->kn_status & KN_WILLDETACH) != 0 &&
  485             kn->kn_kevent.udata != curlwp) {
  486                 /*
  487                  * N.B. it is NOT safe for us to touch the knote again
  488                  * after dropping the locks here.  The caller must go
  489                  * back around and re-validate everything.  However, if
  490                  * the knote is in-flux, we want to block to minimize
  491                  * busy-looping.
  492                  */
  493                 mutex_exit(&fdp->fd_lock);
  494                 if (kn_in_flux(kn)) {
  495                         kn_wait_flux(kn, false);
  496                         mutex_spin_exit(&kq->kq_lock);
  497                         return true;
  498                 }
  499                 mutex_spin_exit(&kq->kq_lock);
  500                 preempt_point();
  501                 return true;
  502         }
  503         /*
  504          * If we get here, we know that we will be claiming the
  505          * detach responsibilies, or that we already have and
  506          * this is the second attempt after re-validation.
  507          */
  508         KASSERT((kn->kn_status & KN_WILLDETACH) == 0 ||
  509                 kn->kn_kevent.udata == curlwp);
  510         /*
  511          * Similarly, if we get here, either we are just claiming it
  512          * and may have to wait for it to settle, or if this is the
  513          * second attempt after re-validation that no other code paths
  514          * have put it in-flux.
  515          */
  516         KASSERT((kn->kn_status & KN_WILLDETACH) == 0 ||
  517                 kn_in_flux(kn) == false);
  518         KNOTE_WILLDETACH(kn);
  519         if (kn_in_flux(kn)) {
  520                 mutex_exit(&fdp->fd_lock);
  521                 kn_wait_flux(kn, true);
  522                 /*
  523                  * It is safe for us to touch the knote again after
  524                  * dropping the locks, but the caller must still
  525                  * re-validate everything because other aspects of
  526                  * the environment may have changed while we blocked.
  527                  */
  528                 KASSERT(kn_in_flux(kn) == false);
  529                 mutex_spin_exit(&kq->kq_lock);
  530                 return true;
  531         }
  532         mutex_spin_exit(&kq->kq_lock);
  533 
  534         return false;
  535 }
  536 
  537 /*
  538  * Calls into the filterops need to be resilient against things which
  539  * destroy a klist, e.g. device detach, freeing a vnode, etc., to avoid
  540  * chasing garbage pointers (to data, or even potentially code in a
  541  * module about to be unloaded).  To that end, we acquire the
  542  * knote foplock before calling into the filter ops.  When a driver
  543  * (or anything else) is tearing down its klist, klist_fini() enumerates
  544  * each knote, acquires its foplock, and replaces the filterops with a
  545  * nop stub, allowing knote detach (when descriptors are closed) to safely
  546  * proceed.
  547  */
  548 
  549 static int
  550 filter_attach(struct knote *kn)
  551 {
  552         int rv;
  553 
  554         KASSERT(knote_foplock_owned(kn));
  555         KASSERT(kn->kn_fop != NULL);
  556         KASSERT(kn->kn_fop->f_attach != NULL);
  557 
  558         /*
  559          * N.B. that kn->kn_fop may change as the result of calling
  560          * f_attach().  After f_attach() returns, kn->kn_fop may not
  561          * be modified by code outside of klist_fini().
  562          */
  563         if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
  564                 rv = kn->kn_fop->f_attach(kn);
  565         } else {
  566                 KERNEL_LOCK(1, NULL);
  567                 rv = kn->kn_fop->f_attach(kn);
  568                 KERNEL_UNLOCK_ONE(NULL);
  569         }
  570 
  571         return rv;
  572 }
  573 
  574 static void
  575 filter_detach(struct knote *kn)
  576 {
  577 
  578         KASSERT(knote_foplock_owned(kn));
  579         KASSERT(kn->kn_fop != NULL);
  580         KASSERT(kn->kn_fop->f_detach != NULL);
  581 
  582         if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
  583                 kn->kn_fop->f_detach(kn);
  584         } else {
  585                 KERNEL_LOCK(1, NULL);
  586                 kn->kn_fop->f_detach(kn);
  587                 KERNEL_UNLOCK_ONE(NULL);
  588         }
  589 }
  590 
  591 static int
  592 filter_event(struct knote *kn, long hint, bool submitting)
  593 {
  594         int rv;
  595 
  596         /* See knote(). */
  597         KASSERT(submitting || knote_foplock_owned(kn));
  598         KASSERT(kn->kn_fop != NULL);
  599         KASSERT(kn->kn_fop->f_event != NULL);
  600 
  601         if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
  602                 rv = kn->kn_fop->f_event(kn, hint);
  603         } else {
  604                 KERNEL_LOCK(1, NULL);
  605                 rv = kn->kn_fop->f_event(kn, hint);
  606                 KERNEL_UNLOCK_ONE(NULL);
  607         }
  608 
  609         return rv;
  610 }
  611 
  612 static int
  613 filter_touch(struct knote *kn, struct kevent *kev, long type)
  614 {
  615 
  616         /*
  617          * XXX We cannot assert that the knote foplock is held here
  618          * XXX beause we cannot safely acquire it in all cases
  619          * XXX where "touch" will be used in kqueue_scan().  We just
  620          * XXX have to assume that f_touch will always be safe to call,
  621          * XXX and kqueue_register() allows only the two known-safe
  622          * XXX users of that op.
  623          */
  624 
  625         KASSERT(kn->kn_fop != NULL);
  626         KASSERT(kn->kn_fop->f_touch != NULL);
  627 
  628         return kn->kn_fop->f_touch(kn, kev, type);
  629 }
  630 
  631 static kauth_listener_t kqueue_listener;
  632 
  633 static int
  634 kqueue_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
  635     void *arg0, void *arg1, void *arg2, void *arg3)
  636 {
  637         struct proc *p;
  638         int result;
  639 
  640         result = KAUTH_RESULT_DEFER;
  641         p = arg0;
  642 
  643         if (action != KAUTH_PROCESS_KEVENT_FILTER)
  644                 return result;
  645 
  646         if ((kauth_cred_getuid(p->p_cred) != kauth_cred_getuid(cred) ||
  647             ISSET(p->p_flag, PK_SUGID)))
  648                 return result;
  649 
  650         result = KAUTH_RESULT_ALLOW;
  651 
  652         return result;
  653 }
  654 
  655 /*
  656  * Initialize the kqueue subsystem.
  657  */
  658 void
  659 kqueue_init(void)
  660 {
  661 
  662         rw_init(&kqueue_filter_lock);
  663 
  664         kqueue_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
  665             kqueue_listener_cb, NULL);
  666 }
  667 
  668 /*
  669  * Find kfilter entry by name, or NULL if not found.
  670  */
  671 static struct kfilter *
  672 kfilter_byname_sys(const char *name)
  673 {
  674         int i;
  675 
  676         KASSERT(rw_lock_held(&kqueue_filter_lock));
  677 
  678         for (i = 0; sys_kfilters[i].name != NULL; i++) {
  679                 if (strcmp(name, sys_kfilters[i].name) == 0)
  680                         return &sys_kfilters[i];
  681         }
  682         return NULL;
  683 }
  684 
  685 static struct kfilter *
  686 kfilter_byname_user(const char *name)
  687 {
  688         int i;
  689 
  690         KASSERT(rw_lock_held(&kqueue_filter_lock));
  691 
  692         /* user filter slots have a NULL name if previously deregistered */
  693         for (i = 0; i < user_kfilterc ; i++) {
  694                 if (user_kfilters[i].name != NULL &&
  695                     strcmp(name, user_kfilters[i].name) == 0)
  696                         return &user_kfilters[i];
  697         }
  698         return NULL;
  699 }
  700 
  701 static struct kfilter *
  702 kfilter_byname(const char *name)
  703 {
  704         struct kfilter *kfilter;
  705 
  706         KASSERT(rw_lock_held(&kqueue_filter_lock));
  707 
  708         if ((kfilter = kfilter_byname_sys(name)) != NULL)
  709                 return kfilter;
  710 
  711         return kfilter_byname_user(name);
  712 }
  713 
  714 /*
  715  * Find kfilter entry by filter id, or NULL if not found.
  716  * Assumes entries are indexed in filter id order, for speed.
  717  */
  718 static struct kfilter *
  719 kfilter_byfilter(uint32_t filter)
  720 {
  721         struct kfilter *kfilter;
  722 
  723         KASSERT(rw_lock_held(&kqueue_filter_lock));
  724 
  725         if (filter < EVFILT_SYSCOUNT)   /* it's a system filter */
  726                 kfilter = &sys_kfilters[filter];
  727         else if (user_kfilters != NULL &&
  728             filter < EVFILT_SYSCOUNT + user_kfilterc)
  729                                         /* it's a user filter */
  730                 kfilter = &user_kfilters[filter - EVFILT_SYSCOUNT];
  731         else
  732                 return (NULL);          /* out of range */
  733         KASSERT(kfilter->filter == filter);     /* sanity check! */
  734         return (kfilter);
  735 }
  736 
  737 /*
  738  * Register a new kfilter. Stores the entry in user_kfilters.
  739  * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
  740  * If retfilter != NULL, the new filterid is returned in it.
  741  */
  742 int
  743 kfilter_register(const char *name, const struct filterops *filtops,
  744                  int *retfilter)
  745 {
  746         struct kfilter *kfilter;
  747         size_t len;
  748         int i;
  749 
  750         if (name == NULL || name[0] == '\0' || filtops == NULL)
  751                 return (EINVAL);        /* invalid args */
  752 
  753         rw_enter(&kqueue_filter_lock, RW_WRITER);
  754         if (kfilter_byname(name) != NULL) {
  755                 rw_exit(&kqueue_filter_lock);
  756                 return (EEXIST);        /* already exists */
  757         }
  758         if (user_kfilterc > 0xffffffff - EVFILT_SYSCOUNT) {
  759                 rw_exit(&kqueue_filter_lock);
  760                 return (EINVAL);        /* too many */
  761         }
  762 
  763         for (i = 0; i < user_kfilterc; i++) {
  764                 kfilter = &user_kfilters[i];
  765                 if (kfilter->name == NULL) {
  766                         /* Previously deregistered slot.  Reuse. */
  767                         goto reuse;
  768                 }
  769         }
  770 
  771         /* check if need to grow user_kfilters */
  772         if (user_kfilterc + 1 > user_kfiltermaxc) {
  773                 /* Grow in KFILTER_EXTENT chunks. */
  774                 user_kfiltermaxc += KFILTER_EXTENT;
  775                 len = user_kfiltermaxc * sizeof(*kfilter);
  776                 kfilter = kmem_alloc(len, KM_SLEEP);
  777                 memset((char *)kfilter + user_kfiltersz, 0, len - user_kfiltersz);
  778                 if (user_kfilters != NULL) {
  779                         memcpy(kfilter, user_kfilters, user_kfiltersz);
  780                         kmem_free(user_kfilters, user_kfiltersz);
  781                 }
  782                 user_kfiltersz = len;
  783                 user_kfilters = kfilter;
  784         }
  785         /* Adding new slot */
  786         kfilter = &user_kfilters[user_kfilterc++];
  787 reuse:
  788         kfilter->name = kmem_strdupsize(name, &kfilter->namelen, KM_SLEEP);
  789 
  790         kfilter->filter = (kfilter - user_kfilters) + EVFILT_SYSCOUNT;
  791 
  792         kfilter->filtops = kmem_alloc(sizeof(*filtops), KM_SLEEP);
  793         memcpy(__UNCONST(kfilter->filtops), filtops, sizeof(*filtops));
  794 
  795         if (retfilter != NULL)
  796                 *retfilter = kfilter->filter;
  797         rw_exit(&kqueue_filter_lock);
  798 
  799         return (0);
  800 }
  801 
  802 /*
  803  * Unregister a kfilter previously registered with kfilter_register.
  804  * This retains the filter id, but clears the name and frees filtops (filter
  805  * operations), so that the number isn't reused during a boot.
  806  * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
  807  */
  808 int
  809 kfilter_unregister(const char *name)
  810 {
  811         struct kfilter *kfilter;
  812 
  813         if (name == NULL || name[0] == '\0')
  814                 return (EINVAL);        /* invalid name */
  815 
  816         rw_enter(&kqueue_filter_lock, RW_WRITER);
  817         if (kfilter_byname_sys(name) != NULL) {
  818                 rw_exit(&kqueue_filter_lock);
  819                 return (EINVAL);        /* can't detach system filters */
  820         }
  821 
  822         kfilter = kfilter_byname_user(name);
  823         if (kfilter == NULL) {
  824                 rw_exit(&kqueue_filter_lock);
  825                 return (ENOENT);
  826         }
  827         if (kfilter->refcnt != 0) {
  828                 rw_exit(&kqueue_filter_lock);
  829                 return (EBUSY);
  830         }
  831 
  832         /* Cast away const (but we know it's safe. */
  833         kmem_free(__UNCONST(kfilter->name), kfilter->namelen);
  834         kfilter->name = NULL;   /* mark as `not implemented' */
  835 
  836         if (kfilter->filtops != NULL) {
  837                 /* Cast away const (but we know it's safe. */
  838                 kmem_free(__UNCONST(kfilter->filtops),
  839                     sizeof(*kfilter->filtops));
  840                 kfilter->filtops = NULL; /* mark as `not implemented' */
  841         }
  842         rw_exit(&kqueue_filter_lock);
  843 
  844         return (0);
  845 }
  846 
  847 
  848 /*
  849  * Filter attach method for EVFILT_READ and EVFILT_WRITE on normal file
  850  * descriptors. Calls fileops kqfilter method for given file descriptor.
  851  */
  852 static int
  853 filt_fileattach(struct knote *kn)
  854 {
  855         file_t *fp;
  856 
  857         fp = kn->kn_obj;
  858 
  859         return (*fp->f_ops->fo_kqfilter)(fp, kn);
  860 }
  861 
  862 /*
  863  * Filter detach method for EVFILT_READ on kqueue descriptor.
  864  */
  865 static void
  866 filt_kqdetach(struct knote *kn)
  867 {
  868         struct kqueue *kq;
  869 
  870         kq = ((file_t *)kn->kn_obj)->f_kqueue;
  871 
  872         mutex_spin_enter(&kq->kq_lock);
  873         selremove_knote(&kq->kq_sel, kn);
  874         mutex_spin_exit(&kq->kq_lock);
  875 }
  876 
  877 /*
  878  * Filter event method for EVFILT_READ on kqueue descriptor.
  879  */
  880 /*ARGSUSED*/
  881 static int
  882 filt_kqueue(struct knote *kn, long hint)
  883 {
  884         struct kqueue *kq;
  885         int rv;
  886 
  887         kq = ((file_t *)kn->kn_obj)->f_kqueue;
  888 
  889         if (hint != NOTE_SUBMIT)
  890                 mutex_spin_enter(&kq->kq_lock);
  891         kn->kn_data = KQ_COUNT(kq);
  892         rv = (kn->kn_data > 0);
  893         if (hint != NOTE_SUBMIT)
  894                 mutex_spin_exit(&kq->kq_lock);
  895 
  896         return rv;
  897 }
  898 
  899 /*
  900  * Filter attach method for EVFILT_PROC.
  901  */
  902 static int
  903 filt_procattach(struct knote *kn)
  904 {
  905         struct proc *p;
  906 
  907         mutex_enter(&proc_lock);
  908         p = proc_find(kn->kn_id);
  909         if (p == NULL) {
  910                 mutex_exit(&proc_lock);
  911                 return ESRCH;
  912         }
  913 
  914         /*
  915          * Fail if it's not owned by you, or the last exec gave us
  916          * setuid/setgid privs (unless you're root).
  917          */
  918         mutex_enter(p->p_lock);
  919         mutex_exit(&proc_lock);
  920         if (kauth_authorize_process(curlwp->l_cred,
  921             KAUTH_PROCESS_KEVENT_FILTER, p, NULL, NULL, NULL) != 0) {
  922                 mutex_exit(p->p_lock);
  923                 return EACCES;
  924         }
  925 
  926         kn->kn_obj = p;
  927         kn->kn_flags |= EV_CLEAR;       /* automatically set */
  928 
  929         /*
  930          * NOTE_CHILD is only ever generated internally; don't let it
  931          * leak in from user-space.  See knote_proc_fork_track().
  932          */
  933         kn->kn_sfflags &= ~NOTE_CHILD;
  934 
  935         klist_insert(&p->p_klist, kn);
  936         mutex_exit(p->p_lock);
  937 
  938         return 0;
  939 }
  940 
  941 /*
  942  * Filter detach method for EVFILT_PROC.
  943  *
  944  * The knote may be attached to a different process, which may exit,
  945  * leaving nothing for the knote to be attached to.  So when the process
  946  * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
  947  * it will be deleted when read out.  However, as part of the knote deletion,
  948  * this routine is called, so a check is needed to avoid actually performing
  949  * a detach, because the original process might not exist any more.
  950  */
  951 static void
  952 filt_procdetach(struct knote *kn)
  953 {
  954         struct kqueue *kq = kn->kn_kq;
  955         struct proc *p;
  956 
  957         /*
  958          * We have to synchronize with knote_proc_exit(), but we
  959          * are forced to acquire the locks in the wrong order here
  960          * because we can't be sure kn->kn_obj is valid unless
  961          * KN_DETACHED is not set.
  962          */
  963  again:
  964         mutex_spin_enter(&kq->kq_lock);
  965         if ((kn->kn_status & KN_DETACHED) == 0) {
  966                 p = kn->kn_obj;
  967                 if (!mutex_tryenter(p->p_lock)) {
  968                         mutex_spin_exit(&kq->kq_lock);
  969                         preempt_point();
  970                         goto again;
  971                 }
  972                 kn->kn_status |= KN_DETACHED;
  973                 klist_remove(&p->p_klist, kn);
  974                 mutex_exit(p->p_lock);
  975         }
  976         mutex_spin_exit(&kq->kq_lock);
  977 }
  978 
  979 /*
  980  * Filter event method for EVFILT_PROC.
  981  *
  982  * Due to some of the complexities of process locking, we have special
  983  * entry points for delivering knote submissions.  filt_proc() is used
  984  * only to check for activation from kqueue_register() and kqueue_scan().
  985  */
  986 static int
  987 filt_proc(struct knote *kn, long hint)
  988 {
  989         struct kqueue *kq = kn->kn_kq;
  990         uint32_t fflags;
  991 
  992         /*
  993          * Because we share the same klist with signal knotes, just
  994          * ensure that we're not being invoked for the proc-related
  995          * submissions.
  996          */
  997         KASSERT((hint & (NOTE_EXEC | NOTE_EXIT | NOTE_FORK)) == 0);
  998 
  999         mutex_spin_enter(&kq->kq_lock);
 1000         fflags = kn->kn_fflags;
 1001         mutex_spin_exit(&kq->kq_lock);
 1002 
 1003         return fflags != 0;
 1004 }
 1005 
 1006 void
 1007 knote_proc_exec(struct proc *p)
 1008 {
 1009         struct knote *kn, *tmpkn;
 1010         struct kqueue *kq;
 1011         uint32_t fflags;
 1012 
 1013         mutex_enter(p->p_lock);
 1014 
 1015         SLIST_FOREACH_SAFE(kn, &p->p_klist, kn_selnext, tmpkn) {
 1016                 /* N.B. EVFILT_SIGNAL knotes are on this same list. */
 1017                 if (kn->kn_fop == &sig_filtops) {
 1018                         continue;
 1019                 }
 1020                 KASSERT(kn->kn_fop == &proc_filtops);
 1021 
 1022                 kq = kn->kn_kq;
 1023                 mutex_spin_enter(&kq->kq_lock);
 1024                 fflags = (kn->kn_fflags |= (kn->kn_sfflags & NOTE_EXEC));
 1025                 if (fflags) {
 1026                         knote_activate_locked(kn);
 1027                 }
 1028                 mutex_spin_exit(&kq->kq_lock);
 1029         }
 1030 
 1031         mutex_exit(p->p_lock);
 1032 }
 1033 
 1034 static int __noinline
 1035 knote_proc_fork_track(struct proc *p1, struct proc *p2, struct knote *okn)
 1036 {
 1037         struct kqueue *kq = okn->kn_kq;
 1038 
 1039         KASSERT(mutex_owned(&kq->kq_lock));
 1040         KASSERT(mutex_owned(p1->p_lock));
 1041 
 1042         /*
 1043          * We're going to put this knote into flux while we drop
 1044          * the locks and create and attach a new knote to track the
 1045          * child.  If we are not able to enter flux, then this knote
 1046          * is about to go away, so skip the notification.
 1047          */
 1048         if (!kn_enter_flux(okn)) {
 1049                 return 0;
 1050         }
 1051 
 1052         mutex_spin_exit(&kq->kq_lock);
 1053         mutex_exit(p1->p_lock);
 1054 
 1055         /*
 1056          * We actually have to register *two* new knotes:
 1057          *
 1058          * ==> One for the NOTE_CHILD notification.  This is a forced
 1059          *     ONESHOT note.
 1060          *
 1061          * ==> One to actually track the child process as it subsequently
 1062          *     forks, execs, and, ultimately, exits.
 1063          *
 1064          * If we only register a single knote, then it's possible for
 1065          * for the NOTE_CHILD and NOTE_EXIT to be collapsed into a single
 1066          * notification if the child exits before the tracking process
 1067          * has received the NOTE_CHILD notification, which applications
 1068          * aren't expecting (the event's 'data' field would be clobbered,
 1069          * for example).
 1070          *
 1071          * To do this, what we have here is an **extremely** stripped-down
 1072          * version of kqueue_register() that has the following properties:
 1073          *
 1074          * ==> Does not block to allocate memory.  If we are unable
 1075          *     to allocate memory, we return ENOMEM.
 1076          *
 1077          * ==> Does not search for existing knotes; we know there
 1078          *     are not any because this is a new process that isn't
 1079          *     even visible to other processes yet.
 1080          *
 1081          * ==> Assumes that the knhash for our kq's descriptor table
 1082          *     already exists (after all, we're already tracking
 1083          *     processes with knotes if we got here).
 1084          *
 1085          * ==> Directly attaches the new tracking knote to the child
 1086          *     process.
 1087          *
 1088          * The whole point is to do the minimum amount of work while the
 1089          * knote is held in-flux, and to avoid doing extra work in general
 1090          * (we already have the new child process; why bother looking it
 1091          * up again?).
 1092          */
 1093         filedesc_t *fdp = kq->kq_fdp;
 1094         struct knote *knchild, *kntrack;
 1095         int error = 0;
 1096 
 1097         knchild = knote_alloc(false);
 1098         kntrack = knote_alloc(false);
 1099         if (__predict_false(knchild == NULL || kntrack == NULL)) {
 1100                 error = ENOMEM;
 1101                 goto out;
 1102         }
 1103 
 1104         kntrack->kn_obj = p2;
 1105         kntrack->kn_id = p2->p_pid;
 1106         kntrack->kn_kq = kq;
 1107         kntrack->kn_fop = okn->kn_fop;
 1108         kntrack->kn_kfilter = okn->kn_kfilter;
 1109         kntrack->kn_sfflags = okn->kn_sfflags;
 1110         kntrack->kn_sdata = p1->p_pid;
 1111 
 1112         kntrack->kn_kevent.ident = p2->p_pid;
 1113         kntrack->kn_kevent.filter = okn->kn_filter;
 1114         kntrack->kn_kevent.flags =
 1115             okn->kn_flags | EV_ADD | EV_ENABLE | EV_CLEAR;
 1116         kntrack->kn_kevent.fflags = 0;
 1117         kntrack->kn_kevent.data = 0;
 1118         kntrack->kn_kevent.udata = okn->kn_kevent.udata; /* preserve udata */
 1119 
 1120         /*
 1121          * The child note does not need to be attached to the
 1122          * new proc's klist at all.
 1123          */
 1124         *knchild = *kntrack;
 1125         knchild->kn_status = KN_DETACHED;
 1126         knchild->kn_sfflags = 0;
 1127         knchild->kn_kevent.flags |= EV_ONESHOT;
 1128         knchild->kn_kevent.fflags = NOTE_CHILD;
 1129         knchild->kn_kevent.data = p1->p_pid;             /* parent */
 1130 
 1131         mutex_enter(&fdp->fd_lock);
 1132 
 1133         /*
 1134          * We need to check to see if the kq is closing, and skip
 1135          * attaching the knote if so.  Normally, this isn't necessary
 1136          * when coming in the front door because the file descriptor
 1137          * layer will synchronize this.
 1138          *
 1139          * It's safe to test KQ_CLOSING without taking the kq_lock
 1140          * here because that flag is only ever set when the fd_lock
 1141          * is also held.
 1142          */
 1143         if (__predict_false(kq->kq_count & KQ_CLOSING)) {
 1144                 mutex_exit(&fdp->fd_lock);
 1145                 goto out;
 1146         }
 1147 
 1148         /*
 1149          * We do the "insert into FD table" and "attach to klist" steps
 1150          * in the opposite order of kqueue_register() here to avoid
 1151          * having to take p2->p_lock twice.  But this is OK because we
 1152          * hold fd_lock across the entire operation.
 1153          */
 1154 
 1155         mutex_enter(p2->p_lock);
 1156         error = kauth_authorize_process(curlwp->l_cred,
 1157             KAUTH_PROCESS_KEVENT_FILTER, p2, NULL, NULL, NULL);
 1158         if (__predict_false(error != 0)) {
 1159                 mutex_exit(p2->p_lock);
 1160                 mutex_exit(&fdp->fd_lock);
 1161                 error = EACCES;
 1162                 goto out;
 1163         }
 1164         klist_insert(&p2->p_klist, kntrack);
 1165         mutex_exit(p2->p_lock);
 1166 
 1167         KASSERT(fdp->fd_knhashmask != 0);
 1168         KASSERT(fdp->fd_knhash != NULL);
 1169         struct klist *list = &fdp->fd_knhash[KN_HASH(kntrack->kn_id,
 1170             fdp->fd_knhashmask)];
 1171         SLIST_INSERT_HEAD(list, kntrack, kn_link);
 1172         SLIST_INSERT_HEAD(list, knchild, kn_link);
 1173 
 1174         /* This adds references for knchild *and* kntrack. */
 1175         atomic_add_int(&kntrack->kn_kfilter->refcnt, 2);
 1176 
 1177         knote_activate(knchild);
 1178 
 1179         kntrack = NULL;
 1180         knchild = NULL;
 1181 
 1182         mutex_exit(&fdp->fd_lock);
 1183 
 1184  out:
 1185         if (__predict_false(knchild != NULL)) {
 1186                 knote_free(knchild);
 1187         }
 1188         if (__predict_false(kntrack != NULL)) {
 1189                 knote_free(kntrack);
 1190         }
 1191         mutex_enter(p1->p_lock);
 1192         mutex_spin_enter(&kq->kq_lock);
 1193 
 1194         if (kn_leave_flux(okn)) {
 1195                 KQ_FLUX_WAKEUP(kq);
 1196         }
 1197 
 1198         return error;
 1199 }
 1200 
 1201 void
 1202 knote_proc_fork(struct proc *p1, struct proc *p2)
 1203 {
 1204         struct knote *kn;
 1205         struct kqueue *kq;
 1206         uint32_t fflags;
 1207 
 1208         mutex_enter(p1->p_lock);
 1209 
 1210         /*
 1211          * N.B. We DO NOT use SLIST_FOREACH_SAFE() here because we
 1212          * don't want to pre-fetch the next knote; in the event we
 1213          * have to drop p_lock, we will have put the knote in-flux,
 1214          * meaning that no one will be able to detach it until we
 1215          * have taken the knote out of flux.  However, that does
 1216          * NOT stop someone else from detaching the next note in the
 1217          * list while we have it unlocked.  Thus, we want to fetch
 1218          * the next note in the list only after we have re-acquired
 1219          * the lock, and using SLIST_FOREACH() will satisfy that.
 1220          */
 1221         SLIST_FOREACH(kn, &p1->p_klist, kn_selnext) {
 1222                 /* N.B. EVFILT_SIGNAL knotes are on this same list. */
 1223                 if (kn->kn_fop == &sig_filtops) {
 1224                         continue;
 1225                 }
 1226                 KASSERT(kn->kn_fop == &proc_filtops);
 1227 
 1228                 kq = kn->kn_kq;
 1229                 mutex_spin_enter(&kq->kq_lock);
 1230                 kn->kn_fflags |= (kn->kn_sfflags & NOTE_FORK);
 1231                 if (__predict_false(kn->kn_sfflags & NOTE_TRACK)) {
 1232                         /*
 1233                          * This will drop kq_lock and p_lock and
 1234                          * re-acquire them before it returns.
 1235                          */
 1236                         if (knote_proc_fork_track(p1, p2, kn)) {
 1237                                 kn->kn_fflags |= NOTE_TRACKERR;
 1238                         }
 1239                         KASSERT(mutex_owned(p1->p_lock));
 1240                         KASSERT(mutex_owned(&kq->kq_lock));
 1241                 }
 1242                 fflags = kn->kn_fflags;
 1243                 if (fflags) {
 1244                         knote_activate_locked(kn);
 1245                 }
 1246                 mutex_spin_exit(&kq->kq_lock);
 1247         }
 1248 
 1249         mutex_exit(p1->p_lock);
 1250 }
 1251 
 1252 void
 1253 knote_proc_exit(struct proc *p)
 1254 {
 1255         struct knote *kn;
 1256         struct kqueue *kq;
 1257 
 1258         KASSERT(mutex_owned(p->p_lock));
 1259 
 1260         while (!SLIST_EMPTY(&p->p_klist)) {
 1261                 kn = SLIST_FIRST(&p->p_klist);
 1262                 kq = kn->kn_kq;
 1263 
 1264                 KASSERT(kn->kn_obj == p);
 1265 
 1266                 mutex_spin_enter(&kq->kq_lock);
 1267                 kn->kn_data = P_WAITSTATUS(p);
 1268                 /*
 1269                  * Mark as ONESHOT, so that the knote is g/c'ed
 1270                  * when read.
 1271                  */
 1272                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 1273                 kn->kn_fflags |= kn->kn_sfflags & NOTE_EXIT;
 1274 
 1275                 /*
 1276                  * Detach the knote from the process and mark it as such.
 1277                  * N.B. EVFILT_SIGNAL are also on p_klist, but by the
 1278                  * time we get here, all open file descriptors for this
 1279                  * process have been released, meaning that signal knotes
 1280                  * will have already been detached.
 1281                  *
 1282                  * We need to synchronize this with filt_procdetach().
 1283                  */
 1284                 KASSERT(kn->kn_fop == &proc_filtops);
 1285                 if ((kn->kn_status & KN_DETACHED) == 0) {
 1286                         kn->kn_status |= KN_DETACHED;
 1287                         SLIST_REMOVE_HEAD(&p->p_klist, kn_selnext);
 1288                 }
 1289 
 1290                 /*
 1291                  * Always activate the knote for NOTE_EXIT regardless
 1292                  * of whether or not the listener cares about it.
 1293                  * This matches historical behavior.
 1294                  */
 1295                 knote_activate_locked(kn);
 1296                 mutex_spin_exit(&kq->kq_lock);
 1297         }
 1298 }
 1299 
 1300 #define FILT_TIMER_NOSCHED      ((uintptr_t)-1)
 1301 
 1302 static int
 1303 filt_timercompute(struct kevent *kev, uintptr_t *tticksp)
 1304 {
 1305         struct timespec ts;
 1306         uintptr_t tticks;
 1307 
 1308         if (kev->fflags & ~(NOTE_TIMER_UNITMASK | NOTE_ABSTIME)) {
 1309                 return EINVAL;
 1310         }
 1311 
 1312         /*
 1313          * Convert the event 'data' to a timespec, then convert the
 1314          * timespec to callout ticks.
 1315          */
 1316         switch (kev->fflags & NOTE_TIMER_UNITMASK) {
 1317         case NOTE_SECONDS:
 1318                 ts.tv_sec = kev->data;
 1319                 ts.tv_nsec = 0;
 1320                 break;
 1321 
 1322         case NOTE_MSECONDS:             /* == historical value 0 */
 1323                 ts.tv_sec = kev->data / 1000;
 1324                 ts.tv_nsec = (kev->data % 1000) * 1000000;
 1325                 break;
 1326 
 1327         case NOTE_USECONDS:
 1328                 ts.tv_sec = kev->data / 1000000;
 1329                 ts.tv_nsec = (kev->data % 1000000) * 1000;
 1330                 break;
 1331 
 1332         case NOTE_NSECONDS:
 1333                 ts.tv_sec = kev->data / 1000000000;
 1334                 ts.tv_nsec = kev->data % 1000000000;
 1335                 break;
 1336 
 1337         default:
 1338                 return EINVAL;
 1339         }
 1340 
 1341         if (kev->fflags & NOTE_ABSTIME) {
 1342                 struct timespec deadline = ts;
 1343 
 1344                 /*
 1345                  * Get current time.
 1346                  *
 1347                  * XXX This is CLOCK_REALTIME.  There is no way to
 1348                  * XXX specify CLOCK_MONOTONIC.
 1349                  */
 1350                 nanotime(&ts);
 1351 
 1352                 /* Absolute timers do not repeat. */
 1353                 kev->data = FILT_TIMER_NOSCHED;
 1354 
 1355                 /* If we're past the deadline, then the event will fire. */
 1356                 if (timespeccmp(&deadline, &ts, <=)) {
 1357                         tticks = FILT_TIMER_NOSCHED;
 1358                         goto out;
 1359                 }
 1360 
 1361                 /* Calculate how much time is left. */
 1362                 timespecsub(&deadline, &ts, &ts);
 1363         } else {
 1364                 /* EV_CLEAR automatically set for relative timers. */
 1365                 kev->flags |= EV_CLEAR;
 1366         }
 1367 
 1368         tticks = tstohz(&ts);
 1369 
 1370         /* if the supplied value is under our resolution, use 1 tick */
 1371         if (tticks == 0) {
 1372                 if (kev->data == 0)
 1373                         return EINVAL;
 1374                 tticks = 1;
 1375         } else if (tticks > INT_MAX) {
 1376                 return EINVAL;
 1377         }
 1378 
 1379         if ((kev->flags & EV_ONESHOT) != 0) {
 1380                 /* Timer does not repeat. */
 1381                 kev->data = FILT_TIMER_NOSCHED;
 1382         } else {
 1383                 KASSERT((uintptr_t)tticks != FILT_TIMER_NOSCHED);
 1384                 kev->data = tticks;
 1385         }
 1386 
 1387  out:
 1388         *tticksp = tticks;
 1389 
 1390         return 0;
 1391 }
 1392 
 1393 static void
 1394 filt_timerexpire(void *knx)
 1395 {
 1396         struct knote *kn = knx;
 1397         struct kqueue *kq = kn->kn_kq;
 1398 
 1399         mutex_spin_enter(&kq->kq_lock);
 1400         kn->kn_data++;
 1401         knote_activate_locked(kn);
 1402         if (kn->kn_sdata != FILT_TIMER_NOSCHED) {
 1403                 KASSERT(kn->kn_sdata > 0 && kn->kn_sdata <= INT_MAX);
 1404                 callout_schedule((callout_t *)kn->kn_hook,
 1405                     (int)kn->kn_sdata);
 1406         }
 1407         mutex_spin_exit(&kq->kq_lock);
 1408 }
 1409 
 1410 static inline void
 1411 filt_timerstart(struct knote *kn, uintptr_t tticks)
 1412 {
 1413         callout_t *calloutp = kn->kn_hook;
 1414 
 1415         KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
 1416         KASSERT(!callout_pending(calloutp));
 1417 
 1418         if (__predict_false(tticks == FILT_TIMER_NOSCHED)) {
 1419                 kn->kn_data = 1;
 1420         } else {
 1421                 KASSERT(tticks <= INT_MAX);
 1422                 callout_reset(calloutp, (int)tticks, filt_timerexpire, kn);
 1423         }
 1424 }
 1425 
 1426 static int
 1427 filt_timerattach(struct knote *kn)
 1428 {
 1429         callout_t *calloutp;
 1430         struct kqueue *kq;
 1431         uintptr_t tticks;
 1432         int error;
 1433 
 1434         struct kevent kev = {
 1435                 .flags = kn->kn_flags,
 1436                 .fflags = kn->kn_sfflags,
 1437                 .data = kn->kn_sdata,
 1438         };
 1439 
 1440         error = filt_timercompute(&kev, &tticks);
 1441         if (error) {
 1442                 return error;
 1443         }
 1444 
 1445         if (atomic_inc_uint_nv(&kq_ncallouts) >= kq_calloutmax ||
 1446             (calloutp = kmem_alloc(sizeof(*calloutp), KM_NOSLEEP)) == NULL) {
 1447                 atomic_dec_uint(&kq_ncallouts);
 1448                 return ENOMEM;
 1449         }
 1450         callout_init(calloutp, CALLOUT_MPSAFE);
 1451 
 1452         kq = kn->kn_kq;
 1453         mutex_spin_enter(&kq->kq_lock);
 1454 
 1455         kn->kn_sdata = kev.data;
 1456         kn->kn_flags = kev.flags;
 1457         KASSERT(kn->kn_sfflags == kev.fflags);
 1458         kn->kn_hook = calloutp;
 1459 
 1460         filt_timerstart(kn, tticks);
 1461 
 1462         mutex_spin_exit(&kq->kq_lock);
 1463 
 1464         return (0);
 1465 }
 1466 
 1467 static void
 1468 filt_timerdetach(struct knote *kn)
 1469 {
 1470         callout_t *calloutp;
 1471         struct kqueue *kq = kn->kn_kq;
 1472 
 1473         /* prevent rescheduling when we expire */
 1474         mutex_spin_enter(&kq->kq_lock);
 1475         kn->kn_sdata = FILT_TIMER_NOSCHED;
 1476         mutex_spin_exit(&kq->kq_lock);
 1477 
 1478         calloutp = (callout_t *)kn->kn_hook;
 1479 
 1480         /*
 1481          * Attempt to stop the callout.  This will block if it's
 1482          * already running.
 1483          */
 1484         callout_halt(calloutp, NULL);
 1485 
 1486         callout_destroy(calloutp);
 1487         kmem_free(calloutp, sizeof(*calloutp));
 1488         atomic_dec_uint(&kq_ncallouts);
 1489 }
 1490 
 1491 static int
 1492 filt_timertouch(struct knote *kn, struct kevent *kev, long type)
 1493 {
 1494         struct kqueue *kq = kn->kn_kq;
 1495         callout_t *calloutp;
 1496         uintptr_t tticks;
 1497         int error;
 1498 
 1499         KASSERT(mutex_owned(&kq->kq_lock));
 1500 
 1501         switch (type) {
 1502         case EVENT_REGISTER:
 1503                 /* Only relevant for EV_ADD. */
 1504                 if ((kev->flags & EV_ADD) == 0) {
 1505                         return 0;
 1506                 }
 1507 
 1508                 /*
 1509                  * Stop the timer, under the assumption that if
 1510                  * an application is re-configuring the timer,
 1511                  * they no longer care about the old one.  We
 1512                  * can safely drop the kq_lock while we wait
 1513                  * because fdp->fd_lock will be held throughout,
 1514                  * ensuring that no one can sneak in with an
 1515                  * EV_DELETE or close the kq.
 1516                  */
 1517                 KASSERT(mutex_owned(&kq->kq_fdp->fd_lock));
 1518 
 1519                 calloutp = kn->kn_hook;
 1520                 callout_halt(calloutp, &kq->kq_lock);
 1521                 KASSERT(mutex_owned(&kq->kq_lock));
 1522                 knote_deactivate_locked(kn);
 1523                 kn->kn_data = 0;
 1524 
 1525                 error = filt_timercompute(kev, &tticks);
 1526                 if (error) {
 1527                         return error;
 1528                 }
 1529                 kn->kn_sdata = kev->data;
 1530                 kn->kn_flags = kev->flags;
 1531                 kn->kn_sfflags = kev->fflags;
 1532                 filt_timerstart(kn, tticks);
 1533                 break;
 1534 
 1535         case EVENT_PROCESS:
 1536                 *kev = kn->kn_kevent;
 1537                 break;
 1538 
 1539         default:
 1540                 panic("%s: invalid type (%ld)", __func__, type);
 1541         }
 1542 
 1543         return 0;
 1544 }
 1545 
 1546 static int
 1547 filt_timer(struct knote *kn, long hint)
 1548 {
 1549         struct kqueue *kq = kn->kn_kq;
 1550         int rv;
 1551 
 1552         mutex_spin_enter(&kq->kq_lock);
 1553         rv = (kn->kn_data != 0);
 1554         mutex_spin_exit(&kq->kq_lock);
 1555 
 1556         return rv;
 1557 }
 1558 
 1559 static int
 1560 filt_userattach(struct knote *kn)
 1561 {
 1562         struct kqueue *kq = kn->kn_kq;
 1563 
 1564         /*
 1565          * EVFILT_USER knotes are not attached to anything in the kernel.
 1566          */
 1567         mutex_spin_enter(&kq->kq_lock);
 1568         kn->kn_hook = NULL;
 1569         if (kn->kn_fflags & NOTE_TRIGGER)
 1570                 kn->kn_hookid = 1;
 1571         else
 1572                 kn->kn_hookid = 0;
 1573         mutex_spin_exit(&kq->kq_lock);
 1574         return (0);
 1575 }
 1576 
 1577 static void
 1578 filt_userdetach(struct knote *kn)
 1579 {
 1580 
 1581         /*
 1582          * EVFILT_USER knotes are not attached to anything in the kernel.
 1583          */
 1584 }
 1585 
 1586 static int
 1587 filt_user(struct knote *kn, long hint)
 1588 {
 1589         struct kqueue *kq = kn->kn_kq;
 1590         int hookid;
 1591 
 1592         mutex_spin_enter(&kq->kq_lock);
 1593         hookid = kn->kn_hookid;
 1594         mutex_spin_exit(&kq->kq_lock);
 1595 
 1596         return hookid;
 1597 }
 1598 
 1599 static int
 1600 filt_usertouch(struct knote *kn, struct kevent *kev, long type)
 1601 {
 1602         int ffctrl;
 1603 
 1604         KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
 1605 
 1606         switch (type) {
 1607         case EVENT_REGISTER:
 1608                 if (kev->fflags & NOTE_TRIGGER)
 1609                         kn->kn_hookid = 1;
 1610 
 1611                 ffctrl = kev->fflags & NOTE_FFCTRLMASK;
 1612                 kev->fflags &= NOTE_FFLAGSMASK;
 1613                 switch (ffctrl) {
 1614                 case NOTE_FFNOP:
 1615                         break;
 1616 
 1617                 case NOTE_FFAND:
 1618                         kn->kn_sfflags &= kev->fflags;
 1619                         break;
 1620 
 1621                 case NOTE_FFOR:
 1622                         kn->kn_sfflags |= kev->fflags;
 1623                         break;
 1624 
 1625                 case NOTE_FFCOPY:
 1626                         kn->kn_sfflags = kev->fflags;
 1627                         break;
 1628 
 1629                 default:
 1630                         /* XXX Return error? */
 1631                         break;
 1632                 }
 1633                 kn->kn_sdata = kev->data;
 1634                 if (kev->flags & EV_CLEAR) {
 1635                         kn->kn_hookid = 0;
 1636                         kn->kn_data = 0;
 1637                         kn->kn_fflags = 0;
 1638                 }
 1639                 break;
 1640 
 1641         case EVENT_PROCESS:
 1642                 *kev = kn->kn_kevent;
 1643                 kev->fflags = kn->kn_sfflags;
 1644                 kev->data = kn->kn_sdata;
 1645                 if (kn->kn_flags & EV_CLEAR) {
 1646                         kn->kn_hookid = 0;
 1647                         kn->kn_data = 0;
 1648                         kn->kn_fflags = 0;
 1649                 }
 1650                 break;
 1651 
 1652         default:
 1653                 panic("filt_usertouch() - invalid type (%ld)", type);
 1654                 break;
 1655         }
 1656 
 1657         return 0;
 1658 }
 1659 
 1660 /*
 1661  * filt_seltrue:
 1662  *
 1663  *      This filter "event" routine simulates seltrue().
 1664  */
 1665 int
 1666 filt_seltrue(struct knote *kn, long hint)
 1667 {
 1668 
 1669         /*
 1670          * We don't know how much data can be read/written,
 1671          * but we know that it *can* be.  This is about as
 1672          * good as select/poll does as well.
 1673          */
 1674         kn->kn_data = 0;
 1675         return (1);
 1676 }
 1677 
 1678 /*
 1679  * This provides full kqfilter entry for device switch tables, which
 1680  * has same effect as filter using filt_seltrue() as filter method.
 1681  */
 1682 static void
 1683 filt_seltruedetach(struct knote *kn)
 1684 {
 1685         /* Nothing to do */
 1686 }
 1687 
 1688 const struct filterops seltrue_filtops = {
 1689         .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
 1690         .f_attach = NULL,
 1691         .f_detach = filt_seltruedetach,
 1692         .f_event = filt_seltrue,
 1693 };
 1694 
 1695 int
 1696 seltrue_kqfilter(dev_t dev, struct knote *kn)
 1697 {
 1698         switch (kn->kn_filter) {
 1699         case EVFILT_READ:
 1700         case EVFILT_WRITE:
 1701                 kn->kn_fop = &seltrue_filtops;
 1702                 break;
 1703         default:
 1704                 return (EINVAL);
 1705         }
 1706 
 1707         /* Nothing more to do */
 1708         return (0);
 1709 }
 1710 
 1711 /*
 1712  * kqueue(2) system call.
 1713  */
 1714 static int
 1715 kqueue1(struct lwp *l, int flags, register_t *retval)
 1716 {
 1717         struct kqueue *kq;
 1718         file_t *fp;
 1719         int fd, error;
 1720 
 1721         if ((error = fd_allocfile(&fp, &fd)) != 0)
 1722                 return error;
 1723         fp->f_flag = FREAD | FWRITE | (flags & (FNONBLOCK|FNOSIGPIPE));
 1724         fp->f_type = DTYPE_KQUEUE;
 1725         fp->f_ops = &kqueueops;
 1726         kq = kmem_zalloc(sizeof(*kq), KM_SLEEP);
 1727         mutex_init(&kq->kq_lock, MUTEX_DEFAULT, IPL_SCHED);
 1728         cv_init(&kq->kq_cv, "kqueue");
 1729         selinit(&kq->kq_sel);
 1730         TAILQ_INIT(&kq->kq_head);
 1731         fp->f_kqueue = kq;
 1732         *retval = fd;
 1733         kq->kq_fdp = curlwp->l_fd;
 1734         fd_set_exclose(l, fd, (flags & O_CLOEXEC) != 0);
 1735         fd_affix(curproc, fp, fd);
 1736         return error;
 1737 }
 1738 
 1739 /*
 1740  * kqueue(2) system call.
 1741  */
 1742 int
 1743 sys_kqueue(struct lwp *l, const void *v, register_t *retval)
 1744 {
 1745         return kqueue1(l, 0, retval);
 1746 }
 1747 
 1748 int
 1749 sys_kqueue1(struct lwp *l, const struct sys_kqueue1_args *uap,
 1750     register_t *retval)
 1751 {
 1752         /* {
 1753                 syscallarg(int) flags;
 1754         } */
 1755         return kqueue1(l, SCARG(uap, flags), retval);
 1756 }
 1757 
 1758 /*
 1759  * kevent(2) system call.
 1760  */
 1761 int
 1762 kevent_fetch_changes(void *ctx, const struct kevent *changelist,
 1763     struct kevent *changes, size_t index, int n)
 1764 {
 1765 
 1766         return copyin(changelist + index, changes, n * sizeof(*changes));
 1767 }
 1768 
 1769 int
 1770 kevent_put_events(void *ctx, struct kevent *events,
 1771     struct kevent *eventlist, size_t index, int n)
 1772 {
 1773 
 1774         return copyout(events, eventlist + index, n * sizeof(*events));
 1775 }
 1776 
 1777 static const struct kevent_ops kevent_native_ops = {
 1778         .keo_private = NULL,
 1779         .keo_fetch_timeout = copyin,
 1780         .keo_fetch_changes = kevent_fetch_changes,
 1781         .keo_put_events = kevent_put_events,
 1782 };
 1783 
 1784 int
 1785 sys___kevent50(struct lwp *l, const struct sys___kevent50_args *uap,
 1786     register_t *retval)
 1787 {
 1788         /* {
 1789                 syscallarg(int) fd;
 1790                 syscallarg(const struct kevent *) changelist;
 1791                 syscallarg(size_t) nchanges;
 1792                 syscallarg(struct kevent *) eventlist;
 1793                 syscallarg(size_t) nevents;
 1794                 syscallarg(const struct timespec *) timeout;
 1795         } */
 1796 
 1797         return kevent1(retval, SCARG(uap, fd), SCARG(uap, changelist),
 1798             SCARG(uap, nchanges), SCARG(uap, eventlist), SCARG(uap, nevents),
 1799             SCARG(uap, timeout), &kevent_native_ops);
 1800 }
 1801 
 1802 int
 1803 kevent1(register_t *retval, int fd,
 1804         const struct kevent *changelist, size_t nchanges,
 1805         struct kevent *eventlist, size_t nevents,
 1806         const struct timespec *timeout,
 1807         const struct kevent_ops *keops)
 1808 {
 1809         struct kevent *kevp;
 1810         struct kqueue *kq;
 1811         struct timespec ts;
 1812         size_t i, n, ichange;
 1813         int nerrors, error;
 1814         struct kevent kevbuf[KQ_NEVENTS];       /* approx 300 bytes on 64-bit */
 1815         file_t *fp;
 1816 
 1817         /* check that we're dealing with a kq */
 1818         fp = fd_getfile(fd);
 1819         if (fp == NULL)
 1820                 return (EBADF);
 1821 
 1822         if (fp->f_type != DTYPE_KQUEUE) {
 1823                 fd_putfile(fd);
 1824                 return (EBADF);
 1825         }
 1826 
 1827         if (timeout != NULL) {
 1828                 error = (*keops->keo_fetch_timeout)(timeout, &ts, sizeof(ts));
 1829                 if (error)
 1830                         goto done;
 1831                 timeout = &ts;
 1832         }
 1833 
 1834         kq = fp->f_kqueue;
 1835         nerrors = 0;
 1836         ichange = 0;
 1837 
 1838         /* traverse list of events to register */
 1839         while (nchanges > 0) {
 1840                 n = MIN(nchanges, __arraycount(kevbuf));
 1841                 error = (*keops->keo_fetch_changes)(keops->keo_private,
 1842                     changelist, kevbuf, ichange, n);
 1843                 if (error)
 1844                         goto done;
 1845                 for (i = 0; i < n; i++) {
 1846                         kevp = &kevbuf[i];
 1847                         kevp->flags &= ~EV_SYSFLAGS;
 1848                         /* register each knote */
 1849                         error = kqueue_register(kq, kevp);
 1850                         if (!error && !(kevp->flags & EV_RECEIPT))
 1851                                 continue;
 1852                         if (nevents == 0)
 1853                                 goto done;
 1854                         kevp->flags = EV_ERROR;
 1855                         kevp->data = error;
 1856                         error = (*keops->keo_put_events)
 1857                                 (keops->keo_private, kevp,
 1858                                  eventlist, nerrors, 1);
 1859                         if (error)
 1860                                 goto done;
 1861                         nevents--;
 1862                         nerrors++;
 1863                 }
 1864                 nchanges -= n;  /* update the results */
 1865                 ichange += n;
 1866         }
 1867         if (nerrors) {
 1868                 *retval = nerrors;
 1869                 error = 0;
 1870                 goto done;
 1871         }
 1872 
 1873         /* actually scan through the events */
 1874         error = kqueue_scan(fp, nevents, eventlist, timeout, retval, keops,
 1875             kevbuf, __arraycount(kevbuf));
 1876  done:
 1877         fd_putfile(fd);
 1878         return (error);
 1879 }
 1880 
 1881 /*
 1882  * Register a given kevent kev onto the kqueue
 1883  */
 1884 static int
 1885 kqueue_register(struct kqueue *kq, struct kevent *kev)
 1886 {
 1887         struct kfilter *kfilter;
 1888         filedesc_t *fdp;
 1889         file_t *fp;
 1890         fdfile_t *ff;
 1891         struct knote *kn, *newkn;
 1892         struct klist *list;
 1893         int error, fd, rv;
 1894 
 1895         fdp = kq->kq_fdp;
 1896         fp = NULL;
 1897         kn = NULL;
 1898         error = 0;
 1899         fd = 0;
 1900 
 1901         newkn = knote_alloc(true);
 1902 
 1903         rw_enter(&kqueue_filter_lock, RW_READER);
 1904         kfilter = kfilter_byfilter(kev->filter);
 1905         if (kfilter == NULL || kfilter->filtops == NULL) {
 1906                 /* filter not found nor implemented */
 1907                 rw_exit(&kqueue_filter_lock);
 1908                 knote_free(newkn);
 1909                 return (EINVAL);
 1910         }
 1911 
 1912         /* search if knote already exists */
 1913         if (kfilter->filtops->f_flags & FILTEROP_ISFD) {
 1914                 /* monitoring a file descriptor */
 1915                 /* validate descriptor */
 1916                 if (kev->ident > INT_MAX
 1917                     || (fp = fd_getfile(fd = kev->ident)) == NULL) {
 1918                         rw_exit(&kqueue_filter_lock);
 1919                         knote_free(newkn);
 1920                         return EBADF;
 1921                 }
 1922                 mutex_enter(&fdp->fd_lock);
 1923                 ff = fdp->fd_dt->dt_ff[fd];
 1924                 if (ff->ff_refcnt & FR_CLOSING) {
 1925                         error = EBADF;
 1926                         goto doneunlock;
 1927                 }
 1928                 if (fd <= fdp->fd_lastkqfile) {
 1929                         SLIST_FOREACH(kn, &ff->ff_knlist, kn_link) {
 1930                                 if (kq == kn->kn_kq &&
 1931                                     kev->filter == kn->kn_filter)
 1932                                         break;
 1933                         }
 1934                 }
 1935         } else {
 1936                 /*
 1937                  * not monitoring a file descriptor, so
 1938                  * lookup knotes in internal hash table
 1939                  */
 1940                 mutex_enter(&fdp->fd_lock);
 1941                 if (fdp->fd_knhashmask != 0) {
 1942                         list = &fdp->fd_knhash[
 1943                             KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
 1944                         SLIST_FOREACH(kn, list, kn_link) {
 1945                                 if (kev->ident == kn->kn_id &&
 1946                                     kq == kn->kn_kq &&
 1947                                     kev->filter == kn->kn_filter)
 1948                                         break;
 1949                         }
 1950                 }
 1951         }
 1952 
 1953         /* It's safe to test KQ_CLOSING while holding only the fd_lock. */
 1954         KASSERT(mutex_owned(&fdp->fd_lock));
 1955         KASSERT((kq->kq_count & KQ_CLOSING) == 0);
 1956 
 1957         /*
 1958          * kn now contains the matching knote, or NULL if no match
 1959          */
 1960         if (kn == NULL) {
 1961                 if (kev->flags & EV_ADD) {
 1962                         /* create new knote */
 1963                         kn = newkn;
 1964                         newkn = NULL;
 1965                         kn->kn_obj = fp;
 1966                         kn->kn_id = kev->ident;
 1967                         kn->kn_kq = kq;
 1968                         kn->kn_fop = kfilter->filtops;
 1969                         kn->kn_kfilter = kfilter;
 1970                         kn->kn_sfflags = kev->fflags;
 1971                         kn->kn_sdata = kev->data;
 1972                         kev->fflags = 0;
 1973                         kev->data = 0;
 1974                         kn->kn_kevent = *kev;
 1975 
 1976                         KASSERT(kn->kn_fop != NULL);
 1977                         /*
 1978                          * XXX Allow only known-safe users of f_touch.
 1979                          * XXX See filter_touch() for details.
 1980                          */
 1981                         if (kn->kn_fop->f_touch != NULL &&
 1982                             kn->kn_fop != &timer_filtops &&
 1983                             kn->kn_fop != &user_filtops) {
 1984                                 error = ENOTSUP;
 1985                                 goto fail_ev_add;
 1986                         }
 1987 
 1988                         /*
 1989                          * apply reference count to knote structure, and
 1990                          * do not release it at the end of this routine.
 1991                          */
 1992                         fp = NULL;
 1993 
 1994                         if (!(kn->kn_fop->f_flags & FILTEROP_ISFD)) {
 1995                                 /*
 1996                                  * If knote is not on an fd, store on
 1997                                  * internal hash table.
 1998                                  */
 1999                                 if (fdp->fd_knhashmask == 0) {
 2000                                         /* XXXAD can block with fd_lock held */
 2001                                         fdp->fd_knhash = hashinit(KN_HASHSIZE,
 2002                                             HASH_LIST, true,
 2003                                             &fdp->fd_knhashmask);
 2004                                 }
 2005                                 list = &fdp->fd_knhash[KN_HASH(kn->kn_id,
 2006                                     fdp->fd_knhashmask)];
 2007                         } else {
 2008                                 /* Otherwise, knote is on an fd. */
 2009                                 list = (struct klist *)
 2010                                     &fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
 2011                                 if ((int)kn->kn_id > fdp->fd_lastkqfile)
 2012                                         fdp->fd_lastkqfile = kn->kn_id;
 2013                         }
 2014                         SLIST_INSERT_HEAD(list, kn, kn_link);
 2015 
 2016                         /*
 2017                          * N.B. kn->kn_fop may change as the result
 2018                          * of filter_attach()!
 2019                          */
 2020                         knote_foplock_enter(kn);
 2021                         error = filter_attach(kn);
 2022                         if (error != 0) {
 2023 #ifdef DEBUG
 2024                                 struct proc *p = curlwp->l_proc;
 2025                                 const file_t *ft = kn->kn_obj;
 2026                                 printf("%s: %s[%d]: event type %d not "
 2027                                     "supported for file type %d/%s "
 2028                                     "(error %d)\n", __func__,
 2029                                     p->p_comm, p->p_pid,
 2030                                     kn->kn_filter, ft ? ft->f_type : -1,
 2031                                     ft ? ft->f_ops->fo_name : "?", error);
 2032 #endif
 2033 
 2034  fail_ev_add:
 2035                                 /*
 2036                                  * N.B. no need to check for this note to
 2037                                  * be in-flux, since it was never visible
 2038                                  * to the monitored object.
 2039                                  *
 2040                                  * knote_detach() drops fdp->fd_lock
 2041                                  */
 2042                                 knote_foplock_exit(kn);
 2043                                 mutex_enter(&kq->kq_lock);
 2044                                 KNOTE_WILLDETACH(kn);
 2045                                 KASSERT(kn_in_flux(kn) == false);
 2046                                 mutex_exit(&kq->kq_lock);
 2047                                 knote_detach(kn, fdp, false);
 2048                                 goto done;
 2049                         }
 2050                         atomic_inc_uint(&kfilter->refcnt);
 2051                         goto done_ev_add;
 2052                 } else {
 2053                         /* No matching knote and the EV_ADD flag is not set. */
 2054                         error = ENOENT;
 2055                         goto doneunlock;
 2056                 }
 2057         }
 2058 
 2059         if (kev->flags & EV_DELETE) {
 2060                 /*
 2061                  * Let the world know that this knote is about to go
 2062                  * away, and wait for it to settle if it's currently
 2063                  * in-flux.
 2064                  */
 2065                 mutex_spin_enter(&kq->kq_lock);
 2066                 if (kn->kn_status & KN_WILLDETACH) {
 2067                         /*
 2068                          * This knote is already on its way out,
 2069                          * so just be done.
 2070                          */
 2071                         mutex_spin_exit(&kq->kq_lock);
 2072                         goto doneunlock;
 2073                 }
 2074                 KNOTE_WILLDETACH(kn);
 2075                 if (kn_in_flux(kn)) {
 2076                         mutex_exit(&fdp->fd_lock);
 2077                         /*
 2078                          * It's safe for us to conclusively wait for
 2079                          * this knote to settle because we know we'll
 2080                          * be completing the detach.
 2081                          */
 2082                         kn_wait_flux(kn, true);
 2083                         KASSERT(kn_in_flux(kn) == false);
 2084                         mutex_spin_exit(&kq->kq_lock);
 2085                         mutex_enter(&fdp->fd_lock);
 2086                 } else {
 2087                         mutex_spin_exit(&kq->kq_lock);
 2088                 }
 2089 
 2090                 /* knote_detach() drops fdp->fd_lock */
 2091                 knote_detach(kn, fdp, true);
 2092                 goto done;
 2093         }
 2094 
 2095         /*
 2096          * The user may change some filter values after the
 2097          * initial EV_ADD, but doing so will not reset any
 2098          * filter which have already been triggered.
 2099          */
 2100         knote_foplock_enter(kn);
 2101         kn->kn_kevent.udata = kev->udata;
 2102         KASSERT(kn->kn_fop != NULL);
 2103         if (!(kn->kn_fop->f_flags & FILTEROP_ISFD) &&
 2104             kn->kn_fop->f_touch != NULL) {
 2105                 mutex_spin_enter(&kq->kq_lock);
 2106                 error = filter_touch(kn, kev, EVENT_REGISTER);
 2107                 mutex_spin_exit(&kq->kq_lock);
 2108                 if (__predict_false(error != 0)) {
 2109                         /* Never a new knote (which would consume newkn). */
 2110                         KASSERT(newkn != NULL);
 2111                         knote_foplock_exit(kn);
 2112                         goto doneunlock;
 2113                 }
 2114         } else {
 2115                 kn->kn_sfflags = kev->fflags;
 2116                 kn->kn_sdata = kev->data;
 2117         }
 2118 
 2119         /*
 2120          * We can get here if we are trying to attach
 2121          * an event to a file descriptor that does not
 2122          * support events, and the attach routine is
 2123          * broken and does not return an error.
 2124          */
 2125  done_ev_add:
 2126         rv = filter_event(kn, 0, false);
 2127         if (rv)
 2128                 knote_activate(kn);
 2129 
 2130         knote_foplock_exit(kn);
 2131 
 2132         /* disable knote */
 2133         if ((kev->flags & EV_DISABLE)) {
 2134                 mutex_spin_enter(&kq->kq_lock);
 2135                 if ((kn->kn_status & KN_DISABLED) == 0)
 2136                         kn->kn_status |= KN_DISABLED;
 2137                 mutex_spin_exit(&kq->kq_lock);
 2138         }
 2139 
 2140         /* enable knote */
 2141         if ((kev->flags & EV_ENABLE)) {
 2142                 knote_enqueue(kn);
 2143         }
 2144  doneunlock:
 2145         mutex_exit(&fdp->fd_lock);
 2146  done:
 2147         rw_exit(&kqueue_filter_lock);
 2148         if (newkn != NULL)
 2149                 knote_free(newkn);
 2150         if (fp != NULL)
 2151                 fd_putfile(fd);
 2152         return (error);
 2153 }
 2154 
 2155 #define KN_FMT(buf, kn) \
 2156     (snprintb((buf), sizeof(buf), __KN_FLAG_BITS, (kn)->kn_status), buf)
 2157 
 2158 #if defined(DDB)
 2159 void
 2160 kqueue_printit(struct kqueue *kq, bool full, void (*pr)(const char *, ...))
 2161 {
 2162         const struct knote *kn;
 2163         u_int count;
 2164         int nmarker;
 2165         char buf[128];
 2166 
 2167         count = 0;
 2168         nmarker = 0;
 2169 
 2170         (*pr)("kqueue %p (restart=%d count=%u):\n", kq,
 2171             !!(kq->kq_count & KQ_RESTART), KQ_COUNT(kq));
 2172         (*pr)("  Queued knotes:\n");
 2173         TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
 2174                 if (kn->kn_status & KN_MARKER) {
 2175                         nmarker++;
 2176                 } else {
 2177                         count++;
 2178                 }
 2179                 (*pr)("    knote %p: kq=%p status=%s\n",
 2180                     kn, kn->kn_kq, KN_FMT(buf, kn));
 2181                 (*pr)("      id=0x%lx (%lu) filter=%d\n",
 2182                     (u_long)kn->kn_id, (u_long)kn->kn_id, kn->kn_filter);
 2183                 if (kn->kn_kq != kq) {
 2184                         (*pr)("      !!! kn->kn_kq != kq\n");
 2185                 }
 2186         }
 2187         if (count != KQ_COUNT(kq)) {
 2188                 (*pr)("  !!! count(%u) != KQ_COUNT(%u)\n",
 2189                     count, KQ_COUNT(kq));
 2190         }
 2191 }
 2192 #endif /* DDB */
 2193 
 2194 #if defined(DEBUG)
 2195 static void
 2196 kqueue_check(const char *func, size_t line, const struct kqueue *kq)
 2197 {
 2198         const struct knote *kn;
 2199         u_int count;
 2200         int nmarker;
 2201         char buf[128];
 2202 
 2203         KASSERT(mutex_owned(&kq->kq_lock));
 2204 
 2205         count = 0;
 2206         nmarker = 0;
 2207         TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
 2208                 if ((kn->kn_status & (KN_MARKER | KN_QUEUED)) == 0) {
 2209                         panic("%s,%zu: kq=%p kn=%p !(MARKER|QUEUED) %s",
 2210                             func, line, kq, kn, KN_FMT(buf, kn));
 2211                 }
 2212                 if ((kn->kn_status & KN_MARKER) == 0) {
 2213                         if (kn->kn_kq != kq) {
 2214                                 panic("%s,%zu: kq=%p kn(%p) != kn->kq(%p): %s",
 2215                                     func, line, kq, kn, kn->kn_kq,
 2216                                     KN_FMT(buf, kn));
 2217                         }
 2218                         if ((kn->kn_status & KN_ACTIVE) == 0) {
 2219                                 panic("%s,%zu: kq=%p kn=%p: !ACTIVE %s",
 2220                                     func, line, kq, kn, KN_FMT(buf, kn));
 2221                         }
 2222                         count++;
 2223                         if (count > KQ_COUNT(kq)) {
 2224                                 panic("%s,%zu: kq=%p kq->kq_count(%u) != "
 2225                                     "count(%d), nmarker=%d",
 2226                                     func, line, kq, KQ_COUNT(kq), count,
 2227                                     nmarker);
 2228                         }
 2229                 } else {
 2230                         nmarker++;
 2231                 }
 2232         }
 2233 }
 2234 #define kq_check(a) kqueue_check(__func__, __LINE__, (a))
 2235 #else /* defined(DEBUG) */
 2236 #define kq_check(a)     /* nothing */
 2237 #endif /* defined(DEBUG) */
 2238 
 2239 static void
 2240 kqueue_restart(file_t *fp)
 2241 {
 2242         struct kqueue *kq = fp->f_kqueue;
 2243         KASSERT(kq != NULL);
 2244 
 2245         mutex_spin_enter(&kq->kq_lock);
 2246         kq->kq_count |= KQ_RESTART;
 2247         cv_broadcast(&kq->kq_cv);
 2248         mutex_spin_exit(&kq->kq_lock);
 2249 }
 2250 
 2251 /*
 2252  * Scan through the list of events on fp (for a maximum of maxevents),
 2253  * returning the results in to ulistp. Timeout is determined by tsp; if
 2254  * NULL, wait indefinitely, if 0 valued, perform a poll, otherwise wait
 2255  * as appropriate.
 2256  */
 2257 static int
 2258 kqueue_scan(file_t *fp, size_t maxevents, struct kevent *ulistp,
 2259             const struct timespec *tsp, register_t *retval,
 2260             const struct kevent_ops *keops, struct kevent *kevbuf,
 2261             size_t kevcnt)
 2262 {
 2263         struct kqueue   *kq;
 2264         struct kevent   *kevp;
 2265         struct timespec ats, sleepts;
 2266         struct knote    *kn, *marker;
 2267         struct knote_impl morker;
 2268         size_t          count, nkev, nevents;
 2269         int             timeout, error, touch, rv, influx;
 2270         filedesc_t      *fdp;
 2271 
 2272         fdp = curlwp->l_fd;
 2273         kq = fp->f_kqueue;
 2274         count = maxevents;
 2275         nkev = nevents = error = 0;
 2276         if (count == 0) {
 2277                 *retval = 0;
 2278                 return 0;
 2279         }
 2280 
 2281         if (tsp) {                              /* timeout supplied */
 2282                 ats = *tsp;
 2283                 if (inittimeleft(&ats, &sleepts) == -1) {
 2284                         *retval = maxevents;
 2285                         return EINVAL;
 2286                 }
 2287                 timeout = tstohz(&ats);
 2288                 if (timeout <= 0)
 2289                         timeout = -1;           /* do poll */
 2290         } else {
 2291                 /* no timeout, wait forever */
 2292                 timeout = 0;
 2293         }
 2294 
 2295         memset(&morker, 0, sizeof(morker));
 2296         marker = &morker.ki_knote;
 2297         marker->kn_kq = kq;
 2298         marker->kn_status = KN_MARKER;
 2299         mutex_spin_enter(&kq->kq_lock);
 2300  retry:
 2301         kevp = kevbuf;
 2302         if (KQ_COUNT(kq) == 0) {
 2303                 if (timeout >= 0) {
 2304                         error = cv_timedwait_sig(&kq->kq_cv,
 2305                             &kq->kq_lock, timeout);
 2306                         if (error == 0) {
 2307                                 if (KQ_COUNT(kq) == 0 &&
 2308                                     (kq->kq_count & KQ_RESTART)) {
 2309                                         /* return to clear file reference */
 2310                                         error = ERESTART;
 2311                                 } else if (tsp == NULL || (timeout =
 2312                                     gettimeleft(&ats, &sleepts)) > 0) {
 2313                                         goto retry;
 2314                                 }
 2315                         } else {
 2316                                 /* don't restart after signals... */
 2317                                 if (error == ERESTART)
 2318                                         error = EINTR;
 2319                                 if (error == EWOULDBLOCK)
 2320                                         error = 0;
 2321                         }
 2322                 }
 2323                 mutex_spin_exit(&kq->kq_lock);
 2324                 goto done;
 2325         }
 2326 
 2327         /* mark end of knote list */
 2328         TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
 2329         influx = 0;
 2330 
 2331         /*
 2332          * Acquire the fdp->fd_lock interlock to avoid races with
 2333          * file creation/destruction from other threads.
 2334          */
 2335         mutex_spin_exit(&kq->kq_lock);
 2336 relock:
 2337         mutex_enter(&fdp->fd_lock);
 2338         mutex_spin_enter(&kq->kq_lock);
 2339 
 2340         while (count != 0) {
 2341                 /*
 2342                  * Get next knote.  We are guaranteed this will never
 2343                  * be NULL because of the marker we inserted above.
 2344                  */
 2345                 kn = TAILQ_FIRST(&kq->kq_head);
 2346 
 2347                 bool kn_is_other_marker =
 2348                     (kn->kn_status & KN_MARKER) != 0 && kn != marker;
 2349                 bool kn_is_detaching = (kn->kn_status & KN_WILLDETACH) != 0;
 2350                 bool kn_is_in_flux = kn_in_flux(kn);
 2351 
 2352                 /*
 2353                  * If we found a marker that's not ours, or this knote
 2354                  * is in a state of flux, then wait for everything to
 2355                  * settle down and go around again.
 2356                  */
 2357                 if (kn_is_other_marker || kn_is_detaching || kn_is_in_flux) {
 2358                         if (influx) {
 2359                                 influx = 0;
 2360                                 KQ_FLUX_WAKEUP(kq);
 2361                         }
 2362                         mutex_exit(&fdp->fd_lock);
 2363                         if (kn_is_other_marker || kn_is_in_flux) {
 2364                                 KQ_FLUX_WAIT(kq);
 2365                                 mutex_spin_exit(&kq->kq_lock);
 2366                         } else {
 2367                                 /*
 2368                                  * Detaching but not in-flux?  Someone is
 2369                                  * actively trying to finish the job; just
 2370                                  * go around and try again.
 2371                                  */
 2372                                 KASSERT(kn_is_detaching);
 2373                                 mutex_spin_exit(&kq->kq_lock);
 2374                                 preempt_point();
 2375                         }
 2376                         goto relock;
 2377                 }
 2378 
 2379                 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
 2380                 if (kn == marker) {
 2381                         /* it's our marker, stop */
 2382                         KQ_FLUX_WAKEUP(kq);
 2383                         if (count == maxevents) {
 2384                                 mutex_exit(&fdp->fd_lock);
 2385                                 goto retry;
 2386                         }
 2387                         break;
 2388                 }
 2389                 KASSERT((kn->kn_status & KN_BUSY) == 0);
 2390 
 2391                 kq_check(kq);
 2392                 kn->kn_status &= ~KN_QUEUED;
 2393                 kn->kn_status |= KN_BUSY;
 2394                 kq_check(kq);
 2395                 if (kn->kn_status & KN_DISABLED) {
 2396                         kn->kn_status &= ~KN_BUSY;
 2397                         kq->kq_count--;
 2398                         /* don't want disabled events */
 2399                         continue;
 2400                 }
 2401                 if ((kn->kn_flags & EV_ONESHOT) == 0) {
 2402                         mutex_spin_exit(&kq->kq_lock);
 2403                         KASSERT(mutex_owned(&fdp->fd_lock));
 2404                         knote_foplock_enter(kn);
 2405                         rv = filter_event(kn, 0, false);
 2406                         knote_foplock_exit(kn);
 2407                         mutex_spin_enter(&kq->kq_lock);
 2408                         /* Re-poll if note was re-enqueued. */
 2409                         if ((kn->kn_status & KN_QUEUED) != 0) {
 2410                                 kn->kn_status &= ~KN_BUSY;
 2411                                 /* Re-enqueue raised kq_count, lower it again */
 2412                                 kq->kq_count--;
 2413                                 influx = 1;
 2414                                 continue;
 2415                         }
 2416                         if (rv == 0) {
 2417                                 /*
 2418                                  * non-ONESHOT event that hasn't triggered
 2419                                  * again, so it will remain de-queued.
 2420                                  */
 2421                                 kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
 2422                                 kq->kq_count--;
 2423                                 influx = 1;
 2424                                 continue;
 2425                         }
 2426                 } else {
 2427                         /*
 2428                          * Must NOT drop kq_lock until we can do
 2429                          * the KNOTE_WILLDETACH() below.
 2430                          */
 2431                 }
 2432                 KASSERT(kn->kn_fop != NULL);
 2433                 touch = (!(kn->kn_fop->f_flags & FILTEROP_ISFD) &&
 2434                                 kn->kn_fop->f_touch != NULL);
 2435                 /* XXXAD should be got from f_event if !oneshot. */
 2436                 KASSERT((kn->kn_status & KN_WILLDETACH) == 0);
 2437                 if (touch) {
 2438                         (void)filter_touch(kn, kevp, EVENT_PROCESS);
 2439                 } else {
 2440                         *kevp = kn->kn_kevent;
 2441                 }
 2442                 kevp++;
 2443                 nkev++;
 2444                 influx = 1;
 2445                 if (kn->kn_flags & EV_ONESHOT) {
 2446                         /* delete ONESHOT events after retrieval */
 2447                         KNOTE_WILLDETACH(kn);
 2448                         kn->kn_status &= ~KN_BUSY;
 2449                         kq->kq_count--;
 2450                         KASSERT(kn_in_flux(kn) == false);
 2451                         KASSERT((kn->kn_status & KN_WILLDETACH) != 0 &&
 2452                                 kn->kn_kevent.udata == curlwp);
 2453                         mutex_spin_exit(&kq->kq_lock);
 2454                         knote_detach(kn, fdp, true);
 2455                         mutex_enter(&fdp->fd_lock);
 2456                         mutex_spin_enter(&kq->kq_lock);
 2457                 } else if (kn->kn_flags & EV_CLEAR) {
 2458                         /* clear state after retrieval */
 2459                         kn->kn_data = 0;
 2460                         kn->kn_fflags = 0;
 2461                         /*
 2462                          * Manually clear knotes who weren't
 2463                          * 'touch'ed.
 2464                          */
 2465                         if (touch == 0) {
 2466                                 kn->kn_data = 0;
 2467                                 kn->kn_fflags = 0;
 2468                         }
 2469                         kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
 2470                         kq->kq_count--;
 2471                 } else if (kn->kn_flags & EV_DISPATCH) {
 2472                         kn->kn_status |= KN_DISABLED;
 2473                         kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
 2474                         kq->kq_count--;
 2475                 } else {
 2476                         /* add event back on list */
 2477                         kq_check(kq);
 2478                         kn->kn_status |= KN_QUEUED;
 2479                         kn->kn_status &= ~KN_BUSY;
 2480                         TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
 2481                         kq_check(kq);
 2482                 }
 2483 
 2484                 if (nkev == kevcnt) {
 2485                         /* do copyouts in kevcnt chunks */
 2486                         influx = 0;
 2487                         KQ_FLUX_WAKEUP(kq);
 2488                         mutex_spin_exit(&kq->kq_lock);
 2489                         mutex_exit(&fdp->fd_lock);
 2490                         error = (*keops->keo_put_events)
 2491                             (keops->keo_private,
 2492                             kevbuf, ulistp, nevents, nkev);
 2493                         mutex_enter(&fdp->fd_lock);
 2494                         mutex_spin_enter(&kq->kq_lock);
 2495                         nevents += nkev;
 2496                         nkev = 0;
 2497                         kevp = kevbuf;
 2498                 }
 2499                 count--;
 2500                 if (error != 0 || count == 0) {
 2501                         /* remove marker */
 2502                         TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
 2503                         break;
 2504                 }
 2505         }
 2506         KQ_FLUX_WAKEUP(kq);
 2507         mutex_spin_exit(&kq->kq_lock);
 2508         mutex_exit(&fdp->fd_lock);
 2509 
 2510 done:
 2511         if (nkev != 0) {
 2512                 /* copyout remaining events */
 2513                 error = (*keops->keo_put_events)(keops->keo_private,
 2514                     kevbuf, ulistp, nevents, nkev);
 2515         }
 2516         *retval = maxevents - count;
 2517 
 2518         return error;
 2519 }
 2520 
 2521 /*
 2522  * fileops ioctl method for a kqueue descriptor.
 2523  *
 2524  * Two ioctls are currently supported. They both use struct kfilter_mapping:
 2525  *      KFILTER_BYNAME          find name for filter, and return result in
 2526  *                              name, which is of size len.
 2527  *      KFILTER_BYFILTER        find filter for name. len is ignored.
 2528  */
 2529 /*ARGSUSED*/
 2530 static int
 2531 kqueue_ioctl(file_t *fp, u_long com, void *data)
 2532 {
 2533         struct kfilter_mapping  *km;
 2534         const struct kfilter    *kfilter;
 2535         char                    *name;
 2536         int                     error;
 2537 
 2538         km = data;
 2539         error = 0;
 2540         name = kmem_alloc(KFILTER_MAXNAME, KM_SLEEP);
 2541 
 2542         switch (com) {
 2543         case KFILTER_BYFILTER:  /* convert filter -> name */
 2544                 rw_enter(&kqueue_filter_lock, RW_READER);
 2545                 kfilter = kfilter_byfilter(km->filter);
 2546                 if (kfilter != NULL) {
 2547                         strlcpy(name, kfilter->name, KFILTER_MAXNAME);
 2548                         rw_exit(&kqueue_filter_lock);
 2549                         error = copyoutstr(name, km->name, km->len, NULL);
 2550                 } else {
 2551                         rw_exit(&kqueue_filter_lock);
 2552                         error = ENOENT;
 2553                 }
 2554                 break;
 2555 
 2556         case KFILTER_BYNAME:    /* convert name -> filter */
 2557                 error = copyinstr(km->name, name, KFILTER_MAXNAME, NULL);
 2558                 if (error) {
 2559                         break;
 2560                 }
 2561                 rw_enter(&kqueue_filter_lock, RW_READER);
 2562                 kfilter = kfilter_byname(name);
 2563                 if (kfilter != NULL)
 2564                         km->filter = kfilter->filter;
 2565                 else
 2566                         error = ENOENT;
 2567                 rw_exit(&kqueue_filter_lock);
 2568                 break;
 2569 
 2570         default:
 2571                 error = ENOTTY;
 2572                 break;
 2573 
 2574         }
 2575         kmem_free(name, KFILTER_MAXNAME);
 2576         return (error);
 2577 }
 2578 
 2579 /*
 2580  * fileops fcntl method for a kqueue descriptor.
 2581  */
 2582 static int
 2583 kqueue_fcntl(file_t *fp, u_int com, void *data)
 2584 {
 2585 
 2586         return (ENOTTY);
 2587 }
 2588 
 2589 /*
 2590  * fileops poll method for a kqueue descriptor.
 2591  * Determine if kqueue has events pending.
 2592  */
 2593 static int
 2594 kqueue_poll(file_t *fp, int events)
 2595 {
 2596         struct kqueue   *kq;
 2597         int             revents;
 2598 
 2599         kq = fp->f_kqueue;
 2600 
 2601         revents = 0;
 2602         if (events & (POLLIN | POLLRDNORM)) {
 2603                 mutex_spin_enter(&kq->kq_lock);
 2604                 if (KQ_COUNT(kq) != 0) {
 2605                         revents |= events & (POLLIN | POLLRDNORM);
 2606                 } else {
 2607                         selrecord(curlwp, &kq->kq_sel);
 2608                 }
 2609                 kq_check(kq);
 2610                 mutex_spin_exit(&kq->kq_lock);
 2611         }
 2612 
 2613         return revents;
 2614 }
 2615 
 2616 /*
 2617  * fileops stat method for a kqueue descriptor.
 2618  * Returns dummy info, with st_size being number of events pending.
 2619  */
 2620 static int
 2621 kqueue_stat(file_t *fp, struct stat *st)
 2622 {
 2623         struct kqueue *kq;
 2624 
 2625         kq = fp->f_kqueue;
 2626 
 2627         memset(st, 0, sizeof(*st));
 2628         st->st_size = KQ_COUNT(kq);
 2629         st->st_blksize = sizeof(struct kevent);
 2630         st->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
 2631         st->st_blocks = 1;
 2632         st->st_uid = kauth_cred_geteuid(fp->f_cred);
 2633         st->st_gid = kauth_cred_getegid(fp->f_cred);
 2634 
 2635         return 0;
 2636 }
 2637 
 2638 static void
 2639 kqueue_doclose(struct kqueue *kq, struct klist *list, int fd)
 2640 {
 2641         struct knote *kn;
 2642         filedesc_t *fdp;
 2643 
 2644         fdp = kq->kq_fdp;
 2645 
 2646         KASSERT(mutex_owned(&fdp->fd_lock));
 2647 
 2648  again:
 2649         for (kn = SLIST_FIRST(list); kn != NULL;) {
 2650                 if (kq != kn->kn_kq) {
 2651                         kn = SLIST_NEXT(kn, kn_link);
 2652                         continue;
 2653                 }
 2654                 if (knote_detach_quiesce(kn)) {
 2655                         mutex_enter(&fdp->fd_lock);
 2656                         goto again;
 2657                 }
 2658                 knote_detach(kn, fdp, true);
 2659                 mutex_enter(&fdp->fd_lock);
 2660                 kn = SLIST_FIRST(list);
 2661         }
 2662 }
 2663 
 2664 /*
 2665  * fileops close method for a kqueue descriptor.
 2666  */
 2667 static int
 2668 kqueue_close(file_t *fp)
 2669 {
 2670         struct kqueue *kq;
 2671         filedesc_t *fdp;
 2672         fdfile_t *ff;
 2673         int i;
 2674 
 2675         kq = fp->f_kqueue;
 2676         fp->f_kqueue = NULL;
 2677         fp->f_type = 0;
 2678         fdp = curlwp->l_fd;
 2679 
 2680         KASSERT(kq->kq_fdp == fdp);
 2681 
 2682         mutex_enter(&fdp->fd_lock);
 2683 
 2684         /*
 2685          * We're doing to drop the fd_lock multiple times while
 2686          * we detach knotes.  During this time, attempts to register
 2687          * knotes via the back door (e.g. knote_proc_fork_track())
 2688          * need to fail, lest they sneak in to attach a knote after
 2689          * we've already drained the list it's destined for.
 2690          *
 2691          * We must acquire kq_lock here to set KQ_CLOSING (to serialize
 2692          * with other code paths that modify kq_count without holding
 2693          * the fd_lock), but once this bit is set, it's only safe to
 2694          * test it while holding the fd_lock, and holding kq_lock while
 2695          * doing so is not necessary.
 2696          */
 2697         mutex_enter(&kq->kq_lock);
 2698         kq->kq_count |= KQ_CLOSING;
 2699         mutex_exit(&kq->kq_lock);
 2700 
 2701         for (i = 0; i <= fdp->fd_lastkqfile; i++) {
 2702                 if ((ff = fdp->fd_dt->dt_ff[i]) == NULL)
 2703                         continue;
 2704                 kqueue_doclose(kq, (struct klist *)&ff->ff_knlist, i);
 2705         }
 2706         if (fdp->fd_knhashmask != 0) {
 2707                 for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
 2708                         kqueue_doclose(kq, &fdp->fd_knhash[i], -1);
 2709                 }
 2710         }
 2711 
 2712         mutex_exit(&fdp->fd_lock);
 2713 
 2714 #if defined(DEBUG)
 2715         mutex_enter(&kq->kq_lock);
 2716         kq_check(kq);
 2717         mutex_exit(&kq->kq_lock);
 2718 #endif /* DEBUG */
 2719         KASSERT(TAILQ_EMPTY(&kq->kq_head));
 2720         KASSERT(KQ_COUNT(kq) == 0);
 2721         mutex_destroy(&kq->kq_lock);
 2722         cv_destroy(&kq->kq_cv);
 2723         seldestroy(&kq->kq_sel);
 2724         kmem_free(kq, sizeof(*kq));
 2725 
 2726         return (0);
 2727 }
 2728 
 2729 /*
 2730  * struct fileops kqfilter method for a kqueue descriptor.
 2731  * Event triggered when monitored kqueue changes.
 2732  */
 2733 static int
 2734 kqueue_kqfilter(file_t *fp, struct knote *kn)
 2735 {
 2736         struct kqueue *kq;
 2737 
 2738         kq = ((file_t *)kn->kn_obj)->f_kqueue;
 2739 
 2740         KASSERT(fp == kn->kn_obj);
 2741 
 2742         if (kn->kn_filter != EVFILT_READ)
 2743                 return EINVAL;
 2744 
 2745         kn->kn_fop = &kqread_filtops;
 2746         mutex_enter(&kq->kq_lock);
 2747         selrecord_knote(&kq->kq_sel, kn);
 2748         mutex_exit(&kq->kq_lock);
 2749 
 2750         return 0;
 2751 }
 2752 
 2753 
 2754 /*
 2755  * Walk down a list of knotes, activating them if their event has
 2756  * triggered.  The caller's object lock (e.g. device driver lock)
 2757  * must be held.
 2758  */
 2759 void
 2760 knote(struct klist *list, long hint)
 2761 {
 2762         struct knote *kn, *tmpkn;
 2763 
 2764         SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmpkn) {
 2765                 /*
 2766                  * We assume here that the backing object's lock is
 2767                  * already held if we're traversing the klist, and
 2768                  * so acquiring the knote foplock would create a
 2769                  * deadlock scenario.  But we also know that the klist
 2770                  * won't disappear on us while we're here, so not
 2771                  * acquiring it is safe.
 2772                  */
 2773                 if (filter_event(kn, hint, true)) {
 2774                         knote_activate(kn);
 2775                 }
 2776         }
 2777 }
 2778 
 2779 /*
 2780  * Remove all knotes referencing a specified fd
 2781  */
 2782 void
 2783 knote_fdclose(int fd)
 2784 {
 2785         struct klist *list;
 2786         struct knote *kn;
 2787         filedesc_t *fdp;
 2788 
 2789  again:
 2790         fdp = curlwp->l_fd;
 2791         mutex_enter(&fdp->fd_lock);
 2792         list = (struct klist *)&fdp->fd_dt->dt_ff[fd]->ff_knlist;
 2793         while ((kn = SLIST_FIRST(list)) != NULL) {
 2794                 if (knote_detach_quiesce(kn)) {
 2795                         goto again;
 2796                 }
 2797                 knote_detach(kn, fdp, true);
 2798                 mutex_enter(&fdp->fd_lock);
 2799         }
 2800         mutex_exit(&fdp->fd_lock);
 2801 }
 2802 
 2803 /*
 2804  * Drop knote.  Called with fdp->fd_lock held, and will drop before
 2805  * returning.
 2806  */
 2807 static void
 2808 knote_detach(struct knote *kn, filedesc_t *fdp, bool dofop)
 2809 {
 2810         struct klist *list;
 2811         struct kqueue *kq;
 2812 
 2813         kq = kn->kn_kq;
 2814 
 2815         KASSERT((kn->kn_status & KN_MARKER) == 0);
 2816         KASSERT((kn->kn_status & KN_WILLDETACH) != 0);
 2817         KASSERT(kn->kn_fop != NULL);
 2818         KASSERT(mutex_owned(&fdp->fd_lock));
 2819 
 2820         /* Remove from monitored object. */
 2821         if (dofop) {
 2822                 knote_foplock_enter(kn);
 2823                 filter_detach(kn);
 2824                 knote_foplock_exit(kn);
 2825         }
 2826 
 2827         /* Remove from descriptor table. */
 2828         if (kn->kn_fop->f_flags & FILTEROP_ISFD)
 2829                 list = (struct klist *)&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
 2830         else
 2831                 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
 2832 
 2833         SLIST_REMOVE(list, kn, knote, kn_link);
 2834 
 2835         /* Remove from kqueue. */
 2836 again:
 2837         mutex_spin_enter(&kq->kq_lock);
 2838         KASSERT(kn_in_flux(kn) == false);
 2839         if ((kn->kn_status & KN_QUEUED) != 0) {
 2840                 kq_check(kq);
 2841                 KASSERT(KQ_COUNT(kq) != 0);
 2842                 kq->kq_count--;
 2843                 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
 2844                 kn->kn_status &= ~KN_QUEUED;
 2845                 kq_check(kq);
 2846         } else if (kn->kn_status & KN_BUSY) {
 2847                 mutex_spin_exit(&kq->kq_lock);
 2848                 goto again;
 2849         }
 2850         mutex_spin_exit(&kq->kq_lock);
 2851 
 2852         mutex_exit(&fdp->fd_lock);
 2853         if (kn->kn_fop->f_flags & FILTEROP_ISFD)
 2854                 fd_putfile(kn->kn_id);
 2855         atomic_dec_uint(&kn->kn_kfilter->refcnt);
 2856         knote_free(kn);
 2857 }
 2858 
 2859 /*
 2860  * Queue new event for knote.
 2861  */
 2862 static void
 2863 knote_enqueue(struct knote *kn)
 2864 {
 2865         struct kqueue *kq;
 2866 
 2867         KASSERT((kn->kn_status & KN_MARKER) == 0);
 2868 
 2869         kq = kn->kn_kq;
 2870 
 2871         mutex_spin_enter(&kq->kq_lock);
 2872         if (__predict_false(kn->kn_status & KN_WILLDETACH)) {
 2873                 /* Don't bother enqueueing a dying knote. */
 2874                 goto out;
 2875         }
 2876         if ((kn->kn_status & KN_DISABLED) != 0) {
 2877                 kn->kn_status &= ~KN_DISABLED;
 2878         }
 2879         if ((kn->kn_status & (KN_ACTIVE | KN_QUEUED)) == KN_ACTIVE) {
 2880                 kq_check(kq);
 2881                 kn->kn_status |= KN_QUEUED;
 2882                 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
 2883                 KASSERT(KQ_COUNT(kq) < KQ_MAXCOUNT);
 2884                 kq->kq_count++;
 2885                 kq_check(kq);
 2886                 cv_broadcast(&kq->kq_cv);
 2887                 selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
 2888         }
 2889  out:
 2890         mutex_spin_exit(&kq->kq_lock);
 2891 }
 2892 /*
 2893  * Queue new event for knote.
 2894  */
 2895 static void
 2896 knote_activate_locked(struct knote *kn)
 2897 {
 2898         struct kqueue *kq;
 2899 
 2900         KASSERT((kn->kn_status & KN_MARKER) == 0);
 2901 
 2902         kq = kn->kn_kq;
 2903 
 2904         if (__predict_false(kn->kn_status & KN_WILLDETACH)) {
 2905                 /* Don't bother enqueueing a dying knote. */
 2906                 return;
 2907         }
 2908         kn->kn_status |= KN_ACTIVE;
 2909         if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) {
 2910                 kq_check(kq);
 2911                 kn->kn_status |= KN_QUEUED;
 2912                 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
 2913                 KASSERT(KQ_COUNT(kq) < KQ_MAXCOUNT);
 2914                 kq->kq_count++;
 2915                 kq_check(kq);
 2916                 cv_broadcast(&kq->kq_cv);
 2917                 selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
 2918         }
 2919 }
 2920 
 2921 static void
 2922 knote_activate(struct knote *kn)
 2923 {
 2924         struct kqueue *kq = kn->kn_kq;
 2925 
 2926         mutex_spin_enter(&kq->kq_lock);
 2927         knote_activate_locked(kn);
 2928         mutex_spin_exit(&kq->kq_lock);
 2929 }
 2930 
 2931 static void
 2932 knote_deactivate_locked(struct knote *kn)
 2933 {
 2934         struct kqueue *kq = kn->kn_kq;
 2935 
 2936         if (kn->kn_status & KN_QUEUED) {
 2937                 kq_check(kq);
 2938                 kn->kn_status &= ~KN_QUEUED;
 2939                 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
 2940                 KASSERT(KQ_COUNT(kq) > 0);
 2941                 kq->kq_count--;
 2942                 kq_check(kq);
 2943         }
 2944         kn->kn_status &= ~KN_ACTIVE;
 2945 }
 2946 
 2947 /*
 2948  * Set EV_EOF on the specified knote.  Also allows additional
 2949  * EV_* flags to be set (e.g. EV_ONESHOT).
 2950  */
 2951 void
 2952 knote_set_eof(struct knote *kn, uint32_t flags)
 2953 {
 2954         struct kqueue *kq = kn->kn_kq;
 2955 
 2956         mutex_spin_enter(&kq->kq_lock);
 2957         kn->kn_flags |= EV_EOF | flags;
 2958         mutex_spin_exit(&kq->kq_lock);
 2959 }
 2960 
 2961 /*
 2962  * Clear EV_EOF on the specified knote.
 2963  */
 2964 void
 2965 knote_clear_eof(struct knote *kn)
 2966 {
 2967         struct kqueue *kq = kn->kn_kq;
 2968 
 2969         mutex_spin_enter(&kq->kq_lock);
 2970         kn->kn_flags &= ~EV_EOF;
 2971         mutex_spin_exit(&kq->kq_lock);
 2972 }
 2973 
 2974 /*
 2975  * Initialize a klist.
 2976  */
 2977 void
 2978 klist_init(struct klist *list)
 2979 {
 2980         SLIST_INIT(list);
 2981 }
 2982 
 2983 /*
 2984  * Finalize a klist.
 2985  */
 2986 void
 2987 klist_fini(struct klist *list)
 2988 {
 2989         struct knote *kn;
 2990 
 2991         /*
 2992          * Neuter all existing knotes on the klist because the list is
 2993          * being destroyed.  The caller has guaranteed that no additional
 2994          * knotes will be added to the list, that the backing object's
 2995          * locks are not held (otherwise there is a locking order issue
 2996          * with acquiring the knote foplock ), and that we can traverse
 2997          * the list safely in this state.
 2998          */
 2999         SLIST_FOREACH(kn, list, kn_selnext) {
 3000                 knote_foplock_enter(kn);
 3001                 KASSERT(kn->kn_fop != NULL);
 3002                 if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
 3003                         kn->kn_fop = &nop_fd_filtops;
 3004                 } else {
 3005                         kn->kn_fop = &nop_filtops;
 3006                 }
 3007                 knote_foplock_exit(kn);
 3008         }
 3009 }
 3010 
 3011 /*
 3012  * Insert a knote into a klist.
 3013  */
 3014 void
 3015 klist_insert(struct klist *list, struct knote *kn)
 3016 {
 3017         SLIST_INSERT_HEAD(list, kn, kn_selnext);
 3018 }
 3019 
 3020 /*
 3021  * Remove a knote from a klist.  Returns true if the last
 3022  * knote was removed and the list is now empty.
 3023  */
 3024 bool
 3025 klist_remove(struct klist *list, struct knote *kn)
 3026 {
 3027         SLIST_REMOVE(list, kn, knote, kn_selnext);
 3028         return SLIST_EMPTY(list);
 3029 }
Cache object: c76c499d8c3a57ac529d80aa4bf480a2
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/kern/kern_event.c

FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_event.c