FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_event.c


    1 /*      $NetBSD: kern_event.c,v 1.60.6.2 2010/01/09 01:08:39 snj Exp $  */
    2 
    3 /*-
    4  * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
    5  * All rights reserved.
    6  *
    7  * This code is derived from software contributed to The NetBSD Foundation
    8  * by Andrew Doran.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  *
   19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
   23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   29  * POSSIBILITY OF SUCH DAMAGE.
   30  */
   31 
   32 /*-
   33  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
   34  * All rights reserved.
   35  *
   36  * Redistribution and use in source and binary forms, with or without
   37  * modification, are permitted provided that the following conditions
   38  * are met:
   39  * 1. Redistributions of source code must retain the above copyright
   40  *    notice, this list of conditions and the following disclaimer.
   41  * 2. Redistributions in binary form must reproduce the above copyright
   42  *    notice, this list of conditions and the following disclaimer in the
   43  *    documentation and/or other materials provided with the distribution.
   44  *
   45  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   46  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   47  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   48  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   49  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   50  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   51  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   52  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   53  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   54  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   55  * SUCH DAMAGE.
   56  *
   57  * FreeBSD: src/sys/kern/kern_event.c,v 1.27 2001/07/05 17:10:44 rwatson Exp
   58  */
   59 
   60 #include <sys/cdefs.h>
   61 __KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.60.6.2 2010/01/09 01:08:39 snj Exp $");
   62 
   63 #include <sys/param.h>
   64 #include <sys/systm.h>
   65 #include <sys/kernel.h>
   66 #include <sys/proc.h>
   67 #include <sys/file.h>
   68 #include <sys/select.h>
   69 #include <sys/queue.h>
   70 #include <sys/event.h>
   71 #include <sys/eventvar.h>
   72 #include <sys/poll.h>
   73 #include <sys/kmem.h>
   74 #include <sys/stat.h>
   75 #include <sys/filedesc.h>
   76 #include <sys/syscallargs.h>
   77 #include <sys/kauth.h>
   78 #include <sys/conf.h>
   79 #include <sys/atomic.h>
   80 
   81 static int      kqueue_scan(file_t *, size_t, struct kevent *,
   82                             const struct timespec *, register_t *,
   83                             const struct kevent_ops *, struct kevent *,
   84                             size_t);
   85 static int      kqueue_ioctl(file_t *, u_long, void *);
   86 static int      kqueue_fcntl(file_t *, u_int, void *);
   87 static int      kqueue_poll(file_t *, int);
   88 static int      kqueue_kqfilter(file_t *, struct knote *);
   89 static int      kqueue_stat(file_t *, struct stat *);
   90 static int      kqueue_close(file_t *);
   91 static int      kqueue_register(struct kqueue *, struct kevent *);
   92 static void     kqueue_doclose(struct kqueue *, struct klist *, int);
   93 
   94 static void     knote_detach(struct knote *, filedesc_t *fdp, bool);
   95 static void     knote_enqueue(struct knote *);
   96 static void     knote_activate(struct knote *);
   97 
   98 static void     filt_kqdetach(struct knote *);
   99 static int      filt_kqueue(struct knote *, long hint);
  100 static int      filt_procattach(struct knote *);
  101 static void     filt_procdetach(struct knote *);
  102 static int      filt_proc(struct knote *, long hint);
  103 static int      filt_fileattach(struct knote *);
  104 static void     filt_timerexpire(void *x);
  105 static int      filt_timerattach(struct knote *);
  106 static void     filt_timerdetach(struct knote *);
  107 static int      filt_timer(struct knote *, long hint);
  108 
  109 static const struct fileops kqueueops = {
  110         .fo_read = (void *)enxio,
  111         .fo_write = (void *)enxio,
  112         .fo_ioctl = kqueue_ioctl,
  113         .fo_fcntl = kqueue_fcntl,
  114         .fo_poll = kqueue_poll,
  115         .fo_stat = kqueue_stat,
  116         .fo_close = kqueue_close,
  117         .fo_kqfilter = kqueue_kqfilter,
  118         .fo_drain = fnullop_drain,
  119 };
  120 
  121 static const struct filterops kqread_filtops =
  122         { 1, NULL, filt_kqdetach, filt_kqueue };
  123 static const struct filterops proc_filtops =
  124         { 0, filt_procattach, filt_procdetach, filt_proc };
  125 static const struct filterops file_filtops =
  126         { 1, filt_fileattach, NULL, NULL };
  127 static const struct filterops timer_filtops =
  128         { 0, filt_timerattach, filt_timerdetach, filt_timer };
  129 
  130 static u_int    kq_ncallouts = 0;
  131 static int      kq_calloutmax = (4 * 1024);
  132 
  133 #define KN_HASHSIZE             64              /* XXX should be tunable */
  134 #define KN_HASH(val, mask)      (((val) ^ (val >> 8)) & (mask))
  135 
  136 extern const struct filterops sig_filtops;
  137 
  138 /*
   139  * Table for all system-defined filters.
  140  * These should be listed in the numeric order of the EVFILT_* defines.
  141  * If filtops is NULL, the filter isn't implemented in NetBSD.
  142  * End of list is when name is NULL.
  143  * 
  144  * Note that 'refcnt' is meaningless for built-in filters.
  145  */
  146 struct kfilter {
  147         const char      *name;          /* name of filter */
  148         uint32_t        filter;         /* id of filter */
  149         unsigned        refcnt;         /* reference count */
  150         const struct filterops *filtops;/* operations for filter */
  151         size_t          namelen;        /* length of name string */
  152 };
  153 
  154 /* System defined filters */
  155 static struct kfilter sys_kfilters[] = {
  156         { "EVFILT_READ",        EVFILT_READ,    0, &file_filtops, 0 },
  157         { "EVFILT_WRITE",       EVFILT_WRITE,   0, &file_filtops, 0, },
  158         { "EVFILT_AIO",         EVFILT_AIO,     0, NULL, 0 },
  159         { "EVFILT_VNODE",       EVFILT_VNODE,   0, &file_filtops, 0 },
  160         { "EVFILT_PROC",        EVFILT_PROC,    0, &proc_filtops, 0 },
  161         { "EVFILT_SIGNAL",      EVFILT_SIGNAL,  0, &sig_filtops, 0 },
  162         { "EVFILT_TIMER",       EVFILT_TIMER,   0, &timer_filtops, 0 },
  163         { NULL,                 0,              0, NULL, 0 },
  164 };
  165 
  166 /* User defined kfilters */
  167 static struct kfilter   *user_kfilters;         /* array */
  168 static int              user_kfilterc;          /* current offset */
  169 static int              user_kfiltermaxc;       /* max size so far */
  170 static size_t           user_kfiltersz;         /* size of allocated memory */
  171 
  172 /* Locks */
  173 static krwlock_t        kqueue_filter_lock;     /* lock on filter lists */
  174 static kmutex_t         kqueue_misc_lock;       /* miscellaneous */
  175 
  176 /*
  177  * Initialize the kqueue subsystem.
  178  */
  179 void
  180 kqueue_init(void)
  181 {
  182 
  183         rw_init(&kqueue_filter_lock);
  184         mutex_init(&kqueue_misc_lock, MUTEX_DEFAULT, IPL_NONE);
  185 }
  186 
  187 /*
  188  * Find kfilter entry by name, or NULL if not found.
  189  */
  190 static struct kfilter *
  191 kfilter_byname_sys(const char *name)
  192 {
  193         int i;
  194 
  195         KASSERT(rw_lock_held(&kqueue_filter_lock));
  196 
  197         for (i = 0; sys_kfilters[i].name != NULL; i++) {
  198                 if (strcmp(name, sys_kfilters[i].name) == 0)
  199                         return &sys_kfilters[i];
  200         }
  201         return NULL;
  202 }
  203 
  204 static struct kfilter *
  205 kfilter_byname_user(const char *name)
  206 {
  207         int i;
  208 
  209         KASSERT(rw_lock_held(&kqueue_filter_lock));
  210 
  211         /* user filter slots have a NULL name if previously deregistered */
  212         for (i = 0; i < user_kfilterc ; i++) {
  213                 if (user_kfilters[i].name != NULL &&
  214                     strcmp(name, user_kfilters[i].name) == 0)
  215                         return &user_kfilters[i];
  216         }
  217         return NULL;
  218 }
  219 
  220 static struct kfilter *
  221 kfilter_byname(const char *name)
  222 {
  223         struct kfilter *kfilter;
  224 
  225         KASSERT(rw_lock_held(&kqueue_filter_lock));
  226 
  227         if ((kfilter = kfilter_byname_sys(name)) != NULL)
  228                 return kfilter;
  229 
  230         return kfilter_byname_user(name);
  231 }
  232 
  233 /*
  234  * Find kfilter entry by filter id, or NULL if not found.
  235  * Assumes entries are indexed in filter id order, for speed.
  236  */
  237 static struct kfilter *
  238 kfilter_byfilter(uint32_t filter)
  239 {
  240         struct kfilter *kfilter;
  241 
  242         KASSERT(rw_lock_held(&kqueue_filter_lock));
  243 
  244         if (filter < EVFILT_SYSCOUNT)   /* it's a system filter */
  245                 kfilter = &sys_kfilters[filter];
  246         else if (user_kfilters != NULL &&
  247             filter < EVFILT_SYSCOUNT + user_kfilterc)
  248                                         /* it's a user filter */
  249                 kfilter = &user_kfilters[filter - EVFILT_SYSCOUNT];
  250         else
  251                 return (NULL);          /* out of range */
  252         KASSERT(kfilter->filter == filter);     /* sanity check! */
  253         return (kfilter);
  254 }
  255 
  256 /*
  257  * Register a new kfilter. Stores the entry in user_kfilters.
  258  * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
  259  * If retfilter != NULL, the new filterid is returned in it.
  260  */
  261 int
  262 kfilter_register(const char *name, const struct filterops *filtops,
  263                  int *retfilter)
  264 {
  265         struct kfilter *kfilter;
  266         size_t len;
  267         int i;
  268 
  269         if (name == NULL || name[0] == '\0' || filtops == NULL)
  270                 return (EINVAL);        /* invalid args */
  271 
  272         rw_enter(&kqueue_filter_lock, RW_WRITER);
  273         if (kfilter_byname(name) != NULL) {
  274                 rw_exit(&kqueue_filter_lock);
  275                 return (EEXIST);        /* already exists */
  276         }
  277         if (user_kfilterc > 0xffffffff - EVFILT_SYSCOUNT) {
  278                 rw_exit(&kqueue_filter_lock);
  279                 return (EINVAL);        /* too many */
  280         }
  281 
  282         for (i = 0; i < user_kfilterc; i++) {
  283                 kfilter = &user_kfilters[i];
  284                 if (kfilter->name == NULL) {
  285                         /* Previously deregistered slot.  Reuse. */
  286                         goto reuse;
  287                 }
  288         }
  289 
   290         /* check if we need to grow user_kfilters */
  291         if (user_kfilterc + 1 > user_kfiltermaxc) {
  292                 /* Grow in KFILTER_EXTENT chunks. */
  293                 user_kfiltermaxc += KFILTER_EXTENT;
  294                 len = user_kfiltermaxc * sizeof(*kfilter);
  295                 kfilter = kmem_alloc(len, KM_SLEEP);
  296                 memset((char *)kfilter + user_kfiltersz, 0, len - user_kfiltersz);
  297                 if (user_kfilters != NULL) {
  298                         memcpy(kfilter, user_kfilters, user_kfiltersz);
  299                         kmem_free(user_kfilters, user_kfiltersz);
  300                 }
  301                 user_kfiltersz = len;
  302                 user_kfilters = kfilter;
  303         }
  304         /* Adding new slot */
  305         kfilter = &user_kfilters[user_kfilterc++];
  306 reuse:
  307         kfilter->namelen = strlen(name) + 1;
  308         kfilter->name = kmem_alloc(kfilter->namelen, KM_SLEEP);
  309         memcpy(__UNCONST(kfilter->name), name, kfilter->namelen);
  310 
  311         kfilter->filter = (kfilter - user_kfilters) + EVFILT_SYSCOUNT;
  312 
  313         kfilter->filtops = kmem_alloc(sizeof(*filtops), KM_SLEEP);
  314         memcpy(__UNCONST(kfilter->filtops), filtops, sizeof(*filtops));
  315 
  316         if (retfilter != NULL)
  317                 *retfilter = kfilter->filter;
  318         rw_exit(&kqueue_filter_lock);
  319 
  320         return (0);
  321 }
  322 
  323 /*
  324  * Unregister a kfilter previously registered with kfilter_register.
  325  * This retains the filter id, but clears the name and frees filtops (filter
  326  * operations), so that the number isn't reused during a boot.
  327  * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
  328  */
  329 int
  330 kfilter_unregister(const char *name)
  331 {
  332         struct kfilter *kfilter;
  333 
  334         if (name == NULL || name[0] == '\0')
  335                 return (EINVAL);        /* invalid name */
  336 
  337         rw_enter(&kqueue_filter_lock, RW_WRITER);
  338         if (kfilter_byname_sys(name) != NULL) {
  339                 rw_exit(&kqueue_filter_lock);
  340                 return (EINVAL);        /* can't detach system filters */
  341         }
  342 
  343         kfilter = kfilter_byname_user(name);
  344         if (kfilter == NULL) {
  345                 rw_exit(&kqueue_filter_lock);
  346                 return (ENOENT);
  347         }
  348         if (kfilter->refcnt != 0) {
  349                 rw_exit(&kqueue_filter_lock);
  350                 return (EBUSY);
  351         }
  352 
   353         /* Cast away const (but we know it's safe). */
  354         kmem_free(__UNCONST(kfilter->name), kfilter->namelen);
  355         kfilter->name = NULL;   /* mark as `not implemented' */
  356 
  357         if (kfilter->filtops != NULL) {
   358                 /* Cast away const (but we know it's safe). */
  359                 kmem_free(__UNCONST(kfilter->filtops),
  360                     sizeof(*kfilter->filtops));
  361                 kfilter->filtops = NULL; /* mark as `not implemented' */
  362         }
  363         rw_exit(&kqueue_filter_lock);
  364 
  365         return (0);
  366 }
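
       /*
        * Illustrative sketch (not part of the original listing): how a
        * kernel module might drive the two routines above.  The filter
        * name "EVFILT_MYDEV" and the filterops mydev_filtops are
        * hypothetical.
        *
        *         int filtid, error;
        *
        *         error = kfilter_register("EVFILT_MYDEV", &mydev_filtops,
        *             &filtid);
        *         if (error == 0) {
        *                 // userland can now look up filtid via KFILTER_BYNAME
        *                 error = kfilter_unregister("EVFILT_MYDEV");
        *         }
        */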
  367 
  368 
  369 /*
  370  * Filter attach method for EVFILT_READ and EVFILT_WRITE on normal file
  371  * descriptors. Calls fileops kqfilter method for given file descriptor.
  372  */
  373 static int
  374 filt_fileattach(struct knote *kn)
  375 {
  376         file_t *fp;
  377 
  378         fp = kn->kn_obj;
  379 
  380         return (*fp->f_ops->fo_kqfilter)(fp, kn);
  381 }
  382 
  383 /*
  384  * Filter detach method for EVFILT_READ on kqueue descriptor.
  385  */
  386 static void
  387 filt_kqdetach(struct knote *kn)
  388 {
  389         struct kqueue *kq;
  390 
  391         kq = ((file_t *)kn->kn_obj)->f_data;
  392 
  393         mutex_spin_enter(&kq->kq_lock);
  394         SLIST_REMOVE(&kq->kq_sel.sel_klist, kn, knote, kn_selnext);
  395         mutex_spin_exit(&kq->kq_lock);
  396 }
  397 
  398 /*
  399  * Filter event method for EVFILT_READ on kqueue descriptor.
  400  */
  401 /*ARGSUSED*/
  402 static int
  403 filt_kqueue(struct knote *kn, long hint)
  404 {
  405         struct kqueue *kq;
  406         int rv;
  407 
  408         kq = ((file_t *)kn->kn_obj)->f_data;
  409 
  410         if (hint != NOTE_SUBMIT)
  411                 mutex_spin_enter(&kq->kq_lock);
  412         kn->kn_data = kq->kq_count;
  413         rv = (kn->kn_data > 0);
  414         if (hint != NOTE_SUBMIT)
  415                 mutex_spin_exit(&kq->kq_lock);
  416 
  417         return rv;
  418 }
  419 
  420 /*
  421  * Filter attach method for EVFILT_PROC.
  422  */
  423 static int
  424 filt_procattach(struct knote *kn)
  425 {
  426         struct proc *p, *curp;
  427         struct lwp *curl;
  428 
  429         curl = curlwp;
  430         curp = curl->l_proc;
  431 
  432         mutex_enter(proc_lock);
  433         p = p_find(kn->kn_id, PFIND_LOCKED);
  434         if (p == NULL) {
  435                 mutex_exit(proc_lock);
  436                 return ESRCH;
  437         }
  438 
  439         /*
  440          * Fail if it's not owned by you, or the last exec gave us
  441          * setuid/setgid privs (unless you're root).
  442          */
  443         mutex_enter(p->p_lock);
  444         mutex_exit(proc_lock);
  445         if (kauth_authorize_process(curl->l_cred, KAUTH_PROCESS_KEVENT_FILTER,
  446             p, NULL, NULL, NULL) != 0) {
  447                 mutex_exit(p->p_lock);
  448                 return EACCES;
  449         }
  450 
  451         kn->kn_obj = p;
  452         kn->kn_flags |= EV_CLEAR;       /* automatically set */
  453 
  454         /*
  455          * internal flag indicating registration done by kernel
  456          */
  457         if (kn->kn_flags & EV_FLAG1) {
  458                 kn->kn_data = kn->kn_sdata;     /* ppid */
  459                 kn->kn_fflags = NOTE_CHILD;
  460                 kn->kn_flags &= ~EV_FLAG1;
  461         }
  462         SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
  463         mutex_exit(p->p_lock);
  464 
  465         return 0;
  466 }
  467 
  468 /*
  469  * Filter detach method for EVFILT_PROC.
  470  *
  471  * The knote may be attached to a different process, which may exit,
  472  * leaving nothing for the knote to be attached to.  So when the process
  473  * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
  474  * it will be deleted when read out.  However, as part of the knote deletion,
  475  * this routine is called, so a check is needed to avoid actually performing
  476  * a detach, because the original process might not exist any more.
  477  */
  478 static void
  479 filt_procdetach(struct knote *kn)
  480 {
  481         struct proc *p;
  482 
  483         if (kn->kn_status & KN_DETACHED)
  484                 return;
  485 
  486         p = kn->kn_obj;
  487 
  488         mutex_enter(p->p_lock);
  489         SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
  490         mutex_exit(p->p_lock);
  491 }
  492 
  493 /*
  494  * Filter event method for EVFILT_PROC.
  495  */
  496 static int
  497 filt_proc(struct knote *kn, long hint)
  498 {
  499         u_int event, fflag;
  500         struct kevent kev;
  501         struct kqueue *kq;
  502         int error;
  503 
  504         event = (u_int)hint & NOTE_PCTRLMASK;
  505         kq = kn->kn_kq;
  506         fflag = 0;
  507 
  508         /* If the user is interested in this event, record it. */
  509         if (kn->kn_sfflags & event)
  510                 fflag |= event;
  511 
  512         if (event == NOTE_EXIT) {
  513                 /*
  514                  * Process is gone, so flag the event as finished.
  515                  *
   516                  * Detach the knote from the watched process and mark
   517                  * it as such. We can't leave this to kqueue_scan(),
   518                  * since the process might not exist by then. And we
   519                  * have to do this now, since psignal's KNOTE() is also
   520                  * called for zombies, and we might end up reading freed
   521                  * memory if the kevent has already been picked up
   522                  * and the knote g/c'ed.
  523                  */
  524                 filt_procdetach(kn);
  525 
  526                 mutex_spin_enter(&kq->kq_lock);
  527                 kn->kn_status |= KN_DETACHED;
   528                 /* Mark as ONESHOT, so that the knote is g/c'ed when read */
  529                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
  530                 kn->kn_fflags |= fflag;
  531                 mutex_spin_exit(&kq->kq_lock);
  532 
  533                 return 1;
  534         }
  535 
  536         mutex_spin_enter(&kq->kq_lock);
  537         if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
  538                 /*
  539                  * Process forked, and user wants to track the new process,
  540                  * so attach a new knote to it, and immediately report an
  541                  * event with the parent's pid.  Register knote with new
  542                  * process.
  543                  */
  544                 kev.ident = hint & NOTE_PDATAMASK;      /* pid */
  545                 kev.filter = kn->kn_filter;
  546                 kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
  547                 kev.fflags = kn->kn_sfflags;
  548                 kev.data = kn->kn_id;                   /* parent */
  549                 kev.udata = kn->kn_kevent.udata;        /* preserve udata */
  550                 mutex_spin_exit(&kq->kq_lock);
  551                 error = kqueue_register(kq, &kev);
  552                 mutex_spin_enter(&kq->kq_lock);
  553                 if (error != 0)
  554                         kn->kn_fflags |= NOTE_TRACKERR;
  555         }
  556         kn->kn_fflags |= fflag;
  557         fflag = kn->kn_fflags;
  558         mutex_spin_exit(&kq->kq_lock);
  559 
  560         return fflag != 0;
  561 }
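
       /*
        * Illustrative sketch (not part of the original listing): the
        * userland side of the EVFILT_PROC filter above.  kq is assumed
        * to be an existing kqueue descriptor and pid a child's process
        * id; NOTE_TRACK asks the kernel to attach a knote to each new
        * child on fork (failures are reported via NOTE_TRACKERR).
        *
        *         struct kevent ev;
        *
        *         EV_SET(&ev, pid, EVFILT_PROC, EV_ADD,
        *             NOTE_EXIT | NOTE_FORK | NOTE_TRACK, 0, NULL);
        *         if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
        *                 err(1, "kevent");
        */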
  562 
  563 static void
  564 filt_timerexpire(void *knx)
  565 {
  566         struct knote *kn = knx;
  567         int tticks;
  568 
  569         mutex_enter(&kqueue_misc_lock);
  570         kn->kn_data++;
  571         knote_activate(kn);
  572         if ((kn->kn_flags & EV_ONESHOT) == 0) {
  573                 tticks = mstohz(kn->kn_sdata);
  574                 callout_schedule((callout_t *)kn->kn_hook, tticks);
  575         }
  576         mutex_exit(&kqueue_misc_lock);
  577 }
  578 
  579 /*
   580  * data contains the amount of time to sleep, in milliseconds
  581  */
  582 static int
  583 filt_timerattach(struct knote *kn)
  584 {
  585         callout_t *calloutp;
  586         struct kqueue *kq;
  587         int tticks;
  588 
  589         tticks = mstohz(kn->kn_sdata);
  590 
  591         /* if the supplied value is under our resolution, use 1 tick */
  592         if (tticks == 0) {
  593                 if (kn->kn_sdata == 0)
  594                         return EINVAL;
  595                 tticks = 1;
  596         }
  597 
  598         if (atomic_inc_uint_nv(&kq_ncallouts) >= kq_calloutmax ||
  599             (calloutp = kmem_alloc(sizeof(*calloutp), KM_NOSLEEP)) == NULL) {
  600                 atomic_dec_uint(&kq_ncallouts);
  601                 return ENOMEM;
  602         }
  603         callout_init(calloutp, CALLOUT_MPSAFE);
  604 
  605         kq = kn->kn_kq;
  606         mutex_spin_enter(&kq->kq_lock);
  607         kn->kn_flags |= EV_CLEAR;               /* automatically set */
  608         kn->kn_hook = calloutp;
  609         mutex_spin_exit(&kq->kq_lock);
  610 
  611         callout_reset(calloutp, tticks, filt_timerexpire, kn);
  612 
  613         return (0);
  614 }
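
       /*
        * Illustrative sketch (not part of the original listing): arming
        * a periodic 500 ms timer with the attach routine above (kq is an
        * existing kqueue descriptor).  Since EV_CLEAR is set
        * automatically, kn_data accumulates the number of expirations
        * seen between retrievals.
        *
        *         struct kevent ev;
        *
        *         EV_SET(&ev, 1, EVFILT_TIMER, EV_ADD | EV_ENABLE, 0, 500, NULL);
        *         if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
        *                 err(1, "kevent");
        */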
  615 
  616 static void
  617 filt_timerdetach(struct knote *kn)
  618 {
  619         callout_t *calloutp;
  620 
  621         calloutp = (callout_t *)kn->kn_hook;
  622         callout_halt(calloutp, NULL);
  623         callout_destroy(calloutp);
  624         kmem_free(calloutp, sizeof(*calloutp));
  625         atomic_dec_uint(&kq_ncallouts);
  626 }
  627 
  628 static int
  629 filt_timer(struct knote *kn, long hint)
  630 {
  631         int rv;
  632 
  633         mutex_enter(&kqueue_misc_lock);
  634         rv = (kn->kn_data != 0);
  635         mutex_exit(&kqueue_misc_lock);
  636 
  637         return rv;
  638 }
  639 
  640 /*
  641  * filt_seltrue:
  642  *
  643  *      This filter "event" routine simulates seltrue().
  644  */
  645 int
  646 filt_seltrue(struct knote *kn, long hint)
  647 {
  648 
  649         /*
  650          * We don't know how much data can be read/written,
  651          * but we know that it *can* be.  This is about as
  652          * good as select/poll does as well.
  653          */
  654         kn->kn_data = 0;
  655         return (1);
  656 }
  657 
  658 /*
   659  * This provides a full kqfilter entry for device switch tables, which
   660  * has the same effect as a filter using filt_seltrue() as its filter method.
  661  */
  662 static void
  663 filt_seltruedetach(struct knote *kn)
  664 {
  665         /* Nothing to do */
  666 }
  667 
  668 const struct filterops seltrue_filtops =
  669         { 1, NULL, filt_seltruedetach, filt_seltrue };
  670 
  671 int
  672 seltrue_kqfilter(dev_t dev, struct knote *kn)
  673 {
  674         switch (kn->kn_filter) {
  675         case EVFILT_READ:
  676         case EVFILT_WRITE:
  677                 kn->kn_fop = &seltrue_filtops;
  678                 break;
  679         default:
  680                 return (EINVAL);
  681         }
  682 
  683         /* Nothing more to do */
  684         return (0);
  685 }
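
       /*
        * Illustrative sketch (not part of the original listing): a
        * character device that is always readable/writable can point its
        * d_kqfilter entry at the routine above.  The mydev_* names are
        * hypothetical.
        *
        *         static int
        *         mydev_kqfilter(dev_t dev, struct knote *kn)
        *         {
        *
        *                 return seltrue_kqfilter(dev, kn);
        *         }
        */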
  686 
  687 /*
  688  * kqueue(2) system call.
  689  */
  690 int
  691 sys_kqueue(struct lwp *l, const void *v, register_t *retval)
  692 {
  693         struct kqueue *kq;
  694         file_t *fp;
  695         int fd, error;
  696 
  697         if ((error = fd_allocfile(&fp, &fd)) != 0)
  698                 return error;
  699         fp->f_flag = FREAD | FWRITE;
  700         fp->f_type = DTYPE_KQUEUE;
  701         fp->f_ops = &kqueueops;
  702         kq = kmem_zalloc(sizeof(*kq), KM_SLEEP);
  703         mutex_init(&kq->kq_lock, MUTEX_DEFAULT, IPL_SCHED);
  704         cv_init(&kq->kq_cv, "kqueue");
  705         selinit(&kq->kq_sel);
  706         TAILQ_INIT(&kq->kq_head);
  707         fp->f_data = kq;
  708         *retval = fd;
  709         kq->kq_fdp = curlwp->l_fd;
  710         fd_affix(curproc, fp, fd);
  711         return error;
  712 }
  713 
  714 /*
  715  * kevent(2) system call.
  716  */
  717 static int
  718 kevent_fetch_changes(void *private, const struct kevent *changelist,
  719                      struct kevent *changes, size_t index, int n)
  720 {
  721 
  722         return copyin(changelist + index, changes, n * sizeof(*changes));
  723 }
  724 
  725 static int
  726 kevent_put_events(void *private, struct kevent *events,
  727                   struct kevent *eventlist, size_t index, int n)
  728 {
  729 
  730         return copyout(events, eventlist + index, n * sizeof(*events));
  731 }
  732 
  733 static const struct kevent_ops kevent_native_ops = {
  734         .keo_private = NULL,
  735         .keo_fetch_timeout = copyin,
  736         .keo_fetch_changes = kevent_fetch_changes,
  737         .keo_put_events = kevent_put_events,
  738 };
  739 
  740 int
  741 sys_kevent(struct lwp *l, const struct sys_kevent_args *uap, register_t *retval)
  742 {
  743         /* {
  744                 syscallarg(int) fd;
  745                 syscallarg(const struct kevent *) changelist;
  746                 syscallarg(size_t) nchanges;
  747                 syscallarg(struct kevent *) eventlist;
  748                 syscallarg(size_t) nevents;
  749                 syscallarg(const struct timespec *) timeout;
  750         } */
  751 
  752         return kevent1(retval, SCARG(uap, fd), SCARG(uap, changelist),
  753             SCARG(uap, nchanges), SCARG(uap, eventlist), SCARG(uap, nevents),
  754             SCARG(uap, timeout), &kevent_native_ops);
  755 }
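
       /*
        * Illustrative sketch (not part of the original listing): the
        * usual userland pattern served by the two system calls above --
        * create the queue, register interest once, then collect events
        * in a loop.  fd and process() are hypothetical.
        *
        *         struct kevent ev, out;
        *         int kq, n;
        *
        *         if ((kq = kqueue()) == -1)
        *                 err(1, "kqueue");
        *         EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
        *         if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
        *                 err(1, "kevent");
        *         for (;;) {
        *                 n = kevent(kq, NULL, 0, &out, 1, NULL);
        *                 if (n > 0)
        *                         process(out.ident, out.data);
        *         }
        */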
  756 
  757 int
  758 kevent1(register_t *retval, int fd,
  759         const struct kevent *changelist, size_t nchanges,
  760         struct kevent *eventlist, size_t nevents,
  761         const struct timespec *timeout,
  762         const struct kevent_ops *keops)
  763 {
  764         struct kevent *kevp;
  765         struct kqueue *kq;
  766         struct timespec ts;
  767         size_t i, n, ichange;
  768         int nerrors, error;
  769         struct kevent kevbuf[8];        /* approx 300 bytes on 64-bit */
  770         file_t *fp;
  771 
  772         /* check that we're dealing with a kq */
  773         fp = fd_getfile(fd);
  774         if (fp == NULL)
  775                 return (EBADF);
  776 
  777         if (fp->f_type != DTYPE_KQUEUE) {
  778                 fd_putfile(fd);
  779                 return (EBADF);
  780         }
  781 
  782         if (timeout != NULL) {
  783                 error = (*keops->keo_fetch_timeout)(timeout, &ts, sizeof(ts));
  784                 if (error)
  785                         goto done;
  786                 timeout = &ts;
  787         }
  788 
  789         kq = (struct kqueue *)fp->f_data;
  790         nerrors = 0;
  791         ichange = 0;
  792 
  793         /* traverse list of events to register */
  794         while (nchanges > 0) {
  795                 n = MIN(nchanges, __arraycount(kevbuf));
  796                 error = (*keops->keo_fetch_changes)(keops->keo_private,
  797                     changelist, kevbuf, ichange, n);
  798                 if (error)
  799                         goto done;
  800                 for (i = 0; i < n; i++) {
  801                         kevp = &kevbuf[i];
  802                         kevp->flags &= ~EV_SYSFLAGS;
  803                         /* register each knote */
  804                         error = kqueue_register(kq, kevp);
  805                         if (error) {
  806                                 if (nevents != 0) {
  807                                         kevp->flags = EV_ERROR;
  808                                         kevp->data = error;
  809                                         error = (*keops->keo_put_events)
  810                                             (keops->keo_private, kevp,
  811                                             eventlist, nerrors, 1);
  812                                         if (error)
  813                                                 goto done;
  814                                         nevents--;
  815                                         nerrors++;
  816                                 } else {
  817                                         goto done;
  818                                 }
  819                         }
  820                 }
  821                 nchanges -= n;  /* update the results */
  822                 ichange += n;
  823         }
  824         if (nerrors) {
  825                 *retval = nerrors;
  826                 error = 0;
  827                 goto done;
  828         }
  829 
  830         /* actually scan through the events */
  831         error = kqueue_scan(fp, nevents, eventlist, timeout, retval, keops,
  832             kevbuf, __arraycount(kevbuf));
  833  done:
  834         fd_putfile(fd);
  835         return (error);
  836 }
  837 
  838 /*
   839  * Register a given kevent kev onto the kqueue kq.
  840  */
  841 static int
  842 kqueue_register(struct kqueue *kq, struct kevent *kev)
  843 {
  844         struct kfilter *kfilter;
  845         filedesc_t *fdp;
  846         file_t *fp;
  847         fdfile_t *ff;
  848         struct knote *kn, *newkn;
  849         struct klist *list;
  850         int error, fd, rv;
  851 
  852         fdp = kq->kq_fdp;
  853         fp = NULL;
  854         kn = NULL;
  855         error = 0;
  856         fd = 0;
  857 
  858         newkn = kmem_zalloc(sizeof(*newkn), KM_SLEEP);
  859 
  860         rw_enter(&kqueue_filter_lock, RW_READER);
  861         kfilter = kfilter_byfilter(kev->filter);
  862         if (kfilter == NULL || kfilter->filtops == NULL) {
  863                 /* filter not found nor implemented */
  864                 rw_exit(&kqueue_filter_lock);
  865                 kmem_free(newkn, sizeof(*newkn));
  866                 return (EINVAL);
  867         }
  868 
  869         mutex_enter(&fdp->fd_lock);
  870 
  871         /* search if knote already exists */
  872         if (kfilter->filtops->f_isfd) {
  873                 /* monitoring a file descriptor */
  874                 fd = kev->ident;
  875                 if ((fp = fd_getfile(fd)) == NULL) {
  876                         mutex_exit(&fdp->fd_lock);
  877                         rw_exit(&kqueue_filter_lock);
  878                         kmem_free(newkn, sizeof(*newkn));
  879                         return EBADF;
  880                 }
  881                 ff = fdp->fd_ofiles[fd];
  882                 if (fd <= fdp->fd_lastkqfile) {
  883                         SLIST_FOREACH(kn, &ff->ff_knlist, kn_link) {
  884                                 if (kq == kn->kn_kq &&
  885                                     kev->filter == kn->kn_filter)
  886                                         break;
  887                         }
  888                 }
  889         } else {
  890                 /*
  891                  * not monitoring a file descriptor, so
  892                  * lookup knotes in internal hash table
  893                  */
  894                 if (fdp->fd_knhashmask != 0) {
  895                         list = &fdp->fd_knhash[
  896                             KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
  897                         SLIST_FOREACH(kn, list, kn_link) {
  898                                 if (kev->ident == kn->kn_id &&
  899                                     kq == kn->kn_kq &&
  900                                     kev->filter == kn->kn_filter)
  901                                         break;
  902                         }
  903                 }
  904         }
  905 
  906         /*
  907          * kn now contains the matching knote, or NULL if no match
  908          */
  909         if (kev->flags & EV_ADD) {
  910                 if (kn == NULL) {
  911                         /* create new knote */
  912                         kn = newkn;
  913                         newkn = NULL;
  914                         kn->kn_obj = fp;
  915                         kn->kn_kq = kq;
  916                         kn->kn_fop = kfilter->filtops;
  917                         kn->kn_kfilter = kfilter;
  918                         kn->kn_sfflags = kev->fflags;
  919                         kn->kn_sdata = kev->data;
  920                         kev->fflags = 0;
  921                         kev->data = 0;
  922                         kn->kn_kevent = *kev;
  923 
  924                         /*
  925                          * apply reference count to knote structure, and
  926                          * do not release it at the end of this routine.
  927                          */
  928                         fp = NULL;
  929 
  930                         if (!kn->kn_fop->f_isfd) {
  931                                 /*
  932                                  * If knote is not on an fd, store on
  933                                  * internal hash table.
  934                                  */
  935                                 if (fdp->fd_knhashmask == 0) {
  936                                         /* XXXAD can block with fd_lock held */
  937                                         fdp->fd_knhash = hashinit(KN_HASHSIZE,
  938                                             HASH_LIST, true,
  939                                             &fdp->fd_knhashmask);
  940                                 }
  941                                 list = &fdp->fd_knhash[KN_HASH(kn->kn_id,
  942                                     fdp->fd_knhashmask)];
  943                         } else {
  944                                 /* Otherwise, knote is on an fd. */
  945                                 list = (struct klist *)
  946                                     &fdp->fd_ofiles[kn->kn_id]->ff_knlist;
  947                                 if ((int)kn->kn_id > fdp->fd_lastkqfile)
  948                                         fdp->fd_lastkqfile = kn->kn_id;
  949                         }
  950                         SLIST_INSERT_HEAD(list, kn, kn_link);
  951 
  952                         KERNEL_LOCK(1, NULL);           /* XXXSMP */
  953                         error = (*kfilter->filtops->f_attach)(kn);
  954                         KERNEL_UNLOCK_ONE(NULL);        /* XXXSMP */
  955                         if (error != 0) {
  956                                 /* knote_detach() drops fdp->fd_lock */
  957                                 knote_detach(kn, fdp, false);
  958                                 goto done;
  959                         }
  960                         atomic_inc_uint(&kfilter->refcnt);
  961                 } else {
  962                         /*
  963                          * The user may change some filter values after the
  964                          * initial EV_ADD, but doing so will not reset any
   965                          * filters which have already been triggered.
  966                          */
  967                         kn->kn_sfflags = kev->fflags;
  968                         kn->kn_sdata = kev->data;
  969                         kn->kn_kevent.udata = kev->udata;
  970                 }
  971                 KERNEL_LOCK(1, NULL);                   /* XXXSMP */
  972                 rv = (*kn->kn_fop->f_event)(kn, 0);
  973                 KERNEL_UNLOCK_ONE(NULL);                /* XXXSMP */
  974                 if (rv)
  975                         knote_activate(kn);
  976         } else {
  977                 if (kn == NULL) {
  978                         error = ENOENT;
  979                         mutex_exit(&fdp->fd_lock);
  980                         goto done;
  981                 }
  982                 if (kev->flags & EV_DELETE) {
  983                         /* knote_detach() drops fdp->fd_lock */
  984                         knote_detach(kn, fdp, true);
  985                         goto done;
  986                 }
  987         }
  988 
  989         /* disable knote */
  990         if ((kev->flags & EV_DISABLE)) {
  991                 mutex_spin_enter(&kq->kq_lock);
  992                 if ((kn->kn_status & KN_DISABLED) == 0)
  993                         kn->kn_status |= KN_DISABLED;
  994                 mutex_spin_exit(&kq->kq_lock);
  995         }
  996 
  997         /* enable knote */
  998         if ((kev->flags & EV_ENABLE)) {
  999                 knote_enqueue(kn);
 1000         }
 1001         mutex_exit(&fdp->fd_lock);
 1002  done:
 1003         rw_exit(&kqueue_filter_lock);
 1004         if (newkn != NULL)
 1005                 kmem_free(newkn, sizeof(*newkn));
 1006         if (fp != NULL)
 1007                 fd_putfile(fd);
 1008         return (error);
 1009 }
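
       /*
        * Illustrative sketch (not part of the original listing): the
        * EV_DISABLE/EV_ENABLE flag handling above lets userland mute an
        * existing registration without detaching it (kq and fd as in the
        * earlier sketches).
        *
        *         struct kevent ev;
        *
        *         EV_SET(&ev, fd, EVFILT_READ, EV_DISABLE, 0, 0, NULL);
        *         kevent(kq, &ev, 1, NULL, 0, NULL);
        *         // ... later ...
        *         EV_SET(&ev, fd, EVFILT_READ, EV_ENABLE, 0, 0, NULL);
        *         kevent(kq, &ev, 1, NULL, 0, NULL);
        */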
 1010 
 1011 #if defined(DEBUG)
 1012 static void
 1013 kq_check(struct kqueue *kq)
 1014 {
 1015         const struct knote *kn;
 1016         int count;
 1017         int nmarker;
 1018 
 1019         KASSERT(mutex_owned(&kq->kq_lock));
 1020         KASSERT(kq->kq_count >= 0);
 1021 
 1022         count = 0;
 1023         nmarker = 0;
 1024         TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
 1025                 if ((kn->kn_status & (KN_MARKER | KN_QUEUED)) == 0) {
 1026                         panic("%s: kq=%p kn=%p inconsist 1", __func__, kq, kn);
 1027                 }
 1028                 if ((kn->kn_status & KN_MARKER) == 0) {
 1029                         if (kn->kn_kq != kq) {
 1030                                 panic("%s: kq=%p kn=%p inconsist 2",
 1031                                     __func__, kq, kn);
 1032                         }
 1033                         if ((kn->kn_status & KN_ACTIVE) == 0) {
 1034                                 panic("%s: kq=%p kn=%p: not active",
 1035                                     __func__, kq, kn);
 1036                         }
 1037                         count++;
 1038                         if (count > kq->kq_count) {
 1039                                 goto bad;
 1040                         }
 1041                 } else {
 1042                         nmarker++;
 1043 #if 0
 1044                         if (nmarker > 10000) {
 1045                                 panic("%s: kq=%p too many markers: %d != %d, "
 1046                                     "nmarker=%d",
 1047                                     __func__, kq, kq->kq_count, count, nmarker);
 1048                         }
 1049 #endif
 1050                 }
 1051         }
 1052         if (kq->kq_count != count) {
 1053 bad:
 1054                 panic("%s: kq=%p inconsist 3: %d != %d, nmarker=%d",
 1055                     __func__, kq, kq->kq_count, count, nmarker);
 1056         }
 1057 }
 1058 #else /* defined(DEBUG) */
 1059 #define kq_check(a)     /* nothing */
 1060 #endif /* defined(DEBUG) */
 1061 
 1062 /*
 1063  * Scan through the list of events on fp (for a maximum of maxevents),
  1064  * returning the results in ulistp. The timeout is determined by tsp; if
  1065  * NULL, wait indefinitely; if zero-valued, perform a poll; otherwise wait
  1066  * as appropriate.
 1067  */
 1068 static int
 1069 kqueue_scan(file_t *fp, size_t maxevents, struct kevent *ulistp,
 1070             const struct timespec *tsp, register_t *retval,
 1071             const struct kevent_ops *keops, struct kevent *kevbuf,
 1072             size_t kevcnt)
 1073 {
 1074         struct kqueue   *kq;
 1075         struct kevent   *kevp;
 1076         struct timeval  atv, sleeptv;
 1077         struct knote    *kn, *marker;
 1078         size_t          count, nkev, nevents;
 1079         int             timeout, error, rv;
 1080         filedesc_t      *fdp;
 1081 
 1082         fdp = curlwp->l_fd;
 1083         kq = fp->f_data;
 1084         count = maxevents;
 1085         nkev = nevents = error = 0;
 1086         if (count == 0) {
 1087                 *retval = 0;
 1088                 return 0;
 1089         }
 1090 
 1091         if (tsp) {                              /* timeout supplied */
 1092                 TIMESPEC_TO_TIMEVAL(&atv, tsp);
 1093                 if (inittimeleft(&atv, &sleeptv) == -1) {
 1094                         *retval = maxevents;
 1095                         return EINVAL;
 1096                 }
 1097                 timeout = tvtohz(&atv);
 1098                 if (timeout <= 0)
 1099                         timeout = -1;           /* do poll */
 1100         } else {
 1101                 /* no timeout, wait forever */
 1102                 timeout = 0;
 1103         }       
 1104 
 1105         marker = kmem_zalloc(sizeof(*marker), KM_SLEEP);
 1106         marker->kn_status = KN_MARKER;
 1107         mutex_spin_enter(&kq->kq_lock);
 1108  retry:
 1109         kevp = kevbuf;
 1110         if (kq->kq_count == 0) {
 1111                 if (timeout >= 0) {
 1112                         error = cv_timedwait_sig(&kq->kq_cv,
 1113                             &kq->kq_lock, timeout);
 1114                         if (error == 0) {
 1115                                  if (tsp == NULL || (timeout =
 1116                                      gettimeleft(&atv, &sleeptv)) > 0)
 1117                                         goto retry;
 1118                         } else {
 1119                                 /* don't restart after signals... */
 1120                                 if (error == ERESTART)
 1121                                         error = EINTR;
 1122                                 if (error == EWOULDBLOCK)
 1123                                         error = 0;
 1124                         }
 1125                 }
 1126         } else {
 1127                 /* mark end of knote list */
 1128                 TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
 1129 
 1130                 while (count != 0) {
 1131                         kn = TAILQ_FIRST(&kq->kq_head); /* get next knote */
 1132                         while ((kn->kn_status & KN_MARKER) != 0) {
 1133                                 if (kn == marker) {
 1134                                         /* it's our marker, stop */
 1135                                         TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
 1136                                         if (count < maxevents || (tsp != NULL &&
 1137                                             (timeout = gettimeleft(&atv,
 1138                                             &sleeptv)) <= 0))
 1139                                                 goto done;
 1140                                         goto retry;
 1141                                 }
 1142                                 /* someone else's marker. */
 1143                                 kn = TAILQ_NEXT(kn, kn_tqe);
 1144                         }
 1145                         kq_check(kq);
 1146                         TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
 1147                         kq->kq_count--;
 1148                         kn->kn_status &= ~KN_QUEUED;
 1149                         kq_check(kq);
 1150                         if (kn->kn_status & KN_DISABLED) {
 1151                                 /* don't want disabled events */
 1152                                 continue;
 1153                         }
 1154                         if ((kn->kn_flags & EV_ONESHOT) == 0) {
 1155                                 mutex_spin_exit(&kq->kq_lock);
 1156                                 KERNEL_LOCK(1, NULL);           /* XXXSMP */
 1157                                 rv = (*kn->kn_fop->f_event)(kn, 0);
 1158                                 KERNEL_UNLOCK_ONE(NULL);        /* XXXSMP */
 1159                                 mutex_spin_enter(&kq->kq_lock);
 1160                                 /* Re-poll if note was re-enqueued. */
 1161                                 if ((kn->kn_status & KN_QUEUED) != 0)
 1162                                         continue;
 1163                                 if (rv == 0) {
 1164                                         /*
 1165                                          * non-ONESHOT event that hasn't
 1166                                          * triggered again, so de-queue.
 1167                                          */
 1168                                         kn->kn_status &= ~KN_ACTIVE;
 1169                                         continue;
 1170                                 }
 1171                         }
 1172                         /* XXXAD should be got from f_event if !oneshot. */
 1173                         *kevp++ = kn->kn_kevent;
 1174                         nkev++;
 1175                         if (kn->kn_flags & EV_ONESHOT) {
 1176                                 /* delete ONESHOT events after retrieval */
 1177                                 mutex_spin_exit(&kq->kq_lock);
 1178                                 mutex_enter(&fdp->fd_lock);
 1179                                 knote_detach(kn, fdp, true);
 1180                                 mutex_spin_enter(&kq->kq_lock);
 1181                         } else if (kn->kn_flags & EV_CLEAR) {
 1182                                 /* clear state after retrieval */
 1183                                 kn->kn_data = 0;
 1184                                 kn->kn_fflags = 0;
 1185                                 kn->kn_status &= ~KN_ACTIVE;
 1186                         } else {
 1187                                 /* add event back on list */
 1188                                 kq_check(kq);
 1189                                 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
 1190                                 kq->kq_count++;
 1191                                 kn->kn_status |= KN_QUEUED;
 1192                                 kq_check(kq);
 1193                         }
 1194                         if (nkev == kevcnt) {
 1195                                 /* do copyouts in kevcnt chunks */
 1196                                 mutex_spin_exit(&kq->kq_lock);
 1197                                 error = (*keops->keo_put_events)
 1198                                     (keops->keo_private,
 1199                                     kevbuf, ulistp, nevents, nkev);
 1200                                 mutex_spin_enter(&kq->kq_lock);
 1201                                 nevents += nkev;
 1202                                 nkev = 0;
 1203                                 kevp = kevbuf;
 1204                         }
 1205                         count--;
 1206                         if (error != 0 || count == 0) {
 1207                                 /* remove marker */
 1208                                 TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
 1209                                 break;
 1210                         }
 1211                 }
 1212         }
 1213  done:
 1214         mutex_spin_exit(&kq->kq_lock);
 1215         if (marker != NULL)
 1216                 kmem_free(marker, sizeof(*marker));
 1217         if (nkev != 0) {
 1218                 /* copyout remaining events */
 1219                 error = (*keops->keo_put_events)(keops->keo_private,
 1220                     kevbuf, ulistp, nevents, nkev);
 1221         }
 1222         *retval = maxevents - count;
 1223 
 1224         return error;
 1225 }
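
       /*
        * Illustrative sketch (not part of the original listing): the
        * three timeout modes accepted by the scan above, as seen from
        * userland.
        *
        *         struct kevent evs[8];
        *         struct timespec ts = { 0, 0 };
        *
        *         kevent(kq, NULL, 0, evs, 8, NULL);  // block indefinitely
        *         kevent(kq, NULL, 0, evs, 8, &ts);   // zero timeout: poll
        *         ts.tv_sec = 5;
        *         kevent(kq, NULL, 0, evs, 8, &ts);   // wait up to 5 seconds
        */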
 1226 
 1227 /*
 1228  * fileops ioctl method for a kqueue descriptor.
 1229  *
 1230  * Two ioctls are currently supported. They both use struct kfilter_mapping:
  1231  *      KFILTER_BYFILTER        find name for filter, and return result in
  1232  *                              name, which is of size len.
  1233  *      KFILTER_BYNAME          find filter for name. len is ignored.
 1234  */
 1235 /*ARGSUSED*/
 1236 static int
 1237 kqueue_ioctl(file_t *fp, u_long com, void *data)
 1238 {
 1239         struct kfilter_mapping  *km;
 1240         const struct kfilter    *kfilter;
 1241         char                    *name;
 1242         int                     error;
 1243 
 1244         km = data;
 1245         error = 0;
 1246         name = kmem_alloc(KFILTER_MAXNAME, KM_SLEEP);
 1247 
 1248         switch (com) {
 1249         case KFILTER_BYFILTER:  /* convert filter -> name */
 1250                 rw_enter(&kqueue_filter_lock, RW_READER);
 1251                 kfilter = kfilter_byfilter(km->filter);
 1252                 if (kfilter != NULL) {
 1253                         strlcpy(name, kfilter->name, KFILTER_MAXNAME);
 1254                         rw_exit(&kqueue_filter_lock);
 1255                         error = copyoutstr(name, km->name, km->len, NULL);
 1256                 } else {
 1257                         rw_exit(&kqueue_filter_lock);
 1258                         error = ENOENT;
 1259                 }
 1260                 break;
 1261 
 1262         case KFILTER_BYNAME:    /* convert name -> filter */
 1263                 error = copyinstr(km->name, name, KFILTER_MAXNAME, NULL);
 1264                 if (error) {
 1265                         break;
 1266                 }
 1267                 rw_enter(&kqueue_filter_lock, RW_READER);
 1268                 kfilter = kfilter_byname(name);
 1269                 if (kfilter != NULL)
 1270                         km->filter = kfilter->filter;
 1271                 else
 1272                         error = ENOENT;
 1273                 rw_exit(&kqueue_filter_lock);
 1274                 break;
 1275 
 1276         default:
 1277                 error = ENOTTY;
 1278                 break;
 1279 
 1280         }
 1281         kmem_free(name, KFILTER_MAXNAME);
 1282         return (error);
 1283 }
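
       /*
        * Illustrative sketch (not part of the original listing):
        * resolving a filter name to its id through the ioctl above,
        * issued on a kqueue descriptor.
        *
        *         struct kfilter_mapping km;
        *         char buf[KFILTER_MAXNAME] = "EVFILT_READ";
        *
        *         km.name = buf;
        *         km.len = sizeof(buf);   // ignored for KFILTER_BYNAME
        *         if (ioctl(kq, KFILTER_BYNAME, &km) == 0)
        *                 printf("filter id %u\n", km.filter);
        */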
 1284 
 1285 /*
 1286  * fileops fcntl method for a kqueue descriptor.
 1287  */
 1288 static int
 1289 kqueue_fcntl(file_t *fp, u_int com, void *data)
 1290 {
 1291 
 1292         return (ENOTTY);
 1293 }
 1294 
 1295 /*
 1296  * fileops poll method for a kqueue descriptor.
 1297  * Determine if kqueue has events pending.
 1298  */
 1299 static int
 1300 kqueue_poll(file_t *fp, int events)
 1301 {
 1302         struct kqueue   *kq;
 1303         int             revents;
 1304 
 1305         kq = fp->f_data;
 1306 
 1307         revents = 0;
 1308         if (events & (POLLIN | POLLRDNORM)) {
 1309                 mutex_spin_enter(&kq->kq_lock);
 1310                 if (kq->kq_count != 0) {
 1311                         revents |= events & (POLLIN | POLLRDNORM);
 1312                 } else {
 1313                         selrecord(curlwp, &kq->kq_sel);
 1314                 }
 1315                 kq_check(kq);
 1316                 mutex_spin_exit(&kq->kq_lock);
 1317         }
 1318 
 1319         return revents;
 1320 }
 1321 
 1322 /*
 1323  * fileops stat method for a kqueue descriptor.
  1324  * Returns dummy info, with st_size being the number of events pending.
 1325  */
 1326 static int
 1327 kqueue_stat(file_t *fp, struct stat *st)
 1328 {
 1329         struct kqueue *kq;
 1330 
 1331         kq = fp->f_data;
 1332 
 1333         memset(st, 0, sizeof(*st));
 1334         st->st_size = kq->kq_count;
 1335         st->st_blksize = sizeof(struct kevent);
 1336         st->st_mode = S_IFIFO;
 1337 
 1338         return 0;
 1339 }
 1340 
 1341 static void
 1342 kqueue_doclose(struct kqueue *kq, struct klist *list, int fd)
 1343 {
 1344         struct knote *kn;
 1345         filedesc_t *fdp;
 1346 
 1347         fdp = kq->kq_fdp;
 1348 
 1349         KASSERT(mutex_owned(&fdp->fd_lock));
 1350 
 1351         for (kn = SLIST_FIRST(list); kn != NULL;) {
 1352                 if (kq != kn->kn_kq) {
 1353                         kn = SLIST_NEXT(kn, kn_link);
 1354                         continue;
 1355                 }
 1356                 knote_detach(kn, fdp, true);
 1357                 mutex_enter(&fdp->fd_lock);
 1358                 kn = SLIST_FIRST(list);
 1359         }
 1360 }
 1361 
 1362 
 1363 /*
 1364  * fileops close method for a kqueue descriptor.
 1365  */
 1366 static int
 1367 kqueue_close(file_t *fp)
 1368 {
 1369         struct kqueue *kq;
 1370         filedesc_t *fdp;
 1371         fdfile_t *ff;
 1372         int i;
 1373 
 1374         kq = fp->f_data;
 1375         fdp = curlwp->l_fd;
 1376 
 1377         mutex_enter(&fdp->fd_lock);
 1378         for (i = 0; i <= fdp->fd_lastkqfile; i++) {
 1379                 if ((ff = fdp->fd_ofiles[i]) == NULL)
 1380                         continue;
 1381                 kqueue_doclose(kq, (struct klist *)&ff->ff_knlist, i);
 1382         }
 1383         if (fdp->fd_knhashmask != 0) {
 1384                 for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
 1385                         kqueue_doclose(kq, &fdp->fd_knhash[i], -1);
 1386                 }
 1387         }
 1388         mutex_exit(&fdp->fd_lock);
 1389 
 1390         KASSERT(kq->kq_count == 0);
 1391         mutex_destroy(&kq->kq_lock);
 1392         cv_destroy(&kq->kq_cv);
 1393         seldestroy(&kq->kq_sel);
 1394         kmem_free(kq, sizeof(*kq));
 1395         fp->f_data = NULL;
 1396 
 1397         return (0);
 1398 }
 1399 
 1400 /*
 1401  * struct fileops kqfilter method for a kqueue descriptor.
 1402  * Event triggered when monitored kqueue changes.
 1403  */
 1404 static int
 1405 kqueue_kqfilter(file_t *fp, struct knote *kn)
 1406 {
 1407         struct kqueue *kq;
 1408         filedesc_t *fdp;
 1409 
 1410         kq = ((file_t *)kn->kn_obj)->f_data;
 1411 
 1412         KASSERT(fp == kn->kn_obj);
 1413 
 1414         if (kn->kn_filter != EVFILT_READ)
 1415                 return 1;
 1416 
 1417         kn->kn_fop = &kqread_filtops;
 1418         fdp = curlwp->l_fd;
 1419         mutex_enter(&kq->kq_lock);
 1420         SLIST_INSERT_HEAD(&kq->kq_sel.sel_klist, kn, kn_selnext);
 1421         mutex_exit(&kq->kq_lock);
 1422 
 1423         return 0;
 1424 }
 1425 
 1426 
 1427 /*
 1428  * Walk down a list of knotes, activating them if their event has
 1429  * triggered.  The caller's object lock (e.g. device driver lock)
 1430  * must be held.
 1431  */
 1432 void
 1433 knote(struct klist *list, long hint)
 1434 {
 1435         struct knote *kn;
 1436 
 1437         SLIST_FOREACH(kn, list, kn_selnext) {
 1438                 if ((*kn->kn_fop->f_event)(kn, hint))
 1439                         knote_activate(kn);
 1440         }
 1441 }
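
       /*
        * Illustrative sketch (not part of the original listing): a
        * driver reporting new data via knote() above.  sc, sc_lock and
        * sc_rsel (a struct selinfo in the softc) are hypothetical, with
        * the driver's own lock standing in for the object lock.
        *
        *         mutex_enter(&sc->sc_lock);
        *         sc->sc_avail += len;
        *         knote(&sc->sc_rsel.sel_klist, 0);
        *         mutex_exit(&sc->sc_lock);
        */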
 1442 
 1443 /*
 1444  * Remove all knotes referencing a specified fd
 1445  */
 1446 void
 1447 knote_fdclose(int fd)
 1448 {
 1449         struct klist *list;
 1450         struct knote *kn;
 1451         filedesc_t *fdp;
 1452 
 1453         fdp = curlwp->l_fd;
 1454         list = (struct klist *)&fdp->fd_ofiles[fd]->ff_knlist;
 1455         mutex_enter(&fdp->fd_lock);
 1456         while ((kn = SLIST_FIRST(list)) != NULL) {
 1457                 knote_detach(kn, fdp, true);
 1458                 mutex_enter(&fdp->fd_lock);
 1459         }
 1460         mutex_exit(&fdp->fd_lock);
 1461 }
 1462 
 1463 /*
 1464  * Drop knote.  Called with fdp->fd_lock held, and will drop before
 1465  * returning.
 1466  */
 1467 static void
 1468 knote_detach(struct knote *kn, filedesc_t *fdp, bool dofop)
 1469 {
 1470         struct klist *list;
 1471         struct kqueue *kq;
 1472 
 1473         kq = kn->kn_kq;
 1474 
 1475         KASSERT((kn->kn_status & KN_MARKER) == 0);
 1476         KASSERT(mutex_owned(&fdp->fd_lock));
 1477 
 1478         /* Remove from monitored object. */
 1479         if (dofop) {
 1480                 KERNEL_LOCK(1, NULL);           /* XXXSMP */
 1481                 (*kn->kn_fop->f_detach)(kn);
 1482                 KERNEL_UNLOCK_ONE(NULL);        /* XXXSMP */
 1483         }
 1484 
 1485         /* Remove from descriptor table. */
 1486         if (kn->kn_fop->f_isfd)
 1487                 list = (struct klist *)&fdp->fd_ofiles[kn->kn_id]->ff_knlist;
 1488         else
 1489                 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
 1490 
 1491         SLIST_REMOVE(list, kn, knote, kn_link);
 1492 
 1493         /* Remove from kqueue. */
 1494         /* XXXAD should verify not in use by kqueue_scan. */
 1495         mutex_spin_enter(&kq->kq_lock);
 1496         if ((kn->kn_status & KN_QUEUED) != 0) {
 1497                 kq_check(kq);
 1498                 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
 1499                 kn->kn_status &= ~KN_QUEUED;
 1500                 kq->kq_count--;
 1501                 kq_check(kq);
 1502         }
 1503         mutex_spin_exit(&kq->kq_lock);
 1504 
 1505         mutex_exit(&fdp->fd_lock);
 1506         if (kn->kn_fop->f_isfd)         
 1507                 fd_putfile(kn->kn_id);
 1508         atomic_dec_uint(&kn->kn_kfilter->refcnt);
 1509         kmem_free(kn, sizeof(*kn));
 1510 }
 1511 
 1512 /*
  1513  * Re-enable a knote, and queue it if it has a pending (active) event.
 1514  */
 1515 static void
 1516 knote_enqueue(struct knote *kn)
 1517 {
 1518         struct kqueue *kq;
 1519 
 1520         KASSERT((kn->kn_status & KN_MARKER) == 0);
 1521 
 1522         kq = kn->kn_kq;
 1523 
 1524         mutex_spin_enter(&kq->kq_lock);
 1525         if ((kn->kn_status & KN_DISABLED) != 0) {
 1526                 kn->kn_status &= ~KN_DISABLED;
 1527         }
 1528         if ((kn->kn_status & (KN_ACTIVE | KN_QUEUED)) == KN_ACTIVE) {
 1529                 kq_check(kq);
 1530                 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
 1531                 kn->kn_status |= KN_QUEUED;
 1532                 kq->kq_count++;
 1533                 kq_check(kq);
 1534                 cv_broadcast(&kq->kq_cv);
 1535                 selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
 1536         }
 1537         mutex_spin_exit(&kq->kq_lock);
 1538 }
 1539 /*
  1540  * Activate a knote and queue it, unless it is disabled or already queued.
 1541  */
 1542 static void
 1543 knote_activate(struct knote *kn)
 1544 {
 1545         struct kqueue *kq;
 1546 
 1547         KASSERT((kn->kn_status & KN_MARKER) == 0);
 1548 
 1549         kq = kn->kn_kq;
 1550 
 1551         mutex_spin_enter(&kq->kq_lock);
 1552         kn->kn_status |= KN_ACTIVE;
 1553         if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) {
 1554                 kq_check(kq);
 1555                 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
 1556                 kn->kn_status |= KN_QUEUED;
 1557                 kq->kq_count++;
 1558                 kq_check(kq);
 1559                 cv_broadcast(&kq->kq_cv);
 1560                 selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
 1561         }
 1562         mutex_spin_exit(&kq->kq_lock);
 1563 }
