1 /* $NetBSD: sys_eventfd.c,v 1.9 2022/02/17 16:28:29 thorpej Exp $ */
2
3 /*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: sys_eventfd.c,v 1.9 2022/02/17 16:28:29 thorpej Exp $");
34
35 /*
36 * eventfd
37 *
38 * Eventfd objects present a simple counting object associated with a
39 * file descriptor. Writes and reads to this file descriptor increment
40 * and decrement the count, respectively. When the count is non-zero,
41 * the descriptor is considered "readable", and when less than the max
42 * value (EVENTFD_MAXVAL), is considered "writable".
43 *
44 * This implementation is API compatible with the Linux eventfd(2)
45 * interface.
46 */
47
#include <sys/param.h>
#include <sys/types.h>
#include <sys/condvar.h>
#include <sys/eventfd.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/select.h>
#include <sys/stat.h>
#include <sys/syscallargs.h>
#include <sys/uio.h>
62
struct eventfd {
	kmutex_t	efd_lock;	/* protects every field below */
	kcondvar_t	efd_read_wait;	/* readers sleeping for efd_val != 0 */
	kcondvar_t	efd_write_wait;	/* writers sleeping for counter space */
	struct selinfo	efd_read_sel;	/* select/poll/kqueue read interest */
	struct selinfo	efd_write_sel;	/* select/poll/kqueue write interest */
	eventfd_t	efd_val;	/* the 64-bit counter itself */
	int64_t		efd_nwaiters;	/* LWPs currently in eventfd_wait() */
	bool		efd_restarting;	/* fo_restart issued; sleepers get ERESTART */
	bool		efd_has_read_waiters;	/* efd_read_wait may have sleepers */
	bool		efd_has_write_waiters;	/* efd_write_wait may have sleepers */
	bool		efd_is_semaphore;	/* EFD_SEMAPHORE: reads decrement by 1 */

	/*
	 * Information kept for stat(2).
	 */
	struct timespec efd_btime;	/* time created */
	struct timespec	efd_mtime;	/* last write */
	struct timespec	efd_atime;	/* last read */
};

/*
 * Largest value the counter may hold.  A write that would push the
 * counter past this blocks (or fails with EAGAIN), and a write of a
 * value larger than this is rejected outright with EINVAL.
 */
#define	EVENTFD_MAXVAL	(UINT64_MAX - 1)
85
86 /*
87 * eventfd_create:
88 *
89 * Create an eventfd object.
90 */
91 static struct eventfd *
92 eventfd_create(unsigned int const val, int const flags)
93 {
94 struct eventfd * const efd = kmem_zalloc(sizeof(*efd), KM_SLEEP);
95
96 mutex_init(&efd->efd_lock, MUTEX_DEFAULT, IPL_NONE);
97 cv_init(&efd->efd_read_wait, "efdread");
98 cv_init(&efd->efd_write_wait, "efdwrite");
99 selinit(&efd->efd_read_sel);
100 selinit(&efd->efd_write_sel);
101 efd->efd_val = val;
102 efd->efd_is_semaphore = !!(flags & EFD_SEMAPHORE);
103 getnanotime(&efd->efd_btime);
104
105 /* Caller deals with EFD_CLOEXEC and EFD_NONBLOCK. */
106
107 return efd;
108 }
109
110 /*
111 * eventfd_destroy:
112 *
113 * Destroy an eventfd object.
114 */
115 static void
116 eventfd_destroy(struct eventfd * const efd)
117 {
118
119 KASSERT(efd->efd_nwaiters == 0);
120 KASSERT(efd->efd_has_read_waiters == false);
121 KASSERT(efd->efd_has_write_waiters == false);
122
123 cv_destroy(&efd->efd_read_wait);
124 cv_destroy(&efd->efd_write_wait);
125
126 seldestroy(&efd->efd_read_sel);
127 seldestroy(&efd->efd_write_sel);
128
129 mutex_destroy(&efd->efd_lock);
130
131 kmem_free(efd, sizeof(*efd));
132 }
133
/*
 * eventfd_wait:
 *
 *	Block the calling LWP until the eventfd becomes readable
 *	(is_write == false) or writable (is_write == true).  Called
 *	with efd_lock held; sleeps on the corresponding condvar and
 *	returns with the lock still held.
 *
 *	Returns 0 on a normal wakeup, EAGAIN for non-blocking
 *	descriptors, ERESTART when fo_restart is in progress, or the
 *	error from cv_wait_sig() (e.g. on signal delivery).
 */
static int
eventfd_wait(struct eventfd * const efd, int const fflag, bool const is_write)
{
	kcondvar_t *waitcv;
	int error;

	/* Non-blocking descriptors never sleep. */
	if (fflag & FNONBLOCK) {
		return EAGAIN;
	}

	/*
	 * We're going to block.  Check if we need to return ERESTART:
	 * a restart is already pending (see eventfd_fop_restart()),
	 * so bounce out and let the system call revalidate the fd.
	 */
	if (efd->efd_restarting) {
		return ERESTART;
	}

	/*
	 * Record which side is about to sleep so that wakers only
	 * broadcast when someone might actually be listening.
	 */
	if (is_write) {
		efd->efd_has_write_waiters = true;
		waitcv = &efd->efd_write_wait;
	} else {
		efd->efd_has_read_waiters = true;
		waitcv = &efd->efd_read_wait;
	}

	/* efd_nwaiters lets fo_restart know there is someone to kick. */
	efd->efd_nwaiters++;
	KASSERT(efd->efd_nwaiters > 0);
	error = cv_wait_sig(waitcv, &efd->efd_lock);
	efd->efd_nwaiters--;
	KASSERT(efd->efd_nwaiters >= 0);

	/*
	 * If a restart was triggered while we were asleep, we need
	 * to return ERESTART if no other error was returned.
	 */
	if (efd->efd_restarting) {
		if (error == 0) {
			error = ERESTART;
		}
	}

	return error;
}
183
184 /*
185 * eventfd_wake:
186 *
187 * Wake LWPs block on an eventfd.
188 */
189 static void
190 eventfd_wake(struct eventfd * const efd, bool const is_write)
191 {
192 kcondvar_t *waitcv = NULL;
193 struct selinfo *sel;
194 int pollev;
195
196 if (is_write) {
197 if (efd->efd_has_read_waiters) {
198 waitcv = &efd->efd_read_wait;
199 efd->efd_has_read_waiters = false;
200 }
201 sel = &efd->efd_read_sel;
202 pollev = POLLIN | POLLRDNORM;
203 } else {
204 if (efd->efd_has_write_waiters) {
205 waitcv = &efd->efd_write_wait;
206 efd->efd_has_write_waiters = false;
207 }
208 sel = &efd->efd_write_sel;
209 pollev = POLLOUT | POLLWRNORM;
210 }
211 if (waitcv != NULL) {
212 cv_broadcast(waitcv);
213 }
214 selnotify(sel, pollev, NOTE_SUBMIT);
215 }
216
217 /*
218 * eventfd file operations
219 */
220
/*
 * eventfd_fop_read:
 *
 *	Read the counter.  Semaphore-mode descriptors return 1 and
 *	decrement; normal descriptors return the whole counter and
 *	reset it to zero.  Blocks (or returns EAGAIN) while the
 *	counter is zero.
 */
static int
eventfd_fop_read(file_t * const fp, off_t * const offset,
    struct uio * const uio, kauth_cred_t const cred, int const flags)
{
	struct eventfd * const efd = fp->f_eventfd;
	int const fflag = fp->f_flag;
	eventfd_t return_value;
	int error;

	/* The buffer must have room for a full 64-bit counter value. */
	if (uio->uio_resid < sizeof(eventfd_t)) {
		return EINVAL;
	}

	mutex_enter(&efd->efd_lock);

	/* Wait until the counter is non-zero (or fail with
	 * EAGAIN / ERESTART / EINTR from eventfd_wait()). */
	while (efd->efd_val == 0) {
		if ((error = eventfd_wait(efd, fflag, false)) != 0) {
			mutex_exit(&efd->efd_lock);
			return error;
		}
	}

	if (efd->efd_is_semaphore) {
		return_value = 1;
		efd->efd_val--;
	} else {
		return_value = efd->efd_val;
		efd->efd_val = 0;
	}

	getnanotime(&efd->efd_atime);
	/* The counter dropped; wake any blocked writers. */
	eventfd_wake(efd, false);

	mutex_exit(&efd->efd_lock);

	/*
	 * Copy out after dropping the lock.  NOTE(review): if the
	 * copy-out faults, the value consumed above is lost; the copy
	 * is deliberately done unlocked.
	 */
	error = uiomove(&return_value, sizeof(return_value), uio);

	return error;
}
260
/*
 * eventfd_fop_write:
 *
 *	Add a value to the counter.  Blocks (or returns EAGAIN) while
 *	the addition would push the counter past EVENTFD_MAXVAL.  A
 *	value larger than EVENTFD_MAXVAL is rejected with EINVAL.
 */
static int
eventfd_fop_write(file_t * const fp, off_t * const offset,
    struct uio * const uio, kauth_cred_t const cred, int const flags)
{
	struct eventfd * const efd = fp->f_eventfd;
	int const fflag = fp->f_flag;
	eventfd_t write_value;
	int error;

	/* The write must supply a full 64-bit value. */
	if (uio->uio_resid < sizeof(eventfd_t)) {
		return EINVAL;
	}

	/* Copy the value in before taking the lock. */
	if ((error = uiomove(&write_value, sizeof(write_value), uio)) != 0) {
		return error;
	}

	if (write_value > EVENTFD_MAXVAL) {
		error = EINVAL;
		goto out;
	}

	mutex_enter(&efd->efd_lock);

	KASSERT(efd->efd_val <= EVENTFD_MAXVAL);
	/* Wait until there is room for the whole addition. */
	while ((EVENTFD_MAXVAL - efd->efd_val) < write_value) {
		if ((error = eventfd_wait(efd, fflag, true)) != 0) {
			mutex_exit(&efd->efd_lock);
			goto out;
		}
	}

	efd->efd_val += write_value;
	KASSERT(efd->efd_val <= EVENTFD_MAXVAL);

	getnanotime(&efd->efd_mtime);
	/* The counter rose; wake any blocked readers. */
	eventfd_wake(efd, true);

	mutex_exit(&efd->efd_lock);

 out:
	if (error) {
		/*
		 * Undo the effect of uiomove() so that the error
		 * gets reported correctly; see dofilewrite().
		 */
		uio->uio_resid += sizeof(write_value);
	}
	return error;
}
311
312 static int
313 eventfd_ioctl(file_t * const fp, u_long const cmd, void * const data)
314 {
315 struct eventfd * const efd = fp->f_eventfd;
316
317 switch (cmd) {
318 case FIONBIO:
319 return 0;
320
321 case FIONREAD:
322 mutex_enter(&efd->efd_lock);
323 *(int *)data = efd->efd_val != 0 ? sizeof(eventfd_t) : 0;
324 mutex_exit(&efd->efd_lock);
325 return 0;
326
327 case FIONWRITE:
328 *(int *)data = 0;
329 return 0;
330
331 case FIONSPACE:
332 /*
333 * FIONSPACE doesn't really work for eventfd, because the
334 * writability depends on the contents (value) being written.
335 */
336 break;
337
338 default:
339 break;
340 }
341
342 return EPASSTHROUGH;
343 }
344
345 static int
346 eventfd_fop_poll(file_t * const fp, int const events)
347 {
348 struct eventfd * const efd = fp->f_eventfd;
349 int revents = 0;
350
351 /*
352 * Note that Linux will return POLLERR if the eventfd count
353 * overflows, but that is not possible in the normal read/write
354 * API, only with Linux kernel-internal interfaces. So, this
355 * implementation never returns POLLERR.
356 *
357 * Also note that the Linux eventfd(2) man page does not
358 * specifically discuss returning POLLRDNORM, but we check
359 * for that event in addition to POLLIN.
360 */
361
362 mutex_enter(&efd->efd_lock);
363
364 if (events & (POLLIN | POLLRDNORM)) {
365 if (efd->efd_val != 0) {
366 revents |= events & (POLLIN | POLLRDNORM);
367 } else {
368 selrecord(curlwp, &efd->efd_read_sel);
369 }
370 }
371
372 if (events & (POLLOUT | POLLWRNORM)) {
373 if (efd->efd_val < EVENTFD_MAXVAL) {
374 revents |= events & (POLLOUT | POLLWRNORM);
375 } else {
376 selrecord(curlwp, &efd->efd_write_sel);
377 }
378 }
379
380 mutex_exit(&efd->efd_lock);
381
382 return revents;
383 }
384
385 static int
386 eventfd_fop_stat(file_t * const fp, struct stat * const st)
387 {
388 struct eventfd * const efd = fp->f_eventfd;
389
390 memset(st, 0, sizeof(*st));
391
392 mutex_enter(&efd->efd_lock);
393 st->st_size = (off_t)efd->efd_val;
394 st->st_blksize = sizeof(eventfd_t);
395 st->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
396 st->st_blocks = 1;
397 st->st_birthtimespec = st->st_ctimespec = efd->efd_btime;
398 st->st_atimespec = efd->efd_atime;
399 st->st_mtimespec = efd->efd_mtime;
400 st->st_uid = kauth_cred_geteuid(fp->f_cred);
401 st->st_gid = kauth_cred_getegid(fp->f_cred);
402 mutex_exit(&efd->efd_lock);
403
404 return 0;
405 }
406
407 static int
408 eventfd_fop_close(file_t * const fp)
409 {
410 struct eventfd * const efd = fp->f_eventfd;
411
412 fp->f_eventfd = NULL;
413 eventfd_destroy(efd);
414
415 return 0;
416 }
417
418 static void
419 eventfd_filt_read_detach(struct knote * const kn)
420 {
421 struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
422
423 mutex_enter(&efd->efd_lock);
424 KASSERT(kn->kn_hook == efd);
425 selremove_knote(&efd->efd_read_sel, kn);
426 mutex_exit(&efd->efd_lock);
427 }
428
429 static int
430 eventfd_filt_read(struct knote * const kn, long const hint)
431 {
432 struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
433 int rv;
434
435 if (hint & NOTE_SUBMIT) {
436 KASSERT(mutex_owned(&efd->efd_lock));
437 } else {
438 mutex_enter(&efd->efd_lock);
439 }
440
441 kn->kn_data = (int64_t)efd->efd_val;
442 rv = (eventfd_t)kn->kn_data > 0;
443
444 if ((hint & NOTE_SUBMIT) == 0) {
445 mutex_exit(&efd->efd_lock);
446 }
447
448 return rv;
449 }
450
/* kqueue EVFILT_READ ops: fires while the counter is non-zero. */
static const struct filterops eventfd_read_filterops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_detach = eventfd_filt_read_detach,
	.f_event = eventfd_filt_read,
};
456
457 static void
458 eventfd_filt_write_detach(struct knote * const kn)
459 {
460 struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
461
462 mutex_enter(&efd->efd_lock);
463 KASSERT(kn->kn_hook == efd);
464 selremove_knote(&efd->efd_write_sel, kn);
465 mutex_exit(&efd->efd_lock);
466 }
467
468 static int
469 eventfd_filt_write(struct knote * const kn, long const hint)
470 {
471 struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
472 int rv;
473
474 if (hint & NOTE_SUBMIT) {
475 KASSERT(mutex_owned(&efd->efd_lock));
476 } else {
477 mutex_enter(&efd->efd_lock);
478 }
479
480 kn->kn_data = (int64_t)efd->efd_val;
481 rv = (eventfd_t)kn->kn_data < EVENTFD_MAXVAL;
482
483 if ((hint & NOTE_SUBMIT) == 0) {
484 mutex_exit(&efd->efd_lock);
485 }
486
487 return rv;
488 }
489
/* kqueue EVFILT_WRITE ops: fires while the counter is below EVENTFD_MAXVAL. */
static const struct filterops eventfd_write_filterops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_detach = eventfd_filt_write_detach,
	.f_event = eventfd_filt_write,
};
495
496 static int
497 eventfd_fop_kqfilter(file_t * const fp, struct knote * const kn)
498 {
499 struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
500 struct selinfo *sel;
501
502 switch (kn->kn_filter) {
503 case EVFILT_READ:
504 sel = &efd->efd_read_sel;
505 kn->kn_fop = &eventfd_read_filterops;
506 break;
507
508 case EVFILT_WRITE:
509 sel = &efd->efd_write_sel;
510 kn->kn_fop = &eventfd_write_filterops;
511 break;
512
513 default:
514 return EINVAL;
515 }
516
517 kn->kn_hook = efd;
518
519 mutex_enter(&efd->efd_lock);
520 selrecord_knote(sel, kn);
521 mutex_exit(&efd->efd_lock);
522
523 return 0;
524 }
525
/*
 * eventfd_fop_restart:
 *
 *	Force any LWPs blocked in read/write on this eventfd to
 *	unwind with ERESTART so that a pending close() can complete;
 *	the restarted system call then revalidates the descriptor.
 */
static void
eventfd_fop_restart(file_t * const fp)
{
	struct eventfd * const efd = fp->f_eventfd;

	/*
	 * Unblock blocked reads/writes in order to allow close() to complete.
	 * System calls return ERESTART so that the fd is revalidated.
	 */

	mutex_enter(&efd->efd_lock);

	if (efd->efd_nwaiters != 0) {
		/*
		 * efd_restarting makes eventfd_wait() return ERESTART,
		 * both for the LWPs woken here and for any LWP that
		 * attempts to block from now on.
		 */
		efd->efd_restarting = true;
		if (efd->efd_has_read_waiters) {
			cv_broadcast(&efd->efd_read_wait);
			efd->efd_has_read_waiters = false;
		}
		if (efd->efd_has_write_waiters) {
			cv_broadcast(&efd->efd_write_wait);
			efd->efd_has_write_waiters = false;
		}
	}

	mutex_exit(&efd->efd_lock);
}
552
/*
 * File operations vector for eventfd descriptors.  eventfds have no
 * file position, so fo_read/fo_write ignore their offset argument;
 * fcntl gets the generic null handler.
 */
static const struct fileops eventfd_fileops = {
	.fo_name = "eventfd",
	.fo_read = eventfd_fop_read,
	.fo_write = eventfd_fop_write,
	.fo_ioctl = eventfd_ioctl,
	.fo_fcntl = fnullop_fcntl,
	.fo_poll = eventfd_fop_poll,
	.fo_stat = eventfd_fop_stat,
	.fo_close = eventfd_fop_close,
	.fo_kqfilter = eventfd_fop_kqfilter,
	.fo_restart = eventfd_fop_restart,
};
565
566 /*
567 * eventfd(2) system call
568 */
569 int
570 do_eventfd(struct lwp * const l, unsigned int const val, int const flags,
571 register_t *retval)
572 {
573 file_t *fp;
574 int fd, error;
575
576 if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK | EFD_SEMAPHORE)) {
577 return EINVAL;
578 }
579
580 if ((error = fd_allocfile(&fp, &fd)) != 0) {
581 return error;
582 }
583
584 fp->f_flag = FREAD | FWRITE;
585 if (flags & EFD_NONBLOCK) {
586 fp->f_flag |= FNONBLOCK;
587 }
588 fp->f_type = DTYPE_EVENTFD;
589 fp->f_ops = &eventfd_fileops;
590 fp->f_eventfd = eventfd_create(val, flags);
591 fd_set_exclose(l, fd, !!(flags & EFD_CLOEXEC));
592 fd_affix(curproc, fp, fd);
593
594 *retval = fd;
595 return 0;
596 }
597
/*
 * sys_eventfd:
 *
 *	eventfd(2) system call entry point; unpacks the syscall
 *	arguments and defers to do_eventfd().
 */
int
sys_eventfd(struct lwp *l, const struct sys_eventfd_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(unsigned int) val;
		syscallarg(int) flags;
	} */

	return do_eventfd(l, SCARG(uap, val), SCARG(uap, flags), retval);
}
Cache object: 40292b39a95ef333ae3390ba7c396520
|