FreeBSD/Linux Kernel Cross Reference
sys/kern/sys_select.c

/*      $NetBSD: sys_select.c,v 1.60 2022/06/29 22:27:01 riastradh Exp $        */

/*-
 * Copyright (c) 2007, 2008, 2009, 2010, 2019, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran and Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *      The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)sys_generic.c       8.9 (Berkeley) 2/14/95
 */

/*
 * System calls of the synchronous I/O multiplexing subsystem.
 *
 * Locking
 *
 * Two locks are used: <object-lock> and selcluster_t::sc_lock.
 *
 * The <object-lock> is owned by a device driver or another subsystem, e.g.
 * socket or pipe.  This lock is not exported, and thus invisible to this
 * subsystem.  Mainly, synchronisation between the selrecord() and selnotify()
 * routines depends on this lock, as described in the comments below.
 *
 * Lock order
 *
 *      <object-lock> ->
 *              selcluster_t::sc_lock
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.60 2022/06/29 22:27:01 riastradh Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/lwp.h>
#include <sys/poll.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/sleepq.h>
#include <sys/sysctl.h>
#include <sys/bitops.h>

/* Flags for lwp::l_selflag. */
#define SEL_RESET       0       /* awoken, interrupted, or not yet polling */
#define SEL_SCANNING    1       /* polling descriptors */
#define SEL_BLOCKING    2       /* blocking and waiting for event */
#define SEL_EVENT       3       /* interrupted, events set directly */

/*
 * Per-cluster state for select()/poll().  For a system with 64 or fewer
 * CPUs, this gives us per-CPU clusters.
 */
#define SELCLUSTERS     64
#define SELCLUSTERMASK  (SELCLUSTERS - 1)

typedef struct selcluster {
        kmutex_t        *sc_lock;
        sleepq_t        sc_sleepq;
        uint64_t        sc_mask;
        int             sc_ncoll;
} selcluster_t;

static inline int       selscan(char *, const int, const size_t, register_t *);
static inline int       pollscan(struct pollfd *, const int, register_t *);
static void             selclear(void);

/* Events of interest for each of the three select() sets. */
static const int sel_flag[] = {
        POLLRDNORM | POLLHUP | POLLERR,         /* readfds */
        POLLWRNORM | POLLHUP | POLLERR,         /* writefds */
        POLLRDBAND                              /* exceptfds */
};

/*
 * LWPs are woken using the sleep queue only due to a collision, the case
 * with the maximum Suck Factor.  Save the cost of sorting for named waiters
 * by inserting in LIFO order.  In the future it would be preferable to not
 * enqueue LWPs at all, unless subject to a collision.
 */
syncobj_t select_sobj = {
        .sobj_flag      = SOBJ_SLEEPQ_LIFO,
        .sobj_unsleep   = sleepq_unsleep,
        .sobj_changepri = sleepq_changepri,
        .sobj_lendpri   = sleepq_lendpri,
        .sobj_owner     = syncobj_noowner,
};

static selcluster_t     *selcluster[SELCLUSTERS] __read_mostly;
static int              direct_select __read_mostly = 0;

/* Operations: either select() or poll(). */
const char              selop_select[] = "select";
const char              selop_poll[] = "poll";

/*
 * Select system call.
 */
int
sys___pselect50(struct lwp *l, const struct sys___pselect50_args *uap,
    register_t *retval)
{
        /* {
                syscallarg(int)                         nd;
                syscallarg(fd_set *)                    in;
                syscallarg(fd_set *)                    ou;
                syscallarg(fd_set *)                    ex;
                syscallarg(const struct timespec *)     ts;
                syscallarg(sigset_t *)                  mask;
        } */
        struct timespec ats, *ts = NULL;
        sigset_t        amask, *mask = NULL;
        int             error;

        if (SCARG(uap, ts)) {
                error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
                if (error)
                        return error;
                ts = &ats;
        }
        if (SCARG(uap, mask) != NULL) {
                error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
                if (error)
                        return error;
                mask = &amask;
        }

        return selcommon(retval, SCARG(uap, nd), SCARG(uap, in),
            SCARG(uap, ou), SCARG(uap, ex), ts, mask);
}
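
/*
 * Illustrative userland sketch (not part of this file): pselect() lets a
 * program atomically unblock signals only for the duration of the wait,
 * closing the race where a signal arrives between sigprocmask() and
 * select().  A minimal, hypothetical use:
 *
 *      #include <sys/select.h>
 *      #include <signal.h>
 *
 *      fd_set rset;
 *      sigset_t waitmask;
 *      struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
 *
 *      FD_ZERO(&rset);
 *      FD_SET(sock, &rset);            // 'sock' is some open descriptor
 *      sigemptyset(&waitmask);         // allow all signals while waiting
 *
 *      int n = pselect(sock + 1, &rset, NULL, NULL, &ts, &waitmask);
 *      if (n > 0 && FD_ISSET(sock, &rset))
 *              ;                       // 'sock' is readable
 *
 * The kernel side above simply copies in the optional timespec and sigset
 * before handing off to selcommon().
 */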

int
sys___select50(struct lwp *l, const struct sys___select50_args *uap,
    register_t *retval)
{
        /* {
                syscallarg(int)                 nd;
                syscallarg(fd_set *)            in;
                syscallarg(fd_set *)            ou;
                syscallarg(fd_set *)            ex;
                syscallarg(struct timeval *)    tv;
        } */
        struct timeval atv;
        struct timespec ats, *ts = NULL;
        int error;

        if (SCARG(uap, tv)) {
                error = copyin(SCARG(uap, tv), (void *)&atv, sizeof(atv));
                if (error)
                        return error;

                if (atv.tv_usec < 0 || atv.tv_usec >= 1000000)
                        return EINVAL;

                TIMEVAL_TO_TIMESPEC(&atv, &ats);
                ts = &ats;
        }

        return selcommon(retval, SCARG(uap, nd), SCARG(uap, in),
            SCARG(uap, ou), SCARG(uap, ex), ts, NULL);
}
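
/*
 * Illustrative note (not from the original source): the legacy select()
 * entry point validates tv_usec and converts the timeval to a timespec,
 * e.g. a 2.5 second timeout of { tv_sec = 2, tv_usec = 500000 } becomes
 * { tv_sec = 2, tv_nsec = 500000000 }, after which both entry points
 * share selcommon().  A timeval such as { 0, 1000000 } is rejected with
 * EINVAL.
 */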

/*
 * sel_do_scan: common code to perform the scan on descriptors.
 */
static int
sel_do_scan(const char *opname, void *fds, const int nf, const size_t ni,
    struct timespec *ts, sigset_t *mask, register_t *retval)
{
        lwp_t           * const l = curlwp;
        selcluster_t    *sc;
        kmutex_t        *lock;
        struct timespec sleepts;
        int             error, timo;

        timo = 0;
        if (ts && inittimeleft(ts, &sleepts) == -1) {
                return EINVAL;
        }

        if (__predict_false(mask))
                sigsuspendsetup(l, mask);

        /*
         * We may context switch during or at any time after picking a CPU
         * and cluster to associate with, but it doesn't matter.  In the
         * unlikely event we migrate elsewhere all we risk is a little lock
         * contention; correctness is not sacrificed.
         */
        sc = curcpu()->ci_data.cpu_selcluster;
        lock = sc->sc_lock;
        l->l_selcluster = sc;

        if (opname == selop_select) {
                l->l_selbits = fds;
                l->l_selni = ni;
        } else {
                l->l_selbits = NULL;
        }

        for (;;) {
                int ncoll;

                SLIST_INIT(&l->l_selwait);
                l->l_selret = 0;

                /*
                 * No need to lock.  If this is overwritten by another value
                 * while scanning, we will retry below.  We only need to see
                 * exact state from the descriptors that we are about to poll,
                 * and lock activity resulting from fo_poll is enough to
                 * provide an up-to-date value for new polling activity.
                 */
                if (ts && (ts->tv_sec | ts->tv_nsec | direct_select) == 0) {
                        /* Non-blocking: no need for selrecord()/selclear() */
                        l->l_selflag = SEL_RESET;
                } else {
                        l->l_selflag = SEL_SCANNING;
                }
                ncoll = sc->sc_ncoll;
                membar_release();

                if (opname == selop_select) {
                        error = selscan((char *)fds, nf, ni, retval);
                } else {
                        error = pollscan((struct pollfd *)fds, nf, retval);
                }
                if (error || *retval)
                        break;
                if (ts && (timo = gettimeleft(ts, &sleepts)) <= 0)
                        break;
                /*
                 * Acquire the lock and perform the (re)checks.  Note: if a
                 * collision has occurred, then our state does not matter,
                 * as we must perform a re-scan.  Therefore, check it first.
                 */
state_check:
                mutex_spin_enter(lock);
                if (__predict_false(sc->sc_ncoll != ncoll)) {
                        /* Collision: perform re-scan. */
                        mutex_spin_exit(lock);
                        selclear();
                        continue;
                }
                if (__predict_true(l->l_selflag == SEL_EVENT)) {
                        /* Events occurred, they are set directly. */
                        mutex_spin_exit(lock);
                        break;
                }
                if (__predict_true(l->l_selflag == SEL_RESET)) {
                        /* Events occurred, but a re-scan is requested. */
                        mutex_spin_exit(lock);
                        selclear();
                        continue;
                }
                /* Nothing happened, therefore sleep. */
                l->l_selflag = SEL_BLOCKING;
                l->l_kpriority = true;
                sleepq_enter(&sc->sc_sleepq, l, lock);
                sleepq_enqueue(&sc->sc_sleepq, sc, opname, &select_sobj, true);
                error = sleepq_block(timo, true, &select_sobj);
                if (error != 0) {
                        break;
                }
                /* Awoken: need to check the state. */
                goto state_check;
        }
        selclear();

        /* Add direct events if any. */
        if (l->l_selflag == SEL_EVENT) {
                KASSERT(l->l_selret != 0);
                *retval += l->l_selret;
        }

        if (__predict_false(mask))
                sigsuspendteardown(l);

        /* select and poll are not restarted after signals... */
        if (error == ERESTART)
                return EINTR;
        if (error == EWOULDBLOCK)
                return 0;
        return error;
}
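
/*
 * Illustrative summary (not from the original source) of the l_selflag
 * state machine driven by sel_do_scan() and selnotify():
 *
 *      SEL_SCANNING -> SEL_BLOCKING    scan found nothing; going to sleep
 *      SEL_SCANNING -> SEL_RESET       selnotify() fired mid-scan; re-scan
 *      SEL_BLOCKING -> SEL_RESET       woken without direct events; re-scan
 *      SEL_*        -> SEL_EVENT       direct_select only: events stored
 *                                      directly into the caller's buffers,
 *                                      no re-scan needed
 *
 * SEL_RESET is also used up front for non-blocking calls, where recording
 * waiters via selrecord() would be pointless.
 */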

int
selcommon(register_t *retval, int nd, fd_set *u_in, fd_set *u_ou,
    fd_set *u_ex, struct timespec *ts, sigset_t *mask)
{
        char            smallbits[howmany(FD_SETSIZE, NFDBITS) *
                            sizeof(fd_mask) * 6];
        char            *bits;
        int             error, nf;
        size_t          ni;

        if (nd < 0)
                return (EINVAL);
        nf = atomic_load_consume(&curlwp->l_fd->fd_dt)->dt_nfiles;
        if (nd > nf) {
                /* forgiving; slightly wrong */
                nd = nf;
        }
        ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
        if (ni * 6 > sizeof(smallbits))
                bits = kmem_alloc(ni * 6, KM_SLEEP);
        else
                bits = smallbits;

#define getbits(name, x)                                                \
        if (u_ ## name) {                                               \
                error = copyin(u_ ## name, bits + ni * x, ni);          \
                if (error)                                              \
                        goto fail;                                      \
        } else                                                          \
                memset(bits + ni * x, 0, ni);
        getbits(in, 0);
        getbits(ou, 1);
        getbits(ex, 2);
#undef  getbits

        error = sel_do_scan(selop_select, bits, nd, ni, ts, mask, retval);
        if (error == 0 && u_in != NULL)
                error = copyout(bits + ni * 3, u_in, ni);
        if (error == 0 && u_ou != NULL)
                error = copyout(bits + ni * 4, u_ou, ni);
        if (error == 0 && u_ex != NULL)
                error = copyout(bits + ni * 5, u_ex, ni);
 fail:
        if (bits != smallbits)
                kmem_free(bits, ni * 6);
        return (error);
}
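
/*
 * Illustrative note (not from the original source): the 'bits' buffer
 * holds six fd_set images of ni bytes each, input sets first and result
 * sets after them:
 *
 *      offset 0*ni   in   (copied in from userland)
 *      offset 1*ni   ou
 *      offset 2*ni   ex
 *      offset 3*ni   in   (results, copied back out)
 *      offset 4*ni   ou
 *      offset 5*ni   ex
 *
 * selscan() reads from the first three regions and writes into the last
 * three; sel_setevents() relies on the same 3*ni split when direct_select
 * is enabled.
 */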

static inline int
selscan(char *bits, const int nfd, const size_t ni, register_t *retval)
{
        fd_mask *ibitp, *obitp;
        int msk, i, j, fd, n;
        file_t *fp;
        lwp_t *l;

        ibitp = (fd_mask *)(bits + ni * 0);
        obitp = (fd_mask *)(bits + ni * 3);
        n = 0;
        l = curlwp;

        memset(obitp, 0, ni * 3);
        for (msk = 0; msk < 3; msk++) {
                for (i = 0; i < nfd; i += NFDBITS) {
                        fd_mask ibits, obits;

                        ibits = *ibitp;
                        obits = 0;
                        while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
                                ibits &= ~(1U << j);
                                if ((fp = fd_getfile(fd)) == NULL)
                                        return (EBADF);
                                /*
                                 * Set up an argument to selrecord(), which is
                                 * a file descriptor number.
                                 */
                                l->l_selrec = fd;
                                if ((*fp->f_ops->fo_poll)(fp, sel_flag[msk])) {
                                        if (!direct_select) {
                                                /*
                                                 * Have events: do nothing in
                                                 * selrecord().
                                                 */
                                                l->l_selflag = SEL_RESET;
                                        }
                                        obits |= (1U << j);
                                        n++;
                                }
                                fd_putfile(fd);
                        }
                        if (obits != 0) {
                                if (direct_select) {
                                        kmutex_t *lock;
                                        lock = l->l_selcluster->sc_lock;
                                        mutex_spin_enter(lock);
                                        *obitp |= obits;
                                        mutex_spin_exit(lock);
                                } else {
                                        *obitp |= obits;
                                }
                        }
                        ibitp++;
                        obitp++;
                }
        }
        *retval = n;
        return (0);
}
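
/*
 * Illustrative worked example (not from the original source) of the
 * ffs()-based walk above: with NFDBITS == 32, an input word at i == 32
 * holding ibits == 0x00000012 (bits 1 and 4 set) yields
 *
 *      ffs(0x12) == 2  ->  j == 1  ->  fd == 32 + 1 == 33
 *      ffs(0x10) == 5  ->  j == 4  ->  fd == 32 + 4 == 36
 *
 * so only descriptors 33 and 36 are polled, and the loop skips the clear
 * bits entirely instead of testing all 32 positions.
 */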

/*
 * Poll system call.
 */
int
sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval)
{
        /* {
                syscallarg(struct pollfd *)     fds;
                syscallarg(u_int)               nfds;
                syscallarg(int)                 timeout;
        } */
        struct timespec ats, *ts = NULL;

        if (SCARG(uap, timeout) != INFTIM) {
                ats.tv_sec = SCARG(uap, timeout) / 1000;
                ats.tv_nsec = (SCARG(uap, timeout) % 1000) * 1000000;
                ts = &ats;
        }

        return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, NULL);
}
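
/*
 * Illustrative userland sketch (not part of this file): the millisecond
 * timeout above is what poll(2) callers pass directly, e.g.:
 *
 *      #include <poll.h>
 *
 *      struct pollfd pfd[2];
 *      pfd[0].fd = sock;               // hypothetical descriptors
 *      pfd[0].events = POLLIN;
 *      pfd[1].fd = tty;
 *      pfd[1].events = POLLIN | POLLOUT;
 *
 *      int n = poll(pfd, 2, 1500);     // wait at most 1.5 seconds
 *      if (n > 0 && (pfd[0].revents & POLLIN))
 *              ;                       // 'sock' is readable
 *
 * A timeout of 1500 ms becomes ats = { .tv_sec = 1, .tv_nsec = 500000000 }
 * in the conversion above; INFTIM (-1) leaves ts NULL, i.e. wait forever.
 */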

/*
 * Poll system call, with a timespec timeout and signal mask (pollts(2)).
 */
int
sys___pollts50(struct lwp *l, const struct sys___pollts50_args *uap,
    register_t *retval)
{
        /* {
                syscallarg(struct pollfd *)             fds;
                syscallarg(u_int)                       nfds;
                syscallarg(const struct timespec *)     ts;
                syscallarg(const sigset_t *)            mask;
        } */
        struct timespec ats, *ts = NULL;
        sigset_t        amask, *mask = NULL;
        int             error;

        if (SCARG(uap, ts)) {
                error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
                if (error)
                        return error;
                ts = &ats;
        }
        if (SCARG(uap, mask)) {
                error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
                if (error)
                        return error;
                mask = &amask;
        }

        return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, mask);
}

int
pollcommon(register_t *retval, struct pollfd *u_fds, u_int nfds,
    struct timespec *ts, sigset_t *mask)
{
        struct pollfd   smallfds[32];
        struct pollfd   *fds;
        int             error;
        size_t          ni;

        if (nfds > curlwp->l_proc->p_rlimit[RLIMIT_NOFILE].rlim_max + 1000) {
                /*
                 * Prevent userland from causing over-allocation.
                 * Raising the default limit too high can still cause
                 * a lot of memory to be allocated, but it also means
                 * that the file descriptor array will be large.
                 *
                 * To reduce the memory requirements here, we could
                 * process the 'fds' array in chunks, but that
                 * is a lot of code that isn't normally useful.
                 * (Or just move the copyin/out into pollscan().)
                 *
                 * Historically the code silently truncated 'fds' to
                 * dt_nfiles entries - but that does cause issues.
                 *
                 * Using the max limit equivalent to sysctl
                 * kern.maxfiles is the moral equivalent of OPEN_MAX
                 * as specified by POSIX.
                 *
                 * We add a slop of 1000 in case the resource limit was
                 * changed after opening descriptors or the same descriptor
                 * was specified more than once.
                 */
                return EINVAL;
        }
        ni = nfds * sizeof(struct pollfd);
        if (ni > sizeof(smallfds))
                fds = kmem_alloc(ni, KM_SLEEP);
        else
                fds = smallfds;

        error = copyin(u_fds, fds, ni);
        if (error)
                goto fail;

        error = sel_do_scan(selop_poll, fds, nfds, ni, ts, mask, retval);
        if (error == 0)
                error = copyout(fds, u_fds, ni);
 fail:
        if (fds != smallfds)
                kmem_free(fds, ni);
        return (error);
}
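
/*
 * Illustrative note (not from the original source): with a hard descriptor
 * limit (rlim_max) of 1024, the check above accepts nfds up to 2024; a
 * request for, say, a million pollfds fails with EINVAL instead of
 * triggering a multi-megabyte kmem_alloc() on behalf of userland.
 */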

static inline int
pollscan(struct pollfd *fds, const int nfd, register_t *retval)
{
        file_t *fp;
        int i, n = 0, revents;

        for (i = 0; i < nfd; i++, fds++) {
                fds->revents = 0;
                if (fds->fd < 0) {
                        revents = 0;
                } else if ((fp = fd_getfile(fds->fd)) == NULL) {
                        revents = POLLNVAL;
                } else {
                        /*
                         * Perform poll: registers a select request or returns
                         * the events which are set.  Set up an argument for
                         * selrecord(), which is a pointer to struct pollfd.
                         */
                        curlwp->l_selrec = (uintptr_t)fds;
                        revents = (*fp->f_ops->fo_poll)(fp,
                            fds->events | POLLERR | POLLHUP);
                        fd_putfile(fds->fd);
                }
                if (revents) {
                        if (!direct_select) {
                                /* Have events: do nothing in selrecord(). */
                                curlwp->l_selflag = SEL_RESET;
                        }
                        fds->revents = revents;
                        n++;
                }
        }
        *retval = n;
        return (0);
}

/*
 * seltrue: a poll routine for drivers whose devices are always ready
 * for non-blocking I/O.
 */
int
seltrue(dev_t dev, int events, lwp_t *l)
{

        return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Record a select request.  Concurrency issues:
 *
 * The caller holds the same lock across calls to selrecord() and
 * selnotify(), so we don't need to consider a concurrent wakeup
 * while in this routine.
 *
 * The only activity we need to guard against is selclear(), called by
 * another thread that is exiting sel_do_scan().
 * `sel_lwp' can only become non-NULL while the caller's lock is held,
 * so it cannot become non-NULL due to a change made by another thread
 * while we are in this routine.  It can only become _NULL_ due to a
 * call to selclear().
 *
 * If it is non-NULL and != selector there is the potential for
 * selclear() to be called by another thread.  If either of those
 * conditions is true, we're not interested in touching the `named
 * waiter' part of the selinfo record because we need to record a
 * collision.  Hence there is no need for additional locking in this
 * routine.
 */
void
selrecord(lwp_t *selector, struct selinfo *sip)
{
        selcluster_t *sc;
        lwp_t *other;

        KASSERT(selector == curlwp);

        sc = selector->l_selcluster;
        other = sip->sel_lwp;

        if (selector->l_selflag == SEL_RESET) {
                /* 0. We're not going to block - will poll again if needed. */
        } else if (other == selector) {
                /* 1. We (selector) already claimed to be the first LWP. */
                KASSERT(sip->sel_cluster == sc);
        } else if (other == NULL) {
                /*
                 * 2. No first LWP, therefore we (selector) are the first.
                 *
                 * There may be unnamed waiters (collisions).  Issue a memory
                 * barrier to ensure that we access sel_lwp (above) before
                 * other fields - this guards against a call to selclear().
                 */
                membar_acquire();
                sip->sel_lwp = selector;
                SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
                /* Copy the argument, which is for selnotify(). */
                sip->sel_fdinfo = selector->l_selrec;
                /* Replace selinfo's lock with the chosen cluster's lock. */
                sip->sel_cluster = sc;
        } else {
                /* 3. Multiple waiters: record a collision. */
                sip->sel_collision |= sc->sc_mask;
                KASSERT(sip->sel_cluster != NULL);
        }
}
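
/*
 * Illustrative driver-side sketch (not part of this file; all names are
 * hypothetical): a driver pairs selrecord() in its poll routine with
 * selnotify() wherever new data arrives, holding its own lock (the
 * <object-lock> from the comment at the top of this file) around both:
 *
 *      static int
 *      foo_poll(dev_t dev, int events, lwp_t *l)
 *      {
 *              struct foo_softc *sc = foo_lookup(dev);
 *              int revents = 0;
 *
 *              mutex_enter(&sc->sc_lock);
 *              if ((events & (POLLIN | POLLRDNORM)) && sc->sc_havedata)
 *                      revents |= events & (POLLIN | POLLRDNORM);
 *              else
 *                      selrecord(l, &sc->sc_rsel);
 *              mutex_exit(&sc->sc_lock);
 *              return revents;
 *      }
 *
 *      // later, from the interrupt handler or soft interrupt:
 *      mutex_enter(&sc->sc_lock);
 *      sc->sc_havedata = true;
 *      selnotify(&sc->sc_rsel, POLLIN | POLLRDNORM, NOTE_SUBMIT);
 *      mutex_exit(&sc->sc_lock);
 *
 * The selinfo (sc_rsel) would be set up with selinit() at attach time and
 * torn down with seldestroy() at detach; the third selnotify() argument is
 * the hint forwarded to KNOTE() for any registered knotes.
 */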

/*
 * Record a knote.
 *
 * The caller holds the same lock as for selrecord().
 */
void
selrecord_knote(struct selinfo *sip, struct knote *kn)
{
        klist_insert(&sip->sel_klist, kn);
}

/*
 * Remove a knote.
 *
 * The caller holds the same lock as for selrecord().
 *
 * Returns true if the last knote was removed and the list
 * is now empty.
 */
bool
selremove_knote(struct selinfo *sip, struct knote *kn)
{
        return klist_remove(&sip->sel_klist, kn);
}

/*
 * sel_setevents: a helper function for selnotify(), to set the events
 * for an LWP sleeping in selcommon() or pollcommon().
 */
static inline bool
sel_setevents(lwp_t *l, struct selinfo *sip, const int events)
{
        const int oflag = l->l_selflag;
        int ret = 0;

        /*
         * If we require a re-scan or it was required by somebody else,
         * then just (re)set SEL_RESET and return.
         */
        if (__predict_false(events == 0 || oflag == SEL_RESET)) {
                l->l_selflag = SEL_RESET;
                return true;
        }
        /*
         * Direct set.  Note: the select state of the LWP is locked.  First,
         * determine whether it is selcommon() or pollcommon().
         */
        if (l->l_selbits != NULL) {
                const size_t ni = l->l_selni;
                fd_mask *fds = (fd_mask *)l->l_selbits;
                fd_mask *ofds = (fd_mask *)((char *)fds + ni * 3);
                const int fd = sip->sel_fdinfo, fbit = 1 << (fd & __NFDMASK);
                const int idx = fd >> __NFDSHIFT;
                int n;

                for (n = 0; n < 3; n++) {
                        if ((fds[idx] & fbit) != 0 &&
                            (ofds[idx] & fbit) == 0 &&
                            (sel_flag[n] & events)) {
                                ofds[idx] |= fbit;
                                ret++;
                        }
                        fds = (fd_mask *)((char *)fds + ni);
                        ofds = (fd_mask *)((char *)ofds + ni);
                }
        } else {
                struct pollfd *pfd = (void *)sip->sel_fdinfo;
                int revents = events & (pfd->events | POLLERR | POLLHUP);

                if (revents) {
                        if (pfd->revents == 0)
                                ret = 1;
                        pfd->revents |= revents;
                }
        }
        /* Check whether there are any events to return. */
        if (!ret) {
                return false;
        }
        /* Indicate direct set and note the event (cluster lock is held). */
        l->l_selflag = SEL_EVENT;
        l->l_selret += ret;
        return true;
}
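
/*
 * Illustrative worked example (not from the original source): assuming a
 * 32-bit fd_mask (__NFDSHIFT == 5, __NFDMASK == 31), a notification for
 * fd 70 computes
 *
 *      idx  = 70 >> 5 == 2
 *      fbit = 1 << (70 & 31) == 1 << 6
 *
 * i.e. bit 6 of the third mask word, checked against each of the three
 * input sets and, if requested there and not yet reported, set in the
 * corresponding result set at offset 3 * ni.
 */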

/*
 * Do a wakeup when a selectable event occurs.  Concurrency issues:
 *
 * As per selrecord(), the caller's object lock is held.  If there
 * is a named waiter, we must acquire the associated selcluster's lock
 * in order to synchronize with selclear() and pollers going to sleep
 * in sel_do_scan().
 *
 * sip->sel_cluster cannot change at this point, as it is only changed
 * in selrecord(), and concurrent calls to selrecord() are locked
 * out by the caller.
 */
void
selnotify(struct selinfo *sip, int events, long knhint)
{
        selcluster_t *sc;
        uint64_t mask;
        int index, oflag;
        lwp_t *l;
        kmutex_t *lock;

        KNOTE(&sip->sel_klist, knhint);

        if (sip->sel_lwp != NULL) {
                /* One named LWP is waiting. */
                sc = sip->sel_cluster;
                lock = sc->sc_lock;
                mutex_spin_enter(lock);
                /* Still there? */
                if (sip->sel_lwp != NULL) {
                        /*
                         * Set the events for our LWP and indicate that.
                         * Otherwise, request a full re-scan.
                         */
                        l = sip->sel_lwp;
                        oflag = l->l_selflag;

                        if (!direct_select) {
                                l->l_selflag = SEL_RESET;
                        } else if (!sel_setevents(l, sip, events)) {
                                /* No events to return. */
                                mutex_spin_exit(lock);
                                return;
                        }

                        /*
                         * If the thread is sleeping, wake it up.  If it's
                         * not yet asleep, it will notice the change in
                         * state and will re-poll the descriptors.
                         */
                        if (oflag == SEL_BLOCKING && l->l_mutex == lock) {
                                KASSERT(l->l_wchan == sc);
                                sleepq_unsleep(l, false);
                        }
                }
                mutex_spin_exit(lock);
        }

        if ((mask = sip->sel_collision) != 0) {
                /*
                 * There was a collision (multiple waiters): we must
                 * inform all potentially interested waiters.
                 */
                sip->sel_collision = 0;
                do {
                        index = ffs64(mask) - 1;
                        mask ^= __BIT(index);
                        sc = selcluster[index];
                        lock = sc->sc_lock;
                        mutex_spin_enter(lock);
                        sc->sc_ncoll++;
                        sleepq_wake(&sc->sc_sleepq, sc, (u_int)-1, lock);
                } while (__predict_false(mask != 0));
        }
}
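
/*
 * Illustrative worked example (not from the original source): if waiters
 * on clusters 1 and 3 collided on this selinfo, sel_collision holds
 * mask == 0b1010.  The loop above then runs:
 *
 *      ffs64(0b1010) - 1 == 1  ->  wake everyone on selcluster[1]
 *      ffs64(0b1000) - 1 == 3  ->  wake everyone on selcluster[3]
 *
 * Each woken LWP sees sc_ncoll changed and performs a full re-scan,
 * which is why collisions carry the "maximum Suck Factor" noted above.
 */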

/*
 * Remove an LWP from all objects that it is waiting for.  Concurrency
 * issues:
 *
 * The object owner's (e.g. device driver) lock is not held here.  Calls
 * can be made to selrecord() and we do not synchronize against those
 * directly using locks.  However, we use `sel_lwp' to lock out changes.
 * Before clearing it we must use memory barriers to ensure that we can
 * safely traverse the list of selinfo records.
 */
static void
selclear(void)
{
        struct selinfo *sip, *next;
        selcluster_t *sc;
        lwp_t *l;
        kmutex_t *lock;

        l = curlwp;
        sc = l->l_selcluster;
        lock = sc->sc_lock;

        /*
         * If the request was non-blocking, or we found events on the first
         * descriptor, there will be no need to clear anything - avoid
         * taking the lock.
         */
        if (SLIST_EMPTY(&l->l_selwait)) {
                return;
        }

        mutex_spin_enter(lock);
        for (sip = SLIST_FIRST(&l->l_selwait); sip != NULL; sip = next) {
                KASSERT(sip->sel_lwp == l);
                KASSERT(sip->sel_cluster == l->l_selcluster);

                /*
                 * Read the link to the next selinfo record, if any.
                 * It's no longer safe to touch `sip' after clearing
                 * `sel_lwp', so ensure that the read of `sel_chain'
                 * completes before the clearing of sel_lwp becomes
                 * globally visible.
                 */
                next = SLIST_NEXT(sip, sel_chain);
                /* Release the record for another named waiter to use. */
                atomic_store_release(&sip->sel_lwp, NULL);
        }
        mutex_spin_exit(lock);
}

/*
 * Initialize the select/poll system calls.  Called once for each
 * CPU in the system, as they are attached.
 */
void
selsysinit(struct cpu_info *ci)
{
        selcluster_t *sc;
        u_int index;

        /* If there is already a cluster in place for this bit, re-use it. */
        index = cpu_index(ci) & SELCLUSTERMASK;
        sc = selcluster[index];
        if (sc == NULL) {
                sc = kmem_alloc(roundup2(sizeof(selcluster_t),
                    coherency_unit) + coherency_unit, KM_SLEEP);
                sc = (void *)roundup2((uintptr_t)sc, coherency_unit);
                sc->sc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
                sleepq_init(&sc->sc_sleepq);
                sc->sc_ncoll = 0;
                sc->sc_mask = __BIT(index);
                selcluster[index] = sc;
        }
        ci->ci_data.cpu_selcluster = sc;
}
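
/*
 * Illustrative note (not from the original source): the allocation above
 * over-allocates by one coherency_unit and rounds the pointer up so each
 * cluster starts on a cache-line boundary, avoiding false sharing between
 * clusters.  E.g. with coherency_unit == 64 and kmem_alloc() returning
 * 0x...1010, the cluster is placed at 0x...1040.  The rounded pointer is
 * never handed back to kmem_free(); clusters live for the lifetime of
 * the system, so the memory is intentionally never released.
 */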

/*
 * Initialize a selinfo record.
 */
void
selinit(struct selinfo *sip)
{

        memset(sip, 0, sizeof(*sip));
        klist_init(&sip->sel_klist);
}

/*
 * Destroy a selinfo record.  The owning object must not gain new
 * references while this is in progress: all activity on the record
 * must be stopped.
 *
 * Concurrency issues: we only need to guard against a call to selclear()
 * by a thread exiting sel_do_scan().  The caller has prevented further
 * references being made to the selinfo record via selrecord(), and it
 * will not call selnotify() again.
 */
void
seldestroy(struct selinfo *sip)
{
        selcluster_t *sc;
        kmutex_t *lock;
        lwp_t *l;

        klist_fini(&sip->sel_klist);

        if (sip->sel_lwp == NULL)
                return;

        /*
         * Lock out selclear().  The selcluster pointer can't change while
         * we are here since it is only ever changed in selrecord(),
         * and that will not be entered again for this record because
         * it is dying.
         */
        KASSERT(sip->sel_cluster != NULL);
        sc = sip->sel_cluster;
        lock = sc->sc_lock;
        mutex_spin_enter(lock);
        if ((l = sip->sel_lwp) != NULL) {
                /*
                 * This should rarely happen, so although SLIST_REMOVE()
                 * is slow, using it here is not a problem.
                 */
                KASSERT(l->l_selcluster == sc);
                SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
                sip->sel_lwp = NULL;
        }
        mutex_spin_exit(lock);
}

/*
 * System control nodes.
 */
SYSCTL_SETUP(sysctl_select_setup, "sysctl select setup")
{

        sysctl_createv(clog, 0, NULL, NULL,
                CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
                CTLTYPE_INT, "direct_select",
                SYSCTL_DESCR("Enable/disable direct select (for testing)"),
                NULL, 0, &direct_select, 0,
                CTL_KERN, CTL_CREATE, CTL_EOL);
}
