FreeBSD/Linux Kernel Cross Reference
sys/kern/sys_futex.c

/*      $NetBSD: sys_futex.c,v 1.18 2022/04/21 12:05:13 riastradh Exp $ */

/*-
 * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Taylor R. Campbell and Jason R. Thorpe.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.18 2022/04/21 12:05:13 riastradh Exp $");

/*
 * Futexes
 *
 *      The futex system call coordinates notifying threads waiting for
 *      changes on a 32-bit word of memory.  The word can be managed by
 *      CPU atomic operations in userland, without system calls, as long
 *      as there is no contention.
 *
 *      The simplest use case demonstrating the utility is:
 *
 *              // 32-bit word of memory shared among threads or
 *              // processes in userland.  lock & 1 means owned;
 *              // lock & 2 means there are waiters waiting.
 *              volatile int lock = 0;
 *
 *              int v;
 *
 *              // Acquire a lock.
 *              do {
 *                      v = lock;
 *                      if (v & 1) {
 *                              // Lock is held.  Set a bit to say that
 *                              // there are waiters, and wait for lock
 *                              // to change to anything other than v;
 *                              // then retry.
 *                              if (atomic_cas_uint(&lock, v, v | 2) != v)
 *                                      continue;
 *                              futex(FUTEX_WAIT, &lock, v | 2, NULL, NULL, 0);
 *                              continue;
 *                      }
 *              } while (atomic_cas_uint(&lock, v, v | 1) != v);
 *              membar_acquire();
 *
 *              ...
 *
 *              // Release the lock.  Optimistically assume there are
 *              // no waiters first until demonstrated otherwise.
 *              membar_release();
 *              if (atomic_cas_uint(&lock, 1, 0) != 1) {
 *                      // There may be waiters.
 *                      v = atomic_swap_uint(&lock, 0);
 *                      // If there are still waiters, wake one.
 *                      if (v & 2)
 *                              futex(FUTEX_WAKE, &lock, 1, NULL, NULL, 0);
 *              }
 *
 *      The goal is to avoid the futex system call unless there is
 *      contention; then if there is contention, to guarantee no missed
 *      wakeups.
 *
 *      For a simple implementation, futex(FUTEX_WAIT) could queue
 *      itself to be woken, double-check the lock word, and then sleep;
 *      spurious wakeups are generally a fact of life, so any
 *      FUTEX_WAKE could just wake every FUTEX_WAIT in the system.
 *
 *      If this were all there is to it, we could then increase
 *      parallelism by refining the approximation: partition the
 *      waiters into buckets by hashing the lock addresses to reduce
 *      the incidence of spurious wakeups.  But this is not all.
 *
 *      The futex(FUTEX_CMP_REQUEUE, &lock, n, &lock2, m, val)
 *      operation not only wakes n waiters on lock if lock == val, but
 *      also _transfers_ m additional waiters to lock2.  Unless wakeups
 *      on lock2 also trigger wakeups on lock, we cannot move waiters
 *      to lock2 if they merely share the same hash as waiters on lock.
 *      Thus, we can't approximately distribute waiters into queues by
 *      a hash function; we must distinguish futex queues exactly by
 *      lock address.
 *
 *      For now, we use a global red/black tree to index futexes.  This
 *      should be replaced by a lockless radix tree with a thread to
 *      free entries no longer in use once all lookups on all CPUs have
 *      completed.
 *
 *      Specifically, we maintain two maps:
 *
 *      futex_tab.va[vmspace, va] for private futexes
 *      futex_tab.oa[uvm_voaddr] for shared futexes
 *
 *      This implementation does not support priority inheritance.
 */
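
/*
 * For illustration, the FUTEX_CMP_REQUEUE operation above is the
 * building block for a userland condition-variable broadcast: wake
 * one waiter and drain the rest onto the mutex's futex instead of
 * waking them all at once.  A sketch, using the same hypothetical
 * futex() wrapper as the example above, where cv and mutex are
 * 32-bit words and gen is the generation number previously read
 * from cv:
 *
 *              // Wake one waiter on cv; requeue up to INT_MAX
 *              // others onto mutex, provided cv still equals gen.
 *              futex(FUTEX_CMP_REQUEUE, &cv, 1, &mutex, INT_MAX, gen);
 *
 *      If cv has changed in the meantime, the call fails with EAGAIN
 *      and the caller rereads the generation and retries.
 */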

#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/condvar.h>
#include <sys/futex.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/rbtree.h>
#include <sys/queue.h>

#include <sys/syscall.h>
#include <sys/syscallargs.h>
#include <sys/syscallvar.h>

#include <uvm/uvm_extern.h>

/*
 * Lock order:
 *
 *      futex_tab.lock
 *      futex::fx_qlock                 ordered by kva of struct futex
 *       -> futex_wait::fw_lock         only one at a time
 *      futex_wait::fw_lock             only one at a time
 *       -> futex::fx_abortlock         only one at a time
 */

/*
 * union futex_key
 *
 *      A futex is addressed either by a vmspace+va (private) or by
 *      a uvm_voaddr (shared).
 */
union futex_key {
        struct {
                struct vmspace                  *vmspace;
                vaddr_t                         va;
        }                       fk_private;
        struct uvm_voaddr       fk_shared;
};

/*
 * struct futex
 *
 *      Kernel state for a futex located at a particular address in a
 *      particular virtual address space.
 *
 *      N.B. fx_refcnt is an unsigned long because we need to be able
 *      to operate on it atomically on all systems, while at the same
 *      time making it practically impossible for the count to reach
 *      its maximum value.  In practice, we're limited by the number of
 *      LWPs that can be present on the system at any given time, and
 *      the assumption is that limit will be good enough on a 32-bit
 *      platform.  See futex_wake() for why overflow needs to be
 *      avoided.
 */
struct futex {
        union futex_key         fx_key;
        unsigned long           fx_refcnt;
        bool                    fx_shared;
        bool                    fx_on_tree;
        struct rb_node          fx_node;

        kmutex_t                        fx_qlock;
        TAILQ_HEAD(, futex_wait)        fx_queue;

        kmutex_t                        fx_abortlock;
        LIST_HEAD(, futex_wait)         fx_abortlist;
        kcondvar_t                      fx_abortcv;
};

/*
 * struct futex_wait
 *
 *      State for a thread to wait on a futex.  Threads wait on fw_cv
 *      for fw_bitset to be set to zero.  The thread may transition to
 *      a different futex queue at any time under the futex's lock.
 */
struct futex_wait {
        kmutex_t                fw_lock;
        kcondvar_t              fw_cv;
        struct futex            *fw_futex;
        TAILQ_ENTRY(futex_wait) fw_entry;       /* queue lock */
        LIST_ENTRY(futex_wait)  fw_abort;       /* queue abortlock */
        int                     fw_bitset;
        bool                    fw_aborting;    /* fw_lock */
};

/*
 * futex_tab
 *
 *      Global trees of futexes by vmspace/va and VM object address.
 *
 *      XXX This obviously doesn't scale in parallel.  We could use a
 *      pserialize-safe data structure, but there may be a high cost to
 *      frequent deletion since we don't cache futexes after we're done
 *      with them.  We could use hashed locks.  But for now, just make
 *      sure userland can't DoS the serial performance, by using a
 *      balanced binary tree for lookup.
 *
 *      XXX We could use a per-process tree for the table indexed by
 *      virtual address to reduce contention between processes.
 */
static struct {
        kmutex_t        lock;
        struct rb_tree  va;
        struct rb_tree  oa;
} futex_tab __cacheline_aligned;

static int
compare_futex_key(void *cookie, const void *n, const void *k)
{
        const struct futex *fa = n;
        const union futex_key *fka = &fa->fx_key;
        const union futex_key *fkb = k;

        if ((uintptr_t)fka->fk_private.vmspace <
            (uintptr_t)fkb->fk_private.vmspace)
                return -1;
        if ((uintptr_t)fka->fk_private.vmspace >
            (uintptr_t)fkb->fk_private.vmspace)
                return +1;
        if (fka->fk_private.va < fkb->fk_private.va)
                return -1;
        if (fka->fk_private.va > fkb->fk_private.va)
                return +1;
        return 0;
}

static int
compare_futex(void *cookie, const void *na, const void *nb)
{
        const struct futex *fa = na;
        const struct futex *fb = nb;

        return compare_futex_key(cookie, fa, &fb->fx_key);
}

static const rb_tree_ops_t futex_rb_ops = {
        .rbto_compare_nodes = compare_futex,
        .rbto_compare_key = compare_futex_key,
        .rbto_node_offset = offsetof(struct futex, fx_node),
};

static int
compare_futex_shared_key(void *cookie, const void *n, const void *k)
{
        const struct futex *fa = n;
        const union futex_key *fka = &fa->fx_key;
        const union futex_key *fkb = k;

        return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared);
}

static int
compare_futex_shared(void *cookie, const void *na, const void *nb)
{
        const struct futex *fa = na;
        const struct futex *fb = nb;

        return compare_futex_shared_key(cookie, fa, &fb->fx_key);
}

static const rb_tree_ops_t futex_shared_rb_ops = {
        .rbto_compare_nodes = compare_futex_shared,
        .rbto_compare_key = compare_futex_shared_key,
        .rbto_node_offset = offsetof(struct futex, fx_node),
};

static void     futex_wait_dequeue(struct futex_wait *, struct futex *);

/*
 * futex_load(uaddr, kaddr)
 *
 *      Perform a single atomic load to read *uaddr, and return the
 *      result in *kaddr.  Return 0 on success, EFAULT if uaddr is not
 *      mapped.
 */
static inline int
futex_load(int *uaddr, int *kaddr)
{
        return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
}

/*
 * futex_test(uaddr, expected)
 *
 *      True if *uaddr == expected.  False if *uaddr != expected, or if
 *      uaddr is not mapped.
 */
static bool
futex_test(int *uaddr, int expected)
{
        int val;
        int error;

        error = futex_load(uaddr, &val);
        if (error)
                return false;
        return val == expected;
}

/*
 * futex_sys_init()
 *
 *      Initialize the futex subsystem.
 */
void
futex_sys_init(void)
{

        mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
        rb_tree_init(&futex_tab.va, &futex_rb_ops);
        rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
}

/*
 * futex_sys_fini()
 *
 *      Finalize the futex subsystem.
 */
void
futex_sys_fini(void)
{

        KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
        KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
        mutex_destroy(&futex_tab.lock);
}

/*
 * futex_queue_init(f)
 *
 *      Initialize the futex queue.  Caller must call futex_queue_fini
 *      when done.
 *
 *      Never sleeps.
 */
static void
futex_queue_init(struct futex *f)
{

        mutex_init(&f->fx_qlock, MUTEX_DEFAULT, IPL_NONE);
        mutex_init(&f->fx_abortlock, MUTEX_DEFAULT, IPL_NONE);
        cv_init(&f->fx_abortcv, "fqabort");
        LIST_INIT(&f->fx_abortlist);
        TAILQ_INIT(&f->fx_queue);
}

/*
 * futex_queue_drain(f)
 *
 *      Wait for any aborting waiters in f; then empty the queue of
 *      any stragglers and wake them.  Caller must guarantee no new
 *      references to f.
 *
 *      May sleep.
 */
static void
futex_queue_drain(struct futex *f)
{
        struct futex_wait *fw, *fw_next;

        mutex_enter(&f->fx_abortlock);
        while (!LIST_EMPTY(&f->fx_abortlist))
                cv_wait(&f->fx_abortcv, &f->fx_abortlock);
        mutex_exit(&f->fx_abortlock);

        mutex_enter(&f->fx_qlock);
        TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
                mutex_enter(&fw->fw_lock);
                futex_wait_dequeue(fw, f);
                cv_broadcast(&fw->fw_cv);
                mutex_exit(&fw->fw_lock);
        }
        mutex_exit(&f->fx_qlock);
}

/*
 * futex_queue_fini(f)
 *
 *      Finalize the futex queue initialized by futex_queue_init.  Queue
 *      must be empty.  Caller must not use f again until a subsequent
 *      futex_queue_init.
 */
static void
futex_queue_fini(struct futex *f)
{

        KASSERT(TAILQ_EMPTY(&f->fx_queue));
        KASSERT(LIST_EMPTY(&f->fx_abortlist));
        mutex_destroy(&f->fx_qlock);
        mutex_destroy(&f->fx_abortlock);
        cv_destroy(&f->fx_abortcv);
}

/*
 * futex_key_init(key, vm, va, shared)
 *
 *      Initialize a futex key for lookup, etc.
 */
static int
futex_key_init(union futex_key *fk, struct vmspace *vm, vaddr_t va, bool shared)
{
        int error = 0;

        if (__predict_false(shared)) {
                if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared))
                        error = EFAULT;
        } else {
                fk->fk_private.vmspace = vm;
                fk->fk_private.va = va;
        }

        return error;
}

/*
 * futex_key_fini(key, shared)
 *
 *      Release a futex key.
 */
static void
futex_key_fini(union futex_key *fk, bool shared)
{
        if (__predict_false(shared))
                uvm_voaddr_release(&fk->fk_shared);
        memset(fk, 0, sizeof(*fk));
}

/*
 * futex_create(fk, shared)
 *
 *      Create a futex.  Initial reference count is 1, representing the
 *      caller.  Returns NULL on failure.  Always takes ownership of the
 *      key, either transferring it to the newly-created futex, or releasing
 *      the key if creation fails.
 *
 *      Never sleeps for memory, but may sleep to acquire a lock.
 */
static struct futex *
futex_create(union futex_key *fk, bool shared)
{
        struct futex *f;

        f = kmem_alloc(sizeof(*f), KM_NOSLEEP);
        if (f == NULL) {
                futex_key_fini(fk, shared);
                return NULL;
        }
        f->fx_key = *fk;
        f->fx_refcnt = 1;
        f->fx_shared = shared;
        f->fx_on_tree = false;
        futex_queue_init(f);

        return f;
}

/*
 * futex_destroy(f)
 *
 *      Destroy a futex created with futex_create.  Reference count
 *      must be zero.
 *
 *      May sleep.
 */
static void
futex_destroy(struct futex *f)
{

        ASSERT_SLEEPABLE();

        KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
        KASSERT(!f->fx_on_tree);

        /* Drain and destroy the private queue.  */
        futex_queue_drain(f);
        futex_queue_fini(f);

        futex_key_fini(&f->fx_key, f->fx_shared);

        kmem_free(f, sizeof(*f));
}

/*
 * futex_hold(f)
 *
 *      Attempt to acquire a reference to f.  Return 0 on success,
 *      ENFILE on too many references.
 *
 *      Never sleeps.
 */
static int
futex_hold(struct futex *f)
{
        unsigned long refcnt;

        do {
                refcnt = atomic_load_relaxed(&f->fx_refcnt);
                if (refcnt == ULONG_MAX)
                        return ENFILE;
        } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt + 1) != refcnt);

        return 0;
}

/*
 * futex_rele(f)
 *
 *      Release a reference to f acquired with futex_create or
 *      futex_hold.
 *
 *      May sleep to free f.
 */
static void
futex_rele(struct futex *f)
{
        unsigned long refcnt;

        ASSERT_SLEEPABLE();

        do {
                refcnt = atomic_load_relaxed(&f->fx_refcnt);
                if (refcnt == 1)
                        goto trylast;
#ifndef __HAVE_ATOMIC_AS_MEMBAR
                membar_release();
#endif
        } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt);
        return;

trylast:
        mutex_enter(&futex_tab.lock);
        if (atomic_dec_ulong_nv(&f->fx_refcnt) == 0) {
#ifndef __HAVE_ATOMIC_AS_MEMBAR
                membar_acquire();
#endif
                if (f->fx_on_tree) {
                        if (__predict_false(f->fx_shared))
                                rb_tree_remove_node(&futex_tab.oa, f);
                        else
                                rb_tree_remove_node(&futex_tab.va, f);
                        f->fx_on_tree = false;
                }
        } else {
                /* References remain -- don't destroy it.  */
                f = NULL;
        }
        mutex_exit(&futex_tab.lock);
        if (f != NULL)
                futex_destroy(f);
}

/*
 * futex_rele_not_last(f)
 *
 *      Release a reference to f acquired with futex_create or
 *      futex_hold.
 *
 *      This version asserts that we are not dropping the last
 *      reference to f.
 */
static void
futex_rele_not_last(struct futex *f)
{
        unsigned long refcnt;

        do {
                refcnt = atomic_load_relaxed(&f->fx_refcnt);
                KASSERT(refcnt > 1);
        } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt);
}

/*
 * futex_lookup_by_key(key, shared, &f)
 *
 *      Try to find an existing futex va reference in the specified key.
 *      On success, return 0, set f to the found futex or to NULL if not
 *      found, and increment f's reference count if found.
 *
 *      Return ENFILE if reference count too high.
 *
 *      Internal lookup routine shared by futex_lookup() and
 *      futex_lookup_create().
 */
static int
futex_lookup_by_key(union futex_key *fk, bool shared, struct futex **fp)
{
        struct futex *f;
        int error = 0;

        mutex_enter(&futex_tab.lock);
        if (__predict_false(shared)) {
                f = rb_tree_find_node(&futex_tab.oa, fk);
        } else {
                f = rb_tree_find_node(&futex_tab.va, fk);
        }
        if (f) {
                error = futex_hold(f);
                if (error)
                        f = NULL;
        }
        *fp = f;
        mutex_exit(&futex_tab.lock);

        return error;
}

/*
 * futex_insert(f, fp)
 *
 *      Try to insert the futex f into the tree by va.  If there
 *      already is a futex for its va, acquire a reference to it, and
 *      store it in *fp; otherwise store f in *fp.
 *
 *      Return 0 on success, ENFILE if there already is a futex but its
 *      reference count is too high.
 */
static int
futex_insert(struct futex *f, struct futex **fp)
{
        struct futex *f0;
        int error;

        KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0);
        KASSERT(!f->fx_on_tree);

        mutex_enter(&futex_tab.lock);
        if (__predict_false(f->fx_shared))
                f0 = rb_tree_insert_node(&futex_tab.oa, f);
        else
                f0 = rb_tree_insert_node(&futex_tab.va, f);
        if (f0 == f) {
                f->fx_on_tree = true;
                error = 0;
        } else {
                KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0);
                KASSERT(f0->fx_on_tree);
                error = futex_hold(f0);
                if (error)
                        goto out;
        }
        *fp = f0;
out:    mutex_exit(&futex_tab.lock);

        return error;
}

/*
 * futex_lookup(uaddr, shared, &f)
 *
 *      Find a futex at the userland pointer uaddr in the current
 *      process's VM space.  On success, return the futex in f and
 *      increment its reference count.
 *
 *      Caller must call futex_rele when done.
 */
static int
futex_lookup(int *uaddr, bool shared, struct futex **fp)
{
        union futex_key fk;
        struct vmspace *vm = curproc->p_vmspace;
        vaddr_t va = (vaddr_t)uaddr;
        int error;

        /*
         * Reject unaligned user pointers so we don't cross page
         * boundaries and so atomics will work.
         */
        if ((va & 3) != 0)
                return EINVAL;

        /* Look it up. */
        error = futex_key_init(&fk, vm, va, shared);
        if (error)
                return error;

        error = futex_lookup_by_key(&fk, shared, fp);
        futex_key_fini(&fk, shared);
        if (error)
                return error;

        KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
        KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);

        /*
         * Success!  (Caller must still check whether we found
         * anything, but nothing went _wrong_ like trying to use
         * unmapped memory.)
         */
        KASSERT(error == 0);

        return error;
}

/*
 * futex_lookup_create(uaddr, shared, &f)
 *
 *      Find or create a futex at the userland pointer uaddr in the
 *      current process's VM space.  On success, return the futex in f
 *      and increment its reference count.
 *
 *      Caller must call futex_rele when done.
 */
static int
futex_lookup_create(int *uaddr, bool shared, struct futex **fp)
{
        union futex_key fk;
        struct vmspace *vm = curproc->p_vmspace;
        struct futex *f = NULL;
        vaddr_t va = (vaddr_t)uaddr;
        int error;

        /*
         * Reject unaligned user pointers so we don't cross page
         * boundaries and so atomics will work.
         */
        if ((va & 3) != 0)
                return EINVAL;

        error = futex_key_init(&fk, vm, va, shared);
        if (error)
                return error;

        /*
         * Optimistically assume there already is one, and try to find
         * it.
         */
        error = futex_lookup_by_key(&fk, shared, fp);
        if (error || *fp != NULL) {
                /*
                 * We either found one, or there was an error.
                 * In either case, we are done with the key.
                 */
                futex_key_fini(&fk, shared);
                goto out;
        }

        /*
         * Create a futex record.  This transfers ownership of the key
         * in all cases.
         */
        f = futex_create(&fk, shared);
        if (f == NULL) {
                error = ENOMEM;
                goto out;
        }

        /*
         * Insert our new futex, or use existing if someone else beat
         * us to it.
         */
        error = futex_insert(f, fp);
        if (error)
                goto out;
        if (*fp == f)
                f = NULL;       /* don't release on exit */

        /* Success!  */
        KASSERT(error == 0);

out:    if (f != NULL)
                futex_rele(f);
        KASSERT(error || *fp != NULL);
        KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
        return error;
}

/*
 * futex_wait_init(fw, bitset)
 *
 *      Initialize a record for a thread to wait on a futex matching
 *      the specified bit set.  Should be passed to futex_wait_enqueue
 *      before futex_wait, and should be passed to futex_wait_fini when
 *      done.
 */
static void
futex_wait_init(struct futex_wait *fw, int bitset)
{

        KASSERT(bitset);

        mutex_init(&fw->fw_lock, MUTEX_DEFAULT, IPL_NONE);
        cv_init(&fw->fw_cv, "futex");
        fw->fw_futex = NULL;
        fw->fw_bitset = bitset;
        fw->fw_aborting = false;
}

/*
 * futex_wait_fini(fw)
 *
 *      Finalize a record for a futex waiter.  Must not be on any
 *      futex's queue.
 */
static void
futex_wait_fini(struct futex_wait *fw)
{

        KASSERT(fw->fw_futex == NULL);

        cv_destroy(&fw->fw_cv);
        mutex_destroy(&fw->fw_lock);
}

/*
 * futex_wait_enqueue(fw, f)
 *
 *      Put fw on the futex queue.  Must be done before futex_wait.
 *      Caller must hold fw's lock and f's lock, and fw must not be on
 *      any existing futex's waiter list.
 */
static void
futex_wait_enqueue(struct futex_wait *fw, struct futex *f)
{

        KASSERT(mutex_owned(&f->fx_qlock));
        KASSERT(mutex_owned(&fw->fw_lock));
        KASSERT(fw->fw_futex == NULL);
        KASSERT(!fw->fw_aborting);

        fw->fw_futex = f;
        TAILQ_INSERT_TAIL(&f->fx_queue, fw, fw_entry);
}

/*
 * futex_wait_dequeue(fw, f)
 *
 *      Remove fw from the futex queue.  Precludes subsequent
 *      futex_wait until a futex_wait_enqueue.  Caller must hold fw's
 *      lock and f's lock, and fw must be on f.
 */
static void
futex_wait_dequeue(struct futex_wait *fw, struct futex *f)
{

        KASSERT(mutex_owned(&f->fx_qlock));
        KASSERT(mutex_owned(&fw->fw_lock));
        KASSERT(fw->fw_futex == f);

        TAILQ_REMOVE(&f->fx_queue, fw, fw_entry);
        fw->fw_futex = NULL;
}

/*
 * futex_wait_abort(fw)
 *
 *      Caller is no longer waiting for fw.  Remove it from any queue
 *      if it was on one.  Caller must hold fw->fw_lock.
 */
static void
futex_wait_abort(struct futex_wait *fw)
{
        struct futex *f;

        KASSERT(mutex_owned(&fw->fw_lock));

        /*
         * Grab the futex queue.  It can't go away as long as we hold
         * fw_lock.  However, we can't take the queue lock because
         * that's a lock order reversal.
         */
        f = fw->fw_futex;

        /* Put us on the abort list so that f won't go away.  */
        mutex_enter(&f->fx_abortlock);
        LIST_INSERT_HEAD(&f->fx_abortlist, fw, fw_abort);
        mutex_exit(&f->fx_abortlock);

        /*
         * Mark fw as aborting so it won't lose wakeups and won't be
         * transferred to any other queue.
         */
        fw->fw_aborting = true;

        /* f is now stable, so we can release fw_lock.  */
        mutex_exit(&fw->fw_lock);

        /* Now we can remove fw under the queue lock.  */
        mutex_enter(&f->fx_qlock);
        mutex_enter(&fw->fw_lock);
        futex_wait_dequeue(fw, f);
        mutex_exit(&fw->fw_lock);
        mutex_exit(&f->fx_qlock);

        /*
         * Finally, remove us from the abort list and notify anyone
         * waiting for the abort to complete if we were the last to go.
         */
        mutex_enter(&f->fx_abortlock);
        LIST_REMOVE(fw, fw_abort);
        if (LIST_EMPTY(&f->fx_abortlist))
                cv_broadcast(&f->fx_abortcv);
        mutex_exit(&f->fx_abortlock);

        /*
         * Release our reference to the futex now that we are not
         * waiting for it.
         */
        futex_rele(f);

        /*
         * Reacquire the fw lock as caller expects.  Verify that we're
         * aborting and no longer associated with a futex.
         */
        mutex_enter(&fw->fw_lock);
        KASSERT(fw->fw_aborting);
        KASSERT(fw->fw_futex == NULL);
}

/*
 * futex_wait(fw, deadline, clkid)
 *
 *      fw must be a waiter on a futex's queue.  Wait until deadline on
 *      the clock clkid, or forever if deadline is NULL, for a futex
 *      wakeup.  Return 0 on explicit wakeup or destruction of futex,
 *      ETIMEDOUT on timeout, EINTR/ERESTART on signal.  Either way, fw
 *      will no longer be on a futex queue on return.
 */
static int
futex_wait(struct futex_wait *fw, const struct timespec *deadline,
    clockid_t clkid)
{
        int error = 0;

        /* Test and wait under the wait lock.  */
        mutex_enter(&fw->fw_lock);

        for (;;) {
                /* If we're already done, stop and report success.  */
                if (fw->fw_bitset == 0 || fw->fw_futex == NULL) {
                        error = 0;
                        break;
                }

                /* If anything went wrong in the last iteration, stop.  */
                if (error)
                        break;

                /* Not done yet.  Wait.  */
                if (deadline) {
                        struct timespec ts;

                        /* Check our watch.  */
                        error = clock_gettime1(clkid, &ts);
                        if (error)
                                break;

                        /* If we're past the deadline, ETIMEDOUT.  */
                        if (timespeccmp(deadline, &ts, <=)) {
                                error = ETIMEDOUT;
                                break;
                        }

                        /* Count how much time is left.  */
                        timespecsub(deadline, &ts, &ts);

                        /* Wait for that much time, allowing signals.  */
                        error = cv_timedwait_sig(&fw->fw_cv, &fw->fw_lock,
                            tstohz(&ts));
                } else {
                        /* Wait indefinitely, allowing signals. */
                        error = cv_wait_sig(&fw->fw_cv, &fw->fw_lock);
                }
        }

        /*
         * If we were woken up, the waker will have removed fw from the
         * queue.  But if anything went wrong, we must remove fw from
         * the queue ourselves.  While here, convert EWOULDBLOCK to
         * ETIMEDOUT.
         */
        if (error) {
                futex_wait_abort(fw);
                if (error == EWOULDBLOCK)
                        error = ETIMEDOUT;
        }

        mutex_exit(&fw->fw_lock);

        return error;
}

/*
 * futex_wake(f, nwake, f2, nrequeue, bitset)
 *
 *      Wake up to nwake waiters on f matching bitset; then, if f2 is
 *      provided, move up to nrequeue remaining waiters on f matching
 *      bitset to f2.  Return the number of waiters actually woken.
 *      Caller must hold the locks of f and f2, if provided.
 */
static unsigned
futex_wake(struct futex *f, unsigned nwake, struct futex *f2,
    unsigned nrequeue, int bitset)
{
        struct futex_wait *fw, *fw_next;
        unsigned nwoken = 0;
        int hold_error __diagused;

        KASSERT(mutex_owned(&f->fx_qlock));
        KASSERT(f2 == NULL || mutex_owned(&f2->fx_qlock));

        /* Wake up to nwake waiters, and count the number woken.  */
        TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
                if ((fw->fw_bitset & bitset) == 0)
                        continue;
                if (nwake > 0) {
                        mutex_enter(&fw->fw_lock);
                        if (__predict_false(fw->fw_aborting)) {
                                mutex_exit(&fw->fw_lock);
                                continue;
                        }
                        futex_wait_dequeue(fw, f);
                        fw->fw_bitset = 0;
                        cv_broadcast(&fw->fw_cv);
                        mutex_exit(&fw->fw_lock);
                        nwake--;
                        nwoken++;
                        /*
                         * Drop the futex reference on behalf of the
                         * waiter.  We assert this is not the last
                         * reference on the futex (our caller should
                         * also have one).
                         */
                        futex_rele_not_last(f);
                } else {
                        break;
                }
        }

        if (f2) {
                /* Move up to nrequeue waiters from f's queue to f2's queue. */
                TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
                        if ((fw->fw_bitset & bitset) == 0)
                                continue;
                        if (nrequeue > 0) {
                                mutex_enter(&fw->fw_lock);
                                if (__predict_false(fw->fw_aborting)) {
                                        mutex_exit(&fw->fw_lock);
                                        continue;
                                }
                                futex_wait_dequeue(fw, f);
                                futex_wait_enqueue(fw, f2);
                                mutex_exit(&fw->fw_lock);
                                nrequeue--;
                                /*
                                 * Transfer the reference from f to f2.
                                 * As above, we assert that we are not
                                 * dropping the last reference to f here.
                                 *
                                 * XXX futex_hold() could theoretically
                                 * XXX fail here.
                                 */
                                futex_rele_not_last(f);
                                hold_error = futex_hold(f2);
                                KASSERT(hold_error == 0);
                        } else {
                                break;
                        }
                }
        } else {
                KASSERT(nrequeue == 0);
        }

        /* Return the number of waiters woken.  */
        return nwoken;
}

/*
 * futex_queue_lock(f)
 *
 *      Acquire the queue lock of f.  Pair with futex_queue_unlock.  Do
 *      not use if caller needs to acquire two locks; use
 *      futex_queue_lock2 instead.
 */
static void
futex_queue_lock(struct futex *f)
{
        mutex_enter(&f->fx_qlock);
}

/*
 * futex_queue_unlock(f)
 *
 *      Release the queue lock of f.
 */
static void
futex_queue_unlock(struct futex *f)
{
        mutex_exit(&f->fx_qlock);
}

/*
 * futex_queue_lock2(f, f2)
 *
 *      Acquire the queue locks of both f and f2, which may be null, or
 *      which may have the same underlying queue.  If they are
 *      distinct, an arbitrary total order is chosen on the locks.
 *
 *      Callers should only ever acquire multiple queue locks
 *      simultaneously using futex_queue_lock2.
 */
static void
futex_queue_lock2(struct futex *f, struct futex *f2)
{

        /*
         * If both are null, do nothing; if one is null and the other
         * is not, lock the other and be done with it.
         */
        if (f == NULL && f2 == NULL) {
                return;
        } else if (f == NULL) {
                mutex_enter(&f2->fx_qlock);
                return;
        } else if (f2 == NULL) {
                mutex_enter(&f->fx_qlock);
                return;
        }

        /* If both futexes are the same, acquire only one. */
        if (f == f2) {
                mutex_enter(&f->fx_qlock);
                return;
        }

        /* Otherwise, use the ordering on the kva of the futex pointer.  */
        if ((uintptr_t)f < (uintptr_t)f2) {
                mutex_enter(&f->fx_qlock);
                mutex_enter(&f2->fx_qlock);
        } else {
                mutex_enter(&f2->fx_qlock);
                mutex_enter(&f->fx_qlock);
        }
}

/*
 * futex_queue_unlock2(f, f2)
 *
 *      Release the queue locks of both f and f2, which may be null, or
 *      which may have the same underlying queue.
 */
static void
futex_queue_unlock2(struct futex *f, struct futex *f2)
{

        /*
         * If both are null, do nothing; if one is null and the other
         * is not, unlock the other and be done with it.
         */
        if (f == NULL && f2 == NULL) {
                return;
        } else if (f == NULL) {
                mutex_exit(&f2->fx_qlock);
                return;
        } else if (f2 == NULL) {
                mutex_exit(&f->fx_qlock);
                return;
        }

        /* If both futexes are the same, release only one. */
        if (f == f2) {
                mutex_exit(&f->fx_qlock);
                return;
        }

        /* Otherwise, use the ordering on the kva of the futex pointer.  */
        if ((uintptr_t)f < (uintptr_t)f2) {
                mutex_exit(&f2->fx_qlock);
                mutex_exit(&f->fx_qlock);
        } else {
                mutex_exit(&f->fx_qlock);
                mutex_exit(&f2->fx_qlock);
        }
}

/*
 * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval)
 *
 *      Implement futex(FUTEX_WAIT).
 */
static int
futex_func_wait(bool shared, int *uaddr, int val, int val3,
    const struct timespec *timeout, clockid_t clkid, int clkflags,
    register_t *retval)
{
        struct futex *f;
        struct futex_wait wait, *fw = &wait;
        struct timespec ts;
        const struct timespec *deadline;
        int error;

        /*
         * If there's nothing to wait for, and nobody will ever wake
         * us, then don't set anything up to wait -- just stop here.
         */
        if (val3 == 0)
                return EINVAL;

        /* Optimistically test before anything else.  */
        if (!futex_test(uaddr, val))
                return EAGAIN;

        /* Determine a deadline on the specified clock.  */
        if (timeout == NULL || (clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
                deadline = timeout;
        } else {
                error = clock_gettime1(clkid, &ts);
                if (error)
                        return error;
                timespecadd(&ts, timeout, &ts);
                deadline = &ts;
        }

        /* Get the futex, creating it if necessary.  */
        error = futex_lookup_create(uaddr, shared, &f);
        if (error)
                return error;
        KASSERT(f);

        /* Get ready to wait.  */
        futex_wait_init(fw, val3);

        /*
         * Under the queue lock, check the value again: if it has
         * already changed, EAGAIN; otherwise enqueue the waiter.
         * Since FUTEX_WAKE will use the same lock and be done after
         * modifying the value, the order in which we check and enqueue
         * is immaterial.
         */
        futex_queue_lock(f);
        if (!futex_test(uaddr, val)) {
                futex_queue_unlock(f);
                error = EAGAIN;
                goto out;
        }
        mutex_enter(&fw->fw_lock);
        futex_wait_enqueue(fw, f);
        mutex_exit(&fw->fw_lock);
        futex_queue_unlock(f);

        /*
         * We cannot drop our reference to the futex here, because
         * we might be enqueued on a different one when we are awakened.
         * The references will be managed on our behalf in the requeue
         * and wake cases.
         */
        f = NULL;

        /* Wait. */
        error = futex_wait(fw, deadline, clkid);
        if (error)
                goto out;

        /* Return 0 on success, error on failure. */
        *retval = 0;

out:    if (f != NULL)
                futex_rele(f);
        futex_wait_fini(fw);
        return error;
}

/*
 * futex_func_wake(uaddr, val, val3, retval)
 *
 *      Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET).
 */
static int
futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval)
{
        struct futex *f;
        unsigned int nwoken = 0;
        int error = 0;

        /* Reject negative number of wakeups.  */
        if (val < 0) {
                error = EINVAL;
                goto out;
        }

        /* Look up the futex, if any.  */
        error = futex_lookup(uaddr, shared, &f);
        if (error)
                goto out;

        /* If there's no futex, there are no waiters to wake.  */
        if (f == NULL)
                goto out;

        /*
         * Under f's queue lock, wake the waiters and remember the
         * number woken.
         */
        futex_queue_lock(f);
        nwoken = futex_wake(f, val, NULL, 0, val3);
        futex_queue_unlock(f);

        /* Release the futex.  */
        futex_rele(f);

out:
        /* Return the number of waiters woken.  */
        *retval = nwoken;

        /* Success!  */
        return error;
}

/*
 * futex_func_requeue(op, uaddr, val, uaddr2, val2, val3, retval)
 *
 *      Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE).
 */
static int
futex_func_requeue(bool shared, int op, int *uaddr, int val, int *uaddr2,
    int val2, int val3, register_t *retval)
{
        struct futex *f = NULL, *f2 = NULL;
        unsigned nwoken = 0;    /* default to zero woken on early return */
        int error;

        /* Reject negative number of wakeups or requeues. */
        if (val < 0 || val2 < 0) {
                error = EINVAL;
                goto out;
        }

        /* Look up the source futex, if any. */
        error = futex_lookup(uaddr, shared, &f);
        if (error)
                goto out;

        /* If there is none, nothing to do. */
        if (f == NULL)
                goto out;

        /*
         * We may need to create the destination futex because it's
         * entirely possible it does not currently have any waiters.
         */
        error = futex_lookup_create(uaddr2, shared, &f2);
        if (error)
                goto out;

        /*
         * Under the futexes' queue locks, check the value; if
         * unchanged from val3, wake the waiters.
         */
        futex_queue_lock2(f, f2);
        if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) {
                error = EAGAIN;
        } else {
                error = 0;
                nwoken = futex_wake(f, val, f2, val2, FUTEX_BITSET_MATCH_ANY);
        }
        futex_queue_unlock2(f, f2);

out:
        /* Return the number of waiters woken.  */
        *retval = nwoken;

        /* Release the futexes if we got them.  */
        if (f2)
                futex_rele(f2);
        if (f)
                futex_rele(f);
        return error;
}

/*
 * futex_validate_op_cmp(val3)
 *
 *      Validate an op/cmp argument for FUTEX_WAKE_OP.
 */
static int
futex_validate_op_cmp(int val3)
{
        int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
        int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);

        if (op & FUTEX_OP_OPARG_SHIFT) {
                int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
                if (oparg < 0)
                        return EINVAL;
                if (oparg >= 32)
                        return EINVAL;
                op &= ~FUTEX_OP_OPARG_SHIFT;
        }

        switch (op) {
        case FUTEX_OP_SET:
        case FUTEX_OP_ADD:
        case FUTEX_OP_OR:
        case FUTEX_OP_ANDN:
        case FUTEX_OP_XOR:
                break;
        default:
                return EINVAL;
        }

        switch (cmp) {
        case FUTEX_OP_CMP_EQ:
        case FUTEX_OP_CMP_NE:
        case FUTEX_OP_CMP_LT:
        case FUTEX_OP_CMP_LE:
        case FUTEX_OP_CMP_GT:
        case FUTEX_OP_CMP_GE:
                break;
        default:
                return EINVAL;
        }

        return 0;
}
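
/*
 * For illustration: userland packs val3 for FUTEX_WAKE_OP with the
 * Linux-style FUTEX_OP() encoding, which the FUTEX_OP_*_MASK
 * definitions used above are assumed to mirror:
 *
 *      #define FUTEX_OP(op, oparg, cmp, cmparg)            \
 *              (((op) & 0xf) << 28 | ((cmp) & 0xf) << 24 | \
 *               ((oparg) & 0xfff) << 12 | ((cmparg) & 0xfff))
 *
 *      // E.g., set *uaddr2 = 1, and wake waiters on uaddr2 only
 *      // if its old value was 0:
 *      int val3 = FUTEX_OP(FUTEX_OP_SET, 1, FUTEX_OP_CMP_EQ, 0);
 */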

/*
 * futex_compute_op(oldval, val3)
 *
 *      Apply a FUTEX_WAKE_OP operation to oldval.
 */
static int
futex_compute_op(int oldval, int val3)
{
        int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
        int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);

        if (op & FUTEX_OP_OPARG_SHIFT) {
                KASSERT(oparg >= 0);
                KASSERT(oparg < 32);
                oparg = 1u << oparg;
                op &= ~FUTEX_OP_OPARG_SHIFT;
        }

        switch (op) {
        case FUTEX_OP_SET:
                return oparg;

        case FUTEX_OP_ADD:
                /*
                 * Avoid signed arithmetic overflow by doing
                 * arithmetic unsigned and converting back to signed
                 * at the end.
                 */
                return (int)((unsigned)oldval + (unsigned)oparg);

        case FUTEX_OP_OR:
                return oldval | oparg;

        case FUTEX_OP_ANDN:
                return oldval & ~oparg;

        case FUTEX_OP_XOR:
                return oldval ^ oparg;

        default:
                panic("invalid futex op");
        }
}

/*
 * futex_compute_cmp(oldval, val3)
 *
 *      Apply a FUTEX_WAKE_OP comparison to oldval.
 */
static bool
futex_compute_cmp(int oldval, int val3)
{
        int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
        int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG_MASK);

        switch (cmp) {
        case FUTEX_OP_CMP_EQ:
                return (oldval == cmparg);

        case FUTEX_OP_CMP_NE:
                return (oldval != cmparg);

        case FUTEX_OP_CMP_LT:
                return (oldval < cmparg);

        case FUTEX_OP_CMP_LE:
                return (oldval <= cmparg);

        case FUTEX_OP_CMP_GT:
                return (oldval > cmparg);

        case FUTEX_OP_CMP_GE:
                return (oldval >= cmparg);

        default:
                panic("invalid futex cmp operation");
        }
}

/*
 * futex_func_wake_op(uaddr, val, uaddr2, val2, val3, retval)
 *
 *      Implement futex(FUTEX_WAKE_OP).
 */
static int
futex_func_wake_op(bool shared, int *uaddr, int val, int *uaddr2, int val2,
    int val3, register_t *retval)
{
        struct futex *f = NULL, *f2 = NULL;
        int oldval, newval, actual;
        unsigned nwoken = 0;
        int error;

        /* Reject negative numbers of wakeups.  */
        if (val < 0 || val2 < 0) {
                error = EINVAL;
                goto out;
        }

        /* Reject invalid operations before we start doing things.  */
        if ((error = futex_validate_op_cmp(val3)) != 0)
                goto out;

        /* Look up the first futex, if any.  */
        error = futex_lookup(uaddr, shared, &f);
        if (error)
                goto out;

        /* Look up the second futex, if any.  */
        error = futex_lookup(uaddr2, shared, &f2);
        if (error)
                goto out;

        /*
         * Under the queue locks:
         *
         * 1. Read/modify/write: *uaddr2 op= oparg.
         * 2. Unconditionally wake uaddr.
         * 3. Conditionally wake uaddr2, if the old value of *uaddr2
         *    satisfies the cmp/cmparg comparison encoded in val3.
         */
 1551         futex_queue_lock2(f, f2);
 1552         do {
 1553                 error = futex_load(uaddr2, &oldval);
 1554                 if (error)
 1555                         goto out_unlock;
 1556                 newval = futex_compute_op(oldval, val3);
 1557                 error = ucas_int(uaddr2, oldval, newval, &actual);
 1558                 if (error)
 1559                         goto out_unlock;
 1560         } while (actual != oldval);
 1561         nwoken = (f ? futex_wake(f, val, NULL, 0, FUTEX_BITSET_MATCH_ANY) : 0);
 1562         if (f2 && futex_compute_cmp(oldval, val3))
 1563                 nwoken += futex_wake(f2, val2, NULL, 0,
 1564                     FUTEX_BITSET_MATCH_ANY);
 1565 
 1566         /* Success! */
 1567         error = 0;
 1568 out_unlock:
 1569         futex_queue_unlock2(f, f2);
 1570 
 1571 out:
 1572         /* Return the number of waiters woken. */
 1573         *retval = nwoken;
 1574 
 1575         /* Release the futexes, if we got them. */
 1576         if (f2)
 1577                 futex_rele(f2);
 1578         if (f)
 1579                 futex_rele(f);
 1580         return error;
 1581 }
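      
      /*
       * A typical consumer of this operation is a userland condition
       * variable that must wake a waiter on one word and conditionally
       * wake on a second word without paying for two system calls.  A
       * hedged sketch against the __futex(2) argument order below (cv,
       * mtx, and val3 are hypothetical userland state; val3 as encoded
       * in the example above):
       *
       *      // Wake one waiter on &cv->seq; set mtx->lock to 1 and,
       *      // if its old value was 0, wake one waiter on it too.
       *      __futex(&cv->seq, FUTEX_WAKE_OP | FUTEX_PRIVATE_FLAG, 1,
       *          NULL, &mtx->lock, 1, val3);
       */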
 1582 
 1583 /*
 1584  * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3)
 1585  *
 1586  *      Implement the futex system call with all the parameters
 1587  *      parsed out.
 1588  */
 1589 int
 1590 do_futex(int *uaddr, int op, int val, const struct timespec *timeout,
 1591     int *uaddr2, int val2, int val3, register_t *retval)
 1592 {
 1593         const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
 1594         const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
 1595                                                             : CLOCK_MONOTONIC;
 1596 
 1597         op &= FUTEX_CMD_MASK;
 1598 
 1599         switch (op) {
 1600         case FUTEX_WAIT:
 1601                 return futex_func_wait(shared, uaddr, val,
 1602                     FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
 1603                     retval);
 1604 
 1605         case FUTEX_WAKE:
 1606                 val3 = FUTEX_BITSET_MATCH_ANY;
 1607                 /* FALLTHROUGH */
 1608         case FUTEX_WAKE_BITSET:
 1609                 return futex_func_wake(shared, uaddr, val, val3, retval);
 1610 
 1611         case FUTEX_REQUEUE:
 1612         case FUTEX_CMP_REQUEUE:
 1613                 return futex_func_requeue(shared, op, uaddr, val, uaddr2,
 1614                     val2, val3, retval);
 1615 
 1616         case FUTEX_WAIT_BITSET:
 1617                 return futex_func_wait(shared, uaddr, val, val3, timeout,
 1618                     clkid, TIMER_ABSTIME, retval);
 1619 
 1620         case FUTEX_WAKE_OP:
 1621                 return futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
 1622                     val3, retval);
 1623 
 1624         case FUTEX_FD:
 1625         default:
 1626                 return ENOSYS;
 1627         }
 1628 }
 1629 
 1630 /*
 1631  * sys___futex(l, uap, retval)
 1632  *
 1633  *      __futex(2) system call: generic futex operations.
 1634  */
 1635 int
 1636 sys___futex(struct lwp *l, const struct sys___futex_args *uap,
 1637     register_t *retval)
 1638 {
 1639         /* {
 1640                 syscallarg(int *) uaddr;
 1641                 syscallarg(int) op;
 1642                 syscallarg(int) val;
 1643                 syscallarg(const struct timespec *) timeout;
 1644                 syscallarg(int *) uaddr2;
 1645                 syscallarg(int) val2;
 1646                 syscallarg(int) val3;
 1647         } */
 1648         struct timespec ts, *tsp;
 1649         int error;
 1650 
 1651         /*
 1652          * Copy in the timeout argument, if specified.
 1653          */
 1654         if (SCARG(uap, timeout)) {
 1655                 error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
 1656                 if (error)
 1657                         return error;
 1658                 tsp = &ts;
 1659         } else {
 1660                 tsp = NULL;
 1661         }
 1662 
 1663         return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val),
 1664             tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3),
 1665             retval);
 1666 }
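      
      /*
       * A minimal wait/wake pairing against this entry point, using
       * the argument order of the syscallarg block above (a sketch;
       * error handling omitted):
       *
       *      volatile int word = 0;
       *
       *      // Waiter: sleep only while word is still 0.
       *      __futex((int *)&word, FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
       *          0, NULL, NULL, 0, 0);
       *
       *      // Waker: publish the change, then wake one waiter.
       *      word = 1;
       *      __futex((int *)&word, FUTEX_WAKE | FUTEX_PRIVATE_FLAG,
       *          1, NULL, NULL, 0, 0);
       */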
 1667 
 1668 /*
 1669  * sys___futex_set_robust_list(l, uap, retval)
 1670  *
 1671  *      __futex_set_robust_list(2) system call for robust futexes.
 1672  */
 1673 int
 1674 sys___futex_set_robust_list(struct lwp *l,
 1675     const struct sys___futex_set_robust_list_args *uap, register_t *retval)
 1676 {
 1677         /* {
 1678                 syscallarg(void *) head;
 1679                 syscallarg(size_t) len;
 1680         } */
 1681         void *head = SCARG(uap, head);
 1682 
 1683         if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
 1684                 return EINVAL;
 1685         if ((uintptr_t)head % sizeof(u_long))
 1686                 return EINVAL;
 1687 
 1688         l->l_robust_head = (uintptr_t)head;
 1689 
 1690         return 0;
 1691 }
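      
      /*
       * The head registered here is three pointer-sized words (cf. the
       * _FUTEX_ROBUST_HEAD_{LIST,OFFSET,PENDING} indices used below).
       * A hedged C rendering of what userland lays out (the struct
       * name is illustrative only):
       *
       *      struct robust_head {
       *              uintptr_t list;         // ring of held futex entries
       *              long      offset;       // entry -> futex word offset
       *              uintptr_t pending;      // entry being (un)locked now
       *      };
       *
       * The alignment check above matches the word-sized fetches done
       * by futex_fetch_robust_head() below.
       */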
 1692 
 1693 /*
 1694  * sys___futex_get_robust_list(l, uap, retval)
 1695  *
 1696  *      __futex_get_robust_list(2) system call for robust futexes.
 1697  */
 1698 int
 1699 sys___futex_get_robust_list(struct lwp *l,
 1700     const struct sys___futex_get_robust_list_args *uap, register_t *retval)
 1701 {
 1702         /* {
 1703                 syscallarg(lwpid_t) lwpid;
 1704                 syscallarg(void **) headp;
 1705                 syscallarg(size_t *) lenp;
 1706         } */
 1707         void *head;
 1708         const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
 1709         int error;
 1710 
 1711         error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
 1712         if (error)
 1713                 return error;
 1714 
 1715         /* Copy out the head pointer and the head structure length. */
 1716         error = copyout(&head, SCARG(uap, headp), sizeof(head));
 1717         if (__predict_true(error == 0)) {
 1718                 error = copyout(&len, SCARG(uap, lenp), sizeof(len));
 1719         }
 1720 
 1721         return error;
 1722 }
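      
      /*
       * E.g. a debugger or post-mortem tool could fetch another LWP's
       * registration with (sketch):
       *
       *      void *head;
       *      size_t len;
       *      __futex_get_robust_list(lwpid, &head, &len);
       */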
 1723 
 1724 /*
 1725  * release_futex(uptr, tid, is_pi, is_pending)
 1726  *
 1727  *      Try to release the robust futex at uptr in the current process
 1728  *      on lwp exit.  If anything goes wrong, silently fail.  It is the
 1729  *      userland program's obligation to arrange correct behaviour.
 1730  */
 1731 static void
 1732 release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
 1733     bool const is_pending)
 1734 {
 1735         int *uaddr;
 1736         struct futex *f;
 1737         int oldval, newval, actual;
 1738         int error;
 1739 
 1740         /* If it's misaligned, tough.  */
 1741         if (__predict_false(uptr & 3))
 1742                 return;
 1743         uaddr = (int *)uptr;
 1744 
 1745         error = futex_load(uaddr, &oldval);
 1746         if (__predict_false(error))
 1747                 return;
 1748 
 1749         /*
 1750          * There are two race conditions we need to handle here:
 1751          *
 1752          * 1. User space cleared the futex word but died before
 1753          *    being able to issue the wakeup.  No wakeups will
 1754          *    ever be issued, oops!
 1755          *
 1756          * 2. Awakened waiter died before being able to acquire
 1757          *    the futex in user space.  Any other waiters are
 1758          *    now stuck, oops!
 1759          *
 1760          * In both of these cases, the futex word will be 0 (because
 1761          * it's updated before the wake is issued).  The best we can
 1762          * do is detect this situation if it's the pending futex and
 1763          * issue a wake without modifying the futex word.
 1764          *
 1765          * XXX eventual PI handling?
 1766          */
 1767         if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
 1768                 register_t retval;
 1769                 (void) futex_func_wake(/*shared*/true, uaddr, 1,
 1770                     FUTEX_BITSET_MATCH_ANY, &retval);
 1771                 return;
 1772         }
 1773 
 1774         /* Optimistically test whether we need to do anything at all.  */
 1775         if ((oldval & FUTEX_TID_MASK) != tid)
 1776                 return;
 1777 
 1778         /*
 1779          * We need to handle the case where this thread owned the futex,
 1780          * but it was uncontended.  In this case, there won't be any
 1781          * kernel state to look up.  All we can do is mark the futex
 1782          * as a zombie to be mopped up the next time another thread
 1783          * attempts to acquire it.
 1784          *
 1785          * N.B. It's important to ensure that we set FUTEX_OWNER_DIED
 1786          * in this loop, even if waiters appear while we are doing
 1787          * so.  This is because FUTEX_WAITERS is set by user space
 1788          * before calling __futex() to wait, and the futex needs
 1789          * to be marked as a zombie when the new waiter gets into
 1790          * the kernel.
 1791          */
 1792         if ((oldval & FUTEX_WAITERS) == 0) {
 1793                 do {
 1794                         error = futex_load(uaddr, &oldval);
 1795                         if (error)
 1796                                 return;
 1797                         if ((oldval & FUTEX_TID_MASK) != tid)
 1798                                 return;
 1799                         newval = oldval | FUTEX_OWNER_DIED;
 1800                         error = ucas_int(uaddr, oldval, newval, &actual);
 1801                         if (error)
 1802                                 return;
 1803                 } while (actual != oldval);
 1804 
 1805                 /*
 1806                  * If there is still no indication of waiters, then there is
 1807                  * no more work for us to do.
 1808                  */
 1809                 if ((oldval & FUTEX_WAITERS) == 0)
 1810                         return;
 1811         }
 1812 
 1813         /*
 1814          * Look for a shared futex since we have no positive indication
 1815          * it is private.  If we can't, tough.
 1816          */
 1817         error = futex_lookup(uaddr, /*shared*/true, &f);
 1818         if (error)
 1819                 return;
 1820 
 1821         /*
 1822          * If there's no kernel state for this futex, there's nothing to
 1823          * release.
 1824          */
 1825         if (f == NULL)
 1826                 return;
 1827 
 1828         /* Work under the futex queue lock.  */
 1829         futex_queue_lock(f);
 1830 
 1831         /*
 1832          * Fetch the word: if the tid doesn't match ours, skip;
 1833          * otherwise, set the owner-died bit, atomically.
 1834          */
 1835         do {
 1836                 error = futex_load(uaddr, &oldval);
 1837                 if (error)
 1838                         goto out;
 1839                 if ((oldval & FUTEX_TID_MASK) != tid)
 1840                         goto out;
 1841                 newval = oldval | FUTEX_OWNER_DIED;
 1842                 error = ucas_int(uaddr, oldval, newval, &actual);
 1843                 if (error)
 1844                         goto out;
 1845         } while (actual != oldval);
 1846 
 1847         /*
 1848          * If there may be waiters, try to wake one.  If anything goes
 1849          * wrong, tough.
 1850          *
 1851          * XXX eventual PI handling?
 1852          */
 1853         if (oldval & FUTEX_WAITERS)
 1854                 (void)futex_wake(f, 1, NULL, 0, FUTEX_BITSET_MATCH_ANY);
 1855 
 1856         /* Unlock the queue and release the futex.  */
 1857 out:    futex_queue_unlock(f);
 1858         futex_rele(f);
 1859 }
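      
      /*
       * From userland's point of view, the next thread to acquire such
       * a lock observes the zombie bit set here.  A hedged sketch of
       * the recovery check (lock-word layout as implied by
       * FUTEX_TID_MASK above; m and repair_state are hypothetical):
       *
       *      int v = *(volatile int *)&m->lock;
       *      if (v & FUTEX_OWNER_DIED) {
       *              // Previous owner died holding the lock; state it
       *              // guarded may be inconsistent and needs repair.
       *              repair_state(m);
       *      }
       */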
 1860 
 1861 /*
 1862  * futex_robust_head_lookup(l, lwpid)
 1863  *
 1864  *      Helper function to look up a robust head by LWP ID.
 1865  */
 1866 int
 1867 futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
 1868 {
 1869         struct proc *p = l->l_proc;
 1870 
 1871         /* Find the other lwp, if requested; otherwise use our robust head.  */
 1872         if (lwpid) {
 1873                 mutex_enter(p->p_lock);
 1874                 l = lwp_find(p, lwpid);
 1875                 if (l == NULL) {
 1876                         mutex_exit(p->p_lock);
 1877                         return ESRCH;
 1878                 }
 1879                 *headp = (void *)l->l_robust_head;
 1880                 mutex_exit(p->p_lock);
 1881         } else {
 1882                 *headp = (void *)l->l_robust_head;
 1883         }
 1884         return 0;
 1885 }
 1886 
 1887 /*
 1888  * futex_fetch_robust_head(uaddr)
 1889  *
 1890  *      Helper routine to fetch the futex robust list head that
 1891  *      handles 32-bit binaries running on 64-bit kernels.
 1892  */
 1893 static int
 1894 futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
 1895 {
 1896 #ifdef _LP64
 1897         if (curproc->p_flag & PK_32) {
 1898                 uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
 1899                 int error;
 1900 
 1901                 error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
 1902                 if (__predict_true(error == 0)) {
 1903                         for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
 1904                                 if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
 1905                                         /*
 1906                                          * Make sure the offset is sign-
 1907                                          * extended.
 1908                                          */
 1909                                         rhead[i] = (int32_t)rhead32[i];
 1910                                 } else {
 1911                                         rhead[i] = rhead32[i];
 1912                                 }
 1913                         }
 1914                 }
 1915                 return error;
 1916         }
 1917 #endif /* _LP64 */
 1918 
 1919         return copyin((void *)uaddr, rhead,
 1920             sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
 1921 }
 1922 
 1923 /*
 1924  * futex_decode_robust_word(word)
 1925  *
 1926  *      Decode a robust futex list word into the entry and entry
 1927  *      properties.
 1928  */
 1929 static inline void
 1930 futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
 1931     bool * const is_pi)
 1932 {
 1933         *is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
 1934         *entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
 1935 }
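      
      /*
       * Userland encodes each list word the same way, borrowing a low
       * flag bit of the (suitably aligned) entry pointer (sketch):
       *
       *      word = (uintptr_t)entry | (is_pi ? _FUTEX_ROBUST_ENTRY_PI : 0);
       */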
 1936 
 1937 /*
 1938  * futex_fetch_robust_entry(uaddr)
 1939  *
 1940  *      Helper routine to fetch and decode a robust futex entry
 1941  *      that handles 32-bit binaries running on 64-bit kernels.
 1942  */
 1943 static int
 1944 futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
 1945     bool * const is_pi)
 1946 {
 1947         uintptr_t val = 0;
 1948         int error = 0;
 1949 
 1950 #ifdef _LP64
 1951         if (curproc->p_flag & PK_32) {
 1952                 uint32_t val32;
 1953 
 1954                 error = ufetch_32((uint32_t *)uaddr, &val32);
 1955                 if (__predict_true(error == 0))
 1956                         val = val32;
 1957         } else
 1958 #endif /* _LP64 */
 1959                 error = ufetch_long((u_long *)uaddr, (u_long *)&val);
 1960         if (__predict_false(error))
 1961                 return error;
 1962 
 1963         futex_decode_robust_word(val, valp, is_pi);
 1964         return 0;
 1965 }
 1966 
 1967 /*
 1968  * futex_release_all_lwp(l)
 1969  *
 1970  *      Release all l's robust futexes.  If anything looks funny in
 1971  *      the process, give up -- it's userland's responsibility to dot
 1972  *      the i's and cross the t's.
 1973  */
 1974 void
 1975 futex_release_all_lwp(struct lwp * const l)
 1976 {
 1977         u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
 1978         int limit = 1000000;
 1979         int error;
 1980 
 1981         /* If there's no robust list there's nothing to do. */
 1982         if (l->l_robust_head == 0)
 1983                 return;
 1984 
 1985         KASSERT((l->l_lid & FUTEX_TID_MASK) == l->l_lid);
 1986 
 1987         /* Read the final snapshot of the robust list head. */
 1988         error = futex_fetch_robust_head(l->l_robust_head, rhead);
 1989         if (error) {
 1990                 printf("WARNING: pid %jd (%s) lwp %jd:"
 1991                     " unmapped robust futex list head\n",
 1992                     (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
 1993                     (uintmax_t)l->l_lid);
 1994                 return;
 1995         }
 1996 
 1997         const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];
 1998 
 1999         uintptr_t next, pending;
 2000         bool is_pi, pending_is_pi;
 2001 
 2002         futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
 2003             &next, &is_pi);
 2004         futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
 2005             &pending, &pending_is_pi);
 2006 
 2007         /*
 2008          * Walk down the list of locked futexes and release them, up
 2009          * to one million of them before we give up.
 2010          */
 2011 
 2012         while (next != l->l_robust_head && limit-- > 0) {
 2013                 /* pending handled below. */
 2014                 if (next != pending)
 2015                         release_futex(next + offset, l->l_lid, is_pi, false);
 2016                 error = futex_fetch_robust_entry(next, &next, &is_pi);
 2017                 if (error)
 2018                         break;
 2019                 preempt_point();
 2020         }
 2021         if (limit <= 0) {
 2022                 printf("WARNING: pid %jd (%s) lwp %jd:"
 2023                     " exhausted robust futex limit\n",
 2024                     (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
 2025                     (uintmax_t)l->l_lid);
 2026         }
 2027 
 2028         /* If there's a pending futex, it may need to be released too. */
 2029         if (pending != 0) {
 2030                 release_futex(pending + offset, l->l_lid, pending_is_pi, true);
 2031         }
 2032 }
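      
      /*
       * The walk above assumes the usual robust-futex protocol on the
       * userland side (a sketch; lock() and list_insert() are
       * hypothetical helpers): point `pending' at the entry before
       * acquiring, insert the entry into `list' once the lock word
       * holds our TID, then clear `pending'; unlocking reverses the
       * steps.  A crash at any instant thus leaves every held futex
       * reachable from `list' or `pending':
       *
       *      head.pending = &m->entry;
       *      lock(&m->lock);                 // CAS our TID into the word
       *      list_insert(&head.list, &m->entry);
       *      head.pending = 0;
       */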
