FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/zfs/rrwlock.c

    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 /*
   22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
   23  * Use is subject to license terms.
   24  */
   25 /*
   26  * Copyright (c) 2012 by Delphix. All rights reserved.
   27  */
   28 
   29 #include <sys/rrwlock.h>
   30 #include <sys/trace_zfs.h>
   31 
   32 /*
   33  * This file contains the implementation of a re-entrant read
   34  * reader/writer lock (aka "rrwlock").
   35  *
   36  * This is a normal reader/writer lock with the additional feature
   37  * of allowing threads that have already obtained a read lock to
   38  * re-enter another read lock (re-entrant read) - even if there are
   39  * waiting writers.
   40  *
   41  * Callers who have not obtained a read lock give waiting writers priority.
   42  *
   43  * The rrwlock_t lock does not allow re-entrant writers, nor does it
   44  * allow a re-entrant mix of reads and writes (that is, it does not
   45  * allow a caller who has already obtained a read lock to be able to
   46  * then grab a write lock without first dropping all read locks, and
   47  * vice versa).
   48  *
   49  * The rrwlock_t uses tsd (thread specific data) to keep a list of
   50  * nodes (rrw_node_t), where each node keeps track of which specific
   51  * lock (rrw_node_t::rn_rrl) the thread has grabbed.  Since re-entering
   52  * should be rare, a thread that grabs multiple reads on the same rrwlock_t
   53  * will store multiple rrw_node_ts with the same 'rn_rrl'.  Nodes on the
   54  * tsd list may refer to different rrwlock_ts, which allows a thread to
   55  * hold read locks on multiple distinct rrwlock_ts at the same time.
   56  *
   57  * Since using tsd incurs some overhead, the rrwlock_t only needs to
   58  * keep tsd data when writers are waiting.  If no writers are waiting, then
   59  * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd
   60  * is needed.  Once a writer attempts to grab the lock, readers then
   61  * keep tsd data and bump the linked readers count (rr_linked_rcount).
   62  *
   63  * If there are waiting writers and there are anonymous readers, then a
   64  * reader doesn't know whether its acquisition is re-entrant.  But since it
   65  * may be, we allow the read to proceed (otherwise it could deadlock).  Once
   66  * writers are waiting, readers no longer bump the anonymous count, so the
   67  * anonymous readers eventually drain.  At that point, readers can tell
   68  * whether their acquisition is re-entrant (they have an rrw_node_t entry
   69  * for the lock) or not.  If it is re-entrant, we must let the reader
   70  * proceed; if it is not, the reader blocks behind the waiting writers.
   71  * Hence, we do not starve writers.
   72  */
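/*
 * Illustrative usage sketch (an addition, not part of the original source):
 * how a caller might take a re-entrant read using the interfaces defined
 * below.  The lock variable and the FTAG-style tag are hypothetical, and the
 * sketch is kept under "#if 0" because it is only an example.
 */
#if 0
static rrwlock_t example_lock;

static void
example_reentrant_read(void)
{
	rrw_init(&example_lock, B_FALSE);

	rrw_enter_read(&example_lock, FTAG);	/* first read, may stay anonymous */
	rrw_enter_read(&example_lock, FTAG);	/* re-entrant read, allowed even
						 * if a writer is now waiting */
	rrw_exit(&example_lock, FTAG);
	rrw_exit(&example_lock, FTAG);		/* last read wakes waiting writers */

	rrw_destroy(&example_lock);
}
#endif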
   73 
   74 /* global key for TSD */
   75 uint_t rrw_tsd_key;
   76 
   77 typedef struct rrw_node {
   78         struct rrw_node *rn_next;
   79         rrwlock_t *rn_rrl;
   80         const void *rn_tag;
   81 } rrw_node_t;
   82 
   83 static rrw_node_t *
   84 rrn_find(rrwlock_t *rrl)
   85 {
   86         rrw_node_t *rn;
   87 
   88         if (zfs_refcount_count(&rrl->rr_linked_rcount) == 0)
   89                 return (NULL);
   90 
   91         for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
   92                 if (rn->rn_rrl == rrl)
   93                         return (rn);
   94         }
   95         return (NULL);
   96 }
   97 
   98 /*
   99  * Add a node to the head of the singly linked list.
  100  */
  101 static void
  102 rrn_add(rrwlock_t *rrl, const void *tag)
  103 {
  104         rrw_node_t *rn;
  105 
  106         rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
  107         rn->rn_rrl = rrl;
  108         rn->rn_next = tsd_get(rrw_tsd_key);
  109         rn->rn_tag = tag;
  110         VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
  111 }
  112 
  113 /*
  114  * If a node is found for 'rrl', then remove the node from this
  115  * thread's list and return TRUE; otherwise return FALSE.
  116  */
  117 static boolean_t
  118 rrn_find_and_remove(rrwlock_t *rrl, const void *tag)
  119 {
  120         rrw_node_t *rn;
  121         rrw_node_t *prev = NULL;
  122 
  123         if (zfs_refcount_count(&rrl->rr_linked_rcount) == 0)
  124                 return (B_FALSE);
  125 
  126         for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
  127                 if (rn->rn_rrl == rrl && rn->rn_tag == tag) {
  128                         if (prev)
  129                                 prev->rn_next = rn->rn_next;
  130                         else
  131                                 VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
  132                         kmem_free(rn, sizeof (*rn));
  133                         return (B_TRUE);
  134                 }
  135                 prev = rn;
  136         }
  137         return (B_FALSE);
  138 }
  139 
  140 void
  141 rrw_init(rrwlock_t *rrl, boolean_t track_all)
  142 {
  143         mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
  144         cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
  145         rrl->rr_writer = NULL;
  146         zfs_refcount_create(&rrl->rr_anon_rcount);
  147         zfs_refcount_create(&rrl->rr_linked_rcount);
  148         rrl->rr_writer_wanted = B_FALSE;
  149         rrl->rr_track_all = track_all;
  150 }
  151 
  152 void
  153 rrw_destroy(rrwlock_t *rrl)
  154 {
  155         mutex_destroy(&rrl->rr_lock);
  156         cv_destroy(&rrl->rr_cv);
  157         ASSERT(rrl->rr_writer == NULL);
  158         zfs_refcount_destroy(&rrl->rr_anon_rcount);
  159         zfs_refcount_destroy(&rrl->rr_linked_rcount);
  160 }
  161 
  162 static void
  163 rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, const void *tag)
  164 {
  165         mutex_enter(&rrl->rr_lock);
  166 #if !defined(ZFS_DEBUG) && defined(_KERNEL)
  167         if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted &&
  168             !rrl->rr_track_all) {
  169                 rrl->rr_anon_rcount.rc_count++;
  170                 mutex_exit(&rrl->rr_lock);
  171                 return;
  172         }
  173         DTRACE_PROBE(zfs__rrwfastpath__rdmiss);
  174 #endif
  175         ASSERT(rrl->rr_writer != curthread);
  176         ASSERT(zfs_refcount_count(&rrl->rr_anon_rcount) >= 0);
  177 
  178         while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted &&
  179             zfs_refcount_is_zero(&rrl->rr_anon_rcount) && !prio &&
  180             rrn_find(rrl) == NULL))
  181                 cv_wait(&rrl->rr_cv, &rrl->rr_lock);
  182 
  183         if (rrl->rr_writer_wanted || rrl->rr_track_all) {
  184                 /* may or may not be a re-entrant enter */
  185                 rrn_add(rrl, tag);
  186                 (void) zfs_refcount_add(&rrl->rr_linked_rcount, tag);
  187         } else {
  188                 (void) zfs_refcount_add(&rrl->rr_anon_rcount, tag);
  189         }
  190         ASSERT(rrl->rr_writer == NULL);
  191         mutex_exit(&rrl->rr_lock);
  192 }
  193 
  194 void
  195 rrw_enter_read(rrwlock_t *rrl, const void *tag)
  196 {
  197         rrw_enter_read_impl(rrl, B_FALSE, tag);
  198 }
  199 
  200 /*
  201  * Take a read lock even if there are pending write lock requests.  If we
  202  * want to take the lock re-entrantly but from different threads (that have
  203  * a relationship to each other), the normal per-thread detection mechanism
  204  * cannot overrule the pending writer, so we have to give an explicit hint.
  205  */
  206 void
  207 rrw_enter_read_prio(rrwlock_t *rrl, const void *tag)
  208 {
  209         rrw_enter_read_impl(rrl, B_TRUE, tag);
  210 }
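/*
 * Illustrative sketch (an addition, not part of the original source): when a
 * cooperating worker thread must read-acquire a lock that a parent thread
 * already holds for read, rrn_find() cannot see the parent's hold (the tsd
 * list is per-thread), so the worker would block behind a waiting writer and
 * could deadlock against the parent.  The prio variant overrides the
 * writer-priority check in that case.  The function and names here are
 * hypothetical.
 */
#if 0
static void
example_worker(rrwlock_t *rrl)
{
	/* The parent thread already holds rrl for read and waits on us. */
	rrw_enter_read_prio(rrl, FTAG);
	/* ... do work on the parent's behalf ... */
	rrw_exit(rrl, FTAG);
}
#endif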
  211 
  212 
  213 void
  214 rrw_enter_write(rrwlock_t *rrl)
  215 {
  216         mutex_enter(&rrl->rr_lock);
  217         ASSERT(rrl->rr_writer != curthread);
  218 
  219         while (zfs_refcount_count(&rrl->rr_anon_rcount) > 0 ||
  220             zfs_refcount_count(&rrl->rr_linked_rcount) > 0 ||
  221             rrl->rr_writer != NULL) {
  222                 rrl->rr_writer_wanted = B_TRUE;
  223                 cv_wait(&rrl->rr_cv, &rrl->rr_lock);
  224         }
  225         rrl->rr_writer_wanted = B_FALSE;
  226         rrl->rr_writer = curthread;
  227         mutex_exit(&rrl->rr_lock);
  228 }
  229 
  230 void
  231 rrw_enter(rrwlock_t *rrl, krw_t rw, const void *tag)
  232 {
  233         if (rw == RW_READER)
  234                 rrw_enter_read(rrl, tag);
  235         else
  236                 rrw_enter_write(rrl);
  237 }
  238 
  239 void
  240 rrw_exit(rrwlock_t *rrl, const void *tag)
  241 {
  242         mutex_enter(&rrl->rr_lock);
  243 #if !defined(ZFS_DEBUG) && defined(_KERNEL)
  244         if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) {
  245                 rrl->rr_anon_rcount.rc_count--;
  246                 if (rrl->rr_anon_rcount.rc_count == 0)
  247                         cv_broadcast(&rrl->rr_cv);
  248                 mutex_exit(&rrl->rr_lock);
  249                 return;
  250         }
  251         DTRACE_PROBE(zfs__rrwfastpath__exitmiss);
  252 #endif
  253         ASSERT(!zfs_refcount_is_zero(&rrl->rr_anon_rcount) ||
  254             !zfs_refcount_is_zero(&rrl->rr_linked_rcount) ||
  255             rrl->rr_writer != NULL);
  256 
  257         if (rrl->rr_writer == NULL) {
  258                 int64_t count;
  259                 if (rrn_find_and_remove(rrl, tag)) {
  260                         count = zfs_refcount_remove(
  261                             &rrl->rr_linked_rcount, tag);
  262                 } else {
  263                         ASSERT(!rrl->rr_track_all);
  264                         count = zfs_refcount_remove(&rrl->rr_anon_rcount, tag);
  265                 }
  266                 if (count == 0)
  267                         cv_broadcast(&rrl->rr_cv);
  268         } else {
  269                 ASSERT(rrl->rr_writer == curthread);
  270                 ASSERT(zfs_refcount_is_zero(&rrl->rr_anon_rcount) &&
  271                     zfs_refcount_is_zero(&rrl->rr_linked_rcount));
  272                 rrl->rr_writer = NULL;
  273                 cv_broadcast(&rrl->rr_cv);
  274         }
  275         mutex_exit(&rrl->rr_lock);
  276 }
  277 
  278 /*
  279  * If the lock was created with track_all, rrw_held(RW_READER) will return
  280  * B_TRUE iff the current thread has the lock for reader.  Otherwise it may
  281  * return B_TRUE if any thread has the lock for reader.
  282  */
  283 boolean_t
  284 rrw_held(rrwlock_t *rrl, krw_t rw)
  285 {
  286         boolean_t held;
  287 
  288         mutex_enter(&rrl->rr_lock);
  289         if (rw == RW_WRITER) {
  290                 held = (rrl->rr_writer == curthread);
  291         } else {
  292                 held = (!zfs_refcount_is_zero(&rrl->rr_anon_rcount) ||
  293                     rrn_find(rrl) != NULL);
  294         }
  295         mutex_exit(&rrl->rr_lock);
  296 
  297         return (held);
  298 }
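/*
 * Illustrative sketch (an addition, not part of the original source): only a
 * lock created with track_all records every read hold in tsd, so only then
 * can a callee assert that the *current* thread is the reader.  Names are
 * hypothetical.
 */
#if 0
static void
example_assert_held(rrwlock_t *rrl)
{
	/* Precise per-thread only if rrw_init() was called with track_all. */
	ASSERT(rrw_held(rrl, RW_READER));
}
#endif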
  299 
  300 void
  301 rrw_tsd_destroy(void *arg)
  302 {
  303         rrw_node_t *rn = arg;
  304         if (rn != NULL) {
  305                 panic("thread %p terminating with rrw lock %p held",
  306                     (void *)curthread, (void *)rn->rn_rrl);
  307         }
  308 }
  309 
  310 /*
  311  * A reader-mostly lock implementation, built on the reader/writer locks
  312  * above and tuned for highly parallel read acquisitions, while pessimizing
  313  * writes.
  314  *
  315  * The idea is to split a single busy lock into an array of locks, so that
  316  * each reader locks only one of them for read, chosen by a simple hash
  317  * function.  That proportionally reduces lock congestion.  A writer must
  318  * acquire all of the locks for write sequentially, which is proportionally
  319  * slower; but where it is used (filesystem unmount) that is not critical.
  320  *
  321  * All the functions below are direct wrappers around the functions above.
  322  */
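/*
 * Illustrative usage sketch (an addition, not part of the original source):
 * a reader only touches the one sub-lock selected by its thread pointer, so
 * parallel readers rarely contend, while a writer pays for acquiring every
 * sub-lock.  The lock variable and FTAG-style tag are hypothetical.
 */
#if 0
static rrmlock_t example_rrm;

static void
example_rrm_usage(void)
{
	rrm_init(&example_rrm, B_FALSE);

	rrm_enter(&example_rrm, RW_READER, FTAG);	/* one sub-lock: cheap */
	rrm_exit(&example_rrm, FTAG);

	rrm_enter(&example_rrm, RW_WRITER, FTAG);	/* all sub-locks: expensive */
	rrm_exit(&example_rrm, FTAG);

	rrm_destroy(&example_rrm);
}
#endif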
  323 void
  324 rrm_init(rrmlock_t *rrl, boolean_t track_all)
  325 {
  326         int i;
  327 
  328         for (i = 0; i < RRM_NUM_LOCKS; i++)
  329                 rrw_init(&rrl->locks[i], track_all);
  330 }
  331 
  332 void
  333 rrm_destroy(rrmlock_t *rrl)
  334 {
  335         int i;
  336 
  337         for (i = 0; i < RRM_NUM_LOCKS; i++)
  338                 rrw_destroy(&rrl->locks[i]);
  339 }
  340 
  341 void
  342 rrm_enter(rrmlock_t *rrl, krw_t rw, const void *tag)
  343 {
  344         if (rw == RW_READER)
  345                 rrm_enter_read(rrl, tag);
  346         else
  347                 rrm_enter_write(rrl);
  348 }
  349 
  350 /*
  351  * This maps the current thread to a specific lock.  Note that the lock
  352  * must be released by the same thread that acquired it.  We do this
  353  * mapping by taking the thread pointer mod a prime number.  We examine
  354  * only the low 32 bits of the thread pointer, because 32-bit division
  355  * is faster than 64-bit division, and the high 32 bits have little
  356  * entropy anyway.
  357  */
  358 #define RRM_TD_LOCK()   (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS)
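/*
 * Worked example (an addition, not part of the original source), assuming
 * RRM_NUM_LOCKS is the small prime defined in sys/rrwlock.h (17 at the time
 * of writing; treat the exact value as an assumption): for a hypothetical
 * curthread whose low 32 bits are 0x12345678 (305419896), the mapping is
 * 305419896 % 17 == 4, so that thread always uses rrl->locks[4] for its
 * read acquisitions.
 */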
  359 
  360 void
  361 rrm_enter_read(rrmlock_t *rrl, const void *tag)
  362 {
  363         rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag);
  364 }
  365 
  366 void
  367 rrm_enter_write(rrmlock_t *rrl)
  368 {
  369         int i;
  370 
  371         for (i = 0; i < RRM_NUM_LOCKS; i++)
  372                 rrw_enter_write(&rrl->locks[i]);
  373 }
  374 
  375 void
  376 rrm_exit(rrmlock_t *rrl, const void *tag)
  377 {
  378         int i;
  379 
  380         if (rrl->locks[0].rr_writer == curthread) {
  381                 for (i = 0; i < RRM_NUM_LOCKS; i++)
  382                         rrw_exit(&rrl->locks[i], tag);
  383         } else {
  384                 rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag);
  385         }
  386 }
  387 
  388 boolean_t
  389 rrm_held(rrmlock_t *rrl, krw_t rw)
  390 {
  391         if (rw == RW_WRITER) {
  392                 return (rrw_held(&rrl->locks[0], rw));
  393         } else {
  394                 return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw));
  395         }
  396 }
