FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/zfs/mmp.c


    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 /*
   22  * Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
   23  */
   24 
   25 #include <sys/abd.h>
   26 #include <sys/mmp.h>
   27 #include <sys/spa.h>
   28 #include <sys/spa_impl.h>
   29 #include <sys/time.h>
   30 #include <sys/vdev.h>
   31 #include <sys/vdev_impl.h>
   32 #include <sys/zfs_context.h>
   33 #include <sys/callb.h>
   34 
   35 /*
   36  * Multi-Modifier Protection (MMP) attempts to prevent a user from importing
   37  * or opening a pool on more than one host at a time.  In particular, it
   38  * prevents "zpool import -f" on a host from succeeding while the pool is
   39  * already imported on another host.  There are many other ways in which a
   40  * device could be used by two hosts for different purposes at the same time
   41  * resulting in pool damage.  This implementation does not attempt to detect
   42  * those cases.
   43  *
   44  * MMP operates by ensuring there are frequent visible changes on disk (a
   45  * "heartbeat") at all times.  And by altering the import process to check
   46  * for these changes and failing the import when they are detected.  This
   47  * functionality is enabled by setting the 'multihost' pool property to on.
   48  *
   49  * Uberblocks written by the txg_sync thread always go into the first
   50  * (N-MMP_BLOCKS_PER_LABEL) slots; the remaining slots are reserved for MMP.
   51  * They are used to hold uberblocks which are exactly the same as the last
   52  * synced uberblock except that the ub_timestamp and mmp_config are frequently
   53  * updated.  Like all other uberblocks, the slot is written with an embedded
   54  * checksum, and slots with invalid checksums are ignored.  This provides the
   55  * "heartbeat", with no risk of overwriting good uberblocks that must be
   56  * preserved, e.g. previous txgs and associated block pointers.
   57  *
   58  * Three optional fields are added to the uberblock structure: ub_mmp_magic,
   59  * ub_mmp_config, and ub_mmp_delay.  The ub_mmp_magic value allows zfs to tell
   60  * whether the other ub_mmp_* fields are valid.  The ub_mmp_config field tells
   61  * the importing host the settings of zfs_multihost_interval and
   62  * zfs_multihost_fail_intervals on the host which last had (or currently has)
   63  * the pool imported.  These determine how long a host must wait to detect
   64  * activity in the pool, before concluding the pool is not in use.  The
   65  * mmp_delay field is a decaying average of the amount of time between
   66  * completion of successive MMP writes, in nanoseconds.  It indicates whether
   67  * MMP is enabled.
   68  *
   69  * During import an activity test may now be performed to determine if
   70  * the pool is in use.  The activity test is typically required if the
   71  * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is
   72  * POOL_STATE_ACTIVE, and the pool is not a root pool.
   73  *
   74  * The activity test finds the "best" uberblock (highest txg, timestamp, and, if
   75  * ub_mmp_magic is valid, sequence number from ub_mmp_config).  It then waits
   76  * some time, and finds the "best" uberblock again.  If any of the mentioned
   77  * fields have different values in the newly read uberblock, the pool is in use
   78  * by another host and the import fails.  In order to assure the accuracy of the
   79  * activity test, the default values result in an activity test duration of 20x
   80  * the mmp write interval.
   81  *
   82  * The duration of the "zpool import" activity test depends on the information
   83  * available in the "best" uberblock:
   84  *
   85  * 1) If uberblock was written by zfs-0.8 or newer and fail_intervals > 0:
   86  *    ub_mmp_config.fail_intervals * ub_mmp_config.multihost_interval * 2
   87  *
   88  *    In this case, a weak guarantee is provided.  Since the host which last had
   89  *    the pool imported will suspend the pool if no mmp writes land within
   90  *    fail_intervals * multihost_interval ms, the absence of writes during that
   91  *    time means either the pool is not imported, or it is imported but the pool
   92  *    is suspended and no further writes will occur.
   93  *
   94  *    Note that resuming the suspended pool on the remote host would invalidate
   95  *    this guarantee, and so it is not allowed.
   96  *
   97  *    The factor of 2 provides a conservative safety factor and derives from
   98  *    MMP_IMPORT_SAFETY_FACTOR.
   99  *
  100  * 2) If uberblock was written by zfs-0.8 or newer and fail_intervals == 0:
  101  *    (ub_mmp_config.multihost_interval + ub_mmp_delay) *
  102  *        zfs_multihost_import_intervals
  103  *
  104  *    In this case no guarantee can be provided.  However, as long as some devices
  105  *    are healthy and connected, it is likely that at least one write will land
  106  *    within (multihost_interval + mmp_delay) because multihost_interval is
  107  *    enough time for a write to be attempted to each leaf vdev, and mmp_delay
  108  *    is enough for one to land, based on past delays.  Multiplying by
  109  *    zfs_multihost_import_intervals provides a conservative safety factor.
  110  *
  111  * 3) If uberblock was written by zfs-0.7:
  112  *    (zfs_multihost_interval + ub_mmp_delay) * zfs_multihost_import_intervals
  113  *
  114  *    The same logic as case #2 applies, but we do not know remote tunables.
  115  *
  116  *    We use the local value for zfs_multihost_interval because the original MMP
  117  *    did not record this value in the uberblock.
  118  *
  119  *    ub_mmp_delay >= (zfs_multihost_interval / leaves), so if the other host
  120  *    has a much larger zfs_multihost_interval set, ub_mmp_delay will reflect
  121  *    that.  We will have waited enough time for zfs_multihost_import_intervals
  122  *    writes to be issued and all but one to land.
  123  *
  124  *    Single-device pool example delays:
  125  *
  126  *    import_delay = (1 + 1) * 20   =  40s #defaults, no I/O delay
  127  *    import_delay = (1 + 10) * 20  = 220s #defaults, 10s I/O delay
  128  *    import_delay = (10 + 10) * 20 = 400s #10s multihost_interval,
  129  *                                          no I/O delay
  130  *    100-device pool example delays:
  131  *
  132  *    import_delay = (1 + .01) * 20 =  20s #defaults, no I/O delay
  133  *    import_delay = (1 + 10) * 20  = 220s #defaults, 10s I/O delay
  134  *    import_delay = (10 + .1) * 20 = 202s #10s multihost_interval,
  135  *                                          no I/O delay
  136  *
  137  * 4) Otherwise, this uberblock was written by a pre-MMP zfs:
  138  *    zfs_multihost_import_intervals * zfs_multihost_interval
  139  *
  140  *    In this case local tunables are used.  By default this product = 20s, long
  141  *    enough for a pool with any activity at all to write at least one
  142  *    uberblock.  No guarantee can be provided.
  143  *
  144  * Additionally, the duration is then extended by a random 25% to attempt to
  145  * detect simultaneous imports, for example when both partner hosts are rebooted
  146  * at the same time and automatically attempt to import the pool.
  147  */
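
/*
 * Worked example (illustration only, assuming the shipped defaults of
 * zfs_multihost_interval = 1000 ms, zfs_multihost_fail_intervals = 10 and
 * zfs_multihost_import_intervals = 20):
 *
 *    Case 1: 10 * 1000 ms * 2          = 20 s  (the "20x" mentioned above)
 *    Case 2: (1000 ms + ~1000 ms) * 20 = ~40 s (healthy single-device pool)
 *    Case 4: 20 * 1000 ms              = 20 s  (local tunables only)
 *
 * The random 25% extension then stretches the 20 s cases to at most ~25 s.
 */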
  148 
  149 /*
  150  * Used to control the frequency of mmp writes which are performed when the
  151  * 'multihost' pool property is on.  This is one factor used to determine the
  152  * length of the activity check during import.
  153  *
  154  * On average an mmp write will be issued for each leaf vdev every
  155  * zfs_multihost_interval milliseconds.  In practice, the observed period can
  156  * vary with the I/O load, and this observed value is the ub_mmp_delay which is
  157  * stored in the uberblock.  The minimum allowed value is 100 ms.
  158  */
  159 uint64_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL;
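
/*
 * For illustration (assuming the default interval of 1000 ms): with 4 leaf
 * vdevs, mmp_thread() below schedules a write roughly every
 * 1000 ms / 4 = 250 ms, rotating across the leaves via mmp_next_leaf(), so
 * each individual leaf is written about once per second on average.
 */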
  160 
  161 /*
  162  * Used to control the duration of the activity test on import.  Smaller values
  163  * of zfs_multihost_import_intervals will reduce the import time but increase
  164  * the risk of failing to detect an active pool.  The total activity check time
  165  * is never allowed to drop below one second.  A value of 0 is ignored and
  166  * treated as if it was set to 1.
  167  */
  168 uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS;
  169 
  170 /*
  171  * Controls the behavior of the pool when mmp write failures or delays are
  172  * detected.
  173  *
  174  * When zfs_multihost_fail_intervals = 0, mmp write failures or delays are
  175  * ignored.  The failures will still be reported to the ZED, which, depending on
  176  * its configuration may take action such as suspending the pool or taking a
  177  * device offline.
  178  *
  179  * When zfs_multihost_fail_intervals > 0, the pool will be suspended if
  180  * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds pass
  181  * without a successful mmp write.  This guarantees the activity test will see
  182  * mmp writes if the pool is imported.  A value of 1 is ignored and treated as
  183  * if it was set to 2, because a single leaf vdev pool will issue a write once
  184  * per multihost_interval and thus any variation in latency would cause the
  185  * pool to be suspended.
  186  */
  187 uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS;
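
/*
 * Example (assuming the defaults of zfs_multihost_fail_intervals = 10 and
 * zfs_multihost_interval = 1000 ms): if no MMP write succeeds for more than
 * 10 * 1000 ms = 10 s, mmp_thread() below calls
 * zio_suspend(spa, NULL, ZIO_SUSPEND_MMP) and the pool is suspended.
 */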
  188 
  189 static const void *const mmp_tag = "mmp_write_uberblock";
  190 static __attribute__((noreturn)) void mmp_thread(void *arg);
  191 
  192 void
  193 mmp_init(spa_t *spa)
  194 {
  195         mmp_thread_t *mmp = &spa->spa_mmp;
  196 
  197         mutex_init(&mmp->mmp_thread_lock, NULL, MUTEX_DEFAULT, NULL);
  198         cv_init(&mmp->mmp_thread_cv, NULL, CV_DEFAULT, NULL);
  199         mutex_init(&mmp->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL);
  200         mmp->mmp_kstat_id = 1;
  201 }
  202 
  203 void
  204 mmp_fini(spa_t *spa)
  205 {
  206         mmp_thread_t *mmp = &spa->spa_mmp;
  207 
  208         mutex_destroy(&mmp->mmp_thread_lock);
  209         cv_destroy(&mmp->mmp_thread_cv);
  210         mutex_destroy(&mmp->mmp_io_lock);
  211 }
  212 
  213 static void
  214 mmp_thread_enter(mmp_thread_t *mmp, callb_cpr_t *cpr)
  215 {
  216         CALLB_CPR_INIT(cpr, &mmp->mmp_thread_lock, callb_generic_cpr, FTAG);
  217         mutex_enter(&mmp->mmp_thread_lock);
  218 }
  219 
  220 static void
  221 mmp_thread_exit(mmp_thread_t *mmp, kthread_t **mpp, callb_cpr_t *cpr)
  222 {
  223         ASSERT(*mpp != NULL);
  224         *mpp = NULL;
  225         cv_broadcast(&mmp->mmp_thread_cv);
  226         CALLB_CPR_EXIT(cpr);            /* drops &mmp->mmp_thread_lock */
  227 }
  228 
  229 void
  230 mmp_thread_start(spa_t *spa)
  231 {
  232         mmp_thread_t *mmp = &spa->spa_mmp;
  233 
  234         if (spa_writeable(spa)) {
  235                 mutex_enter(&mmp->mmp_thread_lock);
  236                 if (!mmp->mmp_thread) {
  237                         mmp->mmp_thread = thread_create(NULL, 0, mmp_thread,
  238                             spa, 0, &p0, TS_RUN, defclsyspri);
  239                         zfs_dbgmsg("MMP thread started pool '%s' "
  240                             "gethrtime %llu", spa_name(spa), gethrtime());
  241                 }
  242                 mutex_exit(&mmp->mmp_thread_lock);
  243         }
  244 }
  245 
  246 void
  247 mmp_thread_stop(spa_t *spa)
  248 {
  249         mmp_thread_t *mmp = &spa->spa_mmp;
  250 
  251         mutex_enter(&mmp->mmp_thread_lock);
  252         mmp->mmp_thread_exiting = 1;
  253         cv_broadcast(&mmp->mmp_thread_cv);
  254 
  255         while (mmp->mmp_thread) {
  256                 cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock);
  257         }
  258         mutex_exit(&mmp->mmp_thread_lock);
  259         zfs_dbgmsg("MMP thread stopped pool '%s' gethrtime %llu",
  260             spa_name(spa), gethrtime());
  261 
  262         ASSERT(mmp->mmp_thread == NULL);
  263         mmp->mmp_thread_exiting = 0;
  264 }
  265 
  266 typedef enum mmp_vdev_state_flag {
  267         MMP_FAIL_NOT_WRITABLE   = (1 << 0),
  268         MMP_FAIL_WRITE_PENDING  = (1 << 1),
  269 } mmp_vdev_state_flag_t;
  270 
  271 /*
  272  * Find a leaf vdev to write an MMP block to.  It must not have an outstanding
  273  * mmp write (if it does, a new write would also likely block).  If there is
  274  * no usable leaf, a nonzero error value is returned.  The error value
  275  * returned is a bit field.
  276  *
  277  * MMP_FAIL_WRITE_PENDING   One or more leaf vdevs are writeable, but have an
  278  *                          outstanding MMP write.
  279  * MMP_FAIL_NOT_WRITABLE    One or more leaf vdevs are not writeable.
  280  */
  281 
  282 static int
  283 mmp_next_leaf(spa_t *spa)
  284 {
  285         vdev_t *leaf;
  286         vdev_t *starting_leaf;
  287         int fail_mask = 0;
  288 
  289         ASSERT(MUTEX_HELD(&spa->spa_mmp.mmp_io_lock));
  290         ASSERT(spa_config_held(spa, SCL_STATE, RW_READER));
  291         ASSERT(list_link_active(&spa->spa_leaf_list.list_head) == B_TRUE);
  292         ASSERT(!list_is_empty(&spa->spa_leaf_list));
  293 
  294         if (spa->spa_mmp.mmp_leaf_last_gen != spa->spa_leaf_list_gen) {
  295                 spa->spa_mmp.mmp_last_leaf = list_head(&spa->spa_leaf_list);
  296                 spa->spa_mmp.mmp_leaf_last_gen = spa->spa_leaf_list_gen;
  297         }
  298 
  299         leaf = spa->spa_mmp.mmp_last_leaf;
  300         if (leaf == NULL)
  301                 leaf = list_head(&spa->spa_leaf_list);
  302         starting_leaf = leaf;
  303 
  304         do {
  305                 leaf = list_next(&spa->spa_leaf_list, leaf);
  306                 if (leaf == NULL) {
  307                         leaf = list_head(&spa->spa_leaf_list);
  308                         ASSERT3P(leaf, !=, NULL);
  309                 }
  310 
  311                 /*
  312                  * We skip unwritable, offline, detached, and dRAID spare
  313                  * devices as they are either not legal targets or the write
  314                  * may fail or not be seen by other hosts.  Skipped dRAID
  315                  * spares can never be written so the fail mask is not set.
  316                  */
  317                 if (!vdev_writeable(leaf) || leaf->vdev_offline ||
  318                     leaf->vdev_detached) {
  319                         fail_mask |= MMP_FAIL_NOT_WRITABLE;
  320                 } else if (leaf->vdev_ops == &vdev_draid_spare_ops) {
  321                         continue;
  322                 } else if (leaf->vdev_mmp_pending != 0) {
  323                         fail_mask |= MMP_FAIL_WRITE_PENDING;
  324                 } else {
  325                         spa->spa_mmp.mmp_last_leaf = leaf;
  326                         return (0);
  327                 }
  328         } while (leaf != starting_leaf);
  329 
  330         ASSERT(fail_mask);
  331 
  332         return (fail_mask);
  333 }
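
/*
 * Usage sketch (illustrative): the nonzero return value is a bit mask, not
 * an errno, so a caller can tell the two failure reasons apart:
 *
 *     int mask = mmp_next_leaf(spa);
 *     boolean_t pending = (mask & MMP_FAIL_WRITE_PENDING) != 0;
 *     boolean_t unwritable = (mask & MMP_FAIL_NOT_WRITABLE) != 0;
 *
 * mmp_write_uberblock() below simply treats any nonzero mask as "skip this
 * write" and records the mask in the MMP kstat history.
 */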
  334 
  335 /*
  336  * MMP writes are issued on a fixed schedule, but may complete at variable,
  337  * much longer, intervals.  The mmp_delay captures long periods between
  338  * successful writes for any reason, including disk latency, scheduling delays,
  339  * etc.
  340  *
  341  * The mmp_delay is usually calculated as a decaying average, but if the latest
  342  * delay is higher we do not average it, so that we do not hide sudden spikes
  343  * which the importing host must wait for.
  344  *
  345  * If writes are occurring frequently, such as due to a high rate of txg syncs,
  346  * the mmp_delay could become very small.  Since those short delays depend on
  347  * activity we cannot count on, we never allow mmp_delay to get lower than the
  348  * rate expected if only mmp_thread writes occur.
  349  *
  350  * If an mmp write was skipped or fails, and we have already waited longer than
  351  * mmp_delay, we need to update it so the next write reflects the longer delay.
  352  *
  353  * Do not set mmp_delay if the multihost property is not on, so as not to
  354  * trigger an activity check on import.
  355  */
  356 static void
  357 mmp_delay_update(spa_t *spa, boolean_t write_completed)
  358 {
  359         mmp_thread_t *mts = &spa->spa_mmp;
  360         hrtime_t delay = gethrtime() - mts->mmp_last_write;
  361 
  362         ASSERT(MUTEX_HELD(&mts->mmp_io_lock));
  363 
  364         if (spa_multihost(spa) == B_FALSE) {
  365                 mts->mmp_delay = 0;
  366                 return;
  367         }
  368 
  369         if (delay > mts->mmp_delay)
  370                 mts->mmp_delay = delay;
  371 
  372         if (write_completed == B_FALSE)
  373                 return;
  374 
  375         mts->mmp_last_write = gethrtime();
  376 
  377         /*
  378          * strictly less than, in case delay was changed above.
  379          */
  380         if (delay < mts->mmp_delay) {
  381                 hrtime_t min_delay =
  382                     MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)) /
  383                     MAX(1, vdev_count_leaves(spa));
  384                 mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128),
  385                     min_delay);
  386         }
  387 }
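
/*
 * Worked example (illustrative numbers): suppose mmp_delay is currently
 * 256 ms and the next write completes after only 50 ms.  Because the new
 * delay is smaller, it is averaged in:
 *
 *     (50 ms + 256 ms * 127) / 128 ~= 254 ms
 *
 * so short delays pull the average down only slowly.  Had the write taken
 * 1000 ms instead, mmp_delay would have jumped straight to 1000 ms.  With a
 * 1000 ms interval and 8 leaves, the min_delay floor would be 125 ms.
 */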
  388 
  389 static void
  390 mmp_write_done(zio_t *zio)
  391 {
  392         spa_t *spa = zio->io_spa;
  393         vdev_t *vd = zio->io_vd;
  394         mmp_thread_t *mts = zio->io_private;
  395 
  396         mutex_enter(&mts->mmp_io_lock);
  397         uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id;
  398         hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending;
  399 
  400         mmp_delay_update(spa, (zio->io_error == 0));
  401 
  402         vd->vdev_mmp_pending = 0;
  403         vd->vdev_mmp_kstat_id = 0;
  404 
  405         mutex_exit(&mts->mmp_io_lock);
  406         spa_config_exit(spa, SCL_STATE, mmp_tag);
  407 
  408         spa_mmp_history_set(spa, mmp_kstat_id, zio->io_error,
  409             mmp_write_duration);
  410 
  411         abd_free(zio->io_abd);
  412 }
  413 
  414 /*
  415  * When the on-disk uberblock is updated by spa_sync,
  416  * creating a new "best" uberblock, update the copy stored
  417  * in the mmp thread state, which is used for mmp writes.
  418  */
  419 void
  420 mmp_update_uberblock(spa_t *spa, uberblock_t *ub)
  421 {
  422         mmp_thread_t *mmp = &spa->spa_mmp;
  423 
  424         mutex_enter(&mmp->mmp_io_lock);
  425         mmp->mmp_ub = *ub;
  426         mmp->mmp_seq = 1;
  427         mmp->mmp_ub.ub_timestamp = gethrestime_sec();
  428         mmp_delay_update(spa, B_TRUE);
  429         mutex_exit(&mmp->mmp_io_lock);
  430 }
  431 
  432 /*
  433  * Choose a random vdev, label, and MMP block, and write over it
  434  * with a copy of the last-synced uberblock, whose timestamp
  435  * has been updated to reflect that the pool is in use.
  436  */
  437 static void
  438 mmp_write_uberblock(spa_t *spa)
  439 {
  440         int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
  441         mmp_thread_t *mmp = &spa->spa_mmp;
  442         uberblock_t *ub;
  443         vdev_t *vd = NULL;
  444         int label, error;
  445         uint64_t offset;
  446 
  447         hrtime_t lock_acquire_time = gethrtime();
  448         spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER);
  449         lock_acquire_time = gethrtime() - lock_acquire_time;
  450         if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
  451                 zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns "
  452                     "gethrtime %llu", spa_name(spa), lock_acquire_time,
  453                     gethrtime());
  454 
  455         mutex_enter(&mmp->mmp_io_lock);
  456 
  457         error = mmp_next_leaf(spa);
  458 
  459         /*
  460          * spa_mmp_history has two types of entries:
  461          * Issued MMP write: records time issued, error status, etc.
  462          * Skipped MMP write: an MMP write could not be issued because no
  463          * suitable leaf vdev was available.  See comment above struct
  464          * spa_mmp_history for details.
  465          */
  466 
  467         if (error) {
  468                 mmp_delay_update(spa, B_FALSE);
  469                 if (mmp->mmp_skip_error == error) {
  470                         spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1);
  471                 } else {
  472                         mmp->mmp_skip_error = error;
  473                         spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg,
  474                             gethrestime_sec(), mmp->mmp_delay, NULL, 0,
  475                             mmp->mmp_kstat_id++, error);
  476                         zfs_dbgmsg("MMP error choosing leaf pool '%s' "
  477                             "gethrtime %llu fail_mask %#x", spa_name(spa),
  478                             gethrtime(), error);
  479                 }
  480                 mutex_exit(&mmp->mmp_io_lock);
  481                 spa_config_exit(spa, SCL_STATE, mmp_tag);
  482                 return;
  483         }
  484 
  485         vd = spa->spa_mmp.mmp_last_leaf;
  486         if (mmp->mmp_skip_error != 0) {
  487                 mmp->mmp_skip_error = 0;
  488                 zfs_dbgmsg("MMP write after skipping due to unavailable "
  489                     "leaves, pool '%s' gethrtime %llu leaf %llu",
  490                     spa_name(spa), (u_longlong_t)gethrtime(),
  491                     (u_longlong_t)vd->vdev_guid);
  492         }
  493 
  494         if (mmp->mmp_zio_root == NULL)
  495                 mmp->mmp_zio_root = zio_root(spa, NULL, NULL,
  496                     flags | ZIO_FLAG_GODFATHER);
  497 
  498         if (mmp->mmp_ub.ub_timestamp != gethrestime_sec()) {
  499                 /*
  500                  * Want to reset mmp_seq when timestamp advances because after
  501                  * an mmp_seq wrap new values will not be chosen by
  502                  * uberblock_compare() as the "best".
  503                  */
  504                 mmp->mmp_ub.ub_timestamp = gethrestime_sec();
  505                 mmp->mmp_seq = 1;
  506         }
  507 
  508         ub = &mmp->mmp_ub;
  509         ub->ub_mmp_magic = MMP_MAGIC;
  510         ub->ub_mmp_delay = mmp->mmp_delay;
  511         ub->ub_mmp_config = MMP_SEQ_SET(mmp->mmp_seq) |
  512             MMP_INTERVAL_SET(MMP_INTERVAL_OK(zfs_multihost_interval)) |
  513             MMP_FAIL_INT_SET(MMP_FAIL_INTVS_OK(
  514             zfs_multihost_fail_intervals));
  515         vd->vdev_mmp_pending = gethrtime();
  516         vd->vdev_mmp_kstat_id = mmp->mmp_kstat_id;
  517 
  518         zio_t *zio  = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags);
  519         abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
  520         abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
  521         abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
  522 
  523         mmp->mmp_seq++;
  524         mmp->mmp_kstat_id++;
  525         mutex_exit(&mmp->mmp_io_lock);
  526 
  527         offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) -
  528             MMP_BLOCKS_PER_LABEL + random_in_range(MMP_BLOCKS_PER_LABEL));
  529 
  530         label = random_in_range(VDEV_LABELS);
  531         vdev_label_write(zio, vd, label, ub_abd, offset,
  532             VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp,
  533             flags | ZIO_FLAG_DONT_PROPAGATE);
  534 
  535         (void) spa_mmp_history_add(spa, ub->ub_txg, ub->ub_timestamp,
  536             ub->ub_mmp_delay, vd, label, vd->vdev_mmp_kstat_id, 0);
  537 
  538         zio_nowait(zio);
  539 }
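
/*
 * Layout example (assuming MMP_BLOCKS_PER_LABEL is 1 and a 128-slot
 * uberblock ring): the offset computed above always falls in the last
 * MMP_BLOCKS_PER_LABEL slots of the ring, slot 127 in this example, so MMP
 * heartbeats never overwrite uberblocks written by txg_sync, which uses
 * only the earlier slots.  One of the four labels is chosen at random,
 * spreading the heartbeat across all labels over time.
 */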
  540 
  541 static __attribute__((noreturn)) void
  542 mmp_thread(void *arg)
  543 {
  544         spa_t *spa = (spa_t *)arg;
  545         mmp_thread_t *mmp = &spa->spa_mmp;
  546         boolean_t suspended = spa_suspended(spa);
  547         boolean_t multihost = spa_multihost(spa);
  548         uint64_t mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK(
  549             zfs_multihost_interval));
  550         uint32_t mmp_fail_intervals = MMP_FAIL_INTVS_OK(
  551             zfs_multihost_fail_intervals);
  552         hrtime_t mmp_fail_ns = mmp_fail_intervals * mmp_interval;
  553         boolean_t last_spa_suspended;
  554         boolean_t last_spa_multihost;
  555         uint64_t last_mmp_interval;
  556         uint32_t last_mmp_fail_intervals;
  557         hrtime_t last_mmp_fail_ns;
  558         callb_cpr_t cpr;
  559         int skip_wait = 0;
  560 
  561         mmp_thread_enter(mmp, &cpr);
  562 
  563         /*
  564          * There have been no MMP writes yet.  Setting mmp_last_write here gives
  565          * us one mmp_fail_ns period, which is consistent with the activity
  566          * check duration, to try to land an MMP write before MMP suspends the
  567          * pool (if so configured).
  568          */
  569 
  570         mutex_enter(&mmp->mmp_io_lock);
  571         mmp->mmp_last_write = gethrtime();
  572         mmp->mmp_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval));
  573         mutex_exit(&mmp->mmp_io_lock);
  574 
  575         while (!mmp->mmp_thread_exiting) {
  576                 hrtime_t next_time = gethrtime() +
  577                     MSEC2NSEC(MMP_DEFAULT_INTERVAL);
  578                 int leaves = MAX(vdev_count_leaves(spa), 1);
  579 
  580                 /* Detect changes in tunables or state */
  581 
  582                 last_spa_suspended = suspended;
  583                 last_spa_multihost = multihost;
  584                 suspended = spa_suspended(spa);
  585                 multihost = spa_multihost(spa);
  586 
  587                 last_mmp_interval = mmp_interval;
  588                 last_mmp_fail_intervals = mmp_fail_intervals;
  589                 last_mmp_fail_ns = mmp_fail_ns;
  590                 mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK(
  591                     zfs_multihost_interval));
  592                 mmp_fail_intervals = MMP_FAIL_INTVS_OK(
  593                     zfs_multihost_fail_intervals);
  594 
  595                 /* Smooth so pool is not suspended when reducing tunables */
  596                 if (mmp_fail_intervals * mmp_interval < mmp_fail_ns) {
  597                         mmp_fail_ns = (mmp_fail_ns * 31 +
  598                             mmp_fail_intervals * mmp_interval) / 32;
  599                 } else {
  600                         mmp_fail_ns = mmp_fail_intervals *
  601                             mmp_interval;
  602                 }
  603 
  604                 if (mmp_interval != last_mmp_interval ||
  605                     mmp_fail_intervals != last_mmp_fail_intervals) {
  606                         /*
  607                          * We want other hosts to see new tunables as quickly as
  608                          * possible.  Write out at higher frequency than usual.
  609                          */
  610                         skip_wait += leaves;
  611                 }
  612 
  613                 if (multihost)
  614                         next_time = gethrtime() + mmp_interval / leaves;
  615 
  616                 if (mmp_fail_ns != last_mmp_fail_ns) {
  617                         zfs_dbgmsg("MMP interval change pool '%s' "
  618                             "gethrtime %llu last_mmp_interval %llu "
  619                             "mmp_interval %llu last_mmp_fail_intervals %u "
  620                             "mmp_fail_intervals %u mmp_fail_ns %llu "
  621                             "skip_wait %d leaves %d next_time %llu",
  622                             spa_name(spa), (u_longlong_t)gethrtime(),
  623                             (u_longlong_t)last_mmp_interval,
  624                             (u_longlong_t)mmp_interval, last_mmp_fail_intervals,
  625                             mmp_fail_intervals, (u_longlong_t)mmp_fail_ns,
  626                             skip_wait, leaves, (u_longlong_t)next_time);
  627                 }
  628 
  629                 /*
  630                  * MMP off => on, or suspended => !suspended:
  631                  * No writes occurred recently.  Update mmp_last_write to give
  632                  * us some time to try.
  633                  */
  634                 if ((!last_spa_multihost && multihost) ||
  635                     (last_spa_suspended && !suspended)) {
  636                         zfs_dbgmsg("MMP state change pool '%s': gethrtime %llu "
  637                             "last_spa_multihost %u multihost %u "
  638                             "last_spa_suspended %u suspended %u",
  639                             spa_name(spa), (u_longlong_t)gethrtime(),
  640                             last_spa_multihost, multihost, last_spa_suspended,
  641                             suspended);
  642                         mutex_enter(&mmp->mmp_io_lock);
  643                         mmp->mmp_last_write = gethrtime();
  644                         mmp->mmp_delay = mmp_interval;
  645                         mutex_exit(&mmp->mmp_io_lock);
  646                 }
  647 
  648                 /*
  649                  * MMP on => off:
  650                  * mmp_delay == 0 tells importing node to skip activity check.
  651                  */
  652                 if (last_spa_multihost && !multihost) {
  653                         mutex_enter(&mmp->mmp_io_lock);
  654                         mmp->mmp_delay = 0;
  655                         mutex_exit(&mmp->mmp_io_lock);
  656                 }
  657 
  658                 /*
  659                  * Suspend the pool if no MMP write has succeeded in over
  660                  * mmp_interval * mmp_fail_intervals nanoseconds.
  661                  */
  662                 if (multihost && !suspended && mmp_fail_intervals &&
  663                     (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) {
  664                         zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu "
  665                             "mmp_last_write %llu mmp_interval %llu "
  666                             "mmp_fail_intervals %llu mmp_fail_ns %llu",
  667                             spa_name(spa), (u_longlong_t)gethrtime(),
  668                             (u_longlong_t)mmp->mmp_last_write,
  669                             (u_longlong_t)mmp_interval,
  670                             (u_longlong_t)mmp_fail_intervals,
  671                             (u_longlong_t)mmp_fail_ns);
  672                         cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
  673                             "succeeded in over %llu ms; suspending pool. "
  674                             "Hrtime %llu",
  675                             spa_name(spa),
  676                             NSEC2MSEC(gethrtime() - mmp->mmp_last_write),
  677                             gethrtime());
  678                         zio_suspend(spa, NULL, ZIO_SUSPEND_MMP);
  679                 }
  680 
  681                 if (multihost && !suspended)
  682                         mmp_write_uberblock(spa);
  683 
  684                 if (skip_wait > 0) {
  685                         next_time = gethrtime() + MSEC2NSEC(MMP_MIN_INTERVAL) /
  686                             leaves;
  687                         skip_wait--;
  688                 }
  689 
  690                 CALLB_CPR_SAFE_BEGIN(&cpr);
  691                 (void) cv_timedwait_idle_hires(&mmp->mmp_thread_cv,
  692                     &mmp->mmp_thread_lock, next_time, USEC2NSEC(100),
  693                     CALLOUT_FLAG_ABSOLUTE);
  694                 CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock);
  695         }
  696 
  697         /* Outstanding writes are allowed to complete. */
  698         zio_wait(mmp->mmp_zio_root);
  699 
  700         mmp->mmp_zio_root = NULL;
  701         mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr);
  702 
  703         thread_exit();
  704 }
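
/*
 * Smoothing example (illustrative numbers): if the tunables are lowered so
 * that mmp_fail_intervals * mmp_interval drops from 10 s to 1 s, mmp_fail_ns
 * does not drop at once.  Each loop iteration moves it 1/32 of the way:
 *
 *     (10 s * 31 + 1 s) / 32 ~= 9.72 s, then ~9.45 s, ...
 *
 * so the pool is not suspended merely because the allowed window suddenly
 * became shorter than the time since the last successful write.
 */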
  705 
  706 /*
  707  * Signal the MMP thread to wake it when it is sleeping on
  708  * its cv.  Used when a module parameter has changed and
  709  * we want the thread to notice the change promptly.
  710  * Only signal if the pool is active and the mmp thread is
  711  * running; otherwise there is no thread to wake.
  712  */
  713 static void
  714 mmp_signal_thread(spa_t *spa)
  715 {
  716         mmp_thread_t *mmp = &spa->spa_mmp;
  717 
  718         mutex_enter(&mmp->mmp_thread_lock);
  719         if (mmp->mmp_thread)
  720                 cv_broadcast(&mmp->mmp_thread_cv);
  721         mutex_exit(&mmp->mmp_thread_lock);
  722 }
  723 
  724 void
  725 mmp_signal_all_threads(void)
  726 {
  727         spa_t *spa = NULL;
  728 
  729         mutex_enter(&spa_namespace_lock);
  730         while ((spa = spa_next(spa))) {
  731                 if (spa->spa_state == POOL_STATE_ACTIVE)
  732                         mmp_signal_thread(spa);
  733         }
  734         mutex_exit(&spa_namespace_lock);
  735 }
  736 
  737 /* BEGIN CSTYLED */
  738 ZFS_MODULE_PARAM_CALL(zfs_multihost, zfs_multihost_, interval,
  739         param_set_multihost_interval, spl_param_get_u64, ZMOD_RW,
  740         "Milliseconds between mmp writes to each leaf");
  741 /* END CSTYLED */
  742 
  743 ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, fail_intervals, UINT, ZMOD_RW,
  744         "Max allowed period without a successful mmp write");
  745 
  746 ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, import_intervals, UINT, ZMOD_RW,
  747         "Number of zfs_multihost_interval periods to wait for activity");
