
FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/zfs/zio_inject.c


/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2017, Intel Corporation.
 */

/*
 * ZFS fault injection
 *
 * To handle fault injection, we keep track of a series of zinject_record_t
 * structures which describe which logical block(s) should be injected with a
 * fault.  These are kept in a global list.  Each record corresponds to a given
 * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
 * or exported while the injection record exists.
 *
 * Device level injection is done using the 'zi_guid' field.  If this is set, it
 * means that the error is destined for a particular device, not a piece of
 * data.
 *
 * This is a rather poor data structure and algorithm, but we don't expect more
 * than a few faults at any one time, so it should be sufficient for our needs.
 */
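/*
 * Handlers are normally registered from user space with the zinject(8)
 * command, which reaches zio_inject_fault() through the ZFS ioctl layer.
 * As an illustrative (not exhaustive) sketch of the mapping:
 *
 *	zinject -d vdev1 -e io tank	# per-device I/O faults (zi_guid set)
 *	zinject -d vdev1 -D 25:2 tank	# delay I/O: 25ms spread over 2 lanes
 *
 * The exact flag syntax is documented in zinject(8); the examples above
 * are only meant to show which handler types the code below services.
 */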

#include <sys/arc.h>
#include <sys/zio.h>
#include <sys/zfs_ioctl.h>
#include <sys/vdev_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/fs/zfs.h>

uint32_t zio_injection_enabled = 0;

/*
 * Data describing each zinject handler registered on the system. It also
 * contains the list node linking the handler into the global zinject
 * handler list.
 */
typedef struct inject_handler {
        int                     zi_id;
        spa_t                   *zi_spa;
        zinject_record_t        zi_record;
        uint64_t                *zi_lanes;
        int                     zi_next_lane;
        list_node_t             zi_link;
} inject_handler_t;

/*
 * List of all zinject handlers registered on the system, protected by
 * the inject_lock defined below.
 */
static list_t inject_handlers;

/*
 * This protects insertion into, and traversal of, the inject handler
 * list defined above, as well as inject_delay_count. Any time a
 * handler is inserted or removed from the list, this lock should be
 * taken as a RW_WRITER; and any time traversal is done over the list
 * (without modification to it) this lock should be taken as a RW_READER.
 */
static krwlock_t inject_lock;

/*
 * This holds the number of zinject delay handlers that have been
 * registered on the system. It is protected by the inject_lock defined
 * above. Thus modifications to this count must be a RW_WRITER of the
 * inject_lock, and reads of this count must be (at least) a RW_READER
 * of the lock.
 */
static int inject_delay_count = 0;

/*
 * This lock is used only in zio_handle_io_delay(), refer to the comment
 * in that function for more details.
 */
static kmutex_t inject_delay_mtx;

/*
 * Used to assign unique identifying numbers to each new zinject handler.
 */
static int inject_next_id = 1;

/*
 * Test if the requested frequency was triggered
 */
static boolean_t
freq_triggered(uint32_t frequency)
{
        /*
         * zero implies always (100%)
         */
        if (frequency == 0)
                return (B_TRUE);

        /*
         * Note: we still handle legacy (unscaled) frequency values
         */
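        /*
         * (Illustration: frequency == 25 fires on roughly 25% of calls
         * via the legacy percentage path below, while a value of
         * ZI_PERCENTAGE_MAX / 4 fires on roughly 25% of calls via the
         * scaled path.)
         */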
        uint32_t maximum = (frequency <= 100) ? 100 : ZI_PERCENTAGE_MAX;

        return (random_in_range(maximum) < frequency);
}

/*
 * Returns true if the given record matches the I/O in progress.
 */
static boolean_t
zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva,
    zinject_record_t *record, int error)
{
        /*
         * Check for a match against the MOS, which is based on type
         */
        if (zb->zb_objset == DMU_META_OBJSET &&
            record->zi_objset == DMU_META_OBJSET &&
            record->zi_object == DMU_META_DNODE_OBJECT) {
                if (record->zi_type == DMU_OT_NONE ||
                    type == record->zi_type)
                        return (freq_triggered(record->zi_freq));
                else
                        return (B_FALSE);
        }

        /*
         * Check for an exact match.
         */
        if (zb->zb_objset == record->zi_objset &&
            zb->zb_object == record->zi_object &&
            zb->zb_level == record->zi_level &&
            zb->zb_blkid >= record->zi_start &&
            zb->zb_blkid <= record->zi_end &&
            (record->zi_dvas == 0 ||
            (dva != ZI_NO_DVA && (record->zi_dvas & (1ULL << dva)))) &&
            error == record->zi_error) {
                return (freq_triggered(record->zi_freq));
        }

        return (B_FALSE);
}

/*
 * Panic the system when a config change happens in the function
 * specified by tag.
 */
void
zio_handle_panic_injection(spa_t *spa, const char *tag, uint64_t type)
{
        inject_handler_t *handler;

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {

                if (spa != handler->zi_spa)
                        continue;

                if (handler->zi_record.zi_type == type &&
                    strcmp(tag, handler->zi_record.zi_func) == 0)
                        panic("Panic requested in function %s\n", tag);
        }

        rw_exit(&inject_lock);
}

/*
 * Inject a decryption failure. Decryption failures can occur in
 * both the ARC and the ZIO layers.
 */
int
zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb,
    uint64_t type, int error)
{
        int ret = 0;
        inject_handler_t *handler;

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {

                if (spa != handler->zi_spa ||
                    handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT)
                        continue;

                if (zio_match_handler(zb, type, ZI_NO_DVA,
                    &handler->zi_record, error)) {
                        ret = error;
                        break;
                }
        }

        rw_exit(&inject_lock);
        return (ret);
}

/*
 * If this is a physical I/O for a vdev child, determine which DVA it is
 * for. We iterate backwards through the DVAs matching on the offset so
 * that we end up with ZI_NO_DVA (-1) if we don't find a match.
 */
static int
zio_match_dva(zio_t *zio)
{
        int i = ZI_NO_DVA;

        if (zio->io_bp != NULL && zio->io_vd != NULL &&
            zio->io_child_type == ZIO_CHILD_VDEV) {
                for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) {
                        dva_t *dva = &zio->io_bp->blk_dva[i];
                        uint64_t off = DVA_GET_OFFSET(dva);
                        vdev_t *vd = vdev_lookup_top(zio->io_spa,
                            DVA_GET_VDEV(dva));

                        /* Compensate for vdev label added to leaves */
                        if (zio->io_vd->vdev_ops->vdev_op_leaf)
                                off += VDEV_LABEL_START_SIZE;

                        if (zio->io_vd == vd && zio->io_offset == off)
                                break;
                }
        }

        return (i);
}


/*
 * Determine if the I/O in question should return failure.  Returns the errno
 * to be returned to the caller.
 */
int
zio_handle_fault_injection(zio_t *zio, int error)
{
        int ret = 0;
        inject_handler_t *handler;

        /*
         * Ignore I/O not associated with any logical data.
         */
        if (zio->io_logical == NULL)
                return (0);

        /*
         * Currently, we only support fault injection on reads.
         */
        if (zio->io_type != ZIO_TYPE_READ)
                return (0);

        /*
         * A rebuild I/O has no checksum to verify.
         */
        if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM)
                return (0);

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {
                if (zio->io_spa != handler->zi_spa ||
                    handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
                        continue;

                /* If this handler matches, return the specified error */
                if (zio_match_handler(&zio->io_logical->io_bookmark,
                    zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
                    zio_match_dva(zio), &handler->zi_record, error)) {
                        ret = error;
                        break;
                }
        }

        rw_exit(&inject_lock);

        return (ret);
}

/*
 * Determine if the zio is part of a label update and has an injection
 * handler associated with that portion of the label. Currently, we
 * allow error injection in either the nvlist or the uberblock region
 * of the vdev label.
 */
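/*
 * (For context: a vdev keeps four copies of its label, two at the start
 * of the device and two at the end, which is why any offset below
 * VDEV_LABEL_START_SIZE or at or beyond psize - VDEV_LABEL_END_SIZE is
 * treated as label I/O by the check below.)
 */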
int
zio_handle_label_injection(zio_t *zio, int error)
{
        inject_handler_t *handler;
        vdev_t *vd = zio->io_vd;
        uint64_t offset = zio->io_offset;
        int label;
        int ret = 0;

        if (offset >= VDEV_LABEL_START_SIZE &&
            offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
                return (0);

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {
                uint64_t start = handler->zi_record.zi_start;
                uint64_t end = handler->zi_record.zi_end;

                if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
                        continue;

                /*
                 * The injection region is the relative offsets within a
                 * vdev label. We must determine the label which is being
                 * updated and adjust our region accordingly.
                 */
                label = vdev_label_number(vd->vdev_psize, offset);
                start = vdev_label_offset(vd->vdev_psize, label, start);
                end = vdev_label_offset(vd->vdev_psize, label, end);

                if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
                    (offset >= start && offset <= end)) {
                        ret = error;
                        break;
                }
        }
        rw_exit(&inject_lock);
        return (ret);
}

static int
zio_inject_bitflip_cb(void *data, size_t len, void *private)
{
        zio_t *zio = private;
        uint8_t *buffer = data;
        uint_t byte = random_in_range(len);

        ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);

        /* flip a single random bit in an abd data buffer */
        buffer[byte] ^= 1 << random_in_range(8);

        return (1);     /* stop after first flip */
}

static int
zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2)
{
        inject_handler_t *handler;
        int ret = 0;

        /*
         * We skip over faults in the labels unless it's during
         * device open (i.e. zio == NULL).
         */
        if (zio != NULL) {
                uint64_t offset = zio->io_offset;

                if (offset < VDEV_LABEL_START_SIZE ||
                    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
                        return (0);
        }

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {

                if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
                        continue;

                if (vd->vdev_guid == handler->zi_record.zi_guid) {
                        if (handler->zi_record.zi_failfast &&
                            (zio == NULL || (zio->io_flags &
                            (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
                                continue;
                        }

                        /* Handle type specific I/O failures */
                        if (zio != NULL &&
                            handler->zi_record.zi_iotype != ZIO_TYPES &&
                            handler->zi_record.zi_iotype != zio->io_type)
                                continue;

                        if (handler->zi_record.zi_error == err1 ||
                            handler->zi_record.zi_error == err2) {
                                /*
                                 * limit error injection if requested
                                 */
                                if (!freq_triggered(handler->zi_record.zi_freq))
                                        continue;

                                /*
                                 * For a failed open, pretend like the device
                                 * has gone away.
                                 */
                                if (err1 == ENXIO)
                                        vd->vdev_stat.vs_aux =
                                            VDEV_AUX_OPEN_FAILED;

                                /*
                                 * Treat these errors as if they had been
                                 * retried so that all the appropriate stats
                                 * and FMA events are generated.
                                 */
                                if (!handler->zi_record.zi_failfast &&
                                    zio != NULL)
                                        zio->io_flags |= ZIO_FLAG_IO_RETRY;

                                /*
                                 * EILSEQ means flip a bit after a read
                                 */
                                if (handler->zi_record.zi_error == EILSEQ) {
                                        if (zio == NULL)
                                                break;

                                        /* locate buffer data and flip a bit */
                                        (void) abd_iterate_func(zio->io_abd, 0,
                                            zio->io_size, zio_inject_bitflip_cb,
                                            zio);
                                        break;
                                }

                                ret = handler->zi_record.zi_error;
                                break;
                        }
                        if (handler->zi_record.zi_error == ENXIO) {
                                ret = SET_ERROR(EIO);
                                break;
                        }
                }
        }

        rw_exit(&inject_lock);

        return (ret);
}

int
zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
{
        return (zio_handle_device_injection_impl(vd, zio, error, INT_MAX));
}

int
zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, int err2)
{
        return (zio_handle_device_injection_impl(vd, zio, err1, err2));
}

/*
 * Simulate hardware that ignores cache flushes.  For the requested number
 * of seconds, nix the actual writing to disk.
 */
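/*
 * (These handlers are typically created with zinject's "ignored writes"
 * mode, bounded by a number of seconds or txgs; see zinject(8) for the
 * exact invocation.)
 */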
void
zio_handle_ignored_writes(zio_t *zio)
{
        inject_handler_t *handler;

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {

                /* Ignore errors not destined for this pool */
                if (zio->io_spa != handler->zi_spa ||
                    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
                        continue;

                /*
                 * Positive duration implies # of seconds, negative
                 * a number of txgs
                 */
                if (handler->zi_record.zi_timer == 0) {
                        if (handler->zi_record.zi_duration > 0)
                                handler->zi_record.zi_timer = ddi_get_lbolt64();
                        else
                                handler->zi_record.zi_timer = zio->io_txg;
                }

                /* Have a "problem" writing 60% of the time */
                if (random_in_range(100) < 60)
                        zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
                break;
        }

        rw_exit(&inject_lock);
}

void
spa_handle_ignored_writes(spa_t *spa)
{
        inject_handler_t *handler;

        if (zio_injection_enabled == 0)
                return;

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {

                if (spa != handler->zi_spa ||
                    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
                        continue;

                if (handler->zi_record.zi_duration > 0) {
                        VERIFY(handler->zi_record.zi_timer == 0 ||
                            ddi_time_after64(
                            (int64_t)handler->zi_record.zi_timer +
                            handler->zi_record.zi_duration * hz,
                            ddi_get_lbolt64()));
                } else {
                        /* duration is negative so the subtraction here adds */
                        VERIFY(handler->zi_record.zi_timer == 0 ||
                            handler->zi_record.zi_timer -
                            handler->zi_record.zi_duration >=
                            spa_syncing_txg(spa));
                }
        }

        rw_exit(&inject_lock);
}

hrtime_t
zio_handle_io_delay(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        inject_handler_t *min_handler = NULL;
        hrtime_t min_target = 0;

        rw_enter(&inject_lock, RW_READER);

        /*
         * inject_delay_count is a subset of zio_injection_enabled that
         * is only incremented for delay handlers. These checks are
         * mainly added to remind the reader why we're not explicitly
         * checking zio_injection_enabled like the other functions.
         */
        IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
        IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);

        /*
         * If there aren't any inject delay handlers registered, then we
         * can short circuit and simply return 0 here. A value of zero
         * informs zio_delay_interrupt() that this request should not be
         * delayed. This short circuit keeps us from acquiring the
         * inject_delay_mtx unnecessarily.
         */
        if (inject_delay_count == 0) {
                rw_exit(&inject_lock);
                return (0);
        }

        /*
         * Each inject handler has a number of "lanes" associated with
         * it. Each lane is able to handle requests independently of one
         * another, and at a latency defined by the inject handler
         * record's zi_timer field. Thus if a handler is configured with
         * a single lane with a 10ms latency, it will delay requests
         * such that only a single request is completed every 10ms. So,
         * if more than one request is attempted per each 10ms interval,
         * the average latency of the requests will be greater than
         * 10ms; but if only a single request is submitted each 10ms
         * interval the average latency will be 10ms.
         *
         * We need to acquire this mutex to prevent multiple concurrent
         * threads being assigned to the same lane of a given inject
         * handler. The mutex allows us to perform the following two
         * operations atomically:
         *
         *      1. determine the minimum handler and minimum target
         *         value of all the possible handlers
         *      2. update that minimum handler's lane array
         *
         * Without atomicity, two (or more) threads could pick the same
         * lane in step (1), and then conflict with each other in step
         * (2). This could allow a single lane handler to process
         * multiple requests simultaneously, which shouldn't be possible.
         */
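        /*
         * (Worked example with hypothetical numbers: one handler with a
         * single lane and a 10ms latency receives three requests
         * arriving together at time t. The first is targeted to
         * complete at t+10ms, the second at t+20ms, and the third at
         * t+30ms, since each must wait for the lane to become idle.)
         */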
        mutex_enter(&inject_delay_mtx);

        for (inject_handler_t *handler = list_head(&inject_handlers);
            handler != NULL; handler = list_next(&inject_handlers, handler)) {
                if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
                        continue;

                if (!freq_triggered(handler->zi_record.zi_freq))
                        continue;

                if (vd->vdev_guid != handler->zi_record.zi_guid)
                        continue;

                /*
                 * Defensive; should never happen as the array allocation
                 * occurs prior to inserting this handler on the list.
                 */
                ASSERT3P(handler->zi_lanes, !=, NULL);

                /*
                 * This should never happen, the zinject command should
                 * prevent a user from setting an IO delay with zero lanes.
                 */
                ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);

                ASSERT3U(handler->zi_record.zi_nlanes, >,
                    handler->zi_next_lane);

                /*
                 * We want to issue this IO to the lane that will become
                 * idle the soonest, so we compare the soonest this
                 * specific handler can complete the IO with all other
                 * handlers, to find the lowest value of all possible
                 * lanes. We then use this lane to submit the request.
                 *
                 * Since each handler has a constant value for its
                 * delay, we can just use the "next" lane for that
                 * handler; as it will always be the lane with the
                 * lowest value for that particular handler (i.e. the
                 * lane that will become idle the soonest). This saves a
                 * scan of each handler's lanes array.
                 *
                 * There are two cases to consider when determining when
                 * this specific IO request should complete. If this
                 * lane is idle, we want to "submit" the request now so
                 * it will complete after zi_timer milliseconds. Thus,
                 * we set the target to now + zi_timer.
                 *
                 * If the lane is busy, we want this request to complete
                 * zi_timer milliseconds after the lane becomes idle.
                 * Since the 'zi_lanes' array holds the time at which
                 * each lane will become idle, we use that value to
                 * determine when this request should complete.
                 */
                hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
                hrtime_t busy = handler->zi_record.zi_timer +
                    handler->zi_lanes[handler->zi_next_lane];
                hrtime_t target = MAX(idle, busy);

                if (min_handler == NULL) {
                        min_handler = handler;
                        min_target = target;
                        continue;
                }

                ASSERT3P(min_handler, !=, NULL);
                ASSERT3U(min_target, !=, 0);

                /*
                 * We don't yet increment the "next lane" variable since
                 * we still might find a lower value lane in another
                 * handler during any remaining iterations. Once we're
                 * sure we've selected the absolute minimum, we'll claim
                 * the lane and increment the handler's "next lane"
                 * field below.
                 */

                if (target < min_target) {
                        min_handler = handler;
                        min_target = target;
                }
        }

        /*
         * 'min_handler' will be NULL if no IO delays are registered for
         * this vdev, otherwise it will point to the handler containing
         * the lane that will become idle the soonest.
         */
        if (min_handler != NULL) {
                ASSERT3U(min_target, !=, 0);
                min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;

                /*
                 * If we've used all possible lanes for this handler,
                 * loop back and start using the first lane again;
                 * otherwise, just increment the lane index.
                 */
                min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
                    min_handler->zi_record.zi_nlanes;
        }

        mutex_exit(&inject_delay_mtx);
        rw_exit(&inject_lock);

        return (min_target);
}

static int
zio_calculate_range(const char *pool, zinject_record_t *record)
{
        dsl_pool_t *dp;
        dsl_dataset_t *ds;
        objset_t *os = NULL;
        dnode_t *dn = NULL;
        int error;

        /*
         * Obtain the dnode for object using pool, objset, and object
         */
        error = dsl_pool_hold(pool, FTAG, &dp);
        if (error)
                return (error);

        error = dsl_dataset_hold_obj(dp, record->zi_objset, FTAG, &ds);
        dsl_pool_rele(dp, FTAG);
        if (error)
                return (error);

        error = dmu_objset_from_ds(ds, &os);
        dsl_dataset_rele(ds, FTAG);
        if (error)
                return (error);

        error = dnode_hold(os, record->zi_object, FTAG, &dn);
        if (error)
                return (error);

        /*
         * Translate the range into block IDs
         */
        if (record->zi_start != 0 || record->zi_end != -1ULL) {
                record->zi_start >>= dn->dn_datablkshift;
                record->zi_end >>= dn->dn_datablkshift;
        }
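        /*
         * (For example, with 128K data blocks, i.e. dn_datablkshift ==
         * 17, a byte range of [0, 262143] maps to block IDs [0, 1].)
         */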
        if (record->zi_level > 0) {
                if (record->zi_level >= dn->dn_nlevels) {
                        dnode_rele(dn, FTAG);
                        return (SET_ERROR(EDOM));
                }

                if (record->zi_start != 0 || record->zi_end != 0) {
                        int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

                        for (int level = record->zi_level; level > 0; level--) {
                                record->zi_start >>= shift;
                                record->zi_end >>= shift;
                        }
                }
        }

        dnode_rele(dn, FTAG);
        return (0);
}

/*
 * Create a new handler for the given record.  We add it to the list, adding
 * a reference to the spa_t in the process.  We increment zio_injection_enabled,
 * which is the switch to trigger all fault injection.
 */
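/*
 * (This function is reached from user space via the ZFS ioctl layer;
 * in current OpenZFS the entry point is the zinject ioctl handler in
 * zfs_ioctl.c, though the exact call path is not shown here.)
 */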
int
zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
{
        inject_handler_t *handler;
        int error;
        spa_t *spa;

        /*
         * If this is pool-wide metadata, make sure we unload the corresponding
         * spa_t, so that the next attempt to load it will trigger the fault.
         * We call spa_reset() to unload the pool appropriately.
         */
        if (flags & ZINJECT_UNLOAD_SPA)
                if ((error = spa_reset(name)) != 0)
                        return (error);

        if (record->zi_cmd == ZINJECT_DELAY_IO) {
                /*
                 * A value of zero for the number of lanes or for the
                 * delay time doesn't make sense.
                 */
                if (record->zi_timer == 0 || record->zi_nlanes == 0)
                        return (SET_ERROR(EINVAL));

                /*
                 * The number of lanes is directly mapped to the size of
                 * an array used by the handler. Thus, to ensure the
                 * user doesn't trigger an allocation that's "too large"
                 * we cap the number of lanes here.
                 */
                if (record->zi_nlanes >= UINT16_MAX)
                        return (SET_ERROR(EINVAL));
        }

        /*
         * If the supplied range was in bytes -- calculate the actual blkid
         */
        if (flags & ZINJECT_CALC_RANGE) {
                error = zio_calculate_range(name, record);
                if (error != 0)
                        return (error);
        }

        if (!(flags & ZINJECT_NULL)) {
                /*
                 * spa_inject_addref() will add an injection reference,
                 * which will prevent the pool from being removed from the
                 * namespace while still allowing it to be unloaded.
                 */
                if ((spa = spa_inject_addref(name)) == NULL)
                        return (SET_ERROR(ENOENT));

                handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);

                handler->zi_spa = spa;
                handler->zi_record = *record;

                if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
                        handler->zi_lanes = kmem_zalloc(
                            sizeof (*handler->zi_lanes) *
                            handler->zi_record.zi_nlanes, KM_SLEEP);
                        handler->zi_next_lane = 0;
                } else {
                        handler->zi_lanes = NULL;
                        handler->zi_next_lane = 0;
                }

                rw_enter(&inject_lock, RW_WRITER);

                /*
                 * We can't move this increment into the conditional
                 * above because we need to hold the RW_WRITER lock of
                 * inject_lock, and we don't want to hold that while
                 * allocating the handler's zi_lanes array.
                 */
                if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
                        ASSERT3S(inject_delay_count, >=, 0);
                        inject_delay_count++;
                        ASSERT3S(inject_delay_count, >, 0);
                }

                *id = handler->zi_id = inject_next_id++;
                list_insert_tail(&inject_handlers, handler);
                atomic_inc_32(&zio_injection_enabled);

                rw_exit(&inject_lock);
        }

        /*
         * Flush the ARC, so that any attempts to read this data will end up
         * going to the ZIO layer.  Note that this is a little overkill, but
         * we don't have the necessary ARC interfaces to do anything else, and
         * fault injection isn't a performance critical path.
         */
        if (flags & ZINJECT_FLUSH_ARC)
                /*
                 * We must use FALSE to ensure arc_flush returns, since
                 * we're not preventing concurrent ARC insertions.
                 */
                arc_flush(NULL, FALSE);

        return (0);
}

/*
 * Returns the next record with an ID greater than that supplied to the
 * function.  Used to iterate over all handlers in the system.
 */
int
zio_inject_list_next(int *id, char *name, size_t buflen,
    zinject_record_t *record)
{
        inject_handler_t *handler;
        int ret;

        mutex_enter(&spa_namespace_lock);
        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler))
                if (handler->zi_id > *id)
                        break;

        if (handler) {
                *record = handler->zi_record;
                *id = handler->zi_id;
                (void) strlcpy(name, spa_name(handler->zi_spa), buflen);
                ret = 0;
        } else {
                ret = SET_ERROR(ENOENT);
        }

        rw_exit(&inject_lock);
        mutex_exit(&spa_namespace_lock);

        return (ret);
}

/*
 * Clear the fault handler with the given identifier, or return ENOENT if none
 * exists.
 */
int
zio_clear_fault(int id)
{
        inject_handler_t *handler;

        rw_enter(&inject_lock, RW_WRITER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler))
                if (handler->zi_id == id)
                        break;

        if (handler == NULL) {
                rw_exit(&inject_lock);
                return (SET_ERROR(ENOENT));
        }

        if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
                ASSERT3S(inject_delay_count, >, 0);
                inject_delay_count--;
                ASSERT3S(inject_delay_count, >=, 0);
        }

        list_remove(&inject_handlers, handler);
        rw_exit(&inject_lock);

        if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
                ASSERT3P(handler->zi_lanes, !=, NULL);
                kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
                    handler->zi_record.zi_nlanes);
        } else {
                ASSERT3P(handler->zi_lanes, ==, NULL);
        }

        spa_inject_delref(handler->zi_spa);
        kmem_free(handler, sizeof (inject_handler_t));
        atomic_dec_32(&zio_injection_enabled);

        return (0);
}

void
zio_inject_init(void)
{
        rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
        mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
        list_create(&inject_handlers, sizeof (inject_handler_t),
            offsetof(inject_handler_t, zi_link));
}

void
zio_inject_fini(void)
{
        list_destroy(&inject_handlers);
        mutex_destroy(&inject_delay_mtx);
        rw_destroy(&inject_lock);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(zio_injection_enabled);
EXPORT_SYMBOL(zio_inject_fault);
EXPORT_SYMBOL(zio_inject_list_next);
EXPORT_SYMBOL(zio_clear_fault);
EXPORT_SYMBOL(zio_handle_fault_injection);
EXPORT_SYMBOL(zio_handle_device_injection);
EXPORT_SYMBOL(zio_handle_label_injection);
#endif





This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.