| 
     1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 /*
   22  * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
   23  * Copyright (c) 2019 by Delphix. All rights reserved.
   24  */
   25 
   26 /*
   27  * ARC buffer data (ABD).
   28  *
   29  * ABDs are an abstract data structure for the ARC which can use two
   30  * different ways of storing the underlying data:
   31  *
   32  * (a) Linear buffer. In this case, all the data in the ABD is stored in one
   33  *     contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
   34  *
   35  *         +-------------------+
   36  *         | ABD (linear)      |
   37  *         |   abd_flags = ... |
   38  *         |   abd_size = ...  |     +--------------------------------+
   39  *         |   abd_buf ------------->| raw buffer of size abd_size    |
   40  *         +-------------------+     +--------------------------------+
   41  *              no abd_chunks
   42  *
   43  * (b) Scattered buffer. In this case, the data in the ABD is split into
   44  *     equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
   45  *     to the chunks recorded in an array at the end of the ABD structure.
   46  *
   47  *         +-------------------+
   48  *         | ABD (scattered)   |
   49  *         |   abd_flags = ... |
   50  *         |   abd_size = ...  |
   51  *         |   abd_offset = 0  |                           +-----------+
   52  *         |   abd_chunks[0] ----------------------------->| chunk 0   |
   53  *         |   abd_chunks[1] ---------------------+        +-----------+
   54  *         |   ...             |                  |        +-----------+
   55  *         |   abd_chunks[N-1] ---------+         +------->| chunk 1   |
   56  *         +-------------------+        |                  +-----------+
   57  *                                      |                      ...
   58  *                                      |                  +-----------+
   59  *                                      +----------------->| chunk N-1 |
   60  *                                                         +-----------+
   61  *
   62  * In addition to directly allocating a linear or scattered ABD, it is also
   63  * possible to create an ABD by requesting the "sub-ABD" starting at an offset
   64  * within an existing ABD. In linear buffers this is simple (set abd_buf of
   65  * the new ABD to the starting point within the original raw buffer), but
   66  * scattered ABDs are a little more complex. The new ABD makes a copy of the
   67  * relevant abd_chunks pointers (but not the underlying data). However, to
   68  * provide arbitrary rather than only chunk-aligned starting offsets, it also
   69  * tracks an abd_offset field which represents the starting point of the data
   70  * within the first chunk in abd_chunks. For both linear and scattered ABDs,
   71  * creating an offset ABD marks the original ABD as the offset's parent, and the
   72  * original ABD's abd_children refcount is incremented. This data allows us to
   73  * ensure the root ABD isn't deleted before its children.
   74  *
   75  * Most consumers should never need to know what type of ABD they're using --
   76  * the ABD public API ensures that it's possible to transparently switch from
   77  * using a linear ABD to a scattered one when doing so would be beneficial.
   78  *
   79  * If you need to use the data within an ABD directly, if you know it's linear
   80  * (because you allocated it) you can use abd_to_buf() to access the underlying
   81  * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions
   82  * which will allocate a raw buffer if necessary. Use the abd_return_buf*
   83  * functions to return any raw buffers that are no longer necessary when you're
   84  * done using them.
   85  *
   86  * There are a variety of ABD APIs that implement basic buffer operations:
   87  * compare, copy, read, write, and fill with zeroes. If you need a custom
   88  * function which progressively accesses the whole ABD, use the abd_iterate_*
   89  * functions.
   90  *
   91  * As an additional feature, linear and scatter ABDs can be stitched together
   92  * by using the gang ABD type (abd_alloc_gang()). This allows for
   93  * multiple ABDs to be viewed as a singular ABD.
   94  *
   95  * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
   96  * B_FALSE.
   97  */
   98 
   99 #include <sys/abd_impl.h>
  100 #include <sys/param.h>
  101 #include <sys/zio.h>
  102 #include <sys/zfs_context.h>
  103 #include <sys/zfs_znode.h>
  104 
  105 /* B_FALSE makes every ABD linear; see the block comment above. */
  106 int zfs_abd_scatter_enabled = B_TRUE;
  107 
  108 void
  109 abd_verify(abd_t *abd)
  110 {
  111 #ifdef ZFS_DEBUG
  112         ASSERT3U(abd->abd_size, >, 0);
  113         ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
  114         ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
  115             ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
  116             ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG |
  117             ABD_FLAG_GANG_FREE | ABD_FLAG_ZEROS | ABD_FLAG_ALLOCD));
  118         IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
  119         IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
  120         if (abd_is_linear(abd)) {
  121                 ASSERT3P(ABD_LINEAR_BUF(abd), !=, NULL);
  122         } else if (abd_is_gang(abd)) {
  123                 uint_t child_sizes = 0;
  124                 for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain);
  125                     cabd != NULL;
  126                     cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
  127                         ASSERT(list_link_active(&cabd->abd_gang_link));
  128                         child_sizes += cabd->abd_size;
  129                         abd_verify(cabd);
  130                 }
  131                 ASSERT3U(abd->abd_size, ==, child_sizes);
  132         } else {
  133                 abd_verify_scatter(abd);
  134         }
  135 #endif
  136 }
  137 
  138 static void
  139 abd_init_struct(abd_t *abd)
  140 {
  141         list_link_init(&abd->abd_gang_link);
  142         mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL);
  143         abd->abd_flags = 0;
  144 #ifdef ZFS_DEBUG
  145         zfs_refcount_create(&abd->abd_children);
  146         abd->abd_parent = NULL;
  147 #endif
  148         abd->abd_size = 0;
  149 }
  150 
  151 static void
  152 abd_fini_struct(abd_t *abd)
  153 {
  154         mutex_destroy(&abd->abd_mtx);
  155         ASSERT(!list_link_active(&abd->abd_gang_link));
  156 #ifdef ZFS_DEBUG
  157         zfs_refcount_destroy(&abd->abd_children);
  158 #endif
  159 }
  160 
  161 abd_t *
  162 abd_alloc_struct(size_t size)
  163 {
  164         abd_t *abd = abd_alloc_struct_impl(size);
  165         abd_init_struct(abd);
  166         abd->abd_flags |= ABD_FLAG_ALLOCD;
  167         return (abd);
  168 }
  169 
  170 void
  171 abd_free_struct(abd_t *abd)
  172 {
  173         abd_fini_struct(abd);
  174         abd_free_struct_impl(abd);
  175 }
  176 
  177 /*
  178  * Allocate an ABD, along with its own underlying data buffers. Use this if you
  179  * don't care whether the ABD is linear or not.
  180  */
  181 abd_t *
  182 abd_alloc(size_t size, boolean_t is_metadata)
  183 {
  184         if (abd_size_alloc_linear(size))
  185                 return (abd_alloc_linear(size, is_metadata));
  186 
  187         VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
  188 
  189         abd_t *abd = abd_alloc_struct(size);
  190         abd->abd_flags |= ABD_FLAG_OWNER;
  191         abd->abd_u.abd_scatter.abd_offset = 0;
  192         abd_alloc_chunks(abd, size);
  193 
  194         if (is_metadata) {
  195                 abd->abd_flags |= ABD_FLAG_META;
  196         }
  197         abd->abd_size = size;
  198 
  199         abd_update_scatter_stats(abd, ABDSTAT_INCR);
  200 
  201         return (abd);
  202 }
  203 
  204 /*
  205  * Allocate an ABD that must be linear, along with its own underlying data
  206  * buffer. Only use this when it would be very annoying to write your ABD
  207  * consumer with a scattered ABD.
  208  */
  209 abd_t *
  210 abd_alloc_linear(size_t size, boolean_t is_metadata)
  211 {
  212         abd_t *abd = abd_alloc_struct(0);
  213 
  214         VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
  215 
  216         abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
  217         if (is_metadata) {
  218                 abd->abd_flags |= ABD_FLAG_META;
  219         }
  220         abd->abd_size = size;
  221 
  222         if (is_metadata) {
  223                 ABD_LINEAR_BUF(abd) = zio_buf_alloc(size);
  224         } else {
  225                 ABD_LINEAR_BUF(abd) = zio_data_buf_alloc(size);
  226         }
  227 
  228         abd_update_linear_stats(abd, ABDSTAT_INCR);
  229 
  230         return (abd);
  231 }
  232 
  233 static void
  234 abd_free_linear(abd_t *abd)
  235 {
  236         if (abd_is_linear_page(abd)) {
  237                 abd_free_linear_page(abd);
  238                 return;
  239         }
  240         if (abd->abd_flags & ABD_FLAG_META) {
  241                 zio_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size);
  242         } else {
  243                 zio_data_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size);
  244         }
  245 
  246         abd_update_linear_stats(abd, ABDSTAT_DECR);
  247 }
  248 
  249 static void
  250 abd_free_gang(abd_t *abd)
  251 {
  252         ASSERT(abd_is_gang(abd));
  253         abd_t *cabd;
  254 
  255         while ((cabd = list_head(&ABD_GANG(abd).abd_gang_chain)) != NULL) {
  256                 /*
  257                  * We must acquire the child ABDs mutex to ensure that if it
  258                  * is being added to another gang ABD we will set the link
  259                  * as inactive when removing it from this gang ABD and before
  260                  * adding it to the other gang ABD.
  261                  */
  262                 mutex_enter(&cabd->abd_mtx);
  263                 ASSERT(list_link_active(&cabd->abd_gang_link));
  264                 list_remove(&ABD_GANG(abd).abd_gang_chain, cabd);
  265                 mutex_exit(&cabd->abd_mtx);
  266                 if (cabd->abd_flags & ABD_FLAG_GANG_FREE)
  267                         abd_free(cabd);
  268         }
  269         list_destroy(&ABD_GANG(abd).abd_gang_chain);
  270 }
  271 
  272 static void
  273 abd_free_scatter(abd_t *abd)
  274 {
  275         abd_free_chunks(abd);
  276         abd_update_scatter_stats(abd, ABDSTAT_DECR);
  277 }
  278 
  279 /*
  280  * Free an ABD.  Use with any kind of abd: those created with abd_alloc_*()
  281  * and abd_get_*(), including abd_get_offset_struct().
  282  *
  283  * If the ABD was created with abd_alloc_*(), the underlying data
  284  * (scatterlist or linear buffer) will also be freed.  (Subject to ownership
  285  * changes via abd_*_ownership_of_buf().)
  286  *
  287  * Unless the ABD was created with abd_get_offset_struct(), the abd_t will
  288  * also be freed.
  289  */
  290 void
  291 abd_free(abd_t *abd)
  292 {
  293         if (abd == NULL)
  294                 return;
  295 
  296         abd_verify(abd);
  297 #ifdef ZFS_DEBUG
  298         IMPLY(abd->abd_flags & ABD_FLAG_OWNER, abd->abd_parent == NULL);
  299 #endif
  300 
  301         if (abd_is_gang(abd)) {
  302                 abd_free_gang(abd);
  303         } else if (abd_is_linear(abd)) {
  304                 if (abd->abd_flags & ABD_FLAG_OWNER)
  305                         abd_free_linear(abd);
  306         } else {
  307                 if (abd->abd_flags & ABD_FLAG_OWNER)
  308                         abd_free_scatter(abd);
  309         }
  310 
  311 #ifdef ZFS_DEBUG
  312         if (abd->abd_parent != NULL) {
  313                 (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children,
  314                     abd->abd_size, abd);
  315         }
  316 #endif
  317 
  318         abd_fini_struct(abd);
  319         if (abd->abd_flags & ABD_FLAG_ALLOCD)
  320                 abd_free_struct_impl(abd);
  321 }
  322 
  323 /*
  324  * Allocate an ABD of the same format (same metadata flag, same scatterize
  325  * setting) as another ABD.
  326  */
  327 abd_t *
  328 abd_alloc_sametype(abd_t *sabd, size_t size)
  329 {
  330         boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
  331         if (abd_is_linear(sabd) &&
  332             !abd_is_linear_page(sabd)) {
  333                 return (abd_alloc_linear(size, is_metadata));
  334         } else {
  335                 return (abd_alloc(size, is_metadata));
  336         }
  337 }
  338 
  339 /*
  340  * Create gang ABD that will be the head of a list of ABD's. This is used
  341  * to "chain" scatter/gather lists together when constructing aggregated
  342  * IO's. To free this abd, abd_free() must be called.
  343  */
  344 abd_t *
  345 abd_alloc_gang(void)
  346 {
  347         abd_t *abd = abd_alloc_struct(0);
  348         abd->abd_flags |= ABD_FLAG_GANG | ABD_FLAG_OWNER;
  349         list_create(&ABD_GANG(abd).abd_gang_chain,
  350             sizeof (abd_t), offsetof(abd_t, abd_gang_link));
  351         return (abd);
  352 }
  353 
  354 /*
  355  * Add a child gang ABD to a parent gang ABDs chained list.
  356  */
  357 static void
  358 abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free)
  359 {
  360         ASSERT(abd_is_gang(pabd));
  361         ASSERT(abd_is_gang(cabd));
  362 
  363         if (free_on_free) {
  364                 /*
  365                  * If the parent is responsible for freeing the child gang
  366                  * ABD we will just splice the child's children ABD list to
  367                  * the parent's list and immediately free the child gang ABD
  368                  * struct. The parent gang ABDs children from the child gang
  369                  * will retain all the free_on_free settings after being
  370                  * added to the parents list.
  371                  */
  372                 pabd->abd_size += cabd->abd_size;
  373                 list_move_tail(&ABD_GANG(pabd).abd_gang_chain,
  374                     &ABD_GANG(cabd).abd_gang_chain);
  375                 ASSERT(list_is_empty(&ABD_GANG(cabd).abd_gang_chain));
  376                 abd_verify(pabd);
  377                 abd_free(cabd);
  378         } else {
  379                 for (abd_t *child = list_head(&ABD_GANG(cabd).abd_gang_chain);
  380                     child != NULL;
  381                     child = list_next(&ABD_GANG(cabd).abd_gang_chain, child)) {
  382                         /*
  383                          * We always pass B_FALSE for free_on_free as it is the
  384                          * original child gang ABDs responsibility to determine
  385                          * if any of its child ABDs should be free'd on the call
  386                          * to abd_free().
  387                          */
  388                         abd_gang_add(pabd, child, B_FALSE);
  389                 }
  390                 abd_verify(pabd);
  391         }
  392 }
  393 
  394 /*
  395  * Add a child ABD to a gang ABD's chained list.
  396  */
  397 void
  398 abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free)
  399 {
  400         ASSERT(abd_is_gang(pabd));
  401         abd_t *child_abd = NULL;
  402 
  403         /*
  404          * If the child being added is a gang ABD, we will add the
  405          * child's ABDs to the parent gang ABD. This allows us to account
  406          * for the offset correctly in the parent gang ABD.
  407          */
  408         if (abd_is_gang(cabd)) {
  409                 ASSERT(!list_link_active(&cabd->abd_gang_link));
  410                 ASSERT(!list_is_empty(&ABD_GANG(cabd).abd_gang_chain));
  411                 return (abd_gang_add_gang(pabd, cabd, free_on_free));
  412         }
  413         ASSERT(!abd_is_gang(cabd));
  414 
  415         /*
  416          * In order to verify that an ABD is not already part of
  417          * another gang ABD, we must lock the child ABD's abd_mtx
  418          * to check its abd_gang_link status. We unlock the abd_mtx
  419          * only after it is has been added to a gang ABD, which
  420          * will update the abd_gang_link's status. See comment below
  421          * for how an ABD can be in multiple gang ABD's simultaneously.
  422          */
  423         mutex_enter(&cabd->abd_mtx);
  424         if (list_link_active(&cabd->abd_gang_link)) {
  425                 /*
  426                  * If the child ABD is already part of another
  427                  * gang ABD then we must allocate a new
  428                  * ABD to use a separate link. We mark the newly
  429                  * allocated ABD with ABD_FLAG_GANG_FREE, before
  430                  * adding it to the gang ABD's list, to make the
  431                  * gang ABD aware that it is responsible to call
  432                  * abd_free(). We use abd_get_offset() in order
  433                  * to just allocate a new ABD but avoid copying the
  434                  * data over into the newly allocated ABD.
  435                  *
  436                  * An ABD may become part of multiple gang ABD's. For
  437                  * example, when writing ditto bocks, the same ABD
  438                  * is used to write 2 or 3 locations with 2 or 3
  439                  * zio_t's. Each of the zio's may be aggregated with
  440                  * different adjacent zio's. zio aggregation uses gang
  441                  * zio's, so the single ABD can become part of multiple
  442                  * gang zio's.
  443                  *
  444                  * The ASSERT below is to make sure that if
  445                  * free_on_free is passed as B_TRUE, the ABD can
  446                  * not be in multiple gang ABD's. The gang ABD
  447                  * can not be responsible for cleaning up the child
  448                  * ABD memory allocation if the ABD can be in
  449                  * multiple gang ABD's at one time.
  450                  */
  451                 ASSERT3B(free_on_free, ==, B_FALSE);
  452                 child_abd = abd_get_offset(cabd, 0);
  453                 child_abd->abd_flags |= ABD_FLAG_GANG_FREE;
  454         } else {
  455                 child_abd = cabd;
  456                 if (free_on_free)
  457                         child_abd->abd_flags |= ABD_FLAG_GANG_FREE;
  458         }
  459         ASSERT3P(child_abd, !=, NULL);
  460 
  461         list_insert_tail(&ABD_GANG(pabd).abd_gang_chain, child_abd);
  462         mutex_exit(&cabd->abd_mtx);
  463         pabd->abd_size += child_abd->abd_size;
  464 }
  465 
  466 /*
  467  * Locate the ABD for the supplied offset in the gang ABD.
  468  * Return a new offset relative to the returned ABD.
  469  */
  470 abd_t *
  471 abd_gang_get_offset(abd_t *abd, size_t *off)
  472 {
  473         abd_t *cabd;
  474 
  475         ASSERT(abd_is_gang(abd));
  476         ASSERT3U(*off, <, abd->abd_size);
  477         for (cabd = list_head(&ABD_GANG(abd).abd_gang_chain); cabd != NULL;
  478             cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
  479                 if (*off >= cabd->abd_size)
  480                         *off -= cabd->abd_size;
  481                 else
  482                         return (cabd);
  483         }
  484         VERIFY3P(cabd, !=, NULL);
  485         return (cabd);
  486 }
  487 
  488 /*
  489  * Allocate a new ABD, using the provided struct (if non-NULL, and if
  490  * circumstances allow - otherwise allocate the struct).  The returned ABD will
  491  * point to offset off of sabd. It shares the underlying buffer data with sabd.
  492  * Use abd_free() to free.  sabd must not be freed while any derived ABDs exist.
  493  */
  494 static abd_t *
  495 abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size)
  496 {
  497         abd_verify(sabd);
  498         ASSERT3U(off + size, <=, sabd->abd_size);
  499 
  500         if (abd_is_linear(sabd)) {
  501                 if (abd == NULL)
  502                         abd = abd_alloc_struct(0);
  503                 /*
  504                  * Even if this buf is filesystem metadata, we only track that
  505                  * if we own the underlying data buffer, which is not true in
  506                  * this case. Therefore, we don't ever use ABD_FLAG_META here.
  507                  */
  508                 abd->abd_flags |= ABD_FLAG_LINEAR;
  509 
  510                 ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off;
  511         } else if (abd_is_gang(sabd)) {
  512                 size_t left = size;
  513                 if (abd == NULL) {
  514                         abd = abd_alloc_gang();
  515                 } else {
  516                         abd->abd_flags |= ABD_FLAG_GANG;
  517                         list_create(&ABD_GANG(abd).abd_gang_chain,
  518                             sizeof (abd_t), offsetof(abd_t, abd_gang_link));
  519                 }
  520 
  521                 abd->abd_flags &= ~ABD_FLAG_OWNER;
  522                 for (abd_t *cabd = abd_gang_get_offset(sabd, &off);
  523                     cabd != NULL && left > 0;
  524                     cabd = list_next(&ABD_GANG(sabd).abd_gang_chain, cabd)) {
  525                         int csize = MIN(left, cabd->abd_size - off);
  526 
  527                         abd_t *nabd = abd_get_offset_size(cabd, off, csize);
  528                         abd_gang_add(abd, nabd, B_TRUE);
  529                         left -= csize;
  530                         off = 0;
  531                 }
  532                 ASSERT3U(left, ==, 0);
  533         } else {
  534                 abd = abd_get_offset_scatter(abd, sabd, off, size);
  535         }
  536 
  537         ASSERT3P(abd, !=, NULL);
  538         abd->abd_size = size;
  539 #ifdef ZFS_DEBUG
  540         abd->abd_parent = sabd;
  541         (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
  542 #endif
  543         return (abd);
  544 }
  545 
  546 /*
  547  * Like abd_get_offset_size(), but memory for the abd_t is provided by the
  548  * caller.  Using this routine can improve performance by avoiding the cost
  549  * of allocating memory for the abd_t struct, and updating the abd stats.
  550  * Usually, the provided abd is returned, but in some circumstances (FreeBSD,
  551  * if sabd is scatter and size is more than 2 pages) a new abd_t may need to
  552  * be allocated.  Therefore callers should be careful to use the returned
  553  * abd_t*.
  554  */
  555 abd_t *
  556 abd_get_offset_struct(abd_t *abd, abd_t *sabd, size_t off, size_t size)
  557 {
  558         abd_t *result;
  559         abd_init_struct(abd);
  560         result = abd_get_offset_impl(abd, sabd, off, size);
  561         if (result != abd)
  562                 abd_fini_struct(abd);
  563         return (result);
  564 }
  565 
  566 abd_t *
  567 abd_get_offset(abd_t *sabd, size_t off)
  568 {
  569         size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0;
  570         VERIFY3U(size, >, 0);
  571         return (abd_get_offset_impl(NULL, sabd, off, size));
  572 }
  573 
  574 abd_t *
  575 abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
  576 {
  577         ASSERT3U(off + size, <=, sabd->abd_size);
  578         return (abd_get_offset_impl(NULL, sabd, off, size));
  579 }
  580 
  581 /*
  582  * Return a size scatter ABD containing only zeros.
  583  */
  584 abd_t *
  585 abd_get_zeros(size_t size)
  586 {
  587         ASSERT3P(abd_zero_scatter, !=, NULL);
  588         ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
  589         return (abd_get_offset_size(abd_zero_scatter, 0, size));
  590 }
  591 
  592 /*
  593  * Allocate a linear ABD structure for buf.
  594  */
  595 abd_t *
  596 abd_get_from_buf(void *buf, size_t size)
  597 {
  598         abd_t *abd = abd_alloc_struct(0);
  599 
  600         VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
  601 
  602         /*
  603          * Even if this buf is filesystem metadata, we only track that if we
  604          * own the underlying data buffer, which is not true in this case.
  605          * Therefore, we don't ever use ABD_FLAG_META here.
  606          */
  607         abd->abd_flags |= ABD_FLAG_LINEAR;
  608         abd->abd_size = size;
  609 
  610         ABD_LINEAR_BUF(abd) = buf;
  611 
  612         return (abd);
  613 }
  614 
  615 /*
  616  * Get the raw buffer associated with a linear ABD.
  617  */
  618 void *
  619 abd_to_buf(abd_t *abd)
  620 {
  621         ASSERT(abd_is_linear(abd));
  622         abd_verify(abd);
  623         return (ABD_LINEAR_BUF(abd));
  624 }
  625 
  626 /*
  627  * Borrow a raw buffer from an ABD without copying the contents of the ABD
  628  * into the buffer. If the ABD is scattered, this will allocate a raw buffer
  629  * whose contents are undefined. To copy over the existing data in the ABD, use
  630  * abd_borrow_buf_copy() instead.
  631  */
  632 void *
  633 abd_borrow_buf(abd_t *abd, size_t n)
  634 {
  635         void *buf;
  636         abd_verify(abd);
  637         ASSERT3U(abd->abd_size, >=, n);
  638         if (abd_is_linear(abd)) {
  639                 buf = abd_to_buf(abd);
  640         } else {
  641                 buf = zio_buf_alloc(n);
  642         }
  643 #ifdef ZFS_DEBUG
  644         (void) zfs_refcount_add_many(&abd->abd_children, n, buf);
  645 #endif
  646         return (buf);
  647 }
  648 
  649 void *
  650 abd_borrow_buf_copy(abd_t *abd, size_t n)
  651 {
  652         void *buf = abd_borrow_buf(abd, n);
  653         if (!abd_is_linear(abd)) {
  654                 abd_copy_to_buf(buf, abd, n);
  655         }
  656         return (buf);
  657 }
  658 
  659 /*
  660  * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
  661  * not change the contents of the ABD and will ASSERT that you didn't modify
  662  * the buffer since it was borrowed. If you want any changes you made to buf to
  663  * be copied back to abd, use abd_return_buf_copy() instead.
  664  */
  665 void
  666 abd_return_buf(abd_t *abd, void *buf, size_t n)
  667 {
  668         abd_verify(abd);
  669         ASSERT3U(abd->abd_size, >=, n);
  670 #ifdef ZFS_DEBUG
  671         (void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
  672 #endif
  673         if (abd_is_linear(abd)) {
  674                 ASSERT3P(buf, ==, abd_to_buf(abd));
  675         } else {
  676                 ASSERT0(abd_cmp_buf(abd, buf, n));
  677                 zio_buf_free(buf, n);
  678         }
  679 }
  680 
  681 void
  682 abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
  683 {
  684         if (!abd_is_linear(abd)) {
  685                 abd_copy_from_buf(abd, buf, n);
  686         }
  687         abd_return_buf(abd, buf, n);
  688 }
  689 
  690 void
  691 abd_release_ownership_of_buf(abd_t *abd)
  692 {
  693         ASSERT(abd_is_linear(abd));
  694         ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
  695 
  696         /*
  697          * abd_free() needs to handle LINEAR_PAGE ABD's specially.
  698          * Since that flag does not survive the
  699          * abd_release_ownership_of_buf() -> abd_get_from_buf() ->
  700          * abd_take_ownership_of_buf() sequence, we don't allow releasing
  701          * these "linear but not zio_[data_]buf_alloc()'ed" ABD's.
  702          */
  703         ASSERT(!abd_is_linear_page(abd));
  704 
  705         abd_verify(abd);
  706 
  707         abd->abd_flags &= ~ABD_FLAG_OWNER;
  708         /* Disable this flag since we no longer own the data buffer */
  709         abd->abd_flags &= ~ABD_FLAG_META;
  710 
  711         abd_update_linear_stats(abd, ABDSTAT_DECR);
  712 }
  713 
  714 
  715 /*
  716  * Give this ABD ownership of the buffer that it's storing. Can only be used on
  717  * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
  718  * with abd_alloc_linear() which subsequently released ownership of their buf
  719  * with abd_release_ownership_of_buf().
  720  */
  721 void
  722 abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
  723 {
  724         ASSERT(abd_is_linear(abd));
  725         ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
  726         abd_verify(abd);
  727 
  728         abd->abd_flags |= ABD_FLAG_OWNER;
  729         if (is_metadata) {
  730                 abd->abd_flags |= ABD_FLAG_META;
  731         }
  732 
  733         abd_update_linear_stats(abd, ABDSTAT_INCR);
  734 }
  735 
  736 /*
  737  * Initializes an abd_iter based on whether the abd is a gang ABD
  738  * or just a single ABD.
  739  */
  740 static inline abd_t *
  741 abd_init_abd_iter(abd_t *abd, struct abd_iter *aiter, size_t off)
  742 {
  743         abd_t *cabd = NULL;
  744 
  745         if (abd_is_gang(abd)) {
  746                 cabd = abd_gang_get_offset(abd, &off);
  747                 if (cabd) {
  748                         abd_iter_init(aiter, cabd);
  749                         abd_iter_advance(aiter, off);
  750                 }
  751         } else {
  752                 abd_iter_init(aiter, abd);
  753                 abd_iter_advance(aiter, off);
  754         }
  755         return (cabd);
  756 }
  757 
  758 /*
  759  * Advances an abd_iter. We have to be careful with gang ABD as
  760  * advancing could mean that we are at the end of a particular ABD and
  761  * must grab the ABD in the gang ABD's list.
  762  */
  763 static inline abd_t *
  764 abd_advance_abd_iter(abd_t *abd, abd_t *cabd, struct abd_iter *aiter,
  765     size_t len)
  766 {
  767         abd_iter_advance(aiter, len);
  768         if (abd_is_gang(abd) && abd_iter_at_end(aiter)) {
  769                 ASSERT3P(cabd, !=, NULL);
  770                 cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd);
  771                 if (cabd) {
  772                         abd_iter_init(aiter, cabd);
  773                         abd_iter_advance(aiter, 0);
  774                 }
  775         }
  776         return (cabd);
  777 }
  778 
  779 int
  780 abd_iterate_func(abd_t *abd, size_t off, size_t size,
  781     abd_iter_func_t *func, void *private)
  782 {
  783         struct abd_iter aiter;
  784         int ret = 0;
  785 
  786         if (size == 0)
  787                 return (0);
  788 
  789         abd_verify(abd);
  790         ASSERT3U(off + size, <=, abd->abd_size);
  791 
  792         boolean_t gang = abd_is_gang(abd);
  793         abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
  794 
  795         while (size > 0) {
  796                 /* If we are at the end of the gang ABD we are done */
  797                 if (gang && !c_abd)
  798                         break;
  799 
  800                 abd_iter_map(&aiter);
  801 
  802                 size_t len = MIN(aiter.iter_mapsize, size);
  803                 ASSERT3U(len, >, 0);
  804 
  805                 ret = func(aiter.iter_mapaddr, len, private);
  806 
  807                 abd_iter_unmap(&aiter);
  808 
  809                 if (ret != 0)
  810                         break;
  811 
  812                 size -= len;
  813                 c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
  814         }
  815 
  816         return (ret);
  817 }
  818 
/*
 * Private state handed to the abd_*_buf_off callbacks: a cursor into the
 * caller's flat buffer. Each callback advances arg_buf by the number of
 * bytes it has just handled.
 */
struct buf_arg {
        void *arg_buf;
};
  822 
  823 static int
  824 abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
  825 {
  826         struct buf_arg *ba_ptr = private;
  827 
  828         (void) memcpy(ba_ptr->arg_buf, buf, size);
  829         ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
  830 
  831         return (0);
  832 }
  833 
  834 /*
  835  * Copy abd to buf. (off is the offset in abd.)
  836  */
  837 void
  838 abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
  839 {
  840         struct buf_arg ba_ptr = { buf };
  841 
  842         (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
  843             &ba_ptr);
  844 }
  845 
  846 static int
  847 abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
  848 {
  849         int ret;
  850         struct buf_arg *ba_ptr = private;
  851 
  852         ret = memcmp(buf, ba_ptr->arg_buf, size);
  853         ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
  854 
  855         return (ret);
  856 }
  857 
  858 /*
  859  * Compare the contents of abd to buf. (off is the offset in abd.)
  860  */
  861 int
  862 abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
  863 {
  864         struct buf_arg ba_ptr = { (void *) buf };
  865 
  866         return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
  867 }
  868 
  869 static int
  870 abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
  871 {
  872         struct buf_arg *ba_ptr = private;
  873 
  874         (void) memcpy(buf, ba_ptr->arg_buf, size);
  875         ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
  876 
  877         return (0);
  878 }
  879 
  880 /*
  881  * Copy from buf to abd. (off is the offset in abd.)
  882  */
  883 void
  884 abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
  885 {
  886         struct buf_arg ba_ptr = { (void *) buf };
  887 
  888         (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
  889             &ba_ptr);
  890 }
  891 
  892 static int
  893 abd_zero_off_cb(void *buf, size_t size, void *private)
  894 {
  895         (void) private;
  896         (void) memset(buf, 0, size);
  897         return (0);
  898 }
  899 
  900 /*
  901  * Zero out the abd from a particular offset to the end.
  902  */
  903 void
  904 abd_zero_off(abd_t *abd, size_t off, size_t size)
  905 {
  906         (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
  907 }
  908 
  909 /*
  910  * Iterate over two ABDs and call func incrementally on the two ABDs' data in
  911  * equal-sized chunks (passed to func as raw buffers). func could be called many
  912  * times during this iteration.
  913  */
  914 int
  915 abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
  916     size_t size, abd_iter_func2_t *func, void *private)
  917 {
  918         int ret = 0;
  919         struct abd_iter daiter, saiter;
  920         boolean_t dabd_is_gang_abd, sabd_is_gang_abd;
  921         abd_t *c_dabd, *c_sabd;
  922 
  923         if (size == 0)
  924                 return (0);
  925 
  926         abd_verify(dabd);
  927         abd_verify(sabd);
  928 
  929         ASSERT3U(doff + size, <=, dabd->abd_size);
  930         ASSERT3U(soff + size, <=, sabd->abd_size);
  931 
  932         dabd_is_gang_abd = abd_is_gang(dabd);
  933         sabd_is_gang_abd = abd_is_gang(sabd);
  934         c_dabd = abd_init_abd_iter(dabd, &daiter, doff);
  935         c_sabd = abd_init_abd_iter(sabd, &saiter, soff);
  936 
  937         while (size > 0) {
  938                 /* if we are at the end of the gang ABD we are done */
  939                 if ((dabd_is_gang_abd && !c_dabd) ||
  940                     (sabd_is_gang_abd && !c_sabd))
  941                         break;
  942 
  943                 abd_iter_map(&daiter);
  944                 abd_iter_map(&saiter);
  945 
  946                 size_t dlen = MIN(daiter.iter_mapsize, size);
  947                 size_t slen = MIN(saiter.iter_mapsize, size);
  948                 size_t len = MIN(dlen, slen);
  949                 ASSERT(dlen > 0 || slen > 0);
  950 
  951                 ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
  952                     private);
  953 
  954                 abd_iter_unmap(&saiter);
  955                 abd_iter_unmap(&daiter);
  956 
  957                 if (ret != 0)
  958                         break;
  959 
  960                 size -= len;
  961                 c_dabd =
  962                     abd_advance_abd_iter(dabd, c_dabd, &daiter, len);
  963                 c_sabd =
  964                     abd_advance_abd_iter(sabd, c_sabd, &saiter, len);
  965         }
  966 
  967         return (ret);
  968 }
  969 
  970 static int
  971 abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
  972 {
  973         (void) private;
  974         (void) memcpy(dbuf, sbuf, size);
  975         return (0);
  976 }
  977 
  978 /*
  979  * Copy from sabd to dabd starting from soff and doff.
  980  */
  981 void
  982 abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
  983 {
  984         (void) abd_iterate_func2(dabd, sabd, doff, soff, size,
  985             abd_copy_off_cb, NULL);
  986 }
  987 
  988 static int
  989 abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
  990 {
  991         (void) private;
  992         return (memcmp(bufa, bufb, size));
  993 }
  994 
  995 /*
  996  * Compares the contents of two ABDs.
  997  */
  998 int
  999 abd_cmp(abd_t *dabd, abd_t *sabd)
 1000 {
 1001         ASSERT3U(dabd->abd_size, ==, sabd->abd_size);
 1002         return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size,
 1003             abd_cmp_cb, NULL));
 1004 }
 1005 
 1006 /*
 1007  * Iterate over code ABDs and a data ABD and call @func_raidz_gen.
 1008  *
 1009  * @cabds          parity ABDs, must have equal size
 1010  * @dabd           data ABD. Can be NULL (in this case @dsize = 0)
 1011  * @func_raidz_gen should be implemented so that its behaviour
 1012  *                 is the same when taking linear and when taking scatter
 1013  */
 1014 void
 1015 abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
 1016     ssize_t csize, ssize_t dsize, const unsigned parity,
 1017     void (*func_raidz_gen)(void **, const void *, size_t, size_t))
 1018 {
 1019         int i;
 1020         ssize_t len, dlen;
 1021         struct abd_iter caiters[3];
 1022         struct abd_iter daiter = {0};
 1023         void *caddrs[3];
 1024         unsigned long flags __maybe_unused = 0;
 1025         abd_t *c_cabds[3];
 1026         abd_t *c_dabd = NULL;
 1027         boolean_t cabds_is_gang_abd[3];
 1028         boolean_t dabd_is_gang_abd = B_FALSE;
 1029 
 1030         ASSERT3U(parity, <=, 3);
 1031 
 1032         for (i = 0; i < parity; i++) {
 1033                 cabds_is_gang_abd[i] = abd_is_gang(cabds[i]);
 1034                 c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], 0);
 1035         }
 1036 
 1037         if (dabd) {
 1038                 dabd_is_gang_abd = abd_is_gang(dabd);
 1039                 c_dabd = abd_init_abd_iter(dabd, &daiter, 0);
 1040         }
 1041 
 1042         ASSERT3S(dsize, >=, 0);
 1043 
 1044         abd_enter_critical(flags);
 1045         while (csize > 0) {
 1046                 /* if we are at the end of the gang ABD we are done */
 1047                 if (dabd_is_gang_abd && !c_dabd)
 1048                         break;
 1049 
 1050                 for (i = 0; i < parity; i++) {
 1051                         /*
 1052                          * If we are at the end of the gang ABD we are
 1053                          * done.
 1054                          */
 1055                         if (cabds_is_gang_abd[i] && !c_cabds[i])
 1056                                 break;
 1057                         abd_iter_map(&caiters[i]);
 1058                         caddrs[i] = caiters[i].iter_mapaddr;
 1059                 }
 1060 
 1061                 len = csize;
 1062 
 1063                 if (dabd && dsize > 0)
 1064                         abd_iter_map(&daiter);
 1065 
 1066                 switch (parity) {
 1067                         case 3:
 1068                                 len = MIN(caiters[2].iter_mapsize, len);
 1069                                 zfs_fallthrough;
 1070                         case 2:
 1071                                 len = MIN(caiters[1].iter_mapsize, len);
 1072                                 zfs_fallthrough;
 1073                         case 1:
 1074                                 len = MIN(caiters[0].iter_mapsize, len);
 1075                 }
 1076 
 1077                 /* must be progressive */
 1078                 ASSERT3S(len, >, 0);
 1079 
 1080                 if (dabd && dsize > 0) {
 1081                         /* this needs precise iter.length */
 1082                         len = MIN(daiter.iter_mapsize, len);
 1083                         dlen = len;
 1084                 } else
 1085                         dlen = 0;
 1086 
 1087                 /* must be progressive */
 1088                 ASSERT3S(len, >, 0);
 1089                 /*
 1090                  * The iterated function likely will not do well if each
 1091                  * segment except the last one is not multiple of 512 (raidz).
 1092                  */
 1093                 ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
 1094 
 1095                 func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen);
 1096 
 1097                 for (i = parity-1; i >= 0; i--) {
 1098                         abd_iter_unmap(&caiters[i]);
 1099                         c_cabds[i] =
 1100                             abd_advance_abd_iter(cabds[i], c_cabds[i],
 1101                             &caiters[i], len);
 1102                 }
 1103 
 1104                 if (dabd && dsize > 0) {
 1105                         abd_iter_unmap(&daiter);
 1106                         c_dabd =
 1107                             abd_advance_abd_iter(dabd, c_dabd, &daiter,
 1108                             dlen);
 1109                         dsize -= dlen;
 1110                 }
 1111 
 1112                 csize -= len;
 1113 
 1114                 ASSERT3S(dsize, >=, 0);
 1115                 ASSERT3S(csize, >=, 0);
 1116         }
 1117         abd_exit_critical(flags);
 1118 }
 1119 
 1120 /*
 1121  * Iterate over code ABDs and data reconstruction target ABDs and call
 1122  * @func_raidz_rec. Function maps at most 6 pages atomically.
 1123  *
 1124  * @cabds           parity ABDs, must have equal size
 1125  * @tabds           rec target ABDs, at most 3
 1126  * @tsize           size of data target columns
 1127  * @func_raidz_rec  expects syndrome data in target columns. Function
 1128  *                  reconstructs data and overwrites target columns.
 1129  */
 1130 void
 1131 abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
 1132     ssize_t tsize, const unsigned parity,
 1133     void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
 1134     const unsigned *mul),
 1135     const unsigned *mul)
 1136 {
 1137         int i;
 1138         ssize_t len;
 1139         struct abd_iter citers[3];
 1140         struct abd_iter xiters[3];
 1141         void *caddrs[3], *xaddrs[3];
 1142         unsigned long flags __maybe_unused = 0;
 1143         boolean_t cabds_is_gang_abd[3];
 1144         boolean_t tabds_is_gang_abd[3];
 1145         abd_t *c_cabds[3];
 1146         abd_t *c_tabds[3];
 1147 
 1148         ASSERT3U(parity, <=, 3);
 1149 
 1150         for (i = 0; i < parity; i++) {
 1151                 cabds_is_gang_abd[i] = abd_is_gang(cabds[i]);
 1152                 tabds_is_gang_abd[i] = abd_is_gang(tabds[i]);
 1153                 c_cabds[i] =
 1154                     abd_init_abd_iter(cabds[i], &citers[i], 0);
 1155                 c_tabds[i] =
 1156                     abd_init_abd_iter(tabds[i], &xiters[i], 0);
 1157         }
 1158 
 1159         abd_enter_critical(flags);
 1160         while (tsize > 0) {
 1161 
 1162                 for (i = 0; i < parity; i++) {
 1163                         /*
 1164                          * If we are at the end of the gang ABD we
 1165                          * are done.
 1166                          */
 1167                         if (cabds_is_gang_abd[i] && !c_cabds[i])
 1168                                 break;
 1169                         if (tabds_is_gang_abd[i] && !c_tabds[i])
 1170                                 break;
 1171                         abd_iter_map(&citers[i]);
 1172                         abd_iter_map(&xiters[i]);
 1173                         caddrs[i] = citers[i].iter_mapaddr;
 1174                         xaddrs[i] = xiters[i].iter_mapaddr;
 1175                 }
 1176 
 1177                 len = tsize;
 1178                 switch (parity) {
 1179                         case 3:
 1180                                 len = MIN(xiters[2].iter_mapsize, len);
 1181                                 len = MIN(citers[2].iter_mapsize, len);
 1182                                 zfs_fallthrough;
 1183                         case 2:
 1184                                 len = MIN(xiters[1].iter_mapsize, len);
 1185                                 len = MIN(citers[1].iter_mapsize, len);
 1186                                 zfs_fallthrough;
 1187                         case 1:
 1188                                 len = MIN(xiters[0].iter_mapsize, len);
 1189                                 len = MIN(citers[0].iter_mapsize, len);
 1190                 }
 1191                 /* must be progressive */
 1192                 ASSERT3S(len, >, 0);
 1193                 /*
 1194                  * The iterated function likely will not do well if each
 1195                  * segment except the last one is not multiple of 512 (raidz).
 1196                  */
 1197                 ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
 1198 
 1199                 func_raidz_rec(xaddrs, len, caddrs, mul);
 1200 
 1201                 for (i = parity-1; i >= 0; i--) {
 1202                         abd_iter_unmap(&xiters[i]);
 1203                         abd_iter_unmap(&citers[i]);
 1204                         c_tabds[i] =
 1205                             abd_advance_abd_iter(tabds[i], c_tabds[i],
 1206                             &xiters[i], len);
 1207                         c_cabds[i] =
 1208                             abd_advance_abd_iter(cabds[i], c_cabds[i],
 1209                             &citers[i], len);
 1210                 }
 1211 
 1212                 tsize -= len;
 1213                 ASSERT3S(tsize, >=, 0);
 1214         }
 1215         abd_exit_critical(flags);
 1216 }
Cache object: d73a9a676daed98598cf97e17ac63c02 
 
 |