FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/zcommon/zfs_fletcher.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
 */
/*
 * Copyright 2013 Saso Kiselkov. All rights reserved.
 */

/*
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*
 * Fletcher Checksums
 * ------------------
 *
 * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
 * recurrence relations:
 *
 *      a  = a    + f
 *       i    i-1    i-1
 *
 *      b  = b    + a
 *       i    i-1    i
 *
 *      c  = c    + b           (fletcher-4 only)
 *       i    i-1    i
 *
 *      d  = d    + c           (fletcher-4 only)
 *       i    i-1    i
 *
 * Where
 *      a_0 = b_0 = c_0 = d_0 = 0
 * and
 *      f_0 .. f_(n-1) are the input data.
 *
 * Using standard techniques, these translate into the following series:
 *
 *           __n_                            __n_
 *           \   |                           \   |
 *      a  =  >     f                   b  =  >     i * f
 *       n   /___|   n - i               n   /___|       n - i
 *           i = 1                           i = 1
 *
 *
 *           __n_                            __n_
 *           \   |  i*(i+1)                  \   |  i*(i+1)*(i+2)
 *      c  =  >     ------- f           d  =  >     ------------- f
 *       n   /___|     2     n - i       n   /___|        6        n - i
 *           i = 1                           i = 1
 *
 * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
 * Since the additions are done mod (2^64), errors in the high bits may not
 * be noticed.  For this reason, fletcher-2 is deprecated.
 *
 * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
 * A conservative bound on how big the buffer can get before we overflow
 * can be computed using f_i = 0xffffffff for all i:
 *
 * % bc
 *  f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
 * 2264
 *  quit
 * %
 *
 * So blocks of up to 2k will not overflow.  Our largest block size is
 * 128k, which has 32k 4-byte words, so we can compute the largest possible
 * accumulators, then divide by 2^64 to figure the max amount of overflow:
 *
 * % bc
 *  a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
 *  a/2^64;b/2^64;c/2^64;d/2^64
 * 0
 * 0
 * 1365
 * 11186858
 *  quit
 * %
 *
 * So a and b cannot overflow.  To make sure each bit of input has some
 * effect on the contents of c and d, we can look at the factors of the
 * coefficients in the equations for c_n and d_n.  The number of 2s in
 * the factors determines the lowest set bit in the multiplier.  Running
 * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
 * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15.  So while some data may overflow
 * the 64-bit accumulators, every bit of every f_i affects every accumulator,
 * even for 128k blocks.
 *
 * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
 * we could do our calculations mod (2^32 - 1) by adding in the carries
 * periodically, and store the number of carries in the top 32-bits.
 *
 * --------------------
 * Checksum Performance
 * --------------------
 *
 * There are two interesting components to checksum performance: cached and
 * uncached performance.  With cached data, fletcher-2 is about four times
 * faster than fletcher-4.  With uncached data, the performance difference is
 * negligible, since the cost of a cache fill dominates the processing time.
 * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
 * efficient pass over the data.
 *
 * In normal operation, the data which is being checksummed is in a buffer
 * which has been filled either by:
 *
 *      1. a compression step, which will be mostly cached, or
 *      2. a memcpy() or copyin(), which will be uncached
 *         (because the copy is cache-bypassing).
 *
 * For both cached and uncached data, both fletcher checksums are much faster
 * than sha-256, and slower than 'off', which doesn't touch the data at all.
 */
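
/*
 * Illustration (not part of the original file): a minimal userspace sketch,
 * assuming a compiler with unsigned __int128, that reproduces the bc
 * overflow analysis above.  It runs the fletcher-4 recurrence over 32k
 * words of 0xffffffff and prints how many times each accumulator wraps a
 * 64-bit register.  Kept under #if 0 so the file still compiles as-is.
 */
#if 0
#include <stdio.h>

int
main(void)
{
        unsigned __int128 a = 0, b = 0, c = 0, d = 0;
        const unsigned __int128 f = 0xffffffffu;       /* worst-case word */
        int i;

        /* 128k block = 32k 4-byte words, mirroring the bc session above */
        for (i = 1; i <= 32 * 1024; i++) {
                a += f;
                b += a;
                c += b;
                d += c;
        }

        /* Number of 2^64 wraps; expect 0, 0, 1365, 11186858 */
        printf("a: %llu\n", (unsigned long long)(a >> 64));
        printf("b: %llu\n", (unsigned long long)(b >> 64));
        printf("c: %llu\n", (unsigned long long)(c >> 64));
        printf("d: %llu\n", (unsigned long long)(d >> 64));
        return (0);
}
#endif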

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/byteorder.h>
#include <sys/spa.h>
#include <sys/simd.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <zfs_fletcher.h>

#define FLETCHER_MIN_SIMD_SIZE  64

static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx);
static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp);
static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx,
    const void *buf, uint64_t size);
static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx,
    const void *buf, uint64_t size);
static boolean_t fletcher_4_scalar_valid(void);

static const fletcher_4_ops_t fletcher_4_scalar_ops = {
        .init_native = fletcher_4_scalar_init,
        .fini_native = fletcher_4_scalar_fini,
        .compute_native = fletcher_4_scalar_native,
        .init_byteswap = fletcher_4_scalar_init,
        .fini_byteswap = fletcher_4_scalar_fini,
        .compute_byteswap = fletcher_4_scalar_byteswap,
        .valid = fletcher_4_scalar_valid,
        .name = "scalar"
};

static fletcher_4_ops_t fletcher_4_fastest_impl = {
        .name = "fastest",
        .valid = fletcher_4_scalar_valid
};

static const fletcher_4_ops_t *fletcher_4_impls[] = {
        &fletcher_4_scalar_ops,
        &fletcher_4_superscalar_ops,
        &fletcher_4_superscalar4_ops,
#if defined(HAVE_SSE2)
        &fletcher_4_sse2_ops,
#endif
#if defined(HAVE_SSE2) && defined(HAVE_SSSE3)
        &fletcher_4_ssse3_ops,
#endif
#if defined(HAVE_AVX) && defined(HAVE_AVX2)
        &fletcher_4_avx2_ops,
#endif
#if defined(__x86_64) && defined(HAVE_AVX512F)
        &fletcher_4_avx512f_ops,
#endif
#if defined(__x86_64) && defined(HAVE_AVX512BW)
        &fletcher_4_avx512bw_ops,
#endif
#if defined(__aarch64__) && !defined(__FreeBSD__)
        &fletcher_4_aarch64_neon_ops,
#endif
};

/* Hold all supported implementations */
static uint32_t fletcher_4_supp_impls_cnt = 0;
static fletcher_4_ops_t *fletcher_4_supp_impls[ARRAY_SIZE(fletcher_4_impls)];

/* Select fletcher4 implementation */
#define IMPL_FASTEST    (UINT32_MAX)
#define IMPL_CYCLE      (UINT32_MAX - 1)
#define IMPL_SCALAR     (0)

static uint32_t fletcher_4_impl_chosen = IMPL_FASTEST;

#define IMPL_READ(i)    (*(volatile uint32_t *) &(i))

static struct fletcher_4_impl_selector {
        const char      *fis_name;
        uint32_t        fis_sel;
} fletcher_4_impl_selectors[] = {
        { "cycle",      IMPL_CYCLE },
        { "fastest",    IMPL_FASTEST },
        { "scalar",     IMPL_SCALAR }
};

#if defined(_KERNEL)
static kstat_t *fletcher_4_kstat;

static struct fletcher_4_kstat {
        uint64_t native;
        uint64_t byteswap;
} fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
#endif

/* Indicate that the benchmark has been completed */
static boolean_t fletcher_4_initialized = B_FALSE;

void
fletcher_init(zio_cksum_t *zcp)
{
        ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

int
fletcher_2_incremental_native(void *buf, size_t size, void *data)
{
        zio_cksum_t *zcp = data;

        const uint64_t *ip = buf;
        const uint64_t *ipend = ip + (size / sizeof (uint64_t));
        uint64_t a0, b0, a1, b1;

        a0 = zcp->zc_word[0];
        a1 = zcp->zc_word[1];
        b0 = zcp->zc_word[2];
        b1 = zcp->zc_word[3];

        for (; ip < ipend; ip += 2) {
                a0 += ip[0];
                a1 += ip[1];
                b0 += a0;
                b1 += a1;
        }

        ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
        return (0);
}

void
fletcher_2_native(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
        (void) ctx_template;
        fletcher_init(zcp);
        (void) fletcher_2_incremental_native((void *) buf, size, zcp);
}

int
fletcher_2_incremental_byteswap(void *buf, size_t size, void *data)
{
        zio_cksum_t *zcp = data;

        const uint64_t *ip = buf;
        const uint64_t *ipend = ip + (size / sizeof (uint64_t));
        uint64_t a0, b0, a1, b1;

        a0 = zcp->zc_word[0];
        a1 = zcp->zc_word[1];
        b0 = zcp->zc_word[2];
        b1 = zcp->zc_word[3];

        for (; ip < ipend; ip += 2) {
                a0 += BSWAP_64(ip[0]);
                a1 += BSWAP_64(ip[1]);
                b0 += a0;
                b1 += a1;
        }

        ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
        return (0);
}

void
fletcher_2_byteswap(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
        (void) ctx_template;
        fletcher_init(zcp);
        (void) fletcher_2_incremental_byteswap((void *) buf, size, zcp);
}
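
/*
 * Note (added for exposition): fletcher-2 runs the recurrence as two
 * independent lanes, (a0, b0) over the even-indexed 64-bit words and
 * (a1, b1) over the odd-indexed ones, which is why the loops above
 * consume two words per iteration.  The four lane accumulators are
 * stored as the checksum words in the order (a0, a1, b0, b1).
 */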

ZFS_NO_SANITIZE_UNDEFINED
static void
fletcher_4_scalar_init(fletcher_4_ctx_t *ctx)
{
        ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0);
}

ZFS_NO_SANITIZE_UNDEFINED
static void
fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
{
        memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t));
}

ZFS_NO_SANITIZE_UNDEFINED
static void
fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf,
    uint64_t size)
{
        const uint32_t *ip = buf;
        const uint32_t *ipend = ip + (size / sizeof (uint32_t));
        uint64_t a, b, c, d;

        a = ctx->scalar.zc_word[0];
        b = ctx->scalar.zc_word[1];
        c = ctx->scalar.zc_word[2];
        d = ctx->scalar.zc_word[3];

        for (; ip < ipend; ip++) {
                a += ip[0];
                b += a;
                c += b;
                d += c;
        }

        ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
}

ZFS_NO_SANITIZE_UNDEFINED
static void
fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
    uint64_t size)
{
        const uint32_t *ip = buf;
        const uint32_t *ipend = ip + (size / sizeof (uint32_t));
        uint64_t a, b, c, d;

        a = ctx->scalar.zc_word[0];
        b = ctx->scalar.zc_word[1];
        c = ctx->scalar.zc_word[2];
        d = ctx->scalar.zc_word[3];

        for (; ip < ipend; ip++) {
                a += BSWAP_32(ip[0]);
                b += a;
                c += b;
                d += c;
        }

        ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
}

static boolean_t
fletcher_4_scalar_valid(void)
{
        return (B_TRUE);
}

int
fletcher_4_impl_set(const char *val)
{
        int err = -EINVAL;
        uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
        size_t i, val_len;

        val_len = strlen(val);
        while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */
                val_len--;

        /* check mandatory implementations */
        for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) {
                const char *name = fletcher_4_impl_selectors[i].fis_name;

                if (val_len == strlen(name) &&
                    strncmp(val, name, val_len) == 0) {
                        impl = fletcher_4_impl_selectors[i].fis_sel;
                        err = 0;
                        break;
                }
        }

        if (err != 0 && fletcher_4_initialized) {
                /* check all supported implementations */
                for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
                        const char *name = fletcher_4_supp_impls[i]->name;

                        if (val_len == strlen(name) &&
                            strncmp(val, name, val_len) == 0) {
                                impl = i;
                                err = 0;
                                break;
                        }
                }
        }

        if (err == 0) {
                atomic_swap_32(&fletcher_4_impl_chosen, impl);
                membar_producer();
        }

        return (err);
}

/*
 * Returns the Fletcher-4 operations to use for checksums.  When a SIMD
 * implementation is not allowed in the current context, fall back to
 * the fastest generic implementation.
 */
static inline const fletcher_4_ops_t *
fletcher_4_impl_get(void)
{
        if (!kfpu_allowed())
                return (&fletcher_4_superscalar4_ops);

        const fletcher_4_ops_t *ops = NULL;
        uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);

        switch (impl) {
        case IMPL_FASTEST:
                ASSERT(fletcher_4_initialized);
                ops = &fletcher_4_fastest_impl;
                break;
        case IMPL_CYCLE:
                /* Cycle through supported implementations */
                ASSERT(fletcher_4_initialized);
                ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
                static uint32_t cycle_count = 0;
                uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
                ops = fletcher_4_supp_impls[idx];
                break;
        default:
                ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
                ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
                ops = fletcher_4_supp_impls[impl];
                break;
        }

        ASSERT3P(ops, !=, NULL);

        return (ops);
}

  454 
  455 static inline void
  456 fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
  457 {
  458         fletcher_4_ctx_t ctx;
  459         const fletcher_4_ops_t *ops = fletcher_4_impl_get();
  460 
  461         ops->init_native(&ctx);
  462         ops->compute_native(&ctx, buf, size);
  463         ops->fini_native(&ctx, zcp);
  464 }
  465 
  466 void
  467 fletcher_4_native(const void *buf, uint64_t size,
  468     const void *ctx_template, zio_cksum_t *zcp)
  469 {
  470         (void) ctx_template;
  471         const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
  472 
  473         ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
  474 
  475         if (size == 0 || p2size == 0) {
  476                 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
  477 
  478                 if (size > 0)
  479                         fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
  480                             buf, size);
  481         } else {
  482                 fletcher_4_native_impl(buf, p2size, zcp);
  483 
  484                 if (p2size < size)
  485                         fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
  486                             (char *)buf + p2size, size - p2size);
  487         }
  488 }
  489 
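/*
 * Note (added for exposition): fletcher_4_native() splits the buffer at
 * the largest multiple of FLETCHER_MIN_SIMD_SIZE (64 bytes).  For
 * example, a 100-byte buffer yields p2size = 64: the first 64 bytes go
 * through the selected (possibly SIMD) implementation and the remaining
 * 36 bytes are folded in with the scalar code, which resumes from the
 * accumulators already stored in *zcp.
 */
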
void
fletcher_4_native_varsize(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
        ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
        fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
}

static inline void
fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
        fletcher_4_ctx_t ctx;
        const fletcher_4_ops_t *ops = fletcher_4_impl_get();

        ops->init_byteswap(&ctx);
        ops->compute_byteswap(&ctx, buf, size);
        ops->fini_byteswap(&ctx, zcp);
}

void
fletcher_4_byteswap(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
        (void) ctx_template;
        const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);

        ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

        if (size == 0 || p2size == 0) {
                ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);

                if (size > 0)
                        fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
                            buf, size);
        } else {
                fletcher_4_byteswap_impl(buf, p2size, zcp);

                if (p2size < size)
                        fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
                            (char *)buf + p2size, size - p2size);
        }
}

/* Incremental Fletcher 4 */

#define ZFS_FLETCHER_4_INC_MAX_SIZE     (8ULL << 20)

static inline void
fletcher_4_incremental_combine(zio_cksum_t *zcp, const uint64_t size,
    const zio_cksum_t *nzcp)
{
        const uint64_t c1 = size / sizeof (uint32_t);
        const uint64_t c2 = c1 * (c1 + 1) / 2;
        const uint64_t c3 = c2 * (c1 + 2) / 3;

        /*
         * The value of 'c3' overflows on buffer sizes close to 16MiB.  For
         * that reason we split incremental fletcher-4 computation of large
         * buffers into steps of ZFS_FLETCHER_4_INC_MAX_SIZE.
         */
        ASSERT3U(size, <=, ZFS_FLETCHER_4_INC_MAX_SIZE);

        zcp->zc_word[3] += nzcp->zc_word[3] + c1 * zcp->zc_word[2] +
            c2 * zcp->zc_word[1] + c3 * zcp->zc_word[0];
        zcp->zc_word[2] += nzcp->zc_word[2] + c1 * zcp->zc_word[1] +
            c2 * zcp->zc_word[0];
        zcp->zc_word[1] += nzcp->zc_word[1] + c1 * zcp->zc_word[0];
        zcp->zc_word[0] += nzcp->zc_word[0];
}
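
/*
 * Note (added for exposition): the combine step above follows from the
 * recurrences.  If (A, B, C, D) is the checksum of a prefix and
 * (a', b', c', d') the standalone checksum of an n-word chunk appended
 * to it, then running the recurrence across the chunk gives
 *
 *      a = A + a'
 *      b = B + n*A + b'
 *      c = C + n*B + (n*(n+1)/2)*A + c'
 *      d = D + n*C + (n*(n+1)/2)*B + (n*(n+1)*(n+2)/6)*A + d'
 *
 * which is exactly c1 = n, c2 = n*(n+1)/2 and c3 = n*(n+1)*(n+2)/6 as
 * computed at the top of the function.
 */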

static inline void
fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size,
    zio_cksum_t *zcp)
{
        while (size > 0) {
                zio_cksum_t nzc;
                uint64_t len = MIN(size, ZFS_FLETCHER_4_INC_MAX_SIZE);

                if (native)
                        fletcher_4_native(buf, len, NULL, &nzc);
                else
                        fletcher_4_byteswap(buf, len, NULL, &nzc);

                fletcher_4_incremental_combine(zcp, len, &nzc);

                size -= len;
                buf += len;
        }
}

int
fletcher_4_incremental_native(void *buf, size_t size, void *data)
{
        zio_cksum_t *zcp = data;
        /* Use scalar impl to directly update cksum of small blocks */
        if (size < SPA_MINBLOCKSIZE)
                fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
        else
                fletcher_4_incremental_impl(B_TRUE, buf, size, zcp);
        return (0);
}

int
fletcher_4_incremental_byteswap(void *buf, size_t size, void *data)
{
        zio_cksum_t *zcp = data;
        /* Use scalar impl to directly update cksum of small blocks */
        if (size < SPA_MINBLOCKSIZE)
                fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size);
        else
                fletcher_4_incremental_impl(B_FALSE, buf, size, zcp);
        return (0);
}
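
/*
 * Illustration (not part of the original file): a minimal sketch of how a
 * caller might use the incremental interface; chunked updates are expected
 * to match a one-shot fletcher_4_native() over the same bytes.  The helper
 * name and the 4096-byte chunk size are invented for the example.  Kept
 * under #if 0 so the file still compiles as-is.
 */
#if 0
static void
example_incremental(char *buf, size_t size, zio_cksum_t *zcp)
{
        size_t off = 0;

        /* start from a zeroed checksum, then fold in one chunk at a time */
        fletcher_init(zcp);
        while (off < size) {
                size_t len = MIN(size - off, 4096);

                (void) fletcher_4_incremental_native(buf + off, len, zcp);
                off += len;
        }
}
#endif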

#if defined(_KERNEL)
/*
 * Fletcher 4 kstats
 */
static int
fletcher_4_kstat_headers(char *buf, size_t size)
{
        ssize_t off = 0;

        off += snprintf(buf + off, size, "%-17s", "implementation");
        off += snprintf(buf + off, size - off, "%-15s", "native");
        (void) snprintf(buf + off, size - off, "%-15s\n", "byteswap");

        return (0);
}

static int
fletcher_4_kstat_data(char *buf, size_t size, void *data)
{
        struct fletcher_4_kstat *fastest_stat =
            &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
        struct fletcher_4_kstat *curr_stat = (struct fletcher_4_kstat *)data;
        ssize_t off = 0;

        if (curr_stat == fastest_stat) {
                off += snprintf(buf + off, size - off, "%-17s", "fastest");
                off += snprintf(buf + off, size - off, "%-15s",
                    fletcher_4_supp_impls[fastest_stat->native]->name);
                (void) snprintf(buf + off, size - off, "%-15s\n",
                    fletcher_4_supp_impls[fastest_stat->byteswap]->name);
        } else {
                ptrdiff_t id = curr_stat - fletcher_4_stat_data;

                off += snprintf(buf + off, size - off, "%-17s",
                    fletcher_4_supp_impls[id]->name);
                off += snprintf(buf + off, size - off, "%-15llu",
                    (u_longlong_t)curr_stat->native);
                (void) snprintf(buf + off, size - off, "%-15llu\n",
                    (u_longlong_t)curr_stat->byteswap);
        }

        return (0);
}

static void *
fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
{
        if (n <= fletcher_4_supp_impls_cnt)
                ksp->ks_private = (void *) (fletcher_4_stat_data + n);
        else
                ksp->ks_private = NULL;

        return (ksp->ks_private);
}
#endif

#define FLETCHER_4_FASTEST_FN_COPY(type, src)                             \
{                                                                         \
        fletcher_4_fastest_impl.init_ ## type = src->init_ ## type;       \
        fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type;       \
        fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \
}

#define FLETCHER_4_BENCH_NS     (MSEC2NSEC(1))          /* 1ms */

typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *,
                                        zio_cksum_t *);

#if defined(_KERNEL)
static void
fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
{
        struct fletcher_4_kstat *fastest_stat =
            &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
        hrtime_t start;
        uint64_t run_bw, run_time_ns, best_run = 0;
        zio_cksum_t zc;
        uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen);

        fletcher_checksum_func_t *fletcher_4_test = native ?
            fletcher_4_native : fletcher_4_byteswap;

        for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
                struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i];
                uint64_t run_count = 0;

                /* temporarily set an implementation */
                fletcher_4_impl_chosen = i;

                kpreempt_disable();
                start = gethrtime();
                do {
                        for (l = 0; l < 32; l++, run_count++)
                                fletcher_4_test(data, data_size, NULL, &zc);

                        run_time_ns = gethrtime() - start;
                } while (run_time_ns < FLETCHER_4_BENCH_NS);
                kpreempt_enable();

                run_bw = data_size * run_count * NANOSEC;
                run_bw /= run_time_ns;  /* B/s */

                if (native)
                        stat->native = run_bw;
                else
                        stat->byteswap = run_bw;

                if (run_bw > best_run) {
                        best_run = run_bw;

                        if (native) {
                                fastest_stat->native = i;
                                FLETCHER_4_FASTEST_FN_COPY(native,
                                    fletcher_4_supp_impls[i]);
                        } else {
                                fastest_stat->byteswap = i;
                                FLETCHER_4_FASTEST_FN_COPY(byteswap,
                                    fletcher_4_supp_impls[i]);
                        }
                }
        }

        /* restore original selection */
        atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
}
#endif /* _KERNEL */

/*
 * Initialize and benchmark all supported implementations.
 */
static void
fletcher_4_benchmark(void)
{
        fletcher_4_ops_t *curr_impl;
        int i, c;

        /* Move supported implementations into fletcher_4_supp_impls */
        for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
                curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];

                if (curr_impl->valid && curr_impl->valid())
                        fletcher_4_supp_impls[c++] = curr_impl;
        }
        membar_producer();      /* complete fletcher_4_supp_impls[] init */
        fletcher_4_supp_impls_cnt = c;  /* number of supported impl */

#if defined(_KERNEL)
        static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
        char *databuf = vmem_alloc(data_size, KM_SLEEP);

        for (i = 0; i < data_size / sizeof (uint64_t); i++)
                ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */

        fletcher_4_benchmark_impl(B_FALSE, databuf, data_size);
        fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);

        vmem_free(databuf, data_size);
#else
        /*
         * Skip the benchmark in user space to avoid impacting libzpool
         * consumers (zdb, zhack, zinject, ztest).  The last implementation
         * is assumed to be the fastest and used by default.
         */
        memcpy(&fletcher_4_fastest_impl,
            fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
            sizeof (fletcher_4_fastest_impl));
        fletcher_4_fastest_impl.name = "fastest";
        membar_producer();
#endif /* _KERNEL */
}

void
fletcher_4_init(void)
{
        /* Determine the fastest available implementation. */
        fletcher_4_benchmark();

#if defined(_KERNEL)
        /* Install kstats for all implementations */
        fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
            KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
        if (fletcher_4_kstat != NULL) {
                fletcher_4_kstat->ks_data = NULL;
                fletcher_4_kstat->ks_ndata = UINT32_MAX;
                kstat_set_raw_ops(fletcher_4_kstat,
                    fletcher_4_kstat_headers,
                    fletcher_4_kstat_data,
                    fletcher_4_kstat_addr);
                kstat_install(fletcher_4_kstat);
        }
#endif

        /* Finish initialization */
        fletcher_4_initialized = B_TRUE;
}
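
/*
 * Note (added for exposition): the benchmark results installed above can
 * be inspected at runtime.  On Linux the kstat is typically exposed as
 * /proc/spl/kstat/zfs/fletcher_4_bench; it lists the measured native and
 * byteswap bandwidth (B/s) of every supported implementation plus the
 * "fastest" selection.  The exact path may vary by platform and version.
 */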

void
fletcher_4_fini(void)
{
#if defined(_KERNEL)
        if (fletcher_4_kstat != NULL) {
                kstat_delete(fletcher_4_kstat);
                fletcher_4_kstat = NULL;
        }
#endif
}

/* ABD adapters */

static void
abd_fletcher_4_init(zio_abd_checksum_data_t *cdp)
{
        const fletcher_4_ops_t *ops = fletcher_4_impl_get();
        cdp->acd_private = (void *) ops;

        if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE)
                ops->init_native(cdp->acd_ctx);
        else
                ops->init_byteswap(cdp->acd_ctx);
}

static void
abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp)
{
        fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;

        ASSERT(ops);

        if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE)
                ops->fini_native(cdp->acd_ctx, cdp->acd_zcp);
        else
                ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp);
}

static void
abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size,
    zio_abd_checksum_data_t *cdp)
{
        zio_cksum_t *zcp = cdp->acd_zcp;

        ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);

        abd_fletcher_4_fini(cdp);
        cdp->acd_private = (void *)&fletcher_4_scalar_ops;

        if (native)
                fletcher_4_incremental_native(data, size, zcp);
        else
                fletcher_4_incremental_byteswap(data, size, zcp);
}

static int
abd_fletcher_4_iter(void *data, size_t size, void *private)
{
        zio_abd_checksum_data_t *cdp = (zio_abd_checksum_data_t *)private;
        fletcher_4_ctx_t *ctx = cdp->acd_ctx;
        fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;
        boolean_t native = cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE;
        uint64_t asize = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);

        ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

        if (asize > 0) {
                if (native)
                        ops->compute_native(ctx, data, asize);
                else
                        ops->compute_byteswap(ctx, data, asize);

                size -= asize;
                data = (char *)data + asize;
        }

        if (size > 0) {
                ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);
                /* At this point we have to switch to scalar impl */
                abd_fletcher_4_simd2scalar(native, data, size, cdp);
        }

        return (0);
}

zio_abd_checksum_func_t fletcher_4_abd_ops = {
        .acf_init = abd_fletcher_4_init,
        .acf_fini = abd_fletcher_4_fini,
        .acf_iter = abd_fletcher_4_iter
};
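
/*
 * Note (added for exposition): fletcher_4_abd_ops adapts fletcher-4 to
 * scatter/gather (ABD) buffers: the checksum layer calls acf_init once,
 * acf_iter for each linear region, and acf_fini at the end.  A region
 * need not be a multiple of FLETCHER_MIN_SIMD_SIZE, so abd_fletcher_4_iter
 * feeds the 64-byte-aligned prefix to the selected implementation and
 * folds any short tail into the checksum through the scalar incremental
 * routines via abd_fletcher_4_simd2scalar().
 */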

#if defined(_KERNEL)

#define IMPL_FMT(impl, i)       (((impl) == (i)) ? "[%s] " : "%s ")

#if defined(__linux__)

static int
fletcher_4_param_get(char *buffer, zfs_kernel_param_t *unused)
{
        const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
        char *fmt;
        int cnt = 0;

        /* list fastest */
        fmt = IMPL_FMT(impl, IMPL_FASTEST);
        cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "fastest");

        /* list all supported implementations */
        for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) {
                fmt = IMPL_FMT(impl, i);
                cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
                    fletcher_4_supp_impls[i]->name);
        }

        return (cnt);
}

static int
fletcher_4_param_set(const char *val, zfs_kernel_param_t *unused)
{
        return (fletcher_4_impl_set(val));
}

#else

#include <sys/sbuf.h>

static int
fletcher_4_param(ZFS_MODULE_PARAM_ARGS)
{
        int err;

        if (req->newptr == NULL) {
                const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
                const int init_buflen = 64;
                const char *fmt;
                struct sbuf *s;

                s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req);

                /* list fastest */
                fmt = IMPL_FMT(impl, IMPL_FASTEST);
                (void) sbuf_printf(s, fmt, "fastest");

                /* list all supported implementations */
                for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) {
                        fmt = IMPL_FMT(impl, i);
                        (void) sbuf_printf(s, fmt,
                            fletcher_4_supp_impls[i]->name);
                }

                err = sbuf_finish(s);
                sbuf_delete(s);

                return (err);
        }

        char buf[16];

        err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
        if (err)
                return (err);
        return (-fletcher_4_impl_set(buf));
}

#endif

#undef IMPL_FMT

/*
 * Choose a fletcher 4 implementation in ZFS.
 * Users can choose "cycle" to exercise all implementations, but this is
 * for testing purposes and therefore can only be set from user space.
 */
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, fletcher_4_impl,
    fletcher_4_param_set, fletcher_4_param_get, ZMOD_RW,
        "Select fletcher 4 implementation.");
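
/*
 * Note (added for exposition): with the module parameter declared above,
 * the active implementation can typically be inspected and changed at
 * runtime, e.g. on Linux:
 *
 *      # cat /sys/module/zfs/parameters/zfs_fletcher_4_impl
 *      [fastest] scalar superscalar superscalar4 sse2 ssse3 avx2
 *      # echo scalar > /sys/module/zfs/parameters/zfs_fletcher_4_impl
 *
 * and on FreeBSD via the corresponding sysctl.  The list of
 * implementations shown depends on the CPU features detected.
 */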

EXPORT_SYMBOL(fletcher_init);
EXPORT_SYMBOL(fletcher_2_incremental_native);
EXPORT_SYMBOL(fletcher_2_incremental_byteswap);
EXPORT_SYMBOL(fletcher_4_init);
EXPORT_SYMBOL(fletcher_4_fini);
EXPORT_SYMBOL(fletcher_2_native);
EXPORT_SYMBOL(fletcher_2_byteswap);
EXPORT_SYMBOL(fletcher_4_native);
EXPORT_SYMBOL(fletcher_4_native_varsize);
EXPORT_SYMBOL(fletcher_4_byteswap);
EXPORT_SYMBOL(fletcher_4_incremental_native);
EXPORT_SYMBOL(fletcher_4_incremental_byteswap);
EXPORT_SYMBOL(fletcher_4_abd_ops);
#endif
