FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/zstd/zfs_zstd.c


    1 /*
    2  * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
    3  *
    4  * Redistribution and use in source and binary forms, with or without
    5  * modification, are permitted provided that the following conditions are met:
    6  *
    7  * 1. Redistributions of source code must retain the above copyright notice,
    8  * this list of conditions and the following disclaimer.
    9  *
   10  * 2. Redistributions in binary form must reproduce the above copyright notice,
   11  * this list of conditions and the following disclaimer in the documentation
   12  * and/or other materials provided with the distribution.
   13  *
   14  * 3. Neither the name of the copyright holder nor the names of its
   15  * contributors may be used to endorse or promote products derived from this
   16  * software without specific prior written permission.
   17  *
   18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
   22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   28  * POSSIBILITY OF SUCH DAMAGE.
   29  */
   30 
   31 /*
   32  * Copyright (c) 2016-2018, Klara Inc.
   33  * Copyright (c) 2016-2018, Allan Jude
   34  * Copyright (c) 2018-2020, Sebastian Gottschall
   35  * Copyright (c) 2019-2020, Michael Niewöhner
   36  * Copyright (c) 2020, The FreeBSD Foundation [1]
   37  *
   38  * [1] Portions of this software were developed by Allan Jude
   39  *     under sponsorship from the FreeBSD Foundation.
   40  */
   41 
   42 #include <sys/param.h>
   43 #include <sys/sysmacros.h>
   44 #include <sys/zfs_context.h>
   45 #include <sys/zio_compress.h>
   46 #include <sys/spa.h>
   47 #include <sys/zstd/zstd.h>
   48 
   49 #define ZSTD_STATIC_LINKING_ONLY
   50 #include "lib/zstd.h"
   51 #include "lib/common/zstd_errors.h"
   52 
   53 #ifndef IN_LIBSA
   54 static uint_t zstd_earlyabort_pass = 1;
   55 static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3;
   56 static unsigned int zstd_abort_size = (128 * 1024);
   57 #endif
   58 
   59 static kstat_t *zstd_ksp = NULL;
   60 
   61 typedef struct zstd_stats {
   62         kstat_named_t   zstd_stat_alloc_fail;
   63         kstat_named_t   zstd_stat_alloc_fallback;
   64         kstat_named_t   zstd_stat_com_alloc_fail;
   65         kstat_named_t   zstd_stat_dec_alloc_fail;
   66         kstat_named_t   zstd_stat_com_inval;
   67         kstat_named_t   zstd_stat_dec_inval;
   68         kstat_named_t   zstd_stat_dec_header_inval;
   69         kstat_named_t   zstd_stat_com_fail;
   70         kstat_named_t   zstd_stat_dec_fail;
   71         /*
   72          * LZ4 first-pass early abort verdict
   73          */
   74         kstat_named_t   zstd_stat_lz4pass_allowed;
   75         kstat_named_t   zstd_stat_lz4pass_rejected;
   76         /*
   77          * zstd-1 second-pass early abort verdict
   78          */
   79         kstat_named_t   zstd_stat_zstdpass_allowed;
   80         kstat_named_t   zstd_stat_zstdpass_rejected;
   81         /*
    82          * Blocks excluded from early abort (level or size below the cutoffs)
   83          */
   84         kstat_named_t   zstd_stat_passignored;
   85         kstat_named_t   zstd_stat_passignored_size;
   86         kstat_named_t   zstd_stat_buffers;
   87         kstat_named_t   zstd_stat_size;
   88 } zstd_stats_t;
   89 
   90 static zstd_stats_t zstd_stats = {
   91         { "alloc_fail",                 KSTAT_DATA_UINT64 },
   92         { "alloc_fallback",             KSTAT_DATA_UINT64 },
   93         { "compress_alloc_fail",        KSTAT_DATA_UINT64 },
   94         { "decompress_alloc_fail",      KSTAT_DATA_UINT64 },
   95         { "compress_level_invalid",     KSTAT_DATA_UINT64 },
   96         { "decompress_level_invalid",   KSTAT_DATA_UINT64 },
   97         { "decompress_header_invalid",  KSTAT_DATA_UINT64 },
   98         { "compress_failed",            KSTAT_DATA_UINT64 },
   99         { "decompress_failed",          KSTAT_DATA_UINT64 },
  100         { "lz4pass_allowed",            KSTAT_DATA_UINT64 },
  101         { "lz4pass_rejected",           KSTAT_DATA_UINT64 },
  102         { "zstdpass_allowed",           KSTAT_DATA_UINT64 },
  103         { "zstdpass_rejected",          KSTAT_DATA_UINT64 },
  104         { "passignored",                KSTAT_DATA_UINT64 },
  105         { "passignored_size",           KSTAT_DATA_UINT64 },
  106         { "buffers",                    KSTAT_DATA_UINT64 },
  107         { "size",                       KSTAT_DATA_UINT64 },
  108 };
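       /*
        * These counters are published through the kstat framework (created in
        * zstd_init below). Depending on the platform glue they typically
        * surface under /proc/spl/kstat/zfs/zstd on Linux or as the
        * kstat.zfs.misc.zstd sysctl tree on FreeBSD.
        */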
  109 
  110 #ifdef _KERNEL
  111 static int
  112 kstat_zstd_update(kstat_t *ksp, int rw)
  113 {
  114         ASSERT(ksp != NULL);
  115 
  116         if (rw == KSTAT_WRITE && ksp == zstd_ksp) {
  117                 ZSTDSTAT_ZERO(zstd_stat_alloc_fail);
  118                 ZSTDSTAT_ZERO(zstd_stat_alloc_fallback);
  119                 ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail);
  120                 ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail);
  121                 ZSTDSTAT_ZERO(zstd_stat_com_inval);
  122                 ZSTDSTAT_ZERO(zstd_stat_dec_inval);
  123                 ZSTDSTAT_ZERO(zstd_stat_dec_header_inval);
  124                 ZSTDSTAT_ZERO(zstd_stat_com_fail);
  125                 ZSTDSTAT_ZERO(zstd_stat_dec_fail);
  126                 ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed);
  127                 ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected);
  128                 ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed);
  129                 ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected);
  130                 ZSTDSTAT_ZERO(zstd_stat_passignored);
  131                 ZSTDSTAT_ZERO(zstd_stat_passignored_size);
  132         }
  133 
  134         return (0);
  135 }
  136 #endif
  137 
   138 /* Enum describing the allocator type specified by kmem_type in zstd_kmem */
  139 enum zstd_kmem_type {
  140         ZSTD_KMEM_UNKNOWN = 0,
   141         /* Allocation type using vmem_alloc */
  142         ZSTD_KMEM_DEFAULT,
  143         /* Pool based allocation using mempool_alloc */
  144         ZSTD_KMEM_POOL,
  145         /* Reserved fallback memory for decompression only */
  146         ZSTD_KMEM_DCTX,
  147         ZSTD_KMEM_COUNT,
  148 };
  149 
  150 /* Structure for pooled memory objects */
  151 struct zstd_pool {
  152         void *mem;
  153         size_t size;
  154         kmutex_t barrier;
  155         hrtime_t timeout;
  156 };
  157 
  158 /* Global structure for handling memory allocations */
  159 struct zstd_kmem {
  160         enum zstd_kmem_type kmem_type;
  161         size_t kmem_size;
  162         struct zstd_pool *pool;
  163 };
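       /*
        * Layout note: every allocation handed to zstd is a struct zstd_kmem
        * header immediately followed by the payload. The allocators below
        * return a pointer just past the header, and zstd_free() steps back
        * sizeof (struct zstd_kmem) bytes to recover it.
        */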
  164 
  165 /* Fallback memory structure used for decompression only if memory runs out */
  166 struct zstd_fallback_mem {
  167         size_t mem_size;
  168         void *mem;
  169         kmutex_t barrier;
  170 };
  171 
  172 struct zstd_levelmap {
  173         int16_t zstd_level;
  174         enum zio_zstd_levels level;
  175 };
  176 
  177 /*
  178  * ZSTD memory handlers
  179  *
  180  * For decompression we use a different handler which also provides fallback
  181  * memory allocation in case memory runs out.
  182  *
   183  * The handlers are split to keep the implementation as simple as possible.
  184  */
  185 #ifndef IN_LIBSA
  186 static void *zstd_alloc(void *opaque, size_t size);
  187 #endif
  188 static void *zstd_dctx_alloc(void *opaque, size_t size);
  189 static void zstd_free(void *opaque, void *ptr);
  190 
  191 #ifndef IN_LIBSA
  192 /* Compression memory handler */
  193 static const ZSTD_customMem zstd_malloc = {
  194         zstd_alloc,
  195         zstd_free,
  196         NULL,
  197 };
  198 #endif
  199 
  200 /* Decompression memory handler */
  201 static const ZSTD_customMem zstd_dctx_malloc = {
  202         zstd_dctx_alloc,
  203         zstd_free,
  204         NULL,
  205 };
  206 
  207 /* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
  208 static struct zstd_levelmap zstd_levels[] = {
  209         {ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
  210         {ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
  211         {ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
  212         {ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
  213         {ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
  214         {ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
  215         {ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
  216         {ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
  217         {ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
  218         {ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
  219         {ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
  220         {ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
  221         {ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
  222         {ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
  223         {ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
  224         {ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
  225         {ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
  226         {ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
  227         {ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
  228         {-1, ZIO_ZSTD_LEVEL_FAST_1},
  229         {-2, ZIO_ZSTD_LEVEL_FAST_2},
  230         {-3, ZIO_ZSTD_LEVEL_FAST_3},
  231         {-4, ZIO_ZSTD_LEVEL_FAST_4},
  232         {-5, ZIO_ZSTD_LEVEL_FAST_5},
  233         {-6, ZIO_ZSTD_LEVEL_FAST_6},
  234         {-7, ZIO_ZSTD_LEVEL_FAST_7},
  235         {-8, ZIO_ZSTD_LEVEL_FAST_8},
  236         {-9, ZIO_ZSTD_LEVEL_FAST_9},
  237         {-10, ZIO_ZSTD_LEVEL_FAST_10},
  238         {-20, ZIO_ZSTD_LEVEL_FAST_20},
  239         {-30, ZIO_ZSTD_LEVEL_FAST_30},
  240         {-40, ZIO_ZSTD_LEVEL_FAST_40},
  241         {-50, ZIO_ZSTD_LEVEL_FAST_50},
  242         {-60, ZIO_ZSTD_LEVEL_FAST_60},
  243         {-70, ZIO_ZSTD_LEVEL_FAST_70},
  244         {-80, ZIO_ZSTD_LEVEL_FAST_80},
  245         {-90, ZIO_ZSTD_LEVEL_FAST_90},
  246         {-100, ZIO_ZSTD_LEVEL_FAST_100},
  247         {-500, ZIO_ZSTD_LEVEL_FAST_500},
  248         {-1000, ZIO_ZSTD_LEVEL_FAST_1000},
  249 };
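       /*
        * zstd_enum_to_level() below relies on this ordering: the first 19
        * entries map ZIO_ZSTD_LEVEL_1..19 by simple index, and the "fast"
        * levels follow immediately after ZIO_ZSTD_LEVEL_19.
        */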
  250 
  251 /*
   252  * This variable represents the maximum number of pool slots, based on the
   253  * CPU count plus some headroom. We default to cpu count * 4; see zstd_init.
  254  */
  255 static int pool_count = 16;
  256 
  257 #define ZSTD_POOL_MAX           pool_count
  258 #define ZSTD_POOL_TIMEOUT       60 * 2
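       /* ZSTD_POOL_TIMEOUT is in seconds (2 minutes), compared against gethrestime_sec(). */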
  259 
  260 static struct zstd_fallback_mem zstd_dctx_fallback;
  261 static struct zstd_pool *zstd_mempool_cctx;
  262 static struct zstd_pool *zstd_mempool_dctx;
  263 
  264 /*
   265  * The library zstd code expects these functions when ADDRESS_SANITIZER is
   266  * defined; plain ASAN provides them, but KASAN defines ADDRESS_SANITIZER
   267  * without providing them. To avoid changing the external code, we add stubs.
  268  */
  269 #if defined(ZFS_ASAN_ENABLED)
  270 #define ADDRESS_SANITIZER 1
  271 #endif
  272 #if defined(_KERNEL) && defined(ADDRESS_SANITIZER)
  273 void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
  274 void __asan_poison_memory_region(void const volatile *addr, size_t size);
  275 void __asan_unpoison_memory_region(void const volatile *addr, size_t size) {};
  276 void __asan_poison_memory_region(void const volatile *addr, size_t size) {};
  277 #endif
  278 
  279 
  280 static void
  281 zstd_mempool_reap(struct zstd_pool *zstd_mempool)
  282 {
  283         struct zstd_pool *pool;
  284 
  285         if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
  286                 return;
  287         }
  288 
  289         /* free obsolete slots */
  290         for (int i = 0; i < ZSTD_POOL_MAX; i++) {
  291                 pool = &zstd_mempool[i];
  292                 if (pool->mem && mutex_tryenter(&pool->barrier)) {
  293                         /* Free memory if unused object older than 2 minutes */
  294                         if (pool->mem && gethrestime_sec() > pool->timeout) {
  295                                 vmem_free(pool->mem, pool->size);
  296                                 ZSTDSTAT_SUB(zstd_stat_buffers, 1);
  297                                 ZSTDSTAT_SUB(zstd_stat_size, pool->size);
  298                                 pool->mem = NULL;
  299                                 pool->size = 0;
  300                                 pool->timeout = 0;
  301                         }
  302                         mutex_exit(&pool->barrier);
  303                 }
  304         }
  305 }
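       /*
        * zstd_mempool_reap() is called from zfs_zstd_cache_reap_now() near the
        * end of this file for both the cctx and dctx pools, so idle buffers
        * are returned to the system rather than held indefinitely.
        */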
  306 
  307 /*
  308  * Try to get a cached allocated buffer from memory pool or allocate a new one
   309  * if necessary. If an object is older than 2 minutes and does not fit the
   310  * requested size, it will be released and a new cached entry will be allocated.
   311  * Any other pooled objects that have gone unused for 2 minutes are released
   312  * as well.
  313  *
  314  * The concept is that high frequency memory allocations of bigger objects are
  315  * expensive. So if a lot of work is going on, allocations will be kept for a
  316  * while and can be reused in that time frame.
  317  *
   318  * The scheduled release time is updated every time an object is reused.
  319  */
  320 
  321 static void *
  322 zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
  323 {
  324         struct zstd_pool *pool;
  325         struct zstd_kmem *mem = NULL;
  326 
  327         if (!zstd_mempool) {
  328                 return (NULL);
  329         }
  330 
   331         /* Look for a preallocated memory slot and free obsolete slots */
  332         for (int i = 0; i < ZSTD_POOL_MAX; i++) {
  333                 pool = &zstd_mempool[i];
  334                 /*
  335                  * This lock is simply a marker for a pool object being in use.
   336                  * If it's already held, the slot is skipped.
  337                  *
   338                  * We need to take the lock before checking the slot to avoid race
  339                  * conditions caused by running in a threaded context.
  340                  *
  341                  * The lock is later released by zstd_mempool_free.
  342                  */
  343                 if (mutex_tryenter(&pool->barrier)) {
  344                         /*
   345                          * Check whether the object fits the requested size;
   346                          * if so, take it and update the timestamp.
  347                          */
  348                         if (pool->mem && size <= pool->size) {
  349                                 pool->timeout = gethrestime_sec() +
  350                                     ZSTD_POOL_TIMEOUT;
  351                                 mem = pool->mem;
  352                                 return (mem);
  353                         }
  354                         mutex_exit(&pool->barrier);
  355                 }
  356         }
  357 
  358         /*
  359          * If no preallocated slot was found, try to fill in a new one.
  360          *
  361          * We run a similar algorithm twice here to avoid pool fragmentation.
  362          * The first one may generate holes in the list if objects get released.
  363          * We always make sure that these holes get filled instead of adding new
  364          * allocations constantly at the end.
  365          */
  366         for (int i = 0; i < ZSTD_POOL_MAX; i++) {
  367                 pool = &zstd_mempool[i];
  368                 if (mutex_tryenter(&pool->barrier)) {
  369                         /* Object is free, try to allocate new one */
  370                         if (!pool->mem) {
  371                                 mem = vmem_alloc(size, KM_SLEEP);
  372                                 if (mem) {
  373                                         ZSTDSTAT_ADD(zstd_stat_buffers, 1);
  374                                         ZSTDSTAT_ADD(zstd_stat_size, size);
  375                                         pool->mem = mem;
  376                                         pool->size = size;
  377                                         /* Keep track for later release */
  378                                         mem->pool = pool;
  379                                         mem->kmem_type = ZSTD_KMEM_POOL;
  380                                         mem->kmem_size = size;
  381                                 }
  382                         }
  383 
  384                         if (size <= pool->size) {
  385                                 /* Update timestamp */
  386                                 pool->timeout = gethrestime_sec() +
  387                                     ZSTD_POOL_TIMEOUT;
  388 
  389                                 return (pool->mem);
  390                         }
  391 
  392                         mutex_exit(&pool->barrier);
  393                 }
  394         }
  395 
  396         /*
  397          * If the pool is full or the allocation failed, try lazy allocation
  398          * instead.
  399          */
  400         if (!mem) {
  401                 mem = vmem_alloc(size, KM_NOSLEEP);
  402                 if (mem) {
  403                         mem->pool = NULL;
  404                         mem->kmem_type = ZSTD_KMEM_DEFAULT;
  405                         mem->kmem_size = size;
  406                 }
  407         }
  408 
  409         return (mem);
  410 }
  411 
  412 /* Mark object as released by releasing the barrier mutex */
  413 static void
  414 zstd_mempool_free(struct zstd_kmem *z)
  415 {
  416         mutex_exit(&z->pool->barrier);
  417 }
  418 
  419 /* Convert ZFS internal enum to ZSTD level */
  420 static int
  421 zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
  422 {
  423         if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
  424                 *zstd_level = zstd_levels[level - 1].zstd_level;
  425                 return (0);
  426         }
  427         if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
  428             level <= ZIO_ZSTD_LEVEL_FAST_1000) {
  429                 *zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
  430                     + ZIO_ZSTD_LEVEL_19].zstd_level;
  431                 return (0);
  432         }
  433 
  434         /* Invalid/unknown zfs compression enum - this should never happen. */
  435         return (1);
  436 }
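       /*
        * Illustrative example: zstd_enum_to_level(ZIO_ZSTD_LEVEL_FAST_5, &l)
        * stores -5 in l and returns 0, while an out-of-range enum returns 1
        * and the callers treat the block as incompressible or corrupt.
        */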
  437 
  438 #ifndef IN_LIBSA
  439 size_t
  440 zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len,
  441     int level)
  442 {
  443         int16_t zstd_level;
  444         if (zstd_enum_to_level(level, &zstd_level)) {
  445                 ZSTDSTAT_BUMP(zstd_stat_com_inval);
  446                 return (s_len);
  447         }
  448         /*
  449          * A zstd early abort heuristic.
  450          *
  451          * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
  452          *   128k), don't try any of this, just go.
  453          *   (because experimentally that was a reasonable cutoff for a perf win
  454          *   with tiny ratio change)
  455          * - First, we try LZ4 compression, and if it doesn't early abort, we
  456          *   jump directly to whatever compression level we intended to try.
  457          * - Second, we try zstd-1 - if that errors out (usually, but not
  458          *   exclusively, if it would overflow), we give up early.
  459          *
  460          *   If it works, instead we go on and compress anyway.
  461          *
   462          * Why two passes? LZ4 alone gets you a lot of the way, but on highly
   463          * compressible data it was losing up to 8.5% of the compressed savings
   464          * versus no early abort. The zstd-fast levels are all worse indicators
   465          * on their own than LZ4, and don't improve the LZ4 pass noticeably
   466          * when stacked like this.
  467          */
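               /*
                * The heuristic is gated by the tunables defined at the top of
                * this file: zstd_earlyabort_pass enables it, zstd_cutoff_level
                * (zstd-3 by default) is the lowest level it applies to, and
                * zstd_abort_size (128 KiB by default) is the smallest block it
                * applies to. The first and last are exposed as module
                * parameters at the end of this file.
                */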
  468         size_t actual_abort_size = zstd_abort_size;
  469         if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
  470             s_len >= actual_abort_size) {
  471                 int pass_len = 1;
  472                 pass_len = lz4_compress_zfs(s_start, d_start, s_len, d_len, 0);
  473                 if (pass_len < d_len) {
  474                         ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
  475                         goto keep_trying;
  476                 }
  477                 ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);
  478 
  479                 pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len,
  480                     ZIO_ZSTD_LEVEL_1);
  481                 if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
  482                         ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
  483                         return (s_len);
  484                 }
  485                 ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
  486         } else {
  487                 ZSTDSTAT_BUMP(zstd_stat_passignored);
  488                 if (s_len < actual_abort_size) {
  489                         ZSTDSTAT_BUMP(zstd_stat_passignored_size);
  490                 }
  491         }
  492 keep_trying:
  493         return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level));
  494 
  495 }
  496 
  497 /* Compress block using zstd */
  498 size_t
  499 zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
  500     int level)
  501 {
  502         size_t c_len;
  503         int16_t zstd_level;
  504         zfs_zstdhdr_t *hdr;
  505         ZSTD_CCtx *cctx;
  506 
  507         hdr = (zfs_zstdhdr_t *)d_start;
  508 
  509         /* Skip compression if the specified level is invalid */
  510         if (zstd_enum_to_level(level, &zstd_level)) {
  511                 ZSTDSTAT_BUMP(zstd_stat_com_inval);
  512                 return (s_len);
  513         }
  514 
  515         ASSERT3U(d_len, >=, sizeof (*hdr));
  516         ASSERT3U(d_len, <=, s_len);
  517         ASSERT3U(zstd_level, !=, 0);
  518 
  519         cctx = ZSTD_createCCtx_advanced(zstd_malloc);
  520 
  521         /*
  522          * Out of kernel memory, gently fall through - this will disable
  523          * compression in zio_compress_data
  524          */
  525         if (!cctx) {
  526                 ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
  527                 return (s_len);
  528         }
  529 
  530         /* Set the compression level */
  531         ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);
  532 
  533         /* Use the "magicless" zstd header which saves us 4 header bytes */
  534         ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);
  535 
  536         /*
  537          * Disable redundant checksum calculation and content size storage since
  538          * this is already done by ZFS itself.
  539          */
  540         ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
  541         ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);
  542 
  543         c_len = ZSTD_compress2(cctx,
  544             hdr->data,
  545             d_len - sizeof (*hdr),
  546             s_start, s_len);
  547 
  548         ZSTD_freeCCtx(cctx);
  549 
  550         /* Error in the compression routine, disable compression. */
  551         if (ZSTD_isError(c_len)) {
  552                 /*
   553                  * If we are aborting the compression because the savings are
  554                  * too small, that is not a failure. Everything else is a
  555                  * failure, so increment the compression failure counter.
  556                  */
  557                 int err = ZSTD_getErrorCode(c_len);
  558                 if (err != ZSTD_error_dstSize_tooSmall) {
  559                         ZSTDSTAT_BUMP(zstd_stat_com_fail);
  560                         dprintf("Error: %s", ZSTD_getErrorString(err));
  561                 }
  562                 return (s_len);
  563         }
  564 
  565         /*
  566          * Encode the compressed buffer size at the start. We'll need this in
  567          * decompression to counter the effects of padding which might be added
  568          * to the compressed buffer and which, if unhandled, would confuse the
  569          * hell out of our decompression function.
  570          */
  571         hdr->c_len = BE_32(c_len);
  572 
  573         /*
  574          * Check version for overflow.
  575          * The limit of 24 bits must not be exceeded. This allows a maximum
   576          * version 1677.72.15, which we never expect to be reached.
  577          */
  578         ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);
  579 
  580         /*
  581          * Encode the compression level as well. We may need to know the
  582          * original compression level if compressed_arc is disabled, to match
  583          * the compression settings to write this block to the L2ARC.
  584          *
  585          * Encode the actual level, so if the enum changes in the future, we
  586          * will be compatible.
  587          *
  588          * The upper 24 bits store the ZSTD version to be able to provide
  589          * future compatibility, since new versions might enhance the
   590          * compression algorithm in a way that changes the compressed
   591          * data.
  592          *
  593          * As soon as such incompatibility occurs, handling code needs to be
  594          * added, differentiating between the versions.
  595          */
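               /*
                * Illustrative sketch: the setters below (defined in
                * sys/zstd/zstd.h) pack the 24-bit ZSTD_VERSION_NUMBER and the
                * 8-bit level into the single 32-bit raw_version_level word,
                * which is then byte-swapped so the on-disk format does not
                * depend on host endianness.
                */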
  596         zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER);
  597         zfs_set_hdrlevel(hdr, level);
  598         hdr->raw_version_level = BE_32(hdr->raw_version_level);
  599 
  600         return (c_len + sizeof (*hdr));
  601 }
  602 #endif
  603 
  604 /* Decompress block using zstd and return its stored level */
  605 int
  606 zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
  607     size_t d_len, uint8_t *level)
  608 {
  609         ZSTD_DCtx *dctx;
  610         size_t result;
  611         int16_t zstd_level;
  612         uint32_t c_len;
  613         const zfs_zstdhdr_t *hdr;
  614         zfs_zstdhdr_t hdr_copy;
  615 
  616         hdr = (const zfs_zstdhdr_t *)s_start;
  617         c_len = BE_32(hdr->c_len);
  618 
  619         /*
  620          * Make a copy instead of directly converting the header, since we must
  621          * not modify the original data that may be used again later.
  622          */
  623         hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
  624         uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy);
  625 
  626         /*
  627          * NOTE: We ignore the ZSTD version for now. As soon as any
  628          * incompatibility occurs, it has to be handled accordingly.
  629          * The version can be accessed via `hdr_copy.version`.
  630          */
  631 
  632         /*
   633          * Convert and check the level.
   634          * An invalid level is a strong indicator of data corruption! In that
   635          * case, return an error so the upper layers can try to fix it.
  636          */
  637         if (zstd_enum_to_level(curlevel, &zstd_level)) {
  638                 ZSTDSTAT_BUMP(zstd_stat_dec_inval);
  639                 return (1);
  640         }
  641 
  642         ASSERT3U(d_len, >=, s_len);
  643         ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT);
  644 
  645         /* Invalid compressed buffer size encoded at start */
  646         if (c_len + sizeof (*hdr) > s_len) {
  647                 ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
  648                 return (1);
  649         }
  650 
  651         dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
  652         if (!dctx) {
  653                 ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
  654                 return (1);
  655         }
  656 
  657         /* Set header type to "magicless" */
  658         ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);
  659 
  660         /* Decompress the data and release the context */
  661         result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
  662         ZSTD_freeDCtx(dctx);
  663 
  664         /*
   665          * Return 0 on success (the decompression function returned non-negative)
   666          * and non-zero on failure (the decompression function returned negative).
  667          */
  668         if (ZSTD_isError(result)) {
  669                 ZSTDSTAT_BUMP(zstd_stat_dec_fail);
  670                 return (1);
  671         }
  672 
  673         if (level) {
  674                 *level = curlevel;
  675         }
  676 
  677         return (0);
  678 }
  679 
   680 /* Decompress a data block using zstd */
  681 int
  682 zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len,
  683     int level __maybe_unused)
  684 {
  685 
  686         return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len,
  687             NULL));
  688 }
  689 
  690 #ifndef IN_LIBSA
  691 /* Allocator for zstd compression context using mempool_allocator */
  692 static void *
  693 zstd_alloc(void *opaque __maybe_unused, size_t size)
  694 {
  695         size_t nbytes = sizeof (struct zstd_kmem) + size;
  696         struct zstd_kmem *z = NULL;
  697 
  698         z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);
  699 
  700         if (!z) {
  701                 ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
  702                 return (NULL);
  703         }
  704 
  705         return ((void*)z + (sizeof (struct zstd_kmem)));
  706 }
  707 #endif
  708 
  709 /*
  710  * Allocator for zstd decompression context using mempool_allocator with
  711  * fallback to reserved memory if allocation fails
  712  */
  713 static void *
  714 zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
  715 {
  716         size_t nbytes = sizeof (struct zstd_kmem) + size;
  717         struct zstd_kmem *z = NULL;
  718         enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;
  719 
  720         z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
  721         if (!z) {
  722                 /* Try harder, decompression shall not fail */
  723                 z = vmem_alloc(nbytes, KM_SLEEP);
  724                 if (z) {
  725                         z->pool = NULL;
  726                 }
  727                 ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
  728         } else {
  729                 return ((void*)z + (sizeof (struct zstd_kmem)));
  730         }
  731 
  732         /* Fallback if everything fails */
  733         if (!z) {
  734                 /*
   735                  * Barrier since we can only handle this in a single thread. All
   736                  * following threads need to wait here until the decompression
   737                  * is completed. zstd_free will release this barrier later.
  738                  */
  739                 mutex_enter(&zstd_dctx_fallback.barrier);
  740 
  741                 z = zstd_dctx_fallback.mem;
  742                 type = ZSTD_KMEM_DCTX;
  743                 ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
  744         }
  745 
  746         /* Allocation should always be successful */
  747         if (!z) {
  748                 return (NULL);
  749         }
  750 
  751         z->kmem_type = type;
  752         z->kmem_size = nbytes;
  753 
  754         return ((void*)z + (sizeof (struct zstd_kmem)));
  755 }
  756 
  757 /* Free allocated memory by its specific type */
  758 static void
  759 zstd_free(void *opaque __maybe_unused, void *ptr)
  760 {
  761         struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
  762         enum zstd_kmem_type type;
  763 
  764         ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
  765         ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);
  766 
  767         type = z->kmem_type;
  768         switch (type) {
  769         case ZSTD_KMEM_DEFAULT:
  770                 vmem_free(z, z->kmem_size);
  771                 break;
  772         case ZSTD_KMEM_POOL:
  773                 zstd_mempool_free(z);
  774                 break;
  775         case ZSTD_KMEM_DCTX:
  776                 mutex_exit(&zstd_dctx_fallback.barrier);
  777                 break;
  778         default:
  779                 break;
  780         }
  781 }
  782 
  783 /* Allocate fallback memory to ensure safe decompression */
  784 static void __init
  785 create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
  786 {
  787         mem->mem_size = size;
  788         mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
  789         mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
  790 }
  791 
  792 /* Initialize memory pool barrier mutexes */
  793 static void __init
  794 zstd_mempool_init(void)
  795 {
  796         zstd_mempool_cctx =
  797             kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
  798         zstd_mempool_dctx =
  799             kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
  800 
  801         for (int i = 0; i < ZSTD_POOL_MAX; i++) {
  802                 mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
  803                     MUTEX_DEFAULT, NULL);
  804                 mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
  805                     MUTEX_DEFAULT, NULL);
  806         }
  807 }
  808 
  809 /* Initialize zstd-related memory handling */
  810 static int __init
  811 zstd_meminit(void)
  812 {
  813         zstd_mempool_init();
  814 
  815         /*
  816          * Estimate the size of the fallback decompression context.
  817          * The expected size on x64 with current ZSTD should be about 160 KB.
  818          */
  819         create_fallback_mem(&zstd_dctx_fallback,
  820             P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
  821             PAGESIZE));
  822 
  823         return (0);
  824 }
  825 
  826 /* Release object from pool and free memory */
  827 static void
  828 release_pool(struct zstd_pool *pool)
  829 {
  830         mutex_destroy(&pool->barrier);
  831         vmem_free(pool->mem, pool->size);
  832         pool->mem = NULL;
  833         pool->size = 0;
  834 }
  835 
  836 /* Release memory pool objects */
  837 static void
  838 zstd_mempool_deinit(void)
  839 {
  840         for (int i = 0; i < ZSTD_POOL_MAX; i++) {
  841                 release_pool(&zstd_mempool_cctx[i]);
  842                 release_pool(&zstd_mempool_dctx[i]);
  843         }
  844 
  845         kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
  846         kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
  847         zstd_mempool_dctx = NULL;
  848         zstd_mempool_cctx = NULL;
  849 }
  850 
   851 /* Release unused memory from the pools */
  852 
  853 void
  854 zfs_zstd_cache_reap_now(void)
  855 {
  856 
  857         /*
  858          * Short-circuit if there are no buffers to begin with.
  859          */
  860         if (ZSTDSTAT(zstd_stat_buffers) == 0)
  861                 return;
  862 
  863         /*
   864          * Reap both pools, releasing cached objects that
   865          * have sat unused past their timeout.
  866          */
  867         zstd_mempool_reap(zstd_mempool_cctx);
  868         zstd_mempool_reap(zstd_mempool_dctx);
  869 }
  870 
  871 extern int __init
  872 zstd_init(void)
  873 {
   874         /* Size the pool using a sane maximum thread count: boot_ncpus * 4 */
  875         pool_count = (boot_ncpus * 4);
  876         zstd_meminit();
  877 
  878         /* Initialize kstat */
  879         zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
  880             KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
  881             KSTAT_FLAG_VIRTUAL);
  882         if (zstd_ksp != NULL) {
  883                 zstd_ksp->ks_data = &zstd_stats;
  884                 kstat_install(zstd_ksp);
  885 #ifdef _KERNEL
  886                 zstd_ksp->ks_update = kstat_zstd_update;
  887 #endif
  888         }
  889 
  890         return (0);
  891 }
  892 
  893 extern void
  894 zstd_fini(void)
  895 {
  896         /* Deinitialize kstat */
  897         if (zstd_ksp != NULL) {
  898                 kstat_delete(zstd_ksp);
  899                 zstd_ksp = NULL;
  900         }
  901 
  902         /* Release fallback memory */
  903         vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
  904         mutex_destroy(&zstd_dctx_fallback.barrier);
  905 
  906         /* Deinit memory pool */
  907         zstd_mempool_deinit();
  908 }
  909 
  910 #if defined(_KERNEL)
  911 #ifdef __FreeBSD__
  912 module_init(zstd_init);
  913 module_exit(zstd_fini);
  914 #endif
  915 
  916 ZFS_MODULE_PARAM(zfs, zstd_, earlyabort_pass, UINT, ZMOD_RW,
  917         "Enable early abort attempts when using zstd");
  918 ZFS_MODULE_PARAM(zfs, zstd_, abort_size, UINT, ZMOD_RW,
  919         "Minimal size of block to attempt early abort");
  920 #endif
