FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/os/linux/zfs/arc_os.c


/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2018, Joyent, Inc.
 * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
 */

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/zfs_refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_trim.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
#include <sys/multilist.h>
#include <sys/abd.h>
#include <sys/zil.h>
#include <sys/fm/fs/zfs.h>
#ifdef _KERNEL
#include <sys/shrinker.h>
#include <sys/vmsystm.h>
#include <sys/zpl.h>
#include <linux/page_compat.h>
#include <linux/notifier.h>
#include <linux/memory.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <sys/zthr.h>
#include <zfs_fletcher.h>
#include <sys/arc_impl.h>
#include <sys/trace_zfs.h>
#include <sys/aggsum.h>

/*
 * This is a limit on how many pages the ARC shrinker makes available for
 * eviction in response to one page allocation attempt.  Note that in
 * practice, the kernel's shrinker can ask us to evict up to about 4x this
 * for one allocation attempt.
 *
 * The default limit of 10,000 (in practice, 160MB per allocation attempt
 * with 4K pages) limits the amount of time spent attempting to reclaim ARC
 * memory to less than 100ms per allocation attempt, even with a small
 * average compressed block size of ~8KB.
 *
 * See also the comment in arc_shrinker_count().
 * Set to 0 to disable limit.
 */
int zfs_arc_shrinker_limit = 10000;
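
/*
 * Worked example of the limit above: with 4 KiB pages, one shrinker call
 * is capped at 10,000 * 4 KiB, roughly 40 MB of evictable data.  Since
 * the kernel may ask for up to ~4x the reported count per allocation
 * attempt, the practical ceiling per attempt is about 4 * 40 MB = 160 MB.
 */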

#ifdef CONFIG_MEMORY_HOTPLUG
static struct notifier_block arc_hotplug_callback_mem_nb;
#endif

/*
 * Return a default max arc size based on the amount of physical memory.
 */
uint64_t
arc_default_max(uint64_t min, uint64_t allmem)
{
        /* Default to 1/2 of all memory. */
        return (MAX(allmem / 2, min));
}

#ifdef _KERNEL
/*
 * Return the maximum amount of memory that we could possibly use.  In user
 * space, which is primarily used for testing, this is reduced to half of
 * all memory.
 */
uint64_t
arc_all_memory(void)
{
#ifdef CONFIG_HIGHMEM
        return (ptob(zfs_totalram_pages - zfs_totalhigh_pages));
#else
        return (ptob(zfs_totalram_pages));
#endif /* CONFIG_HIGHMEM */
}

/*
 * Return the amount of memory that is considered free.  In user space,
 * which is primarily used for testing, we pretend that free memory ranges
 * from 0-20% of all memory.
 */
uint64_t
arc_free_memory(void)
{
#ifdef CONFIG_HIGHMEM
        struct sysinfo si;
        si_meminfo(&si);
        return (ptob(si.freeram - si.freehigh));
#else
        return (ptob(nr_free_pages() +
            nr_inactive_file_pages()));
#endif /* CONFIG_HIGHMEM */
}

/*
 * Return the amount of memory that can be consumed before reclaim will be
 * needed.  A positive value means there is sufficient free memory; a
 * negative value indicates the amount of memory that needs to be freed up.
 */
int64_t
arc_available_memory(void)
{
        return (arc_free_memory() - arc_sys_free);
}

static uint64_t
arc_evictable_memory(void)
{
        int64_t asize = aggsum_value(&arc_sums.arcstat_size);
        uint64_t arc_clean =
            zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_DATA]) +
            zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) +
            zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_DATA]) +
            zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
        uint64_t arc_dirty = MAX((int64_t)asize - (int64_t)arc_clean, 0);

        /*
         * Scale reported evictable memory in proportion to page cache, cap
         * at specified min/max.
         */
        uint64_t min = (ptob(nr_file_pages()) / 100) * zfs_arc_pc_percent;
        min = MAX(arc_c_min, MIN(arc_c_max, min));

        if (arc_dirty >= min)
                return (arc_clean);

        return (MAX((int64_t)asize - (int64_t)min, 0));
}
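
/*
 * Worked example of the scaling above (parameter values are hypothetical):
 * with 10 GiB of page cache and zfs_arc_pc_percent=30, the floor is
 * min = 3 GiB (assuming that falls within [arc_c_min, arc_c_max]).  If
 * the ARC holds 8 GiB of which 6 GiB is clean, then arc_dirty = 2 GiB,
 * which is below min, so we report only asize - min = 5 GiB as evictable
 * rather than the full 6 GiB of clean data.
 */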

/*
 * The _count() function returns the number of free-able objects.
 * The _scan() function returns the number of objects that were freed.
 */
static unsigned long
arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
{
        /*
         * __GFP_FS won't be set if we are called from ZFS code (see
         * kmem_flags_convert(), which removes it).  To avoid a deadlock, we
         * don't allow evicting in this case.  We return 0 rather than
         * SHRINK_STOP so that the shrinker logic doesn't accumulate a
         * deficit against us.
         */
        if (!(sc->gfp_mask & __GFP_FS)) {
                return (0);
        }

        /*
         * This code is reached in the "direct reclaim" case, where the
         * kernel (outside ZFS) is trying to allocate a page, and the system
         * is low on memory.
         *
         * The kernel's shrinker code doesn't understand how many pages the
         * ARC's callback actually frees, so it may ask the ARC to shrink a
         * lot for one page allocation. This is problematic because it may
         * take a long time, thus delaying the page allocation, and because
         * it may force the ARC to unnecessarily shrink to a very small size.
         *
         * Therefore, we limit the amount of data that we say is evictable,
         * which limits the amount that the shrinker will ask us to evict for
         * one page allocation attempt.
         *
         * In practice, we may be asked to shrink 4x the limit to satisfy one
         * page allocation, before the kernel's shrinker code gives up on us.
         * When that happens, we rely on the kernel code to find the pages
         * that we freed before invoking the OOM killer.  This happens in
         * __alloc_pages_slowpath(), which retries and finds the pages we
         * freed when it calls get_page_from_freelist().
         *
         * See also the comment above zfs_arc_shrinker_limit.
         */
        int64_t limit = zfs_arc_shrinker_limit != 0 ?
            zfs_arc_shrinker_limit : INT64_MAX;
        return (MIN(limit, btop((int64_t)arc_evictable_memory())));
}

static unsigned long
arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
{
        ASSERT((sc->gfp_mask & __GFP_FS) != 0);

        /* The arc is considered warm once reclaim has occurred */
        if (unlikely(arc_warm == B_FALSE))
                arc_warm = B_TRUE;

        /*
         * Evict the requested number of pages by reducing arc_c and waiting
         * for the requested amount of data to be evicted.
         */
        arc_reduce_target_size(ptob(sc->nr_to_scan));
        arc_wait_for_eviction(ptob(sc->nr_to_scan), B_FALSE);
        if (current->reclaim_state != NULL)
                current->reclaim_state->reclaimed_slab += sc->nr_to_scan;

        /*
         * We are experiencing memory pressure which the arc_evict_zthr was
         * unable to keep up with. Set arc_no_grow to briefly pause arc
         * growth to avoid compounding the memory pressure.
         */
        arc_no_grow = B_TRUE;

        /*
         * When direct reclaim is observed it usually indicates a rapid
         * increase in memory pressure.  This occurs because the kswapd
         * threads were unable to asynchronously keep enough free memory
         * available.
         */
        if (current_is_kswapd()) {
                ARCSTAT_BUMP(arcstat_memory_indirect_count);
        } else {
                ARCSTAT_BUMP(arcstat_memory_direct_count);
        }

        return (sc->nr_to_scan);
}

SPL_SHRINKER_DECLARE(arc_shrinker,
    arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS);

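/*
 * For reference, a minimal open-coded sketch of what the
 * SPL_SHRINKER_DECLARE() above wraps, assuming the count/scan shrinker
 * API of Linux 3.12 and later (the SPL macro hides per-kernel-version
 * differences in registration):
 */
#if 0
static struct shrinker arc_shrinker = {
        .count_objects = arc_shrinker_count,    /* how much could be freed */
        .scan_objects = arc_shrinker_scan,      /* free up to nr_to_scan */
        .seeks = DEFAULT_SEEKS,                 /* relative refill cost */
};
#endif
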
int
arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
{
        uint64_t free_memory = arc_free_memory();

        if (free_memory > arc_all_memory() * arc_lotsfree_percent / 100)
                return (0);

        if (txg > spa->spa_lowmem_last_txg) {
                spa->spa_lowmem_last_txg = txg;
                spa->spa_lowmem_page_load = 0;
        }
        /*
         * If we are in pageout, we know that memory is already tight and
         * the arc is already going to be evicting, so we just want to
         * continue to let page writes occur as quickly as possible.
         */
        if (current_is_kswapd()) {
                if (spa->spa_lowmem_page_load >
                    MAX(arc_sys_free / 4, free_memory) / 4) {
                        DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
                        return (SET_ERROR(ERESTART));
                }
                /* Note: reserve is inflated, so we deflate */
                atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8);
                return (0);
        } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) {
                /* memory is low, delay before restarting */
                ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
                DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
                return (SET_ERROR(EAGAIN));
        }
        spa->spa_lowmem_page_load = 0;
        return (0);
}
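
/*
 * Worked example of the kswapd throttle above (numbers are hypothetical):
 * with arc_sys_free = 400 MiB and free_memory = 50 MiB, writes are
 * throttled with ERESTART once spa_lowmem_page_load exceeds
 * MAX(100 MiB, 50 MiB) / 4 = 25 MiB.  Each reservation adds reserve / 8
 * to the page load, and the counter resets when a new txg opens or when
 * memory pressure subsides.
 */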

static void
arc_set_sys_free(uint64_t allmem)
{
        /*
         * The ARC tries to keep at least this much memory available for the
         * system.  This gives the ARC time to shrink in response to memory
         * pressure, before running completely out of memory and invoking the
         * direct-reclaim ARC shrinker.
         *
         * This should be more than twice high_wmark_pages(), so that
         * arc_wait_for_eviction() will wait until at least the
         * high_wmark_pages() are free (see arc_evict_state_impl()).
         *
         * Note: Even when the system is very low on memory, the kernel's
         * shrinker code may only ask for one "batch" of pages (512KB) to be
         * evicted.  If concurrent allocations consume these pages, there may
         * still be insufficient free pages, and the OOM killer takes action.
         *
         * By setting arc_sys_free large enough, and having
         * arc_wait_for_eviction() wait until there is at least arc_sys_free/2
         * free memory, it is much less likely that concurrent allocations can
         * consume all the memory that was evicted before checking for
         * OOM.
         *
         * It's hard to iterate the zones from a linux kernel module, which
         * makes it difficult to determine the watermark dynamically. Instead
         * we compute the maximum high watermark for this system, based
         * on the amount of memory, assuming default parameters on Linux kernel
         * 5.3.
         */

        /*
         * Base wmark_low is 4 * the square root of Kbytes of RAM.
         */
        long wmark = 4 * int_sqrt(allmem/1024) * 1024;

        /*
         * Clamp to between 128K and 64MB.
         */
        wmark = MAX(wmark, 128 * 1024);
        wmark = MIN(wmark, 64 * 1024 * 1024);

        /*
         * watermark_boost can increase the wmark by up to 150%.
         */
        wmark += wmark * 150 / 100;

        /*
         * arc_sys_free needs to be more than 2x the watermark, because
         * arc_wait_for_eviction() waits for half of arc_sys_free.  Bump this up
         * to 3x to ensure we're above it.
         */
        arc_sys_free = wmark * 3 + allmem / 32;
}
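
/*
 * Worked example of the computation above: on a 16 GiB machine,
 * int_sqrt(16 GiB / 1024) = int_sqrt(16777216) = 4096, so the base wmark
 * is 4 * 4096 * 1024 = 16 MiB, already within the [128 KiB, 64 MiB]
 * clamp.  The boost raises it to 16 + 24 = 40 MiB, giving
 * arc_sys_free = 3 * 40 MiB + 16 GiB / 32 = 120 MiB + 512 MiB = 632 MiB.
 */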

void
arc_lowmem_init(void)
{
        uint64_t allmem = arc_all_memory();

        /*
         * Register a shrinker to support synchronous (direct) memory
         * reclaim from the arc.  This is done to prevent kswapd from
         * swapping out pages when it is preferable to shrink the arc.
         */
        spl_register_shrinker(&arc_shrinker);
        arc_set_sys_free(allmem);
}

void
arc_lowmem_fini(void)
{
        spl_unregister_shrinker(&arc_shrinker);
}

int
param_set_arc_u64(const char *buf, zfs_kernel_param_t *kp)
{
        int error;

        error = spl_param_set_u64(buf, kp);
        if (error < 0)
                return (SET_ERROR(error));

        arc_tuning_update(B_TRUE);

        return (0);
}

int
param_set_arc_min(const char *buf, zfs_kernel_param_t *kp)
{
        return (param_set_arc_u64(buf, kp));
}

int
param_set_arc_max(const char *buf, zfs_kernel_param_t *kp)
{
        return (param_set_arc_u64(buf, kp));
}

int
param_set_arc_int(const char *buf, zfs_kernel_param_t *kp)
{
        int error;

        error = param_set_int(buf, kp);
        if (error < 0)
                return (SET_ERROR(error));

        arc_tuning_update(B_TRUE);

        return (0);
}

#ifdef CONFIG_MEMORY_HOTPLUG
static int
arc_hotplug_callback(struct notifier_block *self, unsigned long action,
    void *arg)
{
        (void) self, (void) arg;
        uint64_t allmem = arc_all_memory();
        if (action != MEM_ONLINE)
                return (NOTIFY_OK);

        arc_set_limits(allmem);

#ifdef __LP64__
        if (zfs_dirty_data_max_max == 0)
                zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024,
                    allmem * zfs_dirty_data_max_max_percent / 100);
#else
        if (zfs_dirty_data_max_max == 0)
                zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024,
                    allmem * zfs_dirty_data_max_max_percent / 100);
#endif

        arc_set_sys_free(allmem);
        return (NOTIFY_OK);
}
#endif
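
/*
 * Worked example of the dirty-data cap above (assuming the default
 * zfs_dirty_data_max_max_percent of 25): on a 64-bit system with 16 GiB
 * of memory, the percentage term is 16 GiB * 25 / 100 = 4 GiB, so
 * zfs_dirty_data_max_max = MIN(4 GiB, 4 GiB) = 4 GiB; with 8 GiB of
 * memory it would be MIN(4 GiB, 2 GiB) = 2 GiB.
 */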

void
arc_register_hotplug(void)
{
#ifdef CONFIG_MEMORY_HOTPLUG
        arc_hotplug_callback_mem_nb.notifier_call = arc_hotplug_callback;
        /* There is no significance to the value 100 */
        arc_hotplug_callback_mem_nb.priority = 100;
        register_memory_notifier(&arc_hotplug_callback_mem_nb);
#endif
}

void
arc_unregister_hotplug(void)
{
#ifdef CONFIG_MEMORY_HOTPLUG
        unregister_memory_notifier(&arc_hotplug_callback_mem_nb);
#endif
}
#else /* _KERNEL */
int64_t
arc_available_memory(void)
{
        int64_t lowest = INT64_MAX;

        /* Every 100 calls, free a small amount */
        if (random_in_range(100) == 0)
                lowest = -1024;

        return (lowest);
}

int
arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
{
        (void) spa, (void) reserve, (void) txg;
        return (0);
}

uint64_t
arc_all_memory(void)
{
        return (ptob(physmem) / 2);
}

uint64_t
arc_free_memory(void)
{
        return (random_in_range(arc_all_memory() * 20 / 100));
}

void
arc_register_hotplug(void)
{
}

void
arc_unregister_hotplug(void)
{
}
#endif /* _KERNEL */

/*
 * Helper function for arc_prune_async(); it is responsible for safely
 * handling the execution of a registered arc_prune_func_t.
 */
static void
arc_prune_task(void *ptr)
{
        arc_prune_t *ap = (arc_prune_t *)ptr;
        arc_prune_func_t *func = ap->p_pfunc;

        if (func != NULL)
                func(ap->p_adjust, ap->p_private);

        zfs_refcount_remove(&ap->p_refcnt, func);
}

/*
 * Notify registered consumers that they must drop holds on a portion of the
 * ARC buffers they reference.  This provides a mechanism to ensure the ARC
 * can honor the arc_meta_limit and reclaim otherwise pinned ARC buffers.
 * This is analogous to dnlc_reduce_cache() but more generic.
 *
 * This operation is performed asynchronously so it may be safely called
 * in the context of the arc_reclaim_thread().  A reference is taken here
 * for each registered arc_prune_t and the arc_prune_task() is responsible
 * for releasing it once the registered arc_prune_func_t has completed.
 */
void
arc_prune_async(uint64_t adjust)
{
        arc_prune_t *ap;

        mutex_enter(&arc_prune_mtx);
        for (ap = list_head(&arc_prune_list); ap != NULL;
            ap = list_next(&arc_prune_list, ap)) {

                if (zfs_refcount_count(&ap->p_refcnt) >= 2)
                        continue;

                zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc);
                ap->p_adjust = adjust;
                if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
                    ap, TQ_SLEEP) == TASKQID_INVALID) {
                        zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc);
                        continue;
                }
                ARCSTAT_BUMP(arcstat_prune);
        }
        mutex_exit(&arc_prune_mtx);
}
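
/*
 * Usage sketch (the callback and names below are hypothetical; this
 * assumes the registration API declared in sys/arc.h): a consumer such as
 * the ZPL registers a prune callback, which arc_prune_async() later
 * dispatches when pinned buffers must be released.
 */
#if 0
static arc_prune_t *example_prune;

static void
example_prune_cb(uint64_t nr_to_scan, void *priv)
{
        /* Drop up to nr_to_scan holds that pin ARC buffers. */
}

static void
example_setup(void)
{
        example_prune = arc_add_prune_callback(example_prune_cb, NULL);
}

static void
example_teardown(void)
{
        arc_remove_prune_callback(example_prune);
}
#endif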

ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW,
        "Limit on number of pages that ARC shrinker can reclaim at once");
