FreeBSD/Linux Kernel Cross Reference
sys/vm/uma_core.c

    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2002-2019 Jeffrey Roberson <jeff@FreeBSD.org>
    5  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
    6  * Copyright (c) 2004-2006 Robert N. M. Watson
    7  * All rights reserved.
    8  *
    9  * Redistribution and use in source and binary forms, with or without
   10  * modification, are permitted provided that the following conditions
   11  * are met:
   12  * 1. Redistributions of source code must retain the above copyright
   13  *    notice unmodified, this list of conditions, and the following
   14  *    disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  *
   19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
   20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
   22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
   23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
   24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
   28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   29  */
   30 
   31 /*
    32  * uma_core.c  Implementation of the Universal Memory Allocator
   33  *
   34  * This allocator is intended to replace the multitude of similar object caches
   35  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
   36  * efficient.  A primary design goal is to return unused memory to the rest of
   37  * the system.  This will make the system as a whole more flexible due to the
   38  * ability to move memory to subsystems which most need it instead of leaving
   39  * pools of reserved memory unused.
   40  *
   41  * The basic ideas stem from similar slab/zone based allocators whose algorithms
   42  * are well known.
   43  *
   44  */
   45 
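       /*
        * The consumer-facing interface implemented here is documented in
        * uma(9).  A minimal sketch of typical use (illustrative only; the
        * "foo" names are hypothetical):
        *
        *      static uma_zone_t foo_zone;
        *
        *      foo_zone = uma_zcreate("foo", sizeof(struct foo),
        *          NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
        *      struct foo *p = uma_zalloc(foo_zone, M_WAITOK);
        *      ...
        *      uma_zfree(foo_zone, p);
        *      uma_zdestroy(foo_zone);
        */
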
   46 /*
   47  * TODO:
   48  *      - Improve memory usage for large allocations
   49  *      - Investigate cache size adjustments
   50  */
   51 
   52 #include <sys/cdefs.h>
   53 __FBSDID("$FreeBSD$");
   54 
   55 #include "opt_ddb.h"
   56 #include "opt_param.h"
   57 #include "opt_vm.h"
   58 
   59 #include <sys/param.h>
   60 #include <sys/systm.h>
   61 #include <sys/bitset.h>
   62 #include <sys/domainset.h>
   63 #include <sys/eventhandler.h>
   64 #include <sys/kernel.h>
   65 #include <sys/types.h>
   66 #include <sys/limits.h>
   67 #include <sys/queue.h>
   68 #include <sys/malloc.h>
   69 #include <sys/ktr.h>
   70 #include <sys/lock.h>
   71 #include <sys/sysctl.h>
   72 #include <sys/mutex.h>
   73 #include <sys/proc.h>
   74 #include <sys/random.h>
   75 #include <sys/rwlock.h>
   76 #include <sys/sbuf.h>
   77 #include <sys/sched.h>
   78 #include <sys/sleepqueue.h>
   79 #include <sys/smp.h>
   80 #include <sys/smr.h>
   81 #include <sys/taskqueue.h>
   82 #include <sys/vmmeter.h>
   83 
   84 #include <vm/vm.h>
   85 #include <vm/vm_param.h>
   86 #include <vm/vm_domainset.h>
   87 #include <vm/vm_object.h>
   88 #include <vm/vm_page.h>
   89 #include <vm/vm_pageout.h>
   90 #include <vm/vm_phys.h>
   91 #include <vm/vm_pagequeue.h>
   92 #include <vm/vm_map.h>
   93 #include <vm/vm_kern.h>
   94 #include <vm/vm_extern.h>
   95 #include <vm/vm_dumpset.h>
   96 #include <vm/uma.h>
   97 #include <vm/uma_int.h>
   98 #include <vm/uma_dbg.h>
   99 
  100 #include <ddb/ddb.h>
  101 
  102 #ifdef DEBUG_MEMGUARD
  103 #include <vm/memguard.h>
  104 #endif
  105 
  106 #include <machine/md_var.h>
  107 
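       /*
        * Under INVARIANTS, item constructors and destructors are run
        * unconditionally so that the trashing/use-after-free checks see
        * every item, even in zones that supply no ctor/dtor of their own.
        */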
  108 #ifdef INVARIANTS
  109 #define UMA_ALWAYS_CTORDTOR     1
  110 #else
  111 #define UMA_ALWAYS_CTORDTOR     0
  112 #endif
  113 
  114 /*
  115  * This is the zone and keg from which all zones are spawned.
  116  */
  117 static uma_zone_t kegs;
  118 static uma_zone_t zones;
  119 
  120 /*
  121  * On INVARIANTS builds, the slab contains a second bitset of the same size,
  122  * "dbg_bits", which is laid out immediately after us_free.
  123  */
  124 #ifdef INVARIANTS
  125 #define SLAB_BITSETS    2
  126 #else
  127 #define SLAB_BITSETS    1
  128 #endif
  129 
  130 /*
  131  * These are the two zones from which all offpage uma_slab_ts are allocated.
  132  *
  133  * One zone is for slab headers that can represent a larger number of items,
  134  * making the slabs themselves more efficient, and the other zone is for
  135  * headers that are smaller and represent fewer items, making the headers more
  136  * efficient.
  137  */
  138 #define SLABZONE_SIZE(setsize)                                  \
  139     (sizeof(struct uma_hash_slab) + BITSET_SIZE(setsize) * SLAB_BITSETS)
  140 #define SLABZONE0_SETSIZE       (PAGE_SIZE / 16)
  141 #define SLABZONE1_SETSIZE       SLAB_MAX_SETSIZE
  142 #define SLABZONE0_SIZE  SLABZONE_SIZE(SLABZONE0_SETSIZE)
  143 #define SLABZONE1_SIZE  SLABZONE_SIZE(SLABZONE1_SETSIZE)
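       /*
        * With 4KB pages, SLABZONE0_SETSIZE works out to 256, so offpage kegs
        * with at most 256 items per slab use the smaller header zone and
        * larger slabs use the SLAB_MAX_SETSIZE zone (a worked example of the
        * macros above, not an additional constraint).
        */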
  144 static uma_zone_t slabzones[2];
  145 
  146 /*
  147  * The initial hash tables come out of this zone so they can be allocated
  148  * prior to malloc coming up.
  149  */
  150 static uma_zone_t hashzone;
  151 
  152 /* The boot-time adjusted value for cache line alignment. */
  153 int uma_align_cache = 64 - 1;
  154 
  155 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
  156 static MALLOC_DEFINE(M_UMA, "UMA", "UMA Misc");
  157 
  158 /*
  159  * Are we allowed to allocate buckets?
  160  */
  161 static int bucketdisable = 1;
  162 
  163 /* Linked list of all kegs in the system */
  164 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
  165 
  166 /* Linked list of all cache-only zones in the system */
  167 static LIST_HEAD(,uma_zone) uma_cachezones =
  168     LIST_HEAD_INITIALIZER(uma_cachezones);
  169 
  170 /* This RW lock protects the keg list */
  171 static struct rwlock_padalign __exclusive_cache_line uma_rwlock;
  172 
  173 /*
   174  * First available virtual address for boot time allocations.
  175  */
  176 static vm_offset_t bootstart;
  177 static vm_offset_t bootmem;
  178 
  179 static struct sx uma_reclaim_lock;
  180 
  181 /*
  182  * kmem soft limit, initialized by uma_set_limit().  Ensure that early
  183  * allocations don't trigger a wakeup of the reclaim thread.
  184  */
  185 unsigned long uma_kmem_limit = LONG_MAX;
  186 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_limit, CTLFLAG_RD, &uma_kmem_limit, 0,
  187     "UMA kernel memory soft limit");
  188 unsigned long uma_kmem_total;
  189 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0,
  190     "UMA kernel memory usage");
  191 
  192 /* Is the VM done starting up? */
  193 static enum {
  194         BOOT_COLD,
  195         BOOT_KVA,
  196         BOOT_PCPU,
  197         BOOT_RUNNING,
  198         BOOT_SHUTDOWN,
  199 } booted = BOOT_COLD;
  200 
  201 /*
  202  * This is the handle used to schedule events that need to happen
  203  * outside of the allocation fast path.
  204  */
  205 static struct callout uma_callout;
  206 #define UMA_TIMEOUT     20              /* Seconds for callout interval. */
  207 
  208 /*
  209  * This structure is passed as the zone ctor arg so that I don't have to create
  210  * a special allocation function just for zones.
  211  */
  212 struct uma_zctor_args {
  213         const char *name;
  214         size_t size;
  215         uma_ctor ctor;
  216         uma_dtor dtor;
  217         uma_init uminit;
  218         uma_fini fini;
  219         uma_import import;
  220         uma_release release;
  221         void *arg;
  222         uma_keg_t keg;
  223         int align;
  224         uint32_t flags;
  225 };
  226 
  227 struct uma_kctor_args {
  228         uma_zone_t zone;
  229         size_t size;
  230         uma_init uminit;
  231         uma_fini fini;
  232         int align;
  233         uint32_t flags;
  234 };
  235 
  236 struct uma_bucket_zone {
  237         uma_zone_t      ubz_zone;
  238         const char      *ubz_name;
  239         int             ubz_entries;    /* Number of items it can hold. */
  240         int             ubz_maxsize;    /* Maximum allocation size per-item. */
  241 };
  242 
  243 /*
   244  * Compute the actual number of bucket entries so that the whole bucket
   245  * packs into a power-of-two size, for more efficient space utilization.
  246  */
  247 #define BUCKET_SIZE(n)                                          \
  248     (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))
  249 
  250 #define BUCKET_MAX      BUCKET_SIZE(256)
  251 
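       /*
        * For example, with 8-byte pointers BUCKET_SIZE(32) yields
        * 32 - howmany(sizeof(struct uma_bucket), sizeof(void *)) usable
        * entries, so the header plus the pointer array still fit in a
        * 32-pointer, power-of-two sized allocation.
        */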
  252 struct uma_bucket_zone bucket_zones[] = {
  253         /* Literal bucket sizes. */
  254         { NULL, "2 Bucket", 2, 4096 },
  255         { NULL, "4 Bucket", 4, 3072 },
  256         { NULL, "8 Bucket", 8, 2048 },
  257         { NULL, "16 Bucket", 16, 1024 },
  258         /* Rounded down power of 2 sizes for efficiency. */
  259         { NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
  260         { NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
  261         { NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
  262         { NULL, "256 Bucket", BUCKET_SIZE(256), 64 },
  263         { NULL, NULL, 0}
  264 };
  265 
  266 /*
  267  * Flags and enumerations to be passed to internal functions.
  268  */
  269 enum zfreeskip {
  270         SKIP_NONE =     0,
  271         SKIP_CNT =      0x00000001,
  272         SKIP_DTOR =     0x00010000,
  273         SKIP_FINI =     0x00020000,
  274 };
  275 
   276 /* Prototypes. */
  277 
  278 void    uma_startup1(vm_offset_t);
  279 void    uma_startup2(void);
  280 
  281 static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
  282 static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
  283 static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
  284 static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
  285 static void *contig_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
  286 static void page_free(void *, vm_size_t, uint8_t);
  287 static void pcpu_page_free(void *, vm_size_t, uint8_t);
  288 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int);
  289 static void cache_drain(uma_zone_t);
  290 static void bucket_drain(uma_zone_t, uma_bucket_t);
  291 static void bucket_cache_reclaim(uma_zone_t zone, bool);
  292 static int keg_ctor(void *, int, void *, int);
  293 static void keg_dtor(void *, int, void *);
  294 static int zone_ctor(void *, int, void *, int);
  295 static void zone_dtor(void *, int, void *);
  296 static inline void item_dtor(uma_zone_t zone, void *item, int size,
  297     void *udata, enum zfreeskip skip);
  298 static int zero_init(void *, int, int);
  299 static void zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata,
  300     int itemdomain, bool ws);
  301 static void zone_foreach(void (*zfunc)(uma_zone_t, void *), void *);
  302 static void zone_foreach_unlocked(void (*zfunc)(uma_zone_t, void *), void *);
  303 static void zone_timeout(uma_zone_t zone, void *);
  304 static int hash_alloc(struct uma_hash *, u_int);
  305 static int hash_expand(struct uma_hash *, struct uma_hash *);
  306 static void hash_free(struct uma_hash *hash);
  307 static void uma_timeout(void *);
  308 static void uma_shutdown(void);
  309 static void *zone_alloc_item(uma_zone_t, void *, int, int);
  310 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
  311 static int zone_alloc_limit(uma_zone_t zone, int count, int flags);
  312 static void zone_free_limit(uma_zone_t zone, int count);
  313 static void bucket_enable(void);
  314 static void bucket_init(void);
  315 static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
  316 static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
  317 static void bucket_zone_drain(void);
  318 static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int);
  319 static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
  320 static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item);
  321 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
  322     uma_fini fini, int align, uint32_t flags);
  323 static int zone_import(void *, void **, int, int, int);
  324 static void zone_release(void *, void **, int);
  325 static bool cache_alloc(uma_zone_t, uma_cache_t, void *, int);
  326 static bool cache_free(uma_zone_t, uma_cache_t, void *, void *, int);
  327 
  328 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
  329 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
  330 static int sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS);
  331 static int sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS);
  332 static int sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS);
  333 static int sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS);
  334 static int sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS);
  335 
  336 static uint64_t uma_zone_get_allocs(uma_zone_t zone);
  337 
  338 static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
  339     "Memory allocation debugging");
  340 
  341 #ifdef INVARIANTS
  342 static uint64_t uma_keg_get_allocs(uma_keg_t zone);
  343 static inline struct noslabbits *slab_dbg_bits(uma_slab_t slab, uma_keg_t keg);
  344 
  345 static bool uma_dbg_kskip(uma_keg_t keg, void *mem);
  346 static bool uma_dbg_zskip(uma_zone_t zone, void *mem);
  347 static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
  348 static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
  349 
  350 static u_int dbg_divisor = 1;
  351 SYSCTL_UINT(_vm_debug, OID_AUTO, divisor,
  352     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0,
   353     "Debug & thrash every nth item in the memory allocator");
  354 
  355 static counter_u64_t uma_dbg_cnt = EARLY_COUNTER;
  356 static counter_u64_t uma_skip_cnt = EARLY_COUNTER;
  357 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD,
  358     &uma_dbg_cnt, "memory items debugged");
  359 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD,
  360     &uma_skip_cnt, "memory items skipped, not debugged");
  361 #endif
  362 
  363 SYSCTL_NODE(_vm, OID_AUTO, uma, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  364     "Universal Memory Allocator");
  365 
  366 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLTYPE_INT,
  367     0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
  368 
  369 SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLTYPE_STRUCT,
  370     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
  371 
  372 static int zone_warnings = 1;
  373 SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
   374     "Warn when a UMA zone becomes full");
  375 
  376 static int multipage_slabs = 1;
  377 TUNABLE_INT("vm.debug.uma_multipage_slabs", &multipage_slabs);
  378 SYSCTL_INT(_vm_debug, OID_AUTO, uma_multipage_slabs,
  379     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &multipage_slabs, 0,
  380     "UMA may choose larger slab sizes for better efficiency");
  381 
  382 /*
  383  * Select the slab zone for an offpage slab with the given maximum item count.
  384  */
  385 static inline uma_zone_t
  386 slabzone(int ipers)
  387 {
  388 
  389         return (slabzones[ipers > SLABZONE0_SETSIZE]);
  390 }
  391 
  392 /*
  393  * This routine checks to see whether or not it's safe to enable buckets.
  394  */
  395 static void
  396 bucket_enable(void)
  397 {
  398 
  399         KASSERT(booted >= BOOT_KVA, ("Bucket enable before init"));
  400         bucketdisable = vm_page_count_min();
  401 }
  402 
  403 /*
  404  * Initialize bucket_zones, the array of zones of buckets of various sizes.
  405  *
  406  * For each zone, calculate the memory required for each bucket, consisting
  407  * of the header and an array of pointers.
  408  */
  409 static void
  410 bucket_init(void)
  411 {
  412         struct uma_bucket_zone *ubz;
  413         int size;
  414 
  415         for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
  416                 size = roundup(sizeof(struct uma_bucket), sizeof(void *));
  417                 size += sizeof(void *) * ubz->ubz_entries;
  418                 ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
  419                     NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
  420                     UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET |
  421                     UMA_ZONE_FIRSTTOUCH);
  422         }
  423 }
  424 
  425 /*
  426  * Given a desired number of entries for a bucket, return the zone from which
  427  * to allocate the bucket.
  428  */
  429 static struct uma_bucket_zone *
  430 bucket_zone_lookup(int entries)
  431 {
  432         struct uma_bucket_zone *ubz;
  433 
  434         for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
  435                 if (ubz->ubz_entries >= entries)
  436                         return (ubz);
  437         ubz--;
  438         return (ubz);
  439 }
  440 
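       /*
        * Choose a default bucket entry count for a zone with items of the
        * given size, bounding roughly how much memory a full bucket caches.
        * For example, a 300-byte item falls through to the "32 Bucket" row
        * above (maxsize 512) and returns BUCKET_SIZE(32), while items larger
        * than 4096 bytes get a proportionally reduced count, never less
        * than 1.
        */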
  441 static int
  442 bucket_select(int size)
  443 {
  444         struct uma_bucket_zone *ubz;
  445 
  446         ubz = &bucket_zones[0];
  447         if (size > ubz->ubz_maxsize)
  448                 return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1);
  449 
  450         for (; ubz->ubz_entries != 0; ubz++)
  451                 if (ubz->ubz_maxsize < size)
  452                         break;
  453         ubz--;
  454         return (ubz->ubz_entries);
  455 }
  456 
  457 static uma_bucket_t
  458 bucket_alloc(uma_zone_t zone, void *udata, int flags)
  459 {
  460         struct uma_bucket_zone *ubz;
  461         uma_bucket_t bucket;
  462 
  463         /*
  464          * Don't allocate buckets early in boot.
  465          */
  466         if (__predict_false(booted < BOOT_KVA))
  467                 return (NULL);
  468 
  469         /*
  470          * To limit bucket recursion we store the original zone flags
  471          * in a cookie passed via zalloc_arg/zfree_arg.  This allows the
  472          * NOVM flag to persist even through deep recursions.  We also
  473          * store ZFLAG_BUCKET once we have recursed attempting to allocate
  474          * a bucket for a bucket zone so we do not allow infinite bucket
  475          * recursion.  This cookie will even persist to frees of unused
  476          * buckets via the allocation path or bucket allocations in the
  477          * free path.
  478          */
  479         if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
  480                 udata = (void *)(uintptr_t)zone->uz_flags;
  481         else {
  482                 if ((uintptr_t)udata & UMA_ZFLAG_BUCKET)
  483                         return (NULL);
  484                 udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET);
  485         }
  486         if (((uintptr_t)udata & UMA_ZONE_VM) != 0)
  487                 flags |= M_NOVM;
  488         ubz = bucket_zone_lookup(atomic_load_16(&zone->uz_bucket_size));
  489         if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0)
  490                 ubz++;
  491         bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags);
  492         if (bucket) {
  493 #ifdef INVARIANTS
  494                 bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
  495 #endif
  496                 bucket->ub_cnt = 0;
  497                 bucket->ub_entries = min(ubz->ubz_entries,
  498                     zone->uz_bucket_size_max);
  499                 bucket->ub_seq = SMR_SEQ_INVALID;
  500                 CTR3(KTR_UMA, "bucket_alloc: zone %s(%p) allocated bucket %p",
  501                     zone->uz_name, zone, bucket);
  502         }
  503 
  504         return (bucket);
  505 }
  506 
  507 static void
  508 bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
  509 {
  510         struct uma_bucket_zone *ubz;
  511 
  512         if (bucket->ub_cnt != 0)
  513                 bucket_drain(zone, bucket);
  514 
  515         KASSERT(bucket->ub_cnt == 0,
  516             ("bucket_free: Freeing a non free bucket."));
  517         KASSERT(bucket->ub_seq == SMR_SEQ_INVALID,
  518             ("bucket_free: Freeing an SMR bucket."));
  519         if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
  520                 udata = (void *)(uintptr_t)zone->uz_flags;
  521         ubz = bucket_zone_lookup(bucket->ub_entries);
  522         uma_zfree_arg(ubz->ubz_zone, bucket, udata);
  523 }
  524 
  525 static void
  526 bucket_zone_drain(void)
  527 {
  528         struct uma_bucket_zone *ubz;
  529 
  530         for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
  531                 uma_zone_reclaim(ubz->ubz_zone, UMA_RECLAIM_DRAIN);
  532 }
  533 
  534 /*
  535  * Acquire the domain lock and record contention.
  536  */
  537 static uma_zone_domain_t
  538 zone_domain_lock(uma_zone_t zone, int domain)
  539 {
  540         uma_zone_domain_t zdom;
  541         bool lockfail;
  542 
  543         zdom = ZDOM_GET(zone, domain);
  544         lockfail = false;
  545         if (ZDOM_OWNED(zdom))
  546                 lockfail = true;
  547         ZDOM_LOCK(zdom);
  548         /* This is unsynchronized.  The counter does not need to be precise. */
  549         if (lockfail && zone->uz_bucket_size < zone->uz_bucket_size_max)
  550                 zone->uz_bucket_size++;
  551         return (zdom);
  552 }
  553 
  554 /*
  555  * Search for the domain with the least cached items and return it if it
  556  * is out of balance with the preferred domain.
  557  */
  558 static __noinline int
  559 zone_domain_lowest(uma_zone_t zone, int pref)
  560 {
  561         long least, nitems, prefitems;
  562         int domain;
  563         int i;
  564 
  565         prefitems = least = LONG_MAX;
  566         domain = 0;
  567         for (i = 0; i < vm_ndomains; i++) {
  568                 nitems = ZDOM_GET(zone, i)->uzd_nitems;
  569                 if (nitems < least) {
  570                         domain = i;
  571                         least = nitems;
  572                 }
  573                 if (domain == pref)
  574                         prefitems = nitems;
  575         }
  576         if (prefitems < least * 2)
  577                 return (pref);
  578 
  579         return (domain);
  580 }
  581 
  582 /*
  583  * Search for the domain with the most cached items and return it or the
  584  * preferred domain if it has enough to proceed.
  585  */
  586 static __noinline int
  587 zone_domain_highest(uma_zone_t zone, int pref)
  588 {
  589         long most, nitems;
  590         int domain;
  591         int i;
  592 
  593         if (ZDOM_GET(zone, pref)->uzd_nitems > BUCKET_MAX)
  594                 return (pref);
  595 
  596         most = 0;
  597         domain = 0;
  598         for (i = 0; i < vm_ndomains; i++) {
  599                 nitems = ZDOM_GET(zone, i)->uzd_nitems;
  600                 if (nitems > most) {
  601                         domain = i;
  602                         most = nitems;
  603                 }
  604         }
  605 
  606         return (domain);
  607 }
  608 
  609 /*
  610  * Safely subtract cnt from imax.
  611  */
  612 static void
  613 zone_domain_imax_sub(uma_zone_domain_t zdom, int cnt)
  614 {
  615         long new;
  616         long old;
  617 
  618         old = zdom->uzd_imax;
  619         do {
  620                 if (old <= cnt)
  621                         new = 0;
  622                 else
  623                         new = old - cnt;
  624         } while (atomic_fcmpset_long(&zdom->uzd_imax, &old, new) == 0);
  625 }
  626 
  627 /*
  628  * Set the maximum imax value.
  629  */
  630 static void
  631 zone_domain_imax_set(uma_zone_domain_t zdom, int nitems)
  632 {
  633         long old;
  634 
  635         old = zdom->uzd_imax;
  636         do {
  637                 if (old >= nitems)
  638                         break;
  639         } while (atomic_fcmpset_long(&zdom->uzd_imax, &old, nitems) == 0);
  640 }
  641 
  642 /*
  643  * Attempt to satisfy an allocation by retrieving a full bucket from one of the
  644  * zone's caches.  If a bucket is found the zone is not locked on return.
  645  */
  646 static uma_bucket_t
  647 zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom, bool reclaim)
  648 {
  649         uma_bucket_t bucket;
  650         int i;
  651         bool dtor = false;
  652 
  653         ZDOM_LOCK_ASSERT(zdom);
  654 
  655         if ((bucket = STAILQ_FIRST(&zdom->uzd_buckets)) == NULL)
  656                 return (NULL);
  657 
  658         /* SMR Buckets can not be re-used until readers expire. */
  659         if ((zone->uz_flags & UMA_ZONE_SMR) != 0 &&
  660             bucket->ub_seq != SMR_SEQ_INVALID) {
  661                 if (!smr_poll(zone->uz_smr, bucket->ub_seq, false))
  662                         return (NULL);
  663                 bucket->ub_seq = SMR_SEQ_INVALID;
  664                 dtor = (zone->uz_dtor != NULL) || UMA_ALWAYS_CTORDTOR;
  665                 if (STAILQ_NEXT(bucket, ub_link) != NULL)
  666                         zdom->uzd_seq = STAILQ_NEXT(bucket, ub_link)->ub_seq;
  667         }
  668         STAILQ_REMOVE_HEAD(&zdom->uzd_buckets, ub_link);
  669 
  670         KASSERT(zdom->uzd_nitems >= bucket->ub_cnt,
  671             ("%s: item count underflow (%ld, %d)",
  672             __func__, zdom->uzd_nitems, bucket->ub_cnt));
  673         KASSERT(bucket->ub_cnt > 0,
  674             ("%s: empty bucket in bucket cache", __func__));
  675         zdom->uzd_nitems -= bucket->ub_cnt;
  676 
  677         /*
  678          * Shift the bounds of the current WSS interval to avoid
  679          * perturbing the estimate.
  680          */
  681         if (reclaim) {
  682                 zdom->uzd_imin -= lmin(zdom->uzd_imin, bucket->ub_cnt);
  683                 zone_domain_imax_sub(zdom, bucket->ub_cnt);
  684         } else if (zdom->uzd_imin > zdom->uzd_nitems)
  685                 zdom->uzd_imin = zdom->uzd_nitems;
  686 
  687         ZDOM_UNLOCK(zdom);
  688         if (dtor)
  689                 for (i = 0; i < bucket->ub_cnt; i++)
  690                         item_dtor(zone, bucket->ub_bucket[i], zone->uz_size,
  691                             NULL, SKIP_NONE);
  692 
  693         return (bucket);
  694 }
  695 
  696 /*
  697  * Insert a full bucket into the specified cache.  The "ws" parameter indicates
  698  * whether the bucket's contents should be counted as part of the zone's working
  699  * set.  The bucket may be freed if it exceeds the bucket limit.
  700  */
  701 static void
  702 zone_put_bucket(uma_zone_t zone, int domain, uma_bucket_t bucket, void *udata,
  703     const bool ws)
  704 {
  705         uma_zone_domain_t zdom;
  706 
  707         /* We don't cache empty buckets.  This can happen after a reclaim. */
  708         if (bucket->ub_cnt == 0)
  709                 goto out;
  710         zdom = zone_domain_lock(zone, domain);
  711 
  712         /*
  713          * Conditionally set the maximum number of items.
  714          */
  715         zdom->uzd_nitems += bucket->ub_cnt;
  716         if (__predict_true(zdom->uzd_nitems < zone->uz_bucket_max)) {
  717                 if (ws)
  718                         zone_domain_imax_set(zdom, zdom->uzd_nitems);
  719                 if (STAILQ_EMPTY(&zdom->uzd_buckets))
  720                         zdom->uzd_seq = bucket->ub_seq;
  721 
  722                 /*
  723                  * Try to promote reuse of recently used items.  For items
  724                  * protected by SMR, try to defer reuse to minimize polling.
  725                  */
  726                 if (bucket->ub_seq == SMR_SEQ_INVALID)
  727                         STAILQ_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
  728                 else
  729                         STAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link);
  730                 ZDOM_UNLOCK(zdom);
  731                 return;
  732         }
  733         zdom->uzd_nitems -= bucket->ub_cnt;
  734         ZDOM_UNLOCK(zdom);
  735 out:
  736         bucket_free(zone, bucket, udata);
  737 }
  738 
  739 /* Pops an item out of a per-cpu cache bucket. */
  740 static inline void *
  741 cache_bucket_pop(uma_cache_t cache, uma_cache_bucket_t bucket)
  742 {
  743         void *item;
  744 
  745         CRITICAL_ASSERT(curthread);
  746 
  747         bucket->ucb_cnt--;
  748         item = bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt];
  749 #ifdef INVARIANTS
  750         bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] = NULL;
  751         KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled."));
  752 #endif
  753         cache->uc_allocs++;
  754 
  755         return (item);
  756 }
  757 
  758 /* Pushes an item into a per-cpu cache bucket. */
  759 static inline void
  760 cache_bucket_push(uma_cache_t cache, uma_cache_bucket_t bucket, void *item)
  761 {
  762 
  763         CRITICAL_ASSERT(curthread);
  764         KASSERT(bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] == NULL,
  765             ("uma_zfree: Freeing to non free bucket index."));
  766 
  767         bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] = item;
  768         bucket->ucb_cnt++;
  769         cache->uc_frees++;
  770 }
  771 
  772 /*
  773  * Unload a UMA bucket from a per-cpu cache.
  774  */
  775 static inline uma_bucket_t
  776 cache_bucket_unload(uma_cache_bucket_t bucket)
  777 {
  778         uma_bucket_t b;
  779 
  780         b = bucket->ucb_bucket;
  781         if (b != NULL) {
  782                 MPASS(b->ub_entries == bucket->ucb_entries);
  783                 b->ub_cnt = bucket->ucb_cnt;
  784                 bucket->ucb_bucket = NULL;
  785                 bucket->ucb_entries = bucket->ucb_cnt = 0;
  786         }
  787 
  788         return (b);
  789 }
  790 
  791 static inline uma_bucket_t
  792 cache_bucket_unload_alloc(uma_cache_t cache)
  793 {
  794 
  795         return (cache_bucket_unload(&cache->uc_allocbucket));
  796 }
  797 
  798 static inline uma_bucket_t
  799 cache_bucket_unload_free(uma_cache_t cache)
  800 {
  801 
  802         return (cache_bucket_unload(&cache->uc_freebucket));
  803 }
  804 
  805 static inline uma_bucket_t
  806 cache_bucket_unload_cross(uma_cache_t cache)
  807 {
  808 
  809         return (cache_bucket_unload(&cache->uc_crossbucket));
  810 }
  811 
  812 /*
  813  * Load a bucket into a per-cpu cache bucket.
  814  */
  815 static inline void
  816 cache_bucket_load(uma_cache_bucket_t bucket, uma_bucket_t b)
  817 {
  818 
  819         CRITICAL_ASSERT(curthread);
  820         MPASS(bucket->ucb_bucket == NULL);
  821         MPASS(b->ub_seq == SMR_SEQ_INVALID);
  822 
  823         bucket->ucb_bucket = b;
  824         bucket->ucb_cnt = b->ub_cnt;
  825         bucket->ucb_entries = b->ub_entries;
  826 }
  827 
  828 static inline void
  829 cache_bucket_load_alloc(uma_cache_t cache, uma_bucket_t b)
  830 {
  831 
  832         cache_bucket_load(&cache->uc_allocbucket, b);
  833 }
  834 
  835 static inline void
  836 cache_bucket_load_free(uma_cache_t cache, uma_bucket_t b)
  837 {
  838 
  839         cache_bucket_load(&cache->uc_freebucket, b);
  840 }
  841 
  842 #ifdef NUMA
  843 static inline void 
  844 cache_bucket_load_cross(uma_cache_t cache, uma_bucket_t b)
  845 {
  846 
  847         cache_bucket_load(&cache->uc_crossbucket, b);
  848 }
  849 #endif
  850 
  851 /*
  852  * Copy and preserve ucb_spare.
  853  */
  854 static inline void
  855 cache_bucket_copy(uma_cache_bucket_t b1, uma_cache_bucket_t b2)
  856 {
  857 
  858         b1->ucb_bucket = b2->ucb_bucket;
  859         b1->ucb_entries = b2->ucb_entries;
  860         b1->ucb_cnt = b2->ucb_cnt;
  861 }
  862 
  863 /*
  864  * Swap two cache buckets.
  865  */
  866 static inline void
  867 cache_bucket_swap(uma_cache_bucket_t b1, uma_cache_bucket_t b2)
  868 {
  869         struct uma_cache_bucket b3;
  870 
  871         CRITICAL_ASSERT(curthread);
  872 
  873         cache_bucket_copy(&b3, b1);
  874         cache_bucket_copy(b1, b2);
  875         cache_bucket_copy(b2, &b3);
  876 }
  877 
  878 /*
  879  * Attempt to fetch a bucket from a zone on behalf of the current cpu cache.
  880  */
  881 static uma_bucket_t
  882 cache_fetch_bucket(uma_zone_t zone, uma_cache_t cache, int domain)
  883 {
  884         uma_zone_domain_t zdom;
  885         uma_bucket_t bucket;
  886 
  887         /*
  888          * Avoid the lock if possible.
  889          */
  890         zdom = ZDOM_GET(zone, domain);
  891         if (zdom->uzd_nitems == 0)
  892                 return (NULL);
  893 
  894         if ((cache_uz_flags(cache) & UMA_ZONE_SMR) != 0 &&
  895             !smr_poll(zone->uz_smr, zdom->uzd_seq, false))
  896                 return (NULL);
  897 
  898         /*
  899          * Check the zone's cache of buckets.
  900          */
  901         zdom = zone_domain_lock(zone, domain);
  902         if ((bucket = zone_fetch_bucket(zone, zdom, false)) != NULL)
  903                 return (bucket);
  904         ZDOM_UNLOCK(zdom);
  905 
  906         return (NULL);
  907 }
  908 
  909 static void
  910 zone_log_warning(uma_zone_t zone)
  911 {
  912         static const struct timeval warninterval = { 300, 0 };
  913 
  914         if (!zone_warnings || zone->uz_warning == NULL)
  915                 return;
  916 
  917         if (ratecheck(&zone->uz_ratecheck, &warninterval))
  918                 printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
  919 }
  920 
  921 static inline void
  922 zone_maxaction(uma_zone_t zone)
  923 {
  924 
  925         if (zone->uz_maxaction.ta_func != NULL)
  926                 taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction);
  927 }
  928 
  929 /*
   930  * Routine called by the callout to fire off some time-interval based
   931  * calculations (hash sizing, working set estimates, etc.).
  932  *
  933  * Arguments:
  934  *      arg   Unused
  935  *
  936  * Returns:
  937  *      Nothing
  938  */
  939 static void
  940 uma_timeout(void *unused)
  941 {
  942         bucket_enable();
  943         zone_foreach(zone_timeout, NULL);
  944 
  945         /* Reschedule this event */
  946         callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
  947 }
  948 
  949 /*
  950  * Update the working set size estimate for the zone's bucket cache.
   951  * The constants chosen here are somewhat arbitrary.  With an update period
   952  * of 20s (UMA_TIMEOUT), each new sample enters the average with weight 4/5,
   953  * so zone activity more than five periods (100s) old contributes under 0.1%.
  954  */
  955 static void
  956 zone_domain_update_wss(uma_zone_domain_t zdom)
  957 {
  958         long wss;
  959 
  960         ZDOM_LOCK(zdom);
  961         MPASS(zdom->uzd_imax >= zdom->uzd_imin);
  962         wss = zdom->uzd_imax - zdom->uzd_imin;
  963         zdom->uzd_imax = zdom->uzd_imin = zdom->uzd_nitems;
  964         zdom->uzd_wss = (4 * wss + zdom->uzd_wss) / 5;
  965         ZDOM_UNLOCK(zdom);
  966 }
  967 
  968 /*
   969  * Routine to perform timeout driven calculations.  This expands the keg
   970  * hash tables and updates each domain's working set size estimate.
  971  *
  972  *  Returns nothing.
  973  */
  974 static void
  975 zone_timeout(uma_zone_t zone, void *unused)
  976 {
  977         uma_keg_t keg;
  978         u_int slabs, pages;
  979 
  980         if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0)
  981                 goto update_wss;
  982 
  983         keg = zone->uz_keg;
  984 
  985         /*
  986          * Hash zones are non-numa by definition so the first domain
  987          * is the only one present.
  988          */
  989         KEG_LOCK(keg, 0);
  990         pages = keg->uk_domain[0].ud_pages;
  991 
  992         /*
  993          * Expand the keg hash table.
  994          *
  995          * This is done if the number of slabs is larger than the hash size.
   996  * What I'm trying to do here is avoid collisions entirely.  This may be
   997  * a little aggressive.  Should I allow for two collisions max?
  998          */
  999         if ((slabs = pages / keg->uk_ppera) > keg->uk_hash.uh_hashsize) {
 1000                 struct uma_hash newhash;
 1001                 struct uma_hash oldhash;
 1002                 int ret;
 1003 
 1004                 /*
 1005                  * This is so involved because allocating and freeing
 1006                  * while the keg lock is held will lead to deadlock.
 1007                  * I have to do everything in stages and check for
 1008                  * races.
 1009                  */
 1010                 KEG_UNLOCK(keg, 0);
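                       /*
                        * 1 << fls(slabs) is the smallest power of two strictly
                        * greater than the slab count, so each expansion at
                        * least doubles the current hash size.
                        */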
 1011                 ret = hash_alloc(&newhash, 1 << fls(slabs));
 1012                 KEG_LOCK(keg, 0);
 1013                 if (ret) {
 1014                         if (hash_expand(&keg->uk_hash, &newhash)) {
 1015                                 oldhash = keg->uk_hash;
 1016                                 keg->uk_hash = newhash;
 1017                         } else
 1018                                 oldhash = newhash;
 1019 
 1020                         KEG_UNLOCK(keg, 0);
 1021                         hash_free(&oldhash);
 1022                         goto update_wss;
 1023                 }
 1024         }
 1025         KEG_UNLOCK(keg, 0);
 1026 
 1027 update_wss:
 1028         for (int i = 0; i < vm_ndomains; i++)
 1029                 zone_domain_update_wss(ZDOM_GET(zone, i));
 1030 }
 1031 
 1032 /*
 1033  * Allocate and zero fill the next sized hash table from the appropriate
 1034  * backing store.
 1035  *
 1036  * Arguments:
  1037  *      hash  An uninitialized hash structure to set up with "size" buckets
 1038  *
 1039  * Returns:
 1040  *      1 on success and 0 on failure.
 1041  */
 1042 static int
 1043 hash_alloc(struct uma_hash *hash, u_int size)
 1044 {
 1045         size_t alloc;
 1046 
 1047         KASSERT(powerof2(size), ("hash size must be power of 2"));
 1048         if (size > UMA_HASH_SIZE_INIT)  {
 1049                 hash->uh_hashsize = size;
 1050                 alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
 1051                 hash->uh_slab_hash = malloc(alloc, M_UMAHASH, M_NOWAIT);
 1052         } else {
 1053                 alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
 1054                 hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
 1055                     UMA_ANYDOMAIN, M_WAITOK);
 1056                 hash->uh_hashsize = UMA_HASH_SIZE_INIT;
 1057         }
 1058         if (hash->uh_slab_hash) {
 1059                 bzero(hash->uh_slab_hash, alloc);
 1060                 hash->uh_hashmask = hash->uh_hashsize - 1;
 1061                 return (1);
 1062         }
 1063 
 1064         return (0);
 1065 }
 1066 
 1067 /*
 1068  * Expands the hash table for HASH zones.  This is done from zone_timeout
 1069  * to reduce collisions.  This must not be done in the regular allocation
  1070  * path; otherwise we can recurse on the VM while allocating pages.
 1071  *
 1072  * Arguments:
 1073  *      oldhash  The hash you want to expand
 1074  *      newhash  The hash structure for the new table
 1075  *
 1076  * Returns:
  1077  *      1 if the entries were rehashed into the new table, 0 if the
  1078  *      new table was not allocated or is not larger than the old
  1079  *      one.
  1080  */
 1081 static int
 1082 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
 1083 {
 1084         uma_hash_slab_t slab;
 1085         u_int hval;
 1086         u_int idx;
 1087 
 1088         if (!newhash->uh_slab_hash)
 1089                 return (0);
 1090 
 1091         if (oldhash->uh_hashsize >= newhash->uh_hashsize)
 1092                 return (0);
 1093 
 1094         /*
 1095          * I need to investigate hash algorithms for resizing without a
 1096          * full rehash.
 1097          */
 1098 
 1099         for (idx = 0; idx < oldhash->uh_hashsize; idx++)
 1100                 while (!LIST_EMPTY(&oldhash->uh_slab_hash[idx])) {
 1101                         slab = LIST_FIRST(&oldhash->uh_slab_hash[idx]);
 1102                         LIST_REMOVE(slab, uhs_hlink);
 1103                         hval = UMA_HASH(newhash, slab->uhs_data);
 1104                         LIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
 1105                             slab, uhs_hlink);
 1106                 }
 1107 
 1108         return (1);
 1109 }
 1110 
 1111 /*
 1112  * Free the hash bucket to the appropriate backing store.
 1113  *
 1114  * Arguments:
  1115  *      hash  The hash table whose bucket array is being returned to
  1116  *            its backing store
 1117  *
 1118  * Returns:
 1119  *      Nothing
 1120  */
 1121 static void
 1122 hash_free(struct uma_hash *hash)
 1123 {
 1124         if (hash->uh_slab_hash == NULL)
 1125                 return;
 1126         if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
 1127                 zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE);
 1128         else
 1129                 free(hash->uh_slab_hash, M_UMAHASH);
 1130 }
 1131 
 1132 /*
 1133  * Frees all outstanding items in a bucket
 1134  *
 1135  * Arguments:
 1136  *      zone   The zone to free to, must be unlocked.
 1137  *      bucket The free/alloc bucket with items.
 1138  *
 1139  * Returns:
 1140  *      Nothing
 1141  */
 1142 static void
 1143 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
 1144 {
 1145         int i;
 1146 
 1147         if (bucket->ub_cnt == 0)
 1148                 return;
 1149 
 1150         if ((zone->uz_flags & UMA_ZONE_SMR) != 0 &&
 1151             bucket->ub_seq != SMR_SEQ_INVALID) {
 1152                 smr_wait(zone->uz_smr, bucket->ub_seq);
 1153                 bucket->ub_seq = SMR_SEQ_INVALID;
 1154                 for (i = 0; i < bucket->ub_cnt; i++)
 1155                         item_dtor(zone, bucket->ub_bucket[i],
 1156                             zone->uz_size, NULL, SKIP_NONE);
 1157         }
 1158         if (zone->uz_fini)
 1159                 for (i = 0; i < bucket->ub_cnt; i++) 
 1160                         zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
 1161         zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
 1162         if (zone->uz_max_items > 0)
 1163                 zone_free_limit(zone, bucket->ub_cnt);
 1164 #ifdef INVARIANTS
 1165         bzero(bucket->ub_bucket, sizeof(void *) * bucket->ub_cnt);
 1166 #endif
 1167         bucket->ub_cnt = 0;
 1168 }
 1169 
 1170 /*
 1171  * Drains the per cpu caches for a zone.
 1172  *
 1173  * NOTE: This may only be called while the zone is being torn down, and not
 1174  * during normal operation.  This is necessary in order that we do not have
 1175  * to migrate CPUs to drain the per-CPU caches.
 1176  *
 1177  * Arguments:
 1178  *      zone     The zone to drain, must be unlocked.
 1179  *
 1180  * Returns:
 1181  *      Nothing
 1182  */
 1183 static void
 1184 cache_drain(uma_zone_t zone)
 1185 {
 1186         uma_cache_t cache;
 1187         uma_bucket_t bucket;
 1188         smr_seq_t seq;
 1189         int cpu;
 1190 
 1191         /*
 1192          * XXX: It is safe to not lock the per-CPU caches, because we're
 1193          * tearing down the zone anyway.  I.e., there will be no further use
 1194          * of the caches at this point.
 1195          *
  1196  * XXX: It would be good to be able to assert that the zone is being
 1197          * torn down to prevent improper use of cache_drain().
 1198          */
 1199         seq = SMR_SEQ_INVALID;
 1200         if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
 1201                 seq = smr_advance(zone->uz_smr);
 1202         CPU_FOREACH(cpu) {
 1203                 cache = &zone->uz_cpu[cpu];
 1204                 bucket = cache_bucket_unload_alloc(cache);
 1205                 if (bucket != NULL)
 1206                         bucket_free(zone, bucket, NULL);
 1207                 bucket = cache_bucket_unload_free(cache);
 1208                 if (bucket != NULL) {
 1209                         bucket->ub_seq = seq;
 1210                         bucket_free(zone, bucket, NULL);
 1211                 }
 1212                 bucket = cache_bucket_unload_cross(cache);
 1213                 if (bucket != NULL) {
 1214                         bucket->ub_seq = seq;
 1215                         bucket_free(zone, bucket, NULL);
 1216                 }
 1217         }
 1218         bucket_cache_reclaim(zone, true);
 1219 }
 1220 
 1221 static void
 1222 cache_shrink(uma_zone_t zone, void *unused)
 1223 {
 1224 
 1225         if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
 1226                 return;
 1227 
 1228         zone->uz_bucket_size =
 1229             (zone->uz_bucket_size_min + zone->uz_bucket_size) / 2;
 1230 }
 1231 
 1232 static void
 1233 cache_drain_safe_cpu(uma_zone_t zone, void *unused)
 1234 {
 1235         uma_cache_t cache;
 1236         uma_bucket_t b1, b2, b3;
 1237         int domain;
 1238 
 1239         if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
 1240                 return;
 1241 
 1242         b1 = b2 = b3 = NULL;
 1243         critical_enter();
 1244         cache = &zone->uz_cpu[curcpu];
 1245         domain = PCPU_GET(domain);
 1246         b1 = cache_bucket_unload_alloc(cache);
 1247 
 1248         /*
 1249          * Don't flush SMR zone buckets.  This leaves the zone without a
 1250          * bucket and forces every free to synchronize().
 1251          */
 1252         if ((zone->uz_flags & UMA_ZONE_SMR) == 0) {
 1253                 b2 = cache_bucket_unload_free(cache);
 1254                 b3 = cache_bucket_unload_cross(cache);
 1255         }
 1256         critical_exit();
 1257 
 1258         if (b1 != NULL)
 1259                 zone_free_bucket(zone, b1, NULL, domain, false);
 1260         if (b2 != NULL)
 1261                 zone_free_bucket(zone, b2, NULL, domain, false);
 1262         if (b3 != NULL) {
 1263                 /* Adjust the domain so it goes to zone_free_cross. */
 1264                 domain = (domain + 1) % vm_ndomains;
 1265                 zone_free_bucket(zone, b3, NULL, domain, false);
 1266         }
 1267 }
 1268 
 1269 /*
 1270  * Safely drain per-CPU caches of a zone(s) to alloc bucket.
 1271  * This is an expensive call because it needs to bind to all CPUs
 1272  * one by one and enter a critical section on each of them in order
 1273  * to safely access their cache buckets.
  1274  * The zone lock must not be held when calling this function.
 1275  */
 1276 static void
 1277 pcpu_cache_drain_safe(uma_zone_t zone)
 1278 {
 1279         int cpu;
 1280 
 1281         /*
  1282  * Polite bucket size shrinking was not enough; shrink aggressively.
 1283          */
 1284         if (zone)
 1285                 cache_shrink(zone, NULL);
 1286         else
 1287                 zone_foreach(cache_shrink, NULL);
 1288 
 1289         CPU_FOREACH(cpu) {
 1290                 thread_lock(curthread);
 1291                 sched_bind(curthread, cpu);
 1292                 thread_unlock(curthread);
 1293 
 1294                 if (zone)
 1295                         cache_drain_safe_cpu(zone, NULL);
 1296                 else
 1297                         zone_foreach(cache_drain_safe_cpu, NULL);
 1298         }
 1299         thread_lock(curthread);
 1300         sched_unbind(curthread);
 1301         thread_unlock(curthread);
 1302 }
 1303 
 1304 /*
 1305  * Reclaim cached buckets from a zone.  All buckets are reclaimed if the caller
  1306  * requested a drain, otherwise the per-domain caches are trimmed to their
  1307  * estimated working set size.
 1308  */
 1309 static void
 1310 bucket_cache_reclaim(uma_zone_t zone, bool drain)
 1311 {
 1312         uma_zone_domain_t zdom;
 1313         uma_bucket_t bucket;
 1314         long target;
 1315         int i;
 1316 
 1317         /*
 1318          * Shrink the zone bucket size to ensure that the per-CPU caches
 1319          * don't grow too large.
 1320          */
 1321         if (zone->uz_bucket_size > zone->uz_bucket_size_min)
 1322                 zone->uz_bucket_size--;
 1323 
 1324         for (i = 0; i < vm_ndomains; i++) {
 1325                 /*
 1326                  * The cross bucket is partially filled and not part of
 1327                  * the item count.  Reclaim it individually here.
 1328                  */
 1329                 zdom = ZDOM_GET(zone, i);
 1330                 if ((zone->uz_flags & UMA_ZONE_SMR) == 0 || drain) {
 1331                         ZONE_CROSS_LOCK(zone);
 1332                         bucket = zdom->uzd_cross;
 1333                         zdom->uzd_cross = NULL;
 1334                         ZONE_CROSS_UNLOCK(zone);
 1335                         if (bucket != NULL)
 1336                                 bucket_free(zone, bucket, NULL);
 1337                 }
 1338 
 1339                 /*
 1340                  * If we were asked to drain the zone, we are done only once
 1341                  * this bucket cache is empty.  Otherwise, we reclaim items in
 1342                  * excess of the zone's estimated working set size.  If the
 1343                  * difference nitems - imin is larger than the WSS estimate,
 1344                  * then the estimate will grow at the end of this interval and
 1345                  * we ignore the historical average.
 1346                  */
 1347                 ZDOM_LOCK(zdom);
 1348                 target = drain ? 0 : lmax(zdom->uzd_wss, zdom->uzd_nitems -
 1349                     zdom->uzd_imin);
 1350                 while (zdom->uzd_nitems > target) {
 1351                         bucket = zone_fetch_bucket(zone, zdom, true);
 1352                         if (bucket == NULL)
 1353                                 break;
 1354                         bucket_free(zone, bucket, NULL);
 1355                         ZDOM_LOCK(zdom);
 1356                 }
 1357                 ZDOM_UNLOCK(zdom);
 1358         }
 1359 }
 1360 
 1361 static void
 1362 keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
 1363 {
 1364         uint8_t *mem;
 1365         int i;
 1366         uint8_t flags;
 1367 
 1368         CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes",
 1369             keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera);
 1370 
 1371         mem = slab_data(slab, keg);
 1372         flags = slab->us_flags;
 1373         i = start;
 1374         if (keg->uk_fini != NULL) {
 1375                 for (i--; i > -1; i--)
 1376 #ifdef INVARIANTS
 1377                 /*
 1378                  * trash_fini implies that dtor was trash_dtor. trash_fini
 1379                  * would check that memory hasn't been modified since free,
 1380                  * which executed trash_dtor.
  1381                  * That's why we need to run the uma_dbg_kskip() check here,
  1382                  * even though we do not make this skip check for other
  1383                  * init/fini invocations.
 1384                  */
 1385                 if (!uma_dbg_kskip(keg, slab_item(slab, keg, i)) ||
 1386                     keg->uk_fini != trash_fini)
 1387 #endif
 1388                         keg->uk_fini(slab_item(slab, keg, i), keg->uk_size);
 1389         }
 1390         if (keg->uk_flags & UMA_ZFLAG_OFFPAGE)
 1391                 zone_free_item(slabzone(keg->uk_ipers), slab_tohashslab(slab),
 1392                     NULL, SKIP_NONE);
 1393         keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags);
 1394         uma_total_dec(PAGE_SIZE * keg->uk_ppera);
 1395 }
 1396 
 1397 static void
 1398 keg_drain_domain(uma_keg_t keg, int domain)
 1399 {
 1400         struct slabhead freeslabs;
 1401         uma_domain_t dom;
 1402         uma_slab_t slab, tmp;
 1403         uint32_t i, stofree, stokeep, partial;
 1404 
 1405         dom = &keg->uk_domain[domain];
 1406         LIST_INIT(&freeslabs);
 1407 
 1408         CTR4(KTR_UMA, "keg_drain %s(%p) domain %d free items: %u",
 1409             keg->uk_name, keg, domain, dom->ud_free_items);
 1410 
 1411         KEG_LOCK(keg, domain);
 1412 
 1413         /*
 1414          * Are the free items in partially allocated slabs sufficient to meet
 1415          * the reserve? If not, compute the number of fully free slabs that must
 1416          * be kept.
 1417          */
 1418         partial = dom->ud_free_items - dom->ud_free_slabs * keg->uk_ipers;
 1419         if (partial < keg->uk_reserve) {
 1420                 stokeep = min(dom->ud_free_slabs,
 1421                     howmany(keg->uk_reserve - partial, keg->uk_ipers));
 1422         } else {
 1423                 stokeep = 0;
 1424         }
 1425         stofree = dom->ud_free_slabs - stokeep;
 1426 
 1427         /*
 1428          * Partition the free slabs into two sets: those that must be kept in
 1429          * order to maintain the reserve, and those that may be released back to
 1430          * the system.  Since one set may be much larger than the other,
 1431          * populate the smaller of the two sets and swap them if necessary.
 1432          */
 1433         for (i = min(stofree, stokeep); i > 0; i--) {
 1434                 slab = LIST_FIRST(&dom->ud_free_slab);
 1435                 LIST_REMOVE(slab, us_link);
 1436                 LIST_INSERT_HEAD(&freeslabs, slab, us_link);
 1437         }
 1438         if (stofree > stokeep)
 1439                 LIST_SWAP(&freeslabs, &dom->ud_free_slab, uma_slab, us_link);
 1440 
 1441         if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0) {
 1442                 LIST_FOREACH(slab, &freeslabs, us_link)
 1443                         UMA_HASH_REMOVE(&keg->uk_hash, slab);
 1444         }
 1445         dom->ud_free_items -= stofree * keg->uk_ipers;
 1446         dom->ud_free_slabs -= stofree;
 1447         dom->ud_pages -= stofree * keg->uk_ppera;
 1448         KEG_UNLOCK(keg, domain);
 1449 
 1450         LIST_FOREACH_SAFE(slab, &freeslabs, us_link, tmp)
 1451                 keg_free_slab(keg, slab, keg->uk_ipers);
 1452 }
 1453 
 1454 /*
 1455  * Frees pages from a keg back to the system.  This is done on demand from
 1456  * the pageout daemon.
 1457  *
 1458  * Returns nothing.
 1459  */
 1460 static void
 1461 keg_drain(uma_keg_t keg)
 1462 {
 1463         int i;
 1464 
 1465         if ((keg->uk_flags & UMA_ZONE_NOFREE) != 0)
 1466                 return;
 1467         for (i = 0; i < vm_ndomains; i++)
 1468                 keg_drain_domain(keg, i);
 1469 }
 1470 
 1471 static void
 1472 zone_reclaim(uma_zone_t zone, int waitok, bool drain)
 1473 {
 1474 
 1475         /*
 1476          * Set draining to interlock with zone_dtor() so we can release our
 1477          * locks as we go.  Only dtor() should do a WAITOK call since it
 1478          * is the only call that knows the structure will still be available
 1479          * when it wakes up.
 1480          */
 1481         ZONE_LOCK(zone);
 1482         while (zone->uz_flags & UMA_ZFLAG_RECLAIMING) {
 1483                 if (waitok == M_NOWAIT)
 1484                         goto out;
 1485                 msleep(zone, &ZDOM_GET(zone, 0)->uzd_lock, PVM, "zonedrain",
 1486                     1);
 1487         }
 1488         zone->uz_flags |= UMA_ZFLAG_RECLAIMING;
 1489         ZONE_UNLOCK(zone);
 1490         bucket_cache_reclaim(zone, drain);
 1491 
 1492         /*
  1493  * The RECLAIMING flag protects us from being freed while
 1494          * we're running.  Normally the uma_rwlock would protect us but we
 1495          * must be able to release and acquire the right lock for each keg.
 1496          */
 1497         if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0)
 1498                 keg_drain(zone->uz_keg);
 1499         ZONE_LOCK(zone);
 1500         zone->uz_flags &= ~UMA_ZFLAG_RECLAIMING;
 1501         wakeup(zone);
 1502 out:
 1503         ZONE_UNLOCK(zone);
 1504 }
 1505 
 1506 static void
 1507 zone_drain(uma_zone_t zone, void *unused)
 1508 {
 1509 
 1510         zone_reclaim(zone, M_NOWAIT, true);
 1511 }
 1512 
 1513 static void
 1514 zone_trim(uma_zone_t zone, void *unused)
 1515 {
 1516 
 1517         zone_reclaim(zone, M_NOWAIT, false);
 1518 }
 1519 
 1520 /*
 1521  * Allocate a new slab for a keg and insert it into the partial slab list.
 1522  * The keg should be unlocked on entry.  If the allocation succeeds it will
 1523  * be locked on return.
 1524  *
 1525  * Arguments:
 1526  *      flags   Wait flags for the item initialization routine
 1527  *      aflags  Wait flags for the slab allocation
 1528  *
 1529  * Returns:
 1530  *      The slab that was allocated or NULL if there is no memory and the
 1531  *      caller specified M_NOWAIT.
 1532  */
 1533 static uma_slab_t
 1534 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int flags,
 1535     int aflags)
 1536 {
 1537         uma_domain_t dom;
 1538         uma_alloc allocf;
 1539         uma_slab_t slab;
 1540         unsigned long size;
 1541         uint8_t *mem;
 1542         uint8_t sflags;
 1543         int i;
 1544 
 1545         KASSERT(domain >= 0 && domain < vm_ndomains,
 1546             ("keg_alloc_slab: domain %d out of range", domain));
 1547 
 1548         allocf = keg->uk_allocf;
 1549         slab = NULL;
 1550         mem = NULL;
 1551         if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) {
 1552                 uma_hash_slab_t hslab;
 1553                 hslab = zone_alloc_item(slabzone(keg->uk_ipers), NULL,
 1554                     domain, aflags);
 1555                 if (hslab == NULL)
 1556                         goto fail;
 1557                 slab = &hslab->uhs_slab;
 1558         }
 1559 
 1560         /*
 1561          * This reproduces the old vm_zone behavior of zero filling pages the
 1562          * first time they are added to a zone.
 1563          *
 1564          * Malloced items are zeroed in uma_zalloc.
 1565          */
 1566 
 1567         if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
 1568                 aflags |= M_ZERO;
 1569         else
 1570                 aflags &= ~M_ZERO;
 1571 
 1572         if (keg->uk_flags & UMA_ZONE_NODUMP)
 1573                 aflags |= M_NODUMP;
 1574 
 1575         /* zone is passed for legacy reasons. */
 1576         size = keg->uk_ppera * PAGE_SIZE;
 1577         mem = allocf(zone, size, domain, &sflags, aflags);
 1578         if (mem == NULL) {
 1579                 if (keg->uk_flags & UMA_ZFLAG_OFFPAGE)
 1580                         zone_free_item(slabzone(keg->uk_ipers),
 1581                             slab_tohashslab(slab), NULL, SKIP_NONE);
 1582                 goto fail;
 1583         }
 1584         uma_total_inc(size);
 1585 
 1586         /* For HASH zones all pages go to the same uma_domain. */
 1587         if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0)
 1588                 domain = 0;
 1589 
 1590         /* Point the slab into the allocated memory */
 1591         if (!(keg->uk_flags & UMA_ZFLAG_OFFPAGE))
 1592                 slab = (uma_slab_t )(mem + keg->uk_pgoff);
 1593         else
 1594                 slab_tohashslab(slab)->uhs_data = mem;
 1595 
 1596         if (keg->uk_flags & UMA_ZFLAG_VTOSLAB)
 1597                 for (i = 0; i < keg->uk_ppera; i++)
 1598                         vsetzoneslab((vm_offset_t)mem + (i * PAGE_SIZE),
 1599                             zone, slab);
 1600 
 1601         slab->us_freecount = keg->uk_ipers;
 1602         slab->us_flags = sflags;
 1603         slab->us_domain = domain;
 1604 
 1605         BIT_FILL(keg->uk_ipers, &slab->us_free);
 1606 #ifdef INVARIANTS
 1607         BIT_ZERO(keg->uk_ipers, slab_dbg_bits(slab, keg));
 1608 #endif
 1609 
 1610         if (keg->uk_init != NULL) {
 1611                 for (i = 0; i < keg->uk_ipers; i++)
 1612                         if (keg->uk_init(slab_item(slab, keg, i),
 1613                             keg->uk_size, flags) != 0)
 1614                                 break;
 1615                 if (i != keg->uk_ipers) {
 1616                         keg_free_slab(keg, slab, i);
 1617                         goto fail;
 1618                 }
 1619         }
 1620         KEG_LOCK(keg, domain);
 1621 
 1622         CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)",
 1623             slab, keg->uk_name, keg);
 1624 
 1625         if (keg->uk_flags & UMA_ZFLAG_HASH)
 1626                 UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
 1627 
 1628         /*
 1629          * If we got a slab here it's safe to mark it partially used
 1630          * and return.  We assume that the caller is going to remove
 1631          * at least one item.
 1632          */
 1633         dom = &keg->uk_domain[domain];
 1634         LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
 1635         dom->ud_pages += keg->uk_ppera;
 1636         dom->ud_free_items += keg->uk_ipers;
 1637 
 1638         return (slab);
 1639 
 1640 fail:
 1641         return (NULL);
 1642 }
 1643 
 1644 /*
 1645  * This function is intended to be used early on in place of page_alloc() so
 1646  * that we may use the boot time page cache to satisfy allocations before
 1647  * the VM is ready.
 1648  */
 1649 static void *
 1650 startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
 1651     int wait)
 1652 {
 1653         vm_paddr_t pa;
 1654         vm_page_t m;
 1655         void *mem;
 1656         int pages;
 1657         int i;
 1658 
 1659         pages = howmany(bytes, PAGE_SIZE);
 1660         KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__));
 1661 
 1662         *pflag = UMA_SLAB_BOOT;
 1663         m = vm_page_alloc_contig_domain(NULL, 0, domain,
 1664             malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED, pages, 
 1665             (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT);
 1666         if (m == NULL)
 1667                 return (NULL);
 1668 
 1669         pa = VM_PAGE_TO_PHYS(m);
 1670         for (i = 0; i < pages; i++, pa += PAGE_SIZE) {
 1671 #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \
 1672     defined(__riscv) || defined(__powerpc64__)
 1673                 if ((wait & M_NODUMP) == 0)
 1674                         dump_add_page(pa);
 1675 #endif
 1676         }
 1677         /* Allocate KVA and indirectly advance bootmem. */
 1678         mem = (void *)pmap_map(&bootmem, m->phys_addr,
 1679             m->phys_addr + (pages * PAGE_SIZE), VM_PROT_READ | VM_PROT_WRITE);
 1680         if ((wait & M_ZERO) != 0)
 1681                 bzero(mem, pages * PAGE_SIZE);
 1682 
 1683         return (mem);
 1684 }
 1685 
 1686 static void
 1687 startup_free(void *mem, vm_size_t bytes)
 1688 {
 1689         vm_offset_t va;
 1690         vm_page_t m;
 1691 
 1692         va = (vm_offset_t)mem;
 1693         m = PHYS_TO_VM_PAGE(pmap_kextract(va));
 1694 
 1695         /*
 1696          * startup_alloc() returns direct-mapped slabs on some platforms.  Avoid
 1697          * unmapping ranges of the direct map.
 1698          */
 1699         if (va >= bootstart && va + bytes <= bootmem)
 1700                 pmap_remove(kernel_pmap, va, va + bytes);
 1701         for (; bytes != 0; bytes -= PAGE_SIZE, m++) {
 1702 #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \
 1703     defined(__riscv) || defined(__powerpc64__)
 1704                 dump_drop_page(VM_PAGE_TO_PHYS(m));
 1705 #endif
 1706                 vm_page_unwire_noq(m);
 1707                 vm_page_free(m);
 1708         }
 1709 }
 1710 
 1711 /*
 1712  * Allocates a number of pages from the system
 1713  *
 1714  * Arguments:
 1715  *      bytes  The number of bytes requested
 1716  *      wait  Shall we wait?
 1717  *
 1718  * Returns:
 1719  *      A pointer to the allocated memory or possibly
 1720  *      NULL if M_NOWAIT is set.
 1721  */
 1722 static void *
 1723 page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
 1724     int wait)
 1725 {
 1726         void *p;        /* Returned page */
 1727 
 1728         *pflag = UMA_SLAB_KERNEL;
 1729         p = (void *)kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, wait);
 1730 
 1731         return (p);
 1732 }
 1733 
 1734 static void *
 1735 pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
 1736     int wait)
 1737 {
 1738         struct pglist alloctail;
 1739         vm_offset_t addr, zkva;
 1740         int cpu, flags;
 1741         vm_page_t p, p_next;
 1742 #ifdef NUMA
 1743         struct pcpu *pc;
 1744 #endif
 1745 
 1746         MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE);
 1747 
 1748         TAILQ_INIT(&alloctail);
 1749         flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
 1750             malloc2vm_flags(wait);
 1751         *pflag = UMA_SLAB_KERNEL;
 1752         for (cpu = 0; cpu <= mp_maxid; cpu++) {
 1753                 if (CPU_ABSENT(cpu)) {
 1754                         p = vm_page_alloc(NULL, 0, flags);
 1755                 } else {
 1756 #ifndef NUMA
 1757                         p = vm_page_alloc(NULL, 0, flags);
 1758 #else
 1759                         pc = pcpu_find(cpu);
 1760                         if (__predict_false(VM_DOMAIN_EMPTY(pc->pc_domain)))
 1761                                 p = NULL;
 1762                         else
 1763                                 p = vm_page_alloc_domain(NULL, 0,
 1764                                     pc->pc_domain, flags);
 1765                         if (__predict_false(p == NULL))
 1766                                 p = vm_page_alloc(NULL, 0, flags);
 1767 #endif
 1768                 }
 1769                 if (__predict_false(p == NULL))
 1770                         goto fail;
 1771                 TAILQ_INSERT_TAIL(&alloctail, p, listq);
 1772         }
 1773         if ((addr = kva_alloc(bytes)) == 0)
 1774                 goto fail;
 1775         zkva = addr;
 1776         TAILQ_FOREACH(p, &alloctail, listq) {
 1777                 pmap_qenter(zkva, &p, 1);
 1778                 zkva += PAGE_SIZE;
 1779         }
 1780         return ((void*)addr);
 1781 fail:
 1782         TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
 1783                 vm_page_unwire_noq(p);
 1784                 vm_page_free(p);
 1785         }
 1786         return (NULL);
 1787 }
 1788 
 1789 /*
 1790  * Allocates a number of pages from within an object
 1791  *
 1792  * Arguments:
 1793  *      bytes  The number of bytes requested
 1794  *      wait   Shall we wait?
 1795  *
 1796  * Returns:
 1797  *      A pointer to the allocated memory or possibly
 1798  *      NULL if M_NOWAIT is set.
 1799  */
 1800 static void *
 1801 noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
 1802     int wait)
 1803 {
 1804         TAILQ_HEAD(, vm_page) alloctail;
 1805         u_long npages;
 1806         vm_offset_t retkva, zkva;
 1807         vm_page_t p, p_next;
 1808         uma_keg_t keg;
 1809 
 1810         TAILQ_INIT(&alloctail);
 1811         keg = zone->uz_keg;
 1812 
 1813         npages = howmany(bytes, PAGE_SIZE);
 1814         while (npages > 0) {
 1815                 p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT |
 1816                     VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
 1817                     ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
 1818                     VM_ALLOC_NOWAIT));
 1819                 if (p != NULL) {
 1820                         /*
 1821                          * Since the page does not belong to an object, its
 1822                          * listq is unused.
 1823                          */
 1824                         TAILQ_INSERT_TAIL(&alloctail, p, listq);
 1825                         npages--;
 1826                         continue;
 1827                 }
 1828                 /*
 1829                  * Page allocation failed, free intermediate pages and
 1830                  * exit.
 1831                  */
 1832                 TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
 1833                         vm_page_unwire_noq(p);
 1834                         vm_page_free(p); 
 1835                 }
 1836                 return (NULL);
 1837         }
 1838         *flags = UMA_SLAB_PRIV;
 1839         zkva = keg->uk_kva +
 1840             atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
 1841         retkva = zkva;
 1842         TAILQ_FOREACH(p, &alloctail, listq) {
 1843                 pmap_qenter(zkva, &p, 1);
 1844                 zkva += PAGE_SIZE;
 1845         }
 1846 
 1847         return ((void *)retkva);
 1848 }
 1849 
 1850 /*
 1851  * Allocate physically contiguous pages.
 1852  */
 1853 static void *
 1854 contig_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
 1855     int wait)
 1856 {
 1857 
 1858         *pflag = UMA_SLAB_KERNEL;
 1859         return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain),
 1860             bytes, wait, 0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT));
 1861 }
 1862 
 1863 /*
 1864  * Frees a number of pages to the system
 1865  *
 1866  * Arguments:
 1867  *      mem   A pointer to the memory to be freed
 1868  *      size  The size of the memory being freed
 1869  *      flags The original p->us_flags field
 1870  *
 1871  * Returns:
 1872  *      Nothing
 1873  */
 1874 static void
 1875 page_free(void *mem, vm_size_t size, uint8_t flags)
 1876 {
 1877 
 1878         if ((flags & UMA_SLAB_BOOT) != 0) {
 1879                 startup_free(mem, size);
 1880                 return;
 1881         }
 1882 
 1883         KASSERT((flags & UMA_SLAB_KERNEL) != 0,
 1884             ("UMA: page_free used with invalid flags %x", flags));
 1885 
 1886         kmem_free((vm_offset_t)mem, size);
 1887 }
 1888 
 1889 /*
 1890  * Frees pcpu zone allocations
 1891  *
 1892  * Arguments:
 1893  *      mem   A pointer to the memory to be freed
 1894  *      size  The size of the memory being freed
 1895  *      flags The original p->us_flags field
 1896  *
 1897  * Returns:
 1898  *      Nothing
 1899  */
 1900 static void
 1901 pcpu_page_free(void *mem, vm_size_t size, uint8_t flags)
 1902 {
 1903         vm_offset_t sva, curva;
 1904         vm_paddr_t paddr;
 1905         vm_page_t m;
 1906 
 1907         MPASS(size == (mp_maxid+1)*PAGE_SIZE);
 1908 
 1909         if ((flags & UMA_SLAB_BOOT) != 0) {
 1910                 startup_free(mem, size);
 1911                 return;
 1912         }
 1913 
 1914         sva = (vm_offset_t)mem;
 1915         for (curva = sva; curva < sva + size; curva += PAGE_SIZE) {
 1916                 paddr = pmap_kextract(curva);
 1917                 m = PHYS_TO_VM_PAGE(paddr);
 1918                 vm_page_unwire_noq(m);
 1919                 vm_page_free(m);
 1920         }
 1921         pmap_qremove(sva, size >> PAGE_SHIFT);
 1922         kva_free(sva, size);
 1923 }
 1924 
 1925 /*
 1926  * Zero fill initializer
 1927  *
 1928  * Arguments/Returns follow uma_init specifications
 1929  */
 1930 static int
 1931 zero_init(void *mem, int size, int flags)
 1932 {
 1933         bzero(mem, size);
 1934         return (0);
 1935 }
 1936 
 1937 #ifdef INVARIANTS
 1938 static struct noslabbits *
 1939 slab_dbg_bits(uma_slab_t slab, uma_keg_t keg)
 1940 {
 1941 
 1942         return ((void *)((char *)&slab->us_free + BITSET_SIZE(keg->uk_ipers)));
 1943 }
 1944 #endif
 1945 
 1946 /*
 1947  * Actual size of embedded struct slab (!OFFPAGE).
 1948  */
 1949 static size_t
 1950 slab_sizeof(int nitems)
 1951 {
 1952         size_t s;
 1953 
 1954         s = sizeof(struct uma_slab) + BITSET_SIZE(nitems) * SLAB_BITSETS;
 1955         return (roundup(s, UMA_ALIGN_PTR + 1));
 1956 }
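      /*
       * Illustrative sizing note (a sketch assuming LP64 and the stock
       * bitset(9) layout): for nitems <= 64 each bitset is a single 8-byte
       * word, so the embedded header is sizeof(struct uma_slab) plus one
       * word per bitset (us_free, plus the debug set under INVARIANTS),
       * rounded up to pointer alignment.
       */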
 1957 
 1958 #define UMA_FIXPT_SHIFT 31
 1959 #define UMA_FRAC_FIXPT(n, d)                                            \
 1960         ((uint32_t)(((uint64_t)(n) << UMA_FIXPT_SHIFT) / (d)))
 1961 #define UMA_FIXPT_PCT(f)                                                \
 1962         ((u_int)(((uint64_t)100 * (f)) >> UMA_FIXPT_SHIFT))
 1963 #define UMA_PCT_FIXPT(pct)      UMA_FRAC_FIXPT((pct), 100)
 1964 #define UMA_MIN_EFF     UMA_PCT_FIXPT(100 - UMA_MAX_WASTE)
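      /*
       * The efficiency values below are 31-bit fixed-point fractions.  For
       * example, UMA_FRAC_FIXPT(1, 2) is 1U << 30, and UMA_FIXPT_PCT(1U << 30)
       * converts that back to 50.  UMA_MIN_EFF is the fixed-point form of
       * (100 - UMA_MAX_WASTE) percent; e.g. a UMA_MAX_WASTE of 10 gives a
       * 90% utilization floor.
       */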
 1965 
 1966 /*
 1967  * Compute the number of items that will fit in a slab.  If hdr is true, the
 1968  * item count may be limited to provide space in the slab for an inline slab
 1969  * header.  Otherwise, all slab space will be provided for item storage.
 1970  */
 1971 static u_int
 1972 slab_ipers_hdr(u_int size, u_int rsize, u_int slabsize, bool hdr)
 1973 {
 1974         u_int ipers;
 1975         u_int padpi;
 1976 
 1977         /* The padding between items is not needed after the last item. */
 1978         padpi = rsize - size;
 1979 
 1980         if (hdr) {
 1981                 /*
 1982                  * Start with the maximum item count and remove items until
 1983                  * the slab header fits alongside the allocatable memory.
 1984                  */
 1985                 for (ipers = MIN(SLAB_MAX_SETSIZE,
 1986                     (slabsize + padpi - slab_sizeof(1)) / rsize);
 1987                     ipers > 0 &&
 1988                     ipers * rsize - padpi + slab_sizeof(ipers) > slabsize;
 1989                     ipers--)
 1990                         continue;
 1991         } else {
 1992                 ipers = MIN((slabsize + padpi) / rsize, SLAB_MAX_SETSIZE);
 1993         }
 1994 
 1995         return (ipers);
 1996 }
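      /*
       * Worked example (a sketch assuming 4 KB pages): for a 256-byte item
       * with no alignment padding (size == rsize == 256, padpi == 0) and
       * hdr == true, the search starts at
       * MIN(SLAB_MAX_SETSIZE, (4096 - slab_sizeof(1)) / 256), i.e. 15 for any
       * reasonably small header, and stops there because 15 * 256 = 3840
       * leaves 256 bytes, enough for slab_sizeof(15) on common
       * configurations.  Without a header (hdr == false) the same slab holds
       * 16 items.
       */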
 1997 
 1998 struct keg_layout_result {
 1999         u_int format;
 2000         u_int slabsize;
 2001         u_int ipers;
 2002         u_int eff;
 2003 };
 2004 
 2005 static void
 2006 keg_layout_one(uma_keg_t keg, u_int rsize, u_int slabsize, u_int fmt,
 2007     struct keg_layout_result *kl)
 2008 {
 2009         u_int total;
 2010 
 2011         kl->format = fmt;
 2012         kl->slabsize = slabsize;
 2013 
 2014         /* Handle INTERNAL as inline with an extra page. */
 2015         if ((fmt & UMA_ZFLAG_INTERNAL) != 0) {
 2016                 kl->format &= ~UMA_ZFLAG_INTERNAL;
 2017                 kl->slabsize += PAGE_SIZE;
 2018         }
 2019 
 2020         kl->ipers = slab_ipers_hdr(keg->uk_size, rsize, kl->slabsize,
 2021             (fmt & UMA_ZFLAG_OFFPAGE) == 0);
 2022 
 2023         /* Account for memory used by an offpage slab header. */
 2024         total = kl->slabsize;
 2025         if ((fmt & UMA_ZFLAG_OFFPAGE) != 0)
 2026                 total += slabzone(kl->ipers)->uz_keg->uk_rsize;
 2027 
 2028         kl->eff = UMA_FRAC_FIXPT(kl->ipers * rsize, total);
 2029 }
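      /*
       * Continuing the sketch above: an inline layout with ipers == 15,
       * rsize == 256 and slabsize == 4096 scores
       * eff = UMA_FRAC_FIXPT(15 * 256, 4096), roughly 93% once run through
       * UMA_FIXPT_PCT.  An OFFPAGE layout fits 16 items in the page but also
       * charges the off-page header's rsize to the total, so keg_layout()
       * simply keeps whichever candidate scores the higher eff.
       */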
 2030 
 2031 /*
 2032  * Determine the format of a uma keg.  This determines where the slab header
 2033  * will be placed (inline or offpage) and calculates ipers, rsize, and ppera.
 2034  *
 2035  * Arguments
 2036  *      keg  The zone we should initialize
 2037  *
 2038  * Returns
 2039  *      Nothing
 2040  */
 2041 static void
 2042 keg_layout(uma_keg_t keg)
 2043 {
 2044         struct keg_layout_result kl = {}, kl_tmp;
 2045         u_int fmts[2];
 2046         u_int alignsize;
 2047         u_int nfmt;
 2048         u_int pages;
 2049         u_int rsize;
 2050         u_int slabsize;
 2051         u_int i, j;
 2052 
 2053         KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
 2054             (keg->uk_size <= UMA_PCPU_ALLOC_SIZE &&
 2055              (keg->uk_flags & UMA_ZONE_CACHESPREAD) == 0),
 2056             ("%s: cannot configure for PCPU: keg=%s, size=%u, flags=0x%b",
 2057              __func__, keg->uk_name, keg->uk_size, keg->uk_flags,
 2058              PRINT_UMA_ZFLAGS));
 2059         KASSERT((keg->uk_flags & (UMA_ZFLAG_INTERNAL | UMA_ZONE_VM)) == 0 ||
 2060             (keg->uk_flags & (UMA_ZONE_NOTOUCH | UMA_ZONE_PCPU)) == 0,
 2061             ("%s: incompatible flags 0x%b", __func__, keg->uk_flags,
 2062              PRINT_UMA_ZFLAGS));
 2063 
 2064         alignsize = keg->uk_align + 1;
 2065 
 2066         /*
 2067          * Calculate the size of each allocation (rsize) according to
 2068          * alignment.  If the requested size is smaller than the smallest
 2069          * unit we have allocation bits for, we round it up.
 2070          */
 2071         rsize = MAX(keg->uk_size, UMA_SMALLEST_UNIT);
 2072         rsize = roundup2(rsize, alignsize);
 2073 
 2074         if ((keg->uk_flags & UMA_ZONE_CACHESPREAD) != 0) {
 2075                 /*
 2076                  * We want one item to start on every align boundary in a page.
 2077                  * To do this we will span pages.  We will also extend the item
 2078                  * by the size of align if it is an even multiple of align.
 2079                  * Otherwise, it would fall on the same boundary every time.
 2080                  */
 2081                 if ((rsize & alignsize) == 0)
 2082                         rsize += alignsize;
 2083                 slabsize = rsize * (PAGE_SIZE / alignsize);
 2084                 slabsize = MIN(slabsize, rsize * SLAB_MAX_SETSIZE);
 2085                 slabsize = MIN(slabsize, UMA_CACHESPREAD_MAX_SIZE);
 2086                 slabsize = round_page(slabsize);
 2087         } else {
 2088                 /*
 2089                  * Start with a slab size of as many pages as it takes to
 2090                  * represent a single item.  We will try to fit as many
 2091                  * additional items into the slab as possible.
 2092                  */
 2093                 slabsize = round_page(keg->uk_size);
 2094         }
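              /*
               * Illustrative CACHESPREAD example (assuming 4 KB pages and a
               * 64-byte alignment): a 128-byte item is an even multiple of
               * the alignment, so rsize is bumped to 192.  Successive items
               * then start 192 bytes apart and cycle through every 64-byte
               * boundary in a page, and slabsize starts at
               * 192 * (4096 / 64) = 12288 bytes before the SLAB_MAX_SETSIZE
               * and UMA_CACHESPREAD_MAX_SIZE caps are applied.
               */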
 2095 
 2096         /* Build a list of all of the available formats for this keg. */
 2097         nfmt = 0;
 2098 
 2099         /* Evaluate an inline slab layout. */
 2100         if ((keg->uk_flags & (UMA_ZONE_NOTOUCH | UMA_ZONE_PCPU)) == 0)
 2101                 fmts[nfmt++] = 0;
 2102 
 2103         /* TODO: vm_page-embedded slab. */
 2104 
 2105         /*
 2106          * We can't do OFFPAGE if we're internal or if we've been
 2107          * asked to not go to the VM for buckets.  If we do this we
 2108          * may end up going to the VM for slabs which we do not want
 2109          * to do if we're UMA_ZONE_VM, which clearly forbids it.
 2110          * In those cases, evaluate a pseudo-format called INTERNAL
 2111          * which has an inline slab header and one extra page to
 2112          * guarantee that it fits.
 2113          *
 2114          * Otherwise, see if using an OFFPAGE slab will improve our
 2115          * efficiency.
 2116          */
 2117         if ((keg->uk_flags & (UMA_ZFLAG_INTERNAL | UMA_ZONE_VM)) != 0)
 2118                 fmts[nfmt++] = UMA_ZFLAG_INTERNAL;
 2119         else
 2120                 fmts[nfmt++] = UMA_ZFLAG_OFFPAGE;
 2121 
 2122         /*
 2123          * Choose a slab size and format which satisfy the minimum efficiency.
 2124          * Prefer the smallest slab size that meets the constraints.
 2125          *
 2126          * Start with a minimum slab size, to accommodate CACHESPREAD.  Then,
 2127          * for small items (up to PAGE_SIZE), the iteration increment is one
 2128          * page; and for large items, the increment is one item.
 2129          */
 2130         i = (slabsize + rsize - keg->uk_size) / MAX(PAGE_SIZE, rsize);
 2131         KASSERT(i >= 1, ("keg %s(%p) flags=0x%b slabsize=%u, rsize=%u, i=%u",
 2132             keg->uk_name, keg, keg->uk_flags, PRINT_UMA_ZFLAGS, slabsize,
 2133             rsize, i));
 2134         for ( ; ; i++) {
 2135                 slabsize = (rsize <= PAGE_SIZE) ? ptoa(i) :
 2136                     round_page(rsize * (i - 1) + keg->uk_size);
 2137 
 2138                 for (j = 0; j < nfmt; j++) {
 2139                         /* Only if we have no viable format yet. */
 2140                         if ((fmts[j] & UMA_ZFLAG_INTERNAL) != 0 &&
 2141                             kl.ipers > 0)
 2142                                 continue;
 2143 
 2144                         keg_layout_one(keg, rsize, slabsize, fmts[j], &kl_tmp);
 2145                         if (kl_tmp.eff <= kl.eff)
 2146                                 continue;
 2147 
 2148                         kl = kl_tmp;
 2149 
 2150                         CTR6(KTR_UMA, "keg %s layout: format %#x "
 2151                             "(ipers %u * rsize %u) / slabsize %#x = %u%% eff",
 2152                             keg->uk_name, kl.format, kl.ipers, rsize,
 2153                             kl.slabsize, UMA_FIXPT_PCT(kl.eff));
 2154 
 2155                         /* Stop when we reach the minimum efficiency. */
 2156                         if (kl.eff >= UMA_MIN_EFF)
 2157                                 break;
 2158                 }
 2159 
 2160                 if (kl.eff >= UMA_MIN_EFF || !multipage_slabs ||
 2161                     slabsize >= SLAB_MAX_SETSIZE * rsize ||
 2162                     (keg->uk_flags & (UMA_ZONE_PCPU | UMA_ZONE_CONTIG)) != 0)
 2163                         break;
 2164         }
 2165 
 2166         pages = atop(kl.slabsize);
 2167         if ((keg->uk_flags & UMA_ZONE_PCPU) != 0)
 2168                 pages *= mp_maxid + 1;
 2169 
 2170         keg->uk_rsize = rsize;
 2171         keg->uk_ipers = kl.ipers;
 2172         keg->uk_ppera = pages;
 2173         keg->uk_flags |= kl.format;
 2174 
 2175         /*
 2176          * How do we find the slab header if it is offpage or if not all item
 2177          * start addresses are in the same page?  We could solve the latter
 2178          * case with vaddr alignment, but we don't.
 2179          */
 2180         if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0 ||
 2181             (keg->uk_ipers - 1) * rsize >= PAGE_SIZE) {
 2182                 if ((keg->uk_flags & UMA_ZONE_NOTPAGE) != 0)
 2183                         keg->uk_flags |= UMA_ZFLAG_HASH;
 2184                 else
 2185                         keg->uk_flags |= UMA_ZFLAG_VTOSLAB;
 2186         }
 2187 
 2188         CTR6(KTR_UMA, "%s: keg=%s, flags=%#x, rsize=%u, ipers=%u, ppera=%u",
 2189             __func__, keg->uk_name, keg->uk_flags, rsize, keg->uk_ipers,
 2190             pages);
 2191         KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_MAX_SETSIZE,
 2192             ("%s: keg=%s, flags=0x%b, rsize=%u, ipers=%u, ppera=%u", __func__,
 2193              keg->uk_name, keg->uk_flags, PRINT_UMA_ZFLAGS, rsize,
 2194              keg->uk_ipers, pages));
 2195 }
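      /*
       * End-to-end sketch (illustrative, assuming 4 KB pages and pointer
       * alignment): a 768-byte item gives rsize = 768, and the first pass of
       * the loop above tries a one-page inline slab.  Five items use 3840
       * bytes and leave 256 for the header, for roughly 93% efficiency,
       * which already clears UMA_MIN_EFF, so the keg ends up with
       * uk_ipers = 5 and uk_ppera = 1, and neither OFFPAGE, HASH nor VTOSLAB
       * is needed.
       */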
 2196 
 2197 /*
 2198  * Keg header ctor.  This initializes all fields, locks, etc., and inserts
 2199  * the keg onto the global keg list.
 2200  *
 2201  * Arguments/Returns follow uma_ctor specifications
 2202  *      udata  Actually uma_kctor_args
 2203  */
 2204 static int
 2205 keg_ctor(void *mem, int size, void *udata, int flags)
 2206 {
 2207         struct uma_kctor_args *arg = udata;
 2208         uma_keg_t keg = mem;
 2209         uma_zone_t zone;
 2210         int i;
 2211 
 2212         bzero(keg, size);
 2213         keg->uk_size = arg->size;
 2214         keg->uk_init = arg->uminit;
 2215         keg->uk_fini = arg->fini;
 2216         keg->uk_align = arg->align;
 2217         keg->uk_reserve = 0;
 2218         keg->uk_flags = arg->flags;
 2219 
 2220         /*
 2221          * We use a global round-robin policy by default.  Zones with
 2222          * UMA_ZONE_FIRSTTOUCH set will use first-touch instead, in which
 2223          * case the iterator is never run.
 2224          */
 2225         keg->uk_dr.dr_policy = DOMAINSET_RR();
 2226         keg->uk_dr.dr_iter = 0;
 2227 
 2228         /*
 2229          * The primary zone is passed to us at keg-creation time.
 2230          */
 2231         zone = arg->zone;
 2232         keg->uk_name = zone->uz_name;
 2233 
 2234         if (arg->flags & UMA_ZONE_ZINIT)
 2235                 keg->uk_init = zero_init;
 2236 
 2237         if (arg->flags & UMA_ZONE_MALLOC)
 2238                 keg->uk_flags |= UMA_ZFLAG_VTOSLAB;
 2239 
 2240 #ifndef SMP
 2241         keg->uk_flags &= ~UMA_ZONE_PCPU;
 2242 #endif
 2243 
 2244         keg_layout(keg);
 2245 
 2246         /*
 2247          * Use a first-touch NUMA policy for kegs that pmap_extract() will
 2248          * work on.  Use round-robin for everything else.
 2249          *
 2250          * Zones may override the default by specifying either.
 2251          */
 2252 #ifdef NUMA
 2253         if ((keg->uk_flags &
 2254             (UMA_ZONE_ROUNDROBIN | UMA_ZFLAG_CACHE | UMA_ZONE_NOTPAGE)) == 0)
 2255                 keg->uk_flags |= UMA_ZONE_FIRSTTOUCH;
 2256         else if ((keg->uk_flags & UMA_ZONE_FIRSTTOUCH) == 0)
 2257                 keg->uk_flags |= UMA_ZONE_ROUNDROBIN;
 2258 #endif
 2259 
 2260         /*
 2261          * If we haven't booted yet we need allocations to go through the
 2262          * startup cache until the vm is ready.
 2263          */
 2264 #ifdef UMA_MD_SMALL_ALLOC
 2265         if (keg->uk_ppera == 1)
 2266                 keg->uk_allocf = uma_small_alloc;
 2267         else
 2268 #endif
 2269         if (booted < BOOT_KVA)
 2270                 keg->uk_allocf = startup_alloc;
 2271         else if (keg->uk_flags & UMA_ZONE_PCPU)
 2272                 keg->uk_allocf = pcpu_page_alloc;
 2273         else if ((keg->uk_flags & UMA_ZONE_CONTIG) != 0 && keg->uk_ppera > 1)
 2274                 keg->uk_allocf = contig_alloc;
 2275         else
 2276                 keg->uk_allocf = page_alloc;
 2277 #ifdef UMA_MD_SMALL_ALLOC
 2278         if (keg->uk_ppera == 1)
 2279                 keg->uk_freef = uma_small_free;
 2280         else
 2281 #endif
 2282         if (keg->uk_flags & UMA_ZONE_PCPU)
 2283                 keg->uk_freef = pcpu_page_free;
 2284         else
 2285                 keg->uk_freef = page_free;
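              /*
               * To summarize the selection above: on UMA_MD_SMALL_ALLOC
               * platforms single-page kegs always use uma_small_alloc and
               * uma_small_free; otherwise allocations go through
               * startup_alloc until BOOT_KVA, PCPU kegs use the per-CPU page
               * routines, CONTIG kegs with multi-page slabs use contig_alloc,
               * and everything else uses page_alloc and page_free.  Pages
               * handed out by startup_alloc are recognized later by the
               * UMA_SLAB_BOOT flag in page_free() and pcpu_page_free().
               */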
 2286 
 2287         /*
 2288          * Initialize keg's locks.
 2289          */
 2290         for (i = 0; i < vm_ndomains; i++)
 2291                 KEG_LOCK_INIT(keg, i, (arg->flags & UMA_ZONE_MTXCLASS));
 2292 
 2293         /*
 2294          * If we're putting the slab header in the actual page we need to
 2295          * figure out where in each page it goes.  See slab_sizeof
 2296          * definition.
 2297          */
 2298         if (!(keg->uk_flags & UMA_ZFLAG_OFFPAGE)) {
 2299                 size_t shsize;
 2300 
 2301                 shsize = slab_sizeof(keg->uk_ipers);
 2302                 keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - shsize;
 2303                 /*
 2304                  * The only way the following assertion can fail is if our
 2305                  * UMA_ALIGN_PTR adjustments have made the header bigger
 2306                  * than UMA_SLAB_SIZE.  I haven't checked whether this is
 2307                  * mathematically possible for all cases, so we make
 2308                  * sure here anyway.
 2309                  */
 2310                 KASSERT(keg->uk_pgoff + shsize <= PAGE_SIZE * keg->uk_ppera,
 2311                     ("zone %s ipers %d rsize %d size %d slab won't fit",
 2312                     zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size));
 2313         }
 2314 
 2315         if (keg->uk_flags & UMA_ZFLAG_HASH)
 2316                 hash_alloc(&keg->uk_hash, 0);
 2317 
 2318         CTR3(KTR_UMA, "keg_ctor %p zone %s(%p)", keg, zone->uz_name, zone);
 2319 
 2320         LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
 2321 
 2322         rw_wlock(&uma_rwlock);
 2323         LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
 2324         rw_wunlock(&uma_rwlock);
 2325         return (0);
 2326 }
 2327 
 2328 static void
 2329 zone_kva_available(uma_zone_t zone, void *unused)
 2330 {
 2331         uma_keg_t keg;
 2332 
 2333         if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0)
 2334                 return;
 2335         KEG_GET(zone, keg);
 2336 
 2337         if (keg->uk_allocf == startup_alloc) {
 2338                 /* Switch to the real allocator. */
 2339                 if (keg->uk_flags & UMA_ZONE_PCPU)
 2340                         keg->uk_allocf = pcpu_page_alloc;
 2341                 else if ((keg->uk_flags & UMA_ZONE_CONTIG) != 0 &&
 2342                     keg->uk_ppera > 1)
 2343                         keg->uk_allocf = contig_alloc;
 2344                 else
 2345                         keg->uk_allocf = page_alloc;
 2346         }
 2347 }
 2348 
 2349 static void
 2350 zone_alloc_counters(uma_zone_t zone, void *unused)
 2351 {
 2352 
 2353         zone->uz_allocs = counter_u64_alloc(M_WAITOK);
 2354         zone->uz_frees = counter_u64_alloc(M_WAITOK);
 2355         zone->uz_fails = counter_u64_alloc(M_WAITOK);
 2356         zone->uz_xdomain = counter_u64_alloc(M_WAITOK);
 2357 }
 2358 
 2359 static void
 2360 zone_alloc_sysctl(uma_zone_t zone, void *unused)
 2361 {
 2362         uma_zone_domain_t zdom;
 2363         uma_domain_t dom;
 2364         uma_keg_t keg;
 2365         struct sysctl_oid *oid, *domainoid;
 2366         int domains, i, cnt;
 2367         static const char *nokeg = "cache zone";
 2368         char *c;
 2369 
 2370         /*
 2371          * Make a sysctl safe copy of the zone name by removing
 2372          * any special characters and handling dups by appending
 2373          * an index.
 2374          */
 2375         if (zone->uz_namecnt != 0) {
 2376                 /* Count the number of decimal digits and '_' separator. */
 2377                 for (i = 1, cnt = zone->uz_namecnt; cnt != 0; i++)
 2378                         cnt /= 10;
 2379                 zone->uz_ctlname = malloc(strlen(zone->uz_name) + i + 1,
 2380                     M_UMA, M_WAITOK);
 2381                 sprintf(zone->uz_ctlname, "%s_%d", zone->uz_name,
 2382                     zone->uz_namecnt);
 2383         } else
 2384                 zone->uz_ctlname = strdup(zone->uz_name, M_UMA);
 2385         for (c = zone->uz_ctlname; *c != '\0'; c++)
 2386                 if (strchr("./\\ -", *c) != NULL)
 2387                         *c = '_';
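              /*
               * For example, a hypothetical duplicate zone named "foo bar"
               * with uz_namecnt == 2 first becomes "foo bar_2" and then
               * "foo_bar_2" once the loop above rewrites the disallowed
               * characters, so its sysctl node is vm.uma.foo_bar_2.
               */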
 2388 
 2389         /*
 2390          * Basic parameters at the root.
 2391          */
 2392         zone->uz_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_vm_uma),
 2393             OID_AUTO, zone->uz_ctlname, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 2394         oid = zone->uz_oid;
 2395         SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2396             "size", CTLFLAG_RD, &zone->uz_size, 0, "Allocation size");
 2397         SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2398             "flags", CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_MPSAFE,
 2399             zone, 0, sysctl_handle_uma_zone_flags, "A",
 2400             "Allocator configuration flags");
 2401         SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2402             "bucket_size", CTLFLAG_RD, &zone->uz_bucket_size, 0,
 2403             "Desired per-cpu cache size");
 2404         SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2405             "bucket_size_max", CTLFLAG_RD, &zone->uz_bucket_size_max, 0,
 2406             "Maximum allowed per-cpu cache size");
 2407 
 2408         /*
 2409          * Keg, if present.
 2410          */
 2411         if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0)
 2412                 domains = vm_ndomains;
 2413         else
 2414                 domains = 1;
 2415         oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO,
 2416             "keg", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 2417         keg = zone->uz_keg;
 2418         if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) {
 2419                 SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2420                     "name", CTLFLAG_RD, keg->uk_name, "Keg name");
 2421                 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2422                     "rsize", CTLFLAG_RD, &keg->uk_rsize, 0,
 2423                     "Real object size with alignment");
 2424                 SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2425                     "ppera", CTLFLAG_RD, &keg->uk_ppera, 0,
 2426                     "pages per-slab allocation");
 2427                 SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2428                     "ipers", CTLFLAG_RD, &keg->uk_ipers, 0,
 2429                     "items available per-slab");
 2430                 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2431                     "align", CTLFLAG_RD, &keg->uk_align, 0,
 2432                     "item alignment mask");
 2433                 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2434                     "reserve", CTLFLAG_RD, &keg->uk_reserve, 0,
 2435                     "number of reserved items");
 2436                 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2437                     "efficiency", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE,
 2438                     keg, 0, sysctl_handle_uma_slab_efficiency, "I",
 2439                     "Slab utilization (100 - internal fragmentation %)");
 2440                 domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(oid),
 2441                     OID_AUTO, "domain", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 2442                 for (i = 0; i < domains; i++) {
 2443                         dom = &keg->uk_domain[i];
 2444                         oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid),
 2445                             OID_AUTO, VM_DOMAIN(i)->vmd_name,
 2446                             CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 2447                         SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2448                             "pages", CTLFLAG_RD, &dom->ud_pages, 0,
 2449                             "Total pages currently allocated from VM");
 2450                         SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2451                             "free_items", CTLFLAG_RD, &dom->ud_free_items, 0,
 2452                             "items free in the slab layer");
 2453                 }
 2454         } else
 2455                 SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2456                     "name", CTLFLAG_RD, nokeg, "Keg name");
 2457 
 2458         /*
 2459          * Information about zone limits.
 2460          */
 2461         oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO,
 2462             "limit", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 2463         SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2464             "items", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE,
 2465             zone, 0, sysctl_handle_uma_zone_items, "QU",
 2466             "Current number of allocated items if limit is set");
 2467         SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2468             "max_items", CTLFLAG_RD, &zone->uz_max_items, 0,
 2469             "Maximum number of allocated and cached items");
 2470         SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2471             "sleepers", CTLFLAG_RD, &zone->uz_sleepers, 0,
 2472             "Number of threads sleeping at limit");
 2473         SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2474             "sleeps", CTLFLAG_RD, &zone->uz_sleeps, 0,
 2475             "Total zone limit sleeps");
 2476         SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2477             "bucket_max", CTLFLAG_RD, &zone->uz_bucket_max, 0,
 2478             "Maximum number of items in each domain's bucket cache");
 2479 
 2480         /*
 2481          * Per-domain zone information.
 2482          */
 2483         domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid),
 2484             OID_AUTO, "domain", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 2485         for (i = 0; i < domains; i++) {
 2486                 zdom = ZDOM_GET(zone, i);
 2487                 oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid),
 2488                     OID_AUTO, VM_DOMAIN(i)->vmd_name,
 2489                     CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 2490                 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2491                     "nitems", CTLFLAG_RD, &zdom->uzd_nitems,
 2492                     "number of items in this domain");
 2493                 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2494                     "imax", CTLFLAG_RD, &zdom->uzd_imax,
 2495                     "maximum item count in this period");
 2496                 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2497                     "imin", CTLFLAG_RD, &zdom->uzd_imin,
 2498                     "minimum item count in this period");
 2499                 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2500                     "wss", CTLFLAG_RD, &zdom->uzd_wss,
 2501                     "Working set size");
 2502         }
 2503 
 2504         /*
 2505          * General statistics.
 2506          */
 2507         oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO,
 2508             "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 2509         SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2510             "current", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE,
 2511             zone, 1, sysctl_handle_uma_zone_cur, "I",
 2512             "Current number of allocated items");
 2513         SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2514             "allocs", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE,
 2515             zone, 0, sysctl_handle_uma_zone_allocs, "QU",
 2516             "Total allocation calls");
 2517         SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2518             "frees", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE,
 2519             zone, 0, sysctl_handle_uma_zone_frees, "QU",
 2520             "Total free calls");
 2521         SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2522             "fails", CTLFLAG_RD, &zone->uz_fails,
 2523             "Number of allocation failures");
 2524         SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 2525             "xdomain", CTLFLAG_RD, &zone->uz_xdomain,
 2526             "Free calls from the wrong domain");
 2527 }
 2528 
 2529 struct uma_zone_count {
 2530         const char      *name;
 2531         int             count;
 2532 };
 2533 
 2534 static void
 2535 zone_count(uma_zone_t zone, void *arg)
 2536 {
 2537         struct uma_zone_count *cnt;
 2538 
 2539         cnt = arg;
 2540         /*
 2541          * Some zones are rapidly created with identical names and
 2542          * destroyed out of order.  This can lead to gaps in the count.
 2543          * Use one greater than the maximum observed for this name.
 2544          */
 2545         if (strcmp(zone->uz_name, cnt->name) == 0)
 2546                 cnt->count = MAX(cnt->count,
 2547                     zone->uz_namecnt + 1);
 2548 }
 2549 
 2550 static void
 2551 zone_update_caches(uma_zone_t zone)
 2552 {
 2553         int i;
 2554 
 2555         for (i = 0; i <= mp_maxid; i++) {
 2556                 cache_set_uz_size(&zone->uz_cpu[i], zone->uz_size);
 2557                 cache_set_uz_flags(&zone->uz_cpu[i], zone->uz_flags);
 2558         }
 2559 }
 2560 
 2561 /*
 2562  * Zone header ctor.  This initializes all fields, locks, etc.
 2563  *
 2564  * Arguments/Returns follow uma_ctor specifications
 2565  *      udata  Actually uma_zctor_args
 2566  */
 2567 static int
 2568 zone_ctor(void *mem, int size, void *udata, int flags)
 2569 {
 2570         struct uma_zone_count cnt;
 2571         struct uma_zctor_args *arg = udata;
 2572         uma_zone_domain_t zdom;
 2573         uma_zone_t zone = mem;
 2574         uma_zone_t z;
 2575         uma_keg_t keg;
 2576         int i;
 2577 
 2578         bzero(zone, size);
 2579         zone->uz_name = arg->name;
 2580         zone->uz_ctor = arg->ctor;
 2581         zone->uz_dtor = arg->dtor;
 2582         zone->uz_init = NULL;
 2583         zone->uz_fini = NULL;
 2584         zone->uz_sleeps = 0;
 2585         zone->uz_bucket_size = 0;
 2586         zone->uz_bucket_size_min = 0;
 2587         zone->uz_bucket_size_max = BUCKET_MAX;
 2588         zone->uz_flags = (arg->flags & UMA_ZONE_SMR);
 2589         zone->uz_warning = NULL;
 2590         /* The domain structures follow the cpu structures. */
 2591         zone->uz_bucket_max = ULONG_MAX;
 2592         timevalclear(&zone->uz_ratecheck);
 2593 
 2594         /* Count the number of duplicate names. */
 2595         cnt.name = arg->name;
 2596         cnt.count = 0;
 2597         zone_foreach(zone_count, &cnt);
 2598         zone->uz_namecnt = cnt.count;
 2599         ZONE_CROSS_LOCK_INIT(zone);
 2600 
 2601         for (i = 0; i < vm_ndomains; i++) {
 2602                 zdom = ZDOM_GET(zone, i);
 2603                 ZDOM_LOCK_INIT(zone, zdom, (arg->flags & UMA_ZONE_MTXCLASS));
 2604                 STAILQ_INIT(&zdom->uzd_buckets);
 2605         }
 2606 
 2607 #ifdef INVARIANTS
 2608         if (arg->uminit == trash_init && arg->fini == trash_fini)
 2609                 zone->uz_flags |= UMA_ZFLAG_TRASH | UMA_ZFLAG_CTORDTOR;
 2610 #endif
 2611 
 2612         /*
 2613          * This is a pure cache zone, no kegs.
 2614          */
 2615         if (arg->import) {
 2616                 KASSERT((arg->flags & UMA_ZFLAG_CACHE) != 0,
 2617                     ("zone_ctor: Import specified for non-cache zone."));
 2618                 zone->uz_flags = arg->flags;
 2619                 zone->uz_size = arg->size;
 2620                 zone->uz_import = arg->import;
 2621                 zone->uz_release = arg->release;
 2622                 zone->uz_arg = arg->arg;
 2623 #ifdef NUMA
 2624                 /*
 2625                  * Cache zones are round-robin unless a policy is
 2626                  * specified because they may have incompatible
 2627                  * constraints.
 2628                  */
 2629                 if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) == 0)
 2630                         zone->uz_flags |= UMA_ZONE_ROUNDROBIN;
 2631 #endif
 2632                 rw_wlock(&uma_rwlock);
 2633                 LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link);
 2634                 rw_wunlock(&uma_rwlock);
 2635                 goto out;
 2636         }
 2637 
 2638         /*
 2639          * Use the regular zone/keg/slab allocator.
 2640          */
 2641         zone->uz_import = zone_import;
 2642         zone->uz_release = zone_release;
 2643         zone->uz_arg = zone; 
 2644         keg = arg->keg;
 2645 
 2646         if (arg->flags & UMA_ZONE_SECONDARY) {
 2647                 KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
 2648                     ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
 2649                 KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
 2650                 zone->uz_init = arg->uminit;
 2651                 zone->uz_fini = arg->fini;
 2652                 zone->uz_flags |= UMA_ZONE_SECONDARY;
 2653                 rw_wlock(&uma_rwlock);
 2654                 ZONE_LOCK(zone);
 2655                 LIST_FOREACH(z, &keg->uk_zones, uz_link) {
 2656                         if (LIST_NEXT(z, uz_link) == NULL) {
 2657                                 LIST_INSERT_AFTER(z, zone, uz_link);
 2658                                 break;
 2659                         }
 2660                 }
 2661                 ZONE_UNLOCK(zone);
 2662                 rw_wunlock(&uma_rwlock);
 2663         } else if (keg == NULL) {
 2664                 if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
 2665                     arg->align, arg->flags)) == NULL)
 2666                         return (ENOMEM);
 2667         } else {
 2668                 struct uma_kctor_args karg;
 2669                 int error;
 2670 
 2671                 /* We should only be here from uma_startup() */
 2672                 karg.size = arg->size;
 2673                 karg.uminit = arg->uminit;
 2674                 karg.fini = arg->fini;
 2675                 karg.align = arg->align;
 2676                 karg.flags = (arg->flags & ~UMA_ZONE_SMR);
 2677                 karg.zone = zone;
 2678                 error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
 2679                     flags);
 2680                 if (error)
 2681                         return (error);
 2682         }
 2683 
 2684         /* Inherit properties from the keg. */
 2685         zone->uz_keg = keg;
 2686         zone->uz_size = keg->uk_size;
 2687         zone->uz_flags |= (keg->uk_flags &
 2688             (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
 2689 
 2690 out:
 2691         if (booted >= BOOT_PCPU) {
 2692                 zone_alloc_counters(zone, NULL);
 2693                 if (booted >= BOOT_RUNNING)
 2694                         zone_alloc_sysctl(zone, NULL);
 2695         } else {
 2696                 zone->uz_allocs = EARLY_COUNTER;
 2697                 zone->uz_frees = EARLY_COUNTER;
 2698                 zone->uz_fails = EARLY_COUNTER;
 2699         }
 2700 
 2701         /* Caller requests a private SMR context. */
 2702         if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
 2703                 zone->uz_smr = smr_create(zone->uz_name, 0, 0);
 2704 
 2705         KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
 2706             (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
 2707             ("Invalid zone flag combination"));
 2708         if (arg->flags & UMA_ZFLAG_INTERNAL)
 2709                 zone->uz_bucket_size_max = zone->uz_bucket_size = 0;
 2710         if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0)
 2711                 zone->uz_bucket_size = BUCKET_MAX;
 2712         else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0)
 2713                 zone->uz_bucket_size = 0;
 2714         else
 2715                 zone->uz_bucket_size = bucket_select(zone->uz_size);
 2716         zone->uz_bucket_size_min = zone->uz_bucket_size;
 2717         if (zone->uz_dtor != NULL || zone->uz_ctor != NULL)
 2718                 zone->uz_flags |= UMA_ZFLAG_CTORDTOR;
 2719         zone_update_caches(zone);
 2720 
 2721         return (0);
 2722 }
 2723 
 2724 /*
 2725  * Keg header dtor.  This frees all data, destroys locks, frees the hash
 2726  * table and removes the keg from the global list.
 2727  *
 2728  * Arguments/Returns follow uma_dtor specifications
 2729  *      udata  unused
 2730  */
 2731 static void
 2732 keg_dtor(void *arg, int size, void *udata)
 2733 {
 2734         uma_keg_t keg;
 2735         uint32_t free, pages;
 2736         int i;
 2737 
 2738         keg = (uma_keg_t)arg;
 2739         free = pages = 0;
 2740         for (i = 0; i < vm_ndomains; i++) {
 2741                 free += keg->uk_domain[i].ud_free_items;
 2742                 pages += keg->uk_domain[i].ud_pages;
 2743                 KEG_LOCK_FINI(keg, i);
 2744         }
 2745         if (pages != 0)
 2746                 printf("Freed UMA keg (%s) was not empty (%u items). "
 2747                     "Lost %u pages of memory.\n",
 2748                     keg->uk_name ? keg->uk_name : "",
 2749                     pages / keg->uk_ppera * keg->uk_ipers - free, pages);
 2750 
 2751         hash_free(&keg->uk_hash);
 2752 }
 2753 
 2754 /*
 2755  * Zone header dtor.
 2756  *
 2757  * Arguments/Returns follow uma_dtor specifications
 2758  *      udata  unused
 2759  */
 2760 static void
 2761 zone_dtor(void *arg, int size, void *udata)
 2762 {
 2763         uma_zone_t zone;
 2764         uma_keg_t keg;
 2765         int i;
 2766 
 2767         zone = (uma_zone_t)arg;
 2768 
 2769         sysctl_remove_oid(zone->uz_oid, 1, 1);
 2770 
 2771         if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
 2772                 cache_drain(zone);
 2773 
 2774         rw_wlock(&uma_rwlock);
 2775         LIST_REMOVE(zone, uz_link);
 2776         rw_wunlock(&uma_rwlock);
 2777         if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) {
 2778                 keg = zone->uz_keg;
 2779                 keg->uk_reserve = 0;
 2780         }
 2781         zone_reclaim(zone, M_WAITOK, true);
 2782 
 2783         /*
 2784          * We only destroy kegs from non-secondary/non-cache zones.
 2785          */
 2786         if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) {
 2787                 keg = zone->uz_keg;
 2788                 rw_wlock(&uma_rwlock);
 2789                 LIST_REMOVE(keg, uk_link);
 2790                 rw_wunlock(&uma_rwlock);
 2791                 zone_free_item(kegs, keg, NULL, SKIP_NONE);
 2792         }
 2793         counter_u64_free(zone->uz_allocs);
 2794         counter_u64_free(zone->uz_frees);
 2795         counter_u64_free(zone->uz_fails);
 2796         counter_u64_free(zone->uz_xdomain);
 2797         free(zone->uz_ctlname, M_UMA);
 2798         for (i = 0; i < vm_ndomains; i++)
 2799                 ZDOM_LOCK_FINI(ZDOM_GET(zone, i));
 2800         ZONE_CROSS_LOCK_FINI(zone);
 2801 }
 2802 
 2803 static void
 2804 zone_foreach_unlocked(void (*zfunc)(uma_zone_t, void *arg), void *arg)
 2805 {
 2806         uma_keg_t keg;
 2807         uma_zone_t zone;
 2808 
 2809         LIST_FOREACH(keg, &uma_kegs, uk_link) {
 2810                 LIST_FOREACH(zone, &keg->uk_zones, uz_link)
 2811                         zfunc(zone, arg);
 2812         }
 2813         LIST_FOREACH(zone, &uma_cachezones, uz_link)
 2814                 zfunc(zone, arg);
 2815 }
 2816 
 2817 /*
 2818  * Traverses every zone in the system and calls a callback
 2819  *
 2820  * Arguments:
 2821  *      zfunc  A pointer to a function which accepts a zone
 2822  *              as an argument.
 2823  *
 2824  * Returns:
 2825  *      Nothing
 2826  */
 2827 static void
 2828 zone_foreach(void (*zfunc)(uma_zone_t, void *arg), void *arg)
 2829 {
 2830 
 2831         rw_rlock(&uma_rwlock);
 2832         zone_foreach_unlocked(zfunc, arg);
 2833         rw_runlock(&uma_rwlock);
 2834 }
 2835 
 2836 /*
 2837  * Initialize the kernel memory allocator.  This is done after pages can be
 2838  * allocated but before general KVA is available.
 2839  */
 2840 void
 2841 uma_startup1(vm_offset_t virtual_avail)
 2842 {
 2843         struct uma_zctor_args args;
 2844         size_t ksize, zsize, size;
 2845         uma_keg_t primarykeg;
 2846         uintptr_t m;
 2847         int domain;
 2848         uint8_t pflag;
 2849 
 2850         bootstart = bootmem = virtual_avail;
 2851 
 2852         rw_init(&uma_rwlock, "UMA lock");
 2853         sx_init(&uma_reclaim_lock, "umareclaim");
 2854 
 2855         ksize = sizeof(struct uma_keg) +
 2856             (sizeof(struct uma_domain) * vm_ndomains);
 2857         ksize = roundup(ksize, UMA_SUPER_ALIGN);
 2858         zsize = sizeof(struct uma_zone) +
 2859             (sizeof(struct uma_cache) * (mp_maxid + 1)) +
 2860             (sizeof(struct uma_zone_domain) * vm_ndomains);
 2861         zsize = roundup(zsize, UMA_SUPER_ALIGN);
 2862 
 2863         /* Allocate the zone of zones, zone of kegs, and zone of zones keg. */
 2864         size = (zsize * 2) + ksize;
 2865         for (domain = 0; domain < vm_ndomains; domain++) {
 2866                 m = (uintptr_t)startup_alloc(NULL, size, domain, &pflag,
 2867                     M_NOWAIT | M_ZERO);
 2868                 if (m != 0)
 2869                         break;
 2870         }
 2871         zones = (uma_zone_t)m;
 2872         m += zsize;
 2873         kegs = (uma_zone_t)m;
 2874         m += zsize;
 2875         primarykeg = (uma_keg_t)m;
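              /*
               * The single boot-time allocation above is carved up in place:
               * the first zsize bytes hold the zone of zones, the next zsize
               * bytes the zone of kegs, and the final ksize bytes hold the
               * keg handed to keg_ctor() below for the "UMA Kegs" zone.
               */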
 2876 
 2877         /* "manually" create the initial zone */
 2878         memset(&args, 0, sizeof(args));
 2879         args.name = "UMA Kegs";
 2880         args.size = ksize;
 2881         args.ctor = keg_ctor;
 2882         args.dtor = keg_dtor;
 2883         args.uminit = zero_init;
 2884         args.fini = NULL;
 2885         args.keg = primarykeg;
 2886         args.align = UMA_SUPER_ALIGN - 1;
 2887         args.flags = UMA_ZFLAG_INTERNAL;
 2888         zone_ctor(kegs, zsize, &args, M_WAITOK);
 2889 
 2890         args.name = "UMA Zones";
 2891         args.size = zsize;
 2892         args.ctor = zone_ctor;
 2893         args.dtor = zone_dtor;
 2894         args.uminit = zero_init;
 2895         args.fini = NULL;
 2896         args.keg = NULL;
 2897         args.align = UMA_SUPER_ALIGN - 1;
 2898         args.flags = UMA_ZFLAG_INTERNAL;
 2899         zone_ctor(zones, zsize, &args, M_WAITOK);
 2900 
 2901         /* Now make zones for slab headers */
 2902         slabzones[0] = uma_zcreate("UMA Slabs 0", SLABZONE0_SIZE,
 2903             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
 2904         slabzones[1] = uma_zcreate("UMA Slabs 1", SLABZONE1_SIZE,
 2905             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
 2906 
 2907         hashzone = uma_zcreate("UMA Hash",
 2908             sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
 2909             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
 2910 
 2911         bucket_init();
 2912         smr_init();
 2913 }
 2914 
 2915 #ifndef UMA_MD_SMALL_ALLOC
 2916 extern void vm_radix_reserve_kva(void);
 2917 #endif
 2918 
 2919 /*
 2920  * Advertise the availability of normal kva allocations and switch to
 2921  * the default back-end allocator.  Mark the KVA we consumed on startup
 2922  * as used in the map.
 2923  */
 2924 void
 2925 uma_startup2(void)
 2926 {
 2927 
 2928         if (bootstart != bootmem) {
 2929                 vm_map_lock(kernel_map);
 2930                 (void)vm_map_insert(kernel_map, NULL, 0, bootstart, bootmem,
 2931                     VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
 2932                 vm_map_unlock(kernel_map);
 2933         }
 2934 
 2935 #ifndef UMA_MD_SMALL_ALLOC
 2936         /* Set up radix zone to use noobj_alloc. */
 2937         vm_radix_reserve_kva();
 2938 #endif
 2939 
 2940         booted = BOOT_KVA;
 2941         zone_foreach_unlocked(zone_kva_available, NULL);
 2942         bucket_enable();
 2943 }
 2944 
 2945 /*
 2946  * Allocate counters as early as possible so that boot-time allocations are
 2947  * accounted more precisely.
 2948  */
 2949 static void
 2950 uma_startup_pcpu(void *arg __unused)
 2951 {
 2952 
 2953         zone_foreach_unlocked(zone_alloc_counters, NULL);
 2954         booted = BOOT_PCPU;
 2955 }
 2956 SYSINIT(uma_startup_pcpu, SI_SUB_COUNTER, SI_ORDER_ANY, uma_startup_pcpu, NULL);
 2957 
 2958 /*
 2959  * Finish our initialization steps.
 2960  */
 2961 static void
 2962 uma_startup3(void *arg __unused)
 2963 {
 2964 
 2965 #ifdef INVARIANTS
 2966         TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor);
 2967         uma_dbg_cnt = counter_u64_alloc(M_WAITOK);
 2968         uma_skip_cnt = counter_u64_alloc(M_WAITOK);
 2969 #endif
 2970         zone_foreach_unlocked(zone_alloc_sysctl, NULL);
 2971         callout_init(&uma_callout, 1);
 2972         callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
 2973         booted = BOOT_RUNNING;
 2974 
 2975         EVENTHANDLER_REGISTER(shutdown_post_sync, uma_shutdown, NULL,
 2976             EVENTHANDLER_PRI_FIRST);
 2977 }
 2978 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
 2979 
 2980 static void
 2981 uma_shutdown(void)
 2982 {
 2983 
 2984         booted = BOOT_SHUTDOWN;
 2985 }
 2986 
 2987 static uma_keg_t
 2988 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
 2989                 int align, uint32_t flags)
 2990 {
 2991         struct uma_kctor_args args;
 2992 
 2993         args.size = size;
 2994         args.uminit = uminit;
 2995         args.fini = fini;
 2996         args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
 2997         args.flags = flags;
 2998         args.zone = zone;
 2999         return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK));
 3000 }
 3001 
 3002 /* Public functions */
 3003 /* See uma.h */
 3004 void
 3005 uma_set_align(int align)
 3006 {
 3007 
 3008         if (align != UMA_ALIGN_CACHE)
 3009                 uma_align_cache = align;
 3010 }
 3011 
 3012 /* See uma.h */
 3013 uma_zone_t
 3014 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
 3015                 uma_init uminit, uma_fini fini, int align, uint32_t flags)
 3016 
 3017 {
 3018         struct uma_zctor_args args;
 3019         uma_zone_t res;
 3020 
 3021         KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"",
 3022             align, name));
 3023 
 3024         /* This stuff is essential for the zone ctor */
 3025         memset(&args, 0, sizeof(args));
 3026         args.name = name;
 3027         args.size = size;
 3028         args.ctor = ctor;
 3029         args.dtor = dtor;
 3030         args.uminit = uminit;
 3031         args.fini = fini;
 3032 #ifdef  INVARIANTS
 3033         /*
 3034          * Inject procedures which check for memory use after free if we are
 3035          * allowed to scramble the memory while it is not allocated.  This
 3036          * requires that: UMA is actually able to access the memory, no init
 3037          * or fini procedures, no dependency on the initial value of the
 3038          * memory, and no (legitimate) use of the memory after free.  Note,
 3039          * the ctor and dtor do not need to be empty.
 3040          */
 3041         if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOTOUCH |
 3042             UMA_ZONE_NOFREE))) && uminit == NULL && fini == NULL) {
 3043                 args.uminit = trash_init;
 3044                 args.fini = trash_fini;
 3045         }
 3046 #endif
 3047         args.align = align;
 3048         args.flags = flags;
 3049         args.keg = NULL;
 3050 
 3051         sx_slock(&uma_reclaim_lock);
 3052         res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
 3053         sx_sunlock(&uma_reclaim_lock);
 3054 
 3055         return (res);
 3056 }
 3057 
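      /*
       * Usage sketch for uma_zcreate() (hypothetical consumer; "struct foo",
       * foo_zone, foo_ctor, foo_dtor and f_flags are placeholder names):
       * a zone is typically created once, used for many allocations, and
       * destroyed on teardown.
       *
       *      #include <vm/uma.h>
       *
       *      static uma_zone_t foo_zone;
       *
       *      static int
       *      foo_ctor(void *mem, int size, void *arg, int flags)
       *      {
       *              struct foo *f = mem;
       *
       *              f->f_flags = 0;
       *              return (0);
       *      }
       *
       *      static void
       *      foo_dtor(void *mem, int size, void *arg)
       *      {
       *              struct foo *f = mem;
       *
       *              KASSERT(f->f_flags == 0, ("foo freed while busy"));
       *      }
       *
       *      foo_zone = uma_zcreate("foo", sizeof(struct foo), foo_ctor,
       *          foo_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
       *
       *      f = uma_zalloc(foo_zone, M_WAITOK);
       *      ...
       *      uma_zfree(foo_zone, f);
       *      uma_zdestroy(foo_zone);
       */
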
 3058 /* See uma.h */
 3059 uma_zone_t
 3060 uma_zsecond_create(const char *name, uma_ctor ctor, uma_dtor dtor,
 3061     uma_init zinit, uma_fini zfini, uma_zone_t primary)
 3062 {
 3063         struct uma_zctor_args args;
 3064         uma_keg_t keg;
 3065         uma_zone_t res;
 3066 
 3067         keg = primary->uz_keg;
 3068         memset(&args, 0, sizeof(args));
 3069         args.name = name;
 3070         args.size = keg->uk_size;
 3071         args.ctor = ctor;
 3072         args.dtor = dtor;
 3073         args.uminit = zinit;
 3074         args.fini = zfini;
 3075         args.align = keg->uk_align;
 3076         args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
 3077         args.keg = keg;
 3078 
 3079         sx_slock(&uma_reclaim_lock);
 3080         res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
 3081         sx_sunlock(&uma_reclaim_lock);
 3082 
 3083         return (res);
 3084 }
 3085 
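      /*
       * Usage sketch for uma_zsecond_create() (hypothetical; all names are
       * placeholders): a secondary zone shares the primary zone's keg, and
       * therefore its item size and alignment, while layering its own
       * ctor/dtor and zinit/zfini on top:
       *
       *      fancy_foo_zone = uma_zsecond_create("fancy foo", fancy_ctor,
       *          fancy_dtor, NULL, NULL, foo_zone);
       *
       * Items allocated through either zone are carved from the same slabs;
       * only the per-zone initialization differs.
       */
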
 3086 /* See uma.h */
 3087 uma_zone_t
 3088 uma_zcache_create(const char *name, int size, uma_ctor ctor, uma_dtor dtor,
 3089     uma_init zinit, uma_fini zfini, uma_import zimport, uma_release zrelease,
 3090     void *arg, int flags)
 3091 {
 3092         struct uma_zctor_args args;
 3093 
 3094         memset(&args, 0, sizeof(args));
 3095         args.name = name;
 3096         args.size = size;
 3097         args.ctor = ctor;
 3098         args.dtor = dtor;
 3099         args.uminit = zinit;
 3100         args.fini = zfini;
 3101         args.import = zimport;
 3102         args.release = zrelease;
 3103         args.arg = arg;
 3104         args.align = 0;
 3105         args.flags = flags | UMA_ZFLAG_CACHE;
 3106 
 3107         return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK));
 3108 }
 3109 
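      /*
       * Usage sketch for uma_zcache_create() (hypothetical; foo_cache and
       * the backing_get()/backing_put() helpers are placeholders for
       * whatever owns the items): a cache zone has no keg.  The caller
       * supplies import and release functions that move batches of items
       * between its backing store and UMA's bucket layer:
       *
       *      static int
       *      foo_import(void *arg, void **store, int count, int domain,
       *          int flags)
       *      {
       *              int i;
       *
       *              for (i = 0; i < count; i++) {
       *                      store[i] = backing_get(arg, flags);
       *                      if (store[i] == NULL)
       *                              break;
       *              }
       *              return (i);
       *      }
       *
       *      static void
       *      foo_release(void *arg, void **store, int count)
       *      {
       *              int i;
       *
       *              for (i = 0; i < count; i++)
       *                      backing_put(arg, store[i]);
       *      }
       *
       *      foo_cache = uma_zcache_create("foo cache", sizeof(struct foo),
       *          NULL, NULL, NULL, NULL, foo_import, foo_release, arg, 0);
       */
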
 3110 /* See uma.h */
 3111 void
 3112 uma_zdestroy(uma_zone_t zone)
 3113 {
 3114 
 3115         /*
 3116          * Large slabs are expensive to reclaim, so don't bother doing
 3117          * unnecessary work if we're shutting down.
 3118          */
 3119         if (booted == BOOT_SHUTDOWN &&
 3120             zone->uz_fini == NULL && zone->uz_release == zone_release)
 3121                 return;
 3122         sx_slock(&uma_reclaim_lock);
 3123         zone_free_item(zones, zone, NULL, SKIP_NONE);
 3124         sx_sunlock(&uma_reclaim_lock);
 3125 }
 3126 
 3127 void
 3128 uma_zwait(uma_zone_t zone)
 3129 {
 3130 
 3131         if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
 3132                 uma_zfree_smr(zone, uma_zalloc_smr(zone, M_WAITOK));
 3133         else if ((zone->uz_flags & UMA_ZONE_PCPU) != 0)
 3134                 uma_zfree_pcpu(zone, uma_zalloc_pcpu(zone, M_WAITOK));
 3135         else
 3136                 uma_zfree(zone, uma_zalloc(zone, M_WAITOK));
 3137 }
 3138 
 3139 void *
 3140 uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags)
 3141 {
 3142         void *item, *pcpu_item;
 3143 #ifdef SMP
 3144         int i;
 3145 
 3146         MPASS(zone->uz_flags & UMA_ZONE_PCPU);
 3147 #endif
 3148         item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO);
 3149         if (item == NULL)
 3150                 return (NULL);
 3151         pcpu_item = zpcpu_base_to_offset(item);
 3152         if (flags & M_ZERO) {
 3153 #ifdef SMP
 3154                 for (i = 0; i <= mp_maxid; i++)
 3155                         bzero(zpcpu_get_cpu(pcpu_item, i), zone->uz_size);
 3156 #else
 3157                 bzero(item, zone->uz_size);
 3158 #endif
 3159         }
 3160         return (pcpu_item);
 3161 }
 3162 
 3163 /*
 3164  * A stub while both regular and pcpu cases are identical.
 3165  */
 3166 void
 3167 uma_zfree_pcpu_arg(uma_zone_t zone, void *pcpu_item, void *udata)
 3168 {
 3169         void *item;
 3170 
 3171 #ifdef SMP
 3172         MPASS(zone->uz_flags & UMA_ZONE_PCPU);
 3173 #endif
 3174         item = zpcpu_offset_to_base(pcpu_item);
 3175         uma_zfree_arg(zone, item, udata);
 3176 }
 3177 
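      /*
       * Usage sketch for per-CPU zones (hypothetical; pcpu_zone, base and
       * counter are placeholders, and the zpcpu_get() accessor from
       * sys/pcpu.h is assumed): the zone is created with UMA_ZONE_PCPU and
       * hands back an offset pointer; each CPU reaches its own replica
       * through the zpcpu accessors.  As seen above, M_ZERO is honored by
       * zeroing every CPU's replica.
       *
       *      pcpu_zone = uma_zcreate("foo pcpu", sizeof(uint64_t), NULL,
       *          NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU);
       *
       *      base = uma_zalloc_pcpu(pcpu_zone, M_WAITOK | M_ZERO);
       *      critical_enter();
       *      counter = zpcpu_get(base);
       *      (*counter)++;
       *      critical_exit();
       *      ...
       *      uma_zfree_pcpu(pcpu_zone, base);
       */
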
 3178 static inline void *
 3179 item_ctor(uma_zone_t zone, int uz_flags, int size, void *udata, int flags,
 3180     void *item)
 3181 {
 3182 #ifdef INVARIANTS
 3183         bool skipdbg;
 3184 
 3185         skipdbg = uma_dbg_zskip(zone, item);
 3186         if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 &&
 3187             zone->uz_ctor != trash_ctor)
 3188                 trash_ctor(item, size, udata, flags);
 3189 #endif
 3190         /* Check flags before loading ctor pointer. */
 3191         if (__predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0) &&
 3192             __predict_false(zone->uz_ctor != NULL) &&
 3193             zone->uz_ctor(item, size, udata, flags) != 0) {
 3194                 counter_u64_add(zone->uz_fails, 1);
 3195                 zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT);
 3196                 return (NULL);
 3197         }
 3198 #ifdef INVARIANTS
 3199         if (!skipdbg)
 3200                 uma_dbg_alloc(zone, NULL, item);
 3201 #endif
 3202         if (__predict_false(flags & M_ZERO))
 3203                 return (memset(item, 0, size));
 3204 
 3205         return (item);
 3206 }
 3207 
 3208 static inline void
 3209 item_dtor(uma_zone_t zone, void *item, int size, void *udata,
 3210     enum zfreeskip skip)
 3211 {
 3212 #ifdef INVARIANTS
 3213         bool skipdbg;
 3214 
 3215         skipdbg = uma_dbg_zskip(zone, item);
 3216         if (skip == SKIP_NONE && !skipdbg) {
 3217                 if ((zone->uz_flags & UMA_ZONE_MALLOC) != 0)
 3218                         uma_dbg_free(zone, udata, item);
 3219                 else
 3220                         uma_dbg_free(zone, NULL, item);
 3221         }
 3222 #endif
 3223         if (__predict_true(skip < SKIP_DTOR)) {
 3224                 if (zone->uz_dtor != NULL)
 3225                         zone->uz_dtor(item, size, udata);
 3226 #ifdef INVARIANTS
 3227                 if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 &&
 3228                     zone->uz_dtor != trash_dtor)
 3229                         trash_dtor(item, size, udata);
 3230 #endif
 3231         }
 3232 }
 3233 
 3234 #ifdef NUMA
 3235 static int
 3236 item_domain(void *item)
 3237 {
 3238         int domain;
 3239 
 3240         domain = vm_phys_domain(vtophys(item));
 3241         KASSERT(domain >= 0 && domain < vm_ndomains,
 3242             ("%s: unknown domain for item %p", __func__, item));
 3243         return (domain);
 3244 }
 3245 #endif
 3246 
 3247 #if defined(INVARIANTS) || defined(DEBUG_MEMGUARD) || defined(WITNESS)
 3248 #define UMA_ZALLOC_DEBUG
 3249 static int
 3250 uma_zalloc_debug(uma_zone_t zone, void **itemp, void *udata, int flags)
 3251 {
 3252         int error;
 3253 
 3254         error = 0;
 3255 #ifdef WITNESS
 3256         if (flags & M_WAITOK) {
 3257                 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 3258                     "uma_zalloc_debug: zone \"%s\"", zone->uz_name);
 3259         }
 3260 #endif
 3261 
 3262 #ifdef INVARIANTS
 3263         KASSERT((flags & M_EXEC) == 0,
 3264             ("uma_zalloc_debug: called with M_EXEC"));
 3265         KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
 3266             ("uma_zalloc_debug: called within spinlock or critical section"));
 3267         KASSERT((zone->uz_flags & UMA_ZONE_PCPU) == 0 || (flags & M_ZERO) == 0,
 3268             ("uma_zalloc_debug: allocating from a pcpu zone with M_ZERO"));
 3269 #endif
 3270 
 3271 #ifdef DEBUG_MEMGUARD
 3272         if ((zone->uz_flags & UMA_ZONE_SMR) == 0 && memguard_cmp_zone(zone)) {
 3273                 void *item;
 3274                 item = memguard_alloc(zone->uz_size, flags);
 3275                 if (item != NULL) {
 3276                         error = EJUSTRETURN;
 3277                         if (zone->uz_init != NULL &&
 3278                             zone->uz_init(item, zone->uz_size, flags) != 0) {
 3279                                 *itemp = NULL;
 3280                                 return (error);
 3281                         }
 3282                         if (zone->uz_ctor != NULL &&
 3283                             zone->uz_ctor(item, zone->uz_size, udata,
 3284                             flags) != 0) {
 3285                                 counter_u64_add(zone->uz_fails, 1);
 3286                                 zone->uz_fini(item, zone->uz_size);
 3287                                 *itemp = NULL;
 3288                                 return (error);
 3289                         }
 3290                         *itemp = item;
 3291                         return (error);
 3292                 }
 3293                 /* This is unfortunate but should not be fatal. */
 3294         }
 3295 #endif
 3296         return (error);
 3297 }
 3298 
 3299 static int
 3300 uma_zfree_debug(uma_zone_t zone, void *item, void *udata)
 3301 {
 3302         KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
 3303             ("uma_zfree_debug: called with spinlock or critical section held"));
 3304 
 3305 #ifdef DEBUG_MEMGUARD
 3306         if ((zone->uz_flags & UMA_ZONE_SMR) == 0 && is_memguard_addr(item)) {
 3307                 if (zone->uz_dtor != NULL)
 3308                         zone->uz_dtor(item, zone->uz_size, udata);
 3309                 if (zone->uz_fini != NULL)
 3310                         zone->uz_fini(item, zone->uz_size);
 3311                 memguard_free(item);
 3312                 return (EJUSTRETURN);
 3313         }
 3314 #endif
 3315         return (0);
 3316 }
 3317 #endif
 3318 
 3319 static inline void *
 3320 cache_alloc_item(uma_zone_t zone, uma_cache_t cache, uma_cache_bucket_t bucket,
 3321     void *udata, int flags)
 3322 {
 3323         void *item;
 3324         int size, uz_flags;
 3325 
 3326         item = cache_bucket_pop(cache, bucket);
 3327         size = cache_uz_size(cache);
 3328         uz_flags = cache_uz_flags(cache);
 3329         critical_exit();
 3330         return (item_ctor(zone, uz_flags, size, udata, flags, item));
 3331 }
 3332 
 3333 static __noinline void *
 3334 cache_alloc_retry(uma_zone_t zone, uma_cache_t cache, void *udata, int flags)
 3335 {
 3336         uma_cache_bucket_t bucket;
 3337         int domain;
 3338 
 3339         while (cache_alloc(zone, cache, udata, flags)) {
 3340                 cache = &zone->uz_cpu[curcpu];
 3341                 bucket = &cache->uc_allocbucket;
 3342                 if (__predict_false(bucket->ucb_cnt == 0))
 3343                         continue;
 3344                 return (cache_alloc_item(zone, cache, bucket, udata, flags));
 3345         }
 3346         critical_exit();
 3347 
 3348         /*
 3349          * We cannot get a bucket, so try to return a single item.
 3350          */
 3351         if (zone->uz_flags & UMA_ZONE_FIRSTTOUCH)
 3352                 domain = PCPU_GET(domain);
 3353         else
 3354                 domain = UMA_ANYDOMAIN;
 3355         return (zone_alloc_item(zone, udata, domain, flags));
 3356 }
 3357 
 3358 /* See uma.h */
 3359 void *
 3360 uma_zalloc_smr(uma_zone_t zone, int flags)
 3361 {
 3362         uma_cache_bucket_t bucket;
 3363         uma_cache_t cache;
 3364 
 3365 #ifdef UMA_ZALLOC_DEBUG
 3366         void *item;
 3367 
 3368         KASSERT((zone->uz_flags & UMA_ZONE_SMR) != 0,
 3369             ("uma_zalloc_arg: called with non-SMR zone."));
 3370         if (uma_zalloc_debug(zone, &item, NULL, flags) == EJUSTRETURN)
 3371                 return (item);
 3372 #endif
 3373 
 3374         critical_enter();
 3375         cache = &zone->uz_cpu[curcpu];
 3376         bucket = &cache->uc_allocbucket;
 3377         if (__predict_false(bucket->ucb_cnt == 0))
 3378                 return (cache_alloc_retry(zone, cache, NULL, flags));
 3379         return (cache_alloc_item(zone, cache, bucket, NULL, flags));
 3380 }
 3381 
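      /*
       * Usage sketch for SMR zones (hypothetical; foo_zone, foo_smr,
       * foo_head and struct foo are placeholders, and the smr(9) interfaces
       * uma_zone_get_smr(), smr_enter() and smr_exit() are assumed): the
       * zone is created with UMA_ZONE_SMR, lockless readers bracket their
       * accesses with the zone's SMR section, and uma_zfree_smr() defers
       * reuse of the item until such readers have drained.
       *
       *      foo_zone = uma_zcreate("foo smr", sizeof(struct foo), NULL,
       *          NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR);
       *      foo_smr = uma_zone_get_smr(foo_zone);
       *
       * Writer:
       *      f = uma_zalloc_smr(foo_zone, M_WAITOK);
       *      ...publish f, later unlink it...
       *      uma_zfree_smr(foo_zone, f);
       *
       * Reader:
       *      smr_enter(foo_smr);
       *      f = atomic_load_ptr(&foo_head);
       *      if (f != NULL)
       *              ...read-only access to f...
       *      smr_exit(foo_smr);
       */
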
 3382 /* See uma.h */
 3383 void *
 3384 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
 3385 {
 3386         uma_cache_bucket_t bucket;
 3387         uma_cache_t cache;
 3388 
 3389         /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
 3390         random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
 3391 
 3392         /* This is the fast path allocation */
 3393         CTR3(KTR_UMA, "uma_zalloc_arg zone %s(%p) flags %d", zone->uz_name,
 3394             zone, flags);
 3395 
 3396 #ifdef UMA_ZALLOC_DEBUG
 3397         void *item;
 3398 
 3399         KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0,
 3400             ("uma_zalloc_arg: called with SMR zone."));
 3401         if (uma_zalloc_debug(zone, &item, udata, flags) == EJUSTRETURN)
 3402                 return (item);
 3403 #endif
 3404 
 3405         /*
 3406          * If possible, allocate from the per-CPU cache.  There are two
 3407          * requirements for safe access to the per-CPU cache: (1) the thread
 3408          * accessing the cache must not be preempted or yield during access,
 3409          * and (2) the thread must not migrate CPUs without switching which
 3410          * cache it accesses.  We rely on a critical section to prevent
 3411          * preemption and migration.  We release the critical section in
 3412          * order to acquire the zone mutex if we are unable to allocate from
 3413          * the current cache; when we re-acquire the critical section, we
 3414          * must detect and handle migration if it has occurred.
 3415          */
 3416         critical_enter();
 3417         cache = &zone->uz_cpu[curcpu];
 3418         bucket = &cache->uc_allocbucket;
 3419         if (__predict_false(bucket->ucb_cnt == 0))
 3420                 return (cache_alloc_retry(zone, cache, udata, flags));
 3421         return (cache_alloc_item(zone, cache, bucket, udata, flags));
 3422 }
 3423 
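      /*
       * Usage sketch for uma_zalloc_arg()/uma_zfree_arg() (hypothetical;
       * foo_zone and ctor_args are placeholders): udata is handed through
       * to the zone's ctor on allocation and to the dtor on free, and
       * M_NOWAIT callers must tolerate failure.
       *
       *      f = uma_zalloc_arg(foo_zone, &ctor_args, M_NOWAIT);
       *      if (f == NULL)
       *              return (ENOMEM);
       *      ...
       *      uma_zfree_arg(foo_zone, f, &ctor_args);
       */
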
 3424 /*
 3425  * Replenish an alloc bucket and possibly restore an old one.  Called in
 3426  * a critical section.  Returns in a critical section.
 3427  *
 3428  * A false return value indicates an allocation failure.
 3429  * A true return value indicates success and the caller should retry.
 3430  */
 3431 static __noinline bool
 3432 cache_alloc(uma_zone_t zone, uma_cache_t cache, void *udata, int flags)
 3433 {
 3434         uma_bucket_t bucket;
 3435         int curdomain, domain;
 3436         bool new;
 3437 
 3438         CRITICAL_ASSERT(curthread);
 3439 
 3440         /*
 3441          * If we have run out of items in our alloc bucket see
 3442          * if we can switch with the free bucket.
 3443          *
 3444          * SMR Zones can't re-use the free bucket until the sequence has
 3445          * expired.
 3446          */
 3447         if ((cache_uz_flags(cache) & UMA_ZONE_SMR) == 0 &&
 3448             cache->uc_freebucket.ucb_cnt != 0) {
 3449                 cache_bucket_swap(&cache->uc_freebucket,
 3450                     &cache->uc_allocbucket);
 3451                 return (true);
 3452         }
 3453 
 3454         /*
 3455          * Discard any empty allocation bucket while we hold no locks.
 3456          */
 3457         bucket = cache_bucket_unload_alloc(cache);
 3458         critical_exit();
 3459 
 3460         if (bucket != NULL) {
 3461                 KASSERT(bucket->ub_cnt == 0,
 3462                     ("cache_alloc: Entered with non-empty alloc bucket."));
 3463                 bucket_free(zone, bucket, udata);
 3464         }
 3465 
 3466         /*
 3467          * The attempt to retrieve the item from the per-CPU cache has failed,
 3468          * so we must go back to the zone.  This requires the zdom lock, so we
 3469          * must drop the critical section, then re-acquire it when we go back
 3470          * to the cache.  Since the critical section is released, we may be
 3471          * preempted or migrate.  As such, make sure not to maintain any
 3472          * thread-local state specific to the cache from prior to releasing
 3473          * the critical section.
 3474          */
 3475         domain = PCPU_GET(domain);
 3476         if ((cache_uz_flags(cache) & UMA_ZONE_ROUNDROBIN) != 0 ||
 3477             VM_DOMAIN_EMPTY(domain))
 3478                 domain = zone_domain_highest(zone, domain);
 3479         bucket = cache_fetch_bucket(zone, cache, domain);
 3480         if (bucket == NULL && zone->uz_bucket_size != 0 && !bucketdisable) {
 3481                 bucket = zone_alloc_bucket(zone, udata, domain, flags);
 3482                 new = true;
 3483         } else {
 3484                 new = false;
 3485         }
 3486 
 3487         CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p",
 3488             zone->uz_name, zone, bucket);
 3489         if (bucket == NULL) {
 3490                 critical_enter();
 3491                 return (false);
 3492         }
 3493 
 3494         /*
 3495          * See if we lost the race or were migrated.  Cache the
 3496          * initialized bucket to make this less likely or claim
 3497          * the memory directly.
 3498          */
 3499         critical_enter();
 3500         cache = &zone->uz_cpu[curcpu];
 3501         if (cache->uc_allocbucket.ucb_bucket == NULL &&
 3502             ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) == 0 ||
 3503             (curdomain = PCPU_GET(domain)) == domain ||
 3504             VM_DOMAIN_EMPTY(curdomain))) {
 3505                 if (new)
 3506                         atomic_add_long(&ZDOM_GET(zone, domain)->uzd_imax,
 3507                             bucket->ub_cnt);
 3508                 cache_bucket_load_alloc(cache, bucket);
 3509                 return (true);
 3510         }
 3511 
 3512         /*
 3513          * We lost the race, release this bucket and start over.
 3514          */
 3515         critical_exit();
 3516         zone_put_bucket(zone, domain, bucket, udata, false);
 3517         critical_enter();
 3518 
 3519         return (true);
 3520 }
 3521 
 3522 void *
 3523 uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags)
 3524 {
 3525 #ifdef NUMA
 3526         uma_bucket_t bucket;
 3527         uma_zone_domain_t zdom;
 3528         void *item;
 3529 #endif
 3530 
 3531         /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
 3532         random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
 3533 
 3534         /* This is the fast path allocation */
 3535         CTR4(KTR_UMA, "uma_zalloc_domain zone %s(%p) domain %d flags %d",
 3536             zone->uz_name, zone, domain, flags);
 3537 
 3538         if (flags & M_WAITOK) {
 3539                 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 3540                     "uma_zalloc_domain: zone \"%s\"", zone->uz_name);
 3541         }
 3542         KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
 3543             ("uma_zalloc_domain: called with spinlock or critical section held"));
 3544         KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0,
 3545             ("uma_zalloc_domain: called with SMR zone."));
 3546 #ifdef NUMA
 3547         KASSERT((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0,
 3548             ("uma_zalloc_domain: called with non-FIRSTTOUCH zone."));
 3549 
 3550         if (vm_ndomains == 1)
 3551                 return (uma_zalloc_arg(zone, udata, flags));
 3552 
 3553         /*
 3554          * Try to allocate from the bucket cache before falling back to the keg.
 3555          * We could try harder and attempt to allocate from per-CPU caches or
 3556          * the per-domain cross-domain buckets, but the complexity is probably
 3557          * not worth it.  It is more important that frees of previous
 3558          * cross-domain allocations do not blow up the cache.
 3559          */
 3560         zdom = zone_domain_lock(zone, domain);
 3561         if ((bucket = zone_fetch_bucket(zone, zdom, false)) != NULL) {
 3562                 item = bucket->ub_bucket[bucket->ub_cnt - 1];
 3563 #ifdef INVARIANTS
 3564                 bucket->ub_bucket[bucket->ub_cnt - 1] = NULL;
 3565 #endif
 3566                 bucket->ub_cnt--;
 3567                 zone_put_bucket(zone, domain, bucket, udata, true);
 3568                 item = item_ctor(zone, zone->uz_flags, zone->uz_size, udata,
 3569                     flags, item);
 3570                 if (item != NULL) {
 3571                         KASSERT(item_domain(item) == domain,
 3572                             ("%s: bucket cache item %p from wrong domain",
 3573                             __func__, item));
 3574                         counter_u64_add(zone->uz_allocs, 1);
 3575                 }
 3576                 return (item);
 3577         }
 3578         ZDOM_UNLOCK(zdom);
 3579         return (zone_alloc_item(zone, udata, domain, flags));
 3580 #else
 3581         return (uma_zalloc_arg(zone, udata, flags));
 3582 #endif
 3583 }
 3584 
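      /*
       * Usage sketch for uma_zalloc_domain() (hypothetical, and assuming
       * the zone was created with a first-touch policy as required by the
       * assertion above): pinning an allocation to the current CPU's domain
       * might look like:
       *
       *      zone = uma_zcreate("foo numa", sizeof(struct foo), NULL, NULL,
       *          NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_FIRSTTOUCH);
       *
       *      f = uma_zalloc_domain(zone, NULL, PCPU_GET(domain), M_WAITOK);
       *      ...
       *      uma_zfree(zone, f);
       */
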
 3585 /*
 3586  * Find a slab with some space.  Prefer slabs that are partially used over those
 3587  * that are totally full.  This helps to reduce fragmentation.
 3588  *
 3589  * If 'rr' is 1, search all domains starting from 'domain'.  Otherwise check
 3590  * only 'domain'.
 3591  */
 3592 static uma_slab_t
 3593 keg_first_slab(uma_keg_t keg, int domain, bool rr)
 3594 {
 3595         uma_domain_t dom;
 3596         uma_slab_t slab;
 3597         int start;
 3598 
 3599         KASSERT(domain >= 0 && domain < vm_ndomains,
 3600             ("keg_first_slab: domain %d out of range", domain));
 3601         KEG_LOCK_ASSERT(keg, domain);
 3602 
 3603         slab = NULL;
 3604         start = domain;
 3605         do {
 3606                 dom = &keg->uk_domain[domain];
 3607                 if ((slab = LIST_FIRST(&dom->ud_part_slab)) != NULL)
 3608                         return (slab);
 3609                 if ((slab = LIST_FIRST(&dom->ud_free_slab)) != NULL) {
 3610                         LIST_REMOVE(slab, us_link);
 3611                         dom->ud_free_slabs--;
 3612                         LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
 3613                         return (slab);
 3614                 }
 3615                 if (rr)
 3616                         domain = (domain + 1) % vm_ndomains;
 3617         } while (domain != start);
 3618 
 3619         return (NULL);
 3620 }
 3621 
 3622 /*
 3623  * Fetch an existing slab from a free or partial list.  Returns with the
 3624  * keg domain lock held if a slab was found or unlocked if not.
 3625  */
 3626 static uma_slab_t
 3627 keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags)
 3628 {
 3629         uma_slab_t slab;
 3630         uint32_t reserve;
 3631 
 3632         /* HASH has a single free list. */
 3633         if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0)
 3634                 domain = 0;
 3635 
 3636         KEG_LOCK(keg, domain);
 3637         reserve = (flags & M_USE_RESERVE) != 0 ? 0 : keg->uk_reserve;
 3638         if (keg->uk_domain[domain].ud_free_items <= reserve ||
 3639             (slab = keg_first_slab(keg, domain, rr)) == NULL) {
 3640                 KEG_UNLOCK(keg, domain);
 3641                 return (NULL);
 3642         }
 3643         return (slab);
 3644 }
 3645 
 3646 static uma_slab_t
 3647 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags)
 3648 {
 3649         struct vm_domainset_iter di;
 3650         uma_slab_t slab;
 3651         int aflags, domain;
 3652         bool rr;
 3653 
 3654 restart:
 3655         /*
 3656          * Use the keg's policy if upper layers haven't already specified a
 3657          * domain (as happens with first-touch zones).
 3658          *
 3659          * To avoid races we run the iterator with the keg lock held, but that
 3660          * means that we cannot allow the vm_domainset layer to sleep.  Thus,
 3661          * clear M_WAITOK and handle low memory conditions locally.
 3662          */
 3663         rr = rdomain == UMA_ANYDOMAIN;
 3664         if (rr) {
 3665                 aflags = (flags & ~M_WAITOK) | M_NOWAIT;
 3666                 vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
 3667                     &aflags);
 3668         } else {
 3669                 aflags = flags;
 3670                 domain = rdomain;
 3671         }
 3672 
 3673         for (;;) {
 3674                 slab = keg_fetch_free_slab(keg, domain, rr, flags);
 3675                 if (slab != NULL)
 3676                         return (slab);
 3677 
 3678                 /*
 3679                  * M_NOVM means don't ask at all!
 3680                  */
 3681                 if (flags & M_NOVM)
 3682                         break;
 3683 
 3684                 slab = keg_alloc_slab(keg, zone, domain, flags, aflags);
 3685                 if (slab != NULL)
 3686                         return (slab);
 3687                 if (!rr && (flags & M_WAITOK) == 0)
 3688                         break;
 3689                 if (rr && vm_domainset_iter_policy(&di, &domain) != 0) {
 3690                         if ((flags & M_WAITOK) != 0) {
 3691                                 vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0);
 3692                                 goto restart;
 3693                         }
 3694                         break;
 3695                 }
 3696         }
 3697 
 3698         /*
 3699          * We might not have been able to get a slab but another cpu
 3700          * could have while we were unlocked.  Check again before we
 3701          * fail.
 3702          */
 3703         if ((slab = keg_fetch_free_slab(keg, domain, rr, flags)) != NULL)
 3704                 return (slab);
 3705 
 3706         return (NULL);
 3707 }
 3708 
 3709 static void *
 3710 slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
 3711 {
 3712         uma_domain_t dom;
 3713         void *item;
 3714         int freei;
 3715 
 3716         KEG_LOCK_ASSERT(keg, slab->us_domain);
 3717 
 3718         dom = &keg->uk_domain[slab->us_domain];
 3719         freei = BIT_FFS(keg->uk_ipers, &slab->us_free) - 1;
 3720         BIT_CLR(keg->uk_ipers, freei, &slab->us_free);
 3721         item = slab_item(slab, keg, freei);
 3722         slab->us_freecount--;
 3723         dom->ud_free_items--;
 3724 
 3725         /*
 3726          * Move this slab to the full list.  It must be on the partial list, so
 3727          * we do not need to update the free slab count.  In particular,
 3728          * keg_fetch_slab() always returns slabs on the partial list.
 3729          */
 3730         if (slab->us_freecount == 0) {
 3731                 LIST_REMOVE(slab, us_link);
 3732                 LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link);
 3733         }
 3734 
 3735         return (item);
 3736 }
 3737 
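      /*
       * Worked example (illustrative): for a keg with uk_ipers == 8 and a
       * slab whose us_free bitset currently holds 0b00101000, BIT_FFS()
       * returns 4, so freei == 3; bit 3 is cleared and item 3 of the slab
       * is handed out.  Once the last free bit is cleared (us_freecount
       * reaches zero), the slab migrates to the full list as above.
       */
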
 3738 static int
 3739 zone_import(void *arg, void **bucket, int max, int domain, int flags)
 3740 {
 3741         uma_domain_t dom;
 3742         uma_zone_t zone;
 3743         uma_slab_t slab;
 3744         uma_keg_t keg;
 3745 #ifdef NUMA
 3746         int stripe;
 3747 #endif
 3748         int i;
 3749 
 3750         zone = arg;
 3751         slab = NULL;
 3752         keg = zone->uz_keg;
 3753         /* Try to keep the buckets totally full */
 3754         for (i = 0; i < max; ) {
 3755                 if ((slab = keg_fetch_slab(keg, zone, domain, flags)) == NULL)
 3756                         break;
 3757 #ifdef NUMA
 3758                 stripe = howmany(max, vm_ndomains);
 3759 #endif
 3760                 dom = &keg->uk_domain[slab->us_domain];
 3761                 do {
 3762                         bucket[i++] = slab_alloc_item(keg, slab);
 3763                         if (dom->ud_free_items <= keg->uk_reserve) {
 3764                                 /*
 3765                                  * Avoid depleting the reserve after a
 3766                                  * successful item allocation, even if
 3767                                  * M_USE_RESERVE is specified.
 3768                                  */
 3769                                 KEG_UNLOCK(keg, slab->us_domain);
 3770                                 goto out;
 3771                         }
 3772 #ifdef NUMA
 3773                         /*
 3774                          * If the zone is striped we pick a new slab for every
 3775                          * N allocations.  Eliminating this conditional will
 3776                          * instead pick a new domain for each bucket rather
 3777                          * than stripe within each bucket.  The current option
 3778                          * produces more fragmentation and requires more cpu
 3779                          * time but yields better distribution.
 3780                          */
 3781                         if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0 &&
 3782                             vm_ndomains > 1 && --stripe == 0)
 3783                                 break;
 3784 #endif
 3785                 } while (slab->us_freecount != 0 && i < max);
 3786                 KEG_UNLOCK(keg, slab->us_domain);
 3787 
 3788                 /* Don't block if we allocated any successfully. */
 3789                 flags &= ~M_WAITOK;
 3790                 flags |= M_NOWAIT;
 3791         }
 3792 out:
 3793         return (i);
 3794 }
 3795 
 3796 static int
 3797 zone_alloc_limit_hard(uma_zone_t zone, int count, int flags)
 3798 {
 3799         uint64_t old, new, total, max;
 3800 
 3801         /*
 3802          * The hard case.  We're going to sleep because there were existing
 3803          * sleepers or because we ran out of items.  This routine enforces
 3804  * fairness by keeping FIFO order.
 3805  *
 3806  * First release our ill-gotten gains and make some noise.
 3807          */
 3808         for (;;) {
 3809                 zone_free_limit(zone, count);
 3810                 zone_log_warning(zone);
 3811                 zone_maxaction(zone);
 3812                 if (flags & M_NOWAIT)
 3813                         return (0);
 3814 
 3815                 /*
 3816                  * We need to allocate an item or set ourselves as a
 3817                  * sleeper while the sleepq lock is held to avoid wakeup
 3818                  * races.  This is essentially a home-rolled semaphore.
 3819                  */
 3820                 sleepq_lock(&zone->uz_max_items);
 3821                 old = zone->uz_items;
 3822                 do {
 3823                         MPASS(UZ_ITEMS_SLEEPERS(old) < UZ_ITEMS_SLEEPERS_MAX);
 3824                         /* Cache the max since we will evaluate twice. */
 3825                         max = zone->uz_max_items;
 3826                         if (UZ_ITEMS_SLEEPERS(old) != 0 ||
 3827                             UZ_ITEMS_COUNT(old) >= max)
 3828                                 new = old + UZ_ITEMS_SLEEPER;
 3829                         else
 3830                                 new = old + MIN(count, max - old);
 3831                 } while (atomic_fcmpset_64(&zone->uz_items, &old, new) == 0);
 3832 
 3833                 /* We may have successfully allocated under the sleepq lock. */
 3834                 if (UZ_ITEMS_SLEEPERS(new) == 0) {
 3835                         sleepq_release(&zone->uz_max_items);
 3836                         return (new - old);
 3837                 }
 3838 
 3839                 /*
 3840                  * This is in a different cacheline from uz_items so that we
 3841                  * don't constantly invalidate the fastpath cacheline when we
 3842                  * adjust item counts.  This could be limited to toggling on
 3843                  * transitions.
 3844                  */
 3845                 atomic_add_32(&zone->uz_sleepers, 1);
 3846                 atomic_add_64(&zone->uz_sleeps, 1);
 3847 
 3848                 /*
 3849                  * We have added ourselves as a sleeper.  The sleepq lock
 3850                  * protects us from wakeup races.  Sleep now and then retry.
 3851                  */
 3852                 sleepq_add(&zone->uz_max_items, NULL, "zonelimit", 0, 0);
 3853                 sleepq_wait(&zone->uz_max_items, PVM);
 3854 
 3855                 /*
 3856                  * After wakeup, remove ourselves as a sleeper and try
 3857                  * again.  We no longer have the sleepq lock for protection.
 3858                  *
 3859                  * Subtract ourselves as a sleeper while attempting to add
 3860                  * our count.
 3861                  */
 3862                 atomic_subtract_32(&zone->uz_sleepers, 1);
 3863                 old = atomic_fetchadd_64(&zone->uz_items,
 3864                     -(UZ_ITEMS_SLEEPER - count));
 3865                 /* We're no longer a sleeper. */
 3866                 old -= UZ_ITEMS_SLEEPER;
 3867 
 3868                 /*
 3869                  * If we're still at the limit, restart.  Notably do not
 3870                  * block on other sleepers.  Cache the max value to protect
 3871                  * against changes via sysctl.
 3872                  */
 3873                 total = UZ_ITEMS_COUNT(old);
 3874                 max = zone->uz_max_items;
 3875                 if (total >= max)
 3876                         continue;
 3877                 /* Truncate if necessary, otherwise wake other sleepers. */
 3878                 if (total + count > max) {
 3879                         zone_free_limit(zone, total + count - max);
 3880                         count = max - total;
 3881                 } else if (total + count < max && UZ_ITEMS_SLEEPERS(old) != 0)
 3882                         wakeup_one(&zone->uz_max_items);
 3883 
 3884                 return (count);
 3885         }
 3886 }
 3887 
 3888 /*
 3889  * Allocate 'count' items from our max_items limit.  Returns the number
 3890  * available.  If M_NOWAIT is not specified it will sleep until at least
 3891  * one item can be allocated.
 3892  */
 3893 static int
 3894 zone_alloc_limit(uma_zone_t zone, int count, int flags)
 3895 {
 3896         uint64_t old;
 3897         uint64_t max;
 3898 
 3899         max = zone->uz_max_items;
 3900         MPASS(max > 0);
 3901 
 3902         /*
 3903          * We expect normal allocations to succeed with a simple
 3904          * fetchadd.
 3905          */
 3906         old = atomic_fetchadd_64(&zone->uz_items, count);
 3907         if (__predict_true(old + count <= max))
 3908                 return (count);
 3909 
 3910         /*
 3911          * If we had some items and no sleepers just return the
 3912          * truncated value.  We have to release the excess space
 3913          * though because that may wake sleepers who weren't woken
 3914          * because we were temporarily over the limit.
 3915          */
 3916         if (old < max) {
 3917                 zone_free_limit(zone, (old + count) - max);
 3918                 return (max - old);
 3919         }
 3920         return (zone_alloc_limit_hard(zone, count, flags));
 3921 }
 3922 
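      /*
       * Usage sketch (hypothetical; foo_zone is a placeholder, and the
       * uma(9) uma_zone_set_max() interface is assumed): the limit
       * machinery above only engages for zones that have been given a cap.
       * M_NOWAIT callers must then tolerate failure once the cap is
       * reached, while M_WAITOK callers sleep in zone_alloc_limit_hard()
       * until items are freed back under the limit.
       *
       *      uma_zone_set_max(foo_zone, 1024);
       *
       *      f = uma_zalloc(foo_zone, M_NOWAIT);
       *      if (f == NULL)
       *              return (ENOBUFS);
       */
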
 3923 /*
 3924  * Free a number of items back to the limit.
 3925  */
 3926 static void
 3927 zone_free_limit(uma_zone_t zone, int count)
 3928 {
 3929         uint64_t old;
 3930 
 3931         MPASS(count > 0);
 3932 
 3933         /*
 3934          * In the common case we either have no sleepers or
 3935          * are still over the limit and can just return.
 3936          */
 3937         old = atomic_fetchadd_64(&zone->uz_items, -count);
 3938         if (__predict_true(UZ_ITEMS_SLEEPERS(old) == 0 ||
 3939            UZ_ITEMS_COUNT(old) - count >= zone->uz_max_items))
 3940                 return;
 3941 
 3942         /*
 3943          * Moderate the rate of wakeups.  Sleepers will continue
 3944          * to generate wakeups if necessary.
 3945          */
 3946         wakeup_one(&zone->uz_max_items);
 3947 }
 3948 
 3949 static uma_bucket_t
 3950 zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags)
 3951 {
 3952         uma_bucket_t bucket;
 3953         int maxbucket, cnt;
 3954 
 3955         CTR3(KTR_UMA, "zone_alloc_bucket zone %s(%p) domain %d", zone->uz_name,
 3956             zone, domain);
 3957 
 3958         /* Avoid allocs targeting empty domains. */
 3959         if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
 3960                 domain = UMA_ANYDOMAIN;
 3961         else if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0)
 3962                 domain = UMA_ANYDOMAIN;
 3963 
 3964         if (zone->uz_max_items > 0)
 3965                 maxbucket = zone_alloc_limit(zone, zone->uz_bucket_size,
 3966                     M_NOWAIT);
 3967         else
 3968                 maxbucket = zone->uz_bucket_size;
 3969         if (maxbucket == 0)
 3970                 return (NULL);
 3971 
 3972         /* Don't wait for buckets, preserve caller's NOVM setting. */
 3973         bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
 3974         if (bucket == NULL) {
 3975                 cnt = 0;
 3976                 goto out;
 3977         }
 3978 
 3979         bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
 3980             MIN(maxbucket, bucket->ub_entries), domain, flags);
 3981 
 3982         /*
 3983          * Initialize the memory if necessary.
 3984          */
 3985         if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
 3986                 int i;
 3987 
 3988                 for (i = 0; i < bucket->ub_cnt; i++)
 3989                         if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
 3990                             flags) != 0)
 3991                                 break;
 3992                 /*
 3993                  * If we couldn't initialize the whole bucket, put the
 3994                  * rest back onto the freelist.
 3995                  */
 3996                 if (i != bucket->ub_cnt) {
 3997                         zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
 3998                             bucket->ub_cnt - i);
 3999 #ifdef INVARIANTS
 4000                         bzero(&bucket->ub_bucket[i],
 4001                             sizeof(void *) * (bucket->ub_cnt - i));
 4002 #endif
 4003                         bucket->ub_cnt = i;
 4004                 }
 4005         }
 4006 
 4007         cnt = bucket->ub_cnt;
 4008         if (bucket->ub_cnt == 0) {
 4009                 bucket_free(zone, bucket, udata);
 4010                 counter_u64_add(zone->uz_fails, 1);
 4011                 bucket = NULL;
 4012         }
 4013 out:
 4014         if (zone->uz_max_items > 0 && cnt < maxbucket)
 4015                 zone_free_limit(zone, maxbucket - cnt);
 4016 
 4017         return (bucket);
 4018 }
 4019 
 4020 /*
 4021  * Allocates a single item from a zone.
 4022  *
 4023  * Arguments
 4024  *      zone   The zone to alloc for.
 4025  *      udata  The data to be passed to the constructor.
 4026  *      domain The domain to allocate from or UMA_ANYDOMAIN.
 4027  *      flags  M_WAITOK, M_NOWAIT, M_ZERO.
 4028  *
 4029  * Returns
 4030  *      NULL if there is no memory and M_NOWAIT is set
 4031  *      An item if successful
 4032  */
 4033 
 4034 static void *
 4035 zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags)
 4036 {
 4037         void *item;
 4038 
 4039         if (zone->uz_max_items > 0 && zone_alloc_limit(zone, 1, flags) == 0) {
 4040                 counter_u64_add(zone->uz_fails, 1);
 4041                 return (NULL);
 4042         }
 4043 
 4044         /* Avoid allocs targeting empty domains. */
 4045         if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
 4046                 domain = UMA_ANYDOMAIN;
 4047 
 4048         if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1)
 4049                 goto fail_cnt;
 4050 
 4051         /*
 4052          * We have to call both the zone's init (not the keg's init)
 4053          * and the zone's ctor.  This is because the item is going from
 4054          * a keg slab directly to the user, and the user is expecting it
 4055          * to be both zone-init'd as well as zone-ctor'd.
 4056          */
 4057         if (zone->uz_init != NULL) {
 4058                 if (zone->uz_init(item, zone->uz_size, flags) != 0) {
 4059                         zone_free_item(zone, item, udata, SKIP_FINI | SKIP_CNT);
 4060                         goto fail_cnt;
 4061                 }
 4062         }
 4063         item = item_ctor(zone, zone->uz_flags, zone->uz_size, udata, flags,
 4064             item);
 4065         if (item == NULL)
 4066                 goto fail;
 4067 
 4068         counter_u64_add(zone->uz_allocs, 1);
 4069         CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item,
 4070             zone->uz_name, zone);
 4071 
 4072         return (item);
 4073 
 4074 fail_cnt:
 4075         counter_u64_add(zone->uz_fails, 1);
 4076 fail:
 4077         if (zone->uz_max_items > 0)
 4078                 zone_free_limit(zone, 1);
 4079         CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)",
 4080             zone->uz_name, zone);
 4081 
 4082         return (NULL);
 4083 }
 4084 
 4085 /* See uma.h */
 4086 void
 4087 uma_zfree_smr(uma_zone_t zone, void *item)
 4088 {
 4089         uma_cache_t cache;
 4090         uma_cache_bucket_t bucket;
 4091         int itemdomain, uz_flags;
 4092 
 4093 #ifdef UMA_ZALLOC_DEBUG
 4094         KASSERT((zone->uz_flags & UMA_ZONE_SMR) != 0,
 4095             ("uma_zfree_smr: called with non-SMR zone."));
 4096         KASSERT(item != NULL, ("uma_zfree_smr: Called with NULL pointer."));
 4097         SMR_ASSERT_NOT_ENTERED(zone->uz_smr);
 4098         if (uma_zfree_debug(zone, item, NULL) == EJUSTRETURN)
 4099                 return;
 4100 #endif
 4101         cache = &zone->uz_cpu[curcpu];
 4102         uz_flags = cache_uz_flags(cache);
 4103         itemdomain = 0;
 4104 #ifdef NUMA
 4105         if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0)
 4106                 itemdomain = item_domain(item);
 4107 #endif
 4108         critical_enter();
 4109         do {
 4110                 cache = &zone->uz_cpu[curcpu];
 4111                 /* SMR Zones must free to the free bucket. */
 4112                 bucket = &cache->uc_freebucket;
 4113 #ifdef NUMA
 4114                 if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 &&
 4115                     PCPU_GET(domain) != itemdomain) {
 4116                         bucket = &cache->uc_crossbucket;
 4117                 }
 4118 #endif
 4119                 if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) {
 4120                         cache_bucket_push(cache, bucket, item);
 4121                         critical_exit();
 4122                         return;
 4123                 }
 4124         } while (cache_free(zone, cache, NULL, item, itemdomain));
 4125         critical_exit();
 4126 
 4127         /*
 4128          * If nothing else caught this, we'll just do an internal free.
 4129          */
 4130         zone_free_item(zone, item, NULL, SKIP_NONE);
 4131 }
 4132 
 4133 /* See uma.h */
 4134 void
 4135 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
 4136 {
 4137         uma_cache_t cache;
 4138         uma_cache_bucket_t bucket;
 4139         int itemdomain, uz_flags;
 4140 
 4141         /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
 4142         random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
 4143 
 4144         CTR2(KTR_UMA, "uma_zfree_arg zone %s(%p)", zone->uz_name, zone);
 4145 
 4146 #ifdef UMA_ZALLOC_DEBUG
 4147         KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0,
 4148             ("uma_zfree_arg: called with SMR zone."));
 4149         if (uma_zfree_debug(zone, item, udata) == EJUSTRETURN)
 4150                 return;
 4151 #endif
 4152         /* uma_zfree(..., NULL) does nothing, to match free(9). */
 4153         if (item == NULL)
 4154                 return;
 4155 
 4156         /*
 4157          * We are accessing the per-cpu cache without a critical section to
 4158          * fetch size and flags.  This is acceptable; if we are preempted we
 4159          * will simply read another cpu's line.
 4160          */
 4161         cache = &zone->uz_cpu[curcpu];
 4162         uz_flags = cache_uz_flags(cache);
 4163         if (UMA_ALWAYS_CTORDTOR ||
 4164             __predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0))
 4165                 item_dtor(zone, item, cache_uz_size(cache), udata, SKIP_NONE);
 4166 
 4167         /*
 4168          * The race here is acceptable.  If we miss it we'll just have to wait
 4169          * a little longer for the limits to be reset.
 4170          */
 4171         if (__predict_false(uz_flags & UMA_ZFLAG_LIMIT)) {
 4172                 if (atomic_load_32(&zone->uz_sleepers) > 0)
 4173                         goto zfree_item;
 4174         }
 4175 
 4176         /*
 4177          * If possible, free to the per-CPU cache.  There are two
 4178          * requirements for safe access to the per-CPU cache: (1) the thread
 4179          * accessing the cache must not be preempted or yield during access,
 4180          * and (2) the thread must not migrate CPUs without switching which
 4181          * cache it accesses.  We rely on a critical section to prevent
 4182          * preemption and migration.  We release the critical section in
 4183          * order to acquire the zone mutex if we are unable to free to the
 4184          * current cache; when we re-acquire the critical section, we must
 4185          * detect and handle migration if it has occurred.
 4186          */
 4187         itemdomain = 0;
 4188 #ifdef NUMA
 4189         if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0)
 4190                 itemdomain = item_domain(item);
 4191 #endif
 4192         critical_enter();
 4193         do {
 4194                 cache = &zone->uz_cpu[curcpu];
 4195                 /*
 4196                  * Try to free into the allocbucket first to give LIFO
 4197                  * ordering for cache-hot data structures.  Spill over
 4198                  * into the freebucket if necessary.  Alloc will swap
 4199                  * them if one runs dry.
 4200                  */
 4201                 bucket = &cache->uc_allocbucket;
 4202 #ifdef NUMA
 4203                 if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 &&
 4204                     PCPU_GET(domain) != itemdomain) {
 4205                         bucket = &cache->uc_crossbucket;
 4206                 } else
 4207 #endif
 4208                 if (bucket->ucb_cnt == bucket->ucb_entries &&
 4209                    cache->uc_freebucket.ucb_cnt <
 4210                    cache->uc_freebucket.ucb_entries)
 4211                         cache_bucket_swap(&cache->uc_freebucket,
 4212                             &cache->uc_allocbucket);
 4213                 if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) {
 4214                         cache_bucket_push(cache, bucket, item);
 4215                         critical_exit();
 4216                         return;
 4217                 }
 4218         } while (cache_free(zone, cache, udata, item, itemdomain));
 4219         critical_exit();
 4220 
 4221         /*
 4222          * If nothing else caught this, we'll just do an internal free.
 4223          */
 4224 zfree_item:
 4225         zone_free_item(zone, item, udata, SKIP_DTOR);
 4226 }
 4227 
 4228 #ifdef NUMA
 4229 /*
 4230  * Sort the items of a cross-domain free bucket into buckets for their
 4231  * correct domains and cache them there.
 4232  */
 4233 static void
 4234 zone_free_cross(uma_zone_t zone, uma_bucket_t bucket, void *udata)
 4235 {
 4236         struct uma_bucketlist emptybuckets, fullbuckets;
 4237         uma_zone_domain_t zdom;
 4238         uma_bucket_t b;
 4239         smr_seq_t seq;
 4240         void *item;
 4241         int domain;
 4242 
 4243         CTR3(KTR_UMA,
 4244             "uma_zfree: zone %s(%p) draining cross bucket %p",
 4245             zone->uz_name, zone, bucket);
 4246 
 4247         /*
 4248          * It is possible for buckets to arrive here out of order so we fetch
 4249          * the current smr seq rather than accepting the bucket's.
 4250          */
 4251         seq = SMR_SEQ_INVALID;
 4252         if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
 4253                 seq = smr_advance(zone->uz_smr);
 4254 
 4255         /*
 4256          * To avoid having ndomain * ndomain buckets for sorting we have a
 4257          * lock on the current crossfree bucket.  A full matrix with
 4258          * per-domain locking could be used if necessary.
 4259          */
 4260         STAILQ_INIT(&emptybuckets);
 4261         STAILQ_INIT(&fullbuckets);
 4262         ZONE_CROSS_LOCK(zone);
 4263         for (; bucket->ub_cnt > 0; bucket->ub_cnt--) {
 4264                 item = bucket->ub_bucket[bucket->ub_cnt - 1];
 4265                 domain = item_domain(item);
 4266                 zdom = ZDOM_GET(zone, domain);
 4267                 if (zdom->uzd_cross == NULL) {
 4268                         if ((b = STAILQ_FIRST(&emptybuckets)) != NULL) {
 4269                                 STAILQ_REMOVE_HEAD(&emptybuckets, ub_link);
 4270                                 zdom->uzd_cross = b;
 4271                         } else {
 4272                                 /*
 4273                                  * Avoid allocating a bucket with the cross lock
 4274                                  * held, since allocation can trigger a
 4275                                  * cross-domain free and bucket zones may
 4276                                  * allocate from each other.
 4277                                  */
 4278                                 ZONE_CROSS_UNLOCK(zone);
 4279                                 b = bucket_alloc(zone, udata, M_NOWAIT);
 4280                                 if (b == NULL)
 4281                                         goto out;
 4282                                 ZONE_CROSS_LOCK(zone);
 4283                                 if (zdom->uzd_cross != NULL) {
 4284                                         STAILQ_INSERT_HEAD(&emptybuckets, b,
 4285                                             ub_link);
 4286                                 } else {
 4287                                         zdom->uzd_cross = b;
 4288                                 }
 4289                         }
 4290                 }
 4291                 b = zdom->uzd_cross;
 4292                 b->ub_bucket[b->ub_cnt++] = item;
 4293                 b->ub_seq = seq;
 4294                 if (b->ub_cnt == b->ub_entries) {
 4295                         STAILQ_INSERT_HEAD(&fullbuckets, b, ub_link);
 4296                         if ((b = STAILQ_FIRST(&emptybuckets)) != NULL)
 4297                                 STAILQ_REMOVE_HEAD(&emptybuckets, ub_link);
 4298                         zdom->uzd_cross = b;
 4299                 }
 4300         }
 4301         ZONE_CROSS_UNLOCK(zone);
 4302 out:
 4303         if (bucket->ub_cnt == 0)
 4304                 bucket->ub_seq = SMR_SEQ_INVALID;
 4305         bucket_free(zone, bucket, udata);
 4306 
 4307         while ((b = STAILQ_FIRST(&emptybuckets)) != NULL) {
 4308                 STAILQ_REMOVE_HEAD(&emptybuckets, ub_link);
 4309                 bucket_free(zone, b, udata);
 4310         }
 4311         while ((b = STAILQ_FIRST(&fullbuckets)) != NULL) {
 4312                 STAILQ_REMOVE_HEAD(&fullbuckets, ub_link);
 4313                 domain = item_domain(b->ub_bucket[0]);
 4314                 zone_put_bucket(zone, domain, b, udata, true);
 4315         }
 4316 }
 4317 #endif
 4318 
 4319 static void
 4320 zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata,
 4321     int itemdomain, bool ws)
 4322 {
 4323 
 4324 #ifdef NUMA
 4325         /*
 4326          * On a two-domain system a bucket freed from the wrong domain is
 4327          * made up entirely of items belonging to the one other domain, so we
 4328          * can simply cache it.  With more than two domains the items must be
 4329          * sorted back to their correct domains.
 4330          */
 4331         if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 &&
 4332             vm_ndomains > 2 && PCPU_GET(domain) != itemdomain) {
 4333                 zone_free_cross(zone, bucket, udata);
 4334                 return;
 4335         }
 4336 #endif
 4337 
 4338         /*
 4339          * Attempt to save the bucket in the zone's domain bucket cache.
 4340          */
 4341         CTR3(KTR_UMA,
 4342             "uma_zfree: zone %s(%p) putting bucket %p on free list",
 4343             zone->uz_name, zone, bucket);
 4344         /* ub_cnt is pointing to the last free item */
 4345         if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0)
 4346                 itemdomain = zone_domain_lowest(zone, itemdomain);
 4347         zone_put_bucket(zone, itemdomain, bucket, udata, ws);
 4348 }
 4349 
 4350 /*
 4351  * Populate a free or cross bucket for the current cpu cache.  Free any
 4352  * existing full bucket either to the zone cache or back to the slab layer.
 4353  *
 4354  * Enters and returns in a critical section.  A false return indicates
 4355  * that we cannot satisfy this free in the cache layer.  A true return
 4356  * indicates that the caller should retry.
 4357  */
 4358 static __noinline bool
 4359 cache_free(uma_zone_t zone, uma_cache_t cache, void *udata, void *item,
 4360     int itemdomain)
 4361 {
 4362         uma_cache_bucket_t cbucket;
 4363         uma_bucket_t newbucket, bucket;
 4364 
 4365         CRITICAL_ASSERT(curthread);
 4366 
 4367         if (zone->uz_bucket_size == 0)
 4368                 return false;
 4369 
 4370         cache = &zone->uz_cpu[curcpu];
 4371         newbucket = NULL;
 4372 
 4373         /*
 4374          * FIRSTTOUCH domains need to free to the correct zdom.  When
 4375          * enabled this is the zdom of the item.   The bucket is the
 4376          * cross bucket if the current domain and itemdomain do not match.
 4377          */
 4378         cbucket = &cache->uc_freebucket;
 4379 #ifdef NUMA
 4380         if ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) != 0) {
 4381                 if (PCPU_GET(domain) != itemdomain) {
 4382                         cbucket = &cache->uc_crossbucket;
 4383                         if (cbucket->ucb_cnt != 0)
 4384                                 counter_u64_add(zone->uz_xdomain,
 4385                                     cbucket->ucb_cnt);
 4386                 }
 4387         }
 4388 #endif
 4389         bucket = cache_bucket_unload(cbucket);
 4390         KASSERT(bucket == NULL || bucket->ub_cnt == bucket->ub_entries,
 4391             ("cache_free: Entered with non-full free bucket."));
 4392 
 4393         /* We are no longer associated with this CPU. */
 4394         critical_exit();
 4395 
 4396         /*
 4397          * Don't let SMR zones operate without a free bucket.  Force
 4398          * a synchronize and re-use this one.  We will only degrade
 4399          * to a synchronize every bucket_size items rather than every
 4400          * item if we fail to allocate a bucket.
 4401          */
 4402         if ((zone->uz_flags & UMA_ZONE_SMR) != 0) {
 4403                 if (bucket != NULL)
 4404                         bucket->ub_seq = smr_advance(zone->uz_smr);
 4405                 newbucket = bucket_alloc(zone, udata, M_NOWAIT);
 4406                 if (newbucket == NULL && bucket != NULL) {
 4407                         bucket_drain(zone, bucket);
 4408                         newbucket = bucket;
 4409                         bucket = NULL;
 4410                 }
 4411         } else if (!bucketdisable)
 4412                 newbucket = bucket_alloc(zone, udata, M_NOWAIT);
 4413 
 4414         if (bucket != NULL)
 4415                 zone_free_bucket(zone, bucket, udata, itemdomain, true);
 4416 
 4417         critical_enter();
 4418         if ((bucket = newbucket) == NULL)
 4419                 return (false);
 4420         cache = &zone->uz_cpu[curcpu];
 4421 #ifdef NUMA
 4422         /*
 4423          * Check to see if we should be populating the cross bucket.  If it
 4424          * is already populated we will fall through and attempt to populate
 4425          * the free bucket.
 4426          */
 4427         if ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) != 0) {
 4428                 if (PCPU_GET(domain) != itemdomain &&
 4429                     cache->uc_crossbucket.ucb_bucket == NULL) {
 4430                         cache_bucket_load_cross(cache, bucket);
 4431                         return (true);
 4432                 }
 4433         }
 4434 #endif
 4435         /*
 4436          * We may have lost the race to fill the bucket or switched CPUs.
 4437          */
 4438         if (cache->uc_freebucket.ucb_bucket != NULL) {
 4439                 critical_exit();
 4440                 bucket_free(zone, bucket, udata);
 4441                 critical_enter();
 4442         } else
 4443                 cache_bucket_load_free(cache, bucket);
 4444 
 4445         return (true);
 4446 }
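/*
 * Illustrative sketch (not part of the source): how a free path consumes
 * cache_free()'s contract.  A true return means the per-CPU bucket was
 * reloaded and the fast path should be retried; false means the cache layer
 * cannot take the item and it must go to the slab layer.  The ex_* names are
 * hypothetical stand-ins, not the file's real fast path, and ex_try_push()
 * assumes the uc_freebucket field layout referenced elsewhere in this file.
 */
static bool
ex_try_push(uma_cache_t cache, void *item)
{
	uma_cache_bucket_t cb;

	cb = &cache->uc_freebucket;
	if (cb->ucb_cnt >= cb->ucb_entries)
		return (false);
	cb->ucb_bucket->ub_bucket[cb->ucb_cnt] = item;
	cb->ucb_cnt++;
	return (true);
}

static void
ex_zfree_cached(uma_zone_t zone, void *item, void *udata, int itemdomain)
{
	uma_cache_t cache;

	critical_enter();
	do {
		cache = &zone->uz_cpu[curcpu];
		if (ex_try_push(cache, item)) {
			critical_exit();
			return;
		}
	} while (cache_free(zone, cache, udata, item, itemdomain));
	critical_exit();

	/* The cache layer gave up; free directly to the slab layer. */
	zone_free_item(zone, item, udata, SKIP_NONE);
}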
 4447 
 4448 static void
 4449 slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item)
 4450 {
 4451         uma_keg_t keg;
 4452         uma_domain_t dom;
 4453         int freei;
 4454 
 4455         keg = zone->uz_keg;
 4456         KEG_LOCK_ASSERT(keg, slab->us_domain);
 4457 
 4458         /* Do we need to remove from any lists? */
 4459         dom = &keg->uk_domain[slab->us_domain];
 4460         if (slab->us_freecount + 1 == keg->uk_ipers) {
 4461                 LIST_REMOVE(slab, us_link);
 4462                 LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link);
 4463                 dom->ud_free_slabs++;
 4464         } else if (slab->us_freecount == 0) {
 4465                 LIST_REMOVE(slab, us_link);
 4466                 LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
 4467         }
 4468 
 4469         /* Slab management. */
 4470         freei = slab_item_index(slab, keg, item);
 4471         BIT_SET(keg->uk_ipers, freei, &slab->us_free);
 4472         slab->us_freecount++;
 4473 
 4474         /* Keg statistics. */
 4475         dom->ud_free_items++;
 4476 }
 4477 
 4478 static void
 4479 zone_release(void *arg, void **bucket, int cnt)
 4480 {
 4481         struct mtx *lock;
 4482         uma_zone_t zone;
 4483         uma_slab_t slab;
 4484         uma_keg_t keg;
 4485         uint8_t *mem;
 4486         void *item;
 4487         int i;
 4488 
 4489         zone = arg;
 4490         keg = zone->uz_keg;
 4491         lock = NULL;
 4492         if (__predict_false((zone->uz_flags & UMA_ZFLAG_HASH) != 0))
 4493                 lock = KEG_LOCK(keg, 0);
 4494         for (i = 0; i < cnt; i++) {
 4495                 item = bucket[i];
 4496                 if (__predict_true((zone->uz_flags & UMA_ZFLAG_VTOSLAB) != 0)) {
 4497                         slab = vtoslab((vm_offset_t)item);
 4498                 } else {
 4499                         mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
 4500                         if ((zone->uz_flags & UMA_ZFLAG_HASH) != 0)
 4501                                 slab = hash_sfind(&keg->uk_hash, mem);
 4502                         else
 4503                                 slab = (uma_slab_t)(mem + keg->uk_pgoff);
 4504                 }
 4505                 if (lock != KEG_LOCKPTR(keg, slab->us_domain)) {
 4506                         if (lock != NULL)
 4507                                 mtx_unlock(lock);
 4508                         lock = KEG_LOCK(keg, slab->us_domain);
 4509                 }
 4510                 slab_free_item(zone, slab, item);
 4511         }
 4512         if (lock != NULL)
 4513                 mtx_unlock(lock);
 4514 }
 4515 
 4516 /*
 4517  * Frees a single item to any zone.
 4518  *
 4519  * Arguments:
 4520  *      zone   The zone to free to
 4521  *      item   The item we're freeing
 4522  *      udata  User supplied data for the dtor
 4523  *      skip   Skip dtors and finis
 4524  */
 4525 static __noinline void
 4526 zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
 4527 {
 4528 
 4529         /*
 4530          * If a free is sent directly to an SMR zone we have to
 4531          * synchronize immediately because the item can instantly
 4532          * be reallocated. This should only happen in degenerate
 4533          * cases when no memory is available for per-cpu caches.
 4534          */
 4535         if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && skip == SKIP_NONE)
 4536                 smr_synchronize(zone->uz_smr);
 4537 
 4538         item_dtor(zone, item, zone->uz_size, udata, skip);
 4539 
 4540         if (skip < SKIP_FINI && zone->uz_fini)
 4541                 zone->uz_fini(item, zone->uz_size);
 4542 
 4543         zone->uz_release(zone->uz_arg, &item, 1);
 4544 
 4545         if (skip & SKIP_CNT)
 4546                 return;
 4547 
 4548         counter_u64_add(zone->uz_frees, 1);
 4549 
 4550         if (zone->uz_max_items > 0)
 4551                 zone_free_limit(zone, 1);
 4552 }
 4553 
 4554 /* See uma.h */
 4555 int
 4556 uma_zone_set_max(uma_zone_t zone, int nitems)
 4557 {
 4558 
 4559         /*
 4560          * If the limit is small, we may need to constrain the maximum per-CPU
 4561          * cache size, or disable caching entirely.
 4562          */
 4563         uma_zone_set_maxcache(zone, nitems);
 4564 
 4565         /*
 4566          * XXX This can misbehave if the zone has any allocations with
 4567          * no limit and a limit is imposed.  There is currently no
 4568          * way to clear a limit.
 4569          */
 4570         ZONE_LOCK(zone);
 4571         zone->uz_max_items = nitems;
 4572         zone->uz_flags |= UMA_ZFLAG_LIMIT;
 4573         zone_update_caches(zone);
 4574         /* We may need to wake waiters. */
 4575         wakeup(&zone->uz_max_items);
 4576         ZONE_UNLOCK(zone);
 4577 
 4578         return (nitems);
 4579 }
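/*
 * Illustrative usage sketch (not part of the source): capping a zone and
 * remembering the applied limit.  The zone, the structure, and the chosen
 * cap are hypothetical.
 */
struct foodata {
	uint64_t	fd_id;
	char		fd_name[32];
};

static uma_zone_t foo_zone;
static int foo_zone_cap;

static void
foo_zone_init(void)
{

	foo_zone = uma_zcreate("foodata", sizeof(struct foodata),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	/*
	 * Never hold more than 1024 items in the zone; the call also
	 * shrinks the per-CPU bucket sizes via uma_zone_set_maxcache().
	 */
	foo_zone_cap = uma_zone_set_max(foo_zone, 1024);
}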
 4580 
 4581 /* See uma.h */
 4582 void
 4583 uma_zone_set_maxcache(uma_zone_t zone, int nitems)
 4584 {
 4585         int bpcpu, bpdom, bsize, nb;
 4586 
 4587         ZONE_LOCK(zone);
 4588 
 4589         /*
 4590          * Compute a lower bound on the number of items that may be cached in
 4591          * the zone.  Each CPU gets at least two buckets, and for cross-domain
 4592          * frees we use an additional bucket per CPU and per domain.  Select the
 4593          * largest bucket size that does not exceed half of the requested limit,
 4594          * with the leftover space given to the full bucket cache.
 4595          */
 4596         bpdom = 0;
 4597         bpcpu = 2;
 4598 #ifdef NUMA
 4599         if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && vm_ndomains > 1) {
 4600                 bpcpu++;
 4601                 bpdom++;
 4602         }
 4603 #endif
 4604         nb = bpcpu * mp_ncpus + bpdom * vm_ndomains;
 4605         bsize = nitems / nb / 2;
 4606         if (bsize > BUCKET_MAX)
 4607                 bsize = BUCKET_MAX;
 4608         else if (bsize == 0 && nitems / nb > 0)
 4609                 bsize = 1;
 4610         zone->uz_bucket_size_max = zone->uz_bucket_size = bsize;
 4611         if (zone->uz_bucket_size_min > zone->uz_bucket_size_max)
 4612                 zone->uz_bucket_size_min = zone->uz_bucket_size_max;
 4613         zone->uz_bucket_max = nitems - nb * bsize;
 4614         ZONE_UNLOCK(zone);
 4615 }
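/*
 * Worked example (illustrative, with assumed values): on an 8-CPU machine
 * with a single memory domain (mp_ncpus = 8, no cross-domain bucket),
 * uma_zone_set_maxcache(zone, 1000) computes
 *
 *	nb    = 2 * 8 + 0     = 16 buckets
 *	bsize = 1000 / 16 / 2 = 31 items per bucket
 *	uz_bucket_max = 1000 - 16 * 31 = 504 items
 *
 * so at most 16 * 31 = 496 items may sit in per-CPU buckets, leaving a
 * budget of 504 items for the zone's full-bucket cache.
 */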
 4616 
 4617 /* See uma.h */
 4618 int
 4619 uma_zone_get_max(uma_zone_t zone)
 4620 {
 4621         int nitems;
 4622 
 4623         nitems = atomic_load_64(&zone->uz_max_items);
 4624 
 4625         return (nitems);
 4626 }
 4627 
 4628 /* See uma.h */
 4629 void
 4630 uma_zone_set_warning(uma_zone_t zone, const char *warning)
 4631 {
 4632 
 4633         ZONE_ASSERT_COLD(zone);
 4634         zone->uz_warning = warning;
 4635 }
 4636 
 4637 /* See uma.h */
 4638 void
 4639 uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction)
 4640 {
 4641 
 4642         ZONE_ASSERT_COLD(zone);
 4643         TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone);
 4644 }
 4645 
 4646 /* See uma.h */
 4647 int
 4648 uma_zone_get_cur(uma_zone_t zone)
 4649 {
 4650         int64_t nitems;
 4651         u_int i;
 4652 
 4653         nitems = 0;
 4654         if (zone->uz_allocs != EARLY_COUNTER && zone->uz_frees != EARLY_COUNTER)
 4655                 nitems = counter_u64_fetch(zone->uz_allocs) -
 4656                     counter_u64_fetch(zone->uz_frees);
 4657         CPU_FOREACH(i)
 4658                 nitems += atomic_load_64(&zone->uz_cpu[i].uc_allocs) -
 4659                     atomic_load_64(&zone->uz_cpu[i].uc_frees);
 4660 
 4661         return (nitems < 0 ? 0 : nitems);
 4662 }
 4663 
 4664 static uint64_t
 4665 uma_zone_get_allocs(uma_zone_t zone)
 4666 {
 4667         uint64_t nitems;
 4668         u_int i;
 4669 
 4670         nitems = 0;
 4671         if (zone->uz_allocs != EARLY_COUNTER)
 4672                 nitems = counter_u64_fetch(zone->uz_allocs);
 4673         CPU_FOREACH(i)
 4674                 nitems += atomic_load_64(&zone->uz_cpu[i].uc_allocs);
 4675 
 4676         return (nitems);
 4677 }
 4678 
 4679 static uint64_t
 4680 uma_zone_get_frees(uma_zone_t zone)
 4681 {
 4682         uint64_t nitems;
 4683         u_int i;
 4684 
 4685         nitems = 0;
 4686         if (zone->uz_frees != EARLY_COUNTER)
 4687                 nitems = counter_u64_fetch(zone->uz_frees);
 4688         CPU_FOREACH(i)
 4689                 nitems += atomic_load_64(&zone->uz_cpu[i].uc_frees);
 4690 
 4691         return (nitems);
 4692 }
 4693 
 4694 #ifdef INVARIANTS
 4695 /* Used only for KEG_ASSERT_COLD(). */
 4696 static uint64_t
 4697 uma_keg_get_allocs(uma_keg_t keg)
 4698 {
 4699         uma_zone_t z;
 4700         uint64_t nitems;
 4701 
 4702         nitems = 0;
 4703         LIST_FOREACH(z, &keg->uk_zones, uz_link)
 4704                 nitems += uma_zone_get_allocs(z);
 4705 
 4706         return (nitems);
 4707 }
 4708 #endif
 4709 
 4710 /* See uma.h */
 4711 void
 4712 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
 4713 {
 4714         uma_keg_t keg;
 4715 
 4716         KEG_GET(zone, keg);
 4717         KEG_ASSERT_COLD(keg);
 4718         keg->uk_init = uminit;
 4719 }
 4720 
 4721 /* See uma.h */
 4722 void
 4723 uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
 4724 {
 4725         uma_keg_t keg;
 4726 
 4727         KEG_GET(zone, keg);
 4728         KEG_ASSERT_COLD(keg);
 4729         keg->uk_fini = fini;
 4730 }
 4731 
 4732 /* See uma.h */
 4733 void
 4734 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
 4735 {
 4736 
 4737         ZONE_ASSERT_COLD(zone);
 4738         zone->uz_init = zinit;
 4739 }
 4740 
 4741 /* See uma.h */
 4742 void
 4743 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
 4744 {
 4745 
 4746         ZONE_ASSERT_COLD(zone);
 4747         zone->uz_fini = zfini;
 4748 }
 4749 
 4750 /* See uma.h */
 4751 void
 4752 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
 4753 {
 4754         uma_keg_t keg;
 4755 
 4756         KEG_GET(zone, keg);
 4757         KEG_ASSERT_COLD(keg);
 4758         keg->uk_freef = freef;
 4759 }
 4760 
 4761 /* See uma.h */
 4762 void
 4763 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
 4764 {
 4765         uma_keg_t keg;
 4766 
 4767         KEG_GET(zone, keg);
 4768         KEG_ASSERT_COLD(keg);
 4769         keg->uk_allocf = allocf;
 4770 }
 4771 
 4772 /* See uma.h */
 4773 void
 4774 uma_zone_set_smr(uma_zone_t zone, smr_t smr)
 4775 {
 4776 
 4777         ZONE_ASSERT_COLD(zone);
 4778 
 4779         KASSERT(smr != NULL, ("Got NULL smr"));
 4780         KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0,
 4781             ("zone %p (%s) already uses SMR", zone, zone->uz_name));
 4782         zone->uz_flags |= UMA_ZONE_SMR;
 4783         zone->uz_smr = smr;
 4784         zone_update_caches(zone);
 4785 }
 4786 
 4787 smr_t
 4788 uma_zone_get_smr(uma_zone_t zone)
 4789 {
 4790 
 4791         return (zone->uz_smr);
 4792 }
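/*
 * Illustrative sketch (not part of the source): sharing one SMR state
 * between two zones.  A zone created with UMA_ZONE_SMR gets its own smr_t;
 * a second, still-cold zone can adopt it with uma_zone_set_smr(), and items
 * of both zones are then managed with the SMR allocation variants.  The
 * structures and zone names are hypothetical.
 */
struct graphnode {
	uint64_t	 gn_id;
};
struct graphedge {
	struct graphnode *ge_from;
	struct graphnode *ge_to;
};

static uma_zone_t node_zone, edge_zone;

static void
graph_zones_init(void)
{

	node_zone = uma_zcreate("graphnode", sizeof(struct graphnode),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR);
	edge_zone = uma_zcreate("graphedge", sizeof(struct graphedge),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	/* Both zones now retire memory under the same SMR state. */
	uma_zone_set_smr(edge_zone, uma_zone_get_smr(node_zone));
}

static struct graphedge *
graph_edge_alloc(void)
{

	/* SMR zones are allocated and freed with the _smr variants. */
	return (uma_zalloc_smr(edge_zone, M_NOWAIT));
}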
 4793 
 4794 /* See uma.h */
 4795 void
 4796 uma_zone_reserve(uma_zone_t zone, int items)
 4797 {
 4798         uma_keg_t keg;
 4799 
 4800         KEG_GET(zone, keg);
 4801         KEG_ASSERT_COLD(keg);
 4802         keg->uk_reserve = items;
 4803 }
 4804 
 4805 /* See uma.h */
 4806 int
 4807 uma_zone_reserve_kva(uma_zone_t zone, int count)
 4808 {
 4809         uma_keg_t keg;
 4810         vm_offset_t kva;
 4811         u_int pages;
 4812 
 4813         KEG_GET(zone, keg);
 4814         KEG_ASSERT_COLD(keg);
 4815         ZONE_ASSERT_COLD(zone);
 4816 
 4817         pages = howmany(count, keg->uk_ipers) * keg->uk_ppera;
 4818 
 4819 #ifdef UMA_MD_SMALL_ALLOC
 4820         if (keg->uk_ppera > 1) {
 4821 #else
 4822         if (1) {
 4823 #endif
 4824                 kva = kva_alloc((vm_size_t)pages * PAGE_SIZE);
 4825                 if (kva == 0)
 4826                         return (0);
 4827         } else
 4828                 kva = 0;
 4829 
 4830         MPASS(keg->uk_kva == 0);
 4831         keg->uk_kva = kva;
 4832         keg->uk_offset = 0;
 4833         zone->uz_max_items = pages * keg->uk_ipers;
 4834 #ifdef UMA_MD_SMALL_ALLOC
 4835         keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
 4836 #else
 4837         keg->uk_allocf = noobj_alloc;
 4838 #endif
 4839         keg->uk_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE;
 4840         zone->uz_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE;
 4841         zone_update_caches(zone);
 4842 
 4843         return (1);
 4844 }
 4845 
 4846 /* See uma.h */
 4847 void
 4848 uma_prealloc(uma_zone_t zone, int items)
 4849 {
 4850         struct vm_domainset_iter di;
 4851         uma_domain_t dom;
 4852         uma_slab_t slab;
 4853         uma_keg_t keg;
 4854         int aflags, domain, slabs;
 4855 
 4856         KEG_GET(zone, keg);
 4857         slabs = howmany(items, keg->uk_ipers);
 4858         while (slabs-- > 0) {
 4859                 aflags = M_NOWAIT;
 4860                 vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
 4861                     &aflags);
 4862                 for (;;) {
 4863                         slab = keg_alloc_slab(keg, zone, domain, M_WAITOK,
 4864                             aflags);
 4865                         if (slab != NULL) {
 4866                                 dom = &keg->uk_domain[slab->us_domain];
 4867                                 /*
 4868                                  * keg_alloc_slab() always returns a slab on the
 4869                                  * partial list.
 4870                                  */
 4871                                 LIST_REMOVE(slab, us_link);
 4872                                 LIST_INSERT_HEAD(&dom->ud_free_slab, slab,
 4873                                     us_link);
 4874                                 dom->ud_free_slabs++;
 4875                                 KEG_UNLOCK(keg, slab->us_domain);
 4876                                 break;
 4877                         }
 4878                         if (vm_domainset_iter_policy(&di, &domain) != 0)
 4879                                 vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0);
 4880                 }
 4881         }
 4882 }
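/*
 * Illustrative sketch (not part of the source): combining a keg reserve with
 * preallocation so that a critical path can always make progress.  Ordinary
 * allocations never consume the reserve; only callers passing M_USE_RESERVE
 * may dip into it.  The structure, zone name, and sizes are hypothetical.
 */
struct iocmd {
	uint64_t	ic_lba;
	uint32_t	ic_len;
};

static uma_zone_t iocmd_zone;

static void
iocmd_zone_init(void)
{

	iocmd_zone = uma_zcreate("iocmd", sizeof(struct iocmd),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	/* Hold back 32 items for the completion path... */
	uma_zone_reserve(iocmd_zone, 32);
	/* ...and make sure the slabs backing them exist up front. */
	uma_prealloc(iocmd_zone, 32);
}

static struct iocmd *
iocmd_alloc_critical(void)
{

	return (uma_zalloc(iocmd_zone, M_NOWAIT | M_USE_RESERVE));
}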
 4883 
 4884 /*
 4885  * Returns a snapshot of memory consumption in bytes.
 4886  */
 4887 size_t
 4888 uma_zone_memory(uma_zone_t zone)
 4889 {
 4890         size_t sz;
 4891         int i;
 4892 
 4893         sz = 0;
 4894         if (zone->uz_flags & UMA_ZFLAG_CACHE) {
 4895                 for (i = 0; i < vm_ndomains; i++)
 4896                         sz += ZDOM_GET(zone, i)->uzd_nitems;
 4897                 return (sz * zone->uz_size);
 4898         }
 4899         for (i = 0; i < vm_ndomains; i++)
 4900                 sz += zone->uz_keg->uk_domain[i].ud_pages;
 4901 
 4902         return (sz * PAGE_SIZE);
 4903 }
 4904 
 4905 /* See uma.h */
 4906 void
 4907 uma_reclaim(int req)
 4908 {
 4909 
 4910         CTR0(KTR_UMA, "UMA: vm asked us to release pages!");
 4911         sx_xlock(&uma_reclaim_lock);
 4912         bucket_enable();
 4913 
 4914         switch (req) {
 4915         case UMA_RECLAIM_TRIM:
 4916                 zone_foreach(zone_trim, NULL);
 4917                 break;
 4918         case UMA_RECLAIM_DRAIN:
 4919         case UMA_RECLAIM_DRAIN_CPU:
 4920                 zone_foreach(zone_drain, NULL);
 4921                 if (req == UMA_RECLAIM_DRAIN_CPU) {
 4922                         pcpu_cache_drain_safe(NULL);
 4923                         zone_foreach(zone_drain, NULL);
 4924                 }
 4925                 break;
 4926         default:
 4927                 panic("unhandled reclamation request %d", req);
 4928         }
 4929 
 4930         /*
 4931          * Some slabs may have been freed, but the slab zones were visited
 4932          * early in the walk above.  Visit them again so that pages emptied by
 4933          * draining the other zones can be freed.  We have to do the same for buckets.
 4934          */
 4935         zone_drain(slabzones[0], NULL);
 4936         zone_drain(slabzones[1], NULL);
 4937         bucket_zone_drain();
 4938         sx_xunlock(&uma_reclaim_lock);
 4939 }
 4940 
 4941 static volatile int uma_reclaim_needed;
 4942 
 4943 void
 4944 uma_reclaim_wakeup(void)
 4945 {
 4946 
 4947         if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0)
 4948                 wakeup(uma_reclaim);
 4949 }
 4950 
 4951 void
 4952 uma_reclaim_worker(void *arg __unused)
 4953 {
 4954 
 4955         for (;;) {
 4956                 sx_xlock(&uma_reclaim_lock);
 4957                 while (atomic_load_int(&uma_reclaim_needed) == 0)
 4958                         sx_sleep(uma_reclaim, &uma_reclaim_lock, PVM, "umarcl",
 4959                             hz);
 4960                 sx_xunlock(&uma_reclaim_lock);
 4961                 EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
 4962                 uma_reclaim(UMA_RECLAIM_DRAIN_CPU);
 4963                 atomic_store_int(&uma_reclaim_needed, 0);
 4964                 /* Don't fire more than once per-second. */
 4965                 pause("umarclslp", hz);
 4966         }
 4967 }
 4968 
 4969 /* See uma.h */
 4970 void
 4971 uma_zone_reclaim(uma_zone_t zone, int req)
 4972 {
 4973 
 4974         switch (req) {
 4975         case UMA_RECLAIM_TRIM:
 4976                 zone_trim(zone, NULL);
 4977                 break;
 4978         case UMA_RECLAIM_DRAIN:
 4979                 zone_drain(zone, NULL);
 4980                 break;
 4981         case UMA_RECLAIM_DRAIN_CPU:
 4982                 pcpu_cache_drain_safe(zone);
 4983                 zone_drain(zone, NULL);
 4984                 break;
 4985         default:
 4986                 panic("unhandled reclamation request %d", req);
 4987         }
 4988 }
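/*
 * Illustrative sketch (not part of the source): choosing a reclamation
 * level.  UMA_RECLAIM_TRIM sheds only cached items beyond the estimated
 * working set, UMA_RECLAIM_DRAIN empties the zone's bucket cache, and
 * UMA_RECLAIM_DRAIN_CPU additionally flushes the per-CPU caches.  The
 * function and its argument are hypothetical, and foo_zone is the
 * hypothetical zone from the uma_zone_set_max() sketch above.
 */
static void
foo_zone_lowmem(bool severe)
{

	if (!severe) {
		/* Routine housekeeping: give back only the excess. */
		uma_zone_reclaim(foo_zone, UMA_RECLAIM_TRIM);
	} else {
		/* Hard pressure: drop all cached items, even per-CPU ones. */
		uma_zone_reclaim(foo_zone, UMA_RECLAIM_DRAIN_CPU);
	}
}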
 4989 
 4990 /* See uma.h */
 4991 int
 4992 uma_zone_exhausted(uma_zone_t zone)
 4993 {
 4994 
 4995         return (atomic_load_32(&zone->uz_sleepers) > 0);
 4996 }
 4997 
 4998 unsigned long
 4999 uma_limit(void)
 5000 {
 5001 
 5002         return (uma_kmem_limit);
 5003 }
 5004 
 5005 void
 5006 uma_set_limit(unsigned long limit)
 5007 {
 5008 
 5009         uma_kmem_limit = limit;
 5010 }
 5011 
 5012 unsigned long
 5013 uma_size(void)
 5014 {
 5015 
 5016         return (atomic_load_long(&uma_kmem_total));
 5017 }
 5018 
 5019 long
 5020 uma_avail(void)
 5021 {
 5022 
 5023         return (uma_kmem_limit - uma_size());
 5024 }
 5025 
 5026 #ifdef DDB
 5027 /*
 5028  * Generate statistics across both the zone and its per-cpu caches.  Return
 5029  * desired statistics if the pointer is non-NULL for that statistic.
 5030  *
 5031  * Note: does not update the zone statistics, as it can't safely clear the
 5032  * per-CPU cache statistic.
 5033  *
 5034  */
 5035 static void
 5036 uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp,
 5037     uint64_t *freesp, uint64_t *sleepsp, uint64_t *xdomainp)
 5038 {
 5039         uma_cache_t cache;
 5040         uint64_t allocs, frees, sleeps, xdomain;
 5041         int cachefree, cpu;
 5042 
 5043         allocs = frees = sleeps = xdomain = 0;
 5044         cachefree = 0;
 5045         CPU_FOREACH(cpu) {
 5046                 cache = &z->uz_cpu[cpu];
 5047                 cachefree += cache->uc_allocbucket.ucb_cnt;
 5048                 cachefree += cache->uc_freebucket.ucb_cnt;
 5049                 xdomain += cache->uc_crossbucket.ucb_cnt;
 5050                 cachefree += cache->uc_crossbucket.ucb_cnt;
 5051                 allocs += cache->uc_allocs;
 5052                 frees += cache->uc_frees;
 5053         }
 5054         allocs += counter_u64_fetch(z->uz_allocs);
 5055         frees += counter_u64_fetch(z->uz_frees);
 5056         xdomain += counter_u64_fetch(z->uz_xdomain);
 5057         sleeps += z->uz_sleeps;
 5058         if (cachefreep != NULL)
 5059                 *cachefreep = cachefree;
 5060         if (allocsp != NULL)
 5061                 *allocsp = allocs;
 5062         if (freesp != NULL)
 5063                 *freesp = frees;
 5064         if (sleepsp != NULL)
 5065                 *sleepsp = sleeps;
 5066         if (xdomainp != NULL)
 5067                 *xdomainp = xdomain;
 5068 }
 5069 #endif /* DDB */
 5070 
 5071 static int
 5072 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
 5073 {
 5074         uma_keg_t kz;
 5075         uma_zone_t z;
 5076         int count;
 5077 
 5078         count = 0;
 5079         rw_rlock(&uma_rwlock);
 5080         LIST_FOREACH(kz, &uma_kegs, uk_link) {
 5081                 LIST_FOREACH(z, &kz->uk_zones, uz_link)
 5082                         count++;
 5083         }
 5084         LIST_FOREACH(z, &uma_cachezones, uz_link)
 5085                 count++;
 5086 
 5087         rw_runlock(&uma_rwlock);
 5088         return (sysctl_handle_int(oidp, &count, 0, req));
 5089 }
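/*
 * Illustrative user-space sketch (not part of the source): reading the value
 * produced by the handler above.  This assumes the handler is registered
 * under the sysctl name "vm.zone_count"; the registration itself is outside
 * this excerpt.
 */
#if 0	/* user-space example, not kernel code */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int count;
	size_t len = sizeof(count);

	if (sysctlbyname("vm.zone_count", &count, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("UMA zones: %d\n", count);
	return (0);
}
#endif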
 5090 
 5091 static void
 5092 uma_vm_zone_stats(struct uma_type_header *uth, uma_zone_t z, struct sbuf *sbuf,
 5093     struct uma_percpu_stat *ups, bool internal)
 5094 {
 5095         uma_zone_domain_t zdom;
 5096         uma_cache_t cache;
 5097         int i;
 5098 
 5099         for (i = 0; i < vm_ndomains; i++) {
 5100                 zdom = ZDOM_GET(z, i);
 5101                 uth->uth_zone_free += zdom->uzd_nitems;
 5102         }
 5103         uth->uth_allocs = counter_u64_fetch(z->uz_allocs);
 5104         uth->uth_frees = counter_u64_fetch(z->uz_frees);
 5105         uth->uth_fails = counter_u64_fetch(z->uz_fails);
 5106         uth->uth_xdomain = counter_u64_fetch(z->uz_xdomain);
 5107         uth->uth_sleeps = z->uz_sleeps;
 5108 
 5109         for (i = 0; i < mp_maxid + 1; i++) {
 5110                 bzero(&ups[i], sizeof(*ups));
 5111                 if (internal || CPU_ABSENT(i))
 5112                         continue;
 5113                 cache = &z->uz_cpu[i];
 5114                 ups[i].ups_cache_free += cache->uc_allocbucket.ucb_cnt;
 5115                 ups[i].ups_cache_free += cache->uc_freebucket.ucb_cnt;
 5116                 ups[i].ups_cache_free += cache->uc_crossbucket.ucb_cnt;
 5117                 ups[i].ups_allocs = cache->uc_allocs;
 5118                 ups[i].ups_frees = cache->uc_frees;
 5119         }
 5120 }
 5121 
 5122 static int
 5123 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
 5124 {
 5125         struct uma_stream_header ush;
 5126         struct uma_type_header uth;
 5127         struct uma_percpu_stat *ups;
 5128         struct sbuf sbuf;
 5129         uma_keg_t kz;
 5130         uma_zone_t z;
 5131         uint64_t items;
 5132         uint32_t kfree, pages;
 5133         int count, error, i;
 5134 
 5135         error = sysctl_wire_old_buffer(req, 0);
 5136         if (error != 0)
 5137                 return (error);
 5138         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 5139         sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
 5140         ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK);
 5141 
 5142         count = 0;
 5143         rw_rlock(&uma_rwlock);
 5144         LIST_FOREACH(kz, &uma_kegs, uk_link) {
 5145                 LIST_FOREACH(z, &kz->uk_zones, uz_link)
 5146                         count++;
 5147         }
 5148 
 5149         LIST_FOREACH(z, &uma_cachezones, uz_link)
 5150                 count++;
 5151 
 5152         /*
 5153          * Insert stream header.
 5154          */
 5155         bzero(&ush, sizeof(ush));
 5156         ush.ush_version = UMA_STREAM_VERSION;
 5157         ush.ush_maxcpus = (mp_maxid + 1);
 5158         ush.ush_count = count;
 5159         (void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
 5160 
 5161         LIST_FOREACH(kz, &uma_kegs, uk_link) {
 5162                 kfree = pages = 0;
 5163                 for (i = 0; i < vm_ndomains; i++) {
 5164                         kfree += kz->uk_domain[i].ud_free_items;
 5165                         pages += kz->uk_domain[i].ud_pages;
 5166                 }
 5167                 LIST_FOREACH(z, &kz->uk_zones, uz_link) {
 5168                         bzero(&uth, sizeof(uth));
 5169                         strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
 5170                         uth.uth_align = kz->uk_align;
 5171                         uth.uth_size = kz->uk_size;
 5172                         uth.uth_rsize = kz->uk_rsize;
 5173                         if (z->uz_max_items > 0) {
 5174                                 items = UZ_ITEMS_COUNT(z->uz_items);
 5175                                 uth.uth_pages = (items / kz->uk_ipers) *
 5176                                         kz->uk_ppera;
 5177                         } else
 5178                                 uth.uth_pages = pages;
 5179                         uth.uth_maxpages = (z->uz_max_items / kz->uk_ipers) *
 5180                             kz->uk_ppera;
 5181                         uth.uth_limit = z->uz_max_items;
 5182                         uth.uth_keg_free = kfree;
 5183 
 5184                         /*
 5185                          * A zone is secondary if it is not the first entry
 5186                          * on the keg's zone list.
 5187                          */
 5188                         if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
 5189                             (LIST_FIRST(&kz->uk_zones) != z))
 5190                                 uth.uth_zone_flags = UTH_ZONE_SECONDARY;
 5191                         uma_vm_zone_stats(&uth, z, &sbuf, ups,
 5192                             kz->uk_flags & UMA_ZFLAG_INTERNAL);
 5193                         (void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
 5194                         for (i = 0; i < mp_maxid + 1; i++)
 5195                                 (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
 5196                 }
 5197         }
 5198         LIST_FOREACH(z, &uma_cachezones, uz_link) {
 5199                 bzero(&uth, sizeof(uth));
 5200                 strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
 5201                 uth.uth_size = z->uz_size;
 5202                 uma_vm_zone_stats(&uth, z, &sbuf, ups, false);
 5203                 (void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
 5204                 for (i = 0; i < mp_maxid + 1; i++)
 5205                         (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
 5206         }
 5207 
 5208         rw_runlock(&uma_rwlock);
 5209         error = sbuf_finish(&sbuf);
 5210         sbuf_delete(&sbuf);
 5211         free(ups, M_TEMP);
 5212         return (error);
 5213 }
 5214 
 5215 int
 5216 sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
 5217 {
 5218         uma_zone_t zone = *(uma_zone_t *)arg1;
 5219         int error, max;
 5220 
 5221         max = uma_zone_get_max(zone);
 5222         error = sysctl_handle_int(oidp, &max, 0, req);
 5223         if (error || !req->newptr)
 5224                 return (error);
 5225 
 5226         uma_zone_set_max(zone, max);
 5227 
 5228         return (0);
 5229 }
 5230 
 5231 int
 5232 sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS)
 5233 {
 5234         uma_zone_t zone;
 5235         int cur;
 5236 
 5237         /*
 5238          * Some callers want to add sysctls for global zones that
 5239          * may not yet exist so they pass a pointer to a pointer.
 5240          */
 5241         if (arg2 == 0)
 5242                 zone = *(uma_zone_t *)arg1;
 5243         else
 5244                 zone = arg1;
 5245         cur = uma_zone_get_cur(zone);
 5246         return (sysctl_handle_int(oidp, &cur, 0, req));
 5247 }
 5248 
 5249 static int
 5250 sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS)
 5251 {
 5252         uma_zone_t zone = arg1;
 5253         uint64_t cur;
 5254 
 5255         cur = uma_zone_get_allocs(zone);
 5256         return (sysctl_handle_64(oidp, &cur, 0, req));
 5257 }
 5258 
 5259 static int
 5260 sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS)
 5261 {
 5262         uma_zone_t zone = arg1;
 5263         uint64_t cur;
 5264 
 5265         cur = uma_zone_get_frees(zone);
 5266         return (sysctl_handle_64(oidp, &cur, 0, req));
 5267 }
 5268 
 5269 static int
 5270 sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS)
 5271 {
 5272         struct sbuf sbuf;
 5273         uma_zone_t zone = arg1;
 5274         int error;
 5275 
 5276         sbuf_new_for_sysctl(&sbuf, NULL, 0, req);
 5277         if (zone->uz_flags != 0)
 5278                 sbuf_printf(&sbuf, "0x%b", zone->uz_flags, PRINT_UMA_ZFLAGS);
 5279         else
 5280                 sbuf_printf(&sbuf, "0");
 5281         error = sbuf_finish(&sbuf);
 5282         sbuf_delete(&sbuf);
 5283 
 5284         return (error);
 5285 }
 5286 
 5287 static int
 5288 sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS)
 5289 {
 5290         uma_keg_t keg = arg1;
 5291         int avail, effpct, total;
 5292 
 5293         total = keg->uk_ppera * PAGE_SIZE;
 5294         if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0)
 5295                 total += slabzone(keg->uk_ipers)->uz_keg->uk_rsize;
 5296         /*
 5297          * We consider the client's requested size and alignment here, not the
 5298          * computed real item size (uk_rsize), because the real size is also
 5299          * adjusted for internal implementation reasons (max bitset size).
 5300          */
 5301         avail = keg->uk_ipers * roundup2(keg->uk_size, keg->uk_align + 1);
 5302         if ((keg->uk_flags & UMA_ZONE_PCPU) != 0)
 5303                 avail *= mp_maxid + 1;
 5304         effpct = 100 * avail / total;
 5305         return (sysctl_handle_int(oidp, &effpct, 0, req));
 5306 }
 5307 
 5308 static int
 5309 sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS)
 5310 {
 5311         uma_zone_t zone = arg1;
 5312         uint64_t cur;
 5313 
 5314         cur = UZ_ITEMS_COUNT(atomic_load_64(&zone->uz_items));
 5315         return (sysctl_handle_64(oidp, &cur, 0, req));
 5316 }
 5317 
 5318 #ifdef INVARIANTS
 5319 static uma_slab_t
 5320 uma_dbg_getslab(uma_zone_t zone, void *item)
 5321 {
 5322         uma_slab_t slab;
 5323         uma_keg_t keg;
 5324         uint8_t *mem;
 5325 
 5326         /*
 5327          * It is safe to return the slab here even though the
 5328          * zone is unlocked because the item's allocation state
 5329          * essentially holds a reference.
 5330          */
 5331         mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
 5332         if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0)
 5333                 return (NULL);
 5334         if (zone->uz_flags & UMA_ZFLAG_VTOSLAB)
 5335                 return (vtoslab((vm_offset_t)mem));
 5336         keg = zone->uz_keg;
 5337         if ((keg->uk_flags & UMA_ZFLAG_HASH) == 0)
 5338                 return ((uma_slab_t)(mem + keg->uk_pgoff));
 5339         KEG_LOCK(keg, 0);
 5340         slab = hash_sfind(&keg->uk_hash, mem);
 5341         KEG_UNLOCK(keg, 0);
 5342 
 5343         return (slab);
 5344 }
 5345 
 5346 static bool
 5347 uma_dbg_zskip(uma_zone_t zone, void *mem)
 5348 {
 5349 
 5350         if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0)
 5351                 return (true);
 5352 
 5353         return (uma_dbg_kskip(zone->uz_keg, mem));
 5354 }
 5355 
 5356 static bool
 5357 uma_dbg_kskip(uma_keg_t keg, void *mem)
 5358 {
 5359         uintptr_t idx;
 5360 
 5361         if (dbg_divisor == 0)
 5362                 return (true);
 5363 
 5364         if (dbg_divisor == 1)
 5365                 return (false);
 5366 
 5367         idx = (uintptr_t)mem >> PAGE_SHIFT;
 5368         if (keg->uk_ipers > 1) {
 5369                 idx *= keg->uk_ipers;
 5370                 idx += ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize;
 5371         }
 5372 
 5373         if ((idx / dbg_divisor) * dbg_divisor != idx) {
 5374                 counter_u64_add(uma_skip_cnt, 1);
 5375                 return (true);
 5376         }
 5377         counter_u64_add(uma_dbg_cnt, 1);
 5378 
 5379         return (false);
 5380 }
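/*
 * Worked example (illustrative, with assumed values): with dbg_divisor = 3
 * and a keg packing 4 items per page, an item's index is
 *
 *	idx = (mem >> PAGE_SHIFT) * 4 + ((mem & PAGE_MASK) / uk_rsize)
 *
 * and only items whose idx is an exact multiple of 3 receive the expensive
 * trash/use-after-free checks; the rest are tallied in uma_skip_cnt and
 * skipped.  A divisor of 1 checks every item and 0 disables checking.
 */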
 5381 
 5382 /*
 5383  * Set up the slab's freei data such that uma_dbg_free can function.
 5384  *
 5385  */
 5386 static void
 5387 uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
 5388 {
 5389         uma_keg_t keg;
 5390         int freei;
 5391 
 5392         if (slab == NULL) {
 5393                 slab = uma_dbg_getslab(zone, item);
 5394                 if (slab == NULL) 
 5395                         panic("uma: item %p did not belong to zone %s",
 5396                             item, zone->uz_name);
 5397         }
 5398         keg = zone->uz_keg;
 5399         freei = slab_item_index(slab, keg, item);
 5400 
 5401         if (BIT_TEST_SET_ATOMIC(keg->uk_ipers, freei,
 5402             slab_dbg_bits(slab, keg)))
 5403                 panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)",
 5404                     item, zone, zone->uz_name, slab, freei);
 5405 }
 5406 
 5407 /*
 5408  * Verifies freed addresses.  Checks for alignment, valid slab membership
 5409  * and duplicate frees.
 5410  *
 5411  */
 5412 static void
 5413 uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
 5414 {
 5415         uma_keg_t keg;
 5416         int freei;
 5417 
 5418         if (slab == NULL) {
 5419                 slab = uma_dbg_getslab(zone, item);
 5420                 if (slab == NULL) 
 5421                         panic("uma: Freed item %p did not belong to zone %s",
 5422                             item, zone->uz_name);
 5423         }
 5424         keg = zone->uz_keg;
 5425         freei = slab_item_index(slab, keg, item);
 5426 
 5427         if (freei >= keg->uk_ipers)
 5428                 panic("Invalid free of %p from zone %p(%s) slab %p(%d)",
 5429                     item, zone, zone->uz_name, slab, freei);
 5430 
 5431         if (slab_item(slab, keg, freei) != item)
 5432                 panic("Unaligned free of %p from zone %p(%s) slab %p(%d)",
 5433                     item, zone, zone->uz_name, slab, freei);
 5434 
 5435         if (!BIT_TEST_CLR_ATOMIC(keg->uk_ipers, freei,
 5436             slab_dbg_bits(slab, keg)))
 5437                 panic("Duplicate free of %p from zone %p(%s) slab %p(%d)",
 5438                     item, zone, zone->uz_name, slab, freei);
 5439 }
 5440 #endif /* INVARIANTS */
 5441 
 5442 #ifdef DDB
 5443 static int64_t
 5444 get_uma_stats(uma_keg_t kz, uma_zone_t z, uint64_t *allocs, uint64_t *used,
 5445     uint64_t *sleeps, long *cachefree, uint64_t *xdomain)
 5446 {
 5447         uint64_t frees;
 5448         int i;
 5449 
 5450         if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
 5451                 *allocs = counter_u64_fetch(z->uz_allocs);
 5452                 frees = counter_u64_fetch(z->uz_frees);
 5453                 *sleeps = z->uz_sleeps;
 5454                 *cachefree = 0;
 5455                 *xdomain = 0;
 5456         } else
 5457                 uma_zone_sumstat(z, cachefree, allocs, &frees, sleeps,
 5458                     xdomain);
 5459         for (i = 0; i < vm_ndomains; i++) {
 5460                 *cachefree += ZDOM_GET(z, i)->uzd_nitems;
 5461                 if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
 5462                     (LIST_FIRST(&kz->uk_zones) != z)))
 5463                         *cachefree += kz->uk_domain[i].ud_free_items;
 5464         }
 5465         *used = *allocs - frees;
 5466         return (((int64_t)*used + *cachefree) * kz->uk_size);
 5467 }
 5468 
 5469 DB_SHOW_COMMAND(uma, db_show_uma)
 5470 {
 5471         const char *fmt_hdr, *fmt_entry;
 5472         uma_keg_t kz;
 5473         uma_zone_t z;
 5474         uint64_t allocs, used, sleeps, xdomain;
 5475         long cachefree;
 5476         /* variables for sorting */
 5477         uma_keg_t cur_keg;
 5478         uma_zone_t cur_zone, last_zone;
 5479         int64_t cur_size, last_size, size;
 5480         int ties;
 5481 
 5482         /* /i option produces machine-parseable CSV output */
 5483         if (modif[0] == 'i') {
 5484                 fmt_hdr = "%s,%s,%s,%s,%s,%s,%s,%s,%s\n";
 5485                 fmt_entry = "\"%s\",%ju,%jd,%ld,%ju,%ju,%u,%jd,%ju\n";
 5486         } else {
 5487                 fmt_hdr = "%18s %6s %7s %7s %11s %7s %7s %10s %8s\n";
 5488                 fmt_entry = "%18s %6ju %7jd %7ld %11ju %7ju %7u %10jd %8ju\n";
 5489         }
 5490 
 5491         db_printf(fmt_hdr, "Zone", "Size", "Used", "Free", "Requests",
 5492             "Sleeps", "Bucket", "Total Mem", "XFree");
 5493 
 5494         /* Sort the zones with largest size first. */
 5495         last_zone = NULL;
 5496         last_size = INT64_MAX;
 5497         for (;;) {
 5498                 cur_zone = NULL;
 5499                 cur_size = -1;
 5500                 ties = 0;
 5501                 LIST_FOREACH(kz, &uma_kegs, uk_link) {
 5502                         LIST_FOREACH(z, &kz->uk_zones, uz_link) {
 5503                                 /*
 5504                                  * In the case of size ties, print out zones
 5505                                  * in the order they are encountered.  That is,
 5506                                  * when we encounter the most recently output
 5507                                  * zone, we have already printed all preceding
 5508                                  * ties, and we must print all following ties.
 5509                                  */
 5510                                 if (z == last_zone) {
 5511                                         ties = 1;
 5512                                         continue;
 5513                                 }
 5514                                 size = get_uma_stats(kz, z, &allocs, &used,
 5515                                     &sleeps, &cachefree, &xdomain);
 5516                                 if (size > cur_size &&
 5517                                     size < last_size + ties) {
 5518                                         cur_size = size;
 5519                                         cur_zone = z;
 5520                                         cur_keg = kz;
 5521                                 }
 5522                         }
 5523                 }
 5524                 if (cur_zone == NULL)
 5525                         break;
 5526 
 5527                 size = get_uma_stats(cur_keg, cur_zone, &allocs, &used,
 5528                     &sleeps, &cachefree, &xdomain);
 5529                 db_printf(fmt_entry, cur_zone->uz_name,
 5530                     (uintmax_t)cur_keg->uk_size, (intmax_t)used, cachefree,
 5531                     (uintmax_t)allocs, (uintmax_t)sleeps,
 5532                     (unsigned)cur_zone->uz_bucket_size, (intmax_t)size,
 5533                     xdomain);
 5534 
 5535                 if (db_pager_quit)
 5536                         return;
 5537                 last_zone = cur_zone;
 5538                 last_size = cur_size;
 5539         }
 5540 }
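/*
 * Example (illustrative): from the in-kernel debugger, "show uma" prints the
 * table built above, largest memory consumers first, and "show uma /i" emits
 * the same rows as CSV for post-processing, e.g.:
 *
 *	db> show uma /i
 *	Zone,Size,Used,Free,Requests,Sleeps,Bucket,Total Mem,XFree
 *	...
 */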
 5541 
 5542 DB_SHOW_COMMAND(umacache, db_show_umacache)
 5543 {
 5544         uma_zone_t z;
 5545         uint64_t allocs, frees;
 5546         long cachefree;
 5547         int i;
 5548 
 5549         db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
 5550             "Requests", "Bucket");
 5551         LIST_FOREACH(z, &uma_cachezones, uz_link) {
 5552                 uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL, NULL);
 5553                 for (i = 0; i < vm_ndomains; i++)
 5554                         cachefree += ZDOM_GET(z, i)->uzd_nitems;
 5555                 db_printf("%18s %8ju %8jd %8ld %12ju %8u\n",
 5556                     z->uz_name, (uintmax_t)z->uz_size,
 5557                     (intmax_t)(allocs - frees), cachefree,
 5558                     (uintmax_t)allocs, z->uz_bucket_size);
 5559                 if (db_pager_quit)
 5560                         return;
 5561         }
 5562 }
 5563 #endif  /* DDB */
