FreeBSD/Linux Kernel Cross Reference
sys/kern/subr_pool.c

    1 /*      $NetBSD: subr_pool.c,v 1.285 2022/07/16 10:20:21 simonb Exp $   */
    2 
    3 /*
    4  * Copyright (c) 1997, 1999, 2000, 2002, 2007, 2008, 2010, 2014, 2015, 2018,
    5  *     2020, 2021 The NetBSD Foundation, Inc.
    6  * All rights reserved.
    7  *
    8  * This code is derived from software contributed to The NetBSD Foundation
    9  * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
   10  * Simulation Facility, NASA Ames Research Center; by Andrew Doran, and by
   11  * Maxime Villard.
   12  *
   13  * Redistribution and use in source and binary forms, with or without
   14  * modification, are permitted provided that the following conditions
   15  * are met:
   16  * 1. Redistributions of source code must retain the above copyright
   17  *    notice, this list of conditions and the following disclaimer.
   18  * 2. Redistributions in binary form must reproduce the above copyright
   19  *    notice, this list of conditions and the following disclaimer in the
   20  *    documentation and/or other materials provided with the distribution.
   21  *
   22  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   24  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
   26  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   27  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   28  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   29  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   30  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   31  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   32  * POSSIBILITY OF SUCH DAMAGE.
   33  */
   34 
   35 #include <sys/cdefs.h>
   36 __KERNEL_RCSID(0, "$NetBSD: subr_pool.c,v 1.285 2022/07/16 10:20:21 simonb Exp $");
   37 
   38 #ifdef _KERNEL_OPT
   39 #include "opt_ddb.h"
   40 #include "opt_lockdebug.h"
   41 #include "opt_pool.h"
   42 #endif
   43 
   44 #include <sys/param.h>
   45 #include <sys/systm.h>
   46 #include <sys/sysctl.h>
   47 #include <sys/bitops.h>
   48 #include <sys/proc.h>
   49 #include <sys/errno.h>
   50 #include <sys/kernel.h>
   51 #include <sys/vmem.h>
   52 #include <sys/pool.h>
   53 #include <sys/syslog.h>
   54 #include <sys/debug.h>
   55 #include <sys/lock.h>
   56 #include <sys/lockdebug.h>
   57 #include <sys/xcall.h>
   58 #include <sys/cpu.h>
   59 #include <sys/atomic.h>
   60 #include <sys/asan.h>
   61 #include <sys/msan.h>
   62 #include <sys/fault.h>
   63 
   64 #include <uvm/uvm_extern.h>
   65 
   66 /*
   67  * Pool resource management utility.
   68  *
   69  * Memory is allocated in pages which are split into pieces according to
   70  * the pool item size. Each page is kept on one of three lists in the
   71  * pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages',
   72  * for empty, full and partially-full pages respectively. The individual
   73  * pool items are on a linked list headed by `ph_itemlist' in each page
   74  * header. The memory for building the page list is either taken from
   75  * the allocated pages themselves (for small pool items) or taken from
   76  * an internal pool of page headers (`phpool').
   77  */
   78 
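/*
 * Illustrative sketch, not part of the original source: a hypothetical
 * "struct foo" object pool showing the basic lifecycle this file
 * implements.  The names foo, foo_pool, foo_init, foo_alloc and foo_free
 * are assumptions made for the example only; see pool(9) for the
 * authoritative interface description.
 *
 *	static struct pool foo_pool;
 *
 *	void
 *	foo_init(void)
 *	{
 *		pool_init(&foo_pool, sizeof(struct foo), 0, 0, 0,
 *		    "foopl", NULL, IPL_NONE);
 *	}
 *
 *	struct foo *
 *	foo_alloc(void)
 *	{
 *		return pool_get(&foo_pool, PR_WAITOK);
 *	}
 *
 *	void
 *	foo_free(struct foo *f)
 *	{
 *		pool_put(&foo_pool, f);
 *	}
 *
 * Passing a NULL allocator selects pool_allocator_kmem, and IPL_NONE
 * restricts the pool to thread context (see the KASSERT in pool_get()).
 */
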
   79 /* List of all pools. Non static as needed by 'vmstat -m' */
   80 TAILQ_HEAD(, pool) pool_head = TAILQ_HEAD_INITIALIZER(pool_head);
   81 
   82 /* Private pool for page header structures */
   83 #define PHPOOL_MAX      8
   84 static struct pool phpool[PHPOOL_MAX];
   85 #define PHPOOL_FREELIST_NELEM(idx) \
   86         (((idx) == 0) ? BITMAP_MIN_SIZE : BITMAP_SIZE * (1 << (idx)))
   87 
   88 #if !defined(KMSAN) && (defined(DIAGNOSTIC) || defined(KASAN))
   89 #define POOL_REDZONE
   90 #endif
   91 
   92 #if defined(POOL_QUARANTINE)
   93 #define POOL_NOCACHE
   94 #endif
   95 
   96 #ifdef POOL_REDZONE
   97 # ifdef KASAN
   98 #  define POOL_REDZONE_SIZE 8
   99 # else
  100 #  define POOL_REDZONE_SIZE 2
  101 # endif
  102 static void pool_redzone_init(struct pool *, size_t);
  103 static void pool_redzone_fill(struct pool *, void *);
  104 static void pool_redzone_check(struct pool *, void *);
  105 static void pool_cache_redzone_check(pool_cache_t, void *);
  106 #else
  107 # define pool_redzone_init(pp, sz)              __nothing
  108 # define pool_redzone_fill(pp, ptr)             __nothing
  109 # define pool_redzone_check(pp, ptr)            __nothing
  110 # define pool_cache_redzone_check(pc, ptr)      __nothing
  111 #endif
  112 
  113 #ifdef KMSAN
  114 static inline void pool_get_kmsan(struct pool *, void *);
  115 static inline void pool_put_kmsan(struct pool *, void *);
  116 static inline void pool_cache_get_kmsan(pool_cache_t, void *);
  117 static inline void pool_cache_put_kmsan(pool_cache_t, void *);
  118 #else
  119 #define pool_get_kmsan(pp, ptr)         __nothing
  120 #define pool_put_kmsan(pp, ptr)         __nothing
  121 #define pool_cache_get_kmsan(pc, ptr)   __nothing
  122 #define pool_cache_put_kmsan(pc, ptr)   __nothing
  123 #endif
  124 
  125 #ifdef POOL_QUARANTINE
  126 static void pool_quarantine_init(struct pool *);
  127 static void pool_quarantine_flush(struct pool *);
  128 static bool pool_put_quarantine(struct pool *, void *,
  129     struct pool_pagelist *);
  130 #else
  131 #define pool_quarantine_init(a)                 __nothing
  132 #define pool_quarantine_flush(a)                __nothing
  133 #define pool_put_quarantine(a, b, c)            false
  134 #endif
  135 
  136 #ifdef POOL_NOCACHE
  137 static bool pool_cache_put_nocache(pool_cache_t, void *);
  138 #else
  139 #define pool_cache_put_nocache(a, b)            false
  140 #endif
  141 
  142 #define NO_CTOR __FPTRCAST(int (*)(void *, void *, int), nullop)
  143 #define NO_DTOR __FPTRCAST(void (*)(void *, void *), nullop)
  144 
  145 #define pc_has_pser(pc) (((pc)->pc_roflags & PR_PSERIALIZE) != 0)
  146 #define pc_has_ctor(pc) ((pc)->pc_ctor != NO_CTOR)
  147 #define pc_has_dtor(pc) ((pc)->pc_dtor != NO_DTOR)
  148 
  149 #define pp_has_pser(pp) (((pp)->pr_roflags & PR_PSERIALIZE) != 0)
  150 
  151 #define pool_barrier()  xc_barrier(0)
  152 
  153 /*
  154  * Pool backend allocators.
  155  *
  156  * Each pool has a backend allocator that handles allocation, deallocation,
  157  * and any additional draining that might be needed.
  158  *
  159  * We provide two standard allocators:
  160  *
  161  *      pool_allocator_kmem - the default when no allocator is specified
  162  *
  163  *      pool_allocator_nointr - used for pools that will not be accessed
  164  *      in interrupt context.
  165  */
  166 void *pool_page_alloc(struct pool *, int);
  167 void pool_page_free(struct pool *, void *);
  168 
  169 static void *pool_page_alloc_meta(struct pool *, int);
  170 static void pool_page_free_meta(struct pool *, void *);
  171 
  172 struct pool_allocator pool_allocator_kmem = {
  173         .pa_alloc = pool_page_alloc,
  174         .pa_free = pool_page_free,
  175         .pa_pagesz = 0
  176 };
  177 
  178 struct pool_allocator pool_allocator_nointr = {
  179         .pa_alloc = pool_page_alloc,
  180         .pa_free = pool_page_free,
  181         .pa_pagesz = 0
  182 };
  183 
  184 struct pool_allocator pool_allocator_meta = {
  185         .pa_alloc = pool_page_alloc_meta,
  186         .pa_free = pool_page_free_meta,
  187         .pa_pagesz = 0
  188 };
  189 
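/*
 * Illustrative sketch, not part of the original source: a hypothetical
 * driver-private backend allocator.  The mydrv_* names are assumptions
 * for the example; the fields mirror the standard allocators defined
 * above, and pa_pagesz == 0 means "use PAGE_SIZE" (see pool_init()).
 *
 *	static void *mydrv_page_alloc(struct pool *, int);
 *	static void mydrv_page_free(struct pool *, void *);
 *
 *	static struct pool_allocator mydrv_allocator = {
 *		.pa_alloc = mydrv_page_alloc,
 *		.pa_free = mydrv_page_free,
 *		.pa_pagesz = 0,
 *	};
 *
 *	pool_init(&mydrv_pool, sizeof(struct mydrv_item), 0, 0, 0,
 *	    "mydrvpl", &mydrv_allocator, IPL_VM);
 */
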
  190 #define POOL_ALLOCATOR_BIG_BASE 13
  191 static struct pool_allocator pool_allocator_big[] = {
  192         {
  193                 .pa_alloc = pool_page_alloc,
  194                 .pa_free = pool_page_free,
  195                 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 0),
  196         },
  197         {
  198                 .pa_alloc = pool_page_alloc,
  199                 .pa_free = pool_page_free,
  200                 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 1),
  201         },
  202         {
  203                 .pa_alloc = pool_page_alloc,
  204                 .pa_free = pool_page_free,
  205                 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 2),
  206         },
  207         {
  208                 .pa_alloc = pool_page_alloc,
  209                 .pa_free = pool_page_free,
  210                 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 3),
  211         },
  212         {
  213                 .pa_alloc = pool_page_alloc,
  214                 .pa_free = pool_page_free,
  215                 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 4),
  216         },
  217         {
  218                 .pa_alloc = pool_page_alloc,
  219                 .pa_free = pool_page_free,
  220                 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 5),
  221         },
  222         {
  223                 .pa_alloc = pool_page_alloc,
  224                 .pa_free = pool_page_free,
  225                 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 6),
  226         },
  227         {
  228                 .pa_alloc = pool_page_alloc,
  229                 .pa_free = pool_page_free,
  230                 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 7),
  231         },
  232         {
  233                 .pa_alloc = pool_page_alloc,
  234                 .pa_free = pool_page_free,
  235                 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 8),
  236         },
  237         {
  238                 .pa_alloc = pool_page_alloc,
  239                 .pa_free = pool_page_free,
  240                 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 9),
  241         },
  242         {
  243                 .pa_alloc = pool_page_alloc,
  244                 .pa_free = pool_page_free,
  245                 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 10),
  246         },
  247         {
  248                 .pa_alloc = pool_page_alloc,
  249                 .pa_free = pool_page_free,
  250                 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 11),
  251         }
  252 };
  253 
  254 static int pool_bigidx(size_t);
  255 
  256 /* # of seconds to retain page after last use */
  257 int pool_inactive_time = 10;
  258 
  259 /* Next candidate for drainage (see pool_drain()) */
  260 static struct pool *drainpp;
  261 
  262 /* This lock protects both pool_head and drainpp. */
  263 static kmutex_t pool_head_lock;
  264 static kcondvar_t pool_busy;
  265 
  266 /* This lock protects initialization of a potentially shared pool allocator */
  267 static kmutex_t pool_allocator_lock;
  268 
  269 static unsigned int poolid_counter = 0;
  270 
  271 typedef uint32_t pool_item_bitmap_t;
  272 #define BITMAP_SIZE     (CHAR_BIT * sizeof(pool_item_bitmap_t))
  273 #define BITMAP_MASK     (BITMAP_SIZE - 1)
  274 #define BITMAP_MIN_SIZE (CHAR_BIT * sizeof(((struct pool_item_header *)NULL)->ph_u2))
  275 
  276 struct pool_item_header {
  277         /* Page headers */
  278         LIST_ENTRY(pool_item_header)
  279                                 ph_pagelist;    /* pool page list */
  280         union {
  281                 /* !PR_PHINPAGE */
  282                 struct {
  283                         SPLAY_ENTRY(pool_item_header)
  284                                 phu_node;       /* off-page page headers */
  285                 } phu_offpage;
  286                 /* PR_PHINPAGE */
  287                 struct {
  288                         unsigned int phu_poolid;
  289                 } phu_onpage;
  290         } ph_u1;
  291         void *                  ph_page;        /* this page's address */
  292         uint32_t                ph_time;        /* last referenced */
  293         uint16_t                ph_nmissing;    /* # of chunks in use */
  294         uint16_t                ph_off;         /* start offset in page */
  295         union {
  296                 /* !PR_USEBMAP */
  297                 struct {
  298                         LIST_HEAD(, pool_item)
  299                                 phu_itemlist;   /* chunk list for this page */
  300                 } phu_normal;
  301                 /* PR_USEBMAP */
  302                 struct {
  303                         pool_item_bitmap_t phu_bitmap[1];
  304                 } phu_notouch;
  305         } ph_u2;
  306 };
  307 #define ph_node         ph_u1.phu_offpage.phu_node
  308 #define ph_poolid       ph_u1.phu_onpage.phu_poolid
  309 #define ph_itemlist     ph_u2.phu_normal.phu_itemlist
  310 #define ph_bitmap       ph_u2.phu_notouch.phu_bitmap
  311 
  312 #define PHSIZE  ALIGN(sizeof(struct pool_item_header))
  313 
  314 CTASSERT(offsetof(struct pool_item_header, ph_u2) +
  315     BITMAP_MIN_SIZE / CHAR_BIT == sizeof(struct pool_item_header));
  316 
  317 #if defined(DIAGNOSTIC) && !defined(KASAN)
  318 #define POOL_CHECK_MAGIC
  319 #endif
  320 
  321 struct pool_item {
  322 #ifdef POOL_CHECK_MAGIC
  323         u_int pi_magic;
  324 #endif
  325 #define PI_MAGIC 0xdeaddeadU
  326         /* Other entries use only this list entry */
  327         LIST_ENTRY(pool_item)   pi_list;
  328 };
  329 
  330 #define POOL_NEEDS_CATCHUP(pp)                                          \
  331         ((pp)->pr_nitems < (pp)->pr_minitems ||                         \
  332          (pp)->pr_npages < (pp)->pr_minpages)
  333 #define POOL_OBJ_TO_PAGE(pp, v)                                         \
  334         (void *)((uintptr_t)v & pp->pr_alloc->pa_pagemask)
  335 
  336 /*
  337  * Pool cache management.
  338  *
  339  * Pool caches provide a way for constructed objects to be cached by the
  340  * pool subsystem.  This can lead to performance improvements by avoiding
   341  * needless object construction/destruction; destruction is deferred
   342  * until absolutely necessary.
  343  *
  344  * Caches are grouped into cache groups.  Each cache group references up
  345  * to PCG_NUMOBJECTS constructed objects.  When a cache allocates an
  346  * object from the pool, it calls the object's constructor and places it
  347  * into a cache group.  When a cache group frees an object back to the
  348  * pool, it first calls the object's destructor.  This allows the object
  349  * to persist in constructed form while freed to the cache.
  350  *
  351  * The pool references each cache, so that when a pool is drained by the
  352  * pagedaemon, it can drain each individual cache as well.  Each time a
  353  * cache is drained, the most idle cache group is freed to the pool in
  354  * its entirety.
  355  *
  356  * Pool caches are laid on top of pools.  By layering them, we can avoid
  357  * the complexity of cache management for pools which would not benefit
  358  * from it.
  359  */
  360 
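/*
 * Illustrative sketch, not part of the original source: a hypothetical
 * cache of constructed "struct foo" objects.  The foo_* names and the
 * ctor/dtor pair are assumptions for the example; see pool_cache(9) for
 * the authoritative interface.
 *
 *	static pool_cache_t foo_cache;
 *
 *	foo_cache = pool_cache_init(sizeof(struct foo), coherency_unit,
 *	    0, 0, "foocache", NULL, IPL_NONE, foo_ctor, foo_dtor, NULL);
 *
 *	struct foo *f = pool_cache_get(foo_cache, PR_WAITOK);
 *	...
 *	pool_cache_put(foo_cache, f);
 *
 * An object returned with pool_cache_put() stays constructed while it
 * sits in a cache group; foo_dtor() runs only when a group is released
 * back to the underlying pool, e.g. when the pagedaemon drains the pool.
 */
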
  361 static struct pool pcg_normal_pool;
  362 static struct pool pcg_large_pool;
  363 static struct pool cache_pool;
  364 static struct pool cache_cpu_pool;
  365 
  366 static pcg_t *volatile pcg_large_cache __cacheline_aligned;
  367 static pcg_t *volatile pcg_normal_cache __cacheline_aligned;
  368 
  369 /* List of all caches. */
  370 TAILQ_HEAD(,pool_cache) pool_cache_head =
  371     TAILQ_HEAD_INITIALIZER(pool_cache_head);
  372 
  373 int pool_cache_disable;         /* global disable for caching */
  374 static const pcg_t pcg_dummy;   /* zero sized: always empty, yet always full */
  375 
  376 static bool     pool_cache_put_slow(pool_cache_t, pool_cache_cpu_t *, int,
  377                                     void *);
  378 static bool     pool_cache_get_slow(pool_cache_t, pool_cache_cpu_t *, int,
  379                                     void **, paddr_t *, int);
  380 static void     pool_cache_cpu_init1(struct cpu_info *, pool_cache_t);
  381 static int      pool_cache_invalidate_groups(pool_cache_t, pcg_t *);
  382 static void     pool_cache_invalidate_cpu(pool_cache_t, u_int);
  383 static void     pool_cache_transfer(pool_cache_t);
  384 static int      pool_pcg_get(pcg_t *volatile *, pcg_t **);
  385 static int      pool_pcg_put(pcg_t *volatile *, pcg_t *);
  386 static pcg_t *  pool_pcg_trunc(pcg_t *volatile *);
  387 
  388 static int      pool_catchup(struct pool *);
  389 static void     pool_prime_page(struct pool *, void *,
  390                     struct pool_item_header *);
  391 static void     pool_update_curpage(struct pool *);
  392 
  393 static int      pool_grow(struct pool *, int);
  394 static void     *pool_allocator_alloc(struct pool *, int);
  395 static void     pool_allocator_free(struct pool *, void *);
  396 
  397 static void pool_print_pagelist(struct pool *, struct pool_pagelist *,
  398         void (*)(const char *, ...) __printflike(1, 2));
  399 static void pool_print1(struct pool *, const char *,
  400         void (*)(const char *, ...) __printflike(1, 2));
  401 
  402 static int pool_chk_page(struct pool *, const char *,
  403                          struct pool_item_header *);
  404 
  405 /* -------------------------------------------------------------------------- */
  406 
  407 static inline unsigned int
  408 pr_item_bitmap_index(const struct pool *pp, const struct pool_item_header *ph,
  409     const void *v)
  410 {
  411         const char *cp = v;
  412         unsigned int idx;
  413 
  414         KASSERT(pp->pr_roflags & PR_USEBMAP);
  415         idx = (cp - (char *)ph->ph_page - ph->ph_off) / pp->pr_size;
  416 
  417         if (__predict_false(idx >= pp->pr_itemsperpage)) {
  418                 panic("%s: [%s] %u >= %u", __func__, pp->pr_wchan, idx,
  419                     pp->pr_itemsperpage);
  420         }
  421 
  422         return idx;
  423 }
  424 
  425 static inline void
  426 pr_item_bitmap_put(const struct pool *pp, struct pool_item_header *ph,
  427     void *obj)
  428 {
  429         unsigned int idx = pr_item_bitmap_index(pp, ph, obj);
  430         pool_item_bitmap_t *bitmap = ph->ph_bitmap + (idx / BITMAP_SIZE);
  431         pool_item_bitmap_t mask = 1U << (idx & BITMAP_MASK);
  432 
  433         if (__predict_false((*bitmap & mask) != 0)) {
  434                 panic("%s: [%s] %p already freed", __func__, pp->pr_wchan, obj);
  435         }
  436 
  437         *bitmap |= mask;
  438 }
  439 
  440 static inline void *
  441 pr_item_bitmap_get(const struct pool *pp, struct pool_item_header *ph)
  442 {
  443         pool_item_bitmap_t *bitmap = ph->ph_bitmap;
  444         unsigned int idx;
  445         int i;
  446 
  447         for (i = 0; ; i++) {
  448                 int bit;
  449 
  450                 KASSERT((i * BITMAP_SIZE) < pp->pr_itemsperpage);
  451                 bit = ffs32(bitmap[i]);
  452                 if (bit) {
  453                         pool_item_bitmap_t mask;
  454 
  455                         bit--;
  456                         idx = (i * BITMAP_SIZE) + bit;
  457                         mask = 1U << bit;
  458                         KASSERT((bitmap[i] & mask) != 0);
  459                         bitmap[i] &= ~mask;
  460                         break;
  461                 }
  462         }
  463         KASSERT(idx < pp->pr_itemsperpage);
  464         return (char *)ph->ph_page + ph->ph_off + idx * pp->pr_size;
  465 }
  466 
  467 static inline void
  468 pr_item_bitmap_init(const struct pool *pp, struct pool_item_header *ph)
  469 {
  470         pool_item_bitmap_t *bitmap = ph->ph_bitmap;
  471         const int n = howmany(pp->pr_itemsperpage, BITMAP_SIZE);
  472         int i;
  473 
  474         for (i = 0; i < n; i++) {
  475                 bitmap[i] = (pool_item_bitmap_t)-1;
  476         }
  477 }
  478 
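/*
 * Illustrative example with assumed numbers, not from the original
 * source: a PR_USEBMAP page with pr_itemsperpage == 40 needs
 * howmany(40, BITMAP_SIZE) == 2 bitmap words, initialized to all ones
 * ("every slot free") by pr_item_bitmap_init().  pr_item_bitmap_get()
 * hands out the lowest set bit reported by ffs32() and clears it;
 * putting back the item at index 35 sets bit 35 & BITMAP_MASK == 3 of
 * word 35 / BITMAP_SIZE == 1 again, and panics if that bit was already
 * set, which would indicate a double free.
 */
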
  479 /* -------------------------------------------------------------------------- */
  480 
  481 static inline void
  482 pr_item_linkedlist_put(const struct pool *pp, struct pool_item_header *ph,
  483     void *obj)
  484 {
  485         struct pool_item *pi = obj;
  486 
  487         KASSERT(!pp_has_pser(pp));
  488 
  489 #ifdef POOL_CHECK_MAGIC
  490         pi->pi_magic = PI_MAGIC;
  491 #endif
  492 
  493         if (pp->pr_redzone) {
  494                 /*
  495                  * Mark the pool_item as valid. The rest is already
  496                  * invalid.
  497                  */
  498                 kasan_mark(pi, sizeof(*pi), sizeof(*pi), 0);
  499         }
  500 
  501         LIST_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
  502 }
  503 
  504 static inline void *
  505 pr_item_linkedlist_get(struct pool *pp, struct pool_item_header *ph)
  506 {
  507         struct pool_item *pi;
  508         void *v;
  509 
  510         v = pi = LIST_FIRST(&ph->ph_itemlist);
  511         if (__predict_false(v == NULL)) {
  512                 mutex_exit(&pp->pr_lock);
  513                 panic("%s: [%s] page empty", __func__, pp->pr_wchan);
  514         }
  515         KASSERTMSG((pp->pr_nitems > 0),
  516             "%s: [%s] nitems %u inconsistent on itemlist",
  517             __func__, pp->pr_wchan, pp->pr_nitems);
  518 #ifdef POOL_CHECK_MAGIC
  519         KASSERTMSG((pi->pi_magic == PI_MAGIC),
  520             "%s: [%s] free list modified: "
  521             "magic=%x; page %p; item addr %p", __func__,
  522             pp->pr_wchan, pi->pi_magic, ph->ph_page, pi);
  523 #endif
  524 
  525         /*
  526          * Remove from item list.
  527          */
  528         LIST_REMOVE(pi, pi_list);
  529 
  530         return v;
  531 }
  532 
  533 /* -------------------------------------------------------------------------- */
  534 
  535 static inline void
  536 pr_phinpage_check(struct pool *pp, struct pool_item_header *ph, void *page,
  537     void *object)
  538 {
  539         if (__predict_false((void *)ph->ph_page != page)) {
  540                 panic("%s: [%s] item %p not part of pool", __func__,
  541                     pp->pr_wchan, object);
  542         }
  543         if (__predict_false((char *)object < (char *)page + ph->ph_off)) {
  544                 panic("%s: [%s] item %p below item space", __func__,
  545                     pp->pr_wchan, object);
  546         }
  547         if (__predict_false(ph->ph_poolid != pp->pr_poolid)) {
  548                 panic("%s: [%s] item %p poolid %u != %u", __func__,
  549                     pp->pr_wchan, object, ph->ph_poolid, pp->pr_poolid);
  550         }
  551 }
  552 
  553 static inline void
  554 pc_phinpage_check(pool_cache_t pc, void *object)
  555 {
  556         struct pool_item_header *ph;
  557         struct pool *pp;
  558         void *page;
  559 
  560         pp = &pc->pc_pool;
  561         page = POOL_OBJ_TO_PAGE(pp, object);
  562         ph = (struct pool_item_header *)page;
  563 
  564         pr_phinpage_check(pp, ph, page, object);
  565 }
  566 
  567 /* -------------------------------------------------------------------------- */
  568 
  569 static inline int
  570 phtree_compare(struct pool_item_header *a, struct pool_item_header *b)
  571 {
  572 
  573         /*
   574          * We consider a pool_item_header with a smaller ph_page to be bigger. This
  575          * unnatural ordering is for the benefit of pr_find_pagehead.
  576          */
  577         if (a->ph_page < b->ph_page)
  578                 return 1;
  579         else if (a->ph_page > b->ph_page)
  580                 return -1;
  581         else
  582                 return 0;
  583 }
  584 
  585 SPLAY_PROTOTYPE(phtree, pool_item_header, ph_node, phtree_compare);
  586 SPLAY_GENERATE(phtree, pool_item_header, ph_node, phtree_compare);
  587 
  588 static inline struct pool_item_header *
  589 pr_find_pagehead_noalign(struct pool *pp, void *v)
  590 {
  591         struct pool_item_header *ph, tmp;
  592 
  593         tmp.ph_page = (void *)(uintptr_t)v;
  594         ph = SPLAY_FIND(phtree, &pp->pr_phtree, &tmp);
  595         if (ph == NULL) {
  596                 ph = SPLAY_ROOT(&pp->pr_phtree);
  597                 if (ph != NULL && phtree_compare(&tmp, ph) >= 0) {
  598                         ph = SPLAY_NEXT(phtree, &pp->pr_phtree, ph);
  599                 }
  600                 KASSERT(ph == NULL || phtree_compare(&tmp, ph) < 0);
  601         }
  602 
  603         return ph;
  604 }
  605 
  606 /*
  607  * Return the pool page header based on item address.
  608  */
  609 static inline struct pool_item_header *
  610 pr_find_pagehead(struct pool *pp, void *v)
  611 {
  612         struct pool_item_header *ph, tmp;
  613 
  614         if ((pp->pr_roflags & PR_NOALIGN) != 0) {
  615                 ph = pr_find_pagehead_noalign(pp, v);
  616         } else {
  617                 void *page = POOL_OBJ_TO_PAGE(pp, v);
  618                 if ((pp->pr_roflags & PR_PHINPAGE) != 0) {
  619                         ph = (struct pool_item_header *)page;
  620                         pr_phinpage_check(pp, ph, page, v);
  621                 } else {
  622                         tmp.ph_page = page;
  623                         ph = SPLAY_FIND(phtree, &pp->pr_phtree, &tmp);
  624                 }
  625         }
  626 
  627         KASSERT(ph == NULL || ((pp->pr_roflags & PR_PHINPAGE) != 0) ||
  628             ((char *)ph->ph_page <= (char *)v &&
  629             (char *)v < (char *)ph->ph_page + pp->pr_alloc->pa_pagesz));
  630         return ph;
  631 }
  632 
  633 static void
  634 pr_pagelist_free(struct pool *pp, struct pool_pagelist *pq)
  635 {
  636         struct pool_item_header *ph;
  637 
  638         while ((ph = LIST_FIRST(pq)) != NULL) {
  639                 LIST_REMOVE(ph, ph_pagelist);
  640                 pool_allocator_free(pp, ph->ph_page);
  641                 if ((pp->pr_roflags & PR_PHINPAGE) == 0)
  642                         pool_put(pp->pr_phpool, ph);
  643         }
  644 }
  645 
  646 /*
  647  * Remove a page from the pool.
  648  */
  649 static inline void
  650 pr_rmpage(struct pool *pp, struct pool_item_header *ph,
  651      struct pool_pagelist *pq)
  652 {
  653 
  654         KASSERT(mutex_owned(&pp->pr_lock));
  655 
  656         /*
  657          * If the page was idle, decrement the idle page count.
  658          */
  659         if (ph->ph_nmissing == 0) {
  660                 KASSERT(pp->pr_nidle != 0);
  661                 KASSERTMSG((pp->pr_nitems >= pp->pr_itemsperpage),
  662                     "%s: [%s] nitems=%u < itemsperpage=%u", __func__,
  663                     pp->pr_wchan, pp->pr_nitems, pp->pr_itemsperpage);
  664                 pp->pr_nidle--;
  665         }
  666 
  667         pp->pr_nitems -= pp->pr_itemsperpage;
  668 
  669         /*
  670          * Unlink the page from the pool and queue it for release.
  671          */
  672         LIST_REMOVE(ph, ph_pagelist);
  673         if (pp->pr_roflags & PR_PHINPAGE) {
  674                 if (__predict_false(ph->ph_poolid != pp->pr_poolid)) {
  675                         panic("%s: [%s] ph %p poolid %u != %u",
  676                             __func__, pp->pr_wchan, ph, ph->ph_poolid,
  677                             pp->pr_poolid);
  678                 }
  679         } else {
  680                 SPLAY_REMOVE(phtree, &pp->pr_phtree, ph);
  681         }
  682         LIST_INSERT_HEAD(pq, ph, ph_pagelist);
  683 
  684         pp->pr_npages--;
  685         pp->pr_npagefree++;
  686 
  687         pool_update_curpage(pp);
  688 }
  689 
  690 /*
  691  * Initialize all the pools listed in the "pools" link set.
  692  */
  693 void
  694 pool_subsystem_init(void)
  695 {
  696         size_t size;
  697         int idx;
  698 
  699         mutex_init(&pool_head_lock, MUTEX_DEFAULT, IPL_NONE);
  700         mutex_init(&pool_allocator_lock, MUTEX_DEFAULT, IPL_NONE);
  701         cv_init(&pool_busy, "poolbusy");
  702 
  703         /*
  704          * Initialize private page header pool and cache magazine pool if we
  705          * haven't done so yet.
  706          */
  707         for (idx = 0; idx < PHPOOL_MAX; idx++) {
  708                 static char phpool_names[PHPOOL_MAX][6+1+6+1];
  709                 int nelem;
  710                 size_t sz;
  711 
  712                 nelem = PHPOOL_FREELIST_NELEM(idx);
  713                 KASSERT(nelem != 0);
  714                 snprintf(phpool_names[idx], sizeof(phpool_names[idx]),
  715                     "phpool-%d", nelem);
  716                 sz = offsetof(struct pool_item_header,
  717                     ph_bitmap[howmany(nelem, BITMAP_SIZE)]);
  718                 pool_init(&phpool[idx], sz, 0, 0, 0,
  719                     phpool_names[idx], &pool_allocator_meta, IPL_VM);
  720         }
  721 
  722         size = sizeof(pcg_t) +
  723             (PCG_NOBJECTS_NORMAL - 1) * sizeof(pcgpair_t);
  724         pool_init(&pcg_normal_pool, size, coherency_unit, 0, 0,
  725             "pcgnormal", &pool_allocator_meta, IPL_VM);
  726 
  727         size = sizeof(pcg_t) +
  728             (PCG_NOBJECTS_LARGE - 1) * sizeof(pcgpair_t);
  729         pool_init(&pcg_large_pool, size, coherency_unit, 0, 0,
  730             "pcglarge", &pool_allocator_meta, IPL_VM);
  731 
  732         pool_init(&cache_pool, sizeof(struct pool_cache), coherency_unit,
  733             0, 0, "pcache", &pool_allocator_meta, IPL_NONE);
  734 
  735         pool_init(&cache_cpu_pool, sizeof(pool_cache_cpu_t), coherency_unit,
  736             0, 0, "pcachecpu", &pool_allocator_meta, IPL_NONE);
  737 }
  738 
  739 static inline bool
  740 pool_init_is_phinpage(const struct pool *pp)
  741 {
  742         size_t pagesize;
  743 
  744         if (pp->pr_roflags & PR_PHINPAGE) {
  745                 return true;
  746         }
  747         if (pp->pr_roflags & (PR_NOTOUCH | PR_NOALIGN)) {
  748                 return false;
  749         }
  750 
  751         pagesize = pp->pr_alloc->pa_pagesz;
  752 
  753         /*
  754          * Threshold: the item size is below 1/16 of a page size, and below
  755          * 8 times the page header size. The latter ensures we go off-page
  756          * if the page header would make us waste a rather big item.
  757          */
  758         if (pp->pr_size < MIN(pagesize / 16, PHSIZE * 8)) {
  759                 return true;
  760         }
  761 
  762         /* Put the header into the page if it doesn't waste any items. */
  763         if (pagesize / pp->pr_size == (pagesize - PHSIZE) / pp->pr_size) {
  764                 return true;
  765         }
  766 
  767         return false;
  768 }
  769 
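/*
 * Worked example with assumed numbers, not from the original source:
 * with a 4096-byte page and a PHSIZE of 64 bytes, the threshold above
 * is MIN(4096 / 16, 64 * 8) == 256, so any item smaller than 256 bytes
 * keeps its header on-page.  A 768-byte item also stays on-page because
 * 4096 / 768 == (4096 - 64) / 768 == 5, i.e. giving the header 64 bytes
 * costs no items; a 1024-byte item goes off-page, since 4096 / 1024 == 4
 * but (4096 - 64) / 1024 == 3.
 */
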
  770 static inline bool
  771 pool_init_is_usebmap(const struct pool *pp)
  772 {
  773         size_t bmapsize;
  774 
  775         if (pp->pr_roflags & PR_NOTOUCH) {
  776                 return true;
  777         }
  778 
  779         /*
  780          * If we're off-page, go with a bitmap.
  781          */
  782         if (!(pp->pr_roflags & PR_PHINPAGE)) {
  783                 return true;
  784         }
  785 
  786         /*
  787          * If we're on-page, and the page header can already contain a bitmap
  788          * big enough to cover all the items of the page, go with a bitmap.
  789          */
  790         bmapsize = roundup(PHSIZE, pp->pr_align) -
  791             offsetof(struct pool_item_header, ph_bitmap[0]);
  792         KASSERT(bmapsize % sizeof(pool_item_bitmap_t) == 0);
  793         if (pp->pr_itemsperpage <= bmapsize * CHAR_BIT) {
  794                 return true;
  795         }
  796 
  797         return false;
  798 }
  799 
  800 /*
  801  * Initialize the given pool resource structure.
  802  *
  803  * We export this routine to allow other kernel parts to declare
  804  * static pools that must be initialized before kmem(9) is available.
  805  */
  806 void
  807 pool_init(struct pool *pp, size_t size, u_int align, u_int ioff, int flags,
  808     const char *wchan, struct pool_allocator *palloc, int ipl)
  809 {
  810         struct pool *pp1;
  811         size_t prsize;
  812         int itemspace, slack;
  813 
  814         /* XXX ioff will be removed. */
  815         KASSERT(ioff == 0);
  816 
  817 #ifdef DEBUG
  818         if (__predict_true(!cold))
  819                 mutex_enter(&pool_head_lock);
  820         /*
  821          * Check that the pool hasn't already been initialised and
  822          * added to the list of all pools.
  823          */
  824         TAILQ_FOREACH(pp1, &pool_head, pr_poollist) {
  825                 if (pp == pp1)
  826                         panic("%s: [%s] already initialised", __func__,
  827                             wchan);
  828         }
  829         if (__predict_true(!cold))
  830                 mutex_exit(&pool_head_lock);
  831 #endif
  832 
  833         if (palloc == NULL)
  834                 palloc = &pool_allocator_kmem;
  835 
  836         if (!cold)
  837                 mutex_enter(&pool_allocator_lock);
  838         if (palloc->pa_refcnt++ == 0) {
  839                 if (palloc->pa_pagesz == 0)
  840                         palloc->pa_pagesz = PAGE_SIZE;
  841 
  842                 TAILQ_INIT(&palloc->pa_list);
  843 
  844                 mutex_init(&palloc->pa_lock, MUTEX_DEFAULT, IPL_VM);
  845                 palloc->pa_pagemask = ~(palloc->pa_pagesz - 1);
  846                 palloc->pa_pageshift = ffs(palloc->pa_pagesz) - 1;
  847         }
  848         if (!cold)
  849                 mutex_exit(&pool_allocator_lock);
  850 
  851         /*
  852          * PR_PSERIALIZE implies PR_NOTOUCH; freed objects must remain
   853          * valid until the backing page is returned to the system.
  854          */
  855         if (flags & PR_PSERIALIZE) {
  856                 flags |= PR_NOTOUCH;
  857         }
  858 
  859         if (align == 0)
  860                 align = ALIGN(1);
  861 
  862         prsize = size;
  863         if ((flags & PR_NOTOUCH) == 0 && prsize < sizeof(struct pool_item))
  864                 prsize = sizeof(struct pool_item);
  865 
  866         prsize = roundup(prsize, align);
  867         KASSERTMSG((prsize <= palloc->pa_pagesz),
  868             "%s: [%s] pool item size (%zu) larger than page size (%u)",
  869             __func__, wchan, prsize, palloc->pa_pagesz);
  870 
  871         /*
  872          * Initialize the pool structure.
  873          */
  874         LIST_INIT(&pp->pr_emptypages);
  875         LIST_INIT(&pp->pr_fullpages);
  876         LIST_INIT(&pp->pr_partpages);
  877         pp->pr_cache = NULL;
  878         pp->pr_curpage = NULL;
  879         pp->pr_npages = 0;
  880         pp->pr_minitems = 0;
  881         pp->pr_minpages = 0;
  882         pp->pr_maxpages = UINT_MAX;
  883         pp->pr_roflags = flags;
  884         pp->pr_flags = 0;
  885         pp->pr_size = prsize;
  886         pp->pr_reqsize = size;
  887         pp->pr_align = align;
  888         pp->pr_wchan = wchan;
  889         pp->pr_alloc = palloc;
  890         pp->pr_poolid = atomic_inc_uint_nv(&poolid_counter);
  891         pp->pr_nitems = 0;
  892         pp->pr_nout = 0;
  893         pp->pr_hardlimit = UINT_MAX;
  894         pp->pr_hardlimit_warning = NULL;
  895         pp->pr_hardlimit_ratecap.tv_sec = 0;
  896         pp->pr_hardlimit_ratecap.tv_usec = 0;
  897         pp->pr_hardlimit_warning_last.tv_sec = 0;
  898         pp->pr_hardlimit_warning_last.tv_usec = 0;
  899         pp->pr_drain_hook = NULL;
  900         pp->pr_drain_hook_arg = NULL;
  901         pp->pr_freecheck = NULL;
  902         pp->pr_redzone = false;
  903         pool_redzone_init(pp, size);
  904         pool_quarantine_init(pp);
  905 
  906         /*
  907          * Decide whether to put the page header off-page to avoid wasting too
  908          * large a part of the page or too big an item. Off-page page headers
   909          * go in a splay tree, so we can match a returned item with its header
  910          * based on the page address.
  911          */
  912         if (pool_init_is_phinpage(pp)) {
  913                 /* Use the beginning of the page for the page header */
  914                 itemspace = palloc->pa_pagesz - roundup(PHSIZE, align);
  915                 pp->pr_itemoffset = roundup(PHSIZE, align);
  916                 pp->pr_roflags |= PR_PHINPAGE;
  917         } else {
  918                 /* The page header will be taken from our page header pool */
  919                 itemspace = palloc->pa_pagesz;
  920                 pp->pr_itemoffset = 0;
  921                 SPLAY_INIT(&pp->pr_phtree);
  922         }
  923 
  924         pp->pr_itemsperpage = itemspace / pp->pr_size;
  925         KASSERT(pp->pr_itemsperpage != 0);
  926 
  927         /*
  928          * Decide whether to use a bitmap or a linked list to manage freed
  929          * items.
  930          */
  931         if (pool_init_is_usebmap(pp)) {
  932                 pp->pr_roflags |= PR_USEBMAP;
  933         }
  934 
  935         /*
  936          * If we're off-page, then we're using a bitmap; choose the appropriate
  937          * pool to allocate page headers, whose size varies depending on the
  938          * bitmap. If we're on-page, nothing to do.
  939          */
  940         if (!(pp->pr_roflags & PR_PHINPAGE)) {
  941                 int idx;
  942 
  943                 KASSERT(pp->pr_roflags & PR_USEBMAP);
  944 
  945                 for (idx = 0; pp->pr_itemsperpage > PHPOOL_FREELIST_NELEM(idx);
  946                     idx++) {
  947                         /* nothing */
  948                 }
  949                 if (idx >= PHPOOL_MAX) {
  950                         /*
   951                          * if you see this panic, consider tweaking
  952                          * PHPOOL_MAX and PHPOOL_FREELIST_NELEM.
  953                          */
  954                         panic("%s: [%s] too large itemsperpage(%d) for "
  955                             "PR_USEBMAP", __func__,
  956                             pp->pr_wchan, pp->pr_itemsperpage);
  957                 }
  958                 pp->pr_phpool = &phpool[idx];
  959         } else {
  960                 pp->pr_phpool = NULL;
  961         }
  962 
  963         /*
  964          * Use the slack between the chunks and the page header
  965          * for "cache coloring".
  966          */
  967         slack = itemspace - pp->pr_itemsperpage * pp->pr_size;
  968         pp->pr_maxcolor = rounddown(slack, align);
  969         pp->pr_curcolor = 0;
  970 
  971         pp->pr_nget = 0;
  972         pp->pr_nfail = 0;
  973         pp->pr_nput = 0;
  974         pp->pr_npagealloc = 0;
  975         pp->pr_npagefree = 0;
  976         pp->pr_hiwat = 0;
  977         pp->pr_nidle = 0;
  978         pp->pr_refcnt = 0;
  979 
  980         mutex_init(&pp->pr_lock, MUTEX_DEFAULT, ipl);
  981         cv_init(&pp->pr_cv, wchan);
  982         pp->pr_ipl = ipl;
  983 
  984         /* Insert into the list of all pools. */
  985         if (!cold)
  986                 mutex_enter(&pool_head_lock);
  987         TAILQ_FOREACH(pp1, &pool_head, pr_poollist) {
  988                 if (strcmp(pp1->pr_wchan, pp->pr_wchan) > 0)
  989                         break;
  990         }
  991         if (pp1 == NULL)
  992                 TAILQ_INSERT_TAIL(&pool_head, pp, pr_poollist);
  993         else
  994                 TAILQ_INSERT_BEFORE(pp1, pp, pr_poollist);
  995         if (!cold)
  996                 mutex_exit(&pool_head_lock);
  997 
  998         /* Insert this into the list of pools using this allocator. */
  999         if (!cold)
 1000                 mutex_enter(&palloc->pa_lock);
 1001         TAILQ_INSERT_TAIL(&palloc->pa_list, pp, pr_alloc_list);
 1002         if (!cold)
 1003                 mutex_exit(&palloc->pa_lock);
 1004 }
 1005 
 1006 /*
 1007  * De-commission a pool resource.
 1008  */
 1009 void
 1010 pool_destroy(struct pool *pp)
 1011 {
 1012         struct pool_pagelist pq;
 1013         struct pool_item_header *ph;
 1014 
 1015         pool_quarantine_flush(pp);
 1016 
 1017         /* Remove from global pool list */
 1018         mutex_enter(&pool_head_lock);
 1019         while (pp->pr_refcnt != 0)
 1020                 cv_wait(&pool_busy, &pool_head_lock);
 1021         TAILQ_REMOVE(&pool_head, pp, pr_poollist);
 1022         if (drainpp == pp)
 1023                 drainpp = NULL;
 1024         mutex_exit(&pool_head_lock);
 1025 
 1026         /* Remove this pool from its allocator's list of pools. */
 1027         mutex_enter(&pp->pr_alloc->pa_lock);
 1028         TAILQ_REMOVE(&pp->pr_alloc->pa_list, pp, pr_alloc_list);
 1029         mutex_exit(&pp->pr_alloc->pa_lock);
 1030 
 1031         mutex_enter(&pool_allocator_lock);
 1032         if (--pp->pr_alloc->pa_refcnt == 0)
 1033                 mutex_destroy(&pp->pr_alloc->pa_lock);
 1034         mutex_exit(&pool_allocator_lock);
 1035 
 1036         mutex_enter(&pp->pr_lock);
 1037 
 1038         KASSERT(pp->pr_cache == NULL);
 1039         KASSERTMSG((pp->pr_nout == 0),
 1040             "%s: [%s] pool busy: still out: %u", __func__, pp->pr_wchan,
 1041             pp->pr_nout);
 1042         KASSERT(LIST_EMPTY(&pp->pr_fullpages));
 1043         KASSERT(LIST_EMPTY(&pp->pr_partpages));
 1044 
 1045         /* Remove all pages */
 1046         LIST_INIT(&pq);
 1047         while ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
 1048                 pr_rmpage(pp, ph, &pq);
 1049 
 1050         mutex_exit(&pp->pr_lock);
 1051 
 1052         pr_pagelist_free(pp, &pq);
 1053         cv_destroy(&pp->pr_cv);
 1054         mutex_destroy(&pp->pr_lock);
 1055 }
 1056 
 1057 void
 1058 pool_set_drain_hook(struct pool *pp, void (*fn)(void *, int), void *arg)
 1059 {
 1060 
 1061         /* XXX no locking -- must be used just after pool_init() */
 1062         KASSERTMSG((pp->pr_drain_hook == NULL),
 1063             "%s: [%s] already set", __func__, pp->pr_wchan);
 1064         pp->pr_drain_hook = fn;
 1065         pp->pr_drain_hook_arg = arg;
 1066 }
 1067 
 1068 static struct pool_item_header *
 1069 pool_alloc_item_header(struct pool *pp, void *storage, int flags)
 1070 {
 1071         struct pool_item_header *ph;
 1072 
 1073         if ((pp->pr_roflags & PR_PHINPAGE) != 0)
 1074                 ph = storage;
 1075         else
 1076                 ph = pool_get(pp->pr_phpool, flags);
 1077 
 1078         return ph;
 1079 }
 1080 
 1081 /*
 1082  * Grab an item from the pool.
 1083  */
 1084 void *
 1085 pool_get(struct pool *pp, int flags)
 1086 {
 1087         struct pool_item_header *ph;
 1088         void *v;
 1089 
 1090         KASSERT(!(flags & PR_NOWAIT) != !(flags & PR_WAITOK));
 1091         KASSERTMSG((pp->pr_itemsperpage != 0),
 1092             "%s: [%s] pr_itemsperpage is zero, "
 1093             "pool not initialized?", __func__, pp->pr_wchan);
 1094         KASSERTMSG((!(cpu_intr_p() || cpu_softintr_p())
 1095                 || pp->pr_ipl != IPL_NONE || cold || panicstr != NULL),
 1096             "%s: [%s] is IPL_NONE, but called from interrupt context",
 1097             __func__, pp->pr_wchan);
 1098         if (flags & PR_WAITOK) {
 1099                 ASSERT_SLEEPABLE();
 1100         }
 1101 
 1102         if (flags & PR_NOWAIT) {
 1103                 if (fault_inject())
 1104                         return NULL;
 1105         }
 1106 
 1107         mutex_enter(&pp->pr_lock);
 1108  startover:
 1109         /*
 1110          * Check to see if we've reached the hard limit.  If we have,
 1111          * and we can wait, then wait until an item has been returned to
 1112          * the pool.
 1113          */
 1114         KASSERTMSG((pp->pr_nout <= pp->pr_hardlimit),
 1115             "%s: %s: crossed hard limit", __func__, pp->pr_wchan);
 1116         if (__predict_false(pp->pr_nout == pp->pr_hardlimit)) {
 1117                 if (pp->pr_drain_hook != NULL) {
 1118                         /*
 1119                          * Since the drain hook is going to free things
 1120                          * back to the pool, unlock, call the hook, re-lock,
 1121                          * and check the hardlimit condition again.
 1122                          */
 1123                         mutex_exit(&pp->pr_lock);
 1124                         (*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags);
 1125                         mutex_enter(&pp->pr_lock);
 1126                         if (pp->pr_nout < pp->pr_hardlimit)
 1127                                 goto startover;
 1128                 }
 1129 
 1130                 if ((flags & PR_WAITOK) && !(flags & PR_LIMITFAIL)) {
 1131                         /*
 1132                          * XXX: A warning isn't logged in this case.  Should
 1133                          * it be?
 1134                          */
 1135                         pp->pr_flags |= PR_WANTED;
 1136                         do {
 1137                                 cv_wait(&pp->pr_cv, &pp->pr_lock);
 1138                         } while (pp->pr_flags & PR_WANTED);
 1139                         goto startover;
 1140                 }
 1141 
 1142                 /*
 1143                  * Log a message that the hard limit has been hit.
 1144                  */
 1145                 if (pp->pr_hardlimit_warning != NULL &&
 1146                     ratecheck(&pp->pr_hardlimit_warning_last,
 1147                               &pp->pr_hardlimit_ratecap))
 1148                         log(LOG_ERR, "%s\n", pp->pr_hardlimit_warning);
 1149 
 1150                 pp->pr_nfail++;
 1151 
 1152                 mutex_exit(&pp->pr_lock);
 1153                 KASSERT((flags & (PR_NOWAIT|PR_LIMITFAIL)) != 0);
 1154                 return NULL;
 1155         }
 1156 
 1157         /*
 1158          * The convention we use is that if `curpage' is not NULL, then
 1159          * it points at a non-empty bucket. In particular, `curpage'
 1160          * never points at a page header which has PR_PHINPAGE set and
 1161          * has no items in its bucket.
 1162          */
 1163         if ((ph = pp->pr_curpage) == NULL) {
 1164                 int error;
 1165 
 1166                 KASSERTMSG((pp->pr_nitems == 0),
 1167                     "%s: [%s] curpage NULL, inconsistent nitems %u",
 1168                     __func__, pp->pr_wchan, pp->pr_nitems);
 1169 
 1170                 /*
 1171                  * Call the back-end page allocator for more memory.
 1172                  * Release the pool lock, as the back-end page allocator
 1173                  * may block.
 1174                  */
 1175                 error = pool_grow(pp, flags);
 1176                 if (error != 0) {
 1177                         /*
 1178                          * pool_grow aborts when another thread
 1179                          * is allocating a new page. Retry if it
 1180                          * waited for it.
 1181                          */
 1182                         if (error == ERESTART)
 1183                                 goto startover;
 1184 
 1185                         /*
 1186                          * We were unable to allocate a page or item
 1187                          * header, but we released the lock during
 1188                          * allocation, so perhaps items were freed
 1189                          * back to the pool.  Check for this case.
 1190                          */
 1191                         if (pp->pr_curpage != NULL)
 1192                                 goto startover;
 1193 
 1194                         pp->pr_nfail++;
 1195                         mutex_exit(&pp->pr_lock);
 1196                         KASSERT((flags & (PR_NOWAIT|PR_LIMITFAIL)) != 0);
 1197                         return NULL;
 1198                 }
 1199 
 1200                 /* Start the allocation process over. */
 1201                 goto startover;
 1202         }
 1203         if (pp->pr_roflags & PR_USEBMAP) {
 1204                 KASSERTMSG((ph->ph_nmissing < pp->pr_itemsperpage),
 1205                     "%s: [%s] pool page empty", __func__, pp->pr_wchan);
 1206                 v = pr_item_bitmap_get(pp, ph);
 1207         } else {
 1208                 v = pr_item_linkedlist_get(pp, ph);
 1209         }
 1210         pp->pr_nitems--;
 1211         pp->pr_nout++;
 1212         if (ph->ph_nmissing == 0) {
 1213                 KASSERT(pp->pr_nidle > 0);
 1214                 pp->pr_nidle--;
 1215 
 1216                 /*
 1217                  * This page was previously empty.  Move it to the list of
 1218                  * partially-full pages.  This page is already curpage.
 1219                  */
 1220                 LIST_REMOVE(ph, ph_pagelist);
 1221                 LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
 1222         }
 1223         ph->ph_nmissing++;
 1224         if (ph->ph_nmissing == pp->pr_itemsperpage) {
 1225                 KASSERTMSG(((pp->pr_roflags & PR_USEBMAP) ||
 1226                         LIST_EMPTY(&ph->ph_itemlist)),
 1227                     "%s: [%s] nmissing (%u) inconsistent", __func__,
 1228                         pp->pr_wchan, ph->ph_nmissing);
 1229                 /*
 1230                  * This page is now full.  Move it to the full list
 1231                  * and select a new current page.
 1232                  */
 1233                 LIST_REMOVE(ph, ph_pagelist);
 1234                 LIST_INSERT_HEAD(&pp->pr_fullpages, ph, ph_pagelist);
 1235                 pool_update_curpage(pp);
 1236         }
 1237 
 1238         pp->pr_nget++;
 1239 
 1240         /*
 1241          * If we have a low water mark and we are now below that low
 1242          * water mark, add more items to the pool.
 1243          */
 1244         if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
 1245                 /*
 1246                  * XXX: Should we log a warning?  Should we set up a timeout
 1247                  * to try again in a second or so?  The latter could break
 1248                  * a caller's assumptions about interrupt protection, etc.
 1249                  */
 1250         }
 1251 
 1252         mutex_exit(&pp->pr_lock);
 1253         KASSERT((((vaddr_t)v) & (pp->pr_align - 1)) == 0);
 1254         FREECHECK_OUT(&pp->pr_freecheck, v);
 1255         pool_redzone_fill(pp, v);
 1256         pool_get_kmsan(pp, v);
 1257         if (flags & PR_ZERO)
 1258                 memset(v, 0, pp->pr_reqsize);
 1259         return v;
 1260 }
 1261 
 1262 /*
 1263  * Internal version of pool_put().  Pool is already locked/entered.
 1264  */
 1265 static void
 1266 pool_do_put(struct pool *pp, void *v, struct pool_pagelist *pq)
 1267 {
 1268         struct pool_item_header *ph;
 1269 
 1270         KASSERT(mutex_owned(&pp->pr_lock));
 1271         pool_redzone_check(pp, v);
 1272         pool_put_kmsan(pp, v);
 1273         FREECHECK_IN(&pp->pr_freecheck, v);
 1274         LOCKDEBUG_MEM_CHECK(v, pp->pr_size);
 1275 
 1276         KASSERTMSG((pp->pr_nout > 0),
 1277             "%s: [%s] putting with none out", __func__, pp->pr_wchan);
 1278 
 1279         if (__predict_false((ph = pr_find_pagehead(pp, v)) == NULL)) {
 1280                 panic("%s: [%s] page header missing", __func__,  pp->pr_wchan);
 1281         }
 1282 
 1283         /*
 1284          * Return to item list.
 1285          */
 1286         if (pp->pr_roflags & PR_USEBMAP) {
 1287                 pr_item_bitmap_put(pp, ph, v);
 1288         } else {
 1289                 pr_item_linkedlist_put(pp, ph, v);
 1290         }
 1291         KDASSERT(ph->ph_nmissing != 0);
 1292         ph->ph_nmissing--;
 1293         pp->pr_nput++;
 1294         pp->pr_nitems++;
 1295         pp->pr_nout--;
 1296 
 1297         /* Cancel "pool empty" condition if it exists */
 1298         if (pp->pr_curpage == NULL)
 1299                 pp->pr_curpage = ph;
 1300 
 1301         if (pp->pr_flags & PR_WANTED) {
 1302                 pp->pr_flags &= ~PR_WANTED;
 1303                 cv_broadcast(&pp->pr_cv);
 1304         }
 1305 
 1306         /*
 1307          * If this page is now empty, do one of two things:
 1308          *
 1309          *      (1) If we have more pages than the page high water mark,
 1310          *          free the page back to the system.  ONLY CONSIDER
 1311          *          FREEING BACK A PAGE IF WE HAVE MORE THAN OUR MINIMUM PAGE
 1312          *          CLAIM.
 1313          *
 1314          *      (2) Otherwise, move the page to the empty page list.
 1315          *
 1316          * Either way, select a new current page (so we use a partially-full
 1317          * page if one is available).
 1318          */
 1319         if (ph->ph_nmissing == 0) {
 1320                 pp->pr_nidle++;
 1321                 if (pp->pr_nitems - pp->pr_itemsperpage >= pp->pr_minitems &&
 1322                     pp->pr_npages > pp->pr_minpages &&
 1323                     pp->pr_npages > pp->pr_maxpages) {
 1324                         pr_rmpage(pp, ph, pq);
 1325                 } else {
 1326                         LIST_REMOVE(ph, ph_pagelist);
 1327                         LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
 1328 
 1329                         /*
 1330                          * Update the timestamp on the page.  A page must
 1331                          * be idle for some period of time before it can
 1332                          * be reclaimed by the pagedaemon.  This minimizes
 1333                          * ping-pong'ing for memory.
 1334                          *
 1335                          * note for 64-bit time_t: truncating to 32-bit is not
 1336                          * a problem for our usage.
 1337                          */
 1338                         ph->ph_time = time_uptime;
 1339                 }
 1340                 pool_update_curpage(pp);
 1341         }
 1342 
 1343         /*
 1344          * If the page was previously completely full, move it to the
 1345          * partially-full list and make it the current page.  The next
 1346          * allocation will get the item from this page, instead of
 1347          * further fragmenting the pool.
 1348          */
 1349         else if (ph->ph_nmissing == (pp->pr_itemsperpage - 1)) {
 1350                 LIST_REMOVE(ph, ph_pagelist);
 1351                 LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
 1352                 pp->pr_curpage = ph;
 1353         }
 1354 }
 1355 
 1356 void
 1357 pool_put(struct pool *pp, void *v)
 1358 {
 1359         struct pool_pagelist pq;
 1360 
 1361         LIST_INIT(&pq);
 1362 
 1363         mutex_enter(&pp->pr_lock);
 1364         if (!pool_put_quarantine(pp, v, &pq)) {
 1365                 pool_do_put(pp, v, &pq);
 1366         }
 1367         mutex_exit(&pp->pr_lock);
 1368 
 1369         pr_pagelist_free(pp, &pq);
 1370 }
 1371 
 1372 /*
 1373  * pool_grow: grow a pool by a page.
 1374  *
 1375  * => called with pool locked.
 1376  * => unlock and relock the pool.
 1377  * => return with pool locked.
 1378  */
 1379 
 1380 static int
 1381 pool_grow(struct pool *pp, int flags)
 1382 {
 1383         struct pool_item_header *ph;
 1384         char *storage;
 1385 
 1386         /*
 1387          * If there's a pool_grow in progress, wait for it to complete
 1388          * and try again from the top.
 1389          */
 1390         if (pp->pr_flags & PR_GROWING) {
 1391                 if (flags & PR_WAITOK) {
 1392                         do {
 1393                                 cv_wait(&pp->pr_cv, &pp->pr_lock);
 1394                         } while (pp->pr_flags & PR_GROWING);
 1395                         return ERESTART;
 1396                 } else {
 1397                         if (pp->pr_flags & PR_GROWINGNOWAIT) {
 1398                                 /*
 1399                                  * This needs an unlock/relock dance so
 1400                                  * that the other caller has a chance to
 1401                                  * run and actually complete the grow.  Note
 1402                                  * that this is effectively a busy-wait.
 1403                                  */
 1404                                 mutex_exit(&pp->pr_lock);
 1405                                 mutex_enter(&pp->pr_lock);
 1406                                 return ERESTART;
 1407                         }
 1408                         return EWOULDBLOCK;
 1409                 }
 1410         }
 1411         pp->pr_flags |= PR_GROWING;
 1412         if (flags & PR_WAITOK)
 1413                 mutex_exit(&pp->pr_lock);
 1414         else
 1415                 pp->pr_flags |= PR_GROWINGNOWAIT;
 1416 
 1417         storage = pool_allocator_alloc(pp, flags);
 1418         if (__predict_false(storage == NULL))
 1419                 goto out;
 1420 
 1421         ph = pool_alloc_item_header(pp, storage, flags);
 1422         if (__predict_false(ph == NULL)) {
 1423                 pool_allocator_free(pp, storage);
 1424                 goto out;
 1425         }
 1426 
 1427         if (flags & PR_WAITOK)
 1428                 mutex_enter(&pp->pr_lock);
 1429         pool_prime_page(pp, storage, ph);
 1430         pp->pr_npagealloc++;
 1431         KASSERT(pp->pr_flags & PR_GROWING);
 1432         pp->pr_flags &= ~(PR_GROWING|PR_GROWINGNOWAIT);
 1433         /*
 1434          * If anyone was waiting for pool_grow, notify them that we
 1435          * may have just done it.
 1436          */
 1437         cv_broadcast(&pp->pr_cv);
 1438         return 0;
 1439 out:
 1440         if (flags & PR_WAITOK)
 1441                 mutex_enter(&pp->pr_lock);
 1442         KASSERT(pp->pr_flags & PR_GROWING);
 1443         pp->pr_flags &= ~(PR_GROWING|PR_GROWINGNOWAIT);
 1444         return ENOMEM;
 1445 }
 1446 
 1447 void
 1448 pool_prime(struct pool *pp, int n)
 1449 {
 1450 
 1451         mutex_enter(&pp->pr_lock);
 1452         pp->pr_minpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
 1453         if (pp->pr_maxpages <= pp->pr_minpages)
 1454                 pp->pr_maxpages = pp->pr_minpages + 1;  /* XXX */
 1455         while (pp->pr_npages < pp->pr_minpages)
 1456                 (void) pool_grow(pp, PR_WAITOK);
 1457         mutex_exit(&pp->pr_lock);
 1458 }
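
/*
 * Worked example for the computation above (hypothetical numbers): with
 * pr_itemsperpage == 8, pool_prime(pp, 20) sets pr_minpages to
 * roundup(20, 8) / 8 == 24 / 8 == 3, and pool_grow() is then called until
 * at least three pages are allocated, guaranteeing room for the requested
 * 20 items.
 */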
 1459 
 1460 /*
 1461  * Add a page worth of items to the pool.
 1462  *
 1463  * Note, we must be called with the pool descriptor LOCKED.
 1464  */
 1465 static void
 1466 pool_prime_page(struct pool *pp, void *storage, struct pool_item_header *ph)
 1467 {
 1468         const unsigned int align = pp->pr_align;
 1469         struct pool_item *pi;
 1470         void *cp = storage;
 1471         int n;
 1472 
 1473         KASSERT(mutex_owned(&pp->pr_lock));
 1474         KASSERTMSG(((pp->pr_roflags & PR_NOALIGN) ||
 1475                 (((uintptr_t)cp & (pp->pr_alloc->pa_pagesz - 1)) == 0)),
 1476             "%s: [%s] unaligned page: %p", __func__, pp->pr_wchan, cp);
 1477 
 1478         /*
 1479          * Insert page header.
 1480          */
 1481         LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
 1482         LIST_INIT(&ph->ph_itemlist);
 1483         ph->ph_page = storage;
 1484         ph->ph_nmissing = 0;
 1485         ph->ph_time = time_uptime;
 1486         if (pp->pr_roflags & PR_PHINPAGE)
 1487                 ph->ph_poolid = pp->pr_poolid;
 1488         else
 1489                 SPLAY_INSERT(phtree, &pp->pr_phtree, ph);
 1490 
 1491         pp->pr_nidle++;
 1492 
 1493         /*
 1494          * The item space starts after the on-page header, if any.
 1495          */
 1496         ph->ph_off = pp->pr_itemoffset;
 1497 
 1498         /*
 1499          * Color this page.
 1500          */
 1501         ph->ph_off += pp->pr_curcolor;
 1502         cp = (char *)cp + ph->ph_off;
 1503         if ((pp->pr_curcolor += align) > pp->pr_maxcolor)
 1504                 pp->pr_curcolor = 0;
 1505 
 1506         KASSERT((((vaddr_t)cp) & (align - 1)) == 0);
 1507 
 1508         /*
 1509          * Insert the remaining chunks on the page's item list.
 1510          */
 1511         n = pp->pr_itemsperpage;
 1512         pp->pr_nitems += n;
 1513 
 1514         if (pp->pr_roflags & PR_USEBMAP) {
 1515                 pr_item_bitmap_init(pp, ph);
 1516         } else {
 1517                 while (n--) {
 1518                         pi = (struct pool_item *)cp;
 1519 
 1520                         KASSERT((((vaddr_t)pi) & (align - 1)) == 0);
 1521 
 1522                         /* Insert on page list */
 1523                         LIST_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
 1524 #ifdef POOL_CHECK_MAGIC
 1525                         pi->pi_magic = PI_MAGIC;
 1526 #endif
 1527                         cp = (char *)cp + pp->pr_size;
 1528 
 1529                         KASSERT((((vaddr_t)cp) & (align - 1)) == 0);
 1530                 }
 1531         }
 1532 
 1533         /*
 1534          * If the pool was depleted, point at the new page.
 1535          */
 1536         if (pp->pr_curpage == NULL)
 1537                 pp->pr_curpage = ph;
 1538 
 1539         if (++pp->pr_npages > pp->pr_hiwat)
 1540                 pp->pr_hiwat = pp->pr_npages;
 1541 }
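
/*
 * Example of the page colouring above (hypothetical numbers): with
 * pr_align == 32 and pr_maxcolor == 96, successive pages start their item
 * area at offsets 0, 32, 64 and 96 past the header before the colour
 * wraps back to 0, so items on different pages do not all compete for the
 * same cache lines.
 */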
 1542 
 1543 /*
 1544  * Used by pool_get() when nitems drops below the low water mark: grow the
 1545  * pool until pr_nitems catches up with the low water mark.
 1546  *
 1547  * Note 1: we never wait for memory here; we let the caller decide what to do.
 1548  *
 1549  * Note 2: we must be called with the pool already locked, and we return
 1550  * with it locked.
 1551  */
 1552 static int
 1553 pool_catchup(struct pool *pp)
 1554 {
 1555         int error = 0;
 1556 
 1557         while (POOL_NEEDS_CATCHUP(pp)) {
 1558                 error = pool_grow(pp, PR_NOWAIT);
 1559                 if (error) {
 1560                         if (error == ERESTART)
 1561                                 continue;
 1562                         break;
 1563                 }
 1564         }
 1565         return error;
 1566 }
 1567 
 1568 static void
 1569 pool_update_curpage(struct pool *pp)
 1570 {
 1571 
 1572         pp->pr_curpage = LIST_FIRST(&pp->pr_partpages);
 1573         if (pp->pr_curpage == NULL) {
 1574                 pp->pr_curpage = LIST_FIRST(&pp->pr_emptypages);
 1575         }
 1576         KASSERT((pp->pr_curpage == NULL && pp->pr_nitems == 0) ||
 1577             (pp->pr_curpage != NULL && pp->pr_nitems > 0));
 1578 }
 1579 
 1580 void
 1581 pool_setlowat(struct pool *pp, int n)
 1582 {
 1583 
 1584         mutex_enter(&pp->pr_lock);
 1585         pp->pr_minitems = n;
 1586 
 1587         /* Make sure we're caught up with the newly-set low water mark. */
 1588         if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
 1589                 /*
 1590                  * XXX: Should we log a warning?  Should we set up a timeout
 1591                  * to try again in a second or so?  The latter could break
 1592                  * a caller's assumptions about interrupt protection, etc.
 1593                  */
 1594         }
 1595 
 1596         mutex_exit(&pp->pr_lock);
 1597 }
 1598 
 1599 void
 1600 pool_sethiwat(struct pool *pp, int n)
 1601 {
 1602 
 1603         mutex_enter(&pp->pr_lock);
 1604 
 1605         pp->pr_maxitems = n;
 1606 
 1607         mutex_exit(&pp->pr_lock);
 1608 }
 1609 
 1610 void
 1611 pool_sethardlimit(struct pool *pp, int n, const char *warnmess, int ratecap)
 1612 {
 1613 
 1614         mutex_enter(&pp->pr_lock);
 1615 
 1616         pp->pr_hardlimit = n;
 1617         pp->pr_hardlimit_warning = warnmess;
 1618         pp->pr_hardlimit_ratecap.tv_sec = ratecap;
 1619         pp->pr_hardlimit_warning_last.tv_sec = 0;
 1620         pp->pr_hardlimit_warning_last.tv_usec = 0;
 1621 
 1622         pp->pr_maxpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
 1623 
 1624         mutex_exit(&pp->pr_lock);
 1625 }
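
/*
 * A short sketch of how the three knobs above are typically combined,
 * reusing the hypothetical foo_pool from the earlier example: raise the
 * low water mark to 16 items, set the high water mark to 256, and refuse
 * allocations beyond 1024 items, with the warning printed at most once
 * every 60 seconds.
 */
#if 0
static void
foo_tune(void)
{
	pool_setlowat(&foo_pool, 16);
	pool_sethiwat(&foo_pool, 256);
	pool_sethardlimit(&foo_pool, 1024,
	    "WARNING: foo_pool hard limit reached", 60);
}
#endif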
 1626 
 1627 unsigned int
 1628 pool_nget(struct pool *pp)
 1629 {
 1630 
 1631         return pp->pr_nget;
 1632 }
 1633 
 1634 unsigned int
 1635 pool_nput(struct pool *pp)
 1636 {
 1637 
 1638         return pp->pr_nput;
 1639 }
 1640 
 1641 /*
 1642  * Release all complete pages that have not been used recently.
 1643  *
 1644  * Must not be called from interrupt context.
 1645  */
 1646 int
 1647 pool_reclaim(struct pool *pp)
 1648 {
 1649         struct pool_item_header *ph, *phnext;
 1650         struct pool_pagelist pq;
 1651         struct pool_cache *pc;
 1652         uint32_t curtime;
 1653         bool klock;
 1654         int rv;
 1655 
 1656         KASSERT(!cpu_intr_p() && !cpu_softintr_p());
 1657 
 1658         if (pp->pr_drain_hook != NULL) {
 1659                 /*
 1660                  * The drain hook must be called with the pool unlocked.
 1661                  */
 1662                 (*pp->pr_drain_hook)(pp->pr_drain_hook_arg, PR_NOWAIT);
 1663         }
 1664 
 1665         /*
 1666          * XXXSMP Take the kernel lock so that we do not cause
 1667          * non-MPSAFE code to block.
 1668          */
 1669         if (pp->pr_ipl == IPL_SOFTNET || pp->pr_ipl == IPL_SOFTCLOCK ||
 1670             pp->pr_ipl == IPL_SOFTSERIAL) {
 1671                 KERNEL_LOCK(1, NULL);
 1672                 klock = true;
 1673         } else
 1674                 klock = false;
 1675 
 1676         /* Reclaim items from the pool's cache (if any). */
 1677         if ((pc = atomic_load_consume(&pp->pr_cache)) != NULL)
 1678                 pool_cache_invalidate(pc);
 1679 
 1680         if (mutex_tryenter(&pp->pr_lock) == 0) {
 1681                 if (klock) {
 1682                         KERNEL_UNLOCK_ONE(NULL);
 1683                 }
 1684                 return 0;
 1685         }
 1686 
 1687         LIST_INIT(&pq);
 1688 
 1689         curtime = time_uptime;
 1690 
 1691         for (ph = LIST_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) {
 1692                 phnext = LIST_NEXT(ph, ph_pagelist);
 1693 
 1694                 /* Check our minimum page claim */
 1695                 if (pp->pr_npages <= pp->pr_minpages)
 1696                         break;
 1697 
 1698                 KASSERT(ph->ph_nmissing == 0);
 1699                 if (curtime - ph->ph_time < pool_inactive_time)
 1700                         continue;
 1701 
 1702                 /*
 1703                  * If freeing this page would put us below the minimum free items
 1704                  * or the minimum pages, stop now.
 1705                  */
 1706                 if (pp->pr_nitems - pp->pr_itemsperpage < pp->pr_minitems ||
 1707                     pp->pr_npages - 1 < pp->pr_minpages)
 1708                         break;
 1709 
 1710                 pr_rmpage(pp, ph, &pq);
 1711         }
 1712 
 1713         mutex_exit(&pp->pr_lock);
 1714 
 1715         if (LIST_EMPTY(&pq))
 1716                 rv = 0;
 1717         else {
 1718                 pr_pagelist_free(pp, &pq);
 1719                 rv = 1;
 1720         }
 1721 
 1722         if (klock) {
 1723                 KERNEL_UNLOCK_ONE(NULL);
 1724         }
 1725 
 1726         return rv;
 1727 }
 1728 
 1729 /*
 1730  * Drain pools, one at a time.  The drained pool is returned in *ppp.
 1731  *
 1732  * Note: must never be called from interrupt context.
 1733  */
 1734 bool
 1735 pool_drain(struct pool **ppp)
 1736 {
 1737         bool reclaimed;
 1738         struct pool *pp;
 1739 
 1740         KASSERT(!TAILQ_EMPTY(&pool_head));
 1741 
 1742         pp = NULL;
 1743 
 1744         /* Find next pool to drain, and add a reference. */
 1745         mutex_enter(&pool_head_lock);
 1746         do {
 1747                 if (drainpp == NULL) {
 1748                         drainpp = TAILQ_FIRST(&pool_head);
 1749                 }
 1750                 if (drainpp != NULL) {
 1751                         pp = drainpp;
 1752                         drainpp = TAILQ_NEXT(pp, pr_poollist);
 1753                 }
 1754                 /*
 1755                  * Skip completely idle pools.  We depend on at least
 1756                  * one pool in the system being active.
 1757                  */
 1758         } while (pp == NULL || pp->pr_npages == 0);
 1759         pp->pr_refcnt++;
 1760         mutex_exit(&pool_head_lock);
 1761 
 1762         /* Drain the cache (if any) and the pool. */
 1763         reclaimed = pool_reclaim(pp);
 1764 
 1765         /* Finally, unlock the pool. */
 1766         mutex_enter(&pool_head_lock);
 1767         pp->pr_refcnt--;
 1768         cv_broadcast(&pool_busy);
 1769         mutex_exit(&pool_head_lock);
 1770 
 1771         if (ppp != NULL)
 1772                 *ppp = pp;
 1773 
 1774         return reclaimed;
 1775 }
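
/*
 * Hedged sketch of a pagedaemon-style caller of pool_drain(): try one
 * pool per iteration until something was reclaimed or a caller-chosen
 * bound ("max_pools" here, purely hypothetical) is reached.
 */
#if 0
static void
example_drain_pass(u_int max_pools)
{
	struct pool *pp;
	u_int i;

	for (i = 0; i < max_pools; i++) {
		if (pool_drain(&pp))
			break;	/* pool_reclaim() released at least a page */
	}
}
#endif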
 1776 
 1777 /*
 1778  * Calculate the total number of pages consumed by pools.
 1779  */
 1780 int
 1781 pool_totalpages(void)
 1782 {
 1783 
 1784         mutex_enter(&pool_head_lock);
 1785         int pages = pool_totalpages_locked();
 1786         mutex_exit(&pool_head_lock);
 1787 
 1788         return pages;
 1789 }
 1790 
 1791 int
 1792 pool_totalpages_locked(void)
 1793 {
 1794         struct pool *pp;
 1795         uint64_t total = 0;
 1796 
 1797         TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
 1798                 uint64_t bytes =
 1799                     (uint64_t)pp->pr_npages * pp->pr_alloc->pa_pagesz;
 1800 
 1801                 if ((pp->pr_roflags & PR_RECURSIVE) != 0)
 1802                         bytes -= ((uint64_t)pp->pr_nout * pp->pr_size);
 1803                 total += bytes;
 1804         }
 1805 
 1806         return atop(total);
 1807 }
 1808 
 1809 /*
 1810  * Diagnostic helpers.
 1811  */
 1812 
 1813 void
 1814 pool_printall(const char *modif, void (*pr)(const char *, ...))
 1815 {
 1816         struct pool *pp;
 1817 
 1818         TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
 1819                 pool_printit(pp, modif, pr);
 1820         }
 1821 }
 1822 
 1823 void
 1824 pool_printit(struct pool *pp, const char *modif, void (*pr)(const char *, ...))
 1825 {
 1826 
 1827         if (pp == NULL) {
 1828                 (*pr)("Must specify a pool to print.\n");
 1829                 return;
 1830         }
 1831 
 1832         pool_print1(pp, modif, pr);
 1833 }
 1834 
 1835 static void
 1836 pool_print_pagelist(struct pool *pp, struct pool_pagelist *pl,
 1837     void (*pr)(const char *, ...))
 1838 {
 1839         struct pool_item_header *ph;
 1840 
 1841         LIST_FOREACH(ph, pl, ph_pagelist) {
 1842                 (*pr)("\t\tpage %p, nmissing %d, time %" PRIu32 "\n",
 1843                     ph->ph_page, ph->ph_nmissing, ph->ph_time);
 1844 #ifdef POOL_CHECK_MAGIC
 1845                 struct pool_item *pi;
 1846                 if (!(pp->pr_roflags & PR_USEBMAP)) {
 1847                         LIST_FOREACH(pi, &ph->ph_itemlist, pi_list) {
 1848                                 if (pi->pi_magic != PI_MAGIC) {
 1849                                         (*pr)("\t\t\titem %p, magic 0x%x\n",
 1850                                             pi, pi->pi_magic);
 1851                                 }
 1852                         }
 1853                 }
 1854 #endif
 1855         }
 1856 }
 1857 
 1858 static void
 1859 pool_print1(struct pool *pp, const char *modif, void (*pr)(const char *, ...))
 1860 {
 1861         struct pool_item_header *ph;
 1862         pool_cache_t pc;
 1863         pcg_t *pcg;
 1864         pool_cache_cpu_t *cc;
 1865         uint64_t cpuhit, cpumiss, pchit, pcmiss;
 1866         uint32_t nfull;
 1867         int i;
 1868         bool print_log = false, print_pagelist = false, print_cache = false;
 1869         bool print_short = false, skip_empty = false;
 1870         char c;
 1871 
 1872         while ((c = *modif++) != '\0') {
 1873                 if (c == 'l')
 1874                         print_log = true;
 1875                 if (c == 'p')
 1876                         print_pagelist = true;
 1877                 if (c == 'c')
 1878                         print_cache = true;
 1879                 if (c == 's')
 1880                         print_short = true;
 1881                 if (c == 'S')
 1882                         skip_empty = true;
 1883         }
 1884 
 1885         if (skip_empty && pp->pr_nget == 0)
 1886                 return;
 1887 
 1888         if ((pc = atomic_load_consume(&pp->pr_cache)) != NULL) {
 1889                 (*pr)("POOLCACHE");
 1890         } else {
 1891                 (*pr)("POOL");
 1892         }
 1893 
 1894         /* Single line output. */
 1895         if (print_short) {
 1896                 (*pr)(" %s:%p:%u:%u:%u:%u:%u:%u:%u:%u:%u:%u\n",
 1897                     pp->pr_wchan, pp, pp->pr_size, pp->pr_align, pp->pr_npages,
 1898                     pp->pr_nitems, pp->pr_nout, pp->pr_nget, pp->pr_nput,
 1899                     pp->pr_npagealloc, pp->pr_npagefree, pp->pr_nidle);
 1900                 return;
 1901         }
 1902 
 1903         (*pr)(" %s: size %u, align %u, ioff %u, roflags 0x%08x\n",
 1904             pp->pr_wchan, pp->pr_size, pp->pr_align, pp->pr_itemoffset,
 1905             pp->pr_roflags);
 1906         (*pr)("\tpool %p, alloc %p\n", pp, pp->pr_alloc);
 1907         (*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n",
 1908             pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages);
 1909         (*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n",
 1910             pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit);
 1911 
 1912         (*pr)("\tnget %lu, nfail %lu, nput %lu\n",
 1913             pp->pr_nget, pp->pr_nfail, pp->pr_nput);
 1914         (*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n",
 1915             pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle);
 1916 
 1917         if (!print_pagelist)
 1918                 goto skip_pagelist;
 1919 
 1920         if ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
 1921                 (*pr)("\n\tempty page list:\n");
 1922         pool_print_pagelist(pp, &pp->pr_emptypages, pr);
 1923         if ((ph = LIST_FIRST(&pp->pr_fullpages)) != NULL)
 1924                 (*pr)("\n\tfull page list:\n");
 1925         pool_print_pagelist(pp, &pp->pr_fullpages, pr);
 1926         if ((ph = LIST_FIRST(&pp->pr_partpages)) != NULL)
 1927                 (*pr)("\n\tpartial-page list:\n");
 1928         pool_print_pagelist(pp, &pp->pr_partpages, pr);
 1929 
 1930         if (pp->pr_curpage == NULL)
 1931                 (*pr)("\tno current page\n");
 1932         else
 1933                 (*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page);
 1934 
 1935  skip_pagelist:
 1936         if (print_log)
 1937                 goto skip_log;
 1938 
 1939         (*pr)("\n");
 1940 
 1941  skip_log:
 1942 
 1943 #define PR_GROUPLIST(pcg)                                               \
 1944         (*pr)("\t\tgroup %p: avail %d\n", pcg, pcg->pcg_avail);         \
 1945         for (i = 0; i < pcg->pcg_size; i++) {                           \
 1946                 if (pcg->pcg_objects[i].pcgo_pa !=                      \
 1947                     POOL_PADDR_INVALID) {                               \
 1948                         (*pr)("\t\t\t%p, 0x%llx\n",                     \
 1949                             pcg->pcg_objects[i].pcgo_va,                \
 1950                             (unsigned long long)                        \
 1951                             pcg->pcg_objects[i].pcgo_pa);               \
 1952                 } else {                                                \
 1953                         (*pr)("\t\t\t%p\n",                             \
 1954                             pcg->pcg_objects[i].pcgo_va);               \
 1955                 }                                                       \
 1956         }
 1957 
 1958         if (pc != NULL) {
 1959                 cpuhit = 0;
 1960                 cpumiss = 0;
 1961                 pcmiss = 0;
 1962                 nfull = 0;
 1963                 for (i = 0; i < __arraycount(pc->pc_cpus); i++) {
 1964                         if ((cc = pc->pc_cpus[i]) == NULL)
 1965                                 continue;
 1966                         cpuhit += cc->cc_hits;
 1967                         cpumiss += cc->cc_misses;
 1968                         pcmiss += cc->cc_pcmisses;
 1969                         nfull += cc->cc_nfull;
 1970                 }
 1971                 pchit = cpumiss - pcmiss;
 1972                 (*pr)("\tcpu layer hits %llu misses %llu\n", cpuhit, cpumiss);
 1973                 (*pr)("\tcache layer hits %llu misses %llu\n", pchit, pcmiss);
 1974                 (*pr)("\tcache layer full groups %u\n", nfull);
 1975                 if (print_cache) {
 1976                         (*pr)("\tfull cache groups:\n");
 1977                         for (pcg = pc->pc_fullgroups; pcg != NULL;
 1978                             pcg = pcg->pcg_next) {
 1979                                 PR_GROUPLIST(pcg);
 1980                         }
 1981                 }
 1982         }
 1983 #undef PR_GROUPLIST
 1984 }
 1985 
 1986 static int
 1987 pool_chk_page(struct pool *pp, const char *label, struct pool_item_header *ph)
 1988 {
 1989         struct pool_item *pi;
 1990         void *page;
 1991         int n;
 1992 
 1993         if ((pp->pr_roflags & PR_NOALIGN) == 0) {
 1994                 page = POOL_OBJ_TO_PAGE(pp, ph);
 1995                 if (page != ph->ph_page &&
 1996                     (pp->pr_roflags & PR_PHINPAGE) != 0) {
 1997                         if (label != NULL)
 1998                                 printf("%s: ", label);
 1999                         printf("pool(%p:%s): page inconsistency: page %p;"
 2000                                " at page head addr %p (p %p)\n", pp,
 2001                                 pp->pr_wchan, ph->ph_page,
 2002                                 ph, page);
 2003                         return 1;
 2004                 }
 2005         }
 2006 
 2007         if ((pp->pr_roflags & PR_USEBMAP) != 0)
 2008                 return 0;
 2009 
 2010         for (pi = LIST_FIRST(&ph->ph_itemlist), n = 0;
 2011              pi != NULL;
 2012              pi = LIST_NEXT(pi,pi_list), n++) {
 2013 
 2014 #ifdef POOL_CHECK_MAGIC
 2015                 if (pi->pi_magic != PI_MAGIC) {
 2016                         if (label != NULL)
 2017                                 printf("%s: ", label);
 2018                         printf("pool(%s): free list modified: magic=%x;"
 2019                                " page %p; item ordinal %d; addr %p\n",
 2020                                 pp->pr_wchan, pi->pi_magic, ph->ph_page,
 2021                                 n, pi);
 2022                         panic("pool");
 2023                 }
 2024 #endif
 2025                 if ((pp->pr_roflags & PR_NOALIGN) != 0) {
 2026                         continue;
 2027                 }
 2028                 page = POOL_OBJ_TO_PAGE(pp, pi);
 2029                 if (page == ph->ph_page)
 2030                         continue;
 2031 
 2032                 if (label != NULL)
 2033                         printf("%s: ", label);
 2034                 printf("pool(%p:%s): page inconsistency: page %p;"
 2035                        " item ordinal %d; addr %p (p %p)\n", pp,
 2036                         pp->pr_wchan, ph->ph_page,
 2037                         n, pi, page);
 2038                 return 1;
 2039         }
 2040         return 0;
 2041 }
 2042 
 2043 
 2044 int
 2045 pool_chk(struct pool *pp, const char *label)
 2046 {
 2047         struct pool_item_header *ph;
 2048         int r = 0;
 2049 
 2050         mutex_enter(&pp->pr_lock);
 2051         LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) {
 2052                 r = pool_chk_page(pp, label, ph);
 2053                 if (r) {
 2054                         goto out;
 2055                 }
 2056         }
 2057         LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) {
 2058                 r = pool_chk_page(pp, label, ph);
 2059                 if (r) {
 2060                         goto out;
 2061                 }
 2062         }
 2063         LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) {
 2064                 r = pool_chk_page(pp, label, ph);
 2065                 if (r) {
 2066                         goto out;
 2067                 }
 2068         }
 2069 
 2070 out:
 2071         mutex_exit(&pp->pr_lock);
 2072         return r;
 2073 }
 2074 
 2075 /*
 2076  * pool_cache_init:
 2077  *
 2078  *      Initialize a pool cache.
 2079  */
 2080 pool_cache_t
 2081 pool_cache_init(size_t size, u_int align, u_int align_offset, u_int flags,
 2082     const char *wchan, struct pool_allocator *palloc, int ipl,
 2083     int (*ctor)(void *, void *, int), void (*dtor)(void *, void *), void *arg)
 2084 {
 2085         pool_cache_t pc;
 2086 
 2087         pc = pool_get(&cache_pool, PR_WAITOK);
 2088         if (pc == NULL)
 2089                 return NULL;
 2090 
 2091         pool_cache_bootstrap(pc, size, align, align_offset, flags, wchan,
 2092            palloc, ipl, ctor, dtor, arg);
 2093 
 2094         return pc;
 2095 }
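
/*
 * A minimal sketch of pool_cache_init() usage, assuming a hypothetical
 * "struct foo" with foo_ctor()/foo_dtor() callbacks; the default
 * allocator (palloc == NULL) and IPL_NONE are used.  None of these names
 * exist in this file.
 */
#if 0
static pool_cache_t foo_cache;

static int
foo_ctor(void *arg, void *obj, int flags)
{
	/* Construct a foo; must return 0 on success. */
	memset(obj, 0, sizeof(struct foo));
	return 0;
}

static void
foo_dtor(void *arg, void *obj)
{
	/* Undo whatever foo_ctor() set up. */
}

void
fooinit(void)
{
	foo_cache = pool_cache_init(sizeof(struct foo), 0, 0, 0,
	    "foocache", NULL, IPL_NONE, foo_ctor, foo_dtor, NULL);
}
#endif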
 2096 
 2097 /*
 2098  * pool_cache_bootstrap:
 2099  *
 2100  *      Kernel-private version of pool_cache_init().  The caller
 2101  *      provides initial storage.
 2102  */
 2103 void
 2104 pool_cache_bootstrap(pool_cache_t pc, size_t size, u_int align,
 2105     u_int align_offset, u_int flags, const char *wchan,
 2106     struct pool_allocator *palloc, int ipl,
 2107     int (*ctor)(void *, void *, int), void (*dtor)(void *, void *),
 2108     void *arg)
 2109 {
 2110         CPU_INFO_ITERATOR cii;
 2111         pool_cache_t pc1;
 2112         struct cpu_info *ci;
 2113         struct pool *pp;
 2114         unsigned int ppflags;
 2115 
 2116         pp = &pc->pc_pool;
 2117         if (palloc == NULL && ipl == IPL_NONE) {
 2118                 if (size > PAGE_SIZE) {
 2119                         int bigidx = pool_bigidx(size);
 2120 
 2121                         palloc = &pool_allocator_big[bigidx];
 2122                         flags |= PR_NOALIGN;
 2123                 } else
 2124                         palloc = &pool_allocator_nointr;
 2125         }
 2126 
 2127         ppflags = flags;
 2128         if (ctor == NULL) {
 2129                 ctor = NO_CTOR;
 2130         }
 2131         if (dtor == NULL) {
 2132                 dtor = NO_DTOR;
 2133         } else {
 2134                 /*
 2135                  * If we have a destructor, then the pool layer does not
 2136                  * need to worry about PR_PSERIALIZE.
 2137                  */
 2138                 ppflags &= ~PR_PSERIALIZE;
 2139         }
 2140 
 2141         pool_init(pp, size, align, align_offset, ppflags, wchan, palloc, ipl);
 2142 
 2143         pc->pc_fullgroups = NULL;
 2144         pc->pc_partgroups = NULL;
 2145         pc->pc_ctor = ctor;
 2146         pc->pc_dtor = dtor;
 2147         pc->pc_arg  = arg;
 2148         pc->pc_refcnt = 0;
 2149         pc->pc_roflags = flags;
 2150         pc->pc_freecheck = NULL;
 2151 
 2152         if ((flags & PR_LARGECACHE) != 0) {
 2153                 pc->pc_pcgsize = PCG_NOBJECTS_LARGE;
 2154                 pc->pc_pcgpool = &pcg_large_pool;
 2155                 pc->pc_pcgcache = &pcg_large_cache;
 2156         } else {
 2157                 pc->pc_pcgsize = PCG_NOBJECTS_NORMAL;
 2158                 pc->pc_pcgpool = &pcg_normal_pool;
 2159                 pc->pc_pcgcache = &pcg_normal_cache;
 2160         }
 2161 
 2162         /* Allocate per-CPU caches. */
 2163         memset(pc->pc_cpus, 0, sizeof(pc->pc_cpus));
 2164         pc->pc_ncpu = 0;
 2165         if (ncpu < 2) {
 2166                 /* XXX For sparc: boot CPU is not attached yet. */
 2167                 pool_cache_cpu_init1(curcpu(), pc);
 2168         } else {
 2169                 for (CPU_INFO_FOREACH(cii, ci)) {
 2170                         pool_cache_cpu_init1(ci, pc);
 2171                 }
 2172         }
 2173 
 2174         /* Add to list of all pools. */
 2175         if (__predict_true(!cold))
 2176                 mutex_enter(&pool_head_lock);
 2177         TAILQ_FOREACH(pc1, &pool_cache_head, pc_cachelist) {
 2178                 if (strcmp(pc1->pc_pool.pr_wchan, pc->pc_pool.pr_wchan) > 0)
 2179                         break;
 2180         }
 2181         if (pc1 == NULL)
 2182                 TAILQ_INSERT_TAIL(&pool_cache_head, pc, pc_cachelist);
 2183         else
 2184                 TAILQ_INSERT_BEFORE(pc1, pc, pc_cachelist);
 2185         if (__predict_true(!cold))
 2186                 mutex_exit(&pool_head_lock);
 2187 
 2188         atomic_store_release(&pp->pr_cache, pc);
 2189 }
 2190 
 2191 /*
 2192  * pool_cache_destroy:
 2193  *
 2194  *      Destroy a pool cache.
 2195  */
 2196 void
 2197 pool_cache_destroy(pool_cache_t pc)
 2198 {
 2199 
 2200         pool_cache_bootstrap_destroy(pc);
 2201         pool_put(&cache_pool, pc);
 2202 }
 2203 
 2204 /*
 2205  * pool_cache_bootstrap_destroy:
 2206  *
 2207  *      Destroy a pool cache.
 2208  */
 2209 void
 2210 pool_cache_bootstrap_destroy(pool_cache_t pc)
 2211 {
 2212         struct pool *pp = &pc->pc_pool;
 2213         u_int i;
 2214 
 2215         /* Remove it from the global list. */
 2216         mutex_enter(&pool_head_lock);
 2217         while (pc->pc_refcnt != 0)
 2218                 cv_wait(&pool_busy, &pool_head_lock);
 2219         TAILQ_REMOVE(&pool_cache_head, pc, pc_cachelist);
 2220         mutex_exit(&pool_head_lock);
 2221 
 2222         /* First, invalidate the entire cache. */
 2223         pool_cache_invalidate(pc);
 2224 
 2225         /* Disassociate it from the pool. */
 2226         mutex_enter(&pp->pr_lock);
 2227         atomic_store_relaxed(&pp->pr_cache, NULL);
 2228         mutex_exit(&pp->pr_lock);
 2229 
 2230         /* Destroy per-CPU data */
 2231         for (i = 0; i < __arraycount(pc->pc_cpus); i++)
 2232                 pool_cache_invalidate_cpu(pc, i);
 2233 
 2234         /* Finally, destroy it. */
 2235         pool_destroy(pp);
 2236 }
 2237 
 2238 /*
 2239  * pool_cache_cpu_init1:
 2240  *
 2241  *      Called for each pool_cache whenever a new CPU is attached.
 2242  */
 2243 static void
 2244 pool_cache_cpu_init1(struct cpu_info *ci, pool_cache_t pc)
 2245 {
 2246         pool_cache_cpu_t *cc;
 2247         int index;
 2248 
 2249         index = ci->ci_index;
 2250 
 2251         KASSERT(index < __arraycount(pc->pc_cpus));
 2252 
 2253         if ((cc = pc->pc_cpus[index]) != NULL) {
 2254                 return;
 2255         }
 2256 
 2257         /*
 2258          * The first CPU is 'free'.  This needs to be the case for
 2259          * bootstrap - we may not be able to allocate yet.
 2260          */
 2261         if (pc->pc_ncpu == 0) {
 2262                 cc = &pc->pc_cpu0;
 2263                 pc->pc_ncpu = 1;
 2264         } else {
 2265                 pc->pc_ncpu++;
 2266                 cc = pool_get(&cache_cpu_pool, PR_WAITOK);
 2267         }
 2268 
 2269         cc->cc_current = __UNCONST(&pcg_dummy);
 2270         cc->cc_previous = __UNCONST(&pcg_dummy);
 2271         cc->cc_pcgcache = pc->pc_pcgcache;
 2272         cc->cc_hits = 0;
 2273         cc->cc_misses = 0;
 2274         cc->cc_pcmisses = 0;
 2275         cc->cc_contended = 0;
 2276         cc->cc_nfull = 0;
 2277         cc->cc_npart = 0;
 2278 
 2279         pc->pc_cpus[index] = cc;
 2280 }
 2281 
 2282 /*
 2283  * pool_cache_cpu_init:
 2284  *
 2285  *      Called whenever a new CPU is attached.
 2286  */
 2287 void
 2288 pool_cache_cpu_init(struct cpu_info *ci)
 2289 {
 2290         pool_cache_t pc;
 2291 
 2292         mutex_enter(&pool_head_lock);
 2293         TAILQ_FOREACH(pc, &pool_cache_head, pc_cachelist) {
 2294                 pc->pc_refcnt++;
 2295                 mutex_exit(&pool_head_lock);
 2296 
 2297                 pool_cache_cpu_init1(ci, pc);
 2298 
 2299                 mutex_enter(&pool_head_lock);
 2300                 pc->pc_refcnt--;
 2301                 cv_broadcast(&pool_busy);
 2302         }
 2303         mutex_exit(&pool_head_lock);
 2304 }
 2305 
 2306 /*
 2307  * pool_cache_reclaim:
 2308  *
 2309  *      Reclaim memory from a pool cache.
 2310  */
 2311 bool
 2312 pool_cache_reclaim(pool_cache_t pc)
 2313 {
 2314 
 2315         return pool_reclaim(&pc->pc_pool);
 2316 }
 2317 
 2318 static inline void
 2319 pool_cache_pre_destruct(pool_cache_t pc)
 2320 {
 2321         /*
 2322          * Perform a passive serialization barrier before destructing
 2323          * a batch of one or more objects.
 2324          */
 2325         if (__predict_false(pc_has_pser(pc))) {
 2326                 pool_barrier();
 2327         }
 2328 }
 2329 
 2330 static void
 2331 pool_cache_destruct_object1(pool_cache_t pc, void *object)
 2332 {
 2333         (*pc->pc_dtor)(pc->pc_arg, object);
 2334         pool_put(&pc->pc_pool, object);
 2335 }
 2336 
 2337 /*
 2338  * pool_cache_destruct_object:
 2339  *
 2340  *      Force destruction of an object and its release back into
 2341  *      the pool.
 2342  */
 2343 void
 2344 pool_cache_destruct_object(pool_cache_t pc, void *object)
 2345 {
 2346 
 2347         FREECHECK_IN(&pc->pc_freecheck, object);
 2348 
 2349         pool_cache_pre_destruct(pc);
 2350         pool_cache_destruct_object1(pc, object);
 2351 }
 2352 
 2353 /*
 2354  * pool_cache_invalidate_groups:
 2355  *
 2356  *      Invalidate a chain of groups and destruct all objects.  Return the
 2357  *      number of groups that were invalidated.
 2358  */
 2359 static int
 2360 pool_cache_invalidate_groups(pool_cache_t pc, pcg_t *pcg)
 2361 {
 2362         void *object;
 2363         pcg_t *next;
 2364         int i, n;
 2365 
 2366         if (pcg == NULL) {
 2367                 return 0;
 2368         }
 2369 
 2370         pool_cache_pre_destruct(pc);
 2371 
 2372         for (n = 0; pcg != NULL; pcg = next, n++) {
 2373                 next = pcg->pcg_next;
 2374 
 2375                 for (i = 0; i < pcg->pcg_avail; i++) {
 2376                         object = pcg->pcg_objects[i].pcgo_va;
 2377                         pool_cache_destruct_object1(pc, object);
 2378                 }
 2379 
 2380                 if (pcg->pcg_size == PCG_NOBJECTS_LARGE) {
 2381                         pool_put(&pcg_large_pool, pcg);
 2382                 } else {
 2383                         KASSERT(pcg->pcg_size == PCG_NOBJECTS_NORMAL);
 2384                         pool_put(&pcg_normal_pool, pcg);
 2385                 }
 2386         }
 2387         return n;
 2388 }
 2389 
 2390 /*
 2391  * pool_cache_invalidate:
 2392  *
 2393  *      Invalidate a pool cache (destruct and release all of the
 2394  *      cached objects).  Does not reclaim objects from the pool.
 2395  *
 2396  *      Note: for pool caches that provide constructed objects, it is
 2397  *      assumed that another level of synchronization exists between the
 2398  *      input to the constructor and the cache invalidation.
 2399  *
 2400  *      Invalidation is a costly process and should not be called from
 2401  *      interrupt context.
 2402  */
 2403 void
 2404 pool_cache_invalidate(pool_cache_t pc)
 2405 {
 2406         uint64_t where;
 2407         pcg_t *pcg;
 2408         int n, s;
 2409 
 2410         KASSERT(!cpu_intr_p() && !cpu_softintr_p());
 2411 
 2412         if (ncpu < 2 || !mp_online) {
 2413                 /*
 2414                  * We might be called early enough in the boot process
 2415                  * for the CPU data structures to not be fully initialized.
 2416                  * In this case, transfer the content of the local CPU's
 2417                  * cache back into global cache as only this CPU is currently
 2418                  * running.
 2419                  */
 2420                 pool_cache_transfer(pc);
 2421         } else {
 2422                 /*
 2423                  * Signal all CPUs that they must transfer their local
 2424                  * cache back to the global pool then wait for the xcall to
 2425                  * complete.
 2426                  */
 2427                 where = xc_broadcast(0,
 2428                     __FPTRCAST(xcfunc_t, pool_cache_transfer), pc, NULL);
 2429                 xc_wait(where);
 2430         }
 2431 
 2432         /* Now dequeue and invalidate everything. */
 2433         pcg = pool_pcg_trunc(&pcg_normal_cache);
 2434         (void)pool_cache_invalidate_groups(pc, pcg);
 2435 
 2436         pcg = pool_pcg_trunc(&pcg_large_cache);
 2437         (void)pool_cache_invalidate_groups(pc, pcg);
 2438 
 2439         pcg = pool_pcg_trunc(&pc->pc_fullgroups);
 2440         n = pool_cache_invalidate_groups(pc, pcg);
 2441         s = splvm();
 2442         ((pool_cache_cpu_t *)pc->pc_cpus[curcpu()->ci_index])->cc_nfull -= n;
 2443         splx(s);
 2444 
 2445         pcg = pool_pcg_trunc(&pc->pc_partgroups);
 2446         n = pool_cache_invalidate_groups(pc, pcg);
 2447         s = splvm();
 2448         ((pool_cache_cpu_t *)pc->pc_cpus[curcpu()->ci_index])->cc_npart -= n;
 2449         splx(s);
 2450 }
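
/*
 * In this file pool_cache_invalidate() is reached from two places:
 * pool_reclaim() uses it to flush cached constructed objects before
 * releasing idle pages, and pool_cache_bootstrap_destroy() uses it to
 * empty the cache before the backing pool is destroyed.  The xcall
 * broadcast above is why it must not run in interrupt context.
 */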
 2451 
 2452 /*
 2453  * pool_cache_invalidate_cpu:
 2454  *
 2455  *      Invalidate all CPU-bound cached objects in pool cache, the CPU being
 2456  *      identified by its associated index.
 2457  *      It is the caller's responsibility to ensure that no operation is
 2458  *      taking place on this pool cache while doing this invalidation.
 2459  *      WARNING: as no inter-CPU locking is enforced, trying to invalidate
 2460  *      pool cached objects from a CPU different from the one currently running
 2461  *      may result in undefined behaviour.
 2462  */
 2463 static void
 2464 pool_cache_invalidate_cpu(pool_cache_t pc, u_int index)
 2465 {
 2466         pool_cache_cpu_t *cc;
 2467         pcg_t *pcg;
 2468 
 2469         if ((cc = pc->pc_cpus[index]) == NULL)
 2470                 return;
 2471 
 2472         if ((pcg = cc->cc_current) != &pcg_dummy) {
 2473                 pcg->pcg_next = NULL;
 2474                 pool_cache_invalidate_groups(pc, pcg);
 2475         }
 2476         if ((pcg = cc->cc_previous) != &pcg_dummy) {
 2477                 pcg->pcg_next = NULL;
 2478                 pool_cache_invalidate_groups(pc, pcg);
 2479         }
 2480         if (cc != &pc->pc_cpu0)
 2481                 pool_put(&cache_cpu_pool, cc);
 2482 
 2483 }
 2484 
 2485 void
 2486 pool_cache_set_drain_hook(pool_cache_t pc, void (*fn)(void *, int), void *arg)
 2487 {
 2488 
 2489         pool_set_drain_hook(&pc->pc_pool, fn, arg);
 2490 }
 2491 
 2492 void
 2493 pool_cache_setlowat(pool_cache_t pc, int n)
 2494 {
 2495 
 2496         pool_setlowat(&pc->pc_pool, n);
 2497 }
 2498 
 2499 void
 2500 pool_cache_sethiwat(pool_cache_t pc, int n)
 2501 {
 2502 
 2503         pool_sethiwat(&pc->pc_pool, n);
 2504 }
 2505 
 2506 void
 2507 pool_cache_sethardlimit(pool_cache_t pc, int n, const char *warnmess, int ratecap)
 2508 {
 2509 
 2510         pool_sethardlimit(&pc->pc_pool, n, warnmess, ratecap);
 2511 }
 2512 
 2513 void
 2514 pool_cache_prime(pool_cache_t pc, int n)
 2515 {
 2516 
 2517         pool_prime(&pc->pc_pool, n);
 2518 }
 2519 
 2520 unsigned int
 2521 pool_cache_nget(pool_cache_t pc)
 2522 {
 2523 
 2524         return pool_nget(&pc->pc_pool);
 2525 }
 2526 
 2527 unsigned int
 2528 pool_cache_nput(pool_cache_t pc)
 2529 {
 2530 
 2531         return pool_nput(&pc->pc_pool);
 2532 }
 2533 
 2534 /*
 2535  * pool_pcg_get:
 2536  *
 2537  *      Get a cache group from the specified list.  Return true if
 2538  *      contention was encountered.  Must be called at IPL_VM because
 2539  *      of spin wait vs. kernel_lock.
 2540  */
 2541 static int
 2542 pool_pcg_get(pcg_t *volatile *head, pcg_t **pcgp)
 2543 {
 2544         int count = SPINLOCK_BACKOFF_MIN;
 2545         pcg_t *o, *n;
 2546 
 2547         for (o = atomic_load_relaxed(head);; o = n) {
 2548                 if (__predict_false(o == &pcg_dummy)) {
 2549                         /* Wait for concurrent get to complete. */
 2550                         SPINLOCK_BACKOFF(count);
 2551                         n = atomic_load_relaxed(head);
 2552                         continue;
 2553                 }
 2554                 if (__predict_false(o == NULL)) {
 2555                         break;
 2556                 }
 2557                 /* Lock out concurrent get/put. */
 2558                 n = atomic_cas_ptr(head, o, __UNCONST(&pcg_dummy));
 2559                 if (o == n) {
 2560                         /* Fetch pointer to next item and then unlock. */
 2561 #ifndef __HAVE_ATOMIC_AS_MEMBAR
 2562                         membar_datadep_consumer(); /* alpha */
 2563 #endif
 2564                         n = atomic_load_relaxed(&o->pcg_next);
 2565                         atomic_store_release(head, n);
 2566                         break;
 2567                 }
 2568         }
 2569         *pcgp = o;
 2570         return count != SPINLOCK_BACKOFF_MIN;
 2571 }
 2572 
 2573 /*
 2574  * pool_pcg_trunc:
 2575  *
 2576  *      Chop out entire list of pool cache groups.
 2577  */
 2578 static pcg_t *
 2579 pool_pcg_trunc(pcg_t *volatile *head)
 2580 {
 2581         int count = SPINLOCK_BACKOFF_MIN, s;
 2582         pcg_t *o, *n;
 2583 
 2584         s = splvm();
 2585         for (o = atomic_load_relaxed(head);; o = n) {
 2586                 if (__predict_false(o == &pcg_dummy)) {
 2587                         /* Wait for concurrent get to complete. */
 2588                         SPINLOCK_BACKOFF(count);
 2589                         n = atomic_load_relaxed(head);
 2590                         continue;
 2591                 }
 2592                 n = atomic_cas_ptr(head, o, NULL);
 2593                 if (o == n) {
 2594                         splx(s);
 2595 #ifndef __HAVE_ATOMIC_AS_MEMBAR
 2596                         membar_datadep_consumer(); /* alpha */
 2597 #endif
 2598                         return o;
 2599                 }
 2600         }
 2601 }
 2602 
 2603 /*
 2604  * pool_pcg_put:
 2605  *
 2606  *      Put a pool cache group to the specified list.  Return true if
 2607  *      contention was encountered.  Must be called at IPL_VM because of
 2608  *      spin wait vs. kernel_lock.
 2609  */
 2610 static int
 2611 pool_pcg_put(pcg_t *volatile *head, pcg_t *pcg)
 2612 {
 2613         int count = SPINLOCK_BACKOFF_MIN;
 2614         pcg_t *o, *n;
 2615 
 2616         for (o = atomic_load_relaxed(head);; o = n) {
 2617                 if (__predict_false(o == &pcg_dummy)) {
 2618                         /* Wait for concurrent get to complete. */
 2619                         SPINLOCK_BACKOFF(count);
 2620                         n = atomic_load_relaxed(head);
 2621                         continue;
 2622                 }
 2623                 pcg->pcg_next = o;
 2624 #ifndef __HAVE_ATOMIC_AS_MEMBAR
 2625                 membar_release();
 2626 #endif
 2627                 n = atomic_cas_ptr(head, o, pcg);
 2628                 if (o == n) {
 2629                         return count != SPINLOCK_BACKOFF_MIN;
 2630                 }
 2631         }
 2632 }
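
/*
 * Taken together, pool_pcg_get(), pool_pcg_trunc() and pool_pcg_put()
 * maintain each group list as a lock-free LIFO: the head pointer is
 * updated with atomic_cas_ptr(), and a getter briefly installs
 * &pcg_dummy as the head so that no other get or put can run while it
 * reads pcg_next from the group being removed.  Other callers spin with
 * SPINLOCK_BACKOFF() when they observe the dummy marker, which is why
 * these routines must be entered at IPL_VM.
 */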
 2633 
 2634 static bool __noinline
 2635 pool_cache_get_slow(pool_cache_t pc, pool_cache_cpu_t *cc, int s,
 2636     void **objectp, paddr_t *pap, int flags)
 2637 {
 2638         pcg_t *pcg, *cur;
 2639         void *object;
 2640 
 2641         KASSERT(cc->cc_current->pcg_avail == 0);
 2642         KASSERT(cc->cc_previous->pcg_avail == 0);
 2643 
 2644         cc->cc_misses++;
 2645 
 2646         /*
 2647          * If there's a full group, release our empty group back to the
 2648          * cache.  Install the full group as cc_current and return.
 2649          */
 2650         cc->cc_contended += pool_pcg_get(&pc->pc_fullgroups, &pcg);
 2651         if (__predict_true(pcg != NULL)) {
 2652                 KASSERT(pcg->pcg_avail == pcg->pcg_size);
 2653                 if (__predict_true((cur = cc->cc_current) != &pcg_dummy)) {
 2654                         KASSERT(cur->pcg_avail == 0);
 2655                         (void)pool_pcg_put(cc->cc_pcgcache, cur);
 2656                 }
 2657                 cc->cc_nfull--;
 2658                 cc->cc_current = pcg;
 2659                 return true;
 2660         }
 2661 
 2662         /*
 2663          * Nothing available locally or in cache.  Take the slow
 2664          * path: fetch a new object from the pool and construct
 2665          * it.
 2666          */
 2667         cc->cc_pcmisses++;
 2668         splx(s);
 2669 
 2670         object = pool_get(&pc->pc_pool, flags);
 2671         *objectp = object;
 2672         if (__predict_false(object == NULL)) {
 2673                 KASSERT((flags & (PR_NOWAIT|PR_LIMITFAIL)) != 0);
 2674                 return false;
 2675         }
 2676 
 2677         if (__predict_false((*pc->pc_ctor)(pc->pc_arg, object, flags) != 0)) {
 2678                 pool_put(&pc->pc_pool, object);
 2679                 *objectp = NULL;
 2680                 return false;
 2681         }
 2682 
 2683         KASSERT((((vaddr_t)object) & (pc->pc_pool.pr_align - 1)) == 0);
 2684 
 2685         if (pap != NULL) {
 2686 #ifdef POOL_VTOPHYS
 2687                 *pap = POOL_VTOPHYS(object);
 2688 #else
 2689                 *pap = POOL_PADDR_INVALID;
 2690 #endif
 2691         }
 2692 
 2693         FREECHECK_OUT(&pc->pc_freecheck, object);
 2694         return false;
 2695 }
 2696 
 2697 /*
 2698  * pool_cache_get{,_paddr}:
 2699  *
 2700  *      Get an object from a pool cache (optionally returning
 2701  *      the physical address of the object).
 2702  */
 2703 void *
 2704 pool_cache_get_paddr(pool_cache_t pc, int flags, paddr_t *pap)
 2705 {
 2706         pool_cache_cpu_t *cc;
 2707         pcg_t *pcg;
 2708         void *object;
 2709         int s;
 2710 
 2711         KASSERT(!(flags & PR_NOWAIT) != !(flags & PR_WAITOK));
 2712         KASSERTMSG((!cpu_intr_p() && !cpu_softintr_p()) ||
 2713             (pc->pc_pool.pr_ipl != IPL_NONE || cold || panicstr != NULL),
 2714             "%s: [%s] is IPL_NONE, but called from interrupt context",
 2715             __func__, pc->pc_pool.pr_wchan);
 2716 
 2717         if (flags & PR_WAITOK) {
 2718                 ASSERT_SLEEPABLE();
 2719         }
 2720 
 2721         if (flags & PR_NOWAIT) {
 2722                 if (fault_inject())
 2723                         return NULL;
 2724         }
 2725 
 2726         /* Lock out interrupts and disable preemption. */
 2727         s = splvm();
 2728         while (/* CONSTCOND */ true) {
 2729                 /* Try and allocate an object from the current group. */
 2730                 cc = pc->pc_cpus[curcpu()->ci_index];
 2731                 pcg = cc->cc_current;
 2732                 if (__predict_true(pcg->pcg_avail > 0)) {
 2733                         object = pcg->pcg_objects[--pcg->pcg_avail].pcgo_va;
 2734                         if (__predict_false(pap != NULL))
 2735                                 *pap = pcg->pcg_objects[pcg->pcg_avail].pcgo_pa;
 2736 #if defined(DIAGNOSTIC)
 2737                         pcg->pcg_objects[pcg->pcg_avail].pcgo_va = NULL;
 2738                         KASSERT(pcg->pcg_avail < pcg->pcg_size);
 2739                         KASSERT(object != NULL);
 2740 #endif
 2741                         cc->cc_hits++;
 2742                         splx(s);
 2743                         FREECHECK_OUT(&pc->pc_freecheck, object);
 2744                         pool_redzone_fill(&pc->pc_pool, object);
 2745                         pool_cache_get_kmsan(pc, object);
 2746                         return object;
 2747                 }
 2748 
 2749                 /*
 2750                  * That failed.  If the previous group isn't empty, swap
 2751                  * it with the current group and allocate from there.
 2752                  */
 2753                 pcg = cc->cc_previous;
 2754                 if (__predict_true(pcg->pcg_avail > 0)) {
 2755                         cc->cc_previous = cc->cc_current;
 2756                         cc->cc_current = pcg;
 2757                         continue;
 2758                 }
 2759 
 2760                 /*
 2761                  * Can't allocate from either group: try the slow path.
 2762                  * If get_slow() allocated an object for us, or if
 2763                  * no more objects are available, it will return false.
 2764                  * Otherwise, we need to retry.
 2765                  */
 2766                 if (!pool_cache_get_slow(pc, cc, s, &object, pap, flags)) {
 2767                         if (object != NULL) {
 2768                                 kmsan_orig(object, pc->pc_pool.pr_size,
 2769                                     KMSAN_TYPE_POOL, __RET_ADDR);
 2770                         }
 2771                         break;
 2772                 }
 2773         }
 2774 
 2775         /*
 2776          * We would like to KASSERT(object || (flags & PR_NOWAIT)), but
 2777          * pool_cache_get can fail even in the PR_WAITOK case, if the
 2778          * constructor fails.
 2779          */
 2780         return object;
 2781 }
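
/*
 * Hedged usage sketch for the cache get/put pair, reusing the
 * hypothetical foo_cache from the pool_cache_init() example.  As noted
 * above, even PR_WAITOK callers must handle a NULL return, because the
 * constructor may fail.
 */
#if 0
static void
foo_example(void)
{
	struct foo *f;
	paddr_t pa;

	f = pool_cache_get_paddr(foo_cache, PR_WAITOK, &pa);
	if (f == NULL)
		return;
	/* ... use the constructed object ... */
	pool_cache_put_paddr(foo_cache, f, pa);
}
#endif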
 2782 
 2783 static bool __noinline
 2784 pool_cache_put_slow(pool_cache_t pc, pool_cache_cpu_t *cc, int s, void *object)
 2785 {
 2786         pcg_t *pcg, *cur;
 2787 
 2788         KASSERT(cc->cc_current->pcg_avail == cc->cc_current->pcg_size);
 2789         KASSERT(cc->cc_previous->pcg_avail == cc->cc_previous->pcg_size);
 2790 
 2791         cc->cc_misses++;
 2792 
 2793         /*
 2794          * Try to get an empty group from the cache.  If there are no empty
 2795          * groups in the cache then allocate one.
 2796          */
 2797         (void)pool_pcg_get(cc->cc_pcgcache, &pcg);
 2798         if (__predict_false(pcg == NULL)) {
 2799                 if (__predict_true(!pool_cache_disable)) {
 2800                         pcg = pool_get(pc->pc_pcgpool, PR_NOWAIT);
 2801                 }
 2802                 if (__predict_true(pcg != NULL)) {
 2803                         pcg->pcg_avail = 0;
 2804                         pcg->pcg_size = pc->pc_pcgsize;
 2805                 }
 2806         }
 2807 
 2808         /*
 2809          * If there's an empty group, release our full group back to the
 2810          * cache.  Install the empty group to the local CPU and return.
 2811          */
 2812         if (pcg != NULL) {
 2813                 KASSERT(pcg->pcg_avail == 0);
 2814                 if (__predict_false(cc->cc_previous == &pcg_dummy)) {
 2815                         cc->cc_previous = pcg;
 2816                 } else {
 2817                         cur = cc->cc_current;
 2818                         if (__predict_true(cur != &pcg_dummy)) {
 2819                                 KASSERT(cur->pcg_avail == cur->pcg_size);
 2820                                 cc->cc_contended +=
 2821                                     pool_pcg_put(&pc->pc_fullgroups, cur);
 2822                                 cc->cc_nfull++;
 2823                         }
 2824                         cc->cc_current = pcg;
 2825                 }
 2826                 return true;
 2827         }
 2828 
 2829         /*
 2830          * Nothing available locally or in cache, and we didn't
 2831          * allocate an empty group.  Take the slow path and destroy
 2832          * the object here and now.
 2833          */
 2834         cc->cc_pcmisses++;
 2835         splx(s);
 2836         pool_cache_destruct_object(pc, object);
 2837 
 2838         return false;
 2839 }
 2840 
 2841 /*
 2842  * pool_cache_put{,_paddr}:
 2843  *
 2844  *      Put an object back to the pool cache (optionally caching the
 2845  *      physical address of the object).
 2846  */
 2847 void
 2848 pool_cache_put_paddr(pool_cache_t pc, void *object, paddr_t pa)
 2849 {
 2850         pool_cache_cpu_t *cc;
 2851         pcg_t *pcg;
 2852         int s;
 2853 
 2854         KASSERT(object != NULL);
 2855         pool_cache_put_kmsan(pc, object);
 2856         pool_cache_redzone_check(pc, object);
 2857         FREECHECK_IN(&pc->pc_freecheck, object);
 2858 
 2859         if (pc->pc_pool.pr_roflags & PR_PHINPAGE) {
 2860                 pc_phinpage_check(pc, object);
 2861         }
 2862 
 2863         if (pool_cache_put_nocache(pc, object)) {
 2864                 return;
 2865         }
 2866 
 2867         /* Lock out interrupts and disable preemption. */
 2868         s = splvm();
 2869         while (/* CONSTCOND */ true) {
 2870                 /* If the current group isn't full, release it there. */
 2871                 cc = pc->pc_cpus[curcpu()->ci_index];
 2872                 pcg = cc->cc_current;
 2873                 if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) {
 2874                         pcg->pcg_objects[pcg->pcg_avail].pcgo_va = object;
 2875                         pcg->pcg_objects[pcg->pcg_avail].pcgo_pa = pa;
 2876                         pcg->pcg_avail++;
 2877                         cc->cc_hits++;
 2878                         splx(s);
 2879                         return;
 2880                 }
 2881 
 2882                 /*
 2883                  * That failed.  If the previous group isn't full, swap
 2884                  * it with the current group and try again.
 2885                  */
 2886                 pcg = cc->cc_previous;
 2887                 if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) {
 2888                         cc->cc_previous = cc->cc_current;
 2889                         cc->cc_current = pcg;
 2890                         continue;
 2891                 }
 2892 
 2893                 /*
 2894                  * Can't free to either group: try the slow path.
 2895                  * If put_slow() releases the object for us, it
 2896                  * will return false.  Otherwise we need to retry.
 2897                  */
 2898                 if (!pool_cache_put_slow(pc, cc, s, object))
 2899                         break;
 2900         }
 2901 }
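
/*
 * pool_cache_get_paddr() and pool_cache_put_paddr() together implement a
 * two-group per-CPU scheme: each CPU keeps a "current" and a "previous"
 * group, swapping them when one side runs out, and only falls back to
 * the shared group lists (and ultimately the pool itself) when both are
 * exhausted.  This keeps the common case free of atomic operations on
 * shared state.
 */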
 2902 
 2903 /*
 2904  * pool_cache_transfer:
 2905  *
 2906  *      Transfer objects from the per-CPU cache to the global cache.
 2907  *      Run within a cross-call thread.
 2908  */
 2909 static void
 2910 pool_cache_transfer(pool_cache_t pc)
 2911 {
 2912         pool_cache_cpu_t *cc;
 2913         pcg_t *prev, *cur;
 2914         int s;
 2915 
 2916         s = splvm();
 2917         cc = pc->pc_cpus[curcpu()->ci_index];
 2918         cur = cc->cc_current;
 2919         cc->cc_current = __UNCONST(&pcg_dummy);
 2920         prev = cc->cc_previous;
 2921         cc->cc_previous = __UNCONST(&pcg_dummy);
 2922         if (cur != &pcg_dummy) {
 2923                 if (cur->pcg_avail == cur->pcg_size) {
 2924                         (void)pool_pcg_put(&pc->pc_fullgroups, cur);
 2925                         cc->cc_nfull++;
 2926                 } else if (cur->pcg_avail == 0) {
 2927                         (void)pool_pcg_put(pc->pc_pcgcache, cur);
 2928                 } else {
 2929                         (void)pool_pcg_put(&pc->pc_partgroups, cur);
 2930                         cc->cc_npart++;
 2931                 }
 2932         }
 2933         if (prev != &pcg_dummy) {
 2934                 if (prev->pcg_avail == prev->pcg_size) {
 2935                         (void)pool_pcg_put(&pc->pc_fullgroups, prev);
 2936                         cc->cc_nfull++;
 2937                 } else if (prev->pcg_avail == 0) {
 2938                         (void)pool_pcg_put(pc->pc_pcgcache, prev);
 2939                 } else {
 2940                         (void)pool_pcg_put(&pc->pc_partgroups, prev);
 2941                         cc->cc_npart++;
 2942                 }
 2943         }
 2944         splx(s);
 2945 }
 2946 
 2947 static int
 2948 pool_bigidx(size_t size)
 2949 {
 2950         int i;
 2951 
 2952         for (i = 0; i < __arraycount(pool_allocator_big); i++) {
 2953                 if (1 << (i + POOL_ALLOCATOR_BIG_BASE) >= size)
 2954                         return i;
 2955         }
 2956         panic("pool item size %zu too large, use a custom allocator", size);
 2957 }
 2958 
 2959 static void *
 2960 pool_allocator_alloc(struct pool *pp, int flags)
 2961 {
 2962         struct pool_allocator *pa = pp->pr_alloc;
 2963         void *res;
 2964 
 2965         res = (*pa->pa_alloc)(pp, flags);
 2966         if (res == NULL && (flags & PR_WAITOK) == 0) {
 2967                 /*
 2968                  * We only run the drain hook here if PR_NOWAIT.
 2969                  * In other cases, the hook will be run in
 2970                  * pool_reclaim().
 2971                  */
 2972                 if (pp->pr_drain_hook != NULL) {
 2973                         (*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags);
 2974                         res = (*pa->pa_alloc)(pp, flags);
 2975                 }
 2976         }
 2977         return res;
 2978 }
 2979 
 2980 static void
 2981 pool_allocator_free(struct pool *pp, void *v)
 2982 {
 2983         struct pool_allocator *pa = pp->pr_alloc;
 2984 
 2985         if (pp->pr_redzone) {
 2986                 KASSERT(!pp_has_pser(pp));
 2987                 kasan_mark(v, pa->pa_pagesz, pa->pa_pagesz, 0);
 2988         } else if (__predict_false(pp_has_pser(pp))) {
 2989                 /*
 2990                  * Perform a passive serialization barrier before freeing
 2991                  * the pool page back to the system.
 2992                  */
 2993                 pool_barrier();
 2994         }
 2995         (*pa->pa_free)(pp, v);
 2996 }
 2997 
 2998 void *
 2999 pool_page_alloc(struct pool *pp, int flags)
 3000 {
 3001         const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP;
 3002         vmem_addr_t va;
 3003         int ret;
 3004 
 3005         ret = uvm_km_kmem_alloc(kmem_va_arena, pp->pr_alloc->pa_pagesz,
 3006             vflags | VM_INSTANTFIT, &va);
 3007 
 3008         return ret ? NULL : (void *)va;
 3009 }
 3010 
 3011 void
 3012 pool_page_free(struct pool *pp, void *v)
 3013 {
 3014 
 3015         uvm_km_kmem_free(kmem_va_arena, (vaddr_t)v, pp->pr_alloc->pa_pagesz);
 3016 }
 3017 
 3018 static void *
 3019 pool_page_alloc_meta(struct pool *pp, int flags)
 3020 {
 3021         const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP;
 3022         vmem_addr_t va;
 3023         int ret;
 3024 
 3025         ret = vmem_alloc(kmem_meta_arena, pp->pr_alloc->pa_pagesz,
 3026             vflags | VM_INSTANTFIT, &va);
 3027 
 3028         return ret ? NULL : (void *)va;
 3029 }
 3030 
 3031 static void
 3032 pool_page_free_meta(struct pool *pp, void *v)
 3033 {
 3034 
 3035         vmem_free(kmem_meta_arena, (vmem_addr_t)v, pp->pr_alloc->pa_pagesz);
 3036 }
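      /*
       * Annotation: pool_page_alloc()/pool_page_free() back ordinary pools
       * with pages from kmem_va_arena, while the *_meta variants above draw
       * from kmem_meta_arena instead.  A reasonable reading of the split
       * (not spelled out in this file) is that pool metadata must come from
       * a separate arena so that allocating bookkeeping for a pool never
       * recurses into the very allocator the pool code is implementing.
       */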
 3037 
 3038 #ifdef KMSAN
 3039 static inline void
 3040 pool_get_kmsan(struct pool *pp, void *p)
 3041 {
 3042         kmsan_orig(p, pp->pr_size, KMSAN_TYPE_POOL, __RET_ADDR);
 3043         kmsan_mark(p, pp->pr_size, KMSAN_STATE_UNINIT);
 3044 }
 3045 
 3046 static inline void
 3047 pool_put_kmsan(struct pool *pp, void *p)
 3048 {
 3049         kmsan_mark(p, pp->pr_size, KMSAN_STATE_INITED);
 3050 }
 3051 
 3052 static inline void
 3053 pool_cache_get_kmsan(pool_cache_t pc, void *p)
 3054 {
 3055         if (__predict_false(pc_has_ctor(pc))) {
 3056                 return;
 3057         }
 3058         pool_get_kmsan(&pc->pc_pool, p);
 3059 }
 3060 
 3061 static inline void
 3062 pool_cache_put_kmsan(pool_cache_t pc, void *p)
 3063 {
 3064         pool_put_kmsan(&pc->pc_pool, p);
 3065 }
 3066 #endif
 3067 
 3068 #ifdef POOL_QUARANTINE
 3069 static void
 3070 pool_quarantine_init(struct pool *pp)
 3071 {
 3072         pp->pr_quar.rotor = 0;
 3073         memset(&pp->pr_quar.list, 0, sizeof(pp->pr_quar.list));
 3074 }
 3075 
 3076 static void
 3077 pool_quarantine_flush(struct pool *pp)
 3078 {
 3079         pool_quar_t *quar = &pp->pr_quar;
 3080         struct pool_pagelist pq;
 3081         size_t i;
 3082 
 3083         LIST_INIT(&pq);
 3084 
 3085         mutex_enter(&pp->pr_lock);
 3086         for (i = 0; i < POOL_QUARANTINE_DEPTH; i++) {
 3087                 if (quar->list[i] == 0)
 3088                         continue;
 3089                 pool_do_put(pp, (void *)quar->list[i], &pq);
 3090         }
 3091         mutex_exit(&pp->pr_lock);
 3092 
 3093         pr_pagelist_free(pp, &pq);
 3094 }
 3095 
 3096 static bool
 3097 pool_put_quarantine(struct pool *pp, void *v, struct pool_pagelist *pq)
 3098 {
 3099         pool_quar_t *quar = &pp->pr_quar;
 3100         uintptr_t old;
 3101 
 3102         if (pp->pr_roflags & PR_NOTOUCH) {
 3103                 return false;
 3104         }
 3105 
 3106         pool_redzone_check(pp, v);
 3107 
 3108         old = quar->list[quar->rotor];
 3109         quar->list[quar->rotor] = (uintptr_t)v;
 3110         quar->rotor = (quar->rotor + 1) % POOL_QUARANTINE_DEPTH;
 3111         if (old != 0) {
 3112                 pool_do_put(pp, (void *)old, pq);
 3113         }
 3114 
 3115         return true;
 3116 }
 3117 #endif
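      /*
       * Annotation: the quarantine above is a small ring.
       * pool_put_quarantine() parks the object being freed in
       * pr_quar.list[rotor] and instead releases whatever that slot held
       * before, so every object sits out POOL_QUARANTINE_DEPTH later frees
       * before it really returns to the pool.  Delaying reuse this way
       * widens the window in which a use-after-free can still be caught by
       * the red-zone and sanitizer checks.
       */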
 3118 
 3119 #ifdef POOL_NOCACHE
 3120 static bool
 3121 pool_cache_put_nocache(pool_cache_t pc, void *p)
 3122 {
 3123         pool_cache_destruct_object(pc, p);
 3124         return true;
 3125 }
 3126 #endif
 3127 
 3128 #ifdef POOL_REDZONE
 3129 #if defined(_LP64)
 3130 # define PRIME 0x9e37fffffffc0000UL
 3131 #else /* defined(_LP64) */
 3132 # define PRIME 0x9e3779b1
 3133 #endif /* defined(_LP64) */
 3134 #define STATIC_BYTE     0xFE
 3135 CTASSERT(POOL_REDZONE_SIZE > 1);
 3136 
 3137 #ifndef KASAN
 3138 static inline uint8_t
 3139 pool_pattern_generate(const void *p)
 3140 {
 3141         return (uint8_t)(((uintptr_t)p) * PRIME
 3142            >> ((sizeof(uintptr_t) - sizeof(uint8_t))) * CHAR_BIT);
 3143 }
 3144 #endif
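      /*
       * Annotation: pool_pattern_generate() keeps only the most significant
       * byte of p * PRIME; on _LP64 the shift amount is
       * (sizeof(uintptr_t) - sizeof(uint8_t)) * CHAR_BIT = 56 bits.  The
       * fill byte therefore depends on the address it is written to, so a
       * corrupted red zone is unlikely to contain the expected pattern by
       * accident.
       */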
 3145 
 3146 static void
 3147 pool_redzone_init(struct pool *pp, size_t requested_size)
 3148 {
 3149         size_t redzsz;
 3150         size_t nsz;
 3151 
 3152 #ifdef KASAN
 3153         redzsz = requested_size;
 3154         kasan_add_redzone(&redzsz);
 3155         redzsz -= requested_size;
 3156 #else
 3157         redzsz = POOL_REDZONE_SIZE;
 3158 #endif
 3159 
 3160         if (pp->pr_roflags & PR_NOTOUCH) {
 3161                 pp->pr_redzone = false;
 3162                 return;
 3163         }
 3164 
 3165         /*
 3166          * We may have extended the requested size earlier; check if
 3167          * there's naturally space in the padding for a red zone.
 3168          */
 3169         if (pp->pr_size - requested_size >= redzsz) {
 3170                 pp->pr_reqsize_with_redzone = requested_size + redzsz;
 3171                 pp->pr_redzone = true;
 3172                 return;
 3173         }
 3174 
 3175         /*
 3176          * No space in the natural padding; check whether we can extend
 3177          * the pool's item size a little.
 3178          *
 3179          * Avoid using a red zone for allocations half a page or larger.
 3180          * For page-size items we'd waste a whole new page (which could
 3181          * be unmapped?), and for half-page items approximately half the
 3182          * space is lost (e.g. with 4K pages, you get one 2K allocation).
 3183          */
 3184         nsz = roundup(pp->pr_size + redzsz, pp->pr_align);
 3185         if (nsz <= (pp->pr_alloc->pa_pagesz / 2)) {
 3186                 /* Ok, we can */
 3187                 pp->pr_size = nsz;
 3188                 pp->pr_reqsize_with_redzone = requested_size + redzsz;
 3189                 pp->pr_redzone = true;
 3190         } else {
 3191                 /* No space for a red zone... snif :'( */
 3192                 pp->pr_redzone = false;
 3193                 aprint_debug("pool redzone disabled for '%s'\n", pp->pr_wchan);
 3194         }
 3195 }
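      /*
       * Annotation: a worked example with made-up numbers.  With 4096-byte
       * pool pages, a requested size of 2000 that was already rounded up to
       * a pr_size of 2048 leaves 48 bytes of natural padding, so a small
       * red zone fits without growing the item at all.  If pr_size + redzsz
       * instead rounds up past 2048 (half the page), the red zone is
       * disabled, per the pa_pagesz / 2 check above.
       */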
 3196 
 3197 static void
 3198 pool_redzone_fill(struct pool *pp, void *p)
 3199 {
 3200         if (!pp->pr_redzone)
 3201                 return;
 3202         KASSERT(!pp_has_pser(pp));
 3203 #ifdef KASAN
 3204         kasan_mark(p, pp->pr_reqsize, pp->pr_reqsize_with_redzone,
 3205             KASAN_POOL_REDZONE);
 3206 #else
 3207         uint8_t *cp, pat;
 3208         const uint8_t *ep;
 3209 
 3210         cp = (uint8_t *)p + pp->pr_reqsize;
 3211         ep = cp + POOL_REDZONE_SIZE;
 3212 
 3213         /*
 3214          * We really don't want the first byte of the red zone to be '\0';
 3215          * an off-by-one in a string may not be properly detected.
 3216          */
 3217         pat = pool_pattern_generate(cp);
 3218         *cp = (pat == '\0') ? STATIC_BYTE: pat;
 3219         cp++;
 3220 
 3221         while (cp < ep) {
 3222                 *cp = pool_pattern_generate(cp);
 3223                 cp++;
 3224         }
 3225 #endif
 3226 }
 3227 
 3228 static void
 3229 pool_redzone_check(struct pool *pp, void *p)
 3230 {
 3231         if (!pp->pr_redzone)
 3232                 return;
 3233         KASSERT(!pp_has_pser(pp));
 3234 #ifdef KASAN
 3235         kasan_mark(p, 0, pp->pr_reqsize_with_redzone, KASAN_POOL_FREED);
 3236 #else
 3237         uint8_t *cp, pat, expected;
 3238         const uint8_t *ep;
 3239 
 3240         cp = (uint8_t *)p + pp->pr_reqsize;
 3241         ep = cp + POOL_REDZONE_SIZE;
 3242 
 3243         pat = pool_pattern_generate(cp);
 3244         expected = (pat == '\0') ? STATIC_BYTE: pat;
 3245         if (__predict_false(*cp != expected)) {
 3246                 panic("%s: [%s] 0x%02x != 0x%02x", __func__,
 3247                     pp->pr_wchan, *cp, expected);
 3248         }
 3249         cp++;
 3250 
 3251         while (cp < ep) {
 3252                 expected = pool_pattern_generate(cp);
 3253                 if (__predict_false(*cp != expected)) {
 3254                         panic("%s: [%s] 0x%02x != 0x%02x", __func__,
 3255                             pp->pr_wchan, *cp, expected);
 3256                 }
 3257                 cp++;
 3258         }
 3259 #endif
 3260 }
 3261 
 3262 static void
 3263 pool_cache_redzone_check(pool_cache_t pc, void *p)
 3264 {
 3265 #ifdef KASAN
 3266         /*
 3267          * If there is a ctor/dtor, or if the cache objects use
 3268          * passive serialization, leave the data as valid.
 3269          */
 3270         if (__predict_false(pc_has_ctor(pc) || pc_has_dtor(pc) ||
 3271             pc_has_pser(pc))) {
 3272                 return;
 3273         }
 3274 #endif
 3275         pool_redzone_check(&pc->pc_pool, p);
 3276 }
 3277 
 3278 #endif /* POOL_REDZONE */
 3279 
 3280 #if defined(DDB)
 3281 static bool
 3282 pool_in_page(struct pool *pp, struct pool_item_header *ph, uintptr_t addr)
 3283 {
 3284 
 3285         return (uintptr_t)ph->ph_page <= addr &&
 3286             addr < (uintptr_t)ph->ph_page + pp->pr_alloc->pa_pagesz;
 3287 }
 3288 
 3289 static bool
 3290 pool_in_item(struct pool *pp, void *item, uintptr_t addr)
 3291 {
 3292 
 3293         return (uintptr_t)item <= addr && addr < (uintptr_t)item + pp->pr_size;
 3294 }
 3295 
 3296 static bool
 3297 pool_in_cg(struct pool *pp, struct pool_cache_group *pcg, uintptr_t addr)
 3298 {
 3299         int i;
 3300 
 3301         if (pcg == NULL) {
 3302                 return false;
 3303         }
 3304         for (i = 0; i < pcg->pcg_avail; i++) {
 3305                 if (pool_in_item(pp, pcg->pcg_objects[i].pcgo_va, addr)) {
 3306                         return true;
 3307                 }
 3308         }
 3309         return false;
 3310 }
 3311 
 3312 static bool
 3313 pool_allocated(struct pool *pp, struct pool_item_header *ph, uintptr_t addr)
 3314 {
 3315 
 3316         if ((pp->pr_roflags & PR_USEBMAP) != 0) {
 3317                 unsigned int idx = pr_item_bitmap_index(pp, ph, (void *)addr);
 3318                 pool_item_bitmap_t *bitmap =
 3319                     ph->ph_bitmap + (idx / BITMAP_SIZE);
 3320                 pool_item_bitmap_t mask = 1U << (idx & BITMAP_MASK);
 3321 
 3322                 return (*bitmap & mask) == 0;
 3323         } else {
 3324                 struct pool_item *pi;
 3325 
 3326                 LIST_FOREACH(pi, &ph->ph_itemlist, pi_list) {
 3327                         if (pool_in_item(pp, pi, addr)) {
 3328                                 return false;
 3329                         }
 3330                 }
 3331                 return true;
 3332         }
 3333 }
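      /*
       * Annotation: for PR_USEBMAP pools a set bit in ph_bitmap marks a
       * free item, so pool_allocated() reports "allocated" when the item's
       * bit is clear; for list-based pools an item is allocated when it is
       * not found on ph_itemlist.
       */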
 3334 
 3335 void
 3336 pool_whatis(uintptr_t addr, void (*pr)(const char *, ...))
 3337 {
 3338         struct pool *pp;
 3339 
 3340         TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
 3341                 struct pool_item_header *ph;
 3342                 struct pool_cache *pc;
 3343                 uintptr_t item;
 3344                 bool allocated = true;
 3345                 bool incache = false;
 3346                 bool incpucache = false;
 3347                 char cpucachestr[32];
 3348 
 3349                 if ((pp->pr_roflags & PR_PHINPAGE) != 0) {
 3350                         LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) {
 3351                                 if (pool_in_page(pp, ph, addr)) {
 3352                                         goto found;
 3353                                 }
 3354                         }
 3355                         LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) {
 3356                                 if (pool_in_page(pp, ph, addr)) {
 3357                                         allocated =
 3358                                             pool_allocated(pp, ph, addr);
 3359                                         goto found;
 3360                                 }
 3361                         }
 3362                         LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) {
 3363                                 if (pool_in_page(pp, ph, addr)) {
 3364                                         allocated = false;
 3365                                         goto found;
 3366                                 }
 3367                         }
 3368                         continue;
 3369                 } else {
 3370                         ph = pr_find_pagehead_noalign(pp, (void *)addr);
 3371                         if (ph == NULL || !pool_in_page(pp, ph, addr)) {
 3372                                 continue;
 3373                         }
 3374                         allocated = pool_allocated(pp, ph, addr);
 3375                 }
 3376 found:
 3377                 if (allocated &&
 3378                     (pc = atomic_load_consume(&pp->pr_cache)) != NULL) {
 3379                         struct pool_cache_group *pcg;
 3380                         int i;
 3381 
 3382                         for (pcg = pc->pc_fullgroups; pcg != NULL;
 3383                             pcg = pcg->pcg_next) {
 3384                                 if (pool_in_cg(pp, pcg, addr)) {
 3385                                         incache = true;
 3386                                         goto print;
 3387                                 }
 3388                         }
 3389                         for (i = 0; i < __arraycount(pc->pc_cpus); i++) {
 3390                                 pool_cache_cpu_t *cc;
 3391 
 3392                                 if ((cc = pc->pc_cpus[i]) == NULL) {
 3393                                         continue;
 3394                                 }
 3395                                 if (pool_in_cg(pp, cc->cc_current, addr) ||
 3396                                     pool_in_cg(pp, cc->cc_previous, addr)) {
 3397                                         struct cpu_info *ci =
 3398                                             cpu_lookup(i);
 3399 
 3400                                         incpucache = true;
 3401                                         snprintf(cpucachestr,
 3402                                             sizeof(cpucachestr),
 3403                                             "cached by CPU %u",
 3404                                             ci->ci_index);
 3405                                         goto print;
 3406                                 }
 3407                         }
 3408                 }
 3409 print:
 3410                 item = (uintptr_t)ph->ph_page + ph->ph_off;
 3411                 item = item + rounddown(addr - item, pp->pr_size);
 3412                 (*pr)("%p is %p+%zu in POOL '%s' (%s)\n",
 3413                     (void *)addr, (void *)item, (size_t)(addr - item),
 3414                     pp->pr_wchan,
 3415                     incpucache ? cpucachestr :
 3416                     incache ? "cached" : allocated ? "allocated" : "free");
 3417         }
 3418 }
 3419 #endif /* defined(DDB) */
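      /*
       * Annotation: pool_whatis() is the pool half of the kernel debugger's
       * "whatis" support.  Given an address it walks every pool's pages and
       * cache groups and prints a line of the form
       *
       *        0xffffd2c012345678 is 0xffffd2c012345600+120 in POOL 'buf4k' (allocated)
       *
       * where the status is one of "cached", "cached by CPU N", "allocated"
       * or "free".  (The address and pool name above are invented for
       * illustration.)
       */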
 3420 
 3421 static int
 3422 pool_sysctl(SYSCTLFN_ARGS)
 3423 {
 3424         struct pool_sysctl data;
 3425         struct pool *pp;
 3426         struct pool_cache *pc;
 3427         pool_cache_cpu_t *cc;
 3428         int error;
 3429         size_t i, written;
 3430 
 3431         if (oldp == NULL) {
 3432                 *oldlenp = 0;
 3433                 TAILQ_FOREACH(pp, &pool_head, pr_poollist)
 3434                         *oldlenp += sizeof(data);
 3435                 return 0;
 3436         }
 3437 
 3438         memset(&data, 0, sizeof(data));
 3439         error = 0;
 3440         written = 0;
 3441         mutex_enter(&pool_head_lock);
 3442         TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
 3443                 if (written + sizeof(data) > *oldlenp)
 3444                         break;
 3445                 if (pp->pr_refcnt == UINT_MAX) /* XXX possible? */
 3446                         continue;
 3447                 pp->pr_refcnt++;
 3446                 strlcpy(data.pr_wchan, pp->pr_wchan, sizeof(data.pr_wchan));
 3447                 data.pr_pagesize = pp->pr_alloc->pa_pagesz;
 3448                 data.pr_flags = pp->pr_roflags | pp->pr_flags;
 3449 #define COPY(field) data.field = pp->field
 3450                 COPY(pr_size);
 3451 
 3452                 COPY(pr_itemsperpage);
 3453                 COPY(pr_nitems);
 3454                 COPY(pr_nout);
 3455                 COPY(pr_hardlimit);
 3456                 COPY(pr_npages);
 3457                 COPY(pr_minpages);
 3458                 COPY(pr_maxpages);
 3459 
 3460                 COPY(pr_nget);
 3461                 COPY(pr_nfail);
 3462                 COPY(pr_nput);
 3463                 COPY(pr_npagealloc);
 3464                 COPY(pr_npagefree);
 3465                 COPY(pr_hiwat);
 3466                 COPY(pr_nidle);
 3467 #undef COPY
 3468 
 3469                 data.pr_cache_nmiss_pcpu = 0;
 3470                 data.pr_cache_nhit_pcpu = 0;
 3471                 data.pr_cache_nmiss_global = 0;
 3472                 data.pr_cache_nempty = 0;
 3473                 data.pr_cache_ncontended = 0;
 3474                 data.pr_cache_npartial = 0;
 3475                 if ((pc = atomic_load_consume(&pp->pr_cache)) != NULL) {
 3476                         uint32_t nfull = 0;
 3477                         data.pr_cache_meta_size = pc->pc_pcgsize;
 3478                         for (i = 0; i < pc->pc_ncpu; ++i) {
 3479                                 cc = pc->pc_cpus[i];
 3480                                 if (cc == NULL)
 3481                                         continue;
 3482                                 data.pr_cache_ncontended += cc->cc_contended;
 3483                                 data.pr_cache_nmiss_pcpu += cc->cc_misses;
 3484                                 data.pr_cache_nhit_pcpu += cc->cc_hits;
 3485                                 data.pr_cache_nmiss_global += cc->cc_pcmisses;
 3486                                 nfull += cc->cc_nfull; /* 32-bit rollover! */
 3487                                 data.pr_cache_npartial += cc->cc_npart;
 3488                         }
 3489                         data.pr_cache_nfull = nfull;
 3490                 } else {
 3491                         data.pr_cache_meta_size = 0;
 3492                         data.pr_cache_nfull = 0;
 3493                 }
 3494                 data.pr_cache_nhit_global = data.pr_cache_nmiss_pcpu -
 3495                     data.pr_cache_nmiss_global;
 3496 
 3499                 mutex_exit(&pool_head_lock);
 3500                 error = sysctl_copyout(l, &data, oldp, sizeof(data));
 3501                 mutex_enter(&pool_head_lock);
 3502                 if (--pp->pr_refcnt == 0)
 3503                         cv_broadcast(&pool_busy);
 3504                 if (error)
 3505                         break;
 3506                 written += sizeof(data);
 3507                 oldp = (char *)oldp + sizeof(data);
 3508         }
 3509         mutex_exit(&pool_head_lock);
 3510 
 3511         *oldlenp = written;
 3512         return error;
 3513 }
 3514 
 3515 SYSCTL_SETUP(sysctl_pool_setup, "sysctl kern.pool setup")
 3516 {
 3517         const struct sysctlnode *rnode = NULL;
 3518 
 3519         sysctl_createv(clog, 0, NULL, &rnode,
 3520                        CTLFLAG_PERMANENT,
 3521                        CTLTYPE_STRUCT, "pool",
 3522                        SYSCTL_DESCR("Get pool statistics"),
 3523                        pool_sysctl, 0, NULL, 0,
 3524                        CTL_KERN, CTL_CREATE, CTL_EOL);
 3525 }
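      /*
       * Annotation: a minimal userland sketch (not part of this file) of
       * reading the kern.pool node registered above, much as vmstat(8)
       * does.  It assumes struct pool_sysctl from <sys/pool.h> is usable
       * from user code; the field names match those filled in by
       * pool_sysctl(), and the first sysctlbyname() call with a NULL buffer
       * mirrors the handler's oldp == NULL path, which reports the
       * required buffer size.
       *
       *        #include <sys/sysctl.h>
       *        #include <sys/pool.h>
       *
       *        #include <stdint.h>
       *        #include <stdio.h>
       *        #include <stdlib.h>
       *
       *        int
       *        main(void)
       *        {
       *                struct pool_sysctl *ps;
       *                size_t len, i;
       *
       *                if (sysctlbyname("kern.pool", NULL, &len, NULL, 0) == -1)
       *                        return 1;
       *                if ((ps = malloc(len)) == NULL)
       *                        return 1;
       *                if (sysctlbyname("kern.pool", ps, &len, NULL, 0) == -1)
       *                        return 1;
       *                for (i = 0; i < len / sizeof(*ps); i++)
       *                        printf("%-16s size %ju nget %ju\n",
       *                            ps[i].pr_wchan, (uintmax_t)ps[i].pr_size,
       *                            (uintmax_t)ps[i].pr_nget);
       *                free(ps);
       *                return 0;
       *        }
       */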
