FreeBSD/Linux Kernel Cross Reference
sys/kern/subr_vmem.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c)2006,2007,2008,2009 YAMAMOTO Takashi,
    5  * Copyright (c) 2013 EMC Corp.
    6  * All rights reserved.
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  *
   17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   27  * SUCH DAMAGE.
   28  */
   29 
   30 /*
   31  * From:
   32  *      $NetBSD: vmem_impl.h,v 1.2 2013/01/29 21:26:24 para Exp $
   33  *      $NetBSD: subr_vmem.c,v 1.83 2013/03/06 11:20:10 yamt Exp $
   34  */
   35 
   36 /*
   37  * reference:
   38  * -    Magazines and Vmem: Extending the Slab Allocator
   39  *      to Many CPUs and Arbitrary Resources
   40  *      http://www.usenix.org/event/usenix01/bonwick.html
   41  */
   42 
   43 #include <sys/cdefs.h>
   44 __FBSDID("$FreeBSD: releng/12.0/sys/kern/subr_vmem.c 338836 2018-09-20 18:29:55Z markj $");
   45 
   46 #include "opt_ddb.h"
   47 
   48 #include <sys/param.h>
   49 #include <sys/systm.h>
   50 #include <sys/kernel.h>
   51 #include <sys/queue.h>
   52 #include <sys/callout.h>
   53 #include <sys/hash.h>
   54 #include <sys/lock.h>
   55 #include <sys/malloc.h>
   56 #include <sys/mutex.h>
   57 #include <sys/smp.h>
   58 #include <sys/condvar.h>
   59 #include <sys/sysctl.h>
   60 #include <sys/taskqueue.h>
   61 #include <sys/vmem.h>
   62 #include <sys/vmmeter.h>
   63 
   64 #include "opt_vm.h"
   65 
   66 #include <vm/uma.h>
   67 #include <vm/vm.h>
   68 #include <vm/pmap.h>
   69 #include <vm/vm_map.h>
   70 #include <vm/vm_object.h>
   71 #include <vm/vm_kern.h>
   72 #include <vm/vm_extern.h>
   73 #include <vm/vm_param.h>
   74 #include <vm/vm_page.h>
   75 #include <vm/vm_pageout.h>
   76 #include <vm/vm_phys.h>
   77 #include <vm/vm_pagequeue.h>
   78 #include <vm/uma_int.h>
   79 
   80 int     vmem_startup_count(void);
   81 
   82 #define VMEM_OPTORDER           5
   83 #define VMEM_OPTVALUE           (1 << VMEM_OPTORDER)
   84 #define VMEM_MAXORDER                                           \
   85     (VMEM_OPTVALUE - 1 + sizeof(vmem_size_t) * NBBY - VMEM_OPTORDER)
   86 
   87 #define VMEM_HASHSIZE_MIN       16
   88 #define VMEM_HASHSIZE_MAX       131072
   89 
   90 #define VMEM_QCACHE_IDX_MAX     16
   91 
   92 #define VMEM_FITMASK    (M_BESTFIT | M_FIRSTFIT)
   93 
   94 #define VMEM_FLAGS                                              \
   95     (M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM | M_BESTFIT | M_FIRSTFIT)
   96 
   97 #define BT_FLAGS        (M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM)
   98 
   99 #define QC_NAME_MAX     16
  100 
  101 /*
  102  * Data structures private to vmem.
  103  */
  104 MALLOC_DEFINE(M_VMEM, "vmem", "vmem internal structures");
  105 
  106 typedef struct vmem_btag bt_t;
  107 
  108 TAILQ_HEAD(vmem_seglist, vmem_btag);
  109 LIST_HEAD(vmem_freelist, vmem_btag);
  110 LIST_HEAD(vmem_hashlist, vmem_btag);
  111 
  112 struct qcache {
  113         uma_zone_t      qc_cache;
  114         vmem_t          *qc_vmem;
  115         vmem_size_t     qc_size;
  116         char            qc_name[QC_NAME_MAX];
  117 };
  118 typedef struct qcache qcache_t;
  119 #define QC_POOL_TO_QCACHE(pool) ((qcache_t *)(pool->pr_qcache))
  120 
  121 #define VMEM_NAME_MAX   16
  122 
  123 /* vmem arena */
  124 struct vmem {
  125         struct mtx_padalign     vm_lock;
  126         struct cv               vm_cv;
  127         char                    vm_name[VMEM_NAME_MAX+1];
  128         LIST_ENTRY(vmem)        vm_alllist;
  129         struct vmem_hashlist    vm_hash0[VMEM_HASHSIZE_MIN];
  130         struct vmem_freelist    vm_freelist[VMEM_MAXORDER];
  131         struct vmem_seglist     vm_seglist;
  132         struct vmem_hashlist    *vm_hashlist;
  133         vmem_size_t             vm_hashsize;
  134 
  135         /* Constant after init */
  136         vmem_size_t             vm_qcache_max;
  137         vmem_size_t             vm_quantum_mask;
  138         vmem_size_t             vm_import_quantum;
  139         int                     vm_quantum_shift;
  140 
  141         /* Written on alloc/free */
  142         LIST_HEAD(, vmem_btag)  vm_freetags;
  143         int                     vm_nfreetags;
  144         int                     vm_nbusytag;
  145         vmem_size_t             vm_inuse;
  146         vmem_size_t             vm_size;
  147         vmem_size_t             vm_limit;
  148 
  149         /* Used on import. */
  150         vmem_import_t           *vm_importfn;
  151         vmem_release_t          *vm_releasefn;
  152         void                    *vm_arg;
  153 
  154         /* Space exhaustion callback. */
  155         vmem_reclaim_t          *vm_reclaimfn;
  156 
  157         /* quantum cache */
  158         qcache_t                vm_qcache[VMEM_QCACHE_IDX_MAX];
  159 };
  160 
  161 /* boundary tag */
  162 struct vmem_btag {
  163         TAILQ_ENTRY(vmem_btag) bt_seglist;
  164         union {
  165                 LIST_ENTRY(vmem_btag) u_freelist; /* BT_TYPE_FREE */
  166                 LIST_ENTRY(vmem_btag) u_hashlist; /* BT_TYPE_BUSY */
  167         } bt_u;
  168 #define bt_hashlist     bt_u.u_hashlist
  169 #define bt_freelist     bt_u.u_freelist
  170         vmem_addr_t     bt_start;
  171         vmem_size_t     bt_size;
  172         int             bt_type;
  173 };
  174 
  175 #define BT_TYPE_SPAN            1       /* Allocated from importfn */
  176 #define BT_TYPE_SPAN_STATIC     2       /* vmem_add() or create. */
  177 #define BT_TYPE_FREE            3       /* Available space. */
  178 #define BT_TYPE_BUSY            4       /* Used space. */
  179 #define BT_ISSPAN_P(bt) ((bt)->bt_type <= BT_TYPE_SPAN_STATIC)
  180 
  181 #define BT_END(bt)      ((bt)->bt_start + (bt)->bt_size - 1)
  182 
  183 #if defined(DIAGNOSTIC)
  184 static int enable_vmem_check = 1;
  185 SYSCTL_INT(_debug, OID_AUTO, vmem_check, CTLFLAG_RWTUN,
  186     &enable_vmem_check, 0, "Enable vmem check");
  187 static void vmem_check(vmem_t *);
  188 #endif
  189 
  190 static struct callout   vmem_periodic_ch;
  191 static int              vmem_periodic_interval;
  192 static struct task      vmem_periodic_wk;
  193 
  194 static struct mtx_padalign __exclusive_cache_line vmem_list_lock;
  195 static LIST_HEAD(, vmem) vmem_list = LIST_HEAD_INITIALIZER(vmem_list);
  196 static uma_zone_t vmem_zone;
  197 
  198 /* ---- misc */
  199 #define VMEM_CONDVAR_INIT(vm, wchan)    cv_init(&vm->vm_cv, wchan)
  200 #define VMEM_CONDVAR_DESTROY(vm)        cv_destroy(&vm->vm_cv)
  201 #define VMEM_CONDVAR_WAIT(vm)           cv_wait(&vm->vm_cv, &vm->vm_lock)
  202 #define VMEM_CONDVAR_BROADCAST(vm)      cv_broadcast(&vm->vm_cv)
  203 
  204 
  205 #define VMEM_LOCK(vm)           mtx_lock(&vm->vm_lock)
  206 #define VMEM_TRYLOCK(vm)        mtx_trylock(&vm->vm_lock)
  207 #define VMEM_UNLOCK(vm)         mtx_unlock(&vm->vm_lock)
  208 #define VMEM_LOCK_INIT(vm, name) mtx_init(&vm->vm_lock, (name), NULL, MTX_DEF)
  209 #define VMEM_LOCK_DESTROY(vm)   mtx_destroy(&vm->vm_lock)
  210 #define VMEM_ASSERT_LOCKED(vm)  mtx_assert(&vm->vm_lock, MA_OWNED);
  211 
  212 #define VMEM_ALIGNUP(addr, align)       (-(-(addr) & -(align)))
  213 
  214 #define VMEM_CROSS_P(addr1, addr2, boundary) \
  215         ((((addr1) ^ (addr2)) & -(boundary)) != 0)
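       /*
        * Editor's note (illustrative, not part of the original file): worked
        * examples of the two macros above.
        *
        *   VMEM_ALIGNUP(0x1234, 0x1000) == 0x2000   (rounded up to the boundary)
        *   VMEM_ALIGNUP(0x2000, 0x1000) == 0x2000   (already aligned)
        *
        *   VMEM_CROSS_P(0x1f00, 0x20ff, 0x1000) is true   (spans two 4KB blocks)
        *   VMEM_CROSS_P(0x2000, 0x2fff, 0x1000) is false  (stays in one block)
        */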
  216 
  217 #define ORDER2SIZE(order)       ((order) < VMEM_OPTVALUE ? ((order) + 1) : \
  218     (vmem_size_t)1 << ((order) - (VMEM_OPTVALUE - VMEM_OPTORDER - 1)))
  219 #define SIZE2ORDER(size)        ((size) <= VMEM_OPTVALUE ? ((size) - 1) : \
  220     (flsl(size) + (VMEM_OPTVALUE - VMEM_OPTORDER - 2)))
  221 
  222 /*
  223  * Maximum number of boundary tags that may be required to satisfy an
  224  * allocation.  Two may be required to import.  Another two may be
  225  * required to clip edges.
  226  */
  227 #define BT_MAXALLOC     4
  228 
  229 /*
  230  * Max free limits the number of locally cached boundary tags.  We
  231  * just want to avoid hitting the zone allocator for every call.
  232  */
  233 #define BT_MAXFREE      (BT_MAXALLOC * 8)
  234 
  235 /* Allocator for boundary tags. */
  236 static uma_zone_t vmem_bt_zone;
  237 
  238 /* boot time arena storage. */
  239 static struct vmem kernel_arena_storage;
  240 static struct vmem buffer_arena_storage;
  241 static struct vmem transient_arena_storage;
  242 /* kernel and kmem arenas are aliased for backwards KPI compat. */
  243 vmem_t *kernel_arena = &kernel_arena_storage;
  244 vmem_t *kmem_arena = &kernel_arena_storage;
  245 vmem_t *buffer_arena = &buffer_arena_storage;
  246 vmem_t *transient_arena = &transient_arena_storage;
  247 
  248 #ifdef DEBUG_MEMGUARD
  249 static struct vmem memguard_arena_storage;
  250 vmem_t *memguard_arena = &memguard_arena_storage;
  251 #endif
  252 
  253 /*
  254  * Fill the vmem's boundary tag cache.  We guarantee that boundary tag
  255  * allocation will not fail once bt_fill() passes.  To do so we cache
  256  * at least the maximum possible tag allocations in the arena.
  257  */
  258 static int
  259 bt_fill(vmem_t *vm, int flags)
  260 {
  261         bt_t *bt;
  262 
  263         VMEM_ASSERT_LOCKED(vm);
  264 
  265         /*
  266          * Only allow the kernel arena and arenas derived from kernel arena to
  267          * dip into reserve tags.  They are where new tags come from.
  268          */
  269         flags &= BT_FLAGS;
  270         if (vm != kernel_arena && vm->vm_arg != kernel_arena)
  271                 flags &= ~M_USE_RESERVE;
  272 
  273         /*
  274          * Loop until we meet the reserve.  To minimize the lock shuffle
  275          * and prevent simultaneous fills we first try a NOWAIT regardless
  276          * of the caller's flags.  Specify M_NOVM so we don't recurse while
  277          * holding a vmem lock.
  278          */
  279         while (vm->vm_nfreetags < BT_MAXALLOC) {
  280                 bt = uma_zalloc(vmem_bt_zone,
  281                     (flags & M_USE_RESERVE) | M_NOWAIT | M_NOVM);
  282                 if (bt == NULL) {
  283                         VMEM_UNLOCK(vm);
  284                         bt = uma_zalloc(vmem_bt_zone, flags);
  285                         VMEM_LOCK(vm);
  286                         if (bt == NULL && (flags & M_NOWAIT) != 0)
  287                                 break;
  288                 }
  289                 LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist);
  290                 vm->vm_nfreetags++;
  291         }
  292 
  293         if (vm->vm_nfreetags < BT_MAXALLOC)
  294                 return ENOMEM;
  295 
  296         return 0;
  297 }
  298 
  299 /*
  300  * Pop a tag off of the freetag stack.
  301  */
  302 static bt_t *
  303 bt_alloc(vmem_t *vm)
  304 {
  305         bt_t *bt;
  306 
  307         VMEM_ASSERT_LOCKED(vm);
  308         bt = LIST_FIRST(&vm->vm_freetags);
  309         MPASS(bt != NULL);
  310         LIST_REMOVE(bt, bt_freelist);
  311         vm->vm_nfreetags--;
  312 
  313         return bt;
  314 }
  315 
  316 /*
  317  * Trim the per-vmem free list.  Returns with the lock released to
  318  * avoid allocator recursions.
  319  */
  320 static void
  321 bt_freetrim(vmem_t *vm, int freelimit)
  322 {
  323         LIST_HEAD(, vmem_btag) freetags;
  324         bt_t *bt;
  325 
  326         LIST_INIT(&freetags);
  327         VMEM_ASSERT_LOCKED(vm);
  328         while (vm->vm_nfreetags > freelimit) {
  329                 bt = LIST_FIRST(&vm->vm_freetags);
  330                 LIST_REMOVE(bt, bt_freelist);
  331                 vm->vm_nfreetags--;
  332                 LIST_INSERT_HEAD(&freetags, bt, bt_freelist);
  333         }
  334         VMEM_UNLOCK(vm);
  335         while ((bt = LIST_FIRST(&freetags)) != NULL) {
  336                 LIST_REMOVE(bt, bt_freelist);
  337                 uma_zfree(vmem_bt_zone, bt);
  338         }
  339 }
  340 
  341 static inline void
  342 bt_free(vmem_t *vm, bt_t *bt)
  343 {
  344 
  345         VMEM_ASSERT_LOCKED(vm);
  346         MPASS(LIST_FIRST(&vm->vm_freetags) != bt);
  347         LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist);
  348         vm->vm_nfreetags++;
  349 }
  350 
  351 /*
  352  * freelist[0] ... [1, 1]
  353  * freelist[1] ... [2, 2]
  354  *  :
  355  * freelist[29] ... [30, 30]
  356  * freelist[30] ... [31, 31]
  357  * freelist[31] ... [32, 63]
   358  * freelist[32] ... [64, 127]
  359  *  :
  360  * freelist[n] ... [(1 << (n - 26)), (1 << (n - 25)) - 1]
  361  *  :
  362  */
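       /*
        * Editor's note (illustrative, not part of the original file): with
        * VMEM_OPTORDER == 5 (VMEM_OPTVALUE == 32), the SIZE2ORDER()/ORDER2SIZE()
        * macros realize the table above.  In quantum units:
        *
        *   SIZE2ORDER(1)  == 0     ORDER2SIZE(0)  == 1
        *   SIZE2ORDER(31) == 30    ORDER2SIZE(30) == 31
        *   SIZE2ORDER(32) == 31    SIZE2ORDER(63) == 31   (the [32, 63] bucket)
        *   SIZE2ORDER(64) == 32    ORDER2SIZE(32) == 64   (the [64, 127] bucket)
        *
        * Sizes below VMEM_OPTVALUE thus get their own exact-size list, while
        * larger sizes share power-of-two buckets.
        */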
  363 
  364 static struct vmem_freelist *
  365 bt_freehead_tofree(vmem_t *vm, vmem_size_t size)
  366 {
  367         const vmem_size_t qsize = size >> vm->vm_quantum_shift;
  368         const int idx = SIZE2ORDER(qsize);
  369 
  370         MPASS(size != 0 && qsize != 0);
  371         MPASS((size & vm->vm_quantum_mask) == 0);
  372         MPASS(idx >= 0);
  373         MPASS(idx < VMEM_MAXORDER);
  374 
  375         return &vm->vm_freelist[idx];
  376 }
  377 
  378 /*
  379  * bt_freehead_toalloc: return the freelist for the given size and allocation
  380  * strategy.
  381  *
   382  * For M_FIRSTFIT, return the first list whose every block is guaranteed
   383  * to be large enough for the requested size.  Otherwise, return the list
   384  * that may contain blocks large enough for the requested size.
  385  */
  386 static struct vmem_freelist *
  387 bt_freehead_toalloc(vmem_t *vm, vmem_size_t size, int strat)
  388 {
  389         const vmem_size_t qsize = size >> vm->vm_quantum_shift;
  390         int idx = SIZE2ORDER(qsize);
  391 
  392         MPASS(size != 0 && qsize != 0);
  393         MPASS((size & vm->vm_quantum_mask) == 0);
  394 
  395         if (strat == M_FIRSTFIT && ORDER2SIZE(idx) != qsize) {
  396                 idx++;
  397                 /* check too large request? */
  398         }
  399         MPASS(idx >= 0);
  400         MPASS(idx < VMEM_MAXORDER);
  401 
  402         return &vm->vm_freelist[idx];
  403 }
  404 
  405 /* ---- boundary tag hash */
  406 
  407 static struct vmem_hashlist *
  408 bt_hashhead(vmem_t *vm, vmem_addr_t addr)
  409 {
  410         struct vmem_hashlist *list;
  411         unsigned int hash;
  412 
  413         hash = hash32_buf(&addr, sizeof(addr), 0);
  414         list = &vm->vm_hashlist[hash % vm->vm_hashsize];
  415 
  416         return list;
  417 }
  418 
  419 static bt_t *
  420 bt_lookupbusy(vmem_t *vm, vmem_addr_t addr)
  421 {
  422         struct vmem_hashlist *list;
  423         bt_t *bt;
  424 
  425         VMEM_ASSERT_LOCKED(vm);
  426         list = bt_hashhead(vm, addr); 
  427         LIST_FOREACH(bt, list, bt_hashlist) {
  428                 if (bt->bt_start == addr) {
  429                         break;
  430                 }
  431         }
  432 
  433         return bt;
  434 }
  435 
  436 static void
  437 bt_rembusy(vmem_t *vm, bt_t *bt)
  438 {
  439 
  440         VMEM_ASSERT_LOCKED(vm);
  441         MPASS(vm->vm_nbusytag > 0);
  442         vm->vm_inuse -= bt->bt_size;
  443         vm->vm_nbusytag--;
  444         LIST_REMOVE(bt, bt_hashlist);
  445 }
  446 
  447 static void
  448 bt_insbusy(vmem_t *vm, bt_t *bt)
  449 {
  450         struct vmem_hashlist *list;
  451 
  452         VMEM_ASSERT_LOCKED(vm);
  453         MPASS(bt->bt_type == BT_TYPE_BUSY);
  454 
  455         list = bt_hashhead(vm, bt->bt_start);
  456         LIST_INSERT_HEAD(list, bt, bt_hashlist);
  457         vm->vm_nbusytag++;
  458         vm->vm_inuse += bt->bt_size;
  459 }
  460 
  461 /* ---- boundary tag list */
  462 
  463 static void
  464 bt_remseg(vmem_t *vm, bt_t *bt)
  465 {
  466 
  467         TAILQ_REMOVE(&vm->vm_seglist, bt, bt_seglist);
  468         bt_free(vm, bt);
  469 }
  470 
  471 static void
  472 bt_insseg(vmem_t *vm, bt_t *bt, bt_t *prev)
  473 {
  474 
  475         TAILQ_INSERT_AFTER(&vm->vm_seglist, prev, bt, bt_seglist);
  476 }
  477 
  478 static void
  479 bt_insseg_tail(vmem_t *vm, bt_t *bt)
  480 {
  481 
  482         TAILQ_INSERT_TAIL(&vm->vm_seglist, bt, bt_seglist);
  483 }
  484 
  485 static void
  486 bt_remfree(vmem_t *vm, bt_t *bt)
  487 {
  488 
  489         MPASS(bt->bt_type == BT_TYPE_FREE);
  490 
  491         LIST_REMOVE(bt, bt_freelist);
  492 }
  493 
  494 static void
  495 bt_insfree(vmem_t *vm, bt_t *bt)
  496 {
  497         struct vmem_freelist *list;
  498 
  499         list = bt_freehead_tofree(vm, bt->bt_size);
  500         LIST_INSERT_HEAD(list, bt, bt_freelist);
  501 }
  502 
  503 /* ---- vmem internal functions */
  504 
  505 /*
  506  * Import from the arena into the quantum cache in UMA.
  507  */
  508 static int
  509 qc_import(void *arg, void **store, int cnt, int domain, int flags)
  510 {
  511         qcache_t *qc;
  512         vmem_addr_t addr;
  513         int i;
  514 
  515         qc = arg;
  516         if ((flags & VMEM_FITMASK) == 0)
  517                 flags |= M_BESTFIT;
  518         for (i = 0; i < cnt; i++) {
  519                 if (vmem_xalloc(qc->qc_vmem, qc->qc_size, 0, 0, 0,
  520                     VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags, &addr) != 0)
  521                         break;
  522                 store[i] = (void *)addr;
  523                 /* Only guarantee one allocation. */
  524                 flags &= ~M_WAITOK;
  525                 flags |= M_NOWAIT;
  526         }
  527         return i;
  528 }
  529 
  530 /*
  531  * Release memory from the UMA cache to the arena.
  532  */
  533 static void
  534 qc_release(void *arg, void **store, int cnt)
  535 {
  536         qcache_t *qc;
  537         int i;
  538 
  539         qc = arg;
  540         for (i = 0; i < cnt; i++)
  541                 vmem_xfree(qc->qc_vmem, (vmem_addr_t)store[i], qc->qc_size);
  542 }
  543 
  544 static void
  545 qc_init(vmem_t *vm, vmem_size_t qcache_max)
  546 {
  547         qcache_t *qc;
  548         vmem_size_t size;
  549         int qcache_idx_max;
  550         int i;
  551 
  552         MPASS((qcache_max & vm->vm_quantum_mask) == 0);
  553         qcache_idx_max = MIN(qcache_max >> vm->vm_quantum_shift,
  554             VMEM_QCACHE_IDX_MAX);
  555         vm->vm_qcache_max = qcache_idx_max << vm->vm_quantum_shift;
  556         for (i = 0; i < qcache_idx_max; i++) {
  557                 qc = &vm->vm_qcache[i];
  558                 size = (i + 1) << vm->vm_quantum_shift;
  559                 snprintf(qc->qc_name, sizeof(qc->qc_name), "%s-%zu",
  560                     vm->vm_name, size);
  561                 qc->qc_vmem = vm;
  562                 qc->qc_size = size;
  563                 qc->qc_cache = uma_zcache_create(qc->qc_name, size,
  564                     NULL, NULL, NULL, NULL, qc_import, qc_release, qc,
  565                     UMA_ZONE_VM);
  566                 MPASS(qc->qc_cache);
  567         }
  568 }
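       /*
        * Editor's note (illustrative, not part of the original file): for an
        * arena named "foo" with a 4KB quantum and a qcache_max of 32KB,
        * qc_init() above creates eight UMA cache zones, "foo-4096" through
        * "foo-32768", one per multiple of the quantum.  vmem_alloc() and
        * vmem_free() below route requests of those sizes through the matching
        * zone rather than through the arena's freelists.
        */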
  569 
  570 static void
  571 qc_destroy(vmem_t *vm)
  572 {
  573         int qcache_idx_max;
  574         int i;
  575 
  576         qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift;
  577         for (i = 0; i < qcache_idx_max; i++)
  578                 uma_zdestroy(vm->vm_qcache[i].qc_cache);
  579 }
  580 
  581 static void
  582 qc_drain(vmem_t *vm)
  583 {
  584         int qcache_idx_max;
  585         int i;
  586 
  587         qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift;
  588         for (i = 0; i < qcache_idx_max; i++)
  589                 zone_drain(vm->vm_qcache[i].qc_cache);
  590 }
  591 
  592 #ifndef UMA_MD_SMALL_ALLOC
  593 
  594 static struct mtx_padalign __exclusive_cache_line vmem_bt_lock;
  595 
  596 /*
  597  * vmem_bt_alloc:  Allocate a new page of boundary tags.
  598  *
  599  * On architectures with uma_small_alloc there is no recursion; no address
  600  * space need be allocated to allocate boundary tags.  For the others, we
  601  * must handle recursion.  Boundary tags are necessary to allocate new
  602  * boundary tags.
  603  *
  604  * UMA guarantees that enough tags are held in reserve to allocate a new
  605  * page of kva.  We dip into this reserve by specifying M_USE_RESERVE only
  606  * when allocating the page to hold new boundary tags.  In this way the
  607  * reserve is automatically filled by the allocation that uses the reserve.
  608  * 
  609  * We still have to guarantee that the new tags are allocated atomically since
  610  * many threads may try concurrently.  The bt_lock provides this guarantee.
  611  * We convert WAITOK allocations to NOWAIT and then handle the blocking here
  612  * on failure.  It's ok to return NULL for a WAITOK allocation as UMA will
  613  * loop again after checking to see if we lost the race to allocate.
  614  *
  615  * There is a small race between vmem_bt_alloc() returning the page and the
  616  * zone lock being acquired to add the page to the zone.  For WAITOK
  617  * allocations we just pause briefly.  NOWAIT may experience a transient
  618  * failure.  To alleviate this we permit a small number of simultaneous
  619  * fills to proceed concurrently so NOWAIT is less likely to fail unless
  620  * we are really out of KVA.
  621  */
  622 static void *
  623 vmem_bt_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
  624     int wait)
  625 {
  626         vmem_addr_t addr;
  627 
  628         *pflag = UMA_SLAB_KERNEL;
  629 
  630         /*
  631          * Single thread boundary tag allocation so that the address space
  632          * and memory are added in one atomic operation.
  633          */
  634         mtx_lock(&vmem_bt_lock);
  635         if (vmem_xalloc(vm_dom[domain].vmd_kernel_arena, bytes, 0, 0, 0,
  636             VMEM_ADDR_MIN, VMEM_ADDR_MAX,
  637             M_NOWAIT | M_NOVM | M_USE_RESERVE | M_BESTFIT, &addr) == 0) {
  638                 if (kmem_back_domain(domain, kernel_object, addr, bytes,
  639                     M_NOWAIT | M_USE_RESERVE) == 0) {
  640                         mtx_unlock(&vmem_bt_lock);
  641                         return ((void *)addr);
  642                 }
  643                 vmem_xfree(vm_dom[domain].vmd_kernel_arena, addr, bytes);
  644                 mtx_unlock(&vmem_bt_lock);
  645                 /*
  646                  * Out of memory, not address space.  This may not even be
  647                  * possible due to M_USE_RESERVE page allocation.
  648                  */
  649                 if (wait & M_WAITOK)
  650                         vm_wait_domain(domain);
  651                 return (NULL);
  652         }
  653         mtx_unlock(&vmem_bt_lock);
  654         /*
  655          * We're either out of address space or lost a fill race.
  656          */
  657         if (wait & M_WAITOK)
  658                 pause("btalloc", 1);
  659 
  660         return (NULL);
  661 }
  662 
  663 /*
   664  * How many pages are needed to allocate the initial boundary tags at startup.
  665  */
  666 int
  667 vmem_startup_count(void)
  668 {
  669 
  670         return (howmany(BT_MAXALLOC,
  671             UMA_SLAB_SPACE / sizeof(struct vmem_btag)));
  672 }
  673 #endif
  674 
  675 void
  676 vmem_startup(void)
  677 {
  678 
  679         mtx_init(&vmem_list_lock, "vmem list lock", NULL, MTX_DEF);
  680         vmem_zone = uma_zcreate("vmem",
  681             sizeof(struct vmem), NULL, NULL, NULL, NULL,
  682             UMA_ALIGN_PTR, UMA_ZONE_VM);
  683         vmem_bt_zone = uma_zcreate("vmem btag",
  684             sizeof(struct vmem_btag), NULL, NULL, NULL, NULL,
  685             UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
  686 #ifndef UMA_MD_SMALL_ALLOC
  687         mtx_init(&vmem_bt_lock, "btag lock", NULL, MTX_DEF);
  688         uma_prealloc(vmem_bt_zone, BT_MAXALLOC);
  689         /*
  690          * Reserve enough tags to allocate new tags.  We allow multiple
  691          * CPUs to attempt to allocate new tags concurrently to limit
  692          * false restarts in UMA.
  693          */
  694         uma_zone_reserve(vmem_bt_zone, BT_MAXALLOC * (mp_ncpus + 1) / 2);
  695         uma_zone_set_allocf(vmem_bt_zone, vmem_bt_alloc);
  696 #endif
  697 }
  698 
  699 /* ---- rehash */
  700 
  701 static int
  702 vmem_rehash(vmem_t *vm, vmem_size_t newhashsize)
  703 {
  704         bt_t *bt;
  705         int i;
  706         struct vmem_hashlist *newhashlist;
  707         struct vmem_hashlist *oldhashlist;
  708         vmem_size_t oldhashsize;
  709 
  710         MPASS(newhashsize > 0);
  711 
  712         newhashlist = malloc(sizeof(struct vmem_hashlist) * newhashsize,
  713             M_VMEM, M_NOWAIT);
  714         if (newhashlist == NULL)
  715                 return ENOMEM;
  716         for (i = 0; i < newhashsize; i++) {
  717                 LIST_INIT(&newhashlist[i]);
  718         }
  719 
  720         VMEM_LOCK(vm);
  721         oldhashlist = vm->vm_hashlist;
  722         oldhashsize = vm->vm_hashsize;
  723         vm->vm_hashlist = newhashlist;
  724         vm->vm_hashsize = newhashsize;
  725         if (oldhashlist == NULL) {
  726                 VMEM_UNLOCK(vm);
  727                 return 0;
  728         }
  729         for (i = 0; i < oldhashsize; i++) {
  730                 while ((bt = LIST_FIRST(&oldhashlist[i])) != NULL) {
  731                         bt_rembusy(vm, bt);
  732                         bt_insbusy(vm, bt);
  733                 }
  734         }
  735         VMEM_UNLOCK(vm);
  736 
  737         if (oldhashlist != vm->vm_hash0) {
  738                 free(oldhashlist, M_VMEM);
  739         }
  740 
  741         return 0;
  742 }
  743 
  744 static void
  745 vmem_periodic_kick(void *dummy)
  746 {
  747 
  748         taskqueue_enqueue(taskqueue_thread, &vmem_periodic_wk);
  749 }
  750 
  751 static void
  752 vmem_periodic(void *unused, int pending)
  753 {
  754         vmem_t *vm;
  755         vmem_size_t desired;
  756         vmem_size_t current;
  757 
  758         mtx_lock(&vmem_list_lock);
  759         LIST_FOREACH(vm, &vmem_list, vm_alllist) {
  760 #ifdef DIAGNOSTIC
  761                 /* Convenient time to verify vmem state. */
  762                 if (enable_vmem_check == 1) {
  763                         VMEM_LOCK(vm);
  764                         vmem_check(vm);
  765                         VMEM_UNLOCK(vm);
  766                 }
  767 #endif
  768                 desired = 1 << flsl(vm->vm_nbusytag);
  769                 desired = MIN(MAX(desired, VMEM_HASHSIZE_MIN),
  770                     VMEM_HASHSIZE_MAX);
  771                 current = vm->vm_hashsize;
  772 
  773                 /* Grow in powers of two.  Shrink less aggressively. */
  774                 if (desired >= current * 2 || desired * 4 <= current)
  775                         vmem_rehash(vm, desired);
  776 
  777                 /*
  778                  * Periodically wake up threads waiting for resources,
   779                  * so they can ask for reclamation again.
  780                  */
  781                 VMEM_CONDVAR_BROADCAST(vm);
  782         }
  783         mtx_unlock(&vmem_list_lock);
  784 
  785         callout_reset(&vmem_periodic_ch, vmem_periodic_interval,
  786             vmem_periodic_kick, NULL);
  787 }
  788 
  789 static void
  790 vmem_start_callout(void *unused)
  791 {
  792 
  793         TASK_INIT(&vmem_periodic_wk, 0, vmem_periodic, NULL);
  794         vmem_periodic_interval = hz * 10;
  795         callout_init(&vmem_periodic_ch, 1);
  796         callout_reset(&vmem_periodic_ch, vmem_periodic_interval,
  797             vmem_periodic_kick, NULL);
  798 }
  799 SYSINIT(vfs, SI_SUB_CONFIGURE, SI_ORDER_ANY, vmem_start_callout, NULL);
  800 
  801 static void
  802 vmem_add1(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int type)
  803 {
  804         bt_t *btspan;
  805         bt_t *btfree;
  806 
  807         MPASS(type == BT_TYPE_SPAN || type == BT_TYPE_SPAN_STATIC);
  808         MPASS((size & vm->vm_quantum_mask) == 0);
  809 
  810         btspan = bt_alloc(vm);
  811         btspan->bt_type = type;
  812         btspan->bt_start = addr;
  813         btspan->bt_size = size;
  814         bt_insseg_tail(vm, btspan);
  815 
  816         btfree = bt_alloc(vm);
  817         btfree->bt_type = BT_TYPE_FREE;
  818         btfree->bt_start = addr;
  819         btfree->bt_size = size;
  820         bt_insseg(vm, btfree, btspan);
  821         bt_insfree(vm, btfree);
  822 
  823         vm->vm_size += size;
  824 }
  825 
  826 static void
  827 vmem_destroy1(vmem_t *vm)
  828 {
  829         bt_t *bt;
  830 
  831         /*
  832          * Drain per-cpu quantum caches.
  833          */
  834         qc_destroy(vm);
  835 
  836         /*
  837          * The vmem should now only contain empty segments.
  838          */
  839         VMEM_LOCK(vm);
  840         MPASS(vm->vm_nbusytag == 0);
  841 
  842         while ((bt = TAILQ_FIRST(&vm->vm_seglist)) != NULL)
  843                 bt_remseg(vm, bt);
  844 
  845         if (vm->vm_hashlist != NULL && vm->vm_hashlist != vm->vm_hash0)
  846                 free(vm->vm_hashlist, M_VMEM);
  847 
  848         bt_freetrim(vm, 0);
  849 
  850         VMEM_CONDVAR_DESTROY(vm);
  851         VMEM_LOCK_DESTROY(vm);
  852         uma_zfree(vmem_zone, vm);
  853 }
  854 
  855 static int
  856 vmem_import(vmem_t *vm, vmem_size_t size, vmem_size_t align, int flags)
  857 {
  858         vmem_addr_t addr;
  859         int error;
  860 
  861         if (vm->vm_importfn == NULL)
  862                 return (EINVAL);
  863 
  864         /*
   865          * To make sure we get a span that meets the alignment, we double
   866          * the alignment and add it to the size.  This slightly overestimates.
  867          */
  868         if (align != vm->vm_quantum_mask + 1)
  869                 size = (align * 2) + size;
  870         size = roundup(size, vm->vm_import_quantum);
  871 
  872         if (vm->vm_limit != 0 && vm->vm_limit < vm->vm_size + size)
  873                 return (ENOMEM);
  874 
  875         /*
  876          * Hide MAXALLOC tags so we're guaranteed to be able to add this
  877          * span and the tag we want to allocate from it.
  878          */
  879         MPASS(vm->vm_nfreetags >= BT_MAXALLOC);
  880         vm->vm_nfreetags -= BT_MAXALLOC;
  881         VMEM_UNLOCK(vm);
  882         error = (vm->vm_importfn)(vm->vm_arg, size, flags, &addr);
  883         VMEM_LOCK(vm);
  884         vm->vm_nfreetags += BT_MAXALLOC;
  885         if (error)
  886                 return (ENOMEM);
  887 
  888         vmem_add1(vm, addr, size, BT_TYPE_SPAN);
  889 
  890         return 0;
  891 }
  892 
  893 /*
  894  * vmem_fit: check if a bt can satisfy the given restrictions.
  895  *
   896  * It is the caller's responsibility to ensure the region is big
   897  * enough before calling us.
  898  */
  899 static int
  900 vmem_fit(const bt_t *bt, vmem_size_t size, vmem_size_t align,
  901     vmem_size_t phase, vmem_size_t nocross, vmem_addr_t minaddr,
  902     vmem_addr_t maxaddr, vmem_addr_t *addrp)
  903 {
  904         vmem_addr_t start;
  905         vmem_addr_t end;
  906 
  907         MPASS(size > 0);
  908         MPASS(bt->bt_size >= size); /* caller's responsibility */
  909 
  910         /*
  911          * XXX assumption: vmem_addr_t and vmem_size_t are
   912          * unsigned integers of the same size.
  913          */
  914 
  915         start = bt->bt_start;
  916         if (start < minaddr) {
  917                 start = minaddr;
  918         }
  919         end = BT_END(bt);
  920         if (end > maxaddr)
  921                 end = maxaddr;
  922         if (start > end) 
  923                 return (ENOMEM);
  924 
  925         start = VMEM_ALIGNUP(start - phase, align) + phase;
  926         if (start < bt->bt_start)
  927                 start += align;
  928         if (VMEM_CROSS_P(start, start + size - 1, nocross)) {
  929                 MPASS(align < nocross);
  930                 start = VMEM_ALIGNUP(start - phase, nocross) + phase;
  931         }
  932         if (start <= end && end - start >= size - 1) {
  933                 MPASS((start & (align - 1)) == phase);
  934                 MPASS(!VMEM_CROSS_P(start, start + size - 1, nocross));
  935                 MPASS(minaddr <= start);
  936                 MPASS(maxaddr == 0 || start + size - 1 <= maxaddr);
  937                 MPASS(bt->bt_start <= start);
  938                 MPASS(BT_END(bt) - start >= size - 1);
  939                 *addrp = start;
  940 
  941                 return (0);
  942         }
  943         return (ENOMEM);
  944 }
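       /*
        * Editor's note (illustrative, not part of the original file): a worked
        * example of the arithmetic above, assuming a byte-granular arena
        * (quantum of 1).  For a free tag covering [0x10800, 0x1ffff], a request
        * with size 0x1000, align 0x1000, phase 0, nocross 0 and unbounded
        * minaddr/maxaddr proceeds as:
        *
        *   start = 0x10800
        *   start = VMEM_ALIGNUP(0x10800 - 0, 0x1000) + 0 = 0x11000
        *   start + size - 1 = 0x11fff <= end = 0x1ffff, so *addrp = 0x11000.
        *
        * Had nocross been 0x2000, the candidate [0x11000, 0x11fff] already sits
        * inside a single 0x2000-aligned block, so no further adjustment occurs.
        */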
  945 
  946 /*
  947  * vmem_clip:  Trim the boundary tag edges to the requested start and size.
  948  */
  949 static void
  950 vmem_clip(vmem_t *vm, bt_t *bt, vmem_addr_t start, vmem_size_t size)
  951 {
  952         bt_t *btnew;
  953         bt_t *btprev;
  954 
  955         VMEM_ASSERT_LOCKED(vm);
  956         MPASS(bt->bt_type == BT_TYPE_FREE);
  957         MPASS(bt->bt_size >= size);
  958         bt_remfree(vm, bt);
  959         if (bt->bt_start != start) {
  960                 btprev = bt_alloc(vm);
  961                 btprev->bt_type = BT_TYPE_FREE;
  962                 btprev->bt_start = bt->bt_start;
  963                 btprev->bt_size = start - bt->bt_start;
  964                 bt->bt_start = start;
  965                 bt->bt_size -= btprev->bt_size;
  966                 bt_insfree(vm, btprev);
  967                 bt_insseg(vm, btprev,
  968                     TAILQ_PREV(bt, vmem_seglist, bt_seglist));
  969         }
  970         MPASS(bt->bt_start == start);
  971         if (bt->bt_size != size && bt->bt_size - size > vm->vm_quantum_mask) {
  972                 /* split */
  973                 btnew = bt_alloc(vm);
  974                 btnew->bt_type = BT_TYPE_BUSY;
  975                 btnew->bt_start = bt->bt_start;
  976                 btnew->bt_size = size;
  977                 bt->bt_start = bt->bt_start + size;
  978                 bt->bt_size -= size;
  979                 bt_insfree(vm, bt);
  980                 bt_insseg(vm, btnew,
  981                     TAILQ_PREV(bt, vmem_seglist, bt_seglist));
  982                 bt_insbusy(vm, btnew);
  983                 bt = btnew;
  984         } else {
  985                 bt->bt_type = BT_TYPE_BUSY;
  986                 bt_insbusy(vm, bt);
  987         }
  988         MPASS(bt->bt_size >= size);
  989 }
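       /*
        * Editor's note (illustrative, not part of the original file): clipping
        * a free tag covering [0x1000, 0x4fff] to a fitted range with start
        * 0x2000 and size 0x1000 (4KB quantum) leaves three tags in the
        * segment list:
        *
        *   FREE [0x1000, 0x1fff]   btprev, split off the front
        *   BUSY [0x2000, 0x2fff]   btnew, the range handed to the caller
        *   FREE [0x3000, 0x4fff]   the remainder of the original tag
        */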
  990 
  991 /* ---- vmem API */
  992 
  993 void
  994 vmem_set_import(vmem_t *vm, vmem_import_t *importfn,
  995      vmem_release_t *releasefn, void *arg, vmem_size_t import_quantum)
  996 {
  997 
  998         VMEM_LOCK(vm);
  999         vm->vm_importfn = importfn;
 1000         vm->vm_releasefn = releasefn;
 1001         vm->vm_arg = arg;
 1002         vm->vm_import_quantum = import_quantum;
 1003         VMEM_UNLOCK(vm);
 1004 }
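       #if 0
       /*
        * Editor's sketch (not part of the original file; the names are
        * hypothetical): a child arena that imports address space from a
        * parent arena in 64KB chunks and gives whole spans back when they
        * become free.  The import/release signatures mirror how vm_importfn
        * and vm_releasefn are invoked from vmem_import() and vmem_xfree().
        */
       static int
       example_import(void *arg, vmem_size_t size, int flags, vmem_addr_t *addrp)
       {
               vmem_t *parent = arg;

               /* flags already carry a fit strategy and sleep disposition. */
               return (vmem_alloc(parent, size, flags, addrp));
       }

       static void
       example_release(void *arg, vmem_addr_t addr, vmem_size_t size)
       {
               vmem_t *parent = arg;

               vmem_free(parent, addr, size);
       }

       static void
       example_setup(vmem_t *parent, vmem_t *child)
       {

               /*
                * The child would be created with size 0 so that it starts
                * empty and grows on demand, 64KB at a time.
                */
               vmem_set_import(child, example_import, example_release, parent,
                   64 * 1024);
       }
       #endif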
 1005 
 1006 void
 1007 vmem_set_limit(vmem_t *vm, vmem_size_t limit)
 1008 {
 1009 
 1010         VMEM_LOCK(vm);
 1011         vm->vm_limit = limit;
 1012         VMEM_UNLOCK(vm);
 1013 }
 1014 
 1015 void
 1016 vmem_set_reclaim(vmem_t *vm, vmem_reclaim_t *reclaimfn)
 1017 {
 1018 
 1019         VMEM_LOCK(vm);
 1020         vm->vm_reclaimfn = reclaimfn;
 1021         VMEM_UNLOCK(vm);
 1022 }
 1023 
 1024 /*
 1025  * vmem_init: Initializes vmem arena.
 1026  */
 1027 vmem_t *
 1028 vmem_init(vmem_t *vm, const char *name, vmem_addr_t base, vmem_size_t size,
 1029     vmem_size_t quantum, vmem_size_t qcache_max, int flags)
 1030 {
 1031         int i;
 1032 
 1033         MPASS(quantum > 0);
 1034         MPASS((quantum & (quantum - 1)) == 0);
 1035 
 1036         bzero(vm, sizeof(*vm));
 1037 
 1038         VMEM_CONDVAR_INIT(vm, name);
 1039         VMEM_LOCK_INIT(vm, name);
 1040         vm->vm_nfreetags = 0;
 1041         LIST_INIT(&vm->vm_freetags);
 1042         strlcpy(vm->vm_name, name, sizeof(vm->vm_name));
 1043         vm->vm_quantum_mask = quantum - 1;
 1044         vm->vm_quantum_shift = flsl(quantum) - 1;
 1045         vm->vm_nbusytag = 0;
 1046         vm->vm_size = 0;
 1047         vm->vm_limit = 0;
 1048         vm->vm_inuse = 0;
 1049         qc_init(vm, qcache_max);
 1050 
 1051         TAILQ_INIT(&vm->vm_seglist);
 1052         for (i = 0; i < VMEM_MAXORDER; i++) {
 1053                 LIST_INIT(&vm->vm_freelist[i]);
 1054         }
 1055         memset(&vm->vm_hash0, 0, sizeof(vm->vm_hash0));
 1056         vm->vm_hashsize = VMEM_HASHSIZE_MIN;
 1057         vm->vm_hashlist = vm->vm_hash0;
 1058 
 1059         if (size != 0) {
 1060                 if (vmem_add(vm, base, size, flags) != 0) {
 1061                         vmem_destroy1(vm);
 1062                         return NULL;
 1063                 }
 1064         }
 1065 
 1066         mtx_lock(&vmem_list_lock);
 1067         LIST_INSERT_HEAD(&vmem_list, vm, vm_alllist);
 1068         mtx_unlock(&vmem_list_lock);
 1069 
 1070         return vm;
 1071 }
 1072 
 1073 /*
 1074  * vmem_create: create an arena.
 1075  */
 1076 vmem_t *
 1077 vmem_create(const char *name, vmem_addr_t base, vmem_size_t size,
 1078     vmem_size_t quantum, vmem_size_t qcache_max, int flags)
 1079 {
 1080 
 1081         vmem_t *vm;
 1082 
 1083         vm = uma_zalloc(vmem_zone, flags & (M_WAITOK|M_NOWAIT));
 1084         if (vm == NULL)
 1085                 return (NULL);
 1086         if (vmem_init(vm, name, base, size, quantum, qcache_max,
 1087             flags) == NULL)
 1088                 return (NULL);
 1089         return (vm);
 1090 }
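       #if 0
       /*
        * Editor's sketch (not part of the original file; the caller is
        * hypothetical): minimal arena usage.  An arena covering
        * [0x1000, 0x1000 + 1MB) with a 4KB quantum and no quantum cache is
        * created, one quantum-rounded allocation is taken from it and later
        * returned.
        */
       static void
       example_arena_usage(void)
       {
               vmem_t *arena;
               vmem_addr_t addr;

               arena = vmem_create("example", 0x1000, 1024 * 1024, 4096, 0,
                   M_WAITOK);
               if (arena == NULL)
                       return;
               if (vmem_alloc(arena, 8192, M_BESTFIT | M_WAITOK, &addr) == 0)
                       vmem_free(arena, addr, 8192);
               vmem_destroy(arena);
       }
       #endif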
 1091 
 1092 void
 1093 vmem_destroy(vmem_t *vm)
 1094 {
 1095 
 1096         mtx_lock(&vmem_list_lock);
 1097         LIST_REMOVE(vm, vm_alllist);
 1098         mtx_unlock(&vmem_list_lock);
 1099 
 1100         vmem_destroy1(vm);
 1101 }
 1102 
 1103 vmem_size_t
 1104 vmem_roundup_size(vmem_t *vm, vmem_size_t size)
 1105 {
 1106 
 1107         return (size + vm->vm_quantum_mask) & ~vm->vm_quantum_mask;
 1108 }
 1109 
 1110 /*
 1111  * vmem_alloc: allocate resource from the arena.
 1112  */
 1113 int
 1114 vmem_alloc(vmem_t *vm, vmem_size_t size, int flags, vmem_addr_t *addrp)
 1115 {
 1116         const int strat __unused = flags & VMEM_FITMASK;
 1117         qcache_t *qc;
 1118 
 1119         flags &= VMEM_FLAGS;
 1120         MPASS(size > 0);
 1121         MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT);
 1122         if ((flags & M_NOWAIT) == 0)
 1123                 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_alloc");
 1124 
 1125         if (size <= vm->vm_qcache_max) {
 1126                 qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift];
 1127                 *addrp = (vmem_addr_t)uma_zalloc(qc->qc_cache, flags);
 1128                 if (*addrp == 0)
 1129                         return (ENOMEM);
 1130                 return (0);
 1131         }
 1132 
 1133         return vmem_xalloc(vm, size, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX,
 1134             flags, addrp);
 1135 }
 1136 
 1137 int
 1138 vmem_xalloc(vmem_t *vm, const vmem_size_t size0, vmem_size_t align,
 1139     const vmem_size_t phase, const vmem_size_t nocross,
 1140     const vmem_addr_t minaddr, const vmem_addr_t maxaddr, int flags,
 1141     vmem_addr_t *addrp)
 1142 {
 1143         const vmem_size_t size = vmem_roundup_size(vm, size0);
 1144         struct vmem_freelist *list;
 1145         struct vmem_freelist *first;
 1146         struct vmem_freelist *end;
 1147         vmem_size_t avail;
 1148         bt_t *bt;
 1149         int error;
 1150         int strat;
 1151 
 1152         flags &= VMEM_FLAGS;
 1153         strat = flags & VMEM_FITMASK;
 1154         MPASS(size0 > 0);
 1155         MPASS(size > 0);
 1156         MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT);
 1157         MPASS((flags & (M_NOWAIT|M_WAITOK)) != (M_NOWAIT|M_WAITOK));
 1158         if ((flags & M_NOWAIT) == 0)
 1159                 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_xalloc");
 1160         MPASS((align & vm->vm_quantum_mask) == 0);
 1161         MPASS((align & (align - 1)) == 0);
 1162         MPASS((phase & vm->vm_quantum_mask) == 0);
 1163         MPASS((nocross & vm->vm_quantum_mask) == 0);
 1164         MPASS((nocross & (nocross - 1)) == 0);
 1165         MPASS((align == 0 && phase == 0) || phase < align);
 1166         MPASS(nocross == 0 || nocross >= size);
 1167         MPASS(minaddr <= maxaddr);
 1168         MPASS(!VMEM_CROSS_P(phase, phase + size - 1, nocross));
 1169 
 1170         if (align == 0)
 1171                 align = vm->vm_quantum_mask + 1;
 1172 
 1173         *addrp = 0;
 1174         end = &vm->vm_freelist[VMEM_MAXORDER];
 1175         /*
 1176          * choose a free block from which we allocate.
 1177          */
 1178         first = bt_freehead_toalloc(vm, size, strat);
 1179         VMEM_LOCK(vm);
 1180         for (;;) {
 1181                 /*
 1182                  * Make sure we have enough tags to complete the
 1183                  * operation.
 1184                  */
 1185                 if (vm->vm_nfreetags < BT_MAXALLOC &&
 1186                     bt_fill(vm, flags) != 0) {
 1187                         error = ENOMEM;
 1188                         break;
 1189                 }
 1190                 /*
 1191                  * Scan freelists looking for a tag that satisfies the
 1192                  * allocation.  If we're doing BESTFIT we may encounter
 1193                  * sizes below the request.  If we're doing FIRSTFIT we
 1194                  * inspect only the first element from each list.
 1195                  */
 1196                 for (list = first; list < end; list++) {
 1197                         LIST_FOREACH(bt, list, bt_freelist) {
 1198                                 if (bt->bt_size >= size) {
 1199                                         error = vmem_fit(bt, size, align, phase,
 1200                                             nocross, minaddr, maxaddr, addrp);
 1201                                         if (error == 0) {
 1202                                                 vmem_clip(vm, bt, *addrp, size);
 1203                                                 goto out;
 1204                                         }
 1205                                 }
 1206                                 /* FIRST skips to the next list. */
 1207                                 if (strat == M_FIRSTFIT)
 1208                                         break;
 1209                         }
 1210                 }
 1211                 /*
 1212                  * Retry if the fast algorithm failed.
 1213                  */
 1214                 if (strat == M_FIRSTFIT) {
 1215                         strat = M_BESTFIT;
 1216                         first = bt_freehead_toalloc(vm, size, strat);
 1217                         continue;
 1218                 }
 1219                 /*
 1220                  * XXX it is possible to fail to meet restrictions with the
 1221                  * imported region.  It is up to the user to specify the
 1222                  * import quantum such that it can satisfy any allocation.
 1223                  */
 1224                 if (vmem_import(vm, size, align, flags) == 0)
 1225                         continue;
 1226 
 1227                 /*
 1228                  * Try to free some space from the quantum cache or reclaim
 1229                  * functions if available.
 1230                  */
 1231                 if (vm->vm_qcache_max != 0 || vm->vm_reclaimfn != NULL) {
 1232                         avail = vm->vm_size - vm->vm_inuse;
 1233                         VMEM_UNLOCK(vm);
 1234                         if (vm->vm_qcache_max != 0)
 1235                                 qc_drain(vm);
 1236                         if (vm->vm_reclaimfn != NULL)
 1237                                 vm->vm_reclaimfn(vm, flags);
 1238                         VMEM_LOCK(vm);
 1239                         /* If we were successful retry even NOWAIT. */
 1240                         if (vm->vm_size - vm->vm_inuse > avail)
 1241                                 continue;
 1242                 }
 1243                 if ((flags & M_NOWAIT) != 0) {
 1244                         error = ENOMEM;
 1245                         break;
 1246                 }
 1247                 VMEM_CONDVAR_WAIT(vm);
 1248         }
 1249 out:
 1250         VMEM_UNLOCK(vm);
 1251         if (error != 0 && (flags & M_NOWAIT) == 0)
 1252                 panic("failed to allocate waiting allocation\n");
 1253 
 1254         return (error);
 1255 }
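       #if 0
       /*
        * Editor's sketch (not part of the original file; "arena" is a
        * hypothetical 4KB-quantum arena): a constrained allocation through
        * vmem_xalloc(), asking for 8KB that is 64KB-aligned, lies below 4GB
        * and does not cross a 1MB boundary.
        */
       static int
       example_constrained_alloc(vmem_t *arena, vmem_addr_t *addrp)
       {

               return (vmem_xalloc(arena, 8 * 1024, 64 * 1024, 0, 1024 * 1024,
                   VMEM_ADDR_MIN, 0xffffffffUL, M_BESTFIT | M_NOWAIT, addrp));
       }
       #endif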
 1256 
 1257 /*
 1258  * vmem_free: free the resource to the arena.
 1259  */
 1260 void
 1261 vmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
 1262 {
 1263         qcache_t *qc;
 1264         MPASS(size > 0);
 1265 
 1266         if (size <= vm->vm_qcache_max) {
 1267                 qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift];
 1268                 uma_zfree(qc->qc_cache, (void *)addr);
 1269         } else
 1270                 vmem_xfree(vm, addr, size);
 1271 }
 1272 
 1273 void
 1274 vmem_xfree(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
 1275 {
 1276         bt_t *bt;
 1277         bt_t *t;
 1278 
 1279         MPASS(size > 0);
 1280 
 1281         VMEM_LOCK(vm);
 1282         bt = bt_lookupbusy(vm, addr);
 1283         MPASS(bt != NULL);
 1284         MPASS(bt->bt_start == addr);
 1285         MPASS(bt->bt_size == vmem_roundup_size(vm, size) ||
 1286             bt->bt_size - vmem_roundup_size(vm, size) <= vm->vm_quantum_mask);
 1287         MPASS(bt->bt_type == BT_TYPE_BUSY);
 1288         bt_rembusy(vm, bt);
 1289         bt->bt_type = BT_TYPE_FREE;
 1290 
 1291         /* coalesce */
 1292         t = TAILQ_NEXT(bt, bt_seglist);
 1293         if (t != NULL && t->bt_type == BT_TYPE_FREE) {
 1294                 MPASS(BT_END(bt) < t->bt_start);        /* YYY */
 1295                 bt->bt_size += t->bt_size;
 1296                 bt_remfree(vm, t);
 1297                 bt_remseg(vm, t);
 1298         }
 1299         t = TAILQ_PREV(bt, vmem_seglist, bt_seglist);
 1300         if (t != NULL && t->bt_type == BT_TYPE_FREE) {
 1301                 MPASS(BT_END(t) < bt->bt_start);        /* YYY */
 1302                 bt->bt_size += t->bt_size;
 1303                 bt->bt_start = t->bt_start;
 1304                 bt_remfree(vm, t);
 1305                 bt_remseg(vm, t);
 1306         }
 1307 
 1308         t = TAILQ_PREV(bt, vmem_seglist, bt_seglist);
 1309         MPASS(t != NULL);
 1310         MPASS(BT_ISSPAN_P(t) || t->bt_type == BT_TYPE_BUSY);
 1311         if (vm->vm_releasefn != NULL && t->bt_type == BT_TYPE_SPAN &&
 1312             t->bt_size == bt->bt_size) {
 1313                 vmem_addr_t spanaddr;
 1314                 vmem_size_t spansize;
 1315 
 1316                 MPASS(t->bt_start == bt->bt_start);
 1317                 spanaddr = bt->bt_start;
 1318                 spansize = bt->bt_size;
 1319                 bt_remseg(vm, bt);
 1320                 bt_remseg(vm, t);
 1321                 vm->vm_size -= spansize;
 1322                 VMEM_CONDVAR_BROADCAST(vm);
 1323                 bt_freetrim(vm, BT_MAXFREE);
 1324                 (*vm->vm_releasefn)(vm->vm_arg, spanaddr, spansize);
 1325         } else {
 1326                 bt_insfree(vm, bt);
 1327                 VMEM_CONDVAR_BROADCAST(vm);
 1328                 bt_freetrim(vm, BT_MAXFREE);
 1329         }
 1330 }
 1331 
 1332 /*
  1333  * vmem_add: add a contiguous span of addresses to the arena as a
  1334  * static span that is never released back to an import function.
 1335  */
 1336 int
 1337 vmem_add(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int flags)
 1338 {
 1339         int error;
 1340 
 1341         error = 0;
 1342         flags &= VMEM_FLAGS;
 1343         VMEM_LOCK(vm);
 1344         if (vm->vm_nfreetags >= BT_MAXALLOC || bt_fill(vm, flags) == 0)
 1345                 vmem_add1(vm, addr, size, BT_TYPE_SPAN_STATIC);
 1346         else
 1347                 error = ENOMEM;
 1348         VMEM_UNLOCK(vm);
 1349 
 1350         return (error);
 1351 }
 1352 
 1353 /*
  1354  * vmem_size: report arena usage according to the requested type mask.
 1355  */
 1356 vmem_size_t
 1357 vmem_size(vmem_t *vm, int typemask)
 1358 {
 1359         int i;
 1360 
 1361         switch (typemask) {
 1362         case VMEM_ALLOC:
 1363                 return vm->vm_inuse;
 1364         case VMEM_FREE:
 1365                 return vm->vm_size - vm->vm_inuse;
 1366         case VMEM_FREE|VMEM_ALLOC:
 1367                 return vm->vm_size;
 1368         case VMEM_MAXFREE:
 1369                 VMEM_LOCK(vm);
 1370                 for (i = VMEM_MAXORDER - 1; i >= 0; i--) {
 1371                         if (LIST_EMPTY(&vm->vm_freelist[i]))
 1372                                 continue;
 1373                         VMEM_UNLOCK(vm);
 1374                         return ((vmem_size_t)ORDER2SIZE(i) <<
 1375                             vm->vm_quantum_shift);
 1376                 }
 1377                 VMEM_UNLOCK(vm);
 1378                 return (0);
 1379         default:
 1380                 panic("vmem_size");
 1381         }
 1382 }
 1383 
 1384 /* ---- debug */
 1385 
 1386 #if defined(DDB) || defined(DIAGNOSTIC)
 1387 
 1388 static void bt_dump(const bt_t *, int (*)(const char *, ...)
 1389     __printflike(1, 2));
 1390 
 1391 static const char *
 1392 bt_type_string(int type)
 1393 {
 1394 
 1395         switch (type) {
 1396         case BT_TYPE_BUSY:
 1397                 return "busy";
 1398         case BT_TYPE_FREE:
 1399                 return "free";
 1400         case BT_TYPE_SPAN:
 1401                 return "span";
 1402         case BT_TYPE_SPAN_STATIC:
 1403                 return "static span";
 1404         default:
 1405                 break;
 1406         }
 1407         return "BOGUS";
 1408 }
 1409 
 1410 static void
 1411 bt_dump(const bt_t *bt, int (*pr)(const char *, ...))
 1412 {
 1413 
 1414         (*pr)("\t%p: %jx %jx, %d(%s)\n",
 1415             bt, (intmax_t)bt->bt_start, (intmax_t)bt->bt_size,
 1416             bt->bt_type, bt_type_string(bt->bt_type));
 1417 }
 1418 
 1419 static void
 1420 vmem_dump(const vmem_t *vm , int (*pr)(const char *, ...) __printflike(1, 2))
 1421 {
 1422         const bt_t *bt;
 1423         int i;
 1424 
 1425         (*pr)("vmem %p '%s'\n", vm, vm->vm_name);
 1426         TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
 1427                 bt_dump(bt, pr);
 1428         }
 1429 
 1430         for (i = 0; i < VMEM_MAXORDER; i++) {
 1431                 const struct vmem_freelist *fl = &vm->vm_freelist[i];
 1432 
 1433                 if (LIST_EMPTY(fl)) {
 1434                         continue;
 1435                 }
 1436 
 1437                 (*pr)("freelist[%d]\n", i);
 1438                 LIST_FOREACH(bt, fl, bt_freelist) {
 1439                         bt_dump(bt, pr);
 1440                 }
 1441         }
 1442 }
 1443 
 1444 #endif /* defined(DDB) || defined(DIAGNOSTIC) */
 1445 
 1446 #if defined(DDB)
 1447 #include <ddb/ddb.h>
 1448 
 1449 static bt_t *
 1450 vmem_whatis_lookup(vmem_t *vm, vmem_addr_t addr)
 1451 {
 1452         bt_t *bt;
 1453 
 1454         TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
 1455                 if (BT_ISSPAN_P(bt)) {
 1456                         continue;
 1457                 }
 1458                 if (bt->bt_start <= addr && addr <= BT_END(bt)) {
 1459                         return bt;
 1460                 }
 1461         }
 1462 
 1463         return NULL;
 1464 }
 1465 
 1466 void
 1467 vmem_whatis(vmem_addr_t addr, int (*pr)(const char *, ...))
 1468 {
 1469         vmem_t *vm;
 1470 
 1471         LIST_FOREACH(vm, &vmem_list, vm_alllist) {
 1472                 bt_t *bt;
 1473 
 1474                 bt = vmem_whatis_lookup(vm, addr);
 1475                 if (bt == NULL) {
 1476                         continue;
 1477                 }
 1478                 (*pr)("%p is %p+%zu in VMEM '%s' (%s)\n",
 1479                     (void *)addr, (void *)bt->bt_start,
 1480                     (vmem_size_t)(addr - bt->bt_start), vm->vm_name,
 1481                     (bt->bt_type == BT_TYPE_BUSY) ? "allocated" : "free");
 1482         }
 1483 }
 1484 
 1485 void
 1486 vmem_printall(const char *modif, int (*pr)(const char *, ...))
 1487 {
 1488         const vmem_t *vm;
 1489 
 1490         LIST_FOREACH(vm, &vmem_list, vm_alllist) {
 1491                 vmem_dump(vm, pr);
 1492         }
 1493 }
 1494 
 1495 void
 1496 vmem_print(vmem_addr_t addr, const char *modif, int (*pr)(const char *, ...))
 1497 {
 1498         const vmem_t *vm = (const void *)addr;
 1499 
 1500         vmem_dump(vm, pr);
 1501 }
 1502 
 1503 DB_SHOW_COMMAND(vmemdump, vmemdump)
 1504 {
 1505 
 1506         if (!have_addr) {
 1507                 db_printf("usage: show vmemdump <addr>\n");
 1508                 return;
 1509         }
 1510 
 1511         vmem_dump((const vmem_t *)addr, db_printf);
 1512 }
 1513 
 1514 DB_SHOW_ALL_COMMAND(vmemdump, vmemdumpall)
 1515 {
 1516         const vmem_t *vm;
 1517 
 1518         LIST_FOREACH(vm, &vmem_list, vm_alllist)
 1519                 vmem_dump(vm, db_printf);
 1520 }
 1521 
 1522 DB_SHOW_COMMAND(vmem, vmem_summ)
 1523 {
 1524         const vmem_t *vm = (const void *)addr;
 1525         const bt_t *bt;
 1526         size_t ft[VMEM_MAXORDER], ut[VMEM_MAXORDER];
 1527         size_t fs[VMEM_MAXORDER], us[VMEM_MAXORDER];
 1528         int ord;
 1529 
 1530         if (!have_addr) {
 1531                 db_printf("usage: show vmem <addr>\n");
 1532                 return;
 1533         }
 1534 
 1535         db_printf("vmem %p '%s'\n", vm, vm->vm_name);
 1536         db_printf("\tquantum:\t%zu\n", vm->vm_quantum_mask + 1);
 1537         db_printf("\tsize:\t%zu\n", vm->vm_size);
 1538         db_printf("\tinuse:\t%zu\n", vm->vm_inuse);
 1539         db_printf("\tfree:\t%zu\n", vm->vm_size - vm->vm_inuse);
 1540         db_printf("\tbusy tags:\t%d\n", vm->vm_nbusytag);
 1541         db_printf("\tfree tags:\t%d\n", vm->vm_nfreetags);
 1542 
 1543         memset(&ft, 0, sizeof(ft));
 1544         memset(&ut, 0, sizeof(ut));
 1545         memset(&fs, 0, sizeof(fs));
 1546         memset(&us, 0, sizeof(us));
 1547         TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
 1548                 ord = SIZE2ORDER(bt->bt_size >> vm->vm_quantum_shift);
 1549                 if (bt->bt_type == BT_TYPE_BUSY) {
 1550                         ut[ord]++;
 1551                         us[ord] += bt->bt_size;
 1552                 } else if (bt->bt_type == BT_TYPE_FREE) {
 1553                         ft[ord]++;
 1554                         fs[ord] += bt->bt_size;
 1555                 }
 1556         }
 1557         db_printf("\t\t\tinuse\tsize\t\tfree\tsize\n");
 1558         for (ord = 0; ord < VMEM_MAXORDER; ord++) {
 1559                 if (ut[ord] == 0 && ft[ord] == 0)
 1560                         continue;
 1561                 db_printf("\t%-15zu %zu\t%-15zu %zu\t%-16zu\n",
 1562                     ORDER2SIZE(ord) << vm->vm_quantum_shift,
 1563                     ut[ord], us[ord], ft[ord], fs[ord]);
 1564         }
 1565 }
 1566 
 1567 DB_SHOW_ALL_COMMAND(vmem, vmem_summall)
 1568 {
 1569         const vmem_t *vm;
 1570 
 1571         LIST_FOREACH(vm, &vmem_list, vm_alllist)
 1572                 vmem_summ((db_expr_t)vm, TRUE, count, modif);
 1573 }
 1574 #endif /* defined(DDB) */
 1575 
 1576 #define vmem_printf printf
 1577 
 1578 #if defined(DIAGNOSTIC)
 1579 
 1580 static bool
 1581 vmem_check_sanity(vmem_t *vm)
 1582 {
 1583         const bt_t *bt, *bt2;
 1584 
 1585         MPASS(vm != NULL);
 1586 
 1587         TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
 1588                 if (bt->bt_start > BT_END(bt)) {
 1589                         printf("corrupted tag\n");
 1590                         bt_dump(bt, vmem_printf);
 1591                         return false;
 1592                 }
 1593         }
 1594         TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
 1595                 TAILQ_FOREACH(bt2, &vm->vm_seglist, bt_seglist) {
 1596                         if (bt == bt2) {
 1597                                 continue;
 1598                         }
 1599                         if (BT_ISSPAN_P(bt) != BT_ISSPAN_P(bt2)) {
 1600                                 continue;
 1601                         }
 1602                         if (bt->bt_start <= BT_END(bt2) &&
 1603                             bt2->bt_start <= BT_END(bt)) {
  1604                                 printf("overlapping tags\n");
 1605                                 bt_dump(bt, vmem_printf);
 1606                                 bt_dump(bt2, vmem_printf);
 1607                                 return false;
 1608                         }
 1609                 }
 1610         }
 1611 
 1612         return true;
 1613 }
 1614 
 1615 static void
 1616 vmem_check(vmem_t *vm)
 1617 {
 1618 
 1619         if (!vmem_check_sanity(vm)) {
 1620                 panic("insanity vmem %p", vm);
 1621         }
 1622 }
 1623 
 1624 #endif /* defined(DIAGNOSTIC) */
