FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_reserv.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2002-2006 Rice University
    5  * Copyright (c) 2007-2011 Alan L. Cox <alc@cs.rice.edu>
    6  * All rights reserved.
    7  *
    8  * This software was developed for the FreeBSD Project by Alan L. Cox,
    9  * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
   10  *
   11  * Redistribution and use in source and binary forms, with or without
   12  * modification, are permitted provided that the following conditions
   13  * are met:
   14  * 1. Redistributions of source code must retain the above copyright
   15  *    notice, this list of conditions and the following disclaimer.
   16  * 2. Redistributions in binary form must reproduce the above copyright
   17  *    notice, this list of conditions and the following disclaimer in the
   18  *    documentation and/or other materials provided with the distribution.
   19  *
   20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   23  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
   24  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
   27  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
   28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
   30  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   31  * POSSIBILITY OF SUCH DAMAGE.
   32  */
   33 
   34 /*
   35  *      Superpage reservation management module
   36  *
   37  * Any external functions defined by this module are only to be used by the
   38  * virtual memory system.
   39  */
   40 
   41 #include <sys/cdefs.h>
   42 __FBSDID("$FreeBSD$");
   43 
   44 #include "opt_vm.h"
   45 
   46 #include <sys/param.h>
   47 #include <sys/kernel.h>
   48 #include <sys/lock.h>
   49 #include <sys/malloc.h>
   50 #include <sys/mutex.h>
   51 #include <sys/queue.h>
   52 #include <sys/rwlock.h>
   53 #include <sys/sbuf.h>
   54 #include <sys/sysctl.h>
   55 #include <sys/systm.h>
   56 #include <sys/counter.h>
   57 #include <sys/ktr.h>
   58 #include <sys/vmmeter.h>
   59 #include <sys/smp.h>
   60 
   61 #include <vm/vm.h>
   62 #include <vm/vm_param.h>
   63 #include <vm/vm_object.h>
   64 #include <vm/vm_page.h>
   65 #include <vm/vm_pageout.h>
   66 #include <vm/vm_pagequeue.h>
   67 #include <vm/vm_phys.h>
   68 #include <vm/vm_radix.h>
   69 #include <vm/vm_reserv.h>
   70 
   71 /*
   72  * The reservation system supports the speculative allocation of large physical
   73  * pages ("superpages").  Speculative allocation enables the fully automatic
   74  * utilization of superpages by the virtual memory system.  In other words, no
   75  * programmatic directives are required to use superpages.
   76  */
   77 
   78 #if VM_NRESERVLEVEL > 0
   79 
   80 #ifndef VM_LEVEL_0_ORDER_MAX
   81 #define VM_LEVEL_0_ORDER_MAX    VM_LEVEL_0_ORDER
   82 #endif
   83 
   84 /*
   85  * The number of small pages that are contained in a level 0 reservation
   86  */
   87 #define VM_LEVEL_0_NPAGES       (1 << VM_LEVEL_0_ORDER)
   88 #define VM_LEVEL_0_NPAGES_MAX   (1 << VM_LEVEL_0_ORDER_MAX)
   89 
   90 /*
   91  * The number of bits by which a physical address is shifted to obtain the
   92  * reservation number
   93  */
   94 #define VM_LEVEL_0_SHIFT        (VM_LEVEL_0_ORDER + PAGE_SHIFT)
   95 
   96 /*
   97  * The size of a level 0 reservation in bytes
   98  */
   99 #define VM_LEVEL_0_SIZE         (1 << VM_LEVEL_0_SHIFT)
  100 
  101 /*
  102  * Computes the index of the small page underlying the given (object, pindex)
  103  * within the reservation's array of small pages.
  104  */
  105 #define VM_RESERV_INDEX(object, pindex) \
  106     (((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1))
  107 
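A minimal illustrative sketch (not part of the upstream file): because
VM_LEVEL_0_NPAGES is a power of two, the mask above is simply a modulo by the
reservation size.  Assuming VM_LEVEL_0_ORDER is 9 (512 base pages, as on
amd64), an object with pg_color 5 and a page at pindex 1000 land in slot
(5 + 1000) & 511 == 493.

/*
 * Open-coded equivalent of VM_RESERV_INDEX() (illustration only).
 */
static __inline int
vm_reserv_index_example(vm_object_t object, vm_pindex_t pindex)
{

	return ((int)((object->pg_color + pindex) % VM_LEVEL_0_NPAGES));
}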
  108 /*
  109  * The size of a population map entry
  110  */
  111 typedef u_long          popmap_t;
  112 
  113 /*
  114  * The number of bits in a population map entry
  115  */
  116 #define NBPOPMAP        (NBBY * sizeof(popmap_t))
  117 
  118 /*
  119  * The number of population map entries in a reservation
  120  */
  121 #define NPOPMAP         howmany(VM_LEVEL_0_NPAGES, NBPOPMAP)
  122 #define NPOPMAP_MAX     howmany(VM_LEVEL_0_NPAGES_MAX, NBPOPMAP)
  123 
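To make these sizes concrete (an illustrative note, not upstream text): with a
64-bit popmap_t and VM_LEVEL_0_ORDER == 9, NBPOPMAP is 64 and NPOPMAP is
howmany(512, 64) == 8, so one reservation's population map occupies 64 bytes.
A hedged compile-time sanity check of how the two macros relate:

/* The popmap storage in struct vm_reserv is sized by NPOPMAP_MAX. */
_Static_assert(NPOPMAP_MAX >= NPOPMAP,
    "NPOPMAP_MAX must cover NPOPMAP for this configuration");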
  124 /*
  125  * Number of elapsed ticks before we update the LRU queue position.  Used
  126  * to reduce contention and churn on the list.
  127  */
  128 #define PARTPOPSLOP     1
  129 
  130 /*
  131  * Clear a bit in the population map.
  132  */
  133 static __inline void
  134 popmap_clear(popmap_t popmap[], int i)
  135 {
  136 
  137         popmap[i / NBPOPMAP] &= ~(1UL << (i % NBPOPMAP));
  138 }
  139 
  140 /*
  141  * Set a bit in the population map.
  142  */
  143 static __inline void
  144 popmap_set(popmap_t popmap[], int i)
  145 {
  146 
  147         popmap[i / NBPOPMAP] |= 1UL << (i % NBPOPMAP);
  148 }
  149 
  150 /*
  151  * Is a bit in the population map clear?
  152  */
  153 static __inline boolean_t
  154 popmap_is_clear(popmap_t popmap[], int i)
  155 {
  156 
  157         return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) == 0);
  158 }
  159 
  160 /*
  161  * Is a bit in the population map set?
  162  */
  163 static __inline boolean_t
  164 popmap_is_set(popmap_t popmap[], int i)
  165 {
  166 
  167         return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) != 0);
  168 }
  169 
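A brief usage sketch of the helpers above (illustration only; the local array
and the bit index are hypothetical).  On a 64-bit popmap_t, bit 70 lives in
word 70 / 64 == 1 at position 70 % 64 == 6:

static __inline void
popmap_example(void)
{
	popmap_t map[NPOPMAP_MAX] = { 0 };

	popmap_set(map, 70);		/* map[1] |= 1UL << 6 */
	KASSERT(popmap_is_set(map, 70), ("popmap_example: bit 70 not set"));
	popmap_clear(map, 70);		/* map[1] &= ~(1UL << 6) */
	KASSERT(popmap_is_clear(map, 70), ("popmap_example: bit 70 not clear"));
}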
  170 /*
  171  * The reservation structure
  172  *
  173  * A reservation structure is constructed whenever a large physical page is
  174  * speculatively allocated to an object.  The reservation provides the small
  175  * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets
  176  * within that object.  The reservation's "popcnt" tracks the number of these
  177  * small physical pages that are in use at any given time.  When and if the
  178  * reservation is not fully utilized, it appears in the queue of partially
  179  * populated reservations.  The reservation always appears on the containing
  180  * object's list of reservations.
  181  *
  182  * A partially populated reservation can be broken and reclaimed at any time.
  183  *
  184  * c - constant after boot
  185  * d - vm_reserv_domain_lock
  186  * o - vm_reserv_object_lock
  187  * r - vm_reserv_lock
  188  * s - vm_reserv_domain_scan_lock
  189  */
  190 struct vm_reserv {
  191         struct mtx      lock;                   /* reservation lock. */
  192         TAILQ_ENTRY(vm_reserv) partpopq;        /* (d, r) per-domain queue. */
  193         LIST_ENTRY(vm_reserv) objq;             /* (o, r) object queue */
  194         vm_object_t     object;                 /* (o, r) containing object */
  195         vm_pindex_t     pindex;                 /* (o, r) offset in object */
  196         vm_page_t       pages;                  /* (c) first page  */
  197         uint16_t        popcnt;                 /* (r) # of pages in use */
  198         uint8_t         domain;                 /* (c) NUMA domain. */
  199         char            inpartpopq;             /* (d, r) */
  200         int             lasttick;               /* (r) last pop update tick. */
  201         popmap_t        popmap[NPOPMAP_MAX];    /* (r) bit vector, used pages */
  202 };
  203 
  204 TAILQ_HEAD(vm_reserv_queue, vm_reserv);
  205 
  206 #define vm_reserv_lockptr(rv)           (&(rv)->lock)
  207 #define vm_reserv_assert_locked(rv)                                     \
  208             mtx_assert(vm_reserv_lockptr(rv), MA_OWNED)
  209 #define vm_reserv_lock(rv)              mtx_lock(vm_reserv_lockptr(rv))
  210 #define vm_reserv_trylock(rv)           mtx_trylock(vm_reserv_lockptr(rv))
  211 #define vm_reserv_unlock(rv)            mtx_unlock(vm_reserv_lockptr(rv))
  212 
  213 /*
  214  * The reservation array
  215  *
   216  * This array is analogous in function to vm_page_array.  It differs in the
   217  * respect that it may contain a greater number of reservation structures
   218  * than there are (physical) superpages.  These "invalid" reservation
   219  * structures exist to trade off space for time in the
  220  * implementation of vm_reserv_from_page().  Invalid reservation structures are
  221  * distinguishable from "valid" reservation structures by inspecting the
  222  * reservation's "pages" field.  Invalid reservation structures have a NULL
  223  * "pages" field.
  224  *
  225  * vm_reserv_from_page() maps a small (physical) page to an element of this
  226  * array by computing a physical reservation number from the page's physical
  227  * address.  The physical reservation number is used as the array index.
  228  *
  229  * An "active" reservation is a valid reservation structure that has a non-NULL
  230  * "object" field and a non-zero "popcnt" field.  In other words, every active
  231  * reservation belongs to a particular object.  Moreover, every active
  232  * reservation has an entry in the containing object's list of reservations.  
  233  */
  234 static vm_reserv_t vm_reserv_array;
  235 
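A worked illustration of the dense (non-VM_PHYSSEG_SPARSE) mapping described
above, assuming 2 MB reservations (VM_LEVEL_0_SHIFT == 21): a page at physical
address 0x40300000 has reservation number 0x40300000 >> 21 == 0x201 and shares
vm_reserv_array[0x201] with every other page in that 2 MB-aligned block.

/*
 * Sketch of the dense lookup (illustration only; the real lookup, which
 * also handles VM_PHYSSEG_SPARSE, is vm_reserv_from_page() below).
 */
static __inline vm_reserv_t
vm_reserv_from_paddr_example(vm_paddr_t pa)
{

	return (&vm_reserv_array[pa >> VM_LEVEL_0_SHIFT]);
}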
  236 /*
  237  * The per-domain partially populated reservation queues
  238  *
  239  * These queues enable the fast recovery of an unused free small page from a
  240  * partially populated reservation.  The reservation at the head of a queue
  241  * is the least recently changed, partially populated reservation.
  242  *
  243  * Access to this queue is synchronized by the per-domain reservation lock.
  244  * Threads reclaiming free pages from the queue must hold the per-domain scan
  245  * lock.
  246  */
  247 struct vm_reserv_domain {
  248         struct mtx              lock;
  249         struct vm_reserv_queue  partpop;        /* (d) */
  250         struct vm_reserv        marker;         /* (d, s) scan marker/lock */
  251 } __aligned(CACHE_LINE_SIZE);
  252 
  253 static struct vm_reserv_domain vm_rvd[MAXMEMDOM];
  254 
  255 #define vm_reserv_domain_lockptr(d)     (&vm_rvd[(d)].lock)
  256 #define vm_reserv_domain_assert_locked(d)       \
  257         mtx_assert(vm_reserv_domain_lockptr(d), MA_OWNED)
  258 #define vm_reserv_domain_lock(d)        mtx_lock(vm_reserv_domain_lockptr(d))
  259 #define vm_reserv_domain_unlock(d)      mtx_unlock(vm_reserv_domain_lockptr(d))
  260 
  261 #define vm_reserv_domain_scan_lock(d)   mtx_lock(&vm_rvd[(d)].marker.lock)
  262 #define vm_reserv_domain_scan_unlock(d) mtx_unlock(&vm_rvd[(d)].marker.lock)
  263 
  264 static SYSCTL_NODE(_vm, OID_AUTO, reserv, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
  265     "Reservation Info");
  266 
  267 static COUNTER_U64_DEFINE_EARLY(vm_reserv_broken);
  268 SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, broken, CTLFLAG_RD,
  269     &vm_reserv_broken, "Cumulative number of broken reservations");
  270 
  271 static COUNTER_U64_DEFINE_EARLY(vm_reserv_freed);
  272 SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD,
  273     &vm_reserv_freed, "Cumulative number of freed reservations");
  274 
  275 static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS);
  276 
  277 SYSCTL_PROC(_vm_reserv, OID_AUTO, fullpop, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD,
  278     NULL, 0, sysctl_vm_reserv_fullpop, "I", "Current number of full reservations");
  279 
  280 static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS);
  281 
  282 SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq,
  283     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
  284     sysctl_vm_reserv_partpopq, "A",
  285     "Partially populated reservation queues");
  286 
  287 static COUNTER_U64_DEFINE_EARLY(vm_reserv_reclaimed);
  288 SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD,
  289     &vm_reserv_reclaimed, "Cumulative number of reclaimed reservations");
  290 
  291 /*
   292  * The object lock pool is used to synchronize the rvq.  We cannot use a
   293  * pool mutex because this lock is needed before malloc(9) works.
  294  *
  295  * The "hash" function could be made faster without divide and modulo.
  296  */
  297 #define VM_RESERV_OBJ_LOCK_COUNT        MAXCPU
  298 
  299 struct mtx_padalign vm_reserv_object_mtx[VM_RESERV_OBJ_LOCK_COUNT];
  300 
  301 #define vm_reserv_object_lock_idx(object)                       \
  302             (((uintptr_t)object / sizeof(*object)) % VM_RESERV_OBJ_LOCK_COUNT)
  303 #define vm_reserv_object_lock_ptr(object)                       \
  304             &vm_reserv_object_mtx[vm_reserv_object_lock_idx((object))]
  305 #define vm_reserv_object_lock(object)                           \
  306             mtx_lock(vm_reserv_object_lock_ptr((object)))
  307 #define vm_reserv_object_unlock(object)                         \
  308             mtx_unlock(vm_reserv_object_lock_ptr((object)))
  309 
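A hedged sketch of the divide-free variant alluded to above; the shift constant
and the power-of-two lock count are assumptions, not upstream code:

/*
 * If the lock count were rounded up to a power of two, the index could be
 * computed with a shift and a mask instead of a division and a modulo.
 */
#define	VM_RESERV_OBJ_LOCK_SHIFT	8	/* assumed ~log2(sizeof(*object)) */
#define	vm_reserv_object_lock_idx_fast(object)				\
	    (((uintptr_t)(object) >> VM_RESERV_OBJ_LOCK_SHIFT) &	\
	    (VM_RESERV_OBJ_LOCK_COUNT - 1))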
  310 static void             vm_reserv_break(vm_reserv_t rv);
  311 static void             vm_reserv_depopulate(vm_reserv_t rv, int index);
  312 static vm_reserv_t      vm_reserv_from_page(vm_page_t m);
  313 static boolean_t        vm_reserv_has_pindex(vm_reserv_t rv,
  314                             vm_pindex_t pindex);
  315 static void             vm_reserv_populate(vm_reserv_t rv, int index);
  316 static void             vm_reserv_reclaim(vm_reserv_t rv);
  317 
  318 /*
  319  * Returns the current number of full reservations.
  320  *
  321  * Since the number of full reservations is computed without acquiring any
  322  * locks, the returned value is inexact.
  323  */
  324 static int
  325 sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS)
  326 {
  327         vm_paddr_t paddr;
  328         struct vm_phys_seg *seg;
  329         vm_reserv_t rv;
  330         int fullpop, segind;
  331 
  332         fullpop = 0;
  333         for (segind = 0; segind < vm_phys_nsegs; segind++) {
  334                 seg = &vm_phys_segs[segind];
  335                 paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
  336 #ifdef VM_PHYSSEG_SPARSE
  337                 rv = seg->first_reserv + (paddr >> VM_LEVEL_0_SHIFT) -
  338                     (seg->start >> VM_LEVEL_0_SHIFT);
  339 #else
  340                 rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT];
  341 #endif
  342                 while (paddr + VM_LEVEL_0_SIZE > paddr && paddr +
  343                     VM_LEVEL_0_SIZE <= seg->end) {
  344                         fullpop += rv->popcnt == VM_LEVEL_0_NPAGES;
  345                         paddr += VM_LEVEL_0_SIZE;
  346                         rv++;
  347                 }
  348         }
  349         return (sysctl_handle_int(oidp, &fullpop, 0, req));
  350 }
  351 
  352 /*
   353  * Describes the current state of the partially populated reservation queues.
  354  */
  355 static int
  356 sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS)
  357 {
  358         struct sbuf sbuf;
  359         vm_reserv_t rv;
  360         int counter, error, domain, level, unused_pages;
  361 
  362         error = sysctl_wire_old_buffer(req, 0);
  363         if (error != 0)
  364                 return (error);
  365         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
  366         sbuf_printf(&sbuf, "\nDOMAIN    LEVEL     SIZE  NUMBER\n\n");
  367         for (domain = 0; domain < vm_ndomains; domain++) {
  368                 for (level = -1; level <= VM_NRESERVLEVEL - 2; level++) {
  369                         counter = 0;
  370                         unused_pages = 0;
  371                         vm_reserv_domain_lock(domain);
  372                         TAILQ_FOREACH(rv, &vm_rvd[domain].partpop, partpopq) {
  373                                 if (rv == &vm_rvd[domain].marker)
  374                                         continue;
  375                                 counter++;
  376                                 unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt;
  377                         }
  378                         vm_reserv_domain_unlock(domain);
  379                         sbuf_printf(&sbuf, "%6d, %7d, %6dK, %6d\n",
  380                             domain, level,
  381                             unused_pages * ((int)PAGE_SIZE / 1024), counter);
  382                 }
  383         }
  384         error = sbuf_finish(&sbuf);
  385         sbuf_delete(&sbuf);
  386         return (error);
  387 }
  388 
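For reference (hypothetical numbers, and assuming a single reservation level so
only level -1 is reported), the handler above renders output of this shape when
read with sysctl(8):

# sysctl vm.reserv.partpopq
vm.reserv.partpopq: 
DOMAIN    LEVEL     SIZE  NUMBER

     0,      -1,  40960K,      55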
  389 /*
  390  * Remove a reservation from the object's objq.
  391  */
  392 static void
  393 vm_reserv_remove(vm_reserv_t rv)
  394 {
  395         vm_object_t object;
  396 
  397         vm_reserv_assert_locked(rv);
  398         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
  399             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
  400         KASSERT(rv->object != NULL,
  401             ("vm_reserv_remove: reserv %p is free", rv));
  402         KASSERT(!rv->inpartpopq,
  403             ("vm_reserv_remove: reserv %p's inpartpopq is TRUE", rv));
  404         object = rv->object;
  405         vm_reserv_object_lock(object);
  406         LIST_REMOVE(rv, objq);
  407         rv->object = NULL;
  408         vm_reserv_object_unlock(object);
  409 }
  410 
  411 /*
  412  * Insert a new reservation into the object's objq.
  413  */
  414 static void
  415 vm_reserv_insert(vm_reserv_t rv, vm_object_t object, vm_pindex_t pindex)
  416 {
  417         int i;
  418 
  419         vm_reserv_assert_locked(rv);
  420         CTR6(KTR_VM,
  421             "%s: rv %p(%p) object %p new %p popcnt %d",
  422             __FUNCTION__, rv, rv->pages, rv->object, object,
  423            rv->popcnt);
  424         KASSERT(rv->object == NULL,
  425             ("vm_reserv_insert: reserv %p isn't free", rv));
  426         KASSERT(rv->popcnt == 0,
  427             ("vm_reserv_insert: reserv %p's popcnt is corrupted", rv));
  428         KASSERT(!rv->inpartpopq,
  429             ("vm_reserv_insert: reserv %p's inpartpopq is TRUE", rv));
  430         for (i = 0; i < NPOPMAP; i++)
  431                 KASSERT(rv->popmap[i] == 0,
  432                     ("vm_reserv_insert: reserv %p's popmap is corrupted", rv));
  433         vm_reserv_object_lock(object);
  434         rv->pindex = pindex;
  435         rv->object = object;
  436         rv->lasttick = ticks;
  437         LIST_INSERT_HEAD(&object->rvq, rv, objq);
  438         vm_reserv_object_unlock(object);
  439 }
  440 
  441 /*
  442  * Reduces the given reservation's population count.  If the population count
  443  * becomes zero, the reservation is destroyed.  Additionally, moves the
  444  * reservation to the tail of the partially populated reservation queue if the
  445  * population count is non-zero.
  446  */
  447 static void
  448 vm_reserv_depopulate(vm_reserv_t rv, int index)
  449 {
  450         struct vm_domain *vmd;
  451 
  452         vm_reserv_assert_locked(rv);
  453         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
  454             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
  455         KASSERT(rv->object != NULL,
  456             ("vm_reserv_depopulate: reserv %p is free", rv));
  457         KASSERT(popmap_is_set(rv->popmap, index),
  458             ("vm_reserv_depopulate: reserv %p's popmap[%d] is clear", rv,
  459             index));
  460         KASSERT(rv->popcnt > 0,
  461             ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv));
  462         KASSERT(rv->domain < vm_ndomains,
  463             ("vm_reserv_depopulate: reserv %p's domain is corrupted %d",
  464             rv, rv->domain));
  465         if (rv->popcnt == VM_LEVEL_0_NPAGES) {
  466                 KASSERT(rv->pages->psind == 1,
  467                     ("vm_reserv_depopulate: reserv %p is already demoted",
  468                     rv));
  469                 rv->pages->psind = 0;
  470         }
  471         popmap_clear(rv->popmap, index);
  472         rv->popcnt--;
  473         if ((unsigned)(ticks - rv->lasttick) >= PARTPOPSLOP ||
  474             rv->popcnt == 0) {
  475                 vm_reserv_domain_lock(rv->domain);
  476                 if (rv->inpartpopq) {
  477                         TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
  478                         rv->inpartpopq = FALSE;
  479                 }
  480                 if (rv->popcnt != 0) {
  481                         rv->inpartpopq = TRUE;
  482                         TAILQ_INSERT_TAIL(&vm_rvd[rv->domain].partpop, rv,
  483                             partpopq);
  484                 }
  485                 vm_reserv_domain_unlock(rv->domain);
  486                 rv->lasttick = ticks;
  487         }
  488         vmd = VM_DOMAIN(rv->domain);
  489         if (rv->popcnt == 0) {
  490                 vm_reserv_remove(rv);
  491                 vm_domain_free_lock(vmd);
  492                 vm_phys_free_pages(rv->pages, VM_LEVEL_0_ORDER);
  493                 vm_domain_free_unlock(vmd);
  494                 counter_u64_add(vm_reserv_freed, 1);
  495         }
  496         vm_domain_freecnt_inc(vmd, 1);
  497 }
  498 
  499 /*
  500  * Returns the reservation to which the given page might belong.
  501  */
  502 static __inline vm_reserv_t
  503 vm_reserv_from_page(vm_page_t m)
  504 {
  505 #ifdef VM_PHYSSEG_SPARSE
  506         struct vm_phys_seg *seg;
  507 
  508         seg = &vm_phys_segs[m->segind];
  509         return (seg->first_reserv + (VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT) -
  510             (seg->start >> VM_LEVEL_0_SHIFT));
  511 #else
  512         return (&vm_reserv_array[VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT]);
  513 #endif
  514 }
  515 
  516 /*
   517  * Returns an existing reservation or NULL, and initializes "*msuccp".
  518  */
  519 static vm_reserv_t
  520 vm_reserv_from_object(vm_object_t object, vm_pindex_t pindex,
  521     vm_page_t mpred, vm_page_t *msuccp)
  522 {
  523         vm_reserv_t rv;
  524         vm_page_t msucc;
  525 
  526         msucc = NULL;
  527         if (mpred != NULL) {
  528                 KASSERT(mpred->object == object,
  529                     ("vm_reserv_from_object: object doesn't contain mpred"));
  530                 KASSERT(mpred->pindex < pindex,
  531                     ("vm_reserv_from_object: mpred doesn't precede pindex"));
  532                 rv = vm_reserv_from_page(mpred);
  533                 if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
  534                         goto found;
  535                 msucc = TAILQ_NEXT(mpred, listq);
  536         } else
  537                 msucc = TAILQ_FIRST(&object->memq);
  538         if (msucc != NULL) {
  539                 KASSERT(msucc->pindex > pindex,
  540                     ("vm_reserv_from_object: msucc doesn't succeed pindex"));
  541                 rv = vm_reserv_from_page(msucc);
  542                 if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
  543                         goto found;
  544         }
  545         rv = NULL;
  546 
  547 found:
  548         *msuccp = msucc;
  549 
  550         return (rv);
  551 }
  552 
  553 /*
  554  * Returns TRUE if the given reservation contains the given page index and
  555  * FALSE otherwise.
  556  */
  557 static __inline boolean_t
  558 vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex)
  559 {
  560 
  561         return (((pindex - rv->pindex) & ~(VM_LEVEL_0_NPAGES - 1)) == 0);
  562 }
  563 
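A worked check of the mask trick above (hypothetical values, assuming
VM_LEVEL_0_NPAGES == 512):

/*
 * With rv->pindex == 1024, pindex 1300 lies inside the reservation because
 * (1300 - 1024) & ~511 == 276 & ~511 == 0, whereas pindex 1600 does not,
 * because (1600 - 1024) & ~511 == 576 & ~511 == 512 != 0.
 */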
  564 /*
  565  * Increases the given reservation's population count.  Moves the reservation
  566  * to the tail of the partially populated reservation queue.
  567  */
  568 static void
  569 vm_reserv_populate(vm_reserv_t rv, int index)
  570 {
  571 
  572         vm_reserv_assert_locked(rv);
  573         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
  574             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
  575         KASSERT(rv->object != NULL,
  576             ("vm_reserv_populate: reserv %p is free", rv));
  577         KASSERT(popmap_is_clear(rv->popmap, index),
  578             ("vm_reserv_populate: reserv %p's popmap[%d] is set", rv,
  579             index));
  580         KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES,
  581             ("vm_reserv_populate: reserv %p is already full", rv));
  582         KASSERT(rv->pages->psind == 0,
  583             ("vm_reserv_populate: reserv %p is already promoted", rv));
  584         KASSERT(rv->domain < vm_ndomains,
  585             ("vm_reserv_populate: reserv %p's domain is corrupted %d",
  586             rv, rv->domain));
  587         popmap_set(rv->popmap, index);
  588         rv->popcnt++;
  589         if ((unsigned)(ticks - rv->lasttick) < PARTPOPSLOP &&
  590             rv->inpartpopq && rv->popcnt != VM_LEVEL_0_NPAGES)
  591                 return;
  592         rv->lasttick = ticks;
  593         vm_reserv_domain_lock(rv->domain);
  594         if (rv->inpartpopq) {
  595                 TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
  596                 rv->inpartpopq = FALSE;
  597         }
  598         if (rv->popcnt < VM_LEVEL_0_NPAGES) {
  599                 rv->inpartpopq = TRUE;
  600                 TAILQ_INSERT_TAIL(&vm_rvd[rv->domain].partpop, rv, partpopq);
  601         } else {
  602                 KASSERT(rv->pages->psind == 0,
  603                     ("vm_reserv_populate: reserv %p is already promoted",
  604                     rv));
  605                 rv->pages->psind = 1;
  606         }
  607         vm_reserv_domain_unlock(rv->domain);
  608 }
  609 
  610 /*
  611  * Allocates a contiguous set of physical pages of the given size "npages"
  612  * from existing or newly created reservations.  All of the physical pages
  613  * must be at or above the given physical address "low" and below the given
  614  * physical address "high".  The given value "alignment" determines the
  615  * alignment of the first physical page in the set.  If the given value
  616  * "boundary" is non-zero, then the set of physical pages cannot cross any
  617  * physical address boundary that is a multiple of that value.  Both
  618  * "alignment" and "boundary" must be a power of two.
  619  *
  620  * The page "mpred" must immediately precede the offset "pindex" within the
  621  * specified object.
  622  *
  623  * The object must be locked.
  624  */
  625 vm_page_t
  626 vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, int domain,
  627     int req, vm_page_t mpred, u_long npages, vm_paddr_t low, vm_paddr_t high,
  628     u_long alignment, vm_paddr_t boundary)
  629 {
  630         struct vm_domain *vmd;
  631         vm_paddr_t pa, size;
  632         vm_page_t m, m_ret, msucc;
  633         vm_pindex_t first, leftcap, rightcap;
  634         vm_reserv_t rv;
  635         u_long allocpages, maxpages, minpages;
  636         int i, index, n;
  637 
  638         VM_OBJECT_ASSERT_WLOCKED(object);
  639         KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0"));
  640 
  641         /*
  642          * Is a reservation fundamentally impossible?
  643          */
  644         if (pindex < VM_RESERV_INDEX(object, pindex) ||
  645             pindex + npages > object->size)
  646                 return (NULL);
  647 
  648         /*
  649          * All reservations of a particular size have the same alignment.
  650          * Assuming that the first page is allocated from a reservation, the
  651          * least significant bits of its physical address can be determined
  652          * from its offset from the beginning of the reservation and the size
  653          * of the reservation.
  654          *
  655          * Could the specified index within a reservation of the smallest
  656          * possible size satisfy the alignment and boundary requirements?
  657          */
  658         pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT;
  659         if ((pa & (alignment - 1)) != 0)
  660                 return (NULL);
  661         size = npages << PAGE_SHIFT;
  662         if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
  663                 return (NULL);
  664 
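	/*
	 * Worked example of the two checks above (hypothetical numbers):
	 * with alignment == 0x10000 (64 KB), boundary == 0x200000 (2 MB),
	 * pa == 0x1f0000 and size == 0x20000, the alignment test passes
	 * (0x1f0000 & 0xffff == 0), but the boundary test fails because
	 * (0x1f0000 ^ 0x20ffff) & ~0x1fffff == 0x200000 != 0: the run
	 * [0x1f0000, 0x210000) would straddle a 2 MB boundary.
	 */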
  665         /*
  666          * Look for an existing reservation.
  667          */
  668         rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
  669         if (rv != NULL) {
  670                 KASSERT(object != kernel_object || rv->domain == domain,
  671                     ("vm_reserv_alloc_contig: domain mismatch"));
  672                 index = VM_RESERV_INDEX(object, pindex);
  673                 /* Does the allocation fit within the reservation? */
  674                 if (index + npages > VM_LEVEL_0_NPAGES)
  675                         return (NULL);
  676                 domain = rv->domain;
  677                 vmd = VM_DOMAIN(domain);
  678                 vm_reserv_lock(rv);
  679                 /* Handle reclaim race. */
  680                 if (rv->object != object)
  681                         goto out;
  682                 m = &rv->pages[index];
  683                 pa = VM_PAGE_TO_PHYS(m);
  684                 if (pa < low || pa + size > high ||
  685                     (pa & (alignment - 1)) != 0 ||
  686                     ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
  687                         goto out;
  688                 /* Handle vm_page_rename(m, new_object, ...). */
  689                 for (i = 0; i < npages; i++)
  690                         if (popmap_is_set(rv->popmap, index + i))
  691                                 goto out;
  692                 if (!vm_domain_allocate(vmd, req, npages))
  693                         goto out;
  694                 for (i = 0; i < npages; i++)
  695                         vm_reserv_populate(rv, index + i);
  696                 vm_reserv_unlock(rv);
  697                 return (m);
  698 out:
  699                 vm_reserv_unlock(rv);
  700                 return (NULL);
  701         }
  702 
  703         /*
  704          * Could at least one reservation fit between the first index to the
  705          * left that can be used ("leftcap") and the first index to the right
  706          * that cannot be used ("rightcap")?
  707          *
  708          * We must synchronize with the reserv object lock to protect the
  709          * pindex/object of the resulting reservations against rename while
  710          * we are inspecting.
  711          */
  712         first = pindex - VM_RESERV_INDEX(object, pindex);
  713         minpages = VM_RESERV_INDEX(object, pindex) + npages;
  714         maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES);
  715         allocpages = maxpages;
  716         vm_reserv_object_lock(object);
  717         if (mpred != NULL) {
  718                 if ((rv = vm_reserv_from_page(mpred))->object != object)
  719                         leftcap = mpred->pindex + 1;
  720                 else
  721                         leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
  722                 if (leftcap > first) {
  723                         vm_reserv_object_unlock(object);
  724                         return (NULL);
  725                 }
  726         }
  727         if (msucc != NULL) {
  728                 if ((rv = vm_reserv_from_page(msucc))->object != object)
  729                         rightcap = msucc->pindex;
  730                 else
  731                         rightcap = rv->pindex;
  732                 if (first + maxpages > rightcap) {
  733                         if (maxpages == VM_LEVEL_0_NPAGES) {
  734                                 vm_reserv_object_unlock(object);
  735                                 return (NULL);
  736                         }
  737 
  738                         /*
  739                          * At least one reservation will fit between "leftcap"
  740                          * and "rightcap".  However, a reservation for the
  741                          * last of the requested pages will not fit.  Reduce
  742                          * the size of the upcoming allocation accordingly.
  743                          */
  744                         allocpages = minpages;
  745                 }
  746         }
  747         vm_reserv_object_unlock(object);
  748 
  749         /*
  750          * Would the last new reservation extend past the end of the object?
  751          *
   752  * If the object is unlikely to grow, don't allocate a reservation for
  753          * the tail.
  754          */
  755         if ((object->flags & OBJ_ANON) == 0 &&
  756             first + maxpages > object->size) {
  757                 if (maxpages == VM_LEVEL_0_NPAGES)
  758                         return (NULL);
  759                 allocpages = minpages;
  760         }
  761 
  762         /*
  763          * Allocate the physical pages.  The alignment and boundary specified
  764          * for this allocation may be different from the alignment and
  765          * boundary specified for the requested pages.  For instance, the
  766          * specified index may not be the first page within the first new
  767          * reservation.
  768          */
  769         m = NULL;
  770         vmd = VM_DOMAIN(domain);
  771         if (vm_domain_allocate(vmd, req, npages)) {
  772                 vm_domain_free_lock(vmd);
  773                 m = vm_phys_alloc_contig(domain, allocpages, low, high,
  774                     ulmax(alignment, VM_LEVEL_0_SIZE),
  775                     boundary > VM_LEVEL_0_SIZE ? boundary : 0);
  776                 vm_domain_free_unlock(vmd);
  777                 if (m == NULL) {
  778                         vm_domain_freecnt_inc(vmd, npages);
  779                         return (NULL);
  780                 }
  781         } else
  782                 return (NULL);
  783         KASSERT(vm_page_domain(m) == domain,
  784             ("vm_reserv_alloc_contig: Page domain does not match requested."));
  785 
  786         /*
  787          * The allocated physical pages always begin at a reservation
  788          * boundary, but they do not always end at a reservation boundary.
  789          * Initialize every reservation that is completely covered by the
  790          * allocated physical pages.
  791          */
  792         m_ret = NULL;
  793         index = VM_RESERV_INDEX(object, pindex);
  794         do {
  795                 rv = vm_reserv_from_page(m);
  796                 KASSERT(rv->pages == m,
  797                     ("vm_reserv_alloc_contig: reserv %p's pages is corrupted",
  798                     rv));
  799                 vm_reserv_lock(rv);
  800                 vm_reserv_insert(rv, object, first);
  801                 n = ulmin(VM_LEVEL_0_NPAGES - index, npages);
  802                 for (i = 0; i < n; i++)
  803                         vm_reserv_populate(rv, index + i);
  804                 npages -= n;
  805                 if (m_ret == NULL) {
  806                         m_ret = &rv->pages[index];
  807                         index = 0;
  808                 }
  809                 vm_reserv_unlock(rv);
  810                 m += VM_LEVEL_0_NPAGES;
  811                 first += VM_LEVEL_0_NPAGES;
  812                 allocpages -= VM_LEVEL_0_NPAGES;
  813         } while (allocpages >= VM_LEVEL_0_NPAGES);
  814         return (m_ret);
  815 }
  816 
  817 /*
  818  * Allocate a physical page from an existing or newly created reservation.
  819  *
  820  * The page "mpred" must immediately precede the offset "pindex" within the
  821  * specified object.
  822  *
  823  * The object must be locked.
  824  */
  825 vm_page_t
  826 vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, int domain,
  827     int req, vm_page_t mpred)
  828 {
  829         struct vm_domain *vmd;
  830         vm_page_t m, msucc;
  831         vm_pindex_t first, leftcap, rightcap;
  832         vm_reserv_t rv;
  833         int index;
  834 
  835         VM_OBJECT_ASSERT_WLOCKED(object);
  836 
  837         /*
  838          * Is a reservation fundamentally impossible?
  839          */
  840         if (pindex < VM_RESERV_INDEX(object, pindex) ||
  841             pindex >= object->size)
  842                 return (NULL);
  843 
  844         /*
  845          * Look for an existing reservation.
  846          */
  847         rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
  848         if (rv != NULL) {
  849                 KASSERT(object != kernel_object || rv->domain == domain,
  850                     ("vm_reserv_alloc_page: domain mismatch"));
  851                 domain = rv->domain;
  852                 vmd = VM_DOMAIN(domain);
  853                 index = VM_RESERV_INDEX(object, pindex);
  854                 m = &rv->pages[index];
  855                 vm_reserv_lock(rv);
  856                 /* Handle reclaim race. */
  857                 if (rv->object != object ||
  858                     /* Handle vm_page_rename(m, new_object, ...). */
  859                     popmap_is_set(rv->popmap, index)) {
  860                         m = NULL;
  861                         goto out;
  862                 }
  863                 if (vm_domain_allocate(vmd, req, 1) == 0)
  864                         m = NULL;
  865                 else
  866                         vm_reserv_populate(rv, index);
  867 out:
  868                 vm_reserv_unlock(rv);
  869                 return (m);
  870         }
  871 
  872         /*
  873          * Could a reservation fit between the first index to the left that
  874          * can be used and the first index to the right that cannot be used?
  875          *
  876          * We must synchronize with the reserv object lock to protect the
  877          * pindex/object of the resulting reservations against rename while
  878          * we are inspecting.
  879          */
  880         first = pindex - VM_RESERV_INDEX(object, pindex);
  881         vm_reserv_object_lock(object);
  882         if (mpred != NULL) {
  883                 if ((rv = vm_reserv_from_page(mpred))->object != object)
  884                         leftcap = mpred->pindex + 1;
  885                 else
  886                         leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
  887                 if (leftcap > first) {
  888                         vm_reserv_object_unlock(object);
  889                         return (NULL);
  890                 }
  891         }
  892         if (msucc != NULL) {
  893                 if ((rv = vm_reserv_from_page(msucc))->object != object)
  894                         rightcap = msucc->pindex;
  895                 else
  896                         rightcap = rv->pindex;
  897                 if (first + VM_LEVEL_0_NPAGES > rightcap) {
  898                         vm_reserv_object_unlock(object);
  899                         return (NULL);
  900                 }
  901         }
  902         vm_reserv_object_unlock(object);
  903 
  904         /*
  905          * Would the last new reservation extend past the end of the object?
  906          *
   907  * If the object is unlikely to grow, don't allocate a reservation for
  908          * the tail.
  909          */
  910         if ((object->flags & OBJ_ANON) == 0 &&
  911             first + VM_LEVEL_0_NPAGES > object->size)
  912                 return (NULL);
  913 
  914         /*
  915          * Allocate and populate the new reservation.
  916          */
  917         m = NULL;
  918         vmd = VM_DOMAIN(domain);
  919         if (vm_domain_allocate(vmd, req, 1)) {
  920                 vm_domain_free_lock(vmd);
  921                 m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT,
  922                     VM_LEVEL_0_ORDER);
  923                 vm_domain_free_unlock(vmd);
  924                 if (m == NULL) {
  925                         vm_domain_freecnt_inc(vmd, 1);
  926                         return (NULL);
  927                 }
  928         } else
  929                 return (NULL);
  930         rv = vm_reserv_from_page(m);
  931         vm_reserv_lock(rv);
  932         KASSERT(rv->pages == m,
  933             ("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv));
  934         vm_reserv_insert(rv, object, first);
  935         index = VM_RESERV_INDEX(object, pindex);
  936         vm_reserv_populate(rv, index);
  937         vm_reserv_unlock(rv);
  938 
  939         return (&rv->pages[index]);
  940 }
  941 
  942 /*
  943  * Breaks the given reservation.  All free pages in the reservation
  944  * are returned to the physical memory allocator.  The reservation's
  945  * population count and map are reset to their initial state.
  946  *
  947  * The given reservation must not be in the partially populated reservation
  948  * queue.
  949  */
  950 static void
  951 vm_reserv_break(vm_reserv_t rv)
  952 {
  953         u_long changes;
  954         int bitpos, hi, i, lo;
  955 
  956         vm_reserv_assert_locked(rv);
  957         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
  958             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
  959         vm_reserv_remove(rv);
  960         rv->pages->psind = 0;
  961         hi = lo = -1;
  962         for (i = 0; i <= NPOPMAP; i++) {
  963                 /*
  964                  * "changes" is a bitmask that marks where a new sequence of
  965                  * 0s or 1s begins in popmap[i], with last bit in popmap[i-1]
  966                  * considered to be 1 if and only if lo == hi.  The bits of
  967                  * popmap[-1] and popmap[NPOPMAP] are considered all 1s.
  968                  */
  969                 if (i == NPOPMAP)
  970                         changes = lo != hi;
  971                 else {
  972                         changes = rv->popmap[i];
  973                         changes ^= (changes << 1) | (lo == hi);
  974                         rv->popmap[i] = 0;
  975                 }
  976                 while (changes != 0) {
  977                         /*
  978                          * If the next change marked begins a run of 0s, set
  979                          * lo to mark that position.  Otherwise set hi and
  980                          * free pages from lo up to hi.
  981                          */
  982                         bitpos = ffsl(changes) - 1;
  983                         changes ^= 1UL << bitpos;
  984                         if (lo == hi)
  985                                 lo = NBPOPMAP * i + bitpos;
  986                         else {
  987                                 hi = NBPOPMAP * i + bitpos;
  988                                 vm_domain_free_lock(VM_DOMAIN(rv->domain));
  989                                 vm_phys_enqueue_contig(&rv->pages[lo], hi - lo);
  990                                 vm_domain_free_unlock(VM_DOMAIN(rv->domain));
  991                                 lo = hi;
  992                         }
  993                 }
  994         }
  995         rv->popcnt = 0;
  996         counter_u64_add(vm_reserv_broken, 1);
  997 }
  998 
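The run-detection idiom used in vm_reserv_break() above, and again in
vm_reserv_test_contig() further below, is easier to see with concrete bits;
the trace below uses a hypothetical 8-bit word rather than a full popmap_t:

/*
 *	popmap[i]                     =  0 0 1 1 1 0 0 1   (bit 7 ... bit 0)
 *	(popmap[i] << 1) | (lo == hi) =  0 1 1 1 0 0 1 1   (carry in is 1)
 *	changes, the XOR of the two   =  0 1 0 0 1 0 1 0
 *
 * Bits 1, 3, and 6 of "changes" are set, marking where each new run of 0s
 * or 1s begins: free pages start at index 1, allocated pages at index 3,
 * and free pages again at index 6.  vm_reserv_break() therefore hands the
 * free runs [1, 3) and [6, 8) to vm_phys_enqueue_contig(), one call per
 * run rather than one call per page.
 */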
  999 /*
 1000  * Breaks all reservations belonging to the given object.
 1001  */
 1002 void
 1003 vm_reserv_break_all(vm_object_t object)
 1004 {
 1005         vm_reserv_t rv;
 1006 
 1007         /*
 1008          * This access of object->rvq is unsynchronized so that the
 1009          * object rvq lock can nest after the domain_free lock.  We
 1010          * must check for races in the results.  However, the object
 1011          * lock prevents new additions, so we are guaranteed that when
 1012          * it returns NULL the object is properly empty.
 1013          */
 1014         while ((rv = LIST_FIRST(&object->rvq)) != NULL) {
 1015                 vm_reserv_lock(rv);
 1016                 /* Reclaim race. */
 1017                 if (rv->object != object) {
 1018                         vm_reserv_unlock(rv);
 1019                         continue;
 1020                 }
 1021                 vm_reserv_domain_lock(rv->domain);
 1022                 if (rv->inpartpopq) {
 1023                         TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
 1024                         rv->inpartpopq = FALSE;
 1025                 }
 1026                 vm_reserv_domain_unlock(rv->domain);
 1027                 vm_reserv_break(rv);
 1028                 vm_reserv_unlock(rv);
 1029         }
 1030 }
 1031 
 1032 /*
 1033  * Frees the given page if it belongs to a reservation.  Returns TRUE if the
 1034  * page is freed and FALSE otherwise.
 1035  */
 1036 boolean_t
 1037 vm_reserv_free_page(vm_page_t m)
 1038 {
 1039         vm_reserv_t rv;
 1040         boolean_t ret;
 1041 
 1042         rv = vm_reserv_from_page(m);
 1043         if (rv->object == NULL)
 1044                 return (FALSE);
 1045         vm_reserv_lock(rv);
 1046         /* Re-validate after lock. */
 1047         if (rv->object != NULL) {
 1048                 vm_reserv_depopulate(rv, m - rv->pages);
 1049                 ret = TRUE;
 1050         } else
 1051                 ret = FALSE;
 1052         vm_reserv_unlock(rv);
 1053 
 1054         return (ret);
 1055 }
 1056 
 1057 /*
 1058  * Initializes the reservation management system.  Specifically, initializes
 1059  * the reservation array.
 1060  *
 1061  * Requires that vm_page_array and first_page are initialized!
 1062  */
 1063 void
 1064 vm_reserv_init(void)
 1065 {
 1066         vm_paddr_t paddr;
 1067         struct vm_phys_seg *seg;
 1068         struct vm_reserv *rv;
 1069         struct vm_reserv_domain *rvd;
 1070 #ifdef VM_PHYSSEG_SPARSE
 1071         vm_pindex_t used;
 1072 #endif
 1073         int i, j, segind;
 1074 
 1075         /*
 1076          * Initialize the reservation array.  Specifically, initialize the
 1077          * "pages" field for every element that has an underlying superpage.
 1078          */
 1079 #ifdef VM_PHYSSEG_SPARSE
 1080         used = 0;
 1081 #endif
 1082         for (segind = 0; segind < vm_phys_nsegs; segind++) {
 1083                 seg = &vm_phys_segs[segind];
 1084 #ifdef VM_PHYSSEG_SPARSE
 1085                 seg->first_reserv = &vm_reserv_array[used];
 1086                 used += howmany(seg->end, VM_LEVEL_0_SIZE) -
 1087                     seg->start / VM_LEVEL_0_SIZE;
 1088 #else
 1089                 seg->first_reserv =
 1090                     &vm_reserv_array[seg->start >> VM_LEVEL_0_SHIFT];
 1091 #endif
 1092                 paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
 1093                 rv = seg->first_reserv + (paddr >> VM_LEVEL_0_SHIFT) -
 1094                     (seg->start >> VM_LEVEL_0_SHIFT);
 1095                 while (paddr + VM_LEVEL_0_SIZE > paddr && paddr +
 1096                     VM_LEVEL_0_SIZE <= seg->end) {
 1097                         rv->pages = PHYS_TO_VM_PAGE(paddr);
 1098                         rv->domain = seg->domain;
 1099                         mtx_init(&rv->lock, "vm reserv", NULL, MTX_DEF);
 1100                         paddr += VM_LEVEL_0_SIZE;
 1101                         rv++;
 1102                 }
 1103         }
 1104         for (i = 0; i < MAXMEMDOM; i++) {
 1105                 rvd = &vm_rvd[i];
 1106                 mtx_init(&rvd->lock, "vm reserv domain", NULL, MTX_DEF);
 1107                 TAILQ_INIT(&rvd->partpop);
 1108                 mtx_init(&rvd->marker.lock, "vm reserv marker", NULL, MTX_DEF);
 1109 
 1110                 /*
 1111                  * Fully populated reservations should never be present in the
 1112                  * partially populated reservation queues.
 1113                  */
 1114                 rvd->marker.popcnt = VM_LEVEL_0_NPAGES;
 1115                 for (j = 0; j < NBPOPMAP; j++)
 1116                         popmap_set(rvd->marker.popmap, j);
 1117         }
 1118 
 1119         for (i = 0; i < VM_RESERV_OBJ_LOCK_COUNT; i++)
 1120                 mtx_init(&vm_reserv_object_mtx[i], "resv obj lock", NULL,
 1121                     MTX_DEF);
 1122 }
 1123 
 1124 /*
 1125  * Returns true if the given page belongs to a reservation and that page is
 1126  * free.  Otherwise, returns false.
 1127  */
 1128 bool
 1129 vm_reserv_is_page_free(vm_page_t m)
 1130 {
 1131         vm_reserv_t rv;
 1132 
 1133         rv = vm_reserv_from_page(m);
 1134         if (rv->object == NULL)
 1135                 return (false);
 1136         return (popmap_is_clear(rv->popmap, m - rv->pages));
 1137 }
 1138 
 1139 /*
 1140  * If the given page belongs to a reservation, returns the level of that
 1141  * reservation.  Otherwise, returns -1.
 1142  */
 1143 int
 1144 vm_reserv_level(vm_page_t m)
 1145 {
 1146         vm_reserv_t rv;
 1147 
 1148         rv = vm_reserv_from_page(m);
 1149         return (rv->object != NULL ? 0 : -1);
 1150 }
 1151 
 1152 /*
 1153  * Returns a reservation level if the given page belongs to a fully populated
 1154  * reservation and -1 otherwise.
 1155  */
 1156 int
 1157 vm_reserv_level_iffullpop(vm_page_t m)
 1158 {
 1159         vm_reserv_t rv;
 1160 
 1161         rv = vm_reserv_from_page(m);
 1162         return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1);
 1163 }
 1164 
 1165 /*
 1166  * Remove a partially populated reservation from the queue.
 1167  */
 1168 static void
 1169 vm_reserv_dequeue(vm_reserv_t rv)
 1170 {
 1171 
 1172         vm_reserv_domain_assert_locked(rv->domain);
 1173         vm_reserv_assert_locked(rv);
 1174         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
 1175             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
 1176         KASSERT(rv->inpartpopq,
 1177             ("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv));
 1178 
 1179         TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
 1180         rv->inpartpopq = FALSE;
 1181 }
 1182 
 1183 /*
 1184  * Breaks the given partially populated reservation, releasing its free pages
 1185  * to the physical memory allocator.
 1186  */
 1187 static void
 1188 vm_reserv_reclaim(vm_reserv_t rv)
 1189 {
 1190 
 1191         vm_reserv_assert_locked(rv);
 1192         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
 1193             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
 1194         if (rv->inpartpopq) {
 1195                 vm_reserv_domain_lock(rv->domain);
 1196                 vm_reserv_dequeue(rv);
 1197                 vm_reserv_domain_unlock(rv->domain);
 1198         }
 1199         vm_reserv_break(rv);
 1200         counter_u64_add(vm_reserv_reclaimed, 1);
 1201 }
 1202 
 1203 /*
 1204  * Breaks a reservation near the head of the partially populated reservation
 1205  * queue, releasing its free pages to the physical memory allocator.  Returns
 1206  * TRUE if a reservation is broken and FALSE otherwise.
 1207  */
 1208 bool
 1209 vm_reserv_reclaim_inactive(int domain)
 1210 {
 1211         vm_reserv_t rv;
 1212 
 1213         vm_reserv_domain_lock(domain);
 1214         TAILQ_FOREACH(rv, &vm_rvd[domain].partpop, partpopq) {
 1215                 /*
 1216                  * A locked reservation is likely being updated or reclaimed,
 1217                  * so just skip ahead.
 1218                  */
 1219                 if (rv != &vm_rvd[domain].marker && vm_reserv_trylock(rv)) {
 1220                         vm_reserv_dequeue(rv);
 1221                         break;
 1222                 }
 1223         }
 1224         vm_reserv_domain_unlock(domain);
 1225         if (rv != NULL) {
 1226                 vm_reserv_reclaim(rv);
 1227                 vm_reserv_unlock(rv);
 1228                 return (true);
 1229         }
 1230         return (false);
 1231 }
 1232 
 1233 /*
 1234  * Determine whether this reservation has free pages that satisfy the given
 1235  * request for contiguous physical memory.  Start searching from the lower
  1236  * bound, defined by "low".
 1237  */
 1238 static bool
 1239 vm_reserv_test_contig(vm_reserv_t rv, u_long npages, vm_paddr_t low,
 1240     vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
 1241 {
 1242         vm_paddr_t pa, size;
 1243         u_long changes;
 1244         int bitpos, bits_left, i, hi, lo, n;
 1245 
 1246         vm_reserv_assert_locked(rv);
 1247         size = npages << PAGE_SHIFT;
 1248         pa = VM_PAGE_TO_PHYS(&rv->pages[0]);
 1249         lo = (pa < low) ?
 1250             ((low + PAGE_MASK - pa) >> PAGE_SHIFT) : 0;
 1251         i = lo / NBPOPMAP;
 1252         changes = rv->popmap[i] | ((1UL << (lo % NBPOPMAP)) - 1);
 1253         hi = (pa + VM_LEVEL_0_SIZE > high) ?
 1254             ((high + PAGE_MASK - pa) >> PAGE_SHIFT) : VM_LEVEL_0_NPAGES;
 1255         n = hi / NBPOPMAP;
 1256         bits_left = hi % NBPOPMAP;
 1257         hi = lo = -1;
 1258         for (;;) {
 1259                 /*
 1260                  * "changes" is a bitmask that marks where a new sequence of
 1261                  * 0s or 1s begins in popmap[i], with last bit in popmap[i-1]
 1262                  * considered to be 1 if and only if lo == hi.  The bits of
 1263                  * popmap[-1] and popmap[NPOPMAP] are considered all 1s.
 1264                  */
 1265                 changes ^= (changes << 1) | (lo == hi);
 1266                 while (changes != 0) {
 1267                         /*
 1268                          * If the next change marked begins a run of 0s, set
 1269                          * lo to mark that position.  Otherwise set hi and
 1270                          * look for a satisfactory first page from lo up to hi.
 1271                          */
 1272                         bitpos = ffsl(changes) - 1;
 1273                         changes ^= 1UL << bitpos;
 1274                         if (lo == hi) {
 1275                                 lo = NBPOPMAP * i + bitpos;
 1276                                 continue;
 1277                         }
 1278                         hi = NBPOPMAP * i + bitpos;
 1279                         pa = VM_PAGE_TO_PHYS(&rv->pages[lo]);
 1280                         if ((pa & (alignment - 1)) != 0) {
 1281                                 /* Skip to next aligned page. */
 1282                                 lo += (((pa - 1) | (alignment - 1)) + 1) >>
 1283                                     PAGE_SHIFT;
 1284                                 if (lo >= VM_LEVEL_0_NPAGES)
 1285                                         return (false);
 1286                                 pa = VM_PAGE_TO_PHYS(&rv->pages[lo]);
 1287                         }
 1288                         if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) {
 1289                                 /* Skip to next boundary-matching page. */
 1290                                 lo += (((pa - 1) | (boundary - 1)) + 1) >>
 1291                                     PAGE_SHIFT;
 1292                                 if (lo >= VM_LEVEL_0_NPAGES)
 1293                                         return (false);
 1294                                 pa = VM_PAGE_TO_PHYS(&rv->pages[lo]);
 1295                         }
 1296                         if (lo * PAGE_SIZE + size <= hi * PAGE_SIZE)
 1297                                 return (true);
 1298                         lo = hi;
 1299                 }
 1300                 if (++i < n)
 1301                         changes = rv->popmap[i];
 1302                 else if (i == n)
 1303                         changes = bits_left == 0 ? -1UL :
 1304                             (rv->popmap[n] | (-1UL << bits_left));
 1305                 else
 1306                         return (false);
 1307         }
 1308 }
 1309 
 1310 /*
 1311  * Searches the partially populated reservation queue for the least recently
 1312  * changed reservation with free pages that satisfy the given request for
 1313  * contiguous physical memory.  If a satisfactory reservation is found, it is
 1314  * broken.  Returns true if a reservation is broken and false otherwise.
 1315  */
 1316 bool
 1317 vm_reserv_reclaim_contig(int domain, u_long npages, vm_paddr_t low,
 1318     vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
 1319 {
 1320         struct vm_reserv_queue *queue;
 1321         vm_paddr_t pa, size;
 1322         vm_reserv_t marker, rv, rvn;
 1323 
 1324         if (npages > VM_LEVEL_0_NPAGES - 1)
 1325                 return (false);
 1326         marker = &vm_rvd[domain].marker;
 1327         queue = &vm_rvd[domain].partpop;
 1328         size = npages << PAGE_SHIFT;
 1329 
 1330         vm_reserv_domain_scan_lock(domain);
 1331         vm_reserv_domain_lock(domain);
 1332         TAILQ_FOREACH_SAFE(rv, queue, partpopq, rvn) {
 1333                 pa = VM_PAGE_TO_PHYS(&rv->pages[0]);
 1334                 if (pa + VM_LEVEL_0_SIZE - size < low) {
 1335                         /* This entire reservation is too low; go to next. */
 1336                         continue;
 1337                 }
 1338                 if (pa + size > high) {
 1339                         /* This entire reservation is too high; go to next. */
 1340                         continue;
 1341                 }
 1342 
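                      /*
                       * If the reservation lock is contended, leave the
                       * per-domain marker after rv, drop the domain lock, and
                       * sleep for the reservation lock.  If rv is no longer the
                       * marker's predecessor once the lock is acquired, rv left
                       * the queue in the meantime; resume the scan at the
                       * marker's successor instead.
                       */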
 1343                 if (vm_reserv_trylock(rv) == 0) {
 1344                         TAILQ_INSERT_AFTER(queue, rv, marker, partpopq);
 1345                         vm_reserv_domain_unlock(domain);
 1346                         vm_reserv_lock(rv);
 1347                         if (TAILQ_PREV(marker, vm_reserv_queue, partpopq) !=
 1348                             rv) {
 1349                                 vm_reserv_unlock(rv);
 1350                                 vm_reserv_domain_lock(domain);
 1351                                 rvn = TAILQ_NEXT(marker, partpopq);
 1352                                 TAILQ_REMOVE(queue, marker, partpopq);
 1353                                 continue;
 1354                         }
 1355                         vm_reserv_domain_lock(domain);
 1356                         TAILQ_REMOVE(queue, marker, partpopq);
 1357                 }
 1358                 vm_reserv_domain_unlock(domain);
 1359                 if (vm_reserv_test_contig(rv, npages, low, high,
 1360                     alignment, boundary)) {
 1361                         vm_reserv_domain_scan_unlock(domain);
 1362                         vm_reserv_reclaim(rv);
 1363                         vm_reserv_unlock(rv);
 1364                         return (true);
 1365                 }
 1366                 vm_reserv_domain_lock(domain);
 1367                 rvn = TAILQ_NEXT(rv, partpopq);
 1368                 vm_reserv_unlock(rv);
 1369         }
 1370         vm_reserv_domain_unlock(domain);
 1371         vm_reserv_domain_scan_unlock(domain);
 1372         return (false);
 1373 }
 1374 
 1375 /*
 1376  * Transfers the reservation underlying the given page to a new object.
 1377  *
 1378  * The new object must be write-locked.
 1379  */
 1380 void
 1381 vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object,
 1382     vm_pindex_t old_object_offset)
 1383 {
 1384         vm_reserv_t rv;
 1385 
 1386         VM_OBJECT_ASSERT_WLOCKED(new_object);
 1387         rv = vm_reserv_from_page(m);
 1388         if (rv->object == old_object) {
 1389                 vm_reserv_lock(rv);
 1390                 CTR6(KTR_VM,
 1391                     "%s: rv %p object %p new %p popcnt %d inpartpop %d",
 1392                     __FUNCTION__, rv, rv->object, new_object, rv->popcnt,
 1393                     rv->inpartpopq);
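                      /*
                       * Re-check the owning object now that the reservation lock
                       * is held; the unlocked check above only filters, and
                       * rv->object may have changed before the lock was acquired.
                       */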
 1394                 if (rv->object == old_object) {
 1395                         vm_reserv_object_lock(old_object);
 1396                         rv->object = NULL;
 1397                         LIST_REMOVE(rv, objq);
 1398                         vm_reserv_object_unlock(old_object);
 1399                         vm_reserv_object_lock(new_object);
 1400                         rv->object = new_object;
 1401                         rv->pindex -= old_object_offset;
 1402                         LIST_INSERT_HEAD(&new_object->rvq, rv, objq);
 1403                         vm_reserv_object_unlock(new_object);
 1404                 }
 1405                 vm_reserv_unlock(rv);
 1406         }
 1407 }
 1408 
 1409 /*
 1410  * Returns the size (in bytes) of a reservation of the specified level.
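       * A level of -1 is used for ordinary pages that do not belong to a
       * reservation; on amd64, for example, VM_LEVEL_0_SIZE is typically 2MB
       * (512 base pages).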
 1411  */
 1412 int
 1413 vm_reserv_size(int level)
 1414 {
 1415 
 1416         switch (level) {
 1417         case 0:
 1418                 return (VM_LEVEL_0_SIZE);
 1419         case -1:
 1420                 return (PAGE_SIZE);
 1421         default:
 1422                 return (0);
 1423         }
 1424 }
 1425 
 1426 /*
 1427  * Allocates the virtual and physical memory required by the reservation
 1428  * management system's data structures, in particular, the reservation array.
 1429  */
 1430 vm_paddr_t
 1431 vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end)
 1432 {
 1433         vm_paddr_t new_end;
 1434         vm_pindex_t count;
 1435         size_t size;
 1436         int i;
 1437 
 1438         count = 0;
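              /*
               * Count the reservation structures needed.  With VM_PHYSSEG_SPARSE,
               * the superpages spanned by each physical segment and each
               * phys_avail range are summed; otherwise the array covers
               * everything up to the highest superpage boundary, so only the
               * largest end address matters.
               */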
 1439         for (i = 0; i < vm_phys_nsegs; i++) {
 1440 #ifdef VM_PHYSSEG_SPARSE
 1441                 count += howmany(vm_phys_segs[i].end, VM_LEVEL_0_SIZE) -
 1442                     vm_phys_segs[i].start / VM_LEVEL_0_SIZE;
 1443 #else
 1444                 count = MAX(count,
 1445                     howmany(vm_phys_segs[i].end, VM_LEVEL_0_SIZE));
 1446 #endif
 1447         }
 1448 
 1449         for (i = 0; phys_avail[i + 1] != 0; i += 2) {
 1450 #ifdef VM_PHYSSEG_SPARSE
 1451                 count += howmany(phys_avail[i + 1], VM_LEVEL_0_SIZE) -
 1452                     phys_avail[i] / VM_LEVEL_0_SIZE;
 1453 #else
 1454                 count = MAX(count,
 1455                     howmany(phys_avail[i + 1], VM_LEVEL_0_SIZE));
 1456 #endif
 1457         }
 1458 
 1459         /*
 1460          * Calculate the size (in bytes) of the reservation array.  Round up
 1461          * for partial superpages at segment boundaries, since every small
 1462          * page is mapped to an element in the reservation array based on its
 1463          * physical address.  Thus, the number of elements in the reservation
 1464          * array can be greater than the number of superpages.
 1465          */
 1466         size = count * sizeof(struct vm_reserv);
 1467 
 1468         /*
 1469          * Allocate and map the physical memory for the reservation array.  The
 1470          * next available virtual address is returned by reference.
 1471          */
 1472         new_end = end - round_page(size);
 1473         vm_reserv_array = (void *)(uintptr_t)pmap_map(vaddr, new_end, end,
 1474             VM_PROT_READ | VM_PROT_WRITE);
 1475         bzero(vm_reserv_array, size);
 1476 
 1477         /*
 1478          * Return the next available physical address.
 1479          */
 1480         return (new_end);
 1481 }
 1482 
 1483 /*
 1484  * Returns the first page of the superpage containing the given page, or
       * NULL if the page does not belong to a fully populated reservation.
 1485  */
 1486 vm_page_t
 1487 vm_reserv_to_superpage(vm_page_t m)
 1488 {
 1489         vm_reserv_t rv;
 1490 
 1491         VM_OBJECT_ASSERT_LOCKED(m->object);
 1492         rv = vm_reserv_from_page(m);
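              /*
               * Return the reservation's first page only if the reservation
               * still belongs to this object and all of its small pages are
               * populated, i.e. the pages form a complete superpage.
               */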
 1493         if (rv->object == m->object && rv->popcnt == VM_LEVEL_0_NPAGES)
 1494                 m = rv->pages;
 1495         else
 1496                 m = NULL;
 1497 
 1498         return (m);
 1499 }
 1500 
 1501 #endif  /* VM_NRESERVLEVEL > 0 */
