FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_reserv.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2002-2006 Rice University
    5  * Copyright (c) 2007-2011 Alan L. Cox <alc@cs.rice.edu>
    6  * All rights reserved.
    7  *
    8  * This software was developed for the FreeBSD Project by Alan L. Cox,
    9  * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
   10  *
   11  * Redistribution and use in source and binary forms, with or without
   12  * modification, are permitted provided that the following conditions
   13  * are met:
   14  * 1. Redistributions of source code must retain the above copyright
   15  *    notice, this list of conditions and the following disclaimer.
   16  * 2. Redistributions in binary form must reproduce the above copyright
   17  *    notice, this list of conditions and the following disclaimer in the
   18  *    documentation and/or other materials provided with the distribution.
   19  *
   20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   23  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
   24  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
   27  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
   28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
   30  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   31  * POSSIBILITY OF SUCH DAMAGE.
   32  */
   33 
   34 /*
   35  *      Superpage reservation management module
   36  *
   37  * Any external functions defined by this module are only to be used by the
   38  * virtual memory system.
   39  */
   40 
   41 #include <sys/cdefs.h>
   42 __FBSDID("$FreeBSD$");
   43 
   44 #include "opt_vm.h"
   45 
   46 #include <sys/param.h>
   47 #include <sys/kernel.h>
   48 #include <sys/lock.h>
   49 #include <sys/malloc.h>
   50 #include <sys/mutex.h>
   51 #include <sys/queue.h>
   52 #include <sys/rwlock.h>
   53 #include <sys/sbuf.h>
   54 #include <sys/sysctl.h>
   55 #include <sys/systm.h>
   56 #include <sys/bitstring.h>
   57 #include <sys/counter.h>
   58 #include <sys/ktr.h>
   59 #include <sys/vmmeter.h>
   60 #include <sys/smp.h>
   61 
   62 #include <vm/vm.h>
   63 #include <vm/vm_extern.h>
   64 #include <vm/vm_param.h>
   65 #include <vm/vm_object.h>
   66 #include <vm/vm_page.h>
   67 #include <vm/vm_pageout.h>
   68 #include <vm/vm_pagequeue.h>
   69 #include <vm/vm_phys.h>
   70 #include <vm/vm_radix.h>
   71 #include <vm/vm_reserv.h>
   72 
   73 /*
   74  * The reservation system supports the speculative allocation of large physical
   75  * pages ("superpages").  Speculative allocation enables the fully automatic
   76  * utilization of superpages by the virtual memory system.  In other words, no
   77  * programmatic directives are required to use superpages.
   78  */
   79 
   80 #if VM_NRESERVLEVEL > 0
   81 
   82 #ifndef VM_LEVEL_0_ORDER_MAX
   83 #define VM_LEVEL_0_ORDER_MAX    VM_LEVEL_0_ORDER
   84 #endif
   85 
   86 /*
   87  * The number of small pages that are contained in a level 0 reservation
   88  */
   89 #define VM_LEVEL_0_NPAGES       (1 << VM_LEVEL_0_ORDER)
   90 #define VM_LEVEL_0_NPAGES_MAX   (1 << VM_LEVEL_0_ORDER_MAX)
   91 
   92 /*
   93  * The number of bits by which a physical address is shifted to obtain the
   94  * reservation number
   95  */
   96 #define VM_LEVEL_0_SHIFT        (VM_LEVEL_0_ORDER + PAGE_SHIFT)
   97 
   98 /*
   99  * The size of a level 0 reservation in bytes
  100  */
  101 #define VM_LEVEL_0_SIZE         (1 << VM_LEVEL_0_SHIFT)
  102 
  103 /*
  104  * Computes the index of the small page underlying the given (object, pindex)
  105  * within the reservation's array of small pages.
  106  */
  107 #define VM_RESERV_INDEX(object, pindex) \
  108     (((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1))
  109 
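
An illustrative aside (not part of the source): on an amd64-like configuration, where PAGE_SHIFT is 12 (4 KB pages) and VM_LEVEL_0_ORDER is 9, a level 0 reservation covers 512 small pages, or 2 MB.  The standalone sketch below mirrors the arithmetic of the macros above; the constant values and the helper name reserv_index() are assumptions made for illustration only.

#include <stdio.h>

/* Assumed, amd64-like values; the real constants are machine-dependent. */
#define PAGE_SHIFT              12
#define VM_LEVEL_0_ORDER        9

#define VM_LEVEL_0_NPAGES       (1 << VM_LEVEL_0_ORDER)         /* 512 pages */
#define VM_LEVEL_0_SHIFT        (VM_LEVEL_0_ORDER + PAGE_SHIFT) /* 21 */
#define VM_LEVEL_0_SIZE         (1 << VM_LEVEL_0_SHIFT)         /* 2 MB */

/* Mirror of VM_RESERV_INDEX(), with pg_color and pindex as plain integers. */
static int
reserv_index(unsigned pg_color, unsigned long pindex)
{
        return ((pg_color + pindex) & (VM_LEVEL_0_NPAGES - 1));
}

int
main(void)
{
        printf("npages=%d size=%d bytes\n", VM_LEVEL_0_NPAGES, VM_LEVEL_0_SIZE);
        /* With pg_color 0, object offset 1000 maps to slot 1000 % 512 = 488. */
        printf("index(0, 1000) = %d\n", reserv_index(0, 1000));
        return (0);
}
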
  110 /*
  111  * Number of elapsed ticks before we update the LRU queue position.  Used
  112  * to reduce contention and churn on the list.
  113  */
  114 #define PARTPOPSLOP     1
  115 
  116 /*
  117  * The reservation structure
  118  *
  119  * A reservation structure is constructed whenever a large physical page is
  120  * speculatively allocated to an object.  The reservation provides the small
  121  * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets
  122  * within that object.  The reservation's "popcnt" tracks the number of these
  123  * small physical pages that are in use at any given time.  When and if the
  124  * reservation is not fully utilized, it appears in the queue of partially
  125  * populated reservations.  The reservation always appears on the containing
  126  * object's list of reservations.
  127  *
  128  * A partially populated reservation can be broken and reclaimed at any time.
  129  *
  130  * c - constant after boot
  131  * d - vm_reserv_domain_lock
  132  * o - vm_reserv_object_lock
  133  * r - vm_reserv_lock
  134  * s - vm_reserv_domain_scan_lock
  135  */
  136 struct vm_reserv {
  137         struct mtx      lock;                   /* reservation lock. */
  138         TAILQ_ENTRY(vm_reserv) partpopq;        /* (d, r) per-domain queue. */
  139         LIST_ENTRY(vm_reserv) objq;             /* (o, r) object queue */
  140         vm_object_t     object;                 /* (o, r) containing object */
  141         vm_pindex_t     pindex;                 /* (o, r) offset in object */
  142         vm_page_t       pages;                  /* (c) first page  */
  143         uint16_t        popcnt;                 /* (r) # of pages in use */
  144         uint8_t         domain;                 /* (c) NUMA domain. */
  145         char            inpartpopq;             /* (d, r) */
  146         int             lasttick;               /* (r) last pop update tick. */
  147         bitstr_t        bit_decl(popmap, VM_LEVEL_0_NPAGES_MAX);
  148                                                 /* (r) bit vector, used pages */
  149 };
  150 
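
Illustrative note (not part of the source): whenever the reservation lock is released, "popcnt" is expected to equal the number of set bits in "popmap".  A minimal userspace sketch of that consistency check, assuming a 512-page reservation, a plain unsigned long array in place of bitstr_t, and the GCC/Clang __builtin_popcountl() builtin:

#include <assert.h>
#include <limits.h>

#define NPAGES  512                             /* assumed VM_LEVEL_0_NPAGES */
#define NWORDS  (NPAGES / (sizeof(unsigned long) * CHAR_BIT))

struct toy_reserv {
        unsigned long   popmap[NWORDS];         /* one bit per small page */
        int             popcnt;                 /* # of pages in use */
};

/* Check the popcnt/popmap relationship described in the structure comment. */
static void
toy_reserv_check(const struct toy_reserv *rv)
{
        int n = 0;

        for (unsigned i = 0; i < NWORDS; i++)
                n += __builtin_popcountl(rv->popmap[i]);
        assert(n == rv->popcnt);
}

int
main(void)
{
        struct toy_reserv rv = { .popcnt = 1 };

        rv.popmap[0] |= 1UL << 3;               /* page 3 is populated */
        toy_reserv_check(&rv);
        return (0);
}
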
  151 TAILQ_HEAD(vm_reserv_queue, vm_reserv);
  152 
  153 #define vm_reserv_lockptr(rv)           (&(rv)->lock)
  154 #define vm_reserv_assert_locked(rv)                                     \
  155             mtx_assert(vm_reserv_lockptr(rv), MA_OWNED)
  156 #define vm_reserv_lock(rv)              mtx_lock(vm_reserv_lockptr(rv))
  157 #define vm_reserv_trylock(rv)           mtx_trylock(vm_reserv_lockptr(rv))
  158 #define vm_reserv_unlock(rv)            mtx_unlock(vm_reserv_lockptr(rv))
  159 
  160 /*
  161  * The reservation array
  162  *
   163  * This array is analogous in function to vm_page_array.  It differs in the
   164  * respect that it may contain a greater number of reservation
  165  * structures than there are (physical) superpages.  These "invalid"
  166  * reservation structures exist to trade-off space for time in the
  167  * implementation of vm_reserv_from_page().  Invalid reservation structures are
  168  * distinguishable from "valid" reservation structures by inspecting the
  169  * reservation's "pages" field.  Invalid reservation structures have a NULL
  170  * "pages" field.
  171  *
  172  * vm_reserv_from_page() maps a small (physical) page to an element of this
  173  * array by computing a physical reservation number from the page's physical
  174  * address.  The physical reservation number is used as the array index.
  175  *
  176  * An "active" reservation is a valid reservation structure that has a non-NULL
  177  * "object" field and a non-zero "popcnt" field.  In other words, every active
  178  * reservation belongs to a particular object.  Moreover, every active
  179  * reservation has an entry in the containing object's list of reservations.  
  180  */
  181 static vm_reserv_t vm_reserv_array;
  182 
  183 /*
  184  * The per-domain partially populated reservation queues
  185  *
  186  * These queues enable the fast recovery of an unused free small page from a
  187  * partially populated reservation.  The reservation at the head of a queue
  188  * is the least recently changed, partially populated reservation.
  189  *
  190  * Access to this queue is synchronized by the per-domain reservation lock.
  191  * Threads reclaiming free pages from the queue must hold the per-domain scan
  192  * lock.
  193  */
  194 struct vm_reserv_domain {
  195         struct mtx              lock;
  196         struct vm_reserv_queue  partpop;        /* (d) */
  197         struct vm_reserv        marker;         /* (d, s) scan marker/lock */
  198 } __aligned(CACHE_LINE_SIZE);
  199 
  200 static struct vm_reserv_domain vm_rvd[MAXMEMDOM];
  201 
  202 #define vm_reserv_domain_lockptr(d)     (&vm_rvd[(d)].lock)
  203 #define vm_reserv_domain_assert_locked(d)       \
  204         mtx_assert(vm_reserv_domain_lockptr(d), MA_OWNED)
  205 #define vm_reserv_domain_lock(d)        mtx_lock(vm_reserv_domain_lockptr(d))
  206 #define vm_reserv_domain_unlock(d)      mtx_unlock(vm_reserv_domain_lockptr(d))
  207 
  208 #define vm_reserv_domain_scan_lock(d)   mtx_lock(&vm_rvd[(d)].marker.lock)
  209 #define vm_reserv_domain_scan_unlock(d) mtx_unlock(&vm_rvd[(d)].marker.lock)
  210 
  211 static SYSCTL_NODE(_vm, OID_AUTO, reserv, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
  212     "Reservation Info");
  213 
  214 static COUNTER_U64_DEFINE_EARLY(vm_reserv_broken);
  215 SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, broken, CTLFLAG_RD,
  216     &vm_reserv_broken, "Cumulative number of broken reservations");
  217 
  218 static COUNTER_U64_DEFINE_EARLY(vm_reserv_freed);
  219 SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD,
  220     &vm_reserv_freed, "Cumulative number of freed reservations");
  221 
  222 static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS);
  223 
  224 SYSCTL_PROC(_vm_reserv, OID_AUTO, fullpop, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD,
  225     NULL, 0, sysctl_vm_reserv_fullpop, "I", "Current number of full reservations");
  226 
  227 static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS);
  228 
  229 SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq,
  230     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
  231     sysctl_vm_reserv_partpopq, "A",
  232     "Partially populated reservation queues");
  233 
  234 static COUNTER_U64_DEFINE_EARLY(vm_reserv_reclaimed);
  235 SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD,
  236     &vm_reserv_reclaimed, "Cumulative number of reclaimed reservations");
  237 
  238 /*
   239  * The object lock pool is used to synchronize the rvq.  We cannot use a
   240  * pool mutex because this lock pool is required before malloc works.
  241  *
  242  * The "hash" function could be made faster without divide and modulo.
  243  */
  244 #define VM_RESERV_OBJ_LOCK_COUNT        MAXCPU
  245 
  246 struct mtx_padalign vm_reserv_object_mtx[VM_RESERV_OBJ_LOCK_COUNT];
  247 
  248 #define vm_reserv_object_lock_idx(object)                       \
  249             (((uintptr_t)object / sizeof(*object)) % VM_RESERV_OBJ_LOCK_COUNT)
  250 #define vm_reserv_object_lock_ptr(object)                       \
  251             &vm_reserv_object_mtx[vm_reserv_object_lock_idx((object))]
  252 #define vm_reserv_object_lock(object)                           \
  253             mtx_lock(vm_reserv_object_lock_ptr((object)))
  254 #define vm_reserv_object_unlock(object)                         \
  255             mtx_unlock(vm_reserv_object_lock_ptr((object)))
  256 
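
A hedged userspace model (pthread-based, not the kernel code) of the lock-pool idea behind the macros above: the object's address, divided by the object size and reduced modulo the pool size, selects one of a fixed set of pre-initialized mutexes, so no per-object lock allocation is needed.  The pool size of 64 and the toy_object type are illustrative assumptions.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define OBJ_LOCK_COUNT  64      /* assumed pool size; the kernel uses MAXCPU */

struct toy_object {
        long    pindex;
};

static pthread_mutex_t obj_mtx[OBJ_LOCK_COUNT];

/* Same hashing idea as vm_reserv_object_lock_idx(). */
static unsigned
obj_lock_idx(const struct toy_object *obj)
{
        return (((uintptr_t)obj / sizeof(*obj)) % OBJ_LOCK_COUNT);
}

int
main(void)
{
        struct toy_object obj;

        for (unsigned i = 0; i < OBJ_LOCK_COUNT; i++)
                pthread_mutex_init(&obj_mtx[i], NULL);
        pthread_mutex_lock(&obj_mtx[obj_lock_idx(&obj)]);
        printf("object %p uses pool slot %u\n", (void *)&obj, obj_lock_idx(&obj));
        pthread_mutex_unlock(&obj_mtx[obj_lock_idx(&obj)]);
        return (0);
}
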
  257 static void             vm_reserv_break(vm_reserv_t rv);
  258 static void             vm_reserv_depopulate(vm_reserv_t rv, int index);
  259 static vm_reserv_t      vm_reserv_from_page(vm_page_t m);
  260 static boolean_t        vm_reserv_has_pindex(vm_reserv_t rv,
  261                             vm_pindex_t pindex);
  262 static void             vm_reserv_populate(vm_reserv_t rv, int index);
  263 static void             vm_reserv_reclaim(vm_reserv_t rv);
  264 
  265 /*
  266  * Returns the current number of full reservations.
  267  *
  268  * Since the number of full reservations is computed without acquiring any
  269  * locks, the returned value is inexact.
  270  */
  271 static int
  272 sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS)
  273 {
  274         vm_paddr_t paddr;
  275         struct vm_phys_seg *seg;
  276         vm_reserv_t rv;
  277         int fullpop, segind;
  278 
  279         fullpop = 0;
  280         for (segind = 0; segind < vm_phys_nsegs; segind++) {
  281                 seg = &vm_phys_segs[segind];
  282                 paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
  283 #ifdef VM_PHYSSEG_SPARSE
  284                 rv = seg->first_reserv + (paddr >> VM_LEVEL_0_SHIFT) -
  285                     (seg->start >> VM_LEVEL_0_SHIFT);
  286 #else
  287                 rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT];
  288 #endif
  289                 while (paddr + VM_LEVEL_0_SIZE > paddr && paddr +
  290                     VM_LEVEL_0_SIZE <= seg->end) {
  291                         fullpop += rv->popcnt == VM_LEVEL_0_NPAGES;
  292                         paddr += VM_LEVEL_0_SIZE;
  293                         rv++;
  294                 }
  295         }
  296         return (sysctl_handle_int(oidp, &fullpop, 0, req));
  297 }
  298 
  299 /*
  300  * Describes the current state of the partially populated reservation queue.
  301  */
  302 static int
  303 sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS)
  304 {
  305         struct sbuf sbuf;
  306         vm_reserv_t rv;
  307         int counter, error, domain, level, unused_pages;
  308 
  309         error = sysctl_wire_old_buffer(req, 0);
  310         if (error != 0)
  311                 return (error);
  312         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
  313         sbuf_printf(&sbuf, "\nDOMAIN    LEVEL     SIZE  NUMBER\n\n");
  314         for (domain = 0; domain < vm_ndomains; domain++) {
  315                 for (level = -1; level <= VM_NRESERVLEVEL - 2; level++) {
  316                         counter = 0;
  317                         unused_pages = 0;
  318                         vm_reserv_domain_lock(domain);
  319                         TAILQ_FOREACH(rv, &vm_rvd[domain].partpop, partpopq) {
  320                                 if (rv == &vm_rvd[domain].marker)
  321                                         continue;
  322                                 counter++;
  323                                 unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt;
  324                         }
  325                         vm_reserv_domain_unlock(domain);
  326                         sbuf_printf(&sbuf, "%6d, %7d, %6dK, %6d\n",
  327                             domain, level,
  328                             unused_pages * ((int)PAGE_SIZE / 1024), counter);
  329                 }
  330         }
  331         error = sbuf_finish(&sbuf);
  332         sbuf_delete(&sbuf);
  333         return (error);
  334 }
  335 
  336 /*
  337  * Remove a reservation from the object's objq.
  338  */
  339 static void
  340 vm_reserv_remove(vm_reserv_t rv)
  341 {
  342         vm_object_t object;
  343 
  344         vm_reserv_assert_locked(rv);
  345         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
  346             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
  347         KASSERT(rv->object != NULL,
  348             ("vm_reserv_remove: reserv %p is free", rv));
  349         KASSERT(!rv->inpartpopq,
  350             ("vm_reserv_remove: reserv %p's inpartpopq is TRUE", rv));
  351         object = rv->object;
  352         vm_reserv_object_lock(object);
  353         LIST_REMOVE(rv, objq);
  354         rv->object = NULL;
  355         vm_reserv_object_unlock(object);
  356 }
  357 
  358 /*
  359  * Insert a new reservation into the object's objq.
  360  */
  361 static void
  362 vm_reserv_insert(vm_reserv_t rv, vm_object_t object, vm_pindex_t pindex)
  363 {
  364 
  365         vm_reserv_assert_locked(rv);
  366         CTR6(KTR_VM,
  367             "%s: rv %p(%p) object %p new %p popcnt %d",
  368             __FUNCTION__, rv, rv->pages, rv->object, object,
  369            rv->popcnt);
  370         KASSERT(rv->object == NULL,
  371             ("vm_reserv_insert: reserv %p isn't free", rv));
  372         KASSERT(rv->popcnt == 0,
  373             ("vm_reserv_insert: reserv %p's popcnt is corrupted", rv));
  374         KASSERT(!rv->inpartpopq,
  375             ("vm_reserv_insert: reserv %p's inpartpopq is TRUE", rv));
  376         KASSERT(bit_ntest(rv->popmap, 0, VM_LEVEL_0_NPAGES - 1, 0),
  377             ("vm_reserv_insert: reserv %p's popmap is corrupted", rv));
  378         vm_reserv_object_lock(object);
  379         rv->pindex = pindex;
  380         rv->object = object;
  381         rv->lasttick = ticks;
  382         LIST_INSERT_HEAD(&object->rvq, rv, objq);
  383         vm_reserv_object_unlock(object);
  384 }
  385 
  386 /*
  387  * Reduces the given reservation's population count.  If the population count
  388  * becomes zero, the reservation is destroyed.  Additionally, moves the
  389  * reservation to the tail of the partially populated reservation queue if the
  390  * population count is non-zero.
  391  */
  392 static void
  393 vm_reserv_depopulate(vm_reserv_t rv, int index)
  394 {
  395         struct vm_domain *vmd;
  396 
  397         vm_reserv_assert_locked(rv);
  398         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
  399             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
  400         KASSERT(rv->object != NULL,
  401             ("vm_reserv_depopulate: reserv %p is free", rv));
  402         KASSERT(bit_test(rv->popmap, index),
  403             ("vm_reserv_depopulate: reserv %p's popmap[%d] is clear", rv,
  404             index));
  405         KASSERT(rv->popcnt > 0,
  406             ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv));
  407         KASSERT(rv->domain < vm_ndomains,
  408             ("vm_reserv_depopulate: reserv %p's domain is corrupted %d",
  409             rv, rv->domain));
  410         if (rv->popcnt == VM_LEVEL_0_NPAGES) {
  411                 KASSERT(rv->pages->psind == 1,
  412                     ("vm_reserv_depopulate: reserv %p is already demoted",
  413                     rv));
  414                 rv->pages->psind = 0;
  415         }
  416         bit_clear(rv->popmap, index);
  417         rv->popcnt--;
  418         if ((unsigned)(ticks - rv->lasttick) >= PARTPOPSLOP ||
  419             rv->popcnt == 0) {
  420                 vm_reserv_domain_lock(rv->domain);
  421                 if (rv->inpartpopq) {
  422                         TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
  423                         rv->inpartpopq = FALSE;
  424                 }
  425                 if (rv->popcnt != 0) {
  426                         rv->inpartpopq = TRUE;
  427                         TAILQ_INSERT_TAIL(&vm_rvd[rv->domain].partpop, rv,
  428                             partpopq);
  429                 }
  430                 vm_reserv_domain_unlock(rv->domain);
  431                 rv->lasttick = ticks;
  432         }
  433         vmd = VM_DOMAIN(rv->domain);
  434         if (rv->popcnt == 0) {
  435                 vm_reserv_remove(rv);
  436                 vm_domain_free_lock(vmd);
  437                 vm_phys_free_pages(rv->pages, VM_LEVEL_0_ORDER);
  438                 vm_domain_free_unlock(vmd);
  439                 counter_u64_add(vm_reserv_freed, 1);
  440         }
  441         vm_domain_freecnt_inc(vmd, 1);
  442 }
  443 
  444 /*
  445  * Returns the reservation to which the given page might belong.
  446  */
  447 static __inline vm_reserv_t
  448 vm_reserv_from_page(vm_page_t m)
  449 {
  450 #ifdef VM_PHYSSEG_SPARSE
  451         struct vm_phys_seg *seg;
  452 
  453         seg = &vm_phys_segs[m->segind];
  454         return (seg->first_reserv + (VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT) -
  455             (seg->start >> VM_LEVEL_0_SHIFT));
  456 #else
  457         return (&vm_reserv_array[VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT]);
  458 #endif
  459 }
  460 
  461 /*
   462  * Returns an existing reservation or NULL, and initializes the successor pointer.
  463  */
  464 static vm_reserv_t
  465 vm_reserv_from_object(vm_object_t object, vm_pindex_t pindex,
  466     vm_page_t mpred, vm_page_t *msuccp)
  467 {
  468         vm_reserv_t rv;
  469         vm_page_t msucc;
  470 
  471         msucc = NULL;
  472         if (mpred != NULL) {
  473                 KASSERT(mpred->object == object,
  474                     ("vm_reserv_from_object: object doesn't contain mpred"));
  475                 KASSERT(mpred->pindex < pindex,
  476                     ("vm_reserv_from_object: mpred doesn't precede pindex"));
  477                 rv = vm_reserv_from_page(mpred);
  478                 if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
  479                         goto found;
  480                 msucc = TAILQ_NEXT(mpred, listq);
  481         } else
  482                 msucc = TAILQ_FIRST(&object->memq);
  483         if (msucc != NULL) {
  484                 KASSERT(msucc->pindex > pindex,
  485                     ("vm_reserv_from_object: msucc doesn't succeed pindex"));
  486                 rv = vm_reserv_from_page(msucc);
  487                 if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
  488                         goto found;
  489         }
  490         rv = NULL;
  491 
  492 found:
  493         *msuccp = msucc;
  494 
  495         return (rv);
  496 }
  497 
  498 /*
  499  * Returns TRUE if the given reservation contains the given page index and
  500  * FALSE otherwise.
  501  */
  502 static __inline boolean_t
  503 vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex)
  504 {
  505 
  506         return (((pindex - rv->pindex) & ~(VM_LEVEL_0_NPAGES - 1)) == 0);
  507 }
  508 
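
Illustrative aside (assumed 512-page reservations, standalone types): because the subtraction in vm_reserv_has_pindex() is unsigned, the single mask test above is equivalent to checking rv->pindex <= pindex < rv->pindex + VM_LEVEL_0_NPAGES; an index below rv->pindex wraps to a huge value and fails the mask.  A sketch with a few concrete cases:

#include <assert.h>
#include <stdint.h>

#define NPAGES  ((uint64_t)512)         /* assumed VM_LEVEL_0_NPAGES */

/* Same mask test as vm_reserv_has_pindex(), on plain uint64_t indices. */
static int
has_pindex(uint64_t rv_pindex, uint64_t pindex)
{
        return (((pindex - rv_pindex) & ~(NPAGES - 1)) == 0);
}

int
main(void)
{
        assert(has_pindex(1024, 1024));         /* first page of the range */
        assert(has_pindex(1024, 1535));         /* last page of the range */
        assert(!has_pindex(1024, 1536));        /* one past the end */
        assert(!has_pindex(1024, 1023));        /* below: unsigned wrap rejects it */
        return (0);
}
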
  509 /*
  510  * Increases the given reservation's population count.  Moves the reservation
  511  * to the tail of the partially populated reservation queue.
  512  */
  513 static void
  514 vm_reserv_populate(vm_reserv_t rv, int index)
  515 {
  516 
  517         vm_reserv_assert_locked(rv);
  518         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
  519             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
  520         KASSERT(rv->object != NULL,
  521             ("vm_reserv_populate: reserv %p is free", rv));
  522         KASSERT(!bit_test(rv->popmap, index),
  523             ("vm_reserv_populate: reserv %p's popmap[%d] is set", rv,
  524             index));
  525         KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES,
  526             ("vm_reserv_populate: reserv %p is already full", rv));
  527         KASSERT(rv->pages->psind == 0,
  528             ("vm_reserv_populate: reserv %p is already promoted", rv));
  529         KASSERT(rv->domain < vm_ndomains,
  530             ("vm_reserv_populate: reserv %p's domain is corrupted %d",
  531             rv, rv->domain));
  532         bit_set(rv->popmap, index);
  533         rv->popcnt++;
  534         if ((unsigned)(ticks - rv->lasttick) < PARTPOPSLOP &&
  535             rv->inpartpopq && rv->popcnt != VM_LEVEL_0_NPAGES)
  536                 return;
  537         rv->lasttick = ticks;
  538         vm_reserv_domain_lock(rv->domain);
  539         if (rv->inpartpopq) {
  540                 TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
  541                 rv->inpartpopq = FALSE;
  542         }
  543         if (rv->popcnt < VM_LEVEL_0_NPAGES) {
  544                 rv->inpartpopq = TRUE;
  545                 TAILQ_INSERT_TAIL(&vm_rvd[rv->domain].partpop, rv, partpopq);
  546         } else {
  547                 KASSERT(rv->pages->psind == 0,
  548                     ("vm_reserv_populate: reserv %p is already promoted",
  549                     rv));
  550                 rv->pages->psind = 1;
  551         }
  552         vm_reserv_domain_unlock(rv->domain);
  553 }
  554 
  555 /*
  556  * Allocates a contiguous set of physical pages of the given size "npages"
  557  * from existing or newly created reservations.  All of the physical pages
  558  * must be at or above the given physical address "low" and below the given
  559  * physical address "high".  The given value "alignment" determines the
  560  * alignment of the first physical page in the set.  If the given value
  561  * "boundary" is non-zero, then the set of physical pages cannot cross any
  562  * physical address boundary that is a multiple of that value.  Both
  563  * "alignment" and "boundary" must be a power of two.
  564  *
  565  * The page "mpred" must immediately precede the offset "pindex" within the
  566  * specified object.
  567  *
  568  * The object must be locked.
  569  */
  570 vm_page_t
  571 vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, int domain,
  572     int req, vm_page_t mpred, u_long npages, vm_paddr_t low, vm_paddr_t high,
  573     u_long alignment, vm_paddr_t boundary)
  574 {
  575         struct vm_domain *vmd;
  576         vm_paddr_t pa, size;
  577         vm_page_t m, m_ret, msucc;
  578         vm_pindex_t first, leftcap, rightcap;
  579         vm_reserv_t rv;
  580         u_long allocpages, maxpages, minpages;
  581         int i, index, n;
  582 
  583         VM_OBJECT_ASSERT_WLOCKED(object);
  584         KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0"));
  585 
  586         /*
  587          * Is a reservation fundamentally impossible?
  588          */
  589         if (pindex < VM_RESERV_INDEX(object, pindex) ||
  590             pindex + npages > object->size)
  591                 return (NULL);
  592 
  593         /*
  594          * All reservations of a particular size have the same alignment.
  595          * Assuming that the first page is allocated from a reservation, the
  596          * least significant bits of its physical address can be determined
  597          * from its offset from the beginning of the reservation and the size
  598          * of the reservation.
  599          *
  600          * Could the specified index within a reservation of the smallest
  601          * possible size satisfy the alignment and boundary requirements?
  602          */
  603         pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT;
  604         size = npages << PAGE_SHIFT;
  605         if (!vm_addr_ok(pa, size, alignment, boundary))
  606                 return (NULL);
  607 
  608         /*
  609          * Look for an existing reservation.
  610          */
  611         rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
  612         if (rv != NULL) {
  613                 KASSERT(object != kernel_object || rv->domain == domain,
  614                     ("vm_reserv_alloc_contig: domain mismatch"));
  615                 index = VM_RESERV_INDEX(object, pindex);
  616                 /* Does the allocation fit within the reservation? */
  617                 if (index + npages > VM_LEVEL_0_NPAGES)
  618                         return (NULL);
  619                 domain = rv->domain;
  620                 vmd = VM_DOMAIN(domain);
  621                 vm_reserv_lock(rv);
  622                 /* Handle reclaim race. */
  623                 if (rv->object != object)
  624                         goto out;
  625                 m = &rv->pages[index];
  626                 pa = VM_PAGE_TO_PHYS(m);
  627                 if (pa < low || pa + size > high ||
  628                     !vm_addr_ok(pa, size, alignment, boundary))
  629                         goto out;
  630                 /* Handle vm_page_rename(m, new_object, ...). */
  631                 if (!bit_ntest(rv->popmap, index, index + npages - 1, 0))
  632                         goto out;
  633                 if (!vm_domain_allocate(vmd, req, npages))
  634                         goto out;
  635                 for (i = 0; i < npages; i++)
  636                         vm_reserv_populate(rv, index + i);
  637                 vm_reserv_unlock(rv);
  638                 return (m);
  639 out:
  640                 vm_reserv_unlock(rv);
  641                 return (NULL);
  642         }
  643 
  644         /*
  645          * Could at least one reservation fit between the first index to the
  646          * left that can be used ("leftcap") and the first index to the right
  647          * that cannot be used ("rightcap")?
  648          *
  649          * We must synchronize with the reserv object lock to protect the
  650          * pindex/object of the resulting reservations against rename while
  651          * we are inspecting.
  652          */
  653         first = pindex - VM_RESERV_INDEX(object, pindex);
  654         minpages = VM_RESERV_INDEX(object, pindex) + npages;
  655         maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES);
  656         allocpages = maxpages;
  657         vm_reserv_object_lock(object);
  658         if (mpred != NULL) {
  659                 if ((rv = vm_reserv_from_page(mpred))->object != object)
  660                         leftcap = mpred->pindex + 1;
  661                 else
  662                         leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
  663                 if (leftcap > first) {
  664                         vm_reserv_object_unlock(object);
  665                         return (NULL);
  666                 }
  667         }
  668         if (msucc != NULL) {
  669                 if ((rv = vm_reserv_from_page(msucc))->object != object)
  670                         rightcap = msucc->pindex;
  671                 else
  672                         rightcap = rv->pindex;
  673                 if (first + maxpages > rightcap) {
  674                         if (maxpages == VM_LEVEL_0_NPAGES) {
  675                                 vm_reserv_object_unlock(object);
  676                                 return (NULL);
  677                         }
  678 
  679                         /*
  680                          * At least one reservation will fit between "leftcap"
  681                          * and "rightcap".  However, a reservation for the
  682                          * last of the requested pages will not fit.  Reduce
  683                          * the size of the upcoming allocation accordingly.
  684                          */
  685                         allocpages = minpages;
  686                 }
  687         }
  688         vm_reserv_object_unlock(object);
  689 
  690         /*
  691          * Would the last new reservation extend past the end of the object?
  692          *
  693          * If the object is unlikely to grow don't allocate a reservation for
  694          * the tail.
  695          */
  696         if ((object->flags & OBJ_ANON) == 0 &&
  697             first + maxpages > object->size) {
  698                 if (maxpages == VM_LEVEL_0_NPAGES)
  699                         return (NULL);
  700                 allocpages = minpages;
  701         }
  702 
  703         /*
  704          * Allocate the physical pages.  The alignment and boundary specified
  705          * for this allocation may be different from the alignment and
  706          * boundary specified for the requested pages.  For instance, the
  707          * specified index may not be the first page within the first new
  708          * reservation.
  709          */
  710         m = NULL;
  711         vmd = VM_DOMAIN(domain);
  712         if (vm_domain_allocate(vmd, req, npages)) {
  713                 vm_domain_free_lock(vmd);
  714                 m = vm_phys_alloc_contig(domain, allocpages, low, high,
  715                     ulmax(alignment, VM_LEVEL_0_SIZE),
  716                     boundary > VM_LEVEL_0_SIZE ? boundary : 0);
  717                 vm_domain_free_unlock(vmd);
  718                 if (m == NULL) {
  719                         vm_domain_freecnt_inc(vmd, npages);
  720                         return (NULL);
  721                 }
  722         } else
  723                 return (NULL);
  724         KASSERT(vm_page_domain(m) == domain,
  725             ("vm_reserv_alloc_contig: Page domain does not match requested."));
  726 
  727         /*
  728          * The allocated physical pages always begin at a reservation
  729          * boundary, but they do not always end at a reservation boundary.
  730          * Initialize every reservation that is completely covered by the
  731          * allocated physical pages.
  732          */
  733         m_ret = NULL;
  734         index = VM_RESERV_INDEX(object, pindex);
  735         do {
  736                 rv = vm_reserv_from_page(m);
  737                 KASSERT(rv->pages == m,
  738                     ("vm_reserv_alloc_contig: reserv %p's pages is corrupted",
  739                     rv));
  740                 vm_reserv_lock(rv);
  741                 vm_reserv_insert(rv, object, first);
  742                 n = ulmin(VM_LEVEL_0_NPAGES - index, npages);
  743                 for (i = 0; i < n; i++)
  744                         vm_reserv_populate(rv, index + i);
  745                 npages -= n;
  746                 if (m_ret == NULL) {
  747                         m_ret = &rv->pages[index];
  748                         index = 0;
  749                 }
  750                 vm_reserv_unlock(rv);
  751                 m += VM_LEVEL_0_NPAGES;
  752                 first += VM_LEVEL_0_NPAGES;
  753                 allocpages -= VM_LEVEL_0_NPAGES;
  754         } while (allocpages >= VM_LEVEL_0_NPAGES);
  755         return (m_ret);
  756 }
  757 
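
A hedged userspace model of the address-constraint reasoning used in vm_reserv_alloc_contig() (the real vm_addr_ok() helper lives elsewhere in the VM code and is not shown here; this sketch only mirrors the semantics stated in the function's comment): a candidate physical address pa with a size of "size" bytes is acceptable if pa is a multiple of the power-of-two alignment and the range [pa, pa + size) does not cross a multiple of the power-of-two boundary, with a boundary of 0 meaning no constraint.

#include <assert.h>
#include <stdint.h>

typedef uint64_t paddr_t;

/* pa must be a multiple of the (power-of-two) alignment. */
static int
addr_align_ok(paddr_t pa, uint64_t alignment)
{
        return ((pa & (alignment - 1)) == 0);
}

/* [pa, pa + size) must not span a multiple of the (power-of-two) boundary. */
static int
addr_bound_ok(paddr_t pa, uint64_t size, uint64_t boundary)
{
        if (boundary == 0)
                return (1);
        return (((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0);
}

static int
addr_ok(paddr_t pa, uint64_t size, uint64_t alignment, uint64_t boundary)
{
        return (addr_align_ok(pa, alignment) && addr_bound_ok(pa, size, boundary));
}

int
main(void)
{
        /* 16 KB at 2 MB, 4 KB aligned, must not cross a 64 KB boundary: OK. */
        assert(addr_ok(0x200000, 0x4000, 0x1000, 0x10000));
        /* The same request starting 4 KB below a 64 KB boundary crosses it. */
        assert(!addr_ok(0x20f000, 0x4000, 0x1000, 0x10000));
        return (0);
}
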
  758 /*
  759  * Allocate a physical page from an existing or newly created reservation.
  760  *
  761  * The page "mpred" must immediately precede the offset "pindex" within the
  762  * specified object.
  763  *
  764  * The object must be locked.
  765  */
  766 vm_page_t
  767 vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, int domain,
  768     int req, vm_page_t mpred)
  769 {
  770         struct vm_domain *vmd;
  771         vm_page_t m, msucc;
  772         vm_pindex_t first, leftcap, rightcap;
  773         vm_reserv_t rv;
  774         int index;
  775 
  776         VM_OBJECT_ASSERT_WLOCKED(object);
  777 
  778         /*
  779          * Is a reservation fundamentally impossible?
  780          */
  781         if (pindex < VM_RESERV_INDEX(object, pindex) ||
  782             pindex >= object->size)
  783                 return (NULL);
  784 
  785         /*
  786          * Look for an existing reservation.
  787          */
  788         rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
  789         if (rv != NULL) {
  790                 KASSERT(object != kernel_object || rv->domain == domain,
  791                     ("vm_reserv_alloc_page: domain mismatch"));
  792                 domain = rv->domain;
  793                 vmd = VM_DOMAIN(domain);
  794                 index = VM_RESERV_INDEX(object, pindex);
  795                 m = &rv->pages[index];
  796                 vm_reserv_lock(rv);
  797                 /* Handle reclaim race. */
  798                 if (rv->object != object ||
  799                     /* Handle vm_page_rename(m, new_object, ...). */
  800                     bit_test(rv->popmap, index)) {
  801                         m = NULL;
  802                         goto out;
  803                 }
  804                 if (vm_domain_allocate(vmd, req, 1) == 0)
  805                         m = NULL;
  806                 else
  807                         vm_reserv_populate(rv, index);
  808 out:
  809                 vm_reserv_unlock(rv);
  810                 return (m);
  811         }
  812 
  813         /*
  814          * Could a reservation fit between the first index to the left that
  815          * can be used and the first index to the right that cannot be used?
  816          *
  817          * We must synchronize with the reserv object lock to protect the
  818          * pindex/object of the resulting reservations against rename while
  819          * we are inspecting.
  820          */
  821         first = pindex - VM_RESERV_INDEX(object, pindex);
  822         vm_reserv_object_lock(object);
  823         if (mpred != NULL) {
  824                 if ((rv = vm_reserv_from_page(mpred))->object != object)
  825                         leftcap = mpred->pindex + 1;
  826                 else
  827                         leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
  828                 if (leftcap > first) {
  829                         vm_reserv_object_unlock(object);
  830                         return (NULL);
  831                 }
  832         }
  833         if (msucc != NULL) {
  834                 if ((rv = vm_reserv_from_page(msucc))->object != object)
  835                         rightcap = msucc->pindex;
  836                 else
  837                         rightcap = rv->pindex;
  838                 if (first + VM_LEVEL_0_NPAGES > rightcap) {
  839                         vm_reserv_object_unlock(object);
  840                         return (NULL);
  841                 }
  842         }
  843         vm_reserv_object_unlock(object);
  844 
  845         /*
  846          * Would the last new reservation extend past the end of the object?
  847          *
  848          * If the object is unlikely to grow don't allocate a reservation for
  849          * the tail.
  850          */
  851         if ((object->flags & OBJ_ANON) == 0 &&
  852             first + VM_LEVEL_0_NPAGES > object->size)
  853                 return (NULL);
  854 
  855         /*
  856          * Allocate and populate the new reservation.
  857          */
  858         m = NULL;
  859         vmd = VM_DOMAIN(domain);
  860         if (vm_domain_allocate(vmd, req, 1)) {
  861                 vm_domain_free_lock(vmd);
  862                 m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT,
  863                     VM_LEVEL_0_ORDER);
  864                 vm_domain_free_unlock(vmd);
  865                 if (m == NULL) {
  866                         vm_domain_freecnt_inc(vmd, 1);
  867                         return (NULL);
  868                 }
  869         } else
  870                 return (NULL);
  871         rv = vm_reserv_from_page(m);
  872         vm_reserv_lock(rv);
  873         KASSERT(rv->pages == m,
  874             ("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv));
  875         vm_reserv_insert(rv, object, first);
  876         index = VM_RESERV_INDEX(object, pindex);
  877         vm_reserv_populate(rv, index);
  878         vm_reserv_unlock(rv);
  879 
  880         return (&rv->pages[index]);
  881 }
  882 
  883 /*
  884  * Breaks the given reservation.  All free pages in the reservation
  885  * are returned to the physical memory allocator.  The reservation's
  886  * population count and map are reset to their initial state.
  887  *
  888  * The given reservation must not be in the partially populated reservation
  889  * queue.
  890  */
  891 static void
  892 vm_reserv_break(vm_reserv_t rv)
  893 {
  894         int hi, lo, pos;
  895 
  896         vm_reserv_assert_locked(rv);
  897         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
  898             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
  899         vm_reserv_remove(rv);
  900         rv->pages->psind = 0;
  901         hi = lo = -1;
  902         pos = 0;
  903         for (;;) {
  904                 bit_ff_at(rv->popmap, pos, VM_LEVEL_0_NPAGES, lo != hi, &pos);
  905                 if (lo == hi) {
  906                         if (pos == -1)
  907                                 break;
  908                         lo = pos;
  909                         continue;
  910                 }
  911                 if (pos == -1)
  912                         pos = VM_LEVEL_0_NPAGES;
  913                 hi = pos;
  914                 vm_domain_free_lock(VM_DOMAIN(rv->domain));
  915                 vm_phys_enqueue_contig(&rv->pages[lo], hi - lo);
  916                 vm_domain_free_unlock(VM_DOMAIN(rv->domain));
  917                 lo = hi;
  918         }
  919         bit_nclear(rv->popmap, 0, VM_LEVEL_0_NPAGES - 1);
  920         rv->popcnt = 0;
  921         counter_u64_add(vm_reserv_broken, 1);
  922 }
  923 
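
Illustrative sketch (not the bitstring(3)-based kernel code): vm_reserv_break() scans the population map and returns each maximal run of clear bits, i.e. each contiguous range of still-free small pages, to the physical allocator in a single call.  A byte-per-page userspace model of that run-finding loop, with an assumed 16-page reservation:

#include <stdio.h>

#define NPAGES  16                      /* tiny reservation for illustration */

/* Print each maximal run of free (0) slots as a half-open range [lo, hi). */
static void
free_runs(const unsigned char used[NPAGES])
{
        int lo = -1;

        for (int i = 0; i <= NPAGES; i++) {
                if (i < NPAGES && used[i] == 0) {
                        if (lo == -1)
                                lo = i;         /* a free run starts here */
                } else if (lo != -1) {
                        printf("free run [%d, %d)\n", lo, i);
                        lo = -1;                /* the run ends before slot i */
                }
        }
}

int
main(void)
{
        /* Pages 3-5 and 12 are still populated; the rest would be freed. */
        unsigned char used[NPAGES] = {
                0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0
        };

        free_runs(used);                /* prints [0,3), [6,12), [13,16) */
        return (0);
}
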
  924 /*
  925  * Breaks all reservations belonging to the given object.
  926  */
  927 void
  928 vm_reserv_break_all(vm_object_t object)
  929 {
  930         vm_reserv_t rv;
  931 
  932         /*
  933          * This access of object->rvq is unsynchronized so that the
  934          * object rvq lock can nest after the domain_free lock.  We
  935          * must check for races in the results.  However, the object
  936          * lock prevents new additions, so we are guaranteed that when
  937          * it returns NULL the object is properly empty.
  938          */
  939         while ((rv = LIST_FIRST(&object->rvq)) != NULL) {
  940                 vm_reserv_lock(rv);
  941                 /* Reclaim race. */
  942                 if (rv->object != object) {
  943                         vm_reserv_unlock(rv);
  944                         continue;
  945                 }
  946                 vm_reserv_domain_lock(rv->domain);
  947                 if (rv->inpartpopq) {
  948                         TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
  949                         rv->inpartpopq = FALSE;
  950                 }
  951                 vm_reserv_domain_unlock(rv->domain);
  952                 vm_reserv_break(rv);
  953                 vm_reserv_unlock(rv);
  954         }
  955 }
  956 
  957 /*
  958  * Frees the given page if it belongs to a reservation.  Returns TRUE if the
  959  * page is freed and FALSE otherwise.
  960  */
  961 boolean_t
  962 vm_reserv_free_page(vm_page_t m)
  963 {
  964         vm_reserv_t rv;
  965         boolean_t ret;
  966 
  967         rv = vm_reserv_from_page(m);
  968         if (rv->object == NULL)
  969                 return (FALSE);
  970         vm_reserv_lock(rv);
  971         /* Re-validate after lock. */
  972         if (rv->object != NULL) {
  973                 vm_reserv_depopulate(rv, m - rv->pages);
  974                 ret = TRUE;
  975         } else
  976                 ret = FALSE;
  977         vm_reserv_unlock(rv);
  978 
  979         return (ret);
  980 }
  981 
  982 /*
  983  * Initializes the reservation management system.  Specifically, initializes
  984  * the reservation array.
  985  *
  986  * Requires that vm_page_array and first_page are initialized!
  987  */
  988 void
  989 vm_reserv_init(void)
  990 {
  991         vm_paddr_t paddr;
  992         struct vm_phys_seg *seg;
  993         struct vm_reserv *rv;
  994         struct vm_reserv_domain *rvd;
  995 #ifdef VM_PHYSSEG_SPARSE
  996         vm_pindex_t used;
  997 #endif
  998         int i, segind;
  999 
 1000         /*
 1001          * Initialize the reservation array.  Specifically, initialize the
 1002          * "pages" field for every element that has an underlying superpage.
 1003          */
 1004 #ifdef VM_PHYSSEG_SPARSE
 1005         used = 0;
 1006 #endif
 1007         for (segind = 0; segind < vm_phys_nsegs; segind++) {
 1008                 seg = &vm_phys_segs[segind];
 1009 #ifdef VM_PHYSSEG_SPARSE
 1010                 seg->first_reserv = &vm_reserv_array[used];
 1011                 used += howmany(seg->end, VM_LEVEL_0_SIZE) -
 1012                     seg->start / VM_LEVEL_0_SIZE;
 1013 #else
 1014                 seg->first_reserv =
 1015                     &vm_reserv_array[seg->start >> VM_LEVEL_0_SHIFT];
 1016 #endif
 1017                 paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
 1018                 rv = seg->first_reserv + (paddr >> VM_LEVEL_0_SHIFT) -
 1019                     (seg->start >> VM_LEVEL_0_SHIFT);
 1020                 while (paddr + VM_LEVEL_0_SIZE > paddr && paddr +
 1021                     VM_LEVEL_0_SIZE <= seg->end) {
 1022                         rv->pages = PHYS_TO_VM_PAGE(paddr);
 1023                         rv->domain = seg->domain;
 1024                         mtx_init(&rv->lock, "vm reserv", NULL, MTX_DEF);
 1025                         paddr += VM_LEVEL_0_SIZE;
 1026                         rv++;
 1027                 }
 1028         }
 1029         for (i = 0; i < MAXMEMDOM; i++) {
 1030                 rvd = &vm_rvd[i];
 1031                 mtx_init(&rvd->lock, "vm reserv domain", NULL, MTX_DEF);
 1032                 TAILQ_INIT(&rvd->partpop);
 1033                 mtx_init(&rvd->marker.lock, "vm reserv marker", NULL, MTX_DEF);
 1034 
 1035                 /*
 1036                  * Fully populated reservations should never be present in the
 1037                  * partially populated reservation queues.
 1038                  */
 1039                 rvd->marker.popcnt = VM_LEVEL_0_NPAGES;
 1040                 bit_nset(rvd->marker.popmap, 0, VM_LEVEL_0_NPAGES - 1);
 1041         }
 1042 
 1043         for (i = 0; i < VM_RESERV_OBJ_LOCK_COUNT; i++)
 1044                 mtx_init(&vm_reserv_object_mtx[i], "resv obj lock", NULL,
 1045                     MTX_DEF);
 1046 }
 1047 
 1048 /*
 1049  * Returns true if the given page belongs to a reservation and that page is
 1050  * free.  Otherwise, returns false.
 1051  */
 1052 bool
 1053 vm_reserv_is_page_free(vm_page_t m)
 1054 {
 1055         vm_reserv_t rv;
 1056 
 1057         rv = vm_reserv_from_page(m);
 1058         if (rv->object == NULL)
 1059                 return (false);
 1060         return (!bit_test(rv->popmap, m - rv->pages));
 1061 }
 1062 
 1063 /*
 1064  * If the given page belongs to a reservation, returns the level of that
 1065  * reservation.  Otherwise, returns -1.
 1066  */
 1067 int
 1068 vm_reserv_level(vm_page_t m)
 1069 {
 1070         vm_reserv_t rv;
 1071 
 1072         rv = vm_reserv_from_page(m);
 1073         return (rv->object != NULL ? 0 : -1);
 1074 }
 1075 
 1076 /*
 1077  * Returns a reservation level if the given page belongs to a fully populated
 1078  * reservation and -1 otherwise.
 1079  */
 1080 int
 1081 vm_reserv_level_iffullpop(vm_page_t m)
 1082 {
 1083         vm_reserv_t rv;
 1084 
 1085         rv = vm_reserv_from_page(m);
 1086         return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1);
 1087 }
 1088 
 1089 /*
 1090  * Remove a partially populated reservation from the queue.
 1091  */
 1092 static void
 1093 vm_reserv_dequeue(vm_reserv_t rv)
 1094 {
 1095 
 1096         vm_reserv_domain_assert_locked(rv->domain);
 1097         vm_reserv_assert_locked(rv);
 1098         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
 1099             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
 1100         KASSERT(rv->inpartpopq,
  1101             ("vm_reserv_dequeue: reserv %p's inpartpopq is FALSE", rv));
 1102 
 1103         TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
 1104         rv->inpartpopq = FALSE;
 1105 }
 1106 
 1107 /*
 1108  * Breaks the given partially populated reservation, releasing its free pages
 1109  * to the physical memory allocator.
 1110  */
 1111 static void
 1112 vm_reserv_reclaim(vm_reserv_t rv)
 1113 {
 1114 
 1115         vm_reserv_assert_locked(rv);
 1116         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
 1117             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
 1118         if (rv->inpartpopq) {
 1119                 vm_reserv_domain_lock(rv->domain);
 1120                 vm_reserv_dequeue(rv);
 1121                 vm_reserv_domain_unlock(rv->domain);
 1122         }
 1123         vm_reserv_break(rv);
 1124         counter_u64_add(vm_reserv_reclaimed, 1);
 1125 }
 1126 
 1127 /*
 1128  * Breaks a reservation near the head of the partially populated reservation
 1129  * queue, releasing its free pages to the physical memory allocator.  Returns
 1130  * TRUE if a reservation is broken and FALSE otherwise.
 1131  */
 1132 bool
 1133 vm_reserv_reclaim_inactive(int domain)
 1134 {
 1135         vm_reserv_t rv;
 1136 
 1137         vm_reserv_domain_lock(domain);
 1138         TAILQ_FOREACH(rv, &vm_rvd[domain].partpop, partpopq) {
 1139                 /*
 1140                  * A locked reservation is likely being updated or reclaimed,
 1141                  * so just skip ahead.
 1142                  */
 1143                 if (rv != &vm_rvd[domain].marker && vm_reserv_trylock(rv)) {
 1144                         vm_reserv_dequeue(rv);
 1145                         break;
 1146                 }
 1147         }
 1148         vm_reserv_domain_unlock(domain);
 1149         if (rv != NULL) {
 1150                 vm_reserv_reclaim(rv);
 1151                 vm_reserv_unlock(rv);
 1152                 return (true);
 1153         }
 1154         return (false);
 1155 }
 1156 
 1157 /*
 1158  * Determine whether this reservation has free pages that satisfy the given
 1159  * request for contiguous physical memory.  Start searching from the lower
 1160  * bound, defined by lo, and stop at the upper bound, hi.  Return the index
 1161  * of the first satisfactory free page, or -1 if none is found.
 1162  */
 1163 static int
 1164 vm_reserv_find_contig(vm_reserv_t rv, int npages, int lo,
 1165     int hi, int ppn_align, int ppn_bound)
 1166 {
 1167 
 1168         vm_reserv_assert_locked(rv);
 1169         KASSERT(npages <= VM_LEVEL_0_NPAGES - 1,
 1170             ("%s: Too many pages", __func__));
 1171         KASSERT(ppn_bound <= VM_LEVEL_0_NPAGES,
 1172             ("%s: Too big a boundary for reservation size", __func__));
 1173         KASSERT(npages <= ppn_bound,
 1174             ("%s: Too many pages for given boundary", __func__));
 1175         KASSERT(ppn_align != 0 && powerof2(ppn_align),
 1176             ("ppn_align is not a positive power of 2"));
 1177         KASSERT(ppn_bound != 0 && powerof2(ppn_bound),
 1178             ("ppn_bound is not a positive power of 2"));
 1179         while (bit_ffc_area_at(rv->popmap, lo, hi, npages, &lo), lo != -1) {
 1180                 if (lo < roundup2(lo, ppn_align)) {
 1181                         /* Skip to next aligned page. */
 1182                         lo = roundup2(lo, ppn_align);
 1183                 } else if (roundup2(lo + 1, ppn_bound) >= lo + npages)
 1184                         return (lo);
 1185                 if (roundup2(lo + 1, ppn_bound) < lo + npages) {
 1186                         /* Skip to next boundary-matching page. */
 1187                         lo = roundup2(lo + 1, ppn_bound);
 1188                 }
 1189         }
 1190         return (-1);
 1191 }
 1192 
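
A hedged brute-force model of what vm_reserv_find_contig() computes (the kernel version relies on bit_ffc_area_at() to skip ahead efficiently; this sketch only restates the conditions): find the lowest start index in [lo, hi) where npages consecutive pages are free, the start is a multiple of ppn_align, and the run does not cross a multiple of ppn_bound.  The 32-page map and the chosen parameters are illustrative assumptions.

#include <stdio.h>

#define NPAGES  32                      /* tiny reservation for illustration */

/*
 * Return the lowest start index in [lo, hi) of a run of npages free slots
 * that begins on a multiple of ppn_align and does not cross a multiple of
 * ppn_bound, or -1 if no such run exists.
 */
static int
find_contig(const unsigned char used[NPAGES], int npages, int lo, int hi,
    int ppn_align, int ppn_bound)
{
        for (int start = lo; start + npages <= hi; start++) {
                if (start % ppn_align != 0)
                        continue;               /* misaligned start */
                if (start / ppn_bound != (start + npages - 1) / ppn_bound)
                        continue;               /* run would cross a boundary */
                int free = 1;
                for (int i = start; i < start + npages; i++)
                        if (used[i] != 0) {
                                free = 0;
                                break;
                        }
                if (free)
                        return (start);
        }
        return (-1);
}

int
main(void)
{
        unsigned char used[NPAGES] = { 0 };

        used[1] = used[5] = 1;          /* slots 1 and 5 are taken */
        /* 4 free pages, 4-page aligned, within an 8-page boundary: slot 8. */
        printf("%d\n", find_contig(used, 4, 0, NPAGES, 4, 8));
        return (0);
}
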
 1193 /*
 1194  * Searches the partially populated reservation queue for the least recently
 1195  * changed reservation with free pages that satisfy the given request for
 1196  * contiguous physical memory.  If a satisfactory reservation is found, it is
  1197  * broken.  Returns the first satisfactory free page, or NULL otherwise.
 1198  */
 1199 vm_page_t
 1200 vm_reserv_reclaim_contig(int domain, u_long npages, vm_paddr_t low,
 1201     vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
 1202 {
 1203         struct vm_reserv_queue *queue;
 1204         vm_paddr_t pa, size;
 1205         vm_page_t m_ret;
 1206         vm_reserv_t marker, rv, rvn;
 1207         int hi, lo, posn, ppn_align, ppn_bound;
 1208 
 1209         KASSERT(npages > 0, ("npages is 0"));
 1210         KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
 1211         KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
 1212         if (npages > VM_LEVEL_0_NPAGES - 1)
  1213                 return (NULL);
 1214         size = npages << PAGE_SHIFT;
 1215         /* 
 1216          * Ensure that a free range starting at a boundary-multiple
 1217          * doesn't include a boundary-multiple within it.  Otherwise,
 1218          * no boundary-constrained allocation is possible.
 1219          */
 1220         if (!vm_addr_bound_ok(0, size, boundary))
 1221                 return (NULL);
 1222         marker = &vm_rvd[domain].marker;
 1223         queue = &vm_rvd[domain].partpop;
 1224         /*
 1225          * Compute shifted alignment, boundary values for page-based
 1226          * calculations.  Constrain to range [1, VM_LEVEL_0_NPAGES] to
 1227          * avoid overflow.
 1228          */
 1229         ppn_align = (int)(ulmin(ulmax(PAGE_SIZE, alignment),
 1230             VM_LEVEL_0_SIZE) >> PAGE_SHIFT);
 1231         ppn_bound = boundary == 0 ? VM_LEVEL_0_NPAGES :
 1232             (int)(MIN(MAX(PAGE_SIZE, boundary),
 1233             VM_LEVEL_0_SIZE) >> PAGE_SHIFT);
 1234 
 1235         vm_reserv_domain_scan_lock(domain);
 1236         vm_reserv_domain_lock(domain);
 1237         TAILQ_FOREACH_SAFE(rv, queue, partpopq, rvn) {
 1238                 pa = VM_PAGE_TO_PHYS(&rv->pages[0]);
 1239                 if (pa + VM_LEVEL_0_SIZE - size < low) {
 1240                         /* This entire reservation is too low; go to next. */
 1241                         continue;
 1242                 }
 1243                 if (pa + size > high) {
 1244                         /* This entire reservation is too high; go to next. */
 1245                         continue;
 1246                 }
 1247                 if (!vm_addr_align_ok(pa, alignment)) {
 1248                         /* This entire reservation is unaligned; go to next. */
 1249                         continue;
 1250                 }
 1251 
 1252                 if (vm_reserv_trylock(rv) == 0) {
 1253                         TAILQ_INSERT_AFTER(queue, rv, marker, partpopq);
 1254                         vm_reserv_domain_unlock(domain);
 1255                         vm_reserv_lock(rv);
 1256                         if (TAILQ_PREV(marker, vm_reserv_queue, partpopq) !=
 1257                             rv) {
 1258                                 vm_reserv_unlock(rv);
 1259                                 vm_reserv_domain_lock(domain);
 1260                                 rvn = TAILQ_NEXT(marker, partpopq);
 1261                                 TAILQ_REMOVE(queue, marker, partpopq);
 1262                                 continue;
 1263                         }
 1264                         vm_reserv_domain_lock(domain);
 1265                         TAILQ_REMOVE(queue, marker, partpopq);
 1266                 }
 1267                 vm_reserv_domain_unlock(domain);
 1268                 lo = (pa >= low) ? 0 :
 1269                     (int)((low + PAGE_MASK - pa) >> PAGE_SHIFT);
 1270                 hi = (pa + VM_LEVEL_0_SIZE <= high) ? VM_LEVEL_0_NPAGES :
 1271                     (int)((high - pa) >> PAGE_SHIFT);
 1272                 posn = vm_reserv_find_contig(rv, (int)npages, lo, hi,
 1273                     ppn_align, ppn_bound);
 1274                 if (posn >= 0) {
 1275                         vm_reserv_domain_scan_unlock(domain);
 1276                         /* Allocate requested space */
 1277                         rv->popcnt += npages;
 1278                         bit_nset(rv->popmap, posn, posn + npages - 1);
 1279                         vm_reserv_reclaim(rv);
 1280                         vm_reserv_unlock(rv);
 1281                         m_ret = &rv->pages[posn];
 1282                         pa = VM_PAGE_TO_PHYS(m_ret);
 1283                         KASSERT(vm_addr_ok(pa, size, alignment, boundary),
 1284                             ("%s: adjusted address not aligned/bounded to "
 1285                              "%lx/%jx",
 1286                              __func__, alignment, (uintmax_t)boundary));
 1287                         return (m_ret);
 1288                 }
 1289                 vm_reserv_domain_lock(domain);
 1290                 rvn = TAILQ_NEXT(rv, partpopq);
 1291                 vm_reserv_unlock(rv);
 1292         }
 1293         vm_reserv_domain_unlock(domain);
 1294         vm_reserv_domain_scan_unlock(domain);
 1295         return (NULL);
 1296 }
 1297 
 1298 /*
 1299  * Transfers the reservation underlying the given page to a new object.
 1300  *
 1301  * The object must be locked.
 1302  */
 1303 void
 1304 vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object,
 1305     vm_pindex_t old_object_offset)
 1306 {
 1307         vm_reserv_t rv;
 1308 
 1309         VM_OBJECT_ASSERT_WLOCKED(new_object);
 1310         rv = vm_reserv_from_page(m);
 1311         if (rv->object == old_object) {
 1312                 vm_reserv_lock(rv);
 1313                 CTR6(KTR_VM,
 1314                     "%s: rv %p object %p new %p popcnt %d inpartpop %d",
 1315                     __FUNCTION__, rv, rv->object, new_object, rv->popcnt,
 1316                     rv->inpartpopq);
 1317                 if (rv->object == old_object) {
 1318                         vm_reserv_object_lock(old_object);
 1319                         rv->object = NULL;
 1320                         LIST_REMOVE(rv, objq);
 1321                         vm_reserv_object_unlock(old_object);
 1322                         vm_reserv_object_lock(new_object);
 1323                         rv->object = new_object;
 1324                         rv->pindex -= old_object_offset;
 1325                         LIST_INSERT_HEAD(&new_object->rvq, rv, objq);
 1326                         vm_reserv_object_unlock(new_object);
 1327                 }
 1328                 vm_reserv_unlock(rv);
 1329         }
 1330 }
 1331 
 1332 /*
 1333  * Returns the size (in bytes) of a reservation of the specified level.
 1334  */
 1335 int
 1336 vm_reserv_size(int level)
 1337 {
 1338 
 1339         switch (level) {
 1340         case 0:
 1341                 return (VM_LEVEL_0_SIZE);
 1342         case -1:
 1343                 return (PAGE_SIZE);
 1344         default:
 1345                 return (0);
 1346         }
 1347 }
 1348 
 1349 /*
 1350  * Allocates the virtual and physical memory required by the reservation
 1351  * management system's data structures, in particular, the reservation array.
 1352  */
 1353 vm_paddr_t
 1354 vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end)
 1355 {
 1356         vm_paddr_t new_end;
 1357         vm_pindex_t count;
 1358         size_t size;
 1359         int i;
 1360 
 1361         count = 0;
 1362         for (i = 0; i < vm_phys_nsegs; i++) {
 1363 #ifdef VM_PHYSSEG_SPARSE
 1364                 count += howmany(vm_phys_segs[i].end, VM_LEVEL_0_SIZE) -
 1365                     vm_phys_segs[i].start / VM_LEVEL_0_SIZE;
 1366 #else
 1367                 count = MAX(count,
 1368                     howmany(vm_phys_segs[i].end, VM_LEVEL_0_SIZE));
 1369 #endif
 1370         }
 1371 
 1372         for (i = 0; phys_avail[i + 1] != 0; i += 2) {
 1373 #ifdef VM_PHYSSEG_SPARSE
 1374                 count += howmany(phys_avail[i + 1], VM_LEVEL_0_SIZE) -
 1375                     phys_avail[i] / VM_LEVEL_0_SIZE;
 1376 #else
 1377                 count = MAX(count,
 1378                     howmany(phys_avail[i + 1], VM_LEVEL_0_SIZE));
 1379 #endif
 1380         }
 1381 
 1382         /*
  1383          * Calculate the size (in bytes) of the reservation array.  Round up
 1384          * for partial superpages at boundaries, as every small page is mapped
 1385          * to an element in the reservation array based on its physical address.
 1386          * Thus, the number of elements in the reservation array can be greater
 1387          * than the number of superpages.
 1388          */
 1389         size = count * sizeof(struct vm_reserv);
 1390 
 1391         /*
 1392          * Allocate and map the physical memory for the reservation array.  The
 1393          * next available virtual address is returned by reference.
 1394          */
 1395         new_end = end - round_page(size);
 1396         vm_reserv_array = (void *)(uintptr_t)pmap_map(vaddr, new_end, end,
 1397             VM_PROT_READ | VM_PROT_WRITE);
 1398         bzero(vm_reserv_array, size);
 1399 
 1400         /*
 1401          * Return the next available physical address.
 1402          */
 1403         return (new_end);
 1404 }
 1405 
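
Back-of-the-envelope illustration (assumed values, not from the source): with 2 MB reservations, a machine whose physical memory ends at 16 GB needs howmany(16 GB, 2 MB) = 8192 array elements in the dense layout, while the sparse layout only counts the chunks overlapping each segment.  A standalone sketch of that counting, using an assumed sample segment:

#include <stdio.h>

#define LEVEL_0_SIZE    (2ULL * 1024 * 1024)    /* assumed 2 MB reservations */

/* howmany() as in <sys/param.h>: divide, rounding up. */
#define howmany(x, y)   (((x) + ((y) - 1)) / (y))

int
main(void)
{
        unsigned long long seg_start = 1ULL << 30;      /* sample segment: [1 GB, 16 GB) */
        unsigned long long seg_end = 16ULL << 30;

        /* Dense layout: one element per 2 MB chunk from address 0 to seg_end. */
        printf("dense count:  %llu\n", howmany(seg_end, LEVEL_0_SIZE));
        /* Sparse layout: only the 2 MB chunks that overlap this segment. */
        printf("sparse count: %llu\n",
            howmany(seg_end, LEVEL_0_SIZE) - seg_start / LEVEL_0_SIZE);
        return (0);
}
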
 1406 /*
 1407  * Returns the superpage containing the given page.
 1408  */
 1409 vm_page_t
 1410 vm_reserv_to_superpage(vm_page_t m)
 1411 {
 1412         vm_reserv_t rv;
 1413 
 1414         VM_OBJECT_ASSERT_LOCKED(m->object);
 1415         rv = vm_reserv_from_page(m);
 1416         if (rv->object == m->object && rv->popcnt == VM_LEVEL_0_NPAGES)
 1417                 m = rv->pages;
 1418         else
 1419                 m = NULL;
 1420 
 1421         return (m);
 1422 }
 1423 
 1424 #endif  /* VM_NRESERVLEVEL > 0 */
