FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_reserv.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2002-2006 Rice University
    5  * Copyright (c) 2007-2011 Alan L. Cox <alc@cs.rice.edu>
    6  * All rights reserved.
    7  *
    8  * This software was developed for the FreeBSD Project by Alan L. Cox,
    9  * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
   10  *
   11  * Redistribution and use in source and binary forms, with or without
   12  * modification, are permitted provided that the following conditions
   13  * are met:
   14  * 1. Redistributions of source code must retain the above copyright
   15  *    notice, this list of conditions and the following disclaimer.
   16  * 2. Redistributions in binary form must reproduce the above copyright
   17  *    notice, this list of conditions and the following disclaimer in the
   18  *    documentation and/or other materials provided with the distribution.
   19  *
   20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   23  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
   24  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
   27  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
   28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
   30  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   31  * POSSIBILITY OF SUCH DAMAGE.
   32  */
   33 
   34 /*
   35  *      Superpage reservation management module
   36  *
   37  * Any external functions defined by this module are only to be used by the
   38  * virtual memory system.
   39  */
   40 
   41 #include <sys/cdefs.h>
   42 __FBSDID("$FreeBSD: releng/12.0/sys/vm/vm_reserv.c 336055 2018-07-07 01:54:45Z jeff $");
   43 
   44 #include "opt_vm.h"
   45 
   46 #include <sys/param.h>
   47 #include <sys/kernel.h>
   48 #include <sys/lock.h>
   49 #include <sys/malloc.h>
   50 #include <sys/mutex.h>
   51 #include <sys/queue.h>
   52 #include <sys/rwlock.h>
   53 #include <sys/sbuf.h>
   54 #include <sys/sysctl.h>
   55 #include <sys/systm.h>
   56 #include <sys/counter.h>
   57 #include <sys/ktr.h>
   58 #include <sys/vmmeter.h>
   59 #include <sys/smp.h>
   60 
   61 #include <vm/vm.h>
   62 #include <vm/vm_param.h>
   63 #include <vm/vm_object.h>
   64 #include <vm/vm_page.h>
   65 #include <vm/vm_pageout.h>
   66 #include <vm/vm_phys.h>
   67 #include <vm/vm_pagequeue.h>
   68 #include <vm/vm_radix.h>
   69 #include <vm/vm_reserv.h>
   70 
   71 /*
   72  * The reservation system supports the speculative allocation of large physical
   73  * pages ("superpages").  Speculative allocation enables the fully automatic
   74  * utilization of superpages by the virtual memory system.  In other words, no
   75  * programmatic directives are required to use superpages.
   76  */
   77 
   78 #if VM_NRESERVLEVEL > 0
   79 
   80 /*
   81  * The number of small pages that are contained in a level 0 reservation
   82  */
   83 #define VM_LEVEL_0_NPAGES       (1 << VM_LEVEL_0_ORDER)
   84 
   85 /*
   86  * The number of bits by which a physical address is shifted to obtain the
   87  * reservation number
   88  */
   89 #define VM_LEVEL_0_SHIFT        (VM_LEVEL_0_ORDER + PAGE_SHIFT)
   90 
   91 /*
   92  * The size of a level 0 reservation in bytes
   93  */
   94 #define VM_LEVEL_0_SIZE         (1 << VM_LEVEL_0_SHIFT)
   95 
   96 /*
   97  * Computes the index of the small page underlying the given (object, pindex)
   98  * within the reservation's array of small pages.
   99  */
  100 #define VM_RESERV_INDEX(object, pindex) \
  101     (((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1))
  102 
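/*
 * A worked example of the macros above, assuming the amd64 values
 * VM_LEVEL_0_ORDER == 9 and PAGE_SHIFT == 12:
 *
 *      VM_LEVEL_0_NPAGES = 1 << 9       = 512 small pages
 *      VM_LEVEL_0_SHIFT  = 9 + 12       = 21
 *      VM_LEVEL_0_SIZE   = 1 << 21      = 2 MB
 *
 * VM_RESERV_INDEX() reduces (pg_color + pindex) modulo VM_LEVEL_0_NPAGES,
 * so on amd64 it yields the page's offset, 0..511, within its 2 MB
 * reservation.
 */
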
  103 /*
  104  * The size of a population map entry
  105  */
  106 typedef u_long          popmap_t;
  107 
  108 /*
  109  * The number of bits in a population map entry
  110  */
  111 #define NBPOPMAP        (NBBY * sizeof(popmap_t))
  112 
  113 /*
  114  * The number of population map entries in a reservation
  115  */
  116 #define NPOPMAP         howmany(VM_LEVEL_0_NPAGES, NBPOPMAP)
  117 
  118 /*
  119  * Number of elapsed ticks before we update the LRU queue position.  Used
  120  * to reduce contention and churn on the list.
  121  */
  122 #define PARTPOPSLOP     1
  123 
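/*
 * In effect, with PARTPOPSLOP == 1, vm_reserv_populate() and
 * vm_reserv_depopulate() below normally requeue a reservation only when at
 * least one clock tick has elapsed since "lasttick", so a reservation's
 * position in the LRU queue changes at most about once per tick no matter
 * how many of its pages are allocated or freed in between.  The unsigned
 * subtraction (ticks - rv->lasttick) keeps the comparison well defined
 * even after the "ticks" counter wraps around.
 */
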
  124 /*
  125  * Clear a bit in the population map.
  126  */
  127 static __inline void
  128 popmap_clear(popmap_t popmap[], int i)
  129 {
  130 
  131         popmap[i / NBPOPMAP] &= ~(1UL << (i % NBPOPMAP));
  132 }
  133 
  134 /*
  135  * Set a bit in the population map.
  136  */
  137 static __inline void
  138 popmap_set(popmap_t popmap[], int i)
  139 {
  140 
  141         popmap[i / NBPOPMAP] |= 1UL << (i % NBPOPMAP);
  142 }
  143 
  144 /*
  145  * Is a bit in the population map clear?
  146  */
  147 static __inline boolean_t
  148 popmap_is_clear(popmap_t popmap[], int i)
  149 {
  150 
  151         return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) == 0);
  152 }
  153 
  154 /*
  155  * Is a bit in the population map set?
  156  */
  157 static __inline boolean_t
  158 popmap_is_set(popmap_t popmap[], int i)
  159 {
  160 
  161         return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) != 0);
  162 }
  163 
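/*
 * A minimal userland sketch (hypothetical, not taken from the kernel) that
 * mirrors the popmap helpers above, shown only to illustrate the word/bit
 * arithmetic.  On LP64, a popmap word holds 64 bits, so page index 70
 * lands in word 70 / 64 == 1 at bit 70 % 64 == 6.
 */
#include <limits.h>
#include <stdio.h>

#define	EX_NBPOPMAP	(CHAR_BIT * sizeof(unsigned long))

static void
ex_popmap_set(unsigned long popmap[], int i)
{

	popmap[i / EX_NBPOPMAP] |= 1UL << (i % EX_NBPOPMAP);
}

static int
ex_popmap_is_set(unsigned long popmap[], int i)
{

	return ((popmap[i / EX_NBPOPMAP] & (1UL << (i % EX_NBPOPMAP))) != 0);
}

int
main(void)
{
	unsigned long popmap[8] = { 0 };	/* covers 512 pages on LP64 */

	ex_popmap_set(popmap, 70);
	printf("popmap[1] = %#lx, bit 70 set: %d\n",
	    popmap[1], ex_popmap_is_set(popmap, 70));	/* 0x40, 1 */
	return (0);
}
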
  164 /*
  165  * The reservation structure
  166  *
  167  * A reservation structure is constructed whenever a large physical page is
  168  * speculatively allocated to an object.  The reservation provides the small
  169  * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets
  170  * within that object.  The reservation's "popcnt" tracks the number of these
  171  * small physical pages that are in use at any given time.  When and if the
  172  * reservation is not fully utilized, it appears in the queue of partially
  173  * populated reservations.  The reservation always appears on the containing
  174  * object's list of reservations.
  175  *
  176  * A partially populated reservation can be broken and reclaimed at any time.
  177  *
  178  * r - vm_reserv_lock
  179  * d - vm_reserv_domain_lock
  180  * o - vm_reserv_object_lock
  181  * c - constant after boot
  182  */
  183 struct vm_reserv {
  184         struct mtx      lock;                   /* reservation lock. */
  185         TAILQ_ENTRY(vm_reserv) partpopq;        /* (d) per-domain queue. */
  186         LIST_ENTRY(vm_reserv) objq;             /* (o, r) object queue */
  187         vm_object_t     object;                 /* (o, r) containing object */
  188         vm_pindex_t     pindex;                 /* (o, r) offset in object */
  189         vm_page_t       pages;                  /* (c) first page  */
  190         uint16_t        domain;                 /* (c) NUMA domain. */
  191         uint16_t        popcnt;                 /* (r) # of pages in use */
  192         int             lasttick;               /* (r) last pop update tick. */
  193         char            inpartpopq;             /* (d) */
  194         popmap_t        popmap[NPOPMAP];        /* (r) bit vector, used pages */
  195 };
  196 
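/*
 * Note on the fields above: for a page "m" belonging to a reservation
 * "rv", the page's index within the reservation is simply "m - rv->pages"
 * (the vm_page structures for the reservation's physically contiguous
 * pages are themselves contiguous), and that index is also the page's bit
 * number in "popmap".  "popcnt" is kept equal to the number of set bits in
 * "popmap"; vm_reserv_free_page() and vm_reserv_is_page_free() below rely
 * on this correspondence.
 */
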
  197 #define vm_reserv_lockptr(rv)           (&(rv)->lock)
  198 #define vm_reserv_assert_locked(rv)                                     \
  199             mtx_assert(vm_reserv_lockptr(rv), MA_OWNED)
  200 #define vm_reserv_lock(rv)              mtx_lock(vm_reserv_lockptr(rv))
  201 #define vm_reserv_trylock(rv)           mtx_trylock(vm_reserv_lockptr(rv))
  202 #define vm_reserv_unlock(rv)            mtx_unlock(vm_reserv_lockptr(rv))
  203 
  204 static struct mtx_padalign vm_reserv_domain_locks[MAXMEMDOM];
  205 
  206 #define vm_reserv_domain_lockptr(d)     &vm_reserv_domain_locks[(d)]
  207 #define vm_reserv_domain_lock(d)        mtx_lock(vm_reserv_domain_lockptr(d))
  208 #define vm_reserv_domain_unlock(d)      mtx_unlock(vm_reserv_domain_lockptr(d))
  209 
  210 /*
  211  * The reservation array
  212  *
  213  * This array is analogous in function to vm_page_array.  It differs in the
  214  * respect that it may contain a greater number of reservation
  215  * structures than there are (physical) superpages.  These "invalid"
  216  * reservation structures exist to trade-off space for time in the
  217  * implementation of vm_reserv_from_page().  Invalid reservation structures are
  218  * distinguishable from "valid" reservation structures by inspecting the
  219  * reservation's "pages" field.  Invalid reservation structures have a NULL
  220  * "pages" field.
  221  *
  222  * vm_reserv_from_page() maps a small (physical) page to an element of this
  223  * array by computing a physical reservation number from the page's physical
  224  * address.  The physical reservation number is used as the array index.
  225  *
  226  * An "active" reservation is a valid reservation structure that has a non-NULL
  227  * "object" field and a non-zero "popcnt" field.  In other words, every active
  228  * reservation belongs to a particular object.  Moreover, every active
  229  * reservation has an entry in the containing object's list of reservations.  
  230  */
  231 static vm_reserv_t vm_reserv_array;
  232 
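/*
 * A worked example of the mapping described above, again assuming the
 * amd64 value VM_LEVEL_0_SHIFT == 21: a page at physical address
 * 0x40321000 maps to vm_reserv_array[0x40321000 >> 21], i.e., index 0x201,
 * which covers the 2 MB frame [0x40200000, 0x40400000).  If that frame is
 * not entirely contained in a managed physical segment, vm_reserv_init()
 * never initializes the element's "pages" field and the structure remains
 * "invalid".
 */
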
  233 /*
  234  * The partially populated reservation queue
  235  *
  236  * This queue enables the fast recovery of an unused free small page from a
  237  * partially populated reservation.  The reservation at the head of this queue
  238  * is the least recently changed, partially populated reservation.
  239  *
  240  * Access to this queue is synchronized by the free page queue lock.
  241  */
  242 static TAILQ_HEAD(, vm_reserv) vm_rvq_partpop[MAXMEMDOM];
  243 
  244 static SYSCTL_NODE(_vm, OID_AUTO, reserv, CTLFLAG_RD, 0, "Reservation Info");
  245 
  246 static counter_u64_t vm_reserv_broken = EARLY_COUNTER;
  247 SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, broken, CTLFLAG_RD,
  248     &vm_reserv_broken, "Cumulative number of broken reservations");
  249 
  250 static counter_u64_t vm_reserv_freed = EARLY_COUNTER;
  251 SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD,
  252     &vm_reserv_freed, "Cumulative number of freed reservations");
  253 
  254 static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS);
  255 
  256 SYSCTL_PROC(_vm_reserv, OID_AUTO, fullpop, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
  257     sysctl_vm_reserv_fullpop, "I", "Current number of full reservations");
  258 
  259 static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS);
  260 
  261 SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
  262     sysctl_vm_reserv_partpopq, "A", "Partially populated reservation queues");
  263 
  264 static counter_u64_t vm_reserv_reclaimed = EARLY_COUNTER;
  265 SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD,
  266     &vm_reserv_reclaimed, "Cumulative number of reclaimed reservations");
  267 
  268 /*
  269  * The object lock pool is used to synchronize the rvq.  We cannot use a
  270  * pool mutex because this lock is needed before malloc works.
  271  *
  272  * The "hash" function could be made faster without divide and modulo.
  273  */
  274 #define VM_RESERV_OBJ_LOCK_COUNT        MAXCPU
  275 
  276 struct mtx_padalign vm_reserv_object_mtx[VM_RESERV_OBJ_LOCK_COUNT];
  277 
  278 #define vm_reserv_object_lock_idx(object)                       \
  279             (((uintptr_t)object / sizeof(*object)) % VM_RESERV_OBJ_LOCK_COUNT)
  280 #define vm_reserv_object_lock_ptr(object)                       \
  281             &vm_reserv_object_mtx[vm_reserv_object_lock_idx((object))]
  282 #define vm_reserv_object_lock(object)                           \
  283             mtx_lock(vm_reserv_object_lock_ptr((object)))
  284 #define vm_reserv_object_unlock(object)                         \
  285             mtx_unlock(vm_reserv_object_lock_ptr((object)))
  286 
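/*
 * A hypothetical sketch of the faster hash that the comment above alludes
 * to; the macro names, the shift amount, and the rounded-up lock count are
 * illustrative only.  Rounding the count up to a power of two turns the
 * modulo into a mask, and a right shift that discards the object pointer's
 * low-order (alignment) bits stands in for the division by the structure
 * size.
 */
#define	VM_RESERV_OBJ_LOCK_POW2		128	/* hypothetical, power of two */
#define	vm_reserv_object_lock_idx_fast(object)				\
	    ((((uintptr_t)(object)) >> 6) & (VM_RESERV_OBJ_LOCK_POW2 - 1))
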
  287 static void             vm_reserv_break(vm_reserv_t rv);
  288 static void             vm_reserv_depopulate(vm_reserv_t rv, int index);
  289 static vm_reserv_t      vm_reserv_from_page(vm_page_t m);
  290 static boolean_t        vm_reserv_has_pindex(vm_reserv_t rv,
  291                             vm_pindex_t pindex);
  292 static void             vm_reserv_populate(vm_reserv_t rv, int index);
  293 static void             vm_reserv_reclaim(vm_reserv_t rv);
  294 
  295 /*
  296  * Returns the current number of full reservations.
  297  *
  298  * Since the number of full reservations is computed without acquiring the
  299  * free page queue lock, the returned value may be inexact.
  300  */
  301 static int
  302 sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS)
  303 {
  304         vm_paddr_t paddr;
  305         struct vm_phys_seg *seg;
  306         vm_reserv_t rv;
  307         int fullpop, segind;
  308 
  309         fullpop = 0;
  310         for (segind = 0; segind < vm_phys_nsegs; segind++) {
  311                 seg = &vm_phys_segs[segind];
  312                 paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
  313                 while (paddr + VM_LEVEL_0_SIZE <= seg->end) {
  314                         rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT];
  315                         fullpop += rv->popcnt == VM_LEVEL_0_NPAGES;
  316                         paddr += VM_LEVEL_0_SIZE;
  317                 }
  318         }
  319         return (sysctl_handle_int(oidp, &fullpop, 0, req));
  320 }
  321 
  322 /*
  323  * Describes the current state of the partially populated reservation queue.
  324  */
  325 static int
  326 sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS)
  327 {
  328         struct sbuf sbuf;
  329         vm_reserv_t rv;
  330         int counter, error, domain, level, unused_pages;
  331 
  332         error = sysctl_wire_old_buffer(req, 0);
  333         if (error != 0)
  334                 return (error);
  335         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
  336         sbuf_printf(&sbuf, "\nDOMAIN    LEVEL     SIZE  NUMBER\n\n");
  337         for (domain = 0; domain < vm_ndomains; domain++) {
  338                 for (level = -1; level <= VM_NRESERVLEVEL - 2; level++) {
  339                         counter = 0;
  340                         unused_pages = 0;
  341                         vm_reserv_domain_lock(domain);
  342                         TAILQ_FOREACH(rv, &vm_rvq_partpop[domain], partpopq) {
  343                                 counter++;
  344                                 unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt;
  345                         }
  346                         vm_reserv_domain_unlock(domain);
  347                         sbuf_printf(&sbuf, "%6d, %7d, %6dK, %6d\n",
  348                             domain, level,
  349                             unused_pages * ((int)PAGE_SIZE / 1024), counter);
  350                 }
  351         }
  352         error = sbuf_finish(&sbuf);
  353         sbuf_delete(&sbuf);
  354         return (error);
  355 }
  356 
  357 /*
  358  * Remove a reservation from the object's objq.
  359  */
  360 static void
  361 vm_reserv_remove(vm_reserv_t rv)
  362 {
  363         vm_object_t object;
  364 
  365         vm_reserv_assert_locked(rv);
  366         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
  367             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
  368         KASSERT(rv->object != NULL,
  369             ("vm_reserv_remove: reserv %p is free", rv));
  370         KASSERT(!rv->inpartpopq,
  371             ("vm_reserv_remove: reserv %p's inpartpopq is TRUE", rv));
  372         object = rv->object;
  373         vm_reserv_object_lock(object);
  374         LIST_REMOVE(rv, objq);
  375         rv->object = NULL;
  376         vm_reserv_object_unlock(object);
  377 }
  378 
  379 /*
  380  * Insert a new reservation into the object's objq.
  381  */
  382 static void
  383 vm_reserv_insert(vm_reserv_t rv, vm_object_t object, vm_pindex_t pindex)
  384 {
  385         int i;
  386 
  387         vm_reserv_assert_locked(rv);
  388         CTR6(KTR_VM,
  389             "%s: rv %p(%p) object %p new %p popcnt %d",
  390             __FUNCTION__, rv, rv->pages, rv->object, object,
  391            rv->popcnt);
  392         KASSERT(rv->object == NULL,
  393             ("vm_reserv_insert: reserv %p isn't free", rv));
  394         KASSERT(rv->popcnt == 0,
  395             ("vm_reserv_insert: reserv %p's popcnt is corrupted", rv));
  396         KASSERT(!rv->inpartpopq,
  397             ("vm_reserv_insert: reserv %p's inpartpopq is TRUE", rv));
  398         for (i = 0; i < NPOPMAP; i++)
  399                 KASSERT(rv->popmap[i] == 0,
  400                     ("vm_reserv_insert: reserv %p's popmap is corrupted", rv));
  401         vm_reserv_object_lock(object);
  402         rv->pindex = pindex;
  403         rv->object = object;
  404         rv->lasttick = ticks;
  405         LIST_INSERT_HEAD(&object->rvq, rv, objq);
  406         vm_reserv_object_unlock(object);
  407 }
  408 
  409 /*
  410  * Reduces the given reservation's population count.  If the population count
  411  * becomes zero, the reservation is destroyed.  Additionally, moves the
  412  * reservation to the tail of the partially populated reservation queue if the
  413  * population count is non-zero.
  414  */
  415 static void
  416 vm_reserv_depopulate(vm_reserv_t rv, int index)
  417 {
  418         struct vm_domain *vmd;
  419 
  420         vm_reserv_assert_locked(rv);
  421         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
  422             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
  423         KASSERT(rv->object != NULL,
  424             ("vm_reserv_depopulate: reserv %p is free", rv));
  425         KASSERT(popmap_is_set(rv->popmap, index),
  426             ("vm_reserv_depopulate: reserv %p's popmap[%d] is clear", rv,
  427             index));
  428         KASSERT(rv->popcnt > 0,
  429             ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv));
  430         KASSERT(rv->domain < vm_ndomains,
  431             ("vm_reserv_depopulate: reserv %p's domain is corrupted %d",
  432             rv, rv->domain));
  433         if (rv->popcnt == VM_LEVEL_0_NPAGES) {
  434                 KASSERT(rv->pages->psind == 1,
  435                     ("vm_reserv_depopulate: reserv %p is already demoted",
  436                     rv));
  437                 rv->pages->psind = 0;
  438         }
  439         popmap_clear(rv->popmap, index);
  440         rv->popcnt--;
  441         if ((unsigned)(ticks - rv->lasttick) >= PARTPOPSLOP ||
  442             rv->popcnt == 0) {
  443                 vm_reserv_domain_lock(rv->domain);
  444                 if (rv->inpartpopq) {
  445                         TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq);
  446                         rv->inpartpopq = FALSE;
  447                 }
  448                 if (rv->popcnt != 0) {
  449                         rv->inpartpopq = TRUE;
  450                         TAILQ_INSERT_TAIL(&vm_rvq_partpop[rv->domain], rv, partpopq);
  451                 }
  452                 vm_reserv_domain_unlock(rv->domain);
  453                 rv->lasttick = ticks;
  454         }
  455         vmd = VM_DOMAIN(rv->domain);
  456         if (rv->popcnt == 0) {
  457                 vm_reserv_remove(rv);
  458                 vm_domain_free_lock(vmd);
  459                 vm_phys_free_pages(rv->pages, VM_LEVEL_0_ORDER);
  460                 vm_domain_free_unlock(vmd);
  461                 counter_u64_add(vm_reserv_freed, 1);
  462         }
  463         vm_domain_freecnt_inc(vmd, 1);
  464 }
  465 
  466 /*
  467  * Returns the reservation to which the given page might belong.
  468  */
  469 static __inline vm_reserv_t
  470 vm_reserv_from_page(vm_page_t m)
  471 {
  472 
  473         return (&vm_reserv_array[VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT]);
  474 }
  475 
  476 /*
  477  * Returns the existing reservation containing the given pindex, or NULL;
  477  * in either case, sets *msuccp to the successor page.
  478  */
  479 static vm_reserv_t
  480 vm_reserv_from_object(vm_object_t object, vm_pindex_t pindex,
  481     vm_page_t mpred, vm_page_t *msuccp)
  482 {
  483         vm_reserv_t rv;
  484         vm_page_t msucc;
  485 
  486         msucc = NULL;
  487         if (mpred != NULL) {
  488                 KASSERT(mpred->object == object,
  489                     ("vm_reserv_from_object: object doesn't contain mpred"));
  490                 KASSERT(mpred->pindex < pindex,
  491                     ("vm_reserv_from_object: mpred doesn't precede pindex"));
  492                 rv = vm_reserv_from_page(mpred);
  493                 if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
  494                         goto found;
  495                 msucc = TAILQ_NEXT(mpred, listq);
  496         } else
  497                 msucc = TAILQ_FIRST(&object->memq);
  498         if (msucc != NULL) {
  499                 KASSERT(msucc->pindex > pindex,
  500                     ("vm_reserv_from_object: msucc doesn't succeed pindex"));
  501                 rv = vm_reserv_from_page(msucc);
  502                 if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
  503                         goto found;
  504         }
  505         rv = NULL;
  506 
  507 found:
  508         *msuccp = msucc;
  509 
  510         return (rv);
  511 }
  512 
  513 /*
  514  * Returns TRUE if the given reservation contains the given page index and
  515  * FALSE otherwise.
  516  */
  517 static __inline boolean_t
  518 vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex)
  519 {
  520 
  521         return (((pindex - rv->pindex) & ~(VM_LEVEL_0_NPAGES - 1)) == 0);
  522 }
  523 
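/*
 * The containment test above works because vm_pindex_t is unsigned.  If
 * "pindex" lies in [rv->pindex, rv->pindex + VM_LEVEL_0_NPAGES), the
 * difference is small and masking with ~(VM_LEVEL_0_NPAGES - 1) clears it
 * to zero; if "pindex" precedes rv->pindex, the subtraction wraps to a
 * huge value and high bits survive the mask.  For example, with
 * VM_LEVEL_0_NPAGES == 512 and rv->pindex == 1000, pindex == 1300 gives
 * 300 & ~511 == 0 (contained), while pindex == 1540 gives
 * 540 & ~511 == 512 != 0 (not contained).
 */
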
  524 /*
  525  * Increases the given reservation's population count.  Moves the reservation
  526  * to the tail of the partially populated reservation queue.
  527  *
  528  * The free page queue must be locked.
  529  */
  530 static void
  531 vm_reserv_populate(vm_reserv_t rv, int index)
  532 {
  533 
  534         vm_reserv_assert_locked(rv);
  535         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
  536             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
  537         KASSERT(rv->object != NULL,
  538             ("vm_reserv_populate: reserv %p is free", rv));
  539         KASSERT(popmap_is_clear(rv->popmap, index),
  540             ("vm_reserv_populate: reserv %p's popmap[%d] is set", rv,
  541             index));
  542         KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES,
  543             ("vm_reserv_populate: reserv %p is already full", rv));
  544         KASSERT(rv->pages->psind == 0,
  545             ("vm_reserv_populate: reserv %p is already promoted", rv));
  546         KASSERT(rv->domain < vm_ndomains,
  547             ("vm_reserv_populate: reserv %p's domain is corrupted %d",
  548             rv, rv->domain));
  549         popmap_set(rv->popmap, index);
  550         rv->popcnt++;
  551         if ((unsigned)(ticks - rv->lasttick) < PARTPOPSLOP &&
  552             rv->inpartpopq && rv->popcnt != VM_LEVEL_0_NPAGES)
  553                 return;
  554         rv->lasttick = ticks;
  555         vm_reserv_domain_lock(rv->domain);
  556         if (rv->inpartpopq) {
  557                 TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq);
  558                 rv->inpartpopq = FALSE;
  559         }
  560         if (rv->popcnt < VM_LEVEL_0_NPAGES) {
  561                 rv->inpartpopq = TRUE;
  562                 TAILQ_INSERT_TAIL(&vm_rvq_partpop[rv->domain], rv, partpopq);
  563         } else {
  564                 KASSERT(rv->pages->psind == 0,
  565                     ("vm_reserv_populate: reserv %p is already promoted",
  566                     rv));
  567                 rv->pages->psind = 1;
  568         }
  569         vm_reserv_domain_unlock(rv->domain);
  570 }
  571 
  572 /*
  573  * Attempts to allocate a contiguous set of physical pages from existing
  574  * reservations.  See vm_reserv_alloc_contig() for a description of the
  575  * function's parameters.
  576  *
  577  * The page "mpred" must immediately precede the offset "pindex" within the
  578  * specified object.
  579  *
  580  * The object must be locked.
  581  */
  582 vm_page_t
  583 vm_reserv_extend_contig(int req, vm_object_t object, vm_pindex_t pindex,
  584     int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
  585     u_long alignment, vm_paddr_t boundary, vm_page_t mpred)
  586 {
  587         struct vm_domain *vmd;
  588         vm_paddr_t pa, size;
  589         vm_page_t m, msucc;
  590         vm_reserv_t rv;
  591         int i, index;
  592 
  593         VM_OBJECT_ASSERT_WLOCKED(object);
  594         KASSERT(npages != 0, ("vm_reserv_extend_contig: npages is 0"));
  595 
  596         /*
  597          * Is a reservation fundamentally impossible?
  598          */
  599         if (pindex < VM_RESERV_INDEX(object, pindex) ||
  600             pindex + npages > object->size || object->resident_page_count == 0)
  601                 return (NULL);
  602 
  603         /*
  604          * All reservations of a particular size have the same alignment.
  605          * Assuming that the first page is allocated from a reservation, the
  606          * least significant bits of its physical address can be determined
  607          * from its offset from the beginning of the reservation and the size
  608          * of the reservation.
  609          *
  610          * Could the specified index within a reservation of the smallest
  611          * possible size satisfy the alignment and boundary requirements?
  612          */
  613         pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT;
  614         if ((pa & (alignment - 1)) != 0)
  615                 return (NULL);
  616         size = npages << PAGE_SHIFT;
  617         if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
  618                 return (NULL);
  619 
  620         /*
  621          * Look for an existing reservation.
  622          */
  623         rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
  624         if (rv == NULL)
  625                 return (NULL);
  626         KASSERT(object != kernel_object || rv->domain == domain,
  627             ("vm_reserv_extend_contig: Domain mismatch from reservation."));
  628         index = VM_RESERV_INDEX(object, pindex);
  629         /* Does the allocation fit within the reservation? */
  630         if (index + npages > VM_LEVEL_0_NPAGES)
  631                 return (NULL);
  632         domain = rv->domain;
  633         vmd = VM_DOMAIN(domain);
  634         vm_reserv_lock(rv);
  635         if (rv->object != object)
  636                 goto out;
  637         m = &rv->pages[index];
  638         pa = VM_PAGE_TO_PHYS(m);
  639         if (pa < low || pa + size > high || (pa & (alignment - 1)) != 0 ||
  640             ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
  641                 goto out;
  642         /* Handle vm_page_rename(m, new_object, ...). */
  643         for (i = 0; i < npages; i++) {
  644                 if (popmap_is_set(rv->popmap, index + i))
  645                         goto out;
  646         }
  647         if (!vm_domain_allocate(vmd, req, npages))
  648                 goto out;
  649         for (i = 0; i < npages; i++)
  650                 vm_reserv_populate(rv, index + i);
  651         vm_reserv_unlock(rv);
  652         return (m);
  653 
  654 out:
  655         vm_reserv_unlock(rv);
  656         return (NULL);
  657 }
  658 
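/*
 * A worked example of the alignment/boundary test used above and repeated
 * in vm_reserv_alloc_contig() below, assuming 4 KB pages.  Suppose
 * npages == 16 (size == 0x10000), boundary == 0x200000 (2 MB), and the
 * candidate run starts at pa == 0x1ff000.  Then pa + size - 1 == 0x20efff,
 * and (pa ^ (pa + size - 1)) & ~(boundary - 1) == 0x200000 != 0: the first
 * and last byte addresses differ in a bit at or above the boundary, so the
 * run would straddle a 2 MB boundary and the candidate is rejected.  The
 * companion test (pa & (alignment - 1)) != 0 simply rejects a start
 * address that is not a multiple of the (power-of-two) alignment.
 */
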
  659 /*
  660  * Allocates a contiguous set of physical pages of the given size "npages"
  661  * from newly created reservations.  All of the physical pages
  662  * must be at or above the given physical address "low" and below the given
  663  * physical address "high".  The given value "alignment" determines the
  664  * alignment of the first physical page in the set.  If the given value
  665  * "boundary" is non-zero, then the set of physical pages cannot cross any
  666  * physical address boundary that is a multiple of that value.  Both
  667  * "alignment" and "boundary" must be a power of two.
  668  *
  669  * Callers should first invoke vm_reserv_extend_contig() to attempt an
  670  * allocation from existing reservations.
  671  *
  672  * The page "mpred" must immediately precede the offset "pindex" within the
  673  * specified object.
  674  *
  675  * The object and free page queue must be locked.
  676  */
  677 vm_page_t
  678 vm_reserv_alloc_contig(int req, vm_object_t object, vm_pindex_t pindex, int domain,
  679     u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
  680     vm_paddr_t boundary, vm_page_t mpred)
  681 {
  682         struct vm_domain *vmd;
  683         vm_paddr_t pa, size;
  684         vm_page_t m, m_ret, msucc;
  685         vm_pindex_t first, leftcap, rightcap;
  686         vm_reserv_t rv;
  687         u_long allocpages, maxpages, minpages;
  688         int i, index, n;
  689 
  690         VM_OBJECT_ASSERT_WLOCKED(object);
  691         KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0"));
  692 
  693         /*
  694          * Is a reservation fundamentally impossible?
  695          */
  696         if (pindex < VM_RESERV_INDEX(object, pindex) ||
  697             pindex + npages > object->size)
  698                 return (NULL);
  699 
  700         /*
  701          * All reservations of a particular size have the same alignment.
  702          * Assuming that the first page is allocated from a reservation, the
  703          * least significant bits of its physical address can be determined
  704          * from its offset from the beginning of the reservation and the size
  705          * of the reservation.
  706          *
  707          * Could the specified index within a reservation of the smallest
  708          * possible size satisfy the alignment and boundary requirements?
  709          */
  710         pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT;
  711         if ((pa & (alignment - 1)) != 0)
  712                 return (NULL);
  713         size = npages << PAGE_SHIFT;
  714         if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
  715                 return (NULL);
  716 
  717         /*
  718          * Callers should've extended an existing reservation prior to
  719          * calling this function.  If a reservation exists it is
  720          * incompatible with the allocation.
  721          */
  722         rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
  723         if (rv != NULL)
  724                 return (NULL);
  725 
  726         /*
  727          * Could at least one reservation fit between the first index to the
  728          * left that can be used ("leftcap") and the first index to the right
  729          * that cannot be used ("rightcap")?
  730          *
  731          * We must synchronize with the reserv object lock to protect the
  732          * pindex/object of the resulting reservations against rename while
  733          * we are inspecting.
  734          */
  735         first = pindex - VM_RESERV_INDEX(object, pindex);
  736         minpages = VM_RESERV_INDEX(object, pindex) + npages;
  737         maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES);
  738         allocpages = maxpages;
  739         vm_reserv_object_lock(object);
  740         if (mpred != NULL) {
  741                 if ((rv = vm_reserv_from_page(mpred))->object != object)
  742                         leftcap = mpred->pindex + 1;
  743                 else
  744                         leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
  745                 if (leftcap > first) {
  746                         vm_reserv_object_unlock(object);
  747                         return (NULL);
  748                 }
  749         }
  750         if (msucc != NULL) {
  751                 if ((rv = vm_reserv_from_page(msucc))->object != object)
  752                         rightcap = msucc->pindex;
  753                 else
  754                         rightcap = rv->pindex;
  755                 if (first + maxpages > rightcap) {
  756                         if (maxpages == VM_LEVEL_0_NPAGES) {
  757                                 vm_reserv_object_unlock(object);
  758                                 return (NULL);
  759                         }
  760 
  761                         /*
  762                          * At least one reservation will fit between "leftcap"
  763                          * and "rightcap".  However, a reservation for the
  764                          * last of the requested pages will not fit.  Reduce
  765                          * the size of the upcoming allocation accordingly.
  766                          */
  767                         allocpages = minpages;
  768                 }
  769         }
  770         vm_reserv_object_unlock(object);
  771 
  772         /*
  773          * Would the last new reservation extend past the end of the object?
  774          */
  775         if (first + maxpages > object->size) {
  776                 /*
  777                  * Don't allocate the last new reservation if the object is a
  778                  * vnode or backed by another object that is a vnode. 
  779                  */
  780                 if (object->type == OBJT_VNODE ||
  781                     (object->backing_object != NULL &&
  782                     object->backing_object->type == OBJT_VNODE)) {
  783                         if (maxpages == VM_LEVEL_0_NPAGES)
  784                                 return (NULL);
  785                         allocpages = minpages;
  786                 }
  787                 /* Speculate that the object may grow. */
  788         }
  789 
  790         /*
  791          * Allocate the physical pages.  The alignment and boundary specified
  792          * for this allocation may be different from the alignment and
  793          * boundary specified for the requested pages.  For instance, the
  794          * specified index may not be the first page within the first new
  795          * reservation.
  796          */
  797         m = NULL;
  798         vmd = VM_DOMAIN(domain);
  799         if (vm_domain_allocate(vmd, req, npages)) {
  800                 vm_domain_free_lock(vmd);
  801                 m = vm_phys_alloc_contig(domain, allocpages, low, high,
  802                     ulmax(alignment, VM_LEVEL_0_SIZE),
  803                     boundary > VM_LEVEL_0_SIZE ? boundary : 0);
  804                 vm_domain_free_unlock(vmd);
  805                 if (m == NULL) {
  806                         vm_domain_freecnt_inc(vmd, npages);
  807                         return (NULL);
  808                 }
  809         } else
  810                 return (NULL);
  811         KASSERT(vm_phys_domain(m) == domain,
  812             ("vm_reserv_alloc_contig: Page domain does not match requested."));
  813 
  814         /*
  815          * The allocated physical pages always begin at a reservation
  816          * boundary, but they do not always end at a reservation boundary.
  817          * Initialize every reservation that is completely covered by the
  818          * allocated physical pages.
  819          */
  820         m_ret = NULL;
  821         index = VM_RESERV_INDEX(object, pindex);
  822         do {
  823                 rv = vm_reserv_from_page(m);
  824                 KASSERT(rv->pages == m,
  825                     ("vm_reserv_alloc_contig: reserv %p's pages is corrupted",
  826                     rv));
  827                 vm_reserv_lock(rv);
  828                 vm_reserv_insert(rv, object, first);
  829                 n = ulmin(VM_LEVEL_0_NPAGES - index, npages);
  830                 for (i = 0; i < n; i++)
  831                         vm_reserv_populate(rv, index + i);
  832                 npages -= n;
  833                 if (m_ret == NULL) {
  834                         m_ret = &rv->pages[index];
  835                         index = 0;
  836                 }
  837                 vm_reserv_unlock(rv);
  838                 m += VM_LEVEL_0_NPAGES;
  839                 first += VM_LEVEL_0_NPAGES;
  840                 allocpages -= VM_LEVEL_0_NPAGES;
  841         } while (allocpages >= VM_LEVEL_0_NPAGES);
  842         return (m_ret);
  843 }
  844 
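/*
 * The division of labor between the two contig functions above, as their
 * header comments describe it, amounts to roughly the following
 * (hypothetical caller sketch; in the kernel these calls are made from the
 * contiguous page allocation path in vm_page.c, not back to back like
 * this):
 *
 *	m = vm_reserv_extend_contig(req, object, pindex, domain, npages,
 *	    low, high, alignment, boundary, mpred);
 *	if (m == NULL)
 *		m = vm_reserv_alloc_contig(req, object, pindex, domain,
 *		    npages, low, high, alignment, boundary, mpred);
 *
 * vm_reserv_extend_contig() only consults reservations that already exist,
 * while vm_reserv_alloc_contig() creates new ones.
 */
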
  845 /*
  846  * Attempts to extend an existing reservation and allocate the page to the
  847  * object.
  848  *
  849  * The page "mpred" must immediately precede the offset "pindex" within the
  850  * specified object.
  851  *
  852  * The object must be locked.
  853  */
  854 vm_page_t
  855 vm_reserv_extend(int req, vm_object_t object, vm_pindex_t pindex, int domain,
  856     vm_page_t mpred)
  857 {
  858         struct vm_domain *vmd;
  859         vm_page_t m, msucc;
  860         vm_reserv_t rv;
  861         int index;
  862 
  863         VM_OBJECT_ASSERT_WLOCKED(object);
  864 
  865         /*
  866          * Could a reservation currently exist?
  867          */
  868         if (pindex < VM_RESERV_INDEX(object, pindex) ||
  869             pindex >= object->size || object->resident_page_count == 0)
  870                 return (NULL);
  871 
  872         /*
  873          * Look for an existing reservation.
  874          */
  875         rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
  876         if (rv == NULL)
  877                 return (NULL);
  878 
  879         KASSERT(object != kernel_object || rv->domain == domain,
  880             ("vm_reserv_extend: Domain mismatch from reservation."));
  881         domain = rv->domain;
  882         vmd = VM_DOMAIN(domain);
  883         index = VM_RESERV_INDEX(object, pindex);
  884         m = &rv->pages[index];
  885         vm_reserv_lock(rv);
  886         /* Handle reclaim race. */
  887         if (rv->object != object ||
  888             /* Handle vm_page_rename(m, new_object, ...). */
  889             popmap_is_set(rv->popmap, index)) {
  890                 m = NULL;
  891                 goto out;
  892         }
  893         if (vm_domain_allocate(vmd, req, 1) == 0)
  894                 m = NULL;
  895         else
  896                 vm_reserv_populate(rv, index);
  897 out:
  898         vm_reserv_unlock(rv);
  899 
  900         return (m);
  901 }
  902 
  903 /*
  904  * Attempts to allocate a new reservation for the object, and allocates a
  905  * page from that reservation.  Callers should first invoke vm_reserv_extend()
  906  * to attempt an allocation from an existing reservation.
  907  *
  908  * The page "mpred" must immediately precede the offset "pindex" within the
  909  * specified object.
  910  *
  911  * The object and free page queue must be locked.
  912  */
  913 vm_page_t
  914 vm_reserv_alloc_page(int req, vm_object_t object, vm_pindex_t pindex, int domain,
  915     vm_page_t mpred)
  916 {
  917         struct vm_domain *vmd;
  918         vm_page_t m, msucc;
  919         vm_pindex_t first, leftcap, rightcap;
  920         vm_reserv_t rv;
  921         int index;
  922 
  923         VM_OBJECT_ASSERT_WLOCKED(object);
  924 
  925         /*
  926          * Is a reservation fundamentally impossible?
  927          */
  928         if (pindex < VM_RESERV_INDEX(object, pindex) ||
  929             pindex >= object->size)
  930                 return (NULL);
  931 
  932         /*
  933          * Callers should've extended an existing reservation prior to
  934          * calling this function.  If a reservation exists it is
  935          * incompatible with the allocation.
  936          */
  937         rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
  938         if (rv != NULL)
  939                 return (NULL);
  940 
  941         /*
  942          * Could a reservation fit between the first index to the left that
  943          * can be used and the first index to the right that cannot be used?
  944          *
  945          * We must synchronize with the reserv object lock to protect the
  946          * pindex/object of the resulting reservations against rename while
  947          * we are inspecting.
  948          */
  949         first = pindex - VM_RESERV_INDEX(object, pindex);
  950         vm_reserv_object_lock(object);
  951         if (mpred != NULL) {
  952                 if ((rv = vm_reserv_from_page(mpred))->object != object)
  953                         leftcap = mpred->pindex + 1;
  954                 else
  955                         leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
  956                 if (leftcap > first) {
  957                         vm_reserv_object_unlock(object);
  958                         return (NULL);
  959                 }
  960         }
  961         if (msucc != NULL) {
  962                 if ((rv = vm_reserv_from_page(msucc))->object != object)
  963                         rightcap = msucc->pindex;
  964                 else
  965                         rightcap = rv->pindex;
  966                 if (first + VM_LEVEL_0_NPAGES > rightcap) {
  967                         vm_reserv_object_unlock(object);
  968                         return (NULL);
  969                 }
  970         }
  971         vm_reserv_object_unlock(object);
  972 
  973         /*
  974          * Would a new reservation extend past the end of the object? 
  975          */
  976         if (first + VM_LEVEL_0_NPAGES > object->size) {
  977                 /*
  978                  * Don't allocate a new reservation if the object is a vnode or
  979                  * backed by another object that is a vnode. 
  980                  */
  981                 if (object->type == OBJT_VNODE ||
  982                     (object->backing_object != NULL &&
  983                     object->backing_object->type == OBJT_VNODE))
  984                         return (NULL);
  985                 /* Speculate that the object may grow. */
  986         }
  987 
  988         /*
  989          * Allocate and populate the new reservation.
  990          */
  991         m = NULL;
  992         vmd = VM_DOMAIN(domain);
  993         if (vm_domain_allocate(vmd, req, 1)) {
  994                 vm_domain_free_lock(vmd);
  995                 m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT,
  996                     VM_LEVEL_0_ORDER);
  997                 vm_domain_free_unlock(vmd);
  998                 if (m == NULL) {
  999                         vm_domain_freecnt_inc(vmd, 1);
 1000                         return (NULL);
 1001                 }
 1002         } else
 1003                 return (NULL);
 1004         rv = vm_reserv_from_page(m);
 1005         vm_reserv_lock(rv);
 1006         KASSERT(rv->pages == m,
 1007             ("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv));
 1008         vm_reserv_insert(rv, object, first);
 1009         index = VM_RESERV_INDEX(object, pindex);
 1010         vm_reserv_populate(rv, index);
 1011         vm_reserv_unlock(rv);
 1012 
 1013         return (&rv->pages[index]);
 1014 }
 1015 
 1016 /*
 1017  * Breaks the given reservation.  All free pages in the reservation
 1018  * are returned to the physical memory allocator.  The reservation's
 1019  * population count and map are reset to their initial state.
 1020  *
 1021  * The given reservation must not be in the partially populated reservation
 1022  * queue.  The free page queue lock must be held.
 1023  */
 1024 static void
 1025 vm_reserv_break(vm_reserv_t rv)
 1026 {
 1027         int begin_zeroes, hi, i, lo;
 1028 
 1029         vm_reserv_assert_locked(rv);
 1030         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
 1031             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
 1032         vm_reserv_remove(rv);
 1033         rv->pages->psind = 0;
 1034         i = hi = 0;
 1035         do {
 1036                 /* Find the next 0 bit.  Any previous 0 bits are < "hi". */
 1037                 lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i]));
 1038                 if (lo == 0) {
 1039                         /* Redundantly clears bits < "hi". */
 1040                         rv->popmap[i] = 0;
 1041                         rv->popcnt -= NBPOPMAP - hi;
 1042                         while (++i < NPOPMAP) {
 1043                                 lo = ffsl(~rv->popmap[i]);
 1044                                 if (lo == 0) {
 1045                                         rv->popmap[i] = 0;
 1046                                         rv->popcnt -= NBPOPMAP;
 1047                                 } else
 1048                                         break;
 1049                         }
 1050                         if (i == NPOPMAP)
 1051                                 break;
 1052                         hi = 0;
 1053                 }
 1054                 KASSERT(lo > 0, ("vm_reserv_break: lo is %d", lo));
 1055                 /* Convert from ffsl() to ordinary bit numbering. */
 1056                 lo--;
 1057                 if (lo > 0) {
 1058                         /* Redundantly clears bits < "hi". */
 1059                         rv->popmap[i] &= ~((1UL << lo) - 1);
 1060                         rv->popcnt -= lo - hi;
 1061                 }
 1062                 begin_zeroes = NBPOPMAP * i + lo;
 1063                 /* Find the next 1 bit. */
 1064                 do
 1065                         hi = ffsl(rv->popmap[i]);
 1066                 while (hi == 0 && ++i < NPOPMAP);
 1067                 if (i != NPOPMAP)
 1068                         /* Convert from ffsl() to ordinary bit numbering. */
 1069                         hi--;
 1070                 vm_domain_free_lock(VM_DOMAIN(rv->domain));
 1071                 vm_phys_free_contig(&rv->pages[begin_zeroes], NBPOPMAP * i +
 1072                     hi - begin_zeroes);
 1073                 vm_domain_free_unlock(VM_DOMAIN(rv->domain));
 1074         } while (i < NPOPMAP);
 1075         KASSERT(rv->popcnt == 0,
 1076             ("vm_reserv_break: reserv %p's popcnt is corrupted", rv));
 1077         counter_u64_add(vm_reserv_broken, 1);
 1078 }
 1079 
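/*
 * A worked example of the popmap scan in vm_reserv_break() above, assuming
 * 64-bit popmap words.  Suppose rv->popmap[0] == 0xff00, i.e., only pages
 * 8..15 of the first word are in use, and the scan starts with i == 0 and
 * hi == 0.  Then ffsl(~popmap[0]) == 1, so lo becomes 0 and
 * begin_zeroes == 0; the following ffsl(popmap[0]) == 9, so hi becomes 8,
 * and vm_phys_free_contig(&rv->pages[0], 8) returns the free run of pages
 * 0..7 to the physical allocator in one call.  The scan then resumes at
 * bit 8, looking for the next run of clear bits.
 */
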
 1080 /*
 1081  * Breaks all reservations belonging to the given object.
 1082  */
 1083 void
 1084 vm_reserv_break_all(vm_object_t object)
 1085 {
 1086         vm_reserv_t rv;
 1087 
 1088         /*
 1089          * This access of object->rvq is unsynchronized so that the
 1090          * object rvq lock can nest after the domain_free lock.  We
 1091          * must check for races in the results.  However, the object
 1092          * lock prevents new additions, so we are guaranteed that when
 1093          * it returns NULL the object is properly empty.
 1094          */
 1095         while ((rv = LIST_FIRST(&object->rvq)) != NULL) {
 1096                 vm_reserv_lock(rv);
 1097                 /* Reclaim race. */
 1098                 if (rv->object != object) {
 1099                         vm_reserv_unlock(rv);
 1100                         continue;
 1101                 }
 1102                 vm_reserv_domain_lock(rv->domain);
 1103                 if (rv->inpartpopq) {
 1104                         TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq);
 1105                         rv->inpartpopq = FALSE;
 1106                 }
 1107                 vm_reserv_domain_unlock(rv->domain);
 1108                 vm_reserv_break(rv);
 1109                 vm_reserv_unlock(rv);
 1110         }
 1111 }
 1112 
 1113 /*
 1114  * Frees the given page if it belongs to a reservation.  Returns TRUE if the
 1115  * page is freed and FALSE otherwise.
 1116  *
 1117  * The free page queue lock must be held.
 1118  */
 1119 boolean_t
 1120 vm_reserv_free_page(vm_page_t m)
 1121 {
 1122         vm_reserv_t rv;
 1123         boolean_t ret;
 1124 
 1125         rv = vm_reserv_from_page(m);
 1126         if (rv->object == NULL)
 1127                 return (FALSE);
 1128         vm_reserv_lock(rv);
 1129         /* Re-validate after lock. */
 1130         if (rv->object != NULL) {
 1131                 vm_reserv_depopulate(rv, m - rv->pages);
 1132                 ret = TRUE;
 1133         } else
 1134                 ret = FALSE;
 1135         vm_reserv_unlock(rv);
 1136 
 1137         return (ret);
 1138 }
 1139 
 1140 /*
 1141  * Initializes the reservation management system.  Specifically, initializes
 1142  * the reservation array.
 1143  *
 1144  * Requires that vm_page_array and first_page are initialized!
 1145  */
 1146 void
 1147 vm_reserv_init(void)
 1148 {
 1149         vm_paddr_t paddr;
 1150         struct vm_phys_seg *seg;
 1151         struct vm_reserv *rv;
 1152         int i, segind;
 1153 
 1154         /*
 1155          * Initialize the reservation array.  Specifically, initialize the
 1156          * "pages" field for every element that has an underlying superpage.
 1157          */
 1158         for (segind = 0; segind < vm_phys_nsegs; segind++) {
 1159                 seg = &vm_phys_segs[segind];
 1160                 paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
 1161                 while (paddr + VM_LEVEL_0_SIZE <= seg->end) {
 1162                         rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT];
 1163                         rv->pages = PHYS_TO_VM_PAGE(paddr);
 1164                         rv->domain = seg->domain;
 1165                         mtx_init(&rv->lock, "vm reserv", NULL, MTX_DEF);
 1166                         paddr += VM_LEVEL_0_SIZE;
 1167                 }
 1168         }
 1169         for (i = 0; i < MAXMEMDOM; i++) {
 1170                 mtx_init(&vm_reserv_domain_locks[i], "VM reserv domain", NULL,
 1171                     MTX_DEF);
 1172                 TAILQ_INIT(&vm_rvq_partpop[i]);
 1173         }
 1174 
 1175         for (i = 0; i < VM_RESERV_OBJ_LOCK_COUNT; i++)
 1176                 mtx_init(&vm_reserv_object_mtx[i], "resv obj lock", NULL,
 1177                     MTX_DEF);
 1178 }
 1179 
 1180 /*
 1181  * Returns true if the given page belongs to a reservation and that page is
 1182  * free.  Otherwise, returns false.
 1183  */
 1184 bool
 1185 vm_reserv_is_page_free(vm_page_t m)
 1186 {
 1187         vm_reserv_t rv;
 1188 
 1189         rv = vm_reserv_from_page(m);
 1190         if (rv->object == NULL)
 1191                 return (false);
 1192         return (popmap_is_clear(rv->popmap, m - rv->pages));
 1193 }
 1194 
 1195 /*
 1196  * If the given page belongs to a reservation, returns the level of that
 1197  * reservation.  Otherwise, returns -1.
 1198  */
 1199 int
 1200 vm_reserv_level(vm_page_t m)
 1201 {
 1202         vm_reserv_t rv;
 1203 
 1204         rv = vm_reserv_from_page(m);
 1205         return (rv->object != NULL ? 0 : -1);
 1206 }
 1207 
 1208 /*
 1209  * Returns a reservation level if the given page belongs to a fully populated
 1210  * reservation and -1 otherwise.
 1211  */
 1212 int
 1213 vm_reserv_level_iffullpop(vm_page_t m)
 1214 {
 1215         vm_reserv_t rv;
 1216 
 1217         rv = vm_reserv_from_page(m);
 1218         return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1);
 1219 }
 1220 
 1221 /*
 1222  * Breaks the given partially populated reservation, releasing its free pages
 1223  * to the physical memory allocator.
 1224  *
 1225  * The free page queue lock must be held.
 1226  */
 1227 static void
 1228 vm_reserv_reclaim(vm_reserv_t rv)
 1229 {
 1230 
 1231         vm_reserv_assert_locked(rv);
 1232         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
 1233             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
 1234         vm_reserv_domain_lock(rv->domain);
 1235         KASSERT(rv->inpartpopq,
 1236             ("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv));
 1237         KASSERT(rv->domain < vm_ndomains,
 1238             ("vm_reserv_reclaim: reserv %p's domain is corrupted %d",
 1239             rv, rv->domain));
 1240         TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq);
 1241         rv->inpartpopq = FALSE;
 1242         vm_reserv_domain_unlock(rv->domain);
 1243         vm_reserv_break(rv);
 1244         counter_u64_add(vm_reserv_reclaimed, 1);
 1245 }
 1246 
 1247 /*
 1248  * Breaks the reservation at the head of the partially populated reservation
 1249  * queue, releasing its free pages to the physical memory allocator.  Returns
 1250  * TRUE if a reservation is broken and FALSE otherwise.
 1251  *
 1252  * The free page queue lock must be held.
 1253  */
 1254 boolean_t
 1255 vm_reserv_reclaim_inactive(int domain)
 1256 {
 1257         vm_reserv_t rv;
 1258 
 1259         while ((rv = TAILQ_FIRST(&vm_rvq_partpop[domain])) != NULL) {
 1260                 vm_reserv_lock(rv);
 1261                 if (rv != TAILQ_FIRST(&vm_rvq_partpop[domain])) {
 1262                         vm_reserv_unlock(rv);
 1263                         continue;
 1264                 }
 1265                 vm_reserv_reclaim(rv);
 1266                 vm_reserv_unlock(rv);
 1267                 return (TRUE);
 1268         }
 1269         return (FALSE);
 1270 }
 1271 
 1272 /*
 1273  * Searches the partially populated reservation queue for the least recently
 1274  * changed reservation with free pages that satisfy the given request for
 1275  * contiguous physical memory.  If a satisfactory reservation is found, it is
 1276  * broken.  Returns TRUE if a reservation is broken and FALSE otherwise.
 1277  *
 1278  * The free page queue lock must be held.
 1279  */
 1280 boolean_t
 1281 vm_reserv_reclaim_contig(int domain, u_long npages, vm_paddr_t low,
 1282     vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
 1283 {
 1284         vm_paddr_t pa, size;
 1285         vm_reserv_t rv, rvn;
 1286         int hi, i, lo, low_index, next_free;
 1287 
 1288         if (npages > VM_LEVEL_0_NPAGES - 1)
 1289                 return (FALSE);
 1290         size = npages << PAGE_SHIFT;
 1291         vm_reserv_domain_lock(domain);
 1292 again:
 1293         for (rv = TAILQ_FIRST(&vm_rvq_partpop[domain]); rv != NULL; rv = rvn) {
 1294                 rvn = TAILQ_NEXT(rv, partpopq);
 1295                 pa = VM_PAGE_TO_PHYS(&rv->pages[VM_LEVEL_0_NPAGES - 1]);
 1296                 if (pa + PAGE_SIZE - size < low) {
 1297                         /* This entire reservation is too low; go to next. */
 1298                         continue;
 1299                 }
 1300                 pa = VM_PAGE_TO_PHYS(&rv->pages[0]);
 1301                 if (pa + size > high) {
 1302                         /* This entire reservation is too high; go to next. */
 1303                         continue;
 1304                 }
 1305                 if (vm_reserv_trylock(rv) == 0) {
 1306                         vm_reserv_domain_unlock(domain);
 1307                         vm_reserv_lock(rv);
 1308                         if (!rv->inpartpopq) {
 1309                                 vm_reserv_domain_lock(domain);
 1310                                 if (rvn == NULL || !rvn->inpartpopq)
 1311                                         goto again;
 1312                                 continue;
 1313                         }
 1314                 } else
 1315                         vm_reserv_domain_unlock(domain);
 1316                 if (pa < low) {
 1317                         /* Start the search for free pages at "low". */
 1318                         low_index = (low + PAGE_MASK - pa) >> PAGE_SHIFT;
 1319                         i = low_index / NBPOPMAP;
 1320                         hi = low_index % NBPOPMAP;
 1321                 } else
 1322                         i = hi = 0;
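                      /*
                       * Each popmap word covers NBPOPMAP pages; a set bit
                       * means the page is populated.  Below, the expression
                       * ~(((1UL << hi) - 1) | rv->popmap[i]) clears the bits
                       * below position "hi" as well as the populated bits,
                       * so ffsl() finds the first free page at or after bit
                       * "hi" in word "i".
                       */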
 1323                 do {
 1324                         /* Find the next free page. */
 1325                         lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i]));
 1326                         while (lo == 0 && ++i < NPOPMAP)
 1327                                 lo = ffsl(~rv->popmap[i]);
 1328                         if (i == NPOPMAP)
 1329                                 break;
 1330                         /* Convert from ffsl() to ordinary bit numbering. */
 1331                         lo--;
 1332                         next_free = NBPOPMAP * i + lo;
 1333                         pa = VM_PAGE_TO_PHYS(&rv->pages[next_free]);
 1334                         KASSERT(pa >= low,
 1335                             ("vm_reserv_reclaim_contig: pa is too low"));
 1336                         if (pa + size > high) {
 1337                                 /* The rest of this reservation is too high. */
 1338                                 break;
 1339                         } else if ((pa & (alignment - 1)) != 0 ||
 1340                             ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) {
 1341                                 /*
 1342                                  * The current page doesn't meet the alignment
 1343                                  * and/or boundary requirements.  Continue
 1344                                  * searching this reservation until the rest
 1345                                  * of its free pages are either excluded or
 1346                                  * exhausted.
 1347                                  */
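                                      /*
                                       * Hypothetical values: with alignment
                                       * 0x10000 and boundary 0x100000,
                                       * pa == 0x6f5000 fails the alignment
                                       * test (0x6f5000 & 0xffff != 0), and
                                       * pa == 0x6f0000 with size 0x20000
                                       * fails the boundary test, since
                                       * (0x6f0000 ^ 0x70ffff) & ~0xfffff ==
                                       * 0x100000, i.e. the run would cross
                                       * a 1 MiB boundary.
                                       */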
 1348                                 hi = lo + 1;
 1349                                 if (hi >= NBPOPMAP) {
 1350                                         hi = 0;
 1351                                         i++;
 1352                                 }
 1353                                 continue;
 1354                         }
 1355                         /* Find the next used page. */
 1356                         hi = ffsl(rv->popmap[i] & ~((1UL << lo) - 1));
 1357                         while (hi == 0 && ++i < NPOPMAP) {
 1358                                 if ((NBPOPMAP * i - next_free) * PAGE_SIZE >=
 1359                                     size) {
 1360                                         vm_reserv_reclaim(rv);
 1361                                         vm_reserv_unlock(rv);
 1362                                         return (TRUE);
 1363                                 }
 1364                                 hi = ffsl(rv->popmap[i]);
 1365                         }
 1366                         /* Convert from ffsl() to ordinary bit numbering. */
 1367                         if (i != NPOPMAP)
 1368                                 hi--;
 1369                         if ((NBPOPMAP * i + hi - next_free) * PAGE_SIZE >=
 1370                             size) {
 1371                                 vm_reserv_reclaim(rv);
 1372                                 vm_reserv_unlock(rv);
 1373                                 return (TRUE);
 1374                         }
 1375                 } while (i < NPOPMAP);
 1376                 vm_reserv_unlock(rv);
 1377                 vm_reserv_domain_lock(domain);
 1378                 if (rvn != NULL && !rvn->inpartpopq)
 1379                         goto again;
 1380         }
 1381         vm_reserv_domain_unlock(domain);
 1382         return (FALSE);
 1383 }
 1384 
 1385 /*
 1386  * Transfers the reservation underlying the given page to a new object.
 1387  *
 1388  * The object must be locked.
 1389  */
 1390 void
 1391 vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object,
 1392     vm_pindex_t old_object_offset)
 1393 {
 1394         vm_reserv_t rv;
 1395 
 1396         VM_OBJECT_ASSERT_WLOCKED(new_object);
 1397         rv = vm_reserv_from_page(m);
 1398         if (rv->object == old_object) {
 1399                 vm_reserv_lock(rv);
 1400                 CTR6(KTR_VM,
 1401                     "%s: rv %p object %p new %p popcnt %d inpartpop %d",
 1402                     __FUNCTION__, rv, rv->object, new_object, rv->popcnt,
 1403                     rv->inpartpopq);
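                      /*
                       * Now that the reservation lock is held, re-check the
                       * owning object; the unlocked check above only avoids
                       * taking the lock when the reservation has already
                       * been freed or moved to another object.
                       */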
 1404                 if (rv->object == old_object) {
 1405                         vm_reserv_object_lock(old_object);
 1406                         rv->object = NULL;
 1407                         LIST_REMOVE(rv, objq);
 1408                         vm_reserv_object_unlock(old_object);
 1409                         vm_reserv_object_lock(new_object);
 1410                         rv->object = new_object;
 1411                         rv->pindex -= old_object_offset;
 1412                         LIST_INSERT_HEAD(&new_object->rvq, rv, objq);
 1413                         vm_reserv_object_unlock(new_object);
 1414                 }
 1415                 vm_reserv_unlock(rv);
 1416         }
 1417 }
 1418 
 1419 /*
 1420  * Returns the size (in bytes) of a reservation of the specified level.
 1421  */
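      /*
       * For example, with VM_LEVEL_0_ORDER == 9 and 4 KB base pages (the
       * amd64 configuration), level 0 is 512 pages, so vm_reserv_size(0)
       * returns 2 MB, vm_reserv_size(-1) returns 4 KB, and any other level
       * returns 0.
       */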
 1422 int
 1423 vm_reserv_size(int level)
 1424 {
 1425 
 1426         switch (level) {
 1427         case 0:
 1428                 return (VM_LEVEL_0_SIZE);
 1429         case -1:
 1430                 return (PAGE_SIZE);
 1431         default:
 1432                 return (0);
 1433         }
 1434 }
 1435 
 1436 /*
 1437  * Allocates the virtual and physical memory required by the reservation
 1438  * management system's data structures, in particular, the reservation array.
 1439  */
 1440 vm_paddr_t
 1441 vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t high_water)
 1442 {
 1443         vm_paddr_t new_end;
 1444         size_t size;
 1445 
 1446         /*
 1447          * Calculate the size (in bytes) of the reservation array.  Round up
 1448          * from "high_water" because every small page is mapped to an element
 1449          * in the reservation array based on its physical address.  Thus, the
 1450          * number of elements in the reservation array can be greater than the
 1451          * number of superpages. 
 1452          */
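              /*
               * For example (illustrative numbers only), a "high_water" of
               * 16 GB with 2 MB reservations gives howmany() == 8192 array
               * elements; the byte total is rounded up to a whole page
               * below before the array is mapped.
               */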
 1453         size = howmany(high_water, VM_LEVEL_0_SIZE) * sizeof(struct vm_reserv);
 1454 
 1455         /*
 1456          * Allocate and map the physical memory for the reservation array.  The
 1457          * next available virtual address is returned by reference.
 1458          */
 1459         new_end = end - round_page(size);
 1460         vm_reserv_array = (void *)(uintptr_t)pmap_map(vaddr, new_end, end,
 1461             VM_PROT_READ | VM_PROT_WRITE);
 1462         bzero(vm_reserv_array, size);
 1463 
 1464         /*
 1465          * Return the next available physical address.
 1466          */
 1467         return (new_end);
 1468 }
 1469 
 1470 /*
 1471  * Initializes the reservation management system.  Specifically, initializes
 1472  * the reservation counters.
 1473  */
 1474 static void
 1475 vm_reserv_counter_init(void *unused)
 1476 {
 1477 
 1478         vm_reserv_freed = counter_u64_alloc(M_WAITOK); 
 1479         vm_reserv_broken = counter_u64_alloc(M_WAITOK); 
 1480         vm_reserv_reclaimed = counter_u64_alloc(M_WAITOK); 
 1481 }
 1482 SYSINIT(vm_reserv_counter_init, SI_SUB_CPU, SI_ORDER_ANY,
 1483     vm_reserv_counter_init, NULL);
 1484 
 1485 /*
 1486  * Returns the superpage containing the given page.
 1487  */
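      /*
       * Specifically, the reservation's first page is returned only when
       * the reservation belongs to the page's object and all
       * VM_LEVEL_0_NPAGES of its base pages are populated; otherwise the
       * result is NULL.
       */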
 1488 vm_page_t
 1489 vm_reserv_to_superpage(vm_page_t m)
 1490 {
 1491         vm_reserv_t rv;
 1492 
 1493         VM_OBJECT_ASSERT_LOCKED(m->object);
 1494         rv = vm_reserv_from_page(m);
 1495         if (rv->object == m->object && rv->popcnt == VM_LEVEL_0_NPAGES)
 1496                 m = rv->pages;
 1497         else
 1498                 m = NULL;
 1499 
 1500         return (m);
 1501 }
 1502 
 1503 #endif  /* VM_NRESERVLEVEL > 0 */
