FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_reserv.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2002-2006 Rice University
    5  * Copyright (c) 2007-2011 Alan L. Cox <alc@cs.rice.edu>
    6  * All rights reserved.
    7  *
    8  * This software was developed for the FreeBSD Project by Alan L. Cox,
    9  * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
   10  *
   11  * Redistribution and use in source and binary forms, with or without
   12  * modification, are permitted provided that the following conditions
   13  * are met:
   14  * 1. Redistributions of source code must retain the above copyright
   15  *    notice, this list of conditions and the following disclaimer.
   16  * 2. Redistributions in binary form must reproduce the above copyright
   17  *    notice, this list of conditions and the following disclaimer in the
   18  *    documentation and/or other materials provided with the distribution.
   19  *
   20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   23  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
   24  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
   27  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
   28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
   30  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   31  * POSSIBILITY OF SUCH DAMAGE.
   32  */
   33 
   34 /*
   35  *      Superpage reservation management module
   36  *
   37  * Any external functions defined by this module are only to be used by the
   38  * virtual memory system.
   39  */
   40 
   41 #include <sys/cdefs.h>
   42 __FBSDID("$FreeBSD$");
   43 
   44 #include "opt_vm.h"
   45 
   46 #include <sys/param.h>
   47 #include <sys/kernel.h>
   48 #include <sys/lock.h>
   49 #include <sys/malloc.h>
   50 #include <sys/mutex.h>
   51 #include <sys/queue.h>
   52 #include <sys/rwlock.h>
   53 #include <sys/sbuf.h>
   54 #include <sys/sysctl.h>
   55 #include <sys/systm.h>
   56 #include <sys/counter.h>
   57 #include <sys/ktr.h>
   58 #include <sys/vmmeter.h>
   59 #include <sys/smp.h>
   60 
   61 #include <vm/vm.h>
   62 #include <vm/vm_param.h>
   63 #include <vm/vm_object.h>
   64 #include <vm/vm_page.h>
   65 #include <vm/vm_pageout.h>
   66 #include <vm/vm_phys.h>
   67 #include <vm/vm_pagequeue.h>
   68 #include <vm/vm_radix.h>
   69 #include <vm/vm_reserv.h>
   70 
   71 /*
   72  * The reservation system supports the speculative allocation of large physical
   73  * pages ("superpages").  Speculative allocation enables the fully automatic
   74  * utilization of superpages by the virtual memory system.  In other words, no
   75  * programmatic directives are required to use superpages.
   76  */
   77 
   78 #if VM_NRESERVLEVEL > 0
   79 
   80 #ifndef VM_LEVEL_0_ORDER_MAX
   81 #define VM_LEVEL_0_ORDER_MAX    VM_LEVEL_0_ORDER
   82 #endif
   83 
   84 /*
   85  * The number of small pages that are contained in a level 0 reservation
   86  */
   87 #define VM_LEVEL_0_NPAGES       (1 << VM_LEVEL_0_ORDER)
   88 #define VM_LEVEL_0_NPAGES_MAX   (1 << VM_LEVEL_0_ORDER_MAX)
   89 
   90 /*
   91  * The number of bits by which a physical address is shifted to obtain the
   92  * reservation number
   93  */
   94 #define VM_LEVEL_0_SHIFT        (VM_LEVEL_0_ORDER + PAGE_SHIFT)
   95 
   96 /*
   97  * The size of a level 0 reservation in bytes
   98  */
   99 #define VM_LEVEL_0_SIZE         (1 << VM_LEVEL_0_SHIFT)
  100 
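/*
 * As a concrete example, assuming an amd64 configuration where
 * VM_LEVEL_0_ORDER is 9 and PAGE_SHIFT is 12: VM_LEVEL_0_NPAGES is 512,
 * VM_LEVEL_0_SHIFT is 21, and VM_LEVEL_0_SIZE is 2 MB, matching the
 * hardware superpage size.
 */
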
  101 /*
  102  * Computes the index of the small page underlying the given (object, pindex)
  103  * within the reservation's array of small pages.
  104  */
  105 #define VM_RESERV_INDEX(object, pindex) \
  106     (((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1))
  107 
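/*
 * For instance, assuming VM_LEVEL_0_NPAGES is 512, an object with pg_color 3
 * holds the page at pindex 1000 in slot (3 + 1000) & 511 == 491 of its
 * reservation.
 */
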
  108 /*
  109  * The size of a population map entry
  110  */
  111 typedef u_long          popmap_t;
  112 
  113 /*
  114  * The number of bits in a population map entry
  115  */
  116 #define NBPOPMAP        (NBBY * sizeof(popmap_t))
  117 
  118 /*
  119  * The number of population map entries in a reservation
  120  */
  121 #define NPOPMAP         howmany(VM_LEVEL_0_NPAGES, NBPOPMAP)
  122 #define NPOPMAP_MAX     howmany(VM_LEVEL_0_NPAGES_MAX, NBPOPMAP)
  123 
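/*
 * For example, with 64-bit popmap_t entries and VM_LEVEL_0_NPAGES == 512
 * (assumed values), NBPOPMAP is 64 and NPOPMAP is howmany(512, 64) == 8.
 */
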
  124 /*
  125  * Number of elapsed ticks before we update the LRU queue position.  Used
  126  * to reduce contention and churn on the list.
  127  */
  128 #define PARTPOPSLOP     1
  129 
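/*
 * For example, with PARTPOPSLOP == 1, vm_reserv_populate() and
 * vm_reserv_depopulate() defer requeueing an already-queued reservation
 * until at least one tick has elapsed, unless the reservation becomes
 * empty or full.
 */
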
  130 /*
  131  * Clear a bit in the population map.
  132  */
  133 static __inline void
  134 popmap_clear(popmap_t popmap[], int i)
  135 {
  136 
  137         popmap[i / NBPOPMAP] &= ~(1UL << (i % NBPOPMAP));
  138 }
  139 
  140 /*
  141  * Set a bit in the population map.
  142  */
  143 static __inline void
  144 popmap_set(popmap_t popmap[], int i)
  145 {
  146 
  147         popmap[i / NBPOPMAP] |= 1UL << (i % NBPOPMAP);
  148 }
  149 
  150 /*
  151  * Is a bit in the population map clear?
  152  */
  153 static __inline boolean_t
  154 popmap_is_clear(popmap_t popmap[], int i)
  155 {
  156 
  157         return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) == 0);
  158 }
  159 
  160 /*
  161  * Is a bit in the population map set?
  162  */
  163 static __inline boolean_t
  164 popmap_is_set(popmap_t popmap[], int i)
  165 {
  166 
  167         return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) != 0);
  168 }
  169 
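/*
 * In all four helpers above, bit index i selects word i / NBPOPMAP and bit
 * i % NBPOPMAP within it; for example, with NBPOPMAP == 64,
 * popmap_set(popmap, 130) sets bit 2 of popmap[2].
 */
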
  170 /*
  171  * The reservation structure
  172  *
  173  * A reservation structure is constructed whenever a large physical page is
  174  * speculatively allocated to an object.  The reservation provides the small
  175  * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets
  176  * within that object.  The reservation's "popcnt" tracks the number of these
  177  * small physical pages that are in use at any given time.  When and if the
  178  * reservation is not fully utilized, it appears in the queue of partially
  179  * populated reservations.  The reservation always appears on the containing
  180  * object's list of reservations.
  181  *
  182  * A partially populated reservation can be broken and reclaimed at any time.
  183  *
  184  * c - constant after boot
  185  * d - vm_reserv_domain_lock
  186  * o - vm_reserv_object_lock
  187  * r - vm_reserv_lock
  188  * s - vm_reserv_domain_scan_lock
  189  */
  190 struct vm_reserv {
  191         struct mtx      lock;                   /* reservation lock. */
  192         TAILQ_ENTRY(vm_reserv) partpopq;        /* (d, r) per-domain queue. */
  193         LIST_ENTRY(vm_reserv) objq;             /* (o, r) object queue */
  194         vm_object_t     object;                 /* (o, r) containing object */
  195         vm_pindex_t     pindex;                 /* (o, r) offset in object */
  196         vm_page_t       pages;                  /* (c) first page  */
  197         uint16_t        popcnt;                 /* (r) # of pages in use */
  198         uint8_t         domain;                 /* (c) NUMA domain. */
  199         char            inpartpopq;             /* (d, r) */
  200         int             lasttick;               /* (r) last pop update tick. */
  201         popmap_t        popmap[NPOPMAP_MAX];    /* (r) bit vector, used pages */
  202 };
  203 
  204 TAILQ_HEAD(vm_reserv_queue, vm_reserv);
  205 
  206 #define vm_reserv_lockptr(rv)           (&(rv)->lock)
  207 #define vm_reserv_assert_locked(rv)                                     \
  208             mtx_assert(vm_reserv_lockptr(rv), MA_OWNED)
  209 #define vm_reserv_lock(rv)              mtx_lock(vm_reserv_lockptr(rv))
  210 #define vm_reserv_trylock(rv)           mtx_trylock(vm_reserv_lockptr(rv))
  211 #define vm_reserv_unlock(rv)            mtx_unlock(vm_reserv_lockptr(rv))
  212 
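/*
 * A common pattern in the functions below: look up a reservation from a page
 * or object, take vm_reserv_lock(), and then re-check rv->object before
 * operating, since a racing reclaim or free may have dissociated the
 * reservation while it was unlocked.
 */
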
  213 /*
  214  * The reservation array
  215  *
   216  * This array is analogous in function to vm_page_array.  It differs in the
   217  * respect that it may contain a greater number of reservation
  218  * structures than there are (physical) superpages.  These "invalid"
  219  * reservation structures exist to trade-off space for time in the
  220  * implementation of vm_reserv_from_page().  Invalid reservation structures are
  221  * distinguishable from "valid" reservation structures by inspecting the
  222  * reservation's "pages" field.  Invalid reservation structures have a NULL
  223  * "pages" field.
  224  *
  225  * vm_reserv_from_page() maps a small (physical) page to an element of this
  226  * array by computing a physical reservation number from the page's physical
  227  * address.  The physical reservation number is used as the array index.
  228  *
  229  * An "active" reservation is a valid reservation structure that has a non-NULL
  230  * "object" field and a non-zero "popcnt" field.  In other words, every active
  231  * reservation belongs to a particular object.  Moreover, every active
  232  * reservation has an entry in the containing object's list of reservations.  
  233  */
  234 static vm_reserv_t vm_reserv_array;
  235 
  236 /*
  237  * The per-domain partially populated reservation queues
  238  *
  239  * These queues enable the fast recovery of an unused free small page from a
  240  * partially populated reservation.  The reservation at the head of a queue
  241  * is the least recently changed, partially populated reservation.
  242  *
  243  * Access to this queue is synchronized by the per-domain reservation lock.
  244  * Threads reclaiming free pages from the queue must hold the per-domain scan
  245  * lock.
  246  */
  247 struct vm_reserv_domain {
  248         struct mtx              lock;
  249         struct vm_reserv_queue  partpop;        /* (d) */
  250         struct vm_reserv        marker;         /* (d, s) scan marker/lock */
  251 } __aligned(CACHE_LINE_SIZE);
  252 
  253 static struct vm_reserv_domain vm_rvd[MAXMEMDOM];
  254 
  255 #define vm_reserv_domain_lockptr(d)     (&vm_rvd[(d)].lock)
  256 #define vm_reserv_domain_assert_locked(d)       \
  257         mtx_assert(vm_reserv_domain_lockptr(d), MA_OWNED)
  258 #define vm_reserv_domain_lock(d)        mtx_lock(vm_reserv_domain_lockptr(d))
  259 #define vm_reserv_domain_unlock(d)      mtx_unlock(vm_reserv_domain_lockptr(d))
  260 
  261 #define vm_reserv_domain_scan_lock(d)   mtx_lock(&vm_rvd[(d)].marker.lock)
  262 #define vm_reserv_domain_scan_unlock(d) mtx_unlock(&vm_rvd[(d)].marker.lock)
  263 
  264 static SYSCTL_NODE(_vm, OID_AUTO, reserv, CTLFLAG_RD, 0, "Reservation Info");
  265 
  266 static counter_u64_t vm_reserv_broken = EARLY_COUNTER;
  267 SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, broken, CTLFLAG_RD,
  268     &vm_reserv_broken, "Cumulative number of broken reservations");
  269 
  270 static counter_u64_t vm_reserv_freed = EARLY_COUNTER;
  271 SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD,
  272     &vm_reserv_freed, "Cumulative number of freed reservations");
  273 
  274 static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS);
  275 
  276 SYSCTL_PROC(_vm_reserv, OID_AUTO, fullpop, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
  277     sysctl_vm_reserv_fullpop, "I", "Current number of full reservations");
  278 
  279 static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS);
  280 
  281 SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq,
  282     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
  283     sysctl_vm_reserv_partpopq, "A",
  284     "Partially populated reservation queues");
  285 
  286 static counter_u64_t vm_reserv_reclaimed = EARLY_COUNTER;
  287 SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD,
  288     &vm_reserv_reclaimed, "Cumulative number of reclaimed reservations");
  289 
  290 /*
   291  * The object lock pool is used to synchronize the rvq.  We cannot use a
   292  * pool mutex because this lock is required before malloc works.
  293  *
  294  * The "hash" function could be made faster without divide and modulo.
  295  */
  296 #define VM_RESERV_OBJ_LOCK_COUNT        MAXCPU
  297 
  298 struct mtx_padalign vm_reserv_object_mtx[VM_RESERV_OBJ_LOCK_COUNT];
  299 
  300 #define vm_reserv_object_lock_idx(object)                       \
  301             (((uintptr_t)object / sizeof(*object)) % VM_RESERV_OBJ_LOCK_COUNT)
  302 #define vm_reserv_object_lock_ptr(object)                       \
  303             &vm_reserv_object_mtx[vm_reserv_object_lock_idx((object))]
  304 #define vm_reserv_object_lock(object)                           \
  305             mtx_lock(vm_reserv_object_lock_ptr((object)))
  306 #define vm_reserv_object_unlock(object)                         \
  307             mtx_unlock(vm_reserv_object_lock_ptr((object)))
  308 
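/*
 * For example, because the index is the object's address divided by the
 * object's size, modulo VM_RESERV_OBJ_LOCK_COUNT, consecutively allocated
 * vm_object structures tend to map to different pool mutexes, spreading
 * contention across the array.
 */
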
  309 static void             vm_reserv_break(vm_reserv_t rv);
  310 static void             vm_reserv_depopulate(vm_reserv_t rv, int index);
  311 static vm_reserv_t      vm_reserv_from_page(vm_page_t m);
  312 static boolean_t        vm_reserv_has_pindex(vm_reserv_t rv,
  313                             vm_pindex_t pindex);
  314 static void             vm_reserv_populate(vm_reserv_t rv, int index);
  315 static void             vm_reserv_reclaim(vm_reserv_t rv);
  316 
  317 /*
  318  * Returns the current number of full reservations.
  319  *
  320  * Since the number of full reservations is computed without acquiring any
  321  * locks, the returned value is inexact.
  322  */
  323 static int
  324 sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS)
  325 {
  326         vm_paddr_t paddr;
  327         struct vm_phys_seg *seg;
  328         vm_reserv_t rv;
  329         int fullpop, segind;
  330 
  331         fullpop = 0;
  332         for (segind = 0; segind < vm_phys_nsegs; segind++) {
  333                 seg = &vm_phys_segs[segind];
  334                 paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
  335                 while (paddr + VM_LEVEL_0_SIZE > paddr && paddr +
  336                     VM_LEVEL_0_SIZE <= seg->end) {
  337                         rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT];
  338                         fullpop += rv->popcnt == VM_LEVEL_0_NPAGES;
  339                         paddr += VM_LEVEL_0_SIZE;
  340                 }
  341         }
  342         return (sysctl_handle_int(oidp, &fullpop, 0, req));
  343 }
  344 
  345 /*
  346  * Describes the current state of the partially populated reservation queue.
  347  */
  348 static int
  349 sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS)
  350 {
  351         struct sbuf sbuf;
  352         vm_reserv_t rv;
  353         int counter, error, domain, level, unused_pages;
  354 
  355         error = sysctl_wire_old_buffer(req, 0);
  356         if (error != 0)
  357                 return (error);
  358         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
  359         sbuf_printf(&sbuf, "\nDOMAIN    LEVEL     SIZE  NUMBER\n\n");
  360         for (domain = 0; domain < vm_ndomains; domain++) {
  361                 for (level = -1; level <= VM_NRESERVLEVEL - 2; level++) {
  362                         counter = 0;
  363                         unused_pages = 0;
  364                         vm_reserv_domain_lock(domain);
  365                         TAILQ_FOREACH(rv, &vm_rvd[domain].partpop, partpopq) {
  366                                 if (rv == &vm_rvd[domain].marker)
  367                                         continue;
  368                                 counter++;
  369                                 unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt;
  370                         }
  371                         vm_reserv_domain_unlock(domain);
  372                         sbuf_printf(&sbuf, "%6d, %7d, %6dK, %6d\n",
  373                             domain, level,
  374                             unused_pages * ((int)PAGE_SIZE / 1024), counter);
  375                 }
  376         }
  377         error = sbuf_finish(&sbuf);
  378         sbuf_delete(&sbuf);
  379         return (error);
  380 }
  381 
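/*
 * In the output of sysctl_vm_reserv_partpopq() above, NUMBER counts the
 * queued reservations and SIZE reports their unused small pages in
 * kilobytes; for example, with 4 KB pages, two queued reservations holding
 * 256 free pages in total are reported as "  1024K" under SIZE and
 * "     2" under NUMBER.
 */
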
  382 /*
  383  * Remove a reservation from the object's objq.
  384  */
  385 static void
  386 vm_reserv_remove(vm_reserv_t rv)
  387 {
  388         vm_object_t object;
  389 
  390         vm_reserv_assert_locked(rv);
  391         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
  392             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
  393         KASSERT(rv->object != NULL,
  394             ("vm_reserv_remove: reserv %p is free", rv));
  395         KASSERT(!rv->inpartpopq,
  396             ("vm_reserv_remove: reserv %p's inpartpopq is TRUE", rv));
  397         object = rv->object;
  398         vm_reserv_object_lock(object);
  399         LIST_REMOVE(rv, objq);
  400         rv->object = NULL;
  401         vm_reserv_object_unlock(object);
  402 }
  403 
  404 /*
  405  * Insert a new reservation into the object's objq.
  406  */
  407 static void
  408 vm_reserv_insert(vm_reserv_t rv, vm_object_t object, vm_pindex_t pindex)
  409 {
  410         int i;
  411 
  412         vm_reserv_assert_locked(rv);
  413         CTR6(KTR_VM,
  414             "%s: rv %p(%p) object %p new %p popcnt %d",
  415             __FUNCTION__, rv, rv->pages, rv->object, object,
  416            rv->popcnt);
  417         KASSERT(rv->object == NULL,
  418             ("vm_reserv_insert: reserv %p isn't free", rv));
  419         KASSERT(rv->popcnt == 0,
  420             ("vm_reserv_insert: reserv %p's popcnt is corrupted", rv));
  421         KASSERT(!rv->inpartpopq,
  422             ("vm_reserv_insert: reserv %p's inpartpopq is TRUE", rv));
  423         for (i = 0; i < NPOPMAP; i++)
  424                 KASSERT(rv->popmap[i] == 0,
  425                     ("vm_reserv_insert: reserv %p's popmap is corrupted", rv));
  426         vm_reserv_object_lock(object);
  427         rv->pindex = pindex;
  428         rv->object = object;
  429         rv->lasttick = ticks;
  430         LIST_INSERT_HEAD(&object->rvq, rv, objq);
  431         vm_reserv_object_unlock(object);
  432 }
  433 
  434 /*
  435  * Reduces the given reservation's population count.  If the population count
  436  * becomes zero, the reservation is destroyed.  Additionally, moves the
  437  * reservation to the tail of the partially populated reservation queue if the
  438  * population count is non-zero.
  439  */
  440 static void
  441 vm_reserv_depopulate(vm_reserv_t rv, int index)
  442 {
  443         struct vm_domain *vmd;
  444 
  445         vm_reserv_assert_locked(rv);
  446         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
  447             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
  448         KASSERT(rv->object != NULL,
  449             ("vm_reserv_depopulate: reserv %p is free", rv));
  450         KASSERT(popmap_is_set(rv->popmap, index),
  451             ("vm_reserv_depopulate: reserv %p's popmap[%d] is clear", rv,
  452             index));
  453         KASSERT(rv->popcnt > 0,
  454             ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv));
  455         KASSERT(rv->domain < vm_ndomains,
  456             ("vm_reserv_depopulate: reserv %p's domain is corrupted %d",
  457             rv, rv->domain));
  458         if (rv->popcnt == VM_LEVEL_0_NPAGES) {
  459                 KASSERT(rv->pages->psind == 1,
  460                     ("vm_reserv_depopulate: reserv %p is already demoted",
  461                     rv));
  462                 rv->pages->psind = 0;
  463         }
  464         popmap_clear(rv->popmap, index);
  465         rv->popcnt--;
  466         if ((unsigned)(ticks - rv->lasttick) >= PARTPOPSLOP ||
  467             rv->popcnt == 0) {
  468                 vm_reserv_domain_lock(rv->domain);
  469                 if (rv->inpartpopq) {
  470                         TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
  471                         rv->inpartpopq = FALSE;
  472                 }
  473                 if (rv->popcnt != 0) {
  474                         rv->inpartpopq = TRUE;
  475                         TAILQ_INSERT_TAIL(&vm_rvd[rv->domain].partpop, rv,
  476                             partpopq);
  477                 }
  478                 vm_reserv_domain_unlock(rv->domain);
  479                 rv->lasttick = ticks;
  480         }
  481         vmd = VM_DOMAIN(rv->domain);
  482         if (rv->popcnt == 0) {
  483                 vm_reserv_remove(rv);
  484                 vm_domain_free_lock(vmd);
  485                 vm_phys_free_pages(rv->pages, VM_LEVEL_0_ORDER);
  486                 vm_domain_free_unlock(vmd);
  487                 counter_u64_add(vm_reserv_freed, 1);
  488         }
  489         vm_domain_freecnt_inc(vmd, 1);
  490 }
  491 
  492 /*
  493  * Returns the reservation to which the given page might belong.
  494  */
  495 static __inline vm_reserv_t
  496 vm_reserv_from_page(vm_page_t m)
  497 {
  498 
  499         return (&vm_reserv_array[VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT]);
  500 }
  501 
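/*
 * For example, assuming VM_LEVEL_0_SHIFT is 21, a page at physical address
 * 0x40200000 maps to vm_reserv_array[0x201], the entry covering physical
 * addresses [0x40200000, 0x40400000).
 */
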
  502 /*
   503  * Returns an existing reservation or NULL, and initializes the successor pointer.
  504  */
  505 static vm_reserv_t
  506 vm_reserv_from_object(vm_object_t object, vm_pindex_t pindex,
  507     vm_page_t mpred, vm_page_t *msuccp)
  508 {
  509         vm_reserv_t rv;
  510         vm_page_t msucc;
  511 
  512         msucc = NULL;
  513         if (mpred != NULL) {
  514                 KASSERT(mpred->object == object,
  515                     ("vm_reserv_from_object: object doesn't contain mpred"));
  516                 KASSERT(mpred->pindex < pindex,
  517                     ("vm_reserv_from_object: mpred doesn't precede pindex"));
  518                 rv = vm_reserv_from_page(mpred);
  519                 if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
  520                         goto found;
  521                 msucc = TAILQ_NEXT(mpred, listq);
  522         } else
  523                 msucc = TAILQ_FIRST(&object->memq);
  524         if (msucc != NULL) {
  525                 KASSERT(msucc->pindex > pindex,
  526                     ("vm_reserv_from_object: msucc doesn't succeed pindex"));
  527                 rv = vm_reserv_from_page(msucc);
  528                 if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
  529                         goto found;
  530         }
  531         rv = NULL;
  532 
  533 found:
  534         *msuccp = msucc;
  535 
  536         return (rv);
  537 }
  538 
  539 /*
  540  * Returns TRUE if the given reservation contains the given page index and
  541  * FALSE otherwise.
  542  */
  543 static __inline boolean_t
  544 vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex)
  545 {
  546 
  547         return (((pindex - rv->pindex) & ~(VM_LEVEL_0_NPAGES - 1)) == 0);
  548 }
  549 
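/*
 * For example, assuming VM_LEVEL_0_NPAGES is 512 and rv->pindex is 1024,
 * the check above accepts pindex 1300 ((1300 - 1024) & ~511 == 0) and
 * rejects pindex 1600 ((1600 - 1024) & ~511 == 512).
 */
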
  550 /*
  551  * Increases the given reservation's population count.  Moves the reservation
  552  * to the tail of the partially populated reservation queue.
  553  */
  554 static void
  555 vm_reserv_populate(vm_reserv_t rv, int index)
  556 {
  557 
  558         vm_reserv_assert_locked(rv);
  559         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
  560             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
  561         KASSERT(rv->object != NULL,
  562             ("vm_reserv_populate: reserv %p is free", rv));
  563         KASSERT(popmap_is_clear(rv->popmap, index),
  564             ("vm_reserv_populate: reserv %p's popmap[%d] is set", rv,
  565             index));
  566         KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES,
  567             ("vm_reserv_populate: reserv %p is already full", rv));
  568         KASSERT(rv->pages->psind == 0,
  569             ("vm_reserv_populate: reserv %p is already promoted", rv));
  570         KASSERT(rv->domain < vm_ndomains,
  571             ("vm_reserv_populate: reserv %p's domain is corrupted %d",
  572             rv, rv->domain));
  573         popmap_set(rv->popmap, index);
  574         rv->popcnt++;
  575         if ((unsigned)(ticks - rv->lasttick) < PARTPOPSLOP &&
  576             rv->inpartpopq && rv->popcnt != VM_LEVEL_0_NPAGES)
  577                 return;
  578         rv->lasttick = ticks;
  579         vm_reserv_domain_lock(rv->domain);
  580         if (rv->inpartpopq) {
  581                 TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
  582                 rv->inpartpopq = FALSE;
  583         }
  584         if (rv->popcnt < VM_LEVEL_0_NPAGES) {
  585                 rv->inpartpopq = TRUE;
  586                 TAILQ_INSERT_TAIL(&vm_rvd[rv->domain].partpop, rv, partpopq);
  587         } else {
  588                 KASSERT(rv->pages->psind == 0,
  589                     ("vm_reserv_populate: reserv %p is already promoted",
  590                     rv));
  591                 rv->pages->psind = 1;
  592         }
  593         vm_reserv_domain_unlock(rv->domain);
  594 }
  595 
  596 /*
  597  * Attempts to allocate a contiguous set of physical pages from existing
  598  * reservations.  See vm_reserv_alloc_contig() for a description of the
  599  * function's parameters.
  600  *
  601  * The page "mpred" must immediately precede the offset "pindex" within the
  602  * specified object.
  603  *
  604  * The object must be locked.
  605  */
  606 vm_page_t
  607 vm_reserv_extend_contig(int req, vm_object_t object, vm_pindex_t pindex,
  608     int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
  609     u_long alignment, vm_paddr_t boundary, vm_page_t mpred)
  610 {
  611         struct vm_domain *vmd;
  612         vm_paddr_t pa, size;
  613         vm_page_t m, msucc;
  614         vm_reserv_t rv;
  615         int i, index;
  616 
  617         VM_OBJECT_ASSERT_WLOCKED(object);
   618         KASSERT(npages != 0, ("vm_reserv_extend_contig: npages is 0"));
  619 
  620         /*
  621          * Is a reservation fundamentally impossible?
  622          */
  623         if (pindex < VM_RESERV_INDEX(object, pindex) ||
  624             pindex + npages > object->size || object->resident_page_count == 0)
  625                 return (NULL);
  626 
  627         /*
  628          * All reservations of a particular size have the same alignment.
  629          * Assuming that the first page is allocated from a reservation, the
  630          * least significant bits of its physical address can be determined
  631          * from its offset from the beginning of the reservation and the size
  632          * of the reservation.
  633          *
  634          * Could the specified index within a reservation of the smallest
  635          * possible size satisfy the alignment and boundary requirements?
  636          */
  637         pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT;
  638         if ((pa & (alignment - 1)) != 0)
  639                 return (NULL);
  640         size = npages << PAGE_SHIFT;
  641         if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
  642                 return (NULL);
  643 
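        /*
         * For example, assuming 4 KB pages: with alignment == 65536, the
         * check above requires VM_RESERV_INDEX(object, pindex) to be a
         * multiple of 16 small pages; with npages == 8 and boundary == 65536,
         * the boundary check requires those 8 pages to fall within a single
         * 64 KB-aligned window of 16 pages.
         */
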
  644         /*
  645          * Look for an existing reservation.
  646          */
  647         rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
  648         if (rv == NULL)
  649                 return (NULL);
  650         KASSERT(object != kernel_object || rv->domain == domain,
  651             ("vm_reserv_extend_contig: Domain mismatch from reservation."));
  652         index = VM_RESERV_INDEX(object, pindex);
  653         /* Does the allocation fit within the reservation? */
  654         if (index + npages > VM_LEVEL_0_NPAGES)
  655                 return (NULL);
  656         domain = rv->domain;
  657         vmd = VM_DOMAIN(domain);
  658         vm_reserv_lock(rv);
  659         if (rv->object != object)
  660                 goto out;
  661         m = &rv->pages[index];
  662         pa = VM_PAGE_TO_PHYS(m);
  663         if (pa < low || pa + size > high || (pa & (alignment - 1)) != 0 ||
  664             ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
  665                 goto out;
  666         /* Handle vm_page_rename(m, new_object, ...). */
  667         for (i = 0; i < npages; i++) {
  668                 if (popmap_is_set(rv->popmap, index + i))
  669                         goto out;
  670         }
  671         if (!vm_domain_allocate(vmd, req, npages))
  672                 goto out;
  673         for (i = 0; i < npages; i++)
  674                 vm_reserv_populate(rv, index + i);
  675         vm_reserv_unlock(rv);
  676         return (m);
  677 
  678 out:
  679         vm_reserv_unlock(rv);
  680         return (NULL);
  681 }
  682 
  683 /*
  684  * Allocates a contiguous set of physical pages of the given size "npages"
  685  * from newly created reservations.  All of the physical pages
  686  * must be at or above the given physical address "low" and below the given
  687  * physical address "high".  The given value "alignment" determines the
  688  * alignment of the first physical page in the set.  If the given value
  689  * "boundary" is non-zero, then the set of physical pages cannot cross any
  690  * physical address boundary that is a multiple of that value.  Both
  691  * "alignment" and "boundary" must be a power of two.
  692  *
  693  * Callers should first invoke vm_reserv_extend_contig() to attempt an
  694  * allocation from existing reservations.
  695  *
  696  * The page "mpred" must immediately precede the offset "pindex" within the
  697  * specified object.
  698  *
  699  * The object and free page queue must be locked.
  700  */
  701 vm_page_t
  702 vm_reserv_alloc_contig(int req, vm_object_t object, vm_pindex_t pindex, int domain,
  703     u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
  704     vm_paddr_t boundary, vm_page_t mpred)
  705 {
  706         struct vm_domain *vmd;
  707         vm_paddr_t pa, size;
  708         vm_page_t m, m_ret, msucc;
  709         vm_pindex_t first, leftcap, rightcap;
  710         vm_reserv_t rv;
  711         u_long allocpages, maxpages, minpages;
  712         int i, index, n;
  713 
  714         VM_OBJECT_ASSERT_WLOCKED(object);
  715         KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0"));
  716 
  717         /*
  718          * Is a reservation fundamentally impossible?
  719          */
  720         if (pindex < VM_RESERV_INDEX(object, pindex) ||
  721             pindex + npages > object->size)
  722                 return (NULL);
  723 
  724         /*
  725          * All reservations of a particular size have the same alignment.
  726          * Assuming that the first page is allocated from a reservation, the
  727          * least significant bits of its physical address can be determined
  728          * from its offset from the beginning of the reservation and the size
  729          * of the reservation.
  730          *
  731          * Could the specified index within a reservation of the smallest
  732          * possible size satisfy the alignment and boundary requirements?
  733          */
  734         pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT;
  735         if ((pa & (alignment - 1)) != 0)
  736                 return (NULL);
  737         size = npages << PAGE_SHIFT;
  738         if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
  739                 return (NULL);
  740 
  741         /*
  742          * Callers should've extended an existing reservation prior to
  743          * calling this function.  If a reservation exists it is
  744          * incompatible with the allocation.
  745          */
  746         rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
  747         if (rv != NULL)
  748                 return (NULL);
  749 
  750         /*
  751          * Could at least one reservation fit between the first index to the
  752          * left that can be used ("leftcap") and the first index to the right
  753          * that cannot be used ("rightcap")?
  754          *
  755          * We must synchronize with the reserv object lock to protect the
  756          * pindex/object of the resulting reservations against rename while
  757          * we are inspecting.
  758          */
  759         first = pindex - VM_RESERV_INDEX(object, pindex);
  760         minpages = VM_RESERV_INDEX(object, pindex) + npages;
  761         maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES);
  762         allocpages = maxpages;
  763         vm_reserv_object_lock(object);
  764         if (mpred != NULL) {
  765                 if ((rv = vm_reserv_from_page(mpred))->object != object)
  766                         leftcap = mpred->pindex + 1;
  767                 else
  768                         leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
  769                 if (leftcap > first) {
  770                         vm_reserv_object_unlock(object);
  771                         return (NULL);
  772                 }
  773         }
  774         if (msucc != NULL) {
  775                 if ((rv = vm_reserv_from_page(msucc))->object != object)
  776                         rightcap = msucc->pindex;
  777                 else
  778                         rightcap = rv->pindex;
  779                 if (first + maxpages > rightcap) {
  780                         if (maxpages == VM_LEVEL_0_NPAGES) {
  781                                 vm_reserv_object_unlock(object);
  782                                 return (NULL);
  783                         }
  784 
  785                         /*
  786                          * At least one reservation will fit between "leftcap"
  787                          * and "rightcap".  However, a reservation for the
  788                          * last of the requested pages will not fit.  Reduce
  789                          * the size of the upcoming allocation accordingly.
  790                          */
  791                         allocpages = minpages;
  792                 }
  793         }
  794         vm_reserv_object_unlock(object);
  795 
  796         /*
  797          * Would the last new reservation extend past the end of the object?
  798          */
  799         if (first + maxpages > object->size) {
  800                 /*
  801                  * Don't allocate the last new reservation if the object is a
  802                  * vnode or backed by another object that is a vnode. 
  803                  */
  804                 if (object->type == OBJT_VNODE ||
  805                     (object->backing_object != NULL &&
  806                     object->backing_object->type == OBJT_VNODE)) {
  807                         if (maxpages == VM_LEVEL_0_NPAGES)
  808                                 return (NULL);
  809                         allocpages = minpages;
  810                 }
  811                 /* Speculate that the object may grow. */
  812         }
  813 
  814         /*
  815          * Allocate the physical pages.  The alignment and boundary specified
  816          * for this allocation may be different from the alignment and
  817          * boundary specified for the requested pages.  For instance, the
  818          * specified index may not be the first page within the first new
  819          * reservation.
  820          */
  821         m = NULL;
  822         vmd = VM_DOMAIN(domain);
  823         if (vm_domain_allocate(vmd, req, npages)) {
  824                 vm_domain_free_lock(vmd);
  825                 m = vm_phys_alloc_contig(domain, allocpages, low, high,
  826                     ulmax(alignment, VM_LEVEL_0_SIZE),
  827                     boundary > VM_LEVEL_0_SIZE ? boundary : 0);
  828                 vm_domain_free_unlock(vmd);
  829                 if (m == NULL) {
  830                         vm_domain_freecnt_inc(vmd, npages);
  831                         return (NULL);
  832                 }
  833         } else
  834                 return (NULL);
  835         KASSERT(vm_phys_domain(m) == domain,
  836             ("vm_reserv_alloc_contig: Page domain does not match requested."));
  837 
  838         /*
  839          * The allocated physical pages always begin at a reservation
  840          * boundary, but they do not always end at a reservation boundary.
  841          * Initialize every reservation that is completely covered by the
  842          * allocated physical pages.
  843          */
  844         m_ret = NULL;
  845         index = VM_RESERV_INDEX(object, pindex);
  846         do {
  847                 rv = vm_reserv_from_page(m);
  848                 KASSERT(rv->pages == m,
  849                     ("vm_reserv_alloc_contig: reserv %p's pages is corrupted",
  850                     rv));
  851                 vm_reserv_lock(rv);
  852                 vm_reserv_insert(rv, object, first);
  853                 n = ulmin(VM_LEVEL_0_NPAGES - index, npages);
  854                 for (i = 0; i < n; i++)
  855                         vm_reserv_populate(rv, index + i);
  856                 npages -= n;
  857                 if (m_ret == NULL) {
  858                         m_ret = &rv->pages[index];
  859                         index = 0;
  860                 }
  861                 vm_reserv_unlock(rv);
  862                 m += VM_LEVEL_0_NPAGES;
  863                 first += VM_LEVEL_0_NPAGES;
  864                 allocpages -= VM_LEVEL_0_NPAGES;
  865         } while (allocpages >= VM_LEVEL_0_NPAGES);
  866         return (m_ret);
  867 }
  868 
  869 /*
  870  * Attempts to extend an existing reservation and allocate the page to the
  871  * object.
  872  *
  873  * The page "mpred" must immediately precede the offset "pindex" within the
  874  * specified object.
  875  *
  876  * The object must be locked.
  877  */
  878 vm_page_t
  879 vm_reserv_extend(int req, vm_object_t object, vm_pindex_t pindex, int domain,
  880     vm_page_t mpred)
  881 {
  882         struct vm_domain *vmd;
  883         vm_page_t m, msucc;
  884         vm_reserv_t rv;
  885         int index;
  886 
  887         VM_OBJECT_ASSERT_WLOCKED(object);
  888 
  889         /*
  890          * Could a reservation currently exist?
  891          */
  892         if (pindex < VM_RESERV_INDEX(object, pindex) ||
  893             pindex >= object->size || object->resident_page_count == 0)
  894                 return (NULL);
  895 
  896         /*
  897          * Look for an existing reservation.
  898          */
  899         rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
  900         if (rv == NULL)
  901                 return (NULL);
  902 
  903         KASSERT(object != kernel_object || rv->domain == domain,
  904             ("vm_reserv_extend: Domain mismatch from reservation."));
  905         domain = rv->domain;
  906         vmd = VM_DOMAIN(domain);
  907         index = VM_RESERV_INDEX(object, pindex);
  908         m = &rv->pages[index];
  909         vm_reserv_lock(rv);
  910         /* Handle reclaim race. */
  911         if (rv->object != object ||
  912             /* Handle vm_page_rename(m, new_object, ...). */
  913             popmap_is_set(rv->popmap, index)) {
  914                 m = NULL;
  915                 goto out;
  916         }
  917         if (vm_domain_allocate(vmd, req, 1) == 0)
  918                 m = NULL;
  919         else
  920                 vm_reserv_populate(rv, index);
  921 out:
  922         vm_reserv_unlock(rv);
  923 
  924         return (m);
  925 }
  926 
  927 /*
  928  * Attempts to allocate a new reservation for the object, and allocates a
  929  * page from that reservation.  Callers should first invoke vm_reserv_extend()
  930  * to attempt an allocation from an existing reservation.
  931  *
  932  * The page "mpred" must immediately precede the offset "pindex" within the
  933  * specified object.
  934  *
  935  * The object and free page queue must be locked.
  936  */
  937 vm_page_t
  938 vm_reserv_alloc_page(int req, vm_object_t object, vm_pindex_t pindex, int domain,
  939     vm_page_t mpred)
  940 {
  941         struct vm_domain *vmd;
  942         vm_page_t m, msucc;
  943         vm_pindex_t first, leftcap, rightcap;
  944         vm_reserv_t rv;
  945         int index;
  946 
  947         VM_OBJECT_ASSERT_WLOCKED(object);
  948 
  949         /*
  950          * Is a reservation fundamentally impossible?
  951          */
  952         if (pindex < VM_RESERV_INDEX(object, pindex) ||
  953             pindex >= object->size)
  954                 return (NULL);
  955 
  956         /*
  957          * Callers should've extended an existing reservation prior to
  958          * calling this function.  If a reservation exists it is
  959          * incompatible with the allocation.
  960          */
  961         rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
  962         if (rv != NULL)
  963                 return (NULL);
  964 
  965         /*
  966          * Could a reservation fit between the first index to the left that
  967          * can be used and the first index to the right that cannot be used?
  968          *
  969          * We must synchronize with the reserv object lock to protect the
  970          * pindex/object of the resulting reservations against rename while
  971          * we are inspecting.
  972          */
  973         first = pindex - VM_RESERV_INDEX(object, pindex);
  974         vm_reserv_object_lock(object);
  975         if (mpred != NULL) {
  976                 if ((rv = vm_reserv_from_page(mpred))->object != object)
  977                         leftcap = mpred->pindex + 1;
  978                 else
  979                         leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
  980                 if (leftcap > first) {
  981                         vm_reserv_object_unlock(object);
  982                         return (NULL);
  983                 }
  984         }
  985         if (msucc != NULL) {
  986                 if ((rv = vm_reserv_from_page(msucc))->object != object)
  987                         rightcap = msucc->pindex;
  988                 else
  989                         rightcap = rv->pindex;
  990                 if (first + VM_LEVEL_0_NPAGES > rightcap) {
  991                         vm_reserv_object_unlock(object);
  992                         return (NULL);
  993                 }
  994         }
  995         vm_reserv_object_unlock(object);
  996 
  997         /*
  998          * Would a new reservation extend past the end of the object? 
  999          */
 1000         if (first + VM_LEVEL_0_NPAGES > object->size) {
 1001                 /*
 1002                  * Don't allocate a new reservation if the object is a vnode or
 1003                  * backed by another object that is a vnode. 
 1004                  */
 1005                 if (object->type == OBJT_VNODE ||
 1006                     (object->backing_object != NULL &&
 1007                     object->backing_object->type == OBJT_VNODE))
 1008                         return (NULL);
 1009                 /* Speculate that the object may grow. */
 1010         }
 1011 
 1012         /*
 1013          * Allocate and populate the new reservation.
 1014          */
 1015         m = NULL;
 1016         vmd = VM_DOMAIN(domain);
 1017         if (vm_domain_allocate(vmd, req, 1)) {
 1018                 vm_domain_free_lock(vmd);
 1019                 m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT,
 1020                     VM_LEVEL_0_ORDER);
 1021                 vm_domain_free_unlock(vmd);
 1022                 if (m == NULL) {
 1023                         vm_domain_freecnt_inc(vmd, 1);
 1024                         return (NULL);
 1025                 }
 1026         } else
 1027                 return (NULL);
 1028         rv = vm_reserv_from_page(m);
 1029         vm_reserv_lock(rv);
 1030         KASSERT(rv->pages == m,
 1031             ("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv));
 1032         vm_reserv_insert(rv, object, first);
 1033         index = VM_RESERV_INDEX(object, pindex);
 1034         vm_reserv_populate(rv, index);
 1035         vm_reserv_unlock(rv);
 1036 
 1037         return (&rv->pages[index]);
 1038 }
 1039 
 1040 /*
 1041  * Breaks the given reservation.  All free pages in the reservation
 1042  * are returned to the physical memory allocator.  The reservation's
 1043  * population count and map are reset to their initial state.
 1044  *
 1045  * The given reservation must not be in the partially populated reservation
 1046  * queue.
 1047  */
 1048 static void
 1049 vm_reserv_break(vm_reserv_t rv)
 1050 {
 1051         int begin_zeroes, hi, i, lo;
 1052 
 1053         vm_reserv_assert_locked(rv);
 1054         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
 1055             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
 1056         vm_reserv_remove(rv);
 1057         rv->pages->psind = 0;
 1058         i = hi = 0;
 1059         do {
 1060                 /* Find the next 0 bit.  Any previous 0 bits are < "hi". */
 1061                 lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i]));
 1062                 if (lo == 0) {
 1063                         /* Redundantly clears bits < "hi". */
 1064                         rv->popmap[i] = 0;
 1065                         rv->popcnt -= NBPOPMAP - hi;
 1066                         while (++i < NPOPMAP) {
 1067                                 lo = ffsl(~rv->popmap[i]);
 1068                                 if (lo == 0) {
 1069                                         rv->popmap[i] = 0;
 1070                                         rv->popcnt -= NBPOPMAP;
 1071                                 } else
 1072                                         break;
 1073                         }
 1074                         if (i == NPOPMAP)
 1075                                 break;
 1076                         hi = 0;
 1077                 }
 1078                 KASSERT(lo > 0, ("vm_reserv_break: lo is %d", lo));
 1079                 /* Convert from ffsl() to ordinary bit numbering. */
 1080                 lo--;
 1081                 if (lo > 0) {
 1082                         /* Redundantly clears bits < "hi". */
 1083                         rv->popmap[i] &= ~((1UL << lo) - 1);
 1084                         rv->popcnt -= lo - hi;
 1085                 }
 1086                 begin_zeroes = NBPOPMAP * i + lo;
 1087                 /* Find the next 1 bit. */
 1088                 do
 1089                         hi = ffsl(rv->popmap[i]);
 1090                 while (hi == 0 && ++i < NPOPMAP);
 1091                 if (i != NPOPMAP)
 1092                         /* Convert from ffsl() to ordinary bit numbering. */
 1093                         hi--;
 1094                 vm_domain_free_lock(VM_DOMAIN(rv->domain));
 1095                 vm_phys_free_contig(&rv->pages[begin_zeroes], NBPOPMAP * i +
 1096                     hi - begin_zeroes);
 1097                 vm_domain_free_unlock(VM_DOMAIN(rv->domain));
 1098         } while (i < NPOPMAP);
 1099         KASSERT(rv->popcnt == 0,
 1100             ("vm_reserv_break: reserv %p's popcnt is corrupted", rv));
 1101         counter_u64_add(vm_reserv_broken, 1);
 1102 }
 1103 
 1104 /*
 1105  * Breaks all reservations belonging to the given object.
 1106  */
 1107 void
 1108 vm_reserv_break_all(vm_object_t object)
 1109 {
 1110         vm_reserv_t rv;
 1111 
 1112         /*
 1113          * This access of object->rvq is unsynchronized so that the
 1114          * object rvq lock can nest after the domain_free lock.  We
 1115          * must check for races in the results.  However, the object
 1116          * lock prevents new additions, so we are guaranteed that when
 1117          * it returns NULL the object is properly empty.
 1118          */
 1119         while ((rv = LIST_FIRST(&object->rvq)) != NULL) {
 1120                 vm_reserv_lock(rv);
 1121                 /* Reclaim race. */
 1122                 if (rv->object != object) {
 1123                         vm_reserv_unlock(rv);
 1124                         continue;
 1125                 }
 1126                 vm_reserv_domain_lock(rv->domain);
 1127                 if (rv->inpartpopq) {
 1128                         TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
 1129                         rv->inpartpopq = FALSE;
 1130                 }
 1131                 vm_reserv_domain_unlock(rv->domain);
 1132                 vm_reserv_break(rv);
 1133                 vm_reserv_unlock(rv);
 1134         }
 1135 }
 1136 
 1137 /*
 1138  * Frees the given page if it belongs to a reservation.  Returns TRUE if the
 1139  * page is freed and FALSE otherwise.
 1140  */
 1141 boolean_t
 1142 vm_reserv_free_page(vm_page_t m)
 1143 {
 1144         vm_reserv_t rv;
 1145         boolean_t ret;
 1146 
 1147         rv = vm_reserv_from_page(m);
 1148         if (rv->object == NULL)
 1149                 return (FALSE);
 1150         vm_reserv_lock(rv);
 1151         /* Re-validate after lock. */
 1152         if (rv->object != NULL) {
 1153                 vm_reserv_depopulate(rv, m - rv->pages);
 1154                 ret = TRUE;
 1155         } else
 1156                 ret = FALSE;
 1157         vm_reserv_unlock(rv);
 1158 
 1159         return (ret);
 1160 }
 1161 
 1162 /*
 1163  * Initializes the reservation management system.  Specifically, initializes
 1164  * the reservation array.
 1165  *
 1166  * Requires that vm_page_array and first_page are initialized!
 1167  */
 1168 void
 1169 vm_reserv_init(void)
 1170 {
 1171         vm_paddr_t paddr;
 1172         struct vm_phys_seg *seg;
 1173         struct vm_reserv *rv;
 1174         struct vm_reserv_domain *rvd;
 1175         int i, j, segind;
 1176 
 1177         /*
 1178          * Initialize the reservation array.  Specifically, initialize the
 1179          * "pages" field for every element that has an underlying superpage.
 1180          */
 1181         for (segind = 0; segind < vm_phys_nsegs; segind++) {
 1182                 seg = &vm_phys_segs[segind];
 1183                 paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
 1184                 while (paddr + VM_LEVEL_0_SIZE > paddr && paddr +
 1185                     VM_LEVEL_0_SIZE <= seg->end) {
 1186                         rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT];
 1187                         rv->pages = PHYS_TO_VM_PAGE(paddr);
 1188                         rv->domain = seg->domain;
 1189                         mtx_init(&rv->lock, "vm reserv", NULL, MTX_DEF);
 1190                         paddr += VM_LEVEL_0_SIZE;
 1191                 }
 1192         }
 1193         for (i = 0; i < MAXMEMDOM; i++) {
 1194                 rvd = &vm_rvd[i];
 1195                 mtx_init(&rvd->lock, "vm reserv domain", NULL, MTX_DEF);
 1196                 TAILQ_INIT(&rvd->partpop);
 1197                 mtx_init(&rvd->marker.lock, "vm reserv marker", NULL, MTX_DEF);
 1198 
 1199                 /*
 1200                  * Fully populated reservations should never be present in the
 1201                  * partially populated reservation queues.
 1202                  */
 1203                 rvd->marker.popcnt = VM_LEVEL_0_NPAGES;
 1204                 for (j = 0; j < VM_LEVEL_0_NPAGES; j++)
 1205                         popmap_set(rvd->marker.popmap, j);
 1206         }
 1207 
 1208         for (i = 0; i < VM_RESERV_OBJ_LOCK_COUNT; i++)
 1209                 mtx_init(&vm_reserv_object_mtx[i], "resv obj lock", NULL,
 1210                     MTX_DEF);
 1211 }
 1212 
 1213 /*
 1214  * Returns true if the given page belongs to a reservation and that page is
 1215  * free.  Otherwise, returns false.
 1216  */
 1217 bool
 1218 vm_reserv_is_page_free(vm_page_t m)
 1219 {
 1220         vm_reserv_t rv;
 1221 
 1222         rv = vm_reserv_from_page(m);
 1223         if (rv->object == NULL)
 1224                 return (false);
 1225         return (popmap_is_clear(rv->popmap, m - rv->pages));
 1226 }
 1227 
 1228 /*
 1229  * If the given page belongs to a reservation, returns the level of that
 1230  * reservation.  Otherwise, returns -1.
 1231  */
 1232 int
 1233 vm_reserv_level(vm_page_t m)
 1234 {
 1235         vm_reserv_t rv;
 1236 
 1237         rv = vm_reserv_from_page(m);
 1238         return (rv->object != NULL ? 0 : -1);
 1239 }
 1240 
 1241 /*
 1242  * Returns a reservation level if the given page belongs to a fully populated
 1243  * reservation and -1 otherwise.
 1244  */
 1245 int
 1246 vm_reserv_level_iffullpop(vm_page_t m)
 1247 {
 1248         vm_reserv_t rv;
 1249 
 1250         rv = vm_reserv_from_page(m);
 1251         return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1);
 1252 }
 1253 
 1254 /*
 1255  * Remove a partially populated reservation from the queue.
 1256  */
 1257 static void
 1258 vm_reserv_dequeue(vm_reserv_t rv)
 1259 {
 1260 
 1261         vm_reserv_domain_assert_locked(rv->domain);
 1262         vm_reserv_assert_locked(rv);
 1263         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
 1264             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
 1265         KASSERT(rv->inpartpopq,
  1266             ("vm_reserv_dequeue: reserv %p's inpartpopq is FALSE", rv));
 1267 
 1268         TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
 1269         rv->inpartpopq = FALSE;
 1270 }
 1271 
 1272 /*
 1273  * Breaks the given partially populated reservation, releasing its free pages
 1274  * to the physical memory allocator.
 1275  */
 1276 static void
 1277 vm_reserv_reclaim(vm_reserv_t rv)
 1278 {
 1279 
 1280         vm_reserv_assert_locked(rv);
 1281         CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
 1282             __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
 1283         if (rv->inpartpopq) {
 1284                 vm_reserv_domain_lock(rv->domain);
 1285                 vm_reserv_dequeue(rv);
 1286                 vm_reserv_domain_unlock(rv->domain);
 1287         }
 1288         vm_reserv_break(rv);
 1289         counter_u64_add(vm_reserv_reclaimed, 1);
 1290 }
 1291 
 1292 /*
 1293  * Breaks a reservation near the head of the partially populated reservation
 1294  * queue, releasing its free pages to the physical memory allocator.  Returns
 1295  * TRUE if a reservation is broken and FALSE otherwise.
 1296  */
 1297 bool
 1298 vm_reserv_reclaim_inactive(int domain)
 1299 {
 1300         vm_reserv_t rv;
 1301 
 1302         vm_reserv_domain_lock(domain);
 1303         TAILQ_FOREACH(rv, &vm_rvd[domain].partpop, partpopq) {
 1304                 /*
 1305                  * A locked reservation is likely being updated or reclaimed,
 1306                  * so just skip ahead.
 1307                  */
 1308                 if (rv != &vm_rvd[domain].marker && vm_reserv_trylock(rv)) {
 1309                         vm_reserv_dequeue(rv);
 1310                         break;
 1311                 }
 1312         }
 1313         vm_reserv_domain_unlock(domain);
 1314         if (rv != NULL) {
 1315                 vm_reserv_reclaim(rv);
 1316                 vm_reserv_unlock(rv);
 1317                 return (true);
 1318         }
 1319         return (false);
 1320 }
 1321 
 1322 /*
 1323  * Determine whether this reservation has free pages that satisfy the given
 1324  * request for contiguous physical memory.  Start searching from the lower
 1325  * bound, defined by lo, and stop at the upper bound, hi.  Return the index
 1326  * of the first satisfactory free page, or -1 if none is found.
 1327  */
 1328 static int
 1329 vm_reserv_find_contig(vm_reserv_t rv, int npages, int lo,
 1330     int hi, int ppn_align, int ppn_bound)
 1331 {
 1332         u_long changes;
 1333         int bitpos, bits_left, i, n;
 1334 
 1335         vm_reserv_assert_locked(rv);
 1336         KASSERT(npages <= VM_LEVEL_0_NPAGES - 1,
 1337             ("%s: Too many pages", __func__));
 1338         KASSERT(ppn_bound <= VM_LEVEL_0_NPAGES,
 1339             ("%s: Too big a boundary for reservation size", __func__));
 1340         KASSERT(npages <= ppn_bound,
 1341             ("%s: Too many pages for given boundary", __func__));
 1342         KASSERT(ppn_align != 0 && powerof2(ppn_align),
 1343             ("ppn_align is not a positive power of 2"));
 1344         KASSERT(ppn_bound != 0 && powerof2(ppn_bound),
 1345             ("ppn_bound is not a positive power of 2"));
 1346         i = lo / NBPOPMAP;
 1347         changes = rv->popmap[i] | ((1UL << (lo % NBPOPMAP)) - 1);
 1348         n = hi / NBPOPMAP;
 1349         bits_left = hi % NBPOPMAP;
 1350         hi = lo = -1;
 1351         for (;;) {
 1352                 /*
 1353                  * "changes" is a bitmask that marks where a new sequence of
 1354                  * 0s or 1s begins in popmap[i], with last bit in popmap[i-1]
 1355                  * considered to be 1 if and only if lo == hi.  The bits of
 1356                  * popmap[-1] and popmap[NPOPMAP] are considered all 1s.
 1357                  */
 1358                 changes ^= (changes << 1) | (lo == hi);
 1359                 while (changes != 0) {
 1360                         /*
 1361                          * If the next change marked begins a run of 0s, set
 1362                          * lo to mark that position.  Otherwise set hi and
 1363                          * look for a satisfactory first page from lo up to hi.
 1364                          */
 1365                         bitpos = ffsl(changes) - 1;
 1366                         changes ^= 1UL << bitpos;
 1367                         if (lo == hi) {
 1368                                 lo = NBPOPMAP * i + bitpos;
 1369                                 continue;
 1370                         }
 1371                         hi = NBPOPMAP * i + bitpos;
 1372                         if (lo < roundup2(lo, ppn_align)) {
 1373                                 /* Skip to next aligned page. */
 1374                                 lo = roundup2(lo, ppn_align);
 1375                                 if (lo >= VM_LEVEL_0_NPAGES)
 1376                                         return (-1);
 1377                         }
 1378                         if (lo + npages > roundup2(lo, ppn_bound)) {
 1379                                 /* Skip to next boundary-matching page. */
 1380                                 lo = roundup2(lo, ppn_bound);
 1381                                 if (lo >= VM_LEVEL_0_NPAGES)
 1382                                         return (-1);
 1383                         }
 1384                         if (lo + npages <= hi)
 1385                                 return (lo);
 1386                         lo = hi;
 1387                 }
 1388                 if (++i < n)
 1389                         changes = rv->popmap[i];
 1390                 else if (i == n)
 1391                         changes = bits_left == 0 ? -1UL :
 1392                             (rv->popmap[n] | (-1UL << bits_left));
 1393                 else
 1394                         return (-1);
 1395         }
 1396 }
 1397 
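In the popmap, a set bit means the base page is populated, so finding npages free pages means finding a run of npages clear bits.  The "changes ^= (changes << 1) | carry" step marks every position where the bit value differs from its predecessor, and ffsl() then hops from one run boundary to the next instead of testing pages one at a time.  Below is a minimal single-word sketch of that trick; the multi-word popmap handling, the low-bit masking for "lo", and the alignment/boundary checks are all omitted, so this is an illustration rather than the kernel routine.

#include <stdio.h>
#include <stdint.h>

#define	NBITS	64

/*
 * Find the first run of at least "npages" clear bits in "popmap",
 * searching bit positions [0, NBITS).  Returns the starting bit
 * index or -1.  Simplified single-word version of the scan in
 * vm_reserv_find_contig().
 */
static int
find_clear_run(uint64_t popmap, int npages)
{
	uint64_t changes;
	int bitpos, lo, hi;

	lo = hi = -1;
	/*
	 * Each set bit in "changes" marks a position whose value
	 * differs from the previous position; the bit before
	 * position 0 is treated as 1, so a leading run of 0s shows
	 * up as a change at position 0.
	 */
	changes = popmap ^ ((popmap << 1) | (lo == hi));
	while (changes != 0) {
		bitpos = __builtin_ffsll(changes) - 1;
		changes ^= (uint64_t)1 << bitpos;
		if (lo == hi) {
			lo = bitpos;	/* a run of 0s begins here */
			continue;
		}
		hi = bitpos;		/* the run of 0s ends here */
		if (lo + npages <= hi)
			return (lo);
		lo = hi;
	}
	/* A trailing run of 0s extends to NBITS. */
	if (lo != hi && lo + npages <= NBITS)
		return (lo);
	return (-1);
}

int
main(void)
{
	uint64_t map = ~0x3f0ULL;	/* bits 4..9 are clear */

	printf("%d\n", find_clear_run(map, 4));	/* prints 4 */
	return (0);
}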
 1398 /*
 1399  * Searches the partially populated reservation queue for the least recently
 1400  * changed reservation with free pages that satisfy the given request for
 1401  * contiguous physical memory.  If a satisfactory reservation is found, it is
 1402  * broken.  Returns true if a reservation is broken and false otherwise.
 1403  */
 1404 bool
 1405 vm_reserv_reclaim_contig(int domain, u_long npages, vm_paddr_t low,
 1406     vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
 1407 {
 1408         struct vm_reserv_queue *queue;
 1409         vm_paddr_t pa, size;
 1410         vm_reserv_t marker, rv, rvn;
 1411         int hi, lo, posn, ppn_align, ppn_bound;
 1412 
 1413         KASSERT(npages > 0, ("npages is 0"));
 1414         KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
 1415         KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
 1416         if (npages > VM_LEVEL_0_NPAGES - 1)
 1417                 return (false);
 1418         size = npages << PAGE_SHIFT;
 1419         /* 
 1420          * Ensure that a free range starting at a boundary-multiple
 1421          * doesn't include a boundary-multiple within it.  Otherwise,
 1422          * no boundary-constrained allocation is possible.
 1423          */
 1424         if (size > boundary && boundary > 0)
 1425                 return (false);
 1426         marker = &vm_rvd[domain].marker;
 1427         queue = &vm_rvd[domain].partpop;
 1428         /*
 1429          * Compute shifted alignment, boundary values for page-based
 1430          * calculations.  Constrain to range [1, VM_LEVEL_0_NPAGES] to
 1431          * avoid overflow.
 1432          */
 1433         ppn_align = (int)(ulmin(ulmax(PAGE_SIZE, alignment),
 1434             VM_LEVEL_0_SIZE) >> PAGE_SHIFT);
 1435         ppn_bound = boundary == 0 ? VM_LEVEL_0_NPAGES :
 1436             (int)(MIN(MAX(PAGE_SIZE, boundary),
 1437             VM_LEVEL_0_SIZE) >> PAGE_SHIFT);
 1438 
 1439         vm_reserv_domain_scan_lock(domain);
 1440         vm_reserv_domain_lock(domain);
 1441         TAILQ_FOREACH_SAFE(rv, queue, partpopq, rvn) {
 1442                 pa = VM_PAGE_TO_PHYS(&rv->pages[0]);
 1443                 if (pa + VM_LEVEL_0_SIZE - size < low) {
 1444                         /* This entire reservation is too low; go to next. */
 1445                         continue;
 1446                 }
 1447                 if (pa + size > high) {
 1448                         /* This entire reservation is too high; go to next. */
 1449                         continue;
 1450                 }
 1451                 if ((pa & (alignment - 1)) != 0) {
 1452                         /* This entire reservation is unaligned; go to next. */
 1453                         continue;
 1454                 }
 1455 
 1456                 if (vm_reserv_trylock(rv) == 0) {
 1457                         TAILQ_INSERT_AFTER(queue, rv, marker, partpopq);
 1458                         vm_reserv_domain_unlock(domain);
 1459                         vm_reserv_lock(rv);
 1460                         if (!rv->inpartpopq ||
 1461                             TAILQ_NEXT(rv, partpopq) != marker) {
 1462                                 vm_reserv_unlock(rv);
 1463                                 vm_reserv_domain_lock(domain);
 1464                                 rvn = TAILQ_NEXT(marker, partpopq);
 1465                                 TAILQ_REMOVE(queue, marker, partpopq);
 1466                                 continue;
 1467                         }
 1468                         vm_reserv_domain_lock(domain);
 1469                         TAILQ_REMOVE(queue, marker, partpopq);
 1470                 }
 1471                 vm_reserv_domain_unlock(domain);
 1472                 lo = (pa >= low) ? 0 :
 1473                     (int)((low + PAGE_MASK - pa) >> PAGE_SHIFT);
 1474                 hi = (pa + VM_LEVEL_0_SIZE <= high) ? VM_LEVEL_0_NPAGES :
 1475                     (int)((high - pa) >> PAGE_SHIFT);
 1476                 posn = vm_reserv_find_contig(rv, (int)npages, lo, hi,
 1477                     ppn_align, ppn_bound);
 1478                 if (posn >= 0) {
 1479                         pa = VM_PAGE_TO_PHYS(&rv->pages[posn]);
 1480                         KASSERT((pa & (alignment - 1)) == 0,
 1481                             ("%s: adjusted address does not align to %lx",
 1482                             __func__, alignment));
 1483                         KASSERT(((pa ^ (pa + size - 1)) & -boundary) == 0,
 1484                             ("%s: adjusted address spans boundary to %jx",
 1485                             __func__, (uintmax_t)boundary));
 1486 
 1487                         vm_reserv_domain_scan_unlock(domain);
 1488                         vm_reserv_reclaim(rv);
 1489                         vm_reserv_unlock(rv);
 1490                         return (true);
 1491                 }
 1492                 vm_reserv_unlock(rv);
 1493                 vm_reserv_domain_lock(domain);
 1494         }
 1495         vm_reserv_domain_unlock(domain);
 1496         vm_reserv_domain_scan_unlock(domain);
 1497         return (false);
 1498 }
 1499 
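Two details above are easy to miss: a request larger than the boundary can never be placed, because any such range necessarily spans a boundary multiple, and the byte-valued alignment/boundary arguments are clamped and converted to page-frame terms before vm_reserv_find_contig() is called.  A small standalone sketch with assumed amd64-like constants (4 KB pages, 2 MB reservations); the macro and variable names are local to the sketch:

#include <stdio.h>

/* Assumed amd64-like values; the real constants come from the pmap. */
#define	PAGE_SHIFT	12
#define	PAGE_SIZE	(1UL << PAGE_SHIFT)		/* 4 KB */
#define	LEVEL_0_NPAGES	512
#define	LEVEL_0_SIZE	(PAGE_SIZE * LEVEL_0_NPAGES)	/* 2 MB */

#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#define	MAX(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	unsigned long npages = 16;
	unsigned long alignment = 65536;	/* 64 KB */
	unsigned long boundary = 32768;		/* 32 KB */
	unsigned long size = npages << PAGE_SHIFT;
	int ppn_align, ppn_bound;

	/*
	 * A 64 KB request can never fit between consecutive 32 KB
	 * boundaries, so the reclaim gives up immediately.
	 */
	if (size > boundary && boundary > 0)
		printf("unsatisfiable: size %lu > boundary %lu\n",
		    size, boundary);

	/* Byte-valued constraints converted to page-frame terms. */
	ppn_align = (int)(MIN(MAX(PAGE_SIZE, alignment),
	    LEVEL_0_SIZE) >> PAGE_SHIFT);
	ppn_bound = boundary == 0 ? LEVEL_0_NPAGES :
	    (int)(MIN(MAX(PAGE_SIZE, boundary),
	    LEVEL_0_SIZE) >> PAGE_SHIFT);
	printf("ppn_align %d ppn_bound %d\n", ppn_align, ppn_bound);
	return (0);
}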
 1500 /*
 1501  * Transfers the reservation underlying the given page to a new object.
 1502  *
 1503  * The object must be locked.
 1504  */
 1505 void
 1506 vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object,
 1507     vm_pindex_t old_object_offset)
 1508 {
 1509         vm_reserv_t rv;
 1510 
 1511         VM_OBJECT_ASSERT_WLOCKED(new_object);
 1512         rv = vm_reserv_from_page(m);
 1513         if (rv->object == old_object) {
 1514                 vm_reserv_lock(rv);
 1515                 CTR6(KTR_VM,
 1516                     "%s: rv %p object %p new %p popcnt %d inpartpop %d",
 1517                     __FUNCTION__, rv, rv->object, new_object, rv->popcnt,
 1518                     rv->inpartpopq);
 1519                 if (rv->object == old_object) {
 1520                         vm_reserv_object_lock(old_object);
 1521                         rv->object = NULL;
 1522                         LIST_REMOVE(rv, objq);
 1523                         vm_reserv_object_unlock(old_object);
 1524                         vm_reserv_object_lock(new_object);
 1525                         rv->object = new_object;
 1526                         rv->pindex -= old_object_offset;
 1527                         LIST_INSERT_HEAD(&new_object->rvq, rv, objq);
 1528                         vm_reserv_object_unlock(new_object);
 1529                 }
 1530                 vm_reserv_unlock(rv);
 1531         }
 1532 }
 1533 
 1534 /*
 1535  * Returns the size (in bytes) of a reservation of the specified level.
 1536  */
 1537 int
 1538 vm_reserv_size(int level)
 1539 {
 1540 
 1541         switch (level) {
 1542         case 0:
 1543                 return (VM_LEVEL_0_SIZE);
 1544         case -1:
 1545                 return (PAGE_SIZE);
 1546         default:
 1547                 return (0);
 1548         }
 1549 }
 1550 
 1551 /*
 1552  * Allocates the virtual and physical memory required by the reservation
 1553  * management system's data structures, in particular, the reservation array.
 1554  */
 1555 vm_paddr_t
 1556 vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t high_water)
 1557 {
 1558         vm_paddr_t new_end;
 1559         size_t size;
 1560 
 1561         /*
 1562          * Calculate the size (in bytes) of the reservation array.  Round up
 1563          * from "high_water" because every small page is mapped to an element
 1564          * in the reservation array based on its physical address.  Thus, the
 1565          * number of elements in the reservation array can be greater than the
 1566          * number of superpages. 
 1567          */
 1568         size = howmany(high_water, VM_LEVEL_0_SIZE) * sizeof(struct vm_reserv);
 1569 
 1570         /*
 1571          * Allocate and map the physical memory for the reservation array.  The
 1572          * next available virtual address is returned by reference.
 1573          */
 1574         new_end = end - round_page(size);
 1575         vm_reserv_array = (void *)(uintptr_t)pmap_map(vaddr, new_end, end,
 1576             VM_PROT_READ | VM_PROT_WRITE);
 1577         bzero(vm_reserv_array, size);
 1578 
 1579         /*
 1580          * Return the next available physical address.
 1581          */
 1582         return (new_end);
 1583 }
 1584 
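The array is sized from the highest physical address rather than from the amount of installed memory, one entry per possible reservation-sized chunk.  As a worked example under assumed amd64-like values (2 MB reservations), a machine whose high_water is 16 GB gets howmany(16 GB, 2 MB) = 8192 entries:

#include <stdio.h>

/* sys/param.h's howmany(), reproduced for a standalone sketch. */
#define	howmany(x, y)	(((x) + ((y) - 1)) / (y))

int
main(void)
{
	/* Assumed amd64-like values: 2 MB reservations, 16 GB high water. */
	unsigned long long level0_size = 2ULL << 20;
	unsigned long long high_water = 16ULL << 30;

	/* One vm_reserv entry per possible reservation-sized chunk. */
	printf("%llu entries\n", howmany(high_water, level0_size));
	return (0);
}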
 1585 /*
 1586  * Initializes the reservation management system.  Specifically, initializes
 1587  * the reservation counters.
 1588  */
 1589 static void
 1590 vm_reserv_counter_init(void *unused)
 1591 {
 1592 
 1593         vm_reserv_freed = counter_u64_alloc(M_WAITOK); 
 1594         vm_reserv_broken = counter_u64_alloc(M_WAITOK); 
 1595         vm_reserv_reclaimed = counter_u64_alloc(M_WAITOK); 
 1596 }
 1597 SYSINIT(vm_reserv_counter_init, SI_SUB_CPU, SI_ORDER_ANY,
 1598     vm_reserv_counter_init, NULL);
 1599 
 1600 /*
 1601  * Returns the superpage containing the given page.
 1602  */
 1603 vm_page_t
 1604 vm_reserv_to_superpage(vm_page_t m)
 1605 {
 1606         vm_reserv_t rv;
 1607 
 1608         VM_OBJECT_ASSERT_LOCKED(m->object);
 1609         rv = vm_reserv_from_page(m);
 1610         if (rv->object == m->object && rv->popcnt == VM_LEVEL_0_NPAGES)
 1611                 m = rv->pages;
 1612         else
 1613                 m = NULL;
 1614 
 1615         return (m);
 1616 }
 1617 
 1618 #endif  /* VM_NRESERVLEVEL > 0 */
