FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_page.c


    1 /*-
    2  * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
    3  *
    4  * Copyright (c) 1991 Regents of the University of California.
    5  * All rights reserved.
    6  * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
    7  *
    8  * This code is derived from software contributed to Berkeley by
    9  * The Mach Operating System project at Carnegie-Mellon University.
   10  *
   11  * Redistribution and use in source and binary forms, with or without
   12  * modification, are permitted provided that the following conditions
   13  * are met:
   14  * 1. Redistributions of source code must retain the above copyright
   15  *    notice, this list of conditions and the following disclaimer.
   16  * 2. Redistributions in binary form must reproduce the above copyright
   17  *    notice, this list of conditions and the following disclaimer in the
   18  *    documentation and/or other materials provided with the distribution.
   19  * 3. Neither the name of the University nor the names of its contributors
   20  *    may be used to endorse or promote products derived from this software
   21  *    without specific prior written permission.
   22  *
   23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   33  * SUCH DAMAGE.
   34  *
   35  *      from: @(#)vm_page.c     7.4 (Berkeley) 5/7/91
   36  */
   37 
   38 /*-
   39  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
   40  * All rights reserved.
   41  *
   42  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
   43  *
   44  * Permission to use, copy, modify and distribute this software and
   45  * its documentation is hereby granted, provided that both the copyright
   46  * notice and this permission notice appear in all copies of the
   47  * software, derivative works or modified versions, and any portions
   48  * thereof, and that both notices appear in supporting documentation.
   49  *
   50  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
   51  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
   52  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
   53  *
   54  * Carnegie Mellon requests users of this software to return to
   55  *
   56  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
   57  *  School of Computer Science
   58  *  Carnegie Mellon University
   59  *  Pittsburgh PA 15213-3890
   60  *
   61  * any improvements or extensions that they make and grant Carnegie the
   62  * rights to redistribute these changes.
   63  */
   64 
   65 /*
   66  *      Resident memory management module.
   67  */
   68 
   69 #include <sys/cdefs.h>
   70 __FBSDID("$FreeBSD$");
   71 
   72 #include "opt_vm.h"
   73 
   74 #include <sys/param.h>
   75 #include <sys/systm.h>
   76 #include <sys/lock.h>
   77 #include <sys/domainset.h>
   78 #include <sys/kernel.h>
   79 #include <sys/limits.h>
   80 #include <sys/linker.h>
   81 #include <sys/malloc.h>
   82 #include <sys/mman.h>
   83 #include <sys/msgbuf.h>
   84 #include <sys/mutex.h>
   85 #include <sys/proc.h>
   86 #include <sys/rwlock.h>
   87 #include <sys/sbuf.h>
   88 #include <sys/sched.h>
   89 #include <sys/smp.h>
   90 #include <sys/sysctl.h>
   91 #include <sys/vmmeter.h>
   92 #include <sys/vnode.h>
   93 
   94 #include <vm/vm.h>
   95 #include <vm/pmap.h>
   96 #include <vm/vm_param.h>
   97 #include <vm/vm_domainset.h>
   98 #include <vm/vm_kern.h>
   99 #include <vm/vm_map.h>
  100 #include <vm/vm_object.h>
  101 #include <vm/vm_page.h>
  102 #include <vm/vm_pageout.h>
  103 #include <vm/vm_phys.h>
  104 #include <vm/vm_pagequeue.h>
  105 #include <vm/vm_pager.h>
  106 #include <vm/vm_radix.h>
  107 #include <vm/vm_reserv.h>
  108 #include <vm/vm_extern.h>
  109 #include <vm/uma.h>
  110 #include <vm/uma_int.h>
  111 
  112 #include <machine/md_var.h>
  113 
  114 extern int      uma_startup_count(int);
  115 extern void     uma_startup(void *, int);
  116 extern int      vmem_startup_count(void);
  117 
  118 struct vm_domain vm_dom[MAXMEMDOM];
  119 
  120 DPCPU_DEFINE_STATIC(struct vm_batchqueue, pqbatch[MAXMEMDOM][PQ_COUNT]);
  121 
  122 struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT];
  123 
  124 struct mtx_padalign __exclusive_cache_line vm_domainset_lock;
  125 /* The following fields are protected by the domainset lock. */
  126 domainset_t __exclusive_cache_line vm_min_domains;
  127 domainset_t __exclusive_cache_line vm_severe_domains;
  128 static int vm_min_waiters;
  129 static int vm_severe_waiters;
  130 static int vm_pageproc_waiters;
  131 
  132 /*
  133  * bogus page -- for I/O to/from partially complete buffers,
  134  * or for paging into sparsely invalid regions.
  135  */
  136 vm_page_t bogus_page;
  137 
  138 vm_page_t vm_page_array;
  139 long vm_page_array_size;
  140 long first_page;
  141 
  142 static int boot_pages;
  143 SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
  144     &boot_pages, 0,
  145     "number of pages allocated for bootstrapping the VM system");
  146 
  147 static int pa_tryrelock_restart;
  148 SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
  149     &pa_tryrelock_restart, 0, "Number of tryrelock restarts");
  150 
  151 static TAILQ_HEAD(, vm_page) blacklist_head;
  152 static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS);
  153 SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD |
  154     CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages");
  155 
  156 static uma_zone_t fakepg_zone;
  157 
  158 static void vm_page_alloc_check(vm_page_t m);
  159 static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
  160 static void vm_page_dequeue_complete(vm_page_t m);
  161 static void vm_page_enqueue(vm_page_t m, uint8_t queue);
  162 static void vm_page_init(void *dummy);
  163 static int vm_page_insert_after(vm_page_t m, vm_object_t object,
  164     vm_pindex_t pindex, vm_page_t mpred);
  165 static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object,
  166     vm_page_t mpred);
  167 static int vm_page_reclaim_run(int req_class, int domain, u_long npages,
  168     vm_page_t m_run, vm_paddr_t high);
  169 static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object,
  170     int req);
  171 static int vm_page_zone_import(void *arg, void **store, int cnt, int domain,
  172     int flags);
  173 static void vm_page_zone_release(void *arg, void **store, int cnt);
  174 
  175 SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL);
  176 
  177 static void
  178 vm_page_init(void *dummy)
  179 {
  180 
  181         fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
  182             NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
  183         bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
  184             VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
  185 }
  186 
  187 /*
  188  * The cache page zone is initialized later since we need to be able to allocate
  189  * pages before UMA is fully initialized.
  190  */
  191 static void
  192 vm_page_init_cache_zones(void *dummy __unused)
  193 {
  194         struct vm_domain *vmd;
  195         struct vm_pgcache *pgcache;
  196         int domain, pool;
  197 
  198         for (domain = 0; domain < vm_ndomains; domain++) {
  199                 vmd = VM_DOMAIN(domain);
  200 
  201                 /*
  202                  * Don't allow the page caches to take up more than .1875% of
  203                  * memory.  A UMA bucket contains at most 256 free pages, and we
  204                  * have two buckets per CPU per free pool.
  205                  */
  206                 if (vmd->vmd_page_count / 600 < 2 * 256 * mp_ncpus *
  207                     VM_NFREEPOOL)
  208                         continue;
  209                 for (pool = 0; pool < VM_NFREEPOOL; pool++) {
  210                         pgcache = &vmd->vmd_pgcache[pool];
  211                         pgcache->domain = domain;
  212                         pgcache->pool = pool;
  213                         pgcache->zone = uma_zcache_create("vm pgcache",
  214                             sizeof(struct vm_page), NULL, NULL, NULL, NULL,
  215                             vm_page_zone_import, vm_page_zone_release, pgcache,
  216                             UMA_ZONE_NOBUCKETCACHE | UMA_ZONE_MAXBUCKET |
  217                             UMA_ZONE_VM);
  218                 }
  219         }
  220 }
  221 SYSINIT(vm_page2, SI_SUB_VM_CONF, SI_ORDER_ANY, vm_page_init_cache_zones, NULL);
  222 
  223 /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
  224 #if PAGE_SIZE == 32768
  225 #ifdef CTASSERT
  226 CTASSERT(sizeof(u_long) >= 8);
  227 #endif
  228 #endif
  229 
  230 /*
  231  * Try to acquire a physical address lock while a pmap is locked.  If we
  232  * fail to trylock we unlock and lock the pmap directly and cache the
  233  * locked pa in *locked.  The caller should then restart their loop in case
  234  * the virtual to physical mapping has changed.
  235  */
  236 int
  237 vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked)
  238 {
  239         vm_paddr_t lockpa;
  240 
  241         lockpa = *locked;
  242         *locked = pa;
  243         if (lockpa) {
  244                 PA_LOCK_ASSERT(lockpa, MA_OWNED);
  245                 if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa))
  246                         return (0);
  247                 PA_UNLOCK(lockpa);
  248         }
  249         if (PA_TRYLOCK(pa))
  250                 return (0);
  251         PMAP_UNLOCK(pmap);
  252         atomic_add_int(&pa_tryrelock_restart, 1);
  253         PA_LOCK(pa);
  254         PMAP_LOCK(pmap);
  255         return (EAGAIN);
  256 }
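/*
 * Editor's illustrative sketch (not part of vm_page.c): a hypothetical caller
 * that holds the pmap lock restarts its lookup whenever the relock above had
 * to drop that lock, since the virtual-to-physical mapping may have changed
 * in the meantime.  "pmap", "va", "pa", and "locked_pa" are assumed locals.
 *
 *	locked_pa = 0;
 *	PMAP_LOCK(pmap);
 * retry:
 *	pa = ...read the physical address mapped at "va" from the page table...;
 *	if (vm_page_pa_tryrelock(pmap, pa, &locked_pa) != 0)
 *		goto retry;			/* mapping may have changed */
 *	...operate on the page at "pa" with its pa lock held...
 *	if (locked_pa != 0)
 *		PA_UNLOCK(locked_pa);
 *	PMAP_UNLOCK(pmap);
 */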
  257 
  258 /*
  259  *      vm_set_page_size:
  260  *
  261  *      Sets the page size, perhaps based upon the memory
  262  *      size.  Must be called before any use of page-size
  263  *      dependent functions.
  264  */
  265 void
  266 vm_set_page_size(void)
  267 {
  268         if (vm_cnt.v_page_size == 0)
  269                 vm_cnt.v_page_size = PAGE_SIZE;
  270         if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0)
  271                 panic("vm_set_page_size: page size not a power of two");
  272 }
  273 
  274 /*
  275  *      vm_page_blacklist_next:
  276  *
  277  *      Find the next entry in the provided string of blacklist
  278  *      addresses.  Entries are separated by space, comma, or newline.
  279  *      If an invalid integer is encountered then the rest of the
  280  *      string is skipped.  Updates the list pointer to the next
  281  *      character, or NULL if the string is exhausted or invalid.
  282  */
  283 static vm_paddr_t
  284 vm_page_blacklist_next(char **list, char *end)
  285 {
  286         vm_paddr_t bad;
  287         char *cp, *pos;
  288 
  289         if (list == NULL || *list == NULL)
  290                 return (0);
  291         if (**list =='\0') {
  292                 *list = NULL;
  293                 return (0);
  294         }
  295 
  296         /*
  297          * If there's no end pointer then the buffer is coming from
  298          * the kenv and we know it's null-terminated.
  299          */
  300         if (end == NULL)
  301                 end = *list + strlen(*list);
  302 
  303         /* Ensure that strtoq() won't walk off the end */
  304         if (*end != '\0') {
  305                 if (*end == '\n' || *end == ' ' || *end  == ',')
  306                         *end = '\0';
  307                 else {
  308                         printf("Blacklist not terminated, skipping\n");
  309                         *list = NULL;
  310                         return (0);
  311                 }
  312         }
  313 
  314         for (pos = *list; *pos != '\0'; pos = cp) {
  315                 bad = strtoq(pos, &cp, 0);
  316                 if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') {
  317                         if (bad == 0) {
  318                                 if (++cp < end)
  319                                         continue;
  320                                 else
  321                                         break;
  322                         }
  323                 } else
  324                         break;
  325                 if (*cp == '\0' || ++cp >= end)
  326                         *list = NULL;
  327                 else
  328                         *list = cp;
  329                 return (trunc_page(bad));
  330         }
  331         printf("Garbage in RAM blacklist, skipping\n");
  332         *list = NULL;
  333         return (0);
  334 }
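/*
 * Editor's note (hypothetical example, not part of vm_page.c): a blacklist
 * string is a list of physical addresses separated by spaces, commas, or
 * newlines.  The entries are parsed with strtoq(..., 0), so hexadecimal and
 * decimal are both accepted, and each address is truncated to a page
 * boundary.  For instance, the tunable consulted later in this file could be
 * set as:
 *
 *	vm.blacklist="0x7d4b1000,0x7d4b3000 0x7e220000"
 */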
  335 
  336 bool
  337 vm_page_blacklist_add(vm_paddr_t pa, bool verbose)
  338 {
  339         struct vm_domain *vmd;
  340         vm_page_t m;
  341         int ret;
  342 
  343         m = vm_phys_paddr_to_vm_page(pa);
  344         if (m == NULL)
  345                 return (true); /* page does not exist, no failure */
  346 
  347         vmd = vm_pagequeue_domain(m);
  348         vm_domain_free_lock(vmd);
  349         ret = vm_phys_unfree_page(m);
  350         vm_domain_free_unlock(vmd);
  351         if (ret != 0) {
  352                 vm_domain_freecnt_inc(vmd, -1);
  353                 TAILQ_INSERT_TAIL(&blacklist_head, m, listq);
  354                 if (verbose)
  355                         printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa);
  356         }
  357         return (ret);
  358 }
  359 
  360 /*
  361  *      vm_page_blacklist_check:
  362  *
  363  *      Iterate through the provided string of blacklist addresses, pulling
  364  *      each entry out of the physical allocator free list and putting it
  365  *      onto a list for reporting via the vm.page_blacklist sysctl.
  366  */
  367 static void
  368 vm_page_blacklist_check(char *list, char *end)
  369 {
  370         vm_paddr_t pa;
  371         char *next;
  372 
  373         next = list;
  374         while (next != NULL) {
  375                 if ((pa = vm_page_blacklist_next(&next, end)) == 0)
  376                         continue;
  377                 vm_page_blacklist_add(pa, bootverbose);
  378         }
  379 }
  380 
  381 /*
  382  *      vm_page_blacklist_load:
  383  *
  384  *      Search for a special module named "ram_blacklist".  It'll be a
  385  *      plain text file provided by the user via the loader directive
  386  *      of the same name.
  387  */
  388 static void
  389 vm_page_blacklist_load(char **list, char **end)
  390 {
  391         void *mod;
  392         u_char *ptr;
  393         u_int len;
  394 
  395         mod = NULL;
  396         ptr = NULL;
  397 
  398         mod = preload_search_by_type("ram_blacklist");
  399         if (mod != NULL) {
  400                 ptr = preload_fetch_addr(mod);
  401                 len = preload_fetch_size(mod);
  402         }
  403         *list = ptr;
  404         if (ptr != NULL)
  405                 *end = ptr + len;
  406         else
  407                 *end = NULL;
  408         return;
  409 }
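/*
 * Editor's note (hedged example, not part of vm_page.c): this code only
 * relies on preload_search_by_type("ram_blacklist") finding a file that the
 * boot loader staged with that type.  With the stock loader, the generic
 * file-preload variables in loader.conf can typically arrange this, along
 * the lines of (the variable names shown are an assumption):
 *
 *	ram_blacklist_load="YES"
 *	ram_blacklist_name="/boot/blacklist.txt"
 *	ram_blacklist_type="ram_blacklist"
 */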
  410 
  411 static int
  412 sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS)
  413 {
  414         vm_page_t m;
  415         struct sbuf sbuf;
  416         int error, first;
  417 
  418         first = 1;
  419         error = sysctl_wire_old_buffer(req, 0);
  420         if (error != 0)
  421                 return (error);
  422         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
  423         TAILQ_FOREACH(m, &blacklist_head, listq) {
  424                 sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",",
  425                     (uintmax_t)m->phys_addr);
  426                 first = 0;
  427         }
  428         error = sbuf_finish(&sbuf);
  429         sbuf_delete(&sbuf);
  430         return (error);
  431 }
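/*
 * Editor's note (hypothetical output, not part of vm_page.c): the handler
 * above renders the blacklisted pages as a comma-separated list of physical
 * addresses, so reading the sysctl might look like:
 *
 *	$ sysctl vm.page_blacklist
 *	vm.page_blacklist: 0x7d4b1000,0x7d4b3000
 */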
  432 
  433 /*
  434  * Initialize a dummy page for use in scans of the specified paging queue.
  435  * In principle, this function only needs to set the flag PG_MARKER.
   436  * Nonetheless, it also write-busies the marker and initializes its hold
   437  * count to one as safety precautions.
  438  */
  439 static void
  440 vm_page_init_marker(vm_page_t marker, int queue, uint8_t aflags)
  441 {
  442 
  443         bzero(marker, sizeof(*marker));
  444         marker->flags = PG_MARKER;
  445         marker->aflags = aflags;
  446         marker->busy_lock = VPB_SINGLE_EXCLUSIVER;
  447         marker->queue = queue;
  448         marker->hold_count = 1;
  449 }
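/*
 * Editor's illustrative sketch (not part of vm_page.c): a queue scan can use
 * such a marker to remember its position across a queue unlock.  The marker
 * is linked after the page being visited, the queue lock is dropped, and the
 * scan later resumes from the marker.  "pq", "m", and "marker" are assumed
 * locals; other scans skip the marker because PG_MARKER is set.
 *
 *	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
 *	vm_pagequeue_unlock(pq);
 *	...do work that may block...
 *	vm_pagequeue_lock(pq);
 *	m = TAILQ_NEXT(&marker, plinks.q);
 *	TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
 */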
  450 
  451 static void
  452 vm_page_domain_init(int domain)
  453 {
  454         struct vm_domain *vmd;
  455         struct vm_pagequeue *pq;
  456         int i;
  457 
  458         vmd = VM_DOMAIN(domain);
  459         bzero(vmd, sizeof(*vmd));
  460         *__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) =
  461             "vm inactive pagequeue";
  462         *__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) =
  463             "vm active pagequeue";
  464         *__DECONST(char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) =
  465             "vm laundry pagequeue";
  466         *__DECONST(char **, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_name) =
  467             "vm unswappable pagequeue";
  468         vmd->vmd_domain = domain;
  469         vmd->vmd_page_count = 0;
  470         vmd->vmd_free_count = 0;
  471         vmd->vmd_segs = 0;
  472         vmd->vmd_oom = FALSE;
  473         for (i = 0; i < PQ_COUNT; i++) {
  474                 pq = &vmd->vmd_pagequeues[i];
  475                 TAILQ_INIT(&pq->pq_pl);
  476                 mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue",
  477                     MTX_DEF | MTX_DUPOK);
  478                 pq->pq_pdpages = 0;
  479                 vm_page_init_marker(&vmd->vmd_markers[i], i, 0);
  480         }
  481         mtx_init(&vmd->vmd_free_mtx, "vm page free queue", NULL, MTX_DEF);
  482         mtx_init(&vmd->vmd_pageout_mtx, "vm pageout lock", NULL, MTX_DEF);
  483         snprintf(vmd->vmd_name, sizeof(vmd->vmd_name), "%d", domain);
  484 
  485         /*
  486          * inacthead is used to provide FIFO ordering for LRU-bypassing
  487          * insertions.
  488          */
  489         vm_page_init_marker(&vmd->vmd_inacthead, PQ_INACTIVE, PGA_ENQUEUED);
  490         TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl,
  491             &vmd->vmd_inacthead, plinks.q);
  492 
  493         /*
  494          * The clock pages are used to implement active queue scanning without
  495          * requeues.  Scans start at clock[0], which is advanced after the scan
  496          * ends.  When the two clock hands meet, they are reset and scanning
  497          * resumes from the head of the queue.
  498          */
  499         vm_page_init_marker(&vmd->vmd_clock[0], PQ_ACTIVE, PGA_ENQUEUED);
  500         vm_page_init_marker(&vmd->vmd_clock[1], PQ_ACTIVE, PGA_ENQUEUED);
  501         TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl,
  502             &vmd->vmd_clock[0], plinks.q);
  503         TAILQ_INSERT_TAIL(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl,
  504             &vmd->vmd_clock[1], plinks.q);
  505 }
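/*
 * Editor's note (not part of vm_page.c): after the initialization above the
 * active queue is laid out as
 *
 *	head -> clock[0] -> (active pages) -> clock[1] -> tail
 *
 * and, per the comment above, scans advance clock[0] toward clock[1]; when
 * the two hands meet they are reset and scanning resumes from the head.
 */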
  506 
  507 /*
  508  * Initialize a physical page in preparation for adding it to the free
  509  * lists.
  510  */
  511 static void
  512 vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind)
  513 {
  514 
  515         m->object = NULL;
  516         m->wire_count = 0;
  517         m->busy_lock = VPB_UNBUSIED;
  518         m->hold_count = 0;
  519         m->flags = m->aflags = 0;
  520         m->phys_addr = pa;
  521         m->queue = PQ_NONE;
  522         m->psind = 0;
  523         m->segind = segind;
  524         m->order = VM_NFREEORDER;
  525         m->pool = VM_FREEPOOL_DEFAULT;
  526         m->valid = m->dirty = 0;
  527         pmap_page_init(m);
  528 }
  529 
  530 /*
  531  *      vm_page_startup:
  532  *
  533  *      Initializes the resident memory module.  Allocates physical memory for
  534  *      bootstrapping UMA and some data structures that are used to manage
  535  *      physical pages.  Initializes these structures, and populates the free
  536  *      page queues.
  537  */
  538 vm_offset_t
  539 vm_page_startup(vm_offset_t vaddr)
  540 {
  541         struct vm_phys_seg *seg;
  542         vm_page_t m;
  543         char *list, *listend;
  544         vm_offset_t mapped;
  545         vm_paddr_t end, high_avail, low_avail, new_end, page_range, size;
  546         vm_paddr_t biggestsize, last_pa, pa;
  547         u_long pagecount;
  548         int biggestone, i, segind;
  549 #if defined(__i386__) && defined(VM_PHYSSEG_DENSE)
  550         long ii;
  551 #endif
  552 
  553         biggestsize = 0;
  554         biggestone = 0;
  555         vaddr = round_page(vaddr);
  556 
  557         for (i = 0; phys_avail[i + 1]; i += 2) {
  558                 phys_avail[i] = round_page(phys_avail[i]);
  559                 phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
  560         }
  561         for (i = 0; phys_avail[i + 1]; i += 2) {
  562                 size = phys_avail[i + 1] - phys_avail[i];
  563                 if (size > biggestsize) {
  564                         biggestone = i;
  565                         biggestsize = size;
  566                 }
  567         }
  568 
  569         end = phys_avail[biggestone+1];
  570 
  571         /*
  572          * Initialize the page and queue locks.
  573          */
  574         mtx_init(&vm_domainset_lock, "vm domainset lock", NULL, MTX_DEF);
  575         for (i = 0; i < PA_LOCK_COUNT; i++)
  576                 mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF);
  577         for (i = 0; i < vm_ndomains; i++)
  578                 vm_page_domain_init(i);
  579 
  580         /*
  581          * Allocate memory for use when boot strapping the kernel memory
  582          * allocator.  Tell UMA how many zones we are going to create
  583          * before going fully functional.  UMA will add its zones.
  584          *
  585          * VM startup zones: vmem, vmem_btag, VM OBJECT, RADIX NODE, MAP,
  586          * KMAP ENTRY, MAP ENTRY, VMSPACE.
  587          */
  588         boot_pages = uma_startup_count(8);
  589 
  590 #ifndef UMA_MD_SMALL_ALLOC
  591         /* vmem_startup() calls uma_prealloc(). */
  592         boot_pages += vmem_startup_count();
  593         /* vm_map_startup() calls uma_prealloc(). */
  594         boot_pages += howmany(MAX_KMAP,
  595             UMA_SLAB_SPACE / sizeof(struct vm_map));
  596 
  597         /*
  598          * Before going fully functional kmem_init() does allocation
  599          * from "KMAP ENTRY" and vmem_create() does allocation from "vmem".
  600          */
  601         boot_pages += 2;
  602 #endif
  603         /*
  604          * CTFLAG_RDTUN doesn't work during the early boot process, so we must
  605          * manually fetch the value.
  606          */
  607         TUNABLE_INT_FETCH("vm.boot_pages", &boot_pages);
  608         new_end = end - (boot_pages * UMA_SLAB_SIZE);
  609         new_end = trunc_page(new_end);
  610         mapped = pmap_map(&vaddr, new_end, end,
  611             VM_PROT_READ | VM_PROT_WRITE);
  612         bzero((void *)mapped, end - new_end);
  613         uma_startup((void *)mapped, boot_pages);
  614 
  615 #ifdef WITNESS
  616         end = new_end;
  617         new_end = end - round_page(witness_startup_count());
  618         mapped = pmap_map(&vaddr, new_end, end,
  619             VM_PROT_READ | VM_PROT_WRITE);
  620         bzero((void *)mapped, end - new_end);
  621         witness_startup((void *)mapped);
  622 #endif
  623 
  624 #if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \
  625     defined(__i386__) || defined(__mips__) || defined(__riscv)
  626         /*
  627          * Allocate a bitmap to indicate that a random physical page
  628          * needs to be included in a minidump.
  629          *
  630          * The amd64 port needs this to indicate which direct map pages
  631          * need to be dumped, via calls to dump_add_page()/dump_drop_page().
  632          *
  633          * However, i386 still needs this workspace internally within the
  634          * minidump code.  In theory, they are not needed on i386, but are
  635          * included should the sf_buf code decide to use them.
  636          */
  637         last_pa = 0;
  638         for (i = 0; dump_avail[i + 1] != 0; i += 2)
  639                 if (dump_avail[i + 1] > last_pa)
  640                         last_pa = dump_avail[i + 1];
  641         page_range = last_pa / PAGE_SIZE;
  642         vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
  643         new_end -= vm_page_dump_size;
  644         vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
  645             new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
  646         bzero((void *)vm_page_dump, vm_page_dump_size);
  647 #else
  648         (void)last_pa;
  649 #endif
  650 #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \
  651     defined(__riscv)
  652         /*
  653          * Include the UMA bootstrap pages and vm_page_dump in a crash dump.
  654          * When pmap_map() uses the direct map, they are not automatically 
  655          * included.
  656          */
  657         for (pa = new_end; pa < end; pa += PAGE_SIZE)
  658                 dump_add_page(pa);
  659 #endif
  660         phys_avail[biggestone + 1] = new_end;
  661 #ifdef __amd64__
  662         /*
  663          * Request that the physical pages underlying the message buffer be
  664          * included in a crash dump.  Since the message buffer is accessed
  665          * through the direct map, they are not automatically included.
  666          */
  667         pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr);
  668         last_pa = pa + round_page(msgbufsize);
  669         while (pa < last_pa) {
  670                 dump_add_page(pa);
  671                 pa += PAGE_SIZE;
  672         }
  673 #endif
  674         /*
  675          * Compute the number of pages of memory that will be available for
  676          * use, taking into account the overhead of a page structure per page.
  677          * In other words, solve
  678          *      "available physical memory" - round_page(page_range *
  679          *          sizeof(struct vm_page)) = page_range * PAGE_SIZE 
  680          * for page_range.  
  681          */
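/*
 * Editor's note (worked example, not part of vm_page.c): the solution is
 * page_range = size / (PAGE_SIZE + sizeof(struct vm_page)), so the page
 * array consumes roughly sizeof(struct vm_page) / (PAGE_SIZE +
 * sizeof(struct vm_page)) of managed memory.  Assuming 4 KB pages and a
 * struct vm_page of roughly 100 bytes (typical of 64-bit platforms of this
 * vintage), that is on the order of 2.5% of RAM.
 */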
  682         low_avail = phys_avail[0];
  683         high_avail = phys_avail[1];
  684         for (i = 0; i < vm_phys_nsegs; i++) {
  685                 if (vm_phys_segs[i].start < low_avail)
  686                         low_avail = vm_phys_segs[i].start;
  687                 if (vm_phys_segs[i].end > high_avail)
  688                         high_avail = vm_phys_segs[i].end;
  689         }
  690         /* Skip the first chunk.  It is already accounted for. */
  691         for (i = 2; phys_avail[i + 1] != 0; i += 2) {
  692                 if (phys_avail[i] < low_avail)
  693                         low_avail = phys_avail[i];
  694                 if (phys_avail[i + 1] > high_avail)
  695                         high_avail = phys_avail[i + 1];
  696         }
  697         first_page = low_avail / PAGE_SIZE;
  698 #ifdef VM_PHYSSEG_SPARSE
  699         size = 0;
  700         for (i = 0; i < vm_phys_nsegs; i++)
  701                 size += vm_phys_segs[i].end - vm_phys_segs[i].start;
  702         for (i = 0; phys_avail[i + 1] != 0; i += 2)
  703                 size += phys_avail[i + 1] - phys_avail[i];
  704 #elif defined(VM_PHYSSEG_DENSE)
  705         size = high_avail - low_avail;
  706 #else
  707 #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
  708 #endif
  709 
  710 #ifdef VM_PHYSSEG_DENSE
  711         /*
  712          * In the VM_PHYSSEG_DENSE case, the number of pages can account for
  713          * the overhead of a page structure per page only if vm_page_array is
  714          * allocated from the last physical memory chunk.  Otherwise, we must
  715          * allocate page structures representing the physical memory
  716          * underlying vm_page_array, even though they will not be used.
  717          */
  718         if (new_end != high_avail)
  719                 page_range = size / PAGE_SIZE;
  720         else
  721 #endif
  722         {
  723                 page_range = size / (PAGE_SIZE + sizeof(struct vm_page));
  724 
  725                 /*
  726                  * If the partial bytes remaining are large enough for
  727                  * a page (PAGE_SIZE) without a corresponding
  728                  * 'struct vm_page', then new_end will contain an
  729                  * extra page after subtracting the length of the VM
  730                  * page array.  Compensate by subtracting an extra
  731                  * page from new_end.
  732                  */
  733                 if (size % (PAGE_SIZE + sizeof(struct vm_page)) >= PAGE_SIZE) {
  734                         if (new_end == high_avail)
  735                                 high_avail -= PAGE_SIZE;
  736                         new_end -= PAGE_SIZE;
  737                 }
  738         }
  739         end = new_end;
  740 
  741         /*
  742          * Reserve an unmapped guard page to trap access to vm_page_array[-1].
  743          * However, because this page is allocated from KVM, out-of-bounds
  744          * accesses using the direct map will not be trapped.
  745          */
  746         vaddr += PAGE_SIZE;
  747 
  748         /*
  749          * Allocate physical memory for the page structures, and map it.
  750          */
  751         new_end = trunc_page(end - page_range * sizeof(struct vm_page));
  752         mapped = pmap_map(&vaddr, new_end, end,
  753             VM_PROT_READ | VM_PROT_WRITE);
  754         vm_page_array = (vm_page_t)mapped;
  755         vm_page_array_size = page_range;
  756 
  757 #if VM_NRESERVLEVEL > 0
  758         /*
  759          * Allocate physical memory for the reservation management system's
  760          * data structures, and map it.
  761          */
  762         if (high_avail == end)
  763                 high_avail = new_end;
  764         new_end = vm_reserv_startup(&vaddr, new_end, high_avail);
  765 #endif
  766 #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \
  767     defined(__riscv)
  768         /*
  769          * Include vm_page_array and vm_reserv_array in a crash dump.
  770          */
  771         for (pa = new_end; pa < end; pa += PAGE_SIZE)
  772                 dump_add_page(pa);
  773 #endif
  774         phys_avail[biggestone + 1] = new_end;
  775 
  776         /*
  777          * Add physical memory segments corresponding to the available
  778          * physical pages.
  779          */
  780         for (i = 0; phys_avail[i + 1] != 0; i += 2)
  781                 vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]);
  782 
  783         /*
  784          * Initialize the physical memory allocator.
  785          */
  786         vm_phys_init();
  787 
  788         /*
  789          * Initialize the page structures and add every available page to the
  790          * physical memory allocator's free lists.
  791          */
  792 #if defined(__i386__) && defined(VM_PHYSSEG_DENSE)
  793         for (ii = 0; ii < vm_page_array_size; ii++) {
  794                 m = &vm_page_array[ii];
  795                 vm_page_init_page(m, (first_page + ii) << PAGE_SHIFT, 0);
  796                 m->flags = PG_FICTITIOUS;
  797         }
  798 #endif
  799         vm_cnt.v_page_count = 0;
  800         for (segind = 0; segind < vm_phys_nsegs; segind++) {
  801                 seg = &vm_phys_segs[segind];
  802                 for (m = seg->first_page, pa = seg->start; pa < seg->end;
  803                     m++, pa += PAGE_SIZE)
  804                         vm_page_init_page(m, pa, segind);
  805 
  806                 /*
  807                  * Add the segment to the free lists only if it is covered by
  808                  * one of the ranges in phys_avail.  Because we've added the
  809                  * ranges to the vm_phys_segs array, we can assume that each
  810                  * segment is either entirely contained in one of the ranges,
  811                  * or doesn't overlap any of them.
  812                  */
  813                 for (i = 0; phys_avail[i + 1] != 0; i += 2) {
  814                         struct vm_domain *vmd;
  815 
  816                         if (seg->start < phys_avail[i] ||
  817                             seg->end > phys_avail[i + 1])
  818                                 continue;
  819 
  820                         m = seg->first_page;
  821                         pagecount = (u_long)atop(seg->end - seg->start);
  822 
  823                         vmd = VM_DOMAIN(seg->domain);
  824                         vm_domain_free_lock(vmd);
  825                         vm_phys_free_contig(m, pagecount);
  826                         vm_domain_free_unlock(vmd);
  827                         vm_domain_freecnt_inc(vmd, pagecount);
  828                         vm_cnt.v_page_count += (u_int)pagecount;
  829 
  830                         vmd = VM_DOMAIN(seg->domain);
  831                         vmd->vmd_page_count += (u_int)pagecount;
  832                         vmd->vmd_segs |= 1UL << m->segind;
  833                         break;
  834                 }
  835         }
  836 
  837         /*
  838          * Remove blacklisted pages from the physical memory allocator.
  839          */
  840         TAILQ_INIT(&blacklist_head);
  841         vm_page_blacklist_load(&list, &listend);
  842         vm_page_blacklist_check(list, listend);
  843 
  844         list = kern_getenv("vm.blacklist");
  845         vm_page_blacklist_check(list, NULL);
  846 
  847         freeenv(list);
  848 #if VM_NRESERVLEVEL > 0
  849         /*
  850          * Initialize the reservation management system.
  851          */
  852         vm_reserv_init();
  853 #endif
  854 
  855         return (vaddr);
  856 }
  857 
  858 void
  859 vm_page_reference(vm_page_t m)
  860 {
  861 
  862         vm_page_aflag_set(m, PGA_REFERENCED);
  863 }
  864 
  865 /*
  866  *      vm_page_busy_downgrade:
  867  *
  868  *      Downgrade an exclusive busy page into a single shared busy page.
  869  */
  870 void
  871 vm_page_busy_downgrade(vm_page_t m)
  872 {
  873         u_int x;
  874         bool locked;
  875 
  876         vm_page_assert_xbusied(m);
  877         locked = mtx_owned(vm_page_lockptr(m));
  878 
  879         for (;;) {
  880                 x = m->busy_lock;
  881                 x &= VPB_BIT_WAITERS;
  882                 if (x != 0 && !locked)
  883                         vm_page_lock(m);
  884                 if (atomic_cmpset_rel_int(&m->busy_lock,
  885                     VPB_SINGLE_EXCLUSIVER | x, VPB_SHARERS_WORD(1)))
  886                         break;
  887                 if (x != 0 && !locked)
  888                         vm_page_unlock(m);
  889         }
  890         if (x != 0) {
  891                 wakeup(m);
  892                 if (!locked)
  893                         vm_page_unlock(m);
  894         }
  895 }
  896 
  897 /*
  898  *      vm_page_sbusied:
  899  *
  900  *      Return a positive value if the page is shared busied, 0 otherwise.
  901  */
  902 int
  903 vm_page_sbusied(vm_page_t m)
  904 {
  905         u_int x;
  906 
  907         x = m->busy_lock;
  908         return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED);
  909 }
  910 
  911 /*
  912  *      vm_page_sunbusy:
  913  *
  914  *      Shared unbusy a page.
  915  */
  916 void
  917 vm_page_sunbusy(vm_page_t m)
  918 {
  919         u_int x;
  920 
  921         vm_page_lock_assert(m, MA_NOTOWNED);
  922         vm_page_assert_sbusied(m);
  923 
  924         for (;;) {
  925                 x = m->busy_lock;
  926                 if (VPB_SHARERS(x) > 1) {
  927                         if (atomic_cmpset_int(&m->busy_lock, x,
  928                             x - VPB_ONE_SHARER))
  929                                 break;
  930                         continue;
  931                 }
  932                 if ((x & VPB_BIT_WAITERS) == 0) {
  933                         KASSERT(x == VPB_SHARERS_WORD(1),
  934                             ("vm_page_sunbusy: invalid lock state"));
  935                         if (atomic_cmpset_int(&m->busy_lock,
  936                             VPB_SHARERS_WORD(1), VPB_UNBUSIED))
  937                                 break;
  938                         continue;
  939                 }
  940                 KASSERT(x == (VPB_SHARERS_WORD(1) | VPB_BIT_WAITERS),
  941                     ("vm_page_sunbusy: invalid lock state for waiters"));
  942 
  943                 vm_page_lock(m);
  944                 if (!atomic_cmpset_int(&m->busy_lock, x, VPB_UNBUSIED)) {
  945                         vm_page_unlock(m);
  946                         continue;
  947                 }
  948                 wakeup(m);
  949                 vm_page_unlock(m);
  950                 break;
  951         }
  952 }
  953 
  954 /*
  955  *      vm_page_busy_sleep:
  956  *
  957  *      Sleep and release the page lock, using the page pointer as wchan.
   958  *      This is used to implement the hard path of the busying mechanism.
  959  *
  960  *      The given page must be locked.
  961  *
  962  *      If nonshared is true, sleep only if the page is xbusy.
  963  */
  964 void
  965 vm_page_busy_sleep(vm_page_t m, const char *wmesg, bool nonshared)
  966 {
  967         u_int x;
  968 
  969         vm_page_assert_locked(m);
  970 
  971         x = m->busy_lock;
  972         if (x == VPB_UNBUSIED || (nonshared && (x & VPB_BIT_SHARED) != 0) ||
  973             ((x & VPB_BIT_WAITERS) == 0 &&
  974             !atomic_cmpset_int(&m->busy_lock, x, x | VPB_BIT_WAITERS))) {
  975                 vm_page_unlock(m);
  976                 return;
  977         }
  978         msleep(m, vm_page_lockptr(m), PVM | PDROP, wmesg, 0);
  979 }
  980 
  981 /*
  982  *      vm_page_trysbusy:
  983  *
  984  *      Try to shared busy a page.
  985  *      If the operation succeeds 1 is returned otherwise 0.
  986  *      The operation never sleeps.
  987  */
  988 int
  989 vm_page_trysbusy(vm_page_t m)
  990 {
  991         u_int x;
  992 
  993         for (;;) {
  994                 x = m->busy_lock;
  995                 if ((x & VPB_BIT_SHARED) == 0)
  996                         return (0);
  997                 if (atomic_cmpset_acq_int(&m->busy_lock, x, x + VPB_ONE_SHARER))
  998                         return (1);
  999         }
 1000 }
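/*
 * Editor's illustrative sketch (not part of vm_page.c): a hypothetical
 * non-blocking reader pairs vm_page_trysbusy() with vm_page_sunbusy():
 *
 *	if (vm_page_trysbusy(m)) {
 *		...examine the page contents...
 *		vm_page_sunbusy(m);
 *	} else {
 *		...the page is exclusively busied; back off or sleep...
 *	}
 */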
 1001 
 1002 static void
 1003 vm_page_xunbusy_locked(vm_page_t m)
 1004 {
 1005 
 1006         vm_page_assert_xbusied(m);
 1007         vm_page_assert_locked(m);
 1008 
 1009         atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
 1010         /* There is a waiter, do wakeup() instead of vm_page_flash(). */
 1011         wakeup(m);
 1012 }
 1013 
 1014 void
 1015 vm_page_xunbusy_maybelocked(vm_page_t m)
 1016 {
 1017         bool lockacq;
 1018 
 1019         vm_page_assert_xbusied(m);
 1020 
 1021         /*
 1022          * Fast path for unbusy.  If it succeeds, we know that there
 1023          * are no waiters, so we do not need a wakeup.
 1024          */
 1025         if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER,
 1026             VPB_UNBUSIED))
 1027                 return;
 1028 
 1029         lockacq = !mtx_owned(vm_page_lockptr(m));
 1030         if (lockacq)
 1031                 vm_page_lock(m);
 1032         vm_page_xunbusy_locked(m);
 1033         if (lockacq)
 1034                 vm_page_unlock(m);
 1035 }
 1036 
 1037 /*
 1038  *      vm_page_xunbusy_hard:
 1039  *
  1040  *      Called when the first attempt to exclusively unbusy a page has failed.
 1041  *      It is assumed that the waiters bit is on.
 1042  */
 1043 void
 1044 vm_page_xunbusy_hard(vm_page_t m)
 1045 {
 1046 
 1047         vm_page_assert_xbusied(m);
 1048 
 1049         vm_page_lock(m);
 1050         vm_page_xunbusy_locked(m);
 1051         vm_page_unlock(m);
 1052 }
 1053 
 1054 /*
 1055  *      vm_page_flash:
 1056  *
 1057  *      Wakeup anyone waiting for the page.
 1058  *      The ownership bits do not change.
 1059  *
 1060  *      The given page must be locked.
 1061  */
 1062 void
 1063 vm_page_flash(vm_page_t m)
 1064 {
 1065         u_int x;
 1066 
 1067         vm_page_lock_assert(m, MA_OWNED);
 1068 
 1069         for (;;) {
 1070                 x = m->busy_lock;
 1071                 if ((x & VPB_BIT_WAITERS) == 0)
 1072                         return;
 1073                 if (atomic_cmpset_int(&m->busy_lock, x,
 1074                     x & (~VPB_BIT_WAITERS)))
 1075                         break;
 1076         }
 1077         wakeup(m);
 1078 }
 1079 
 1080 /*
 1081  * Avoid releasing and reacquiring the same page lock.
 1082  */
 1083 void
 1084 vm_page_change_lock(vm_page_t m, struct mtx **mtx)
 1085 {
 1086         struct mtx *mtx1;
 1087 
 1088         mtx1 = vm_page_lockptr(m);
 1089         if (*mtx == mtx1)
 1090                 return;
 1091         if (*mtx != NULL)
 1092                 mtx_unlock(*mtx);
 1093         *mtx = mtx1;
 1094         mtx_lock(mtx1);
 1095 }
 1096 
 1097 /*
  1098  * Keep the page from being freed by the page daemon.  This has much
  1099  * the same effect as wiring, but with much lower overhead, and it
  1100  * should be used only for *very* temporary holds (short-lived
  1101  * "wiring").
 1102  */
 1103 void
 1104 vm_page_hold(vm_page_t mem)
 1105 {
 1106 
 1107         vm_page_lock_assert(mem, MA_OWNED);
 1108         mem->hold_count++;
 1109 }
 1110 
 1111 void
 1112 vm_page_unhold(vm_page_t mem)
 1113 {
 1114 
 1115         vm_page_lock_assert(mem, MA_OWNED);
 1116         KASSERT(mem->hold_count >= 1, ("vm_page_unhold: hold count < 0!!!"));
 1117         --mem->hold_count;
 1118         if (mem->hold_count == 0 && (mem->flags & PG_UNHOLDFREE) != 0)
 1119                 vm_page_free_toq(mem);
 1120 }
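/*
 * Editor's illustrative sketch (not part of vm_page.c): a hypothetical
 * short-lived hold, bracketing work during which the page must not be freed
 * but a full wiring would be overkill.  Both calls require the page lock.
 *
 *	vm_page_lock(m);
 *	vm_page_hold(m);
 *	vm_page_unlock(m);
 *	...brief use of the page...
 *	vm_page_lock(m);
 *	vm_page_unhold(m);
 *	vm_page_unlock(m);
 */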
 1121 
 1122 /*
 1123  *      vm_page_unhold_pages:
 1124  *
 1125  *      Unhold each of the pages that is referenced by the given array.
 1126  */
 1127 void
 1128 vm_page_unhold_pages(vm_page_t *ma, int count)
 1129 {
 1130         struct mtx *mtx;
 1131 
 1132         mtx = NULL;
 1133         for (; count != 0; count--) {
 1134                 vm_page_change_lock(*ma, &mtx);
 1135                 vm_page_unhold(*ma);
 1136                 ma++;
 1137         }
 1138         if (mtx != NULL)
 1139                 mtx_unlock(mtx);
 1140 }
 1141 
 1142 vm_page_t
 1143 PHYS_TO_VM_PAGE(vm_paddr_t pa)
 1144 {
 1145         vm_page_t m;
 1146 
 1147 #ifdef VM_PHYSSEG_SPARSE
 1148         m = vm_phys_paddr_to_vm_page(pa);
 1149         if (m == NULL)
 1150                 m = vm_phys_fictitious_to_vm_page(pa);
 1151         return (m);
 1152 #elif defined(VM_PHYSSEG_DENSE)
 1153         long pi;
 1154 
 1155         pi = atop(pa);
 1156         if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
 1157                 m = &vm_page_array[pi - first_page];
 1158                 return (m);
 1159         }
 1160         return (vm_phys_fictitious_to_vm_page(pa));
 1161 #else
 1162 #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
 1163 #endif
 1164 }
 1165 
 1166 /*
 1167  *      vm_page_getfake:
 1168  *
 1169  *      Create a fictitious page with the specified physical address and
  1170  *      memory attribute.  The memory attribute is the only machine-
 1171  *      dependent aspect of a fictitious page that must be initialized.
 1172  */
 1173 vm_page_t
 1174 vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
 1175 {
 1176         vm_page_t m;
 1177 
 1178         m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
 1179         vm_page_initfake(m, paddr, memattr);
 1180         return (m);
 1181 }
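/*
 * Editor's illustrative sketch (not part of vm_page.c): fictitious pages are
 * typically used by device pagers to map device memory that has no entry in
 * vm_page_array.  A hypothetical lifecycle ("dev_paddr" and the memory
 * attribute are assumptions):
 *
 *	m = vm_page_getfake(dev_paddr, VM_MEMATTR_UNCACHEABLE);
 *	...insert "m" into the device object at the faulting pindex...
 *	...later, when the mapping is torn down...
 *	vm_page_putfake(m);
 */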
 1182 
 1183 void
 1184 vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
 1185 {
 1186 
 1187         if ((m->flags & PG_FICTITIOUS) != 0) {
 1188                 /*
 1189                  * The page's memattr might have changed since the
 1190                  * previous initialization.  Update the pmap to the
 1191                  * new memattr.
 1192                  */
 1193                 goto memattr;
 1194         }
 1195         m->phys_addr = paddr;
 1196         m->queue = PQ_NONE;
 1197         /* Fictitious pages don't use "segind". */
 1198         m->flags = PG_FICTITIOUS;
 1199         /* Fictitious pages don't use "order" or "pool". */
 1200         m->oflags = VPO_UNMANAGED;
 1201         m->busy_lock = VPB_SINGLE_EXCLUSIVER;
 1202         m->wire_count = 1;
 1203         pmap_page_init(m);
 1204 memattr:
 1205         pmap_page_set_memattr(m, memattr);
 1206 }
 1207 
 1208 /*
 1209  *      vm_page_putfake:
 1210  *
 1211  *      Release a fictitious page.
 1212  */
 1213 void
 1214 vm_page_putfake(vm_page_t m)
 1215 {
 1216 
 1217         KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m));
 1218         KASSERT((m->flags & PG_FICTITIOUS) != 0,
 1219             ("vm_page_putfake: bad page %p", m));
 1220         uma_zfree(fakepg_zone, m);
 1221 }
 1222 
 1223 /*
 1224  *      vm_page_updatefake:
 1225  *
 1226  *      Update the given fictitious page to the specified physical address and
 1227  *      memory attribute.
 1228  */
 1229 void
 1230 vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
 1231 {
 1232 
 1233         KASSERT((m->flags & PG_FICTITIOUS) != 0,
 1234             ("vm_page_updatefake: bad page %p", m));
 1235         m->phys_addr = paddr;
 1236         pmap_page_set_memattr(m, memattr);
 1237 }
 1238 
 1239 /*
 1240  *      vm_page_free:
 1241  *
 1242  *      Free a page.
 1243  */
 1244 void
 1245 vm_page_free(vm_page_t m)
 1246 {
 1247 
 1248         m->flags &= ~PG_ZERO;
 1249         vm_page_free_toq(m);
 1250 }
 1251 
 1252 /*
 1253  *      vm_page_free_zero:
 1254  *
  1255  *      Free a page to the zeroed-pages queue
 1256  */
 1257 void
 1258 vm_page_free_zero(vm_page_t m)
 1259 {
 1260 
 1261         m->flags |= PG_ZERO;
 1262         vm_page_free_toq(m);
 1263 }
 1264 
 1265 /*
 1266  * Unbusy and handle the page queueing for a page from a getpages request that
 1267  * was optionally read ahead or behind.
 1268  */
 1269 void
 1270 vm_page_readahead_finish(vm_page_t m)
 1271 {
 1272 
 1273         /* We shouldn't put invalid pages on queues. */
 1274         KASSERT(m->valid != 0, ("%s: %p is invalid", __func__, m));
 1275 
 1276         /*
  1277          * Since the page is not the one actually needed, whether it should
 1278          * be activated or deactivated is not obvious.  Empirical results
 1279          * have shown that deactivating the page is usually the best choice,
 1280          * unless the page is wanted by another thread.
 1281          */
 1282         vm_page_lock(m);
 1283         if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
 1284                 vm_page_activate(m);
 1285         else
 1286                 vm_page_deactivate(m);
 1287         vm_page_unlock(m);
 1288         vm_page_xunbusy(m);
 1289 }
 1290 
 1291 /*
 1292  *      vm_page_sleep_if_busy:
 1293  *
 1294  *      Sleep and release the page queues lock if the page is busied.
 1295  *      Returns TRUE if the thread slept.
 1296  *
 1297  *      The given page must be unlocked and object containing it must
 1298  *      be locked.
 1299  */
 1300 int
 1301 vm_page_sleep_if_busy(vm_page_t m, const char *msg)
 1302 {
 1303         vm_object_t obj;
 1304 
 1305         vm_page_lock_assert(m, MA_NOTOWNED);
 1306         VM_OBJECT_ASSERT_WLOCKED(m->object);
 1307 
 1308         if (vm_page_busied(m)) {
 1309                 /*
  1310                  * The page-specific object must be cached because page
  1311                  * identity can change during the sleep, which would
  1312                  * otherwise cause a different object to be re-locked.
  1313                  * It is assumed that the callers already hold a
  1314                  * reference to the object.
 1315                  */
 1316                 obj = m->object;
 1317                 vm_page_lock(m);
 1318                 VM_OBJECT_WUNLOCK(obj);
 1319                 vm_page_busy_sleep(m, msg, false);
 1320                 VM_OBJECT_WLOCK(obj);
 1321                 return (TRUE);
 1322         }
 1323         return (FALSE);
 1324 }
 1325 
 1326 /*
 1327  *      vm_page_dirty_KBI:              [ internal use only ]
 1328  *
 1329  *      Set all bits in the page's dirty field.
 1330  *
 1331  *      The object containing the specified page must be locked if the
 1332  *      call is made from the machine-independent layer.
 1333  *
 1334  *      See vm_page_clear_dirty_mask().
 1335  *
 1336  *      This function should only be called by vm_page_dirty().
 1337  */
 1338 void
 1339 vm_page_dirty_KBI(vm_page_t m)
 1340 {
 1341 
 1342         /* Refer to this operation by its public name. */
 1343         KASSERT(m->valid == VM_PAGE_BITS_ALL,
 1344             ("vm_page_dirty: page is invalid!"));
 1345         m->dirty = VM_PAGE_BITS_ALL;
 1346 }
 1347 
 1348 /*
 1349  *      vm_page_insert:         [ internal use only ]
 1350  *
 1351  *      Inserts the given mem entry into the object and object list.
 1352  *
 1353  *      The object must be locked.
 1354  */
 1355 int
 1356 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
 1357 {
 1358         vm_page_t mpred;
 1359 
 1360         VM_OBJECT_ASSERT_WLOCKED(object);
 1361         mpred = vm_radix_lookup_le(&object->rtree, pindex);
 1362         return (vm_page_insert_after(m, object, pindex, mpred));
 1363 }
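/*
 * Editor's illustrative sketch (not part of vm_page.c): insertion is done
 * with the object write-locked, and the caller must be prepared for failure,
 * since the radix-trie insertion can fail to allocate a node:
 *
 *	VM_OBJECT_WLOCK(object);
 *	if (vm_page_insert(m, object, pindex) != 0) {
 *		...back out, e.g. free "m"...
 *	}
 *	VM_OBJECT_WUNLOCK(object);
 */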
 1364 
 1365 /*
 1366  *      vm_page_insert_after:
 1367  *
 1368  *      Inserts the page "m" into the specified object at offset "pindex".
 1369  *
 1370  *      The page "mpred" must immediately precede the offset "pindex" within
 1371  *      the specified object.
 1372  *
 1373  *      The object must be locked.
 1374  */
 1375 static int
 1376 vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex,
 1377     vm_page_t mpred)
 1378 {
 1379         vm_page_t msucc;
 1380 
 1381         VM_OBJECT_ASSERT_WLOCKED(object);
 1382         KASSERT(m->object == NULL,
 1383             ("vm_page_insert_after: page already inserted"));
 1384         if (mpred != NULL) {
 1385                 KASSERT(mpred->object == object,
 1386                     ("vm_page_insert_after: object doesn't contain mpred"));
 1387                 KASSERT(mpred->pindex < pindex,
 1388                     ("vm_page_insert_after: mpred doesn't precede pindex"));
 1389                 msucc = TAILQ_NEXT(mpred, listq);
 1390         } else
 1391                 msucc = TAILQ_FIRST(&object->memq);
 1392         if (msucc != NULL)
 1393                 KASSERT(msucc->pindex > pindex,
 1394                     ("vm_page_insert_after: msucc doesn't succeed pindex"));
 1395 
 1396         /*
 1397          * Record the object/offset pair in this page
 1398          */
 1399         m->object = object;
 1400         m->pindex = pindex;
 1401 
 1402         /*
 1403          * Now link into the object's ordered list of backed pages.
 1404          */
 1405         if (vm_radix_insert(&object->rtree, m)) {
 1406                 m->object = NULL;
 1407                 m->pindex = 0;
 1408                 return (1);
 1409         }
 1410         vm_page_insert_radixdone(m, object, mpred);
 1411         return (0);
 1412 }
 1413 
 1414 /*
 1415  *      vm_page_insert_radixdone:
 1416  *
 1417  *      Complete page "m" insertion into the specified object after the
 1418  *      radix trie hooking.
 1419  *
 1420  *      The page "mpred" must precede the offset "m->pindex" within the
 1421  *      specified object.
 1422  *
 1423  *      The object must be locked.
 1424  */
 1425 static void
 1426 vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred)
 1427 {
 1428 
 1429         VM_OBJECT_ASSERT_WLOCKED(object);
 1430         KASSERT(object != NULL && m->object == object,
 1431             ("vm_page_insert_radixdone: page %p has inconsistent object", m));
 1432         if (mpred != NULL) {
 1433                 KASSERT(mpred->object == object,
 1434                     ("vm_page_insert_after: object doesn't contain mpred"));
 1435                 KASSERT(mpred->pindex < m->pindex,
 1436                     ("vm_page_insert_after: mpred doesn't precede pindex"));
 1437         }
 1438 
 1439         if (mpred != NULL)
 1440                 TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq);
 1441         else
 1442                 TAILQ_INSERT_HEAD(&object->memq, m, listq);
 1443 
 1444         /*
 1445          * Show that the object has one more resident page.
 1446          */
 1447         object->resident_page_count++;
 1448 
 1449         /*
 1450          * Hold the vnode until the last page is released.
 1451          */
 1452         if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
 1453                 vhold(object->handle);
 1454 
 1455         /*
 1456          * Since we are inserting a new and possibly dirty page,
 1457          * update the object's OBJ_MIGHTBEDIRTY flag.
 1458          */
 1459         if (pmap_page_is_write_mapped(m))
 1460                 vm_object_set_writeable_dirty(object);
 1461 }
 1462 
 1463 /*
 1464  *      vm_page_remove:
 1465  *
 1466  *      Removes the specified page from its containing object, but does not
 1467  *      invalidate any backing storage.  Return true if the page may be safely
 1468  *      freed and false otherwise.
 1469  *
 1470  *      The object must be locked.  The page must be locked if it is managed.
 1471  */
 1472 bool
 1473 vm_page_remove(vm_page_t m)
 1474 {
 1475         vm_object_t object;
 1476         vm_page_t mrem;
 1477 
 1478         object = m->object;
 1479 
 1480         if ((m->oflags & VPO_UNMANAGED) == 0)
 1481                 vm_page_assert_locked(m);
 1482         VM_OBJECT_ASSERT_WLOCKED(object);
 1483         if (vm_page_xbusied(m))
 1484                 vm_page_xunbusy_maybelocked(m);
 1485         mrem = vm_radix_remove(&object->rtree, m->pindex);
 1486         KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m));
 1487 
 1488         /*
 1489          * Now remove from the object's list of backed pages.
 1490          */
 1491         TAILQ_REMOVE(&object->memq, m, listq);
 1492 
 1493         /*
 1494          * And show that the object has one fewer resident page.
 1495          */
 1496         object->resident_page_count--;
 1497 
 1498         /*
 1499          * The vnode may now be recycled.
 1500          */
 1501         if (object->resident_page_count == 0 && object->type == OBJT_VNODE)
 1502                 vdrop(object->handle);
 1503 
 1504         m->object = NULL;
 1505         return (!vm_page_wired(m));
 1506 }
 1507 
 1508 /*
 1509  *      vm_page_lookup:
 1510  *
 1511  *      Returns the page associated with the object/offset
 1512  *      pair specified; if none is found, NULL is returned.
 1513  *
 1514  *      The object must be locked.
 1515  */
 1516 vm_page_t
 1517 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
 1518 {
 1519 
 1520         VM_OBJECT_ASSERT_LOCKED(object);
 1521         return (vm_radix_lookup(&object->rtree, pindex));
 1522 }
 1523 
 1524 /*
 1525  *      vm_page_find_least:
 1526  *
 1527  *      Returns the page associated with the object with least pindex
 1528  *      greater than or equal to the parameter pindex, or NULL.
 1529  *
 1530  *      The object must be locked.
 1531  */
 1532 vm_page_t
 1533 vm_page_find_least(vm_object_t object, vm_pindex_t pindex)
 1534 {
 1535         vm_page_t m;
 1536 
 1537         VM_OBJECT_ASSERT_LOCKED(object);
 1538         if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex)
 1539                 m = vm_radix_lookup_ge(&object->rtree, pindex);
 1540         return (m);
 1541 }
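
/*
 * Example (illustrative sketch, not from the original source): a caller
 * typically walks the resident pages of a locked object within a pindex
 * range by seeding the traversal with vm_page_find_least() and following
 * the sorted "listq" links.  The function name and parameters below are
 * hypothetical.
 */
static void
example_scan_resident_range(vm_object_t object, vm_pindex_t start,
    vm_pindex_t end)
{
        vm_page_t m;

        VM_OBJECT_ASSERT_LOCKED(object);
        for (m = vm_page_find_least(object, start);
            m != NULL && m->pindex < end; m = TAILQ_NEXT(m, listq)) {
                /* Operate on the resident page "m". */
        }
}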
 1542 
 1543 /*
 1544  * Returns the given page's successor (by pindex) within the object if it is
 1545  * resident; if none is found, NULL is returned.
 1546  *
 1547  * The object must be locked.
 1548  */
 1549 vm_page_t
 1550 vm_page_next(vm_page_t m)
 1551 {
 1552         vm_page_t next;
 1553 
 1554         VM_OBJECT_ASSERT_LOCKED(m->object);
 1555         if ((next = TAILQ_NEXT(m, listq)) != NULL) {
 1556                 MPASS(next->object == m->object);
 1557                 if (next->pindex != m->pindex + 1)
 1558                         next = NULL;
 1559         }
 1560         return (next);
 1561 }
 1562 
 1563 /*
 1564  * Returns the given page's predecessor (by pindex) within the object if it is
 1565  * resident; if none is found, NULL is returned.
 1566  *
 1567  * The object must be locked.
 1568  */
 1569 vm_page_t
 1570 vm_page_prev(vm_page_t m)
 1571 {
 1572         vm_page_t prev;
 1573 
 1574         VM_OBJECT_ASSERT_LOCKED(m->object);
 1575         if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) {
 1576                 MPASS(prev->object == m->object);
 1577                 if (prev->pindex != m->pindex - 1)
 1578                         prev = NULL;
 1579         }
 1580         return (prev);
 1581 }
 1582 
 1583 /*
 1584  * Uses the page mnew as a replacement for an existing page at index
 1585  * pindex, which must already be present in the object.
 1586  *
 1587  * The existing page must not be on a paging queue.
 1588  */
 1589 vm_page_t
 1590 vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex)
 1591 {
 1592         vm_page_t mold;
 1593 
 1594         VM_OBJECT_ASSERT_WLOCKED(object);
 1595         KASSERT(mnew->object == NULL,
 1596             ("vm_page_replace: page %p already in object", mnew));
 1597         KASSERT(mnew->queue == PQ_NONE,
 1598             ("vm_page_replace: new page %p is on a paging queue", mnew));
 1599 
 1600         /*
 1601          * This function mostly follows vm_page_insert() and
 1602          * vm_page_remove() without the radix, object count and vnode
 1603          * dance.  Double check such functions for more comments.
 1604          */
 1605 
 1606         mnew->object = object;
 1607         mnew->pindex = pindex;
 1608         mold = vm_radix_replace(&object->rtree, mnew);
 1609         KASSERT(mold->queue == PQ_NONE,
 1610             ("vm_page_replace: old page %p is on a paging queue", mold));
 1611 
 1612         /* Keep the resident page list in sorted order. */
 1613         TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq);
 1614         TAILQ_REMOVE(&object->memq, mold, listq);
 1615 
 1616         mold->object = NULL;
 1617         vm_page_xunbusy_maybelocked(mold);
 1618 
 1619         /*
 1620          * The object's resident_page_count does not change because we have
 1621          * swapped one page for another, but OBJ_MIGHTBEDIRTY may need to be set.
 1622          */
 1623         if (pmap_page_is_write_mapped(mnew))
 1624                 vm_object_set_writeable_dirty(object);
 1625         return (mold);
 1626 }
 1627 
 1628 /*
 1629  *      vm_page_rename:
 1630  *
 1631  *      Move the given memory entry from its
 1632  *      current object to the specified target object/offset.
 1633  *
 1634  *      Note: swap associated with the page must be invalidated by the move.  We
 1635  *            have to do this for several reasons:  (1) we aren't freeing the
 1636  *            page, (2) we are dirtying the page, (3) the VM system is probably
 1637  *            moving the page from object A to B, and will then later move
 1638  *            the backing store from A to B and we can't have a conflict.
 1639  *
 1640  *      Note: we *always* dirty the page.  It is necessary both for the
 1641  *            fact that we moved it, and because we may be invalidating
 1642  *            swap.
 1643  *
 1644  *      The objects must be locked.
 1645  */
 1646 int
 1647 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
 1648 {
 1649         vm_page_t mpred;
 1650         vm_pindex_t opidx;
 1651 
 1652         VM_OBJECT_ASSERT_WLOCKED(new_object);
 1653 
 1654         mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex);
 1655         KASSERT(mpred == NULL || mpred->pindex != new_pindex,
 1656             ("vm_page_rename: pindex already renamed"));
 1657 
 1658         /*
 1659          * Open-code a version of vm_page_insert() that does not depend on its
 1660          * own mpred lookup and can take shortcuts on the implementation details
 1661          * of that function.
 1662          */
 1663         opidx = m->pindex;
 1664         m->pindex = new_pindex;
 1665         if (vm_radix_insert(&new_object->rtree, m)) {
 1666                 m->pindex = opidx;
 1667                 return (1);
 1668         }
 1669 
 1670         /*
 1671          * The operation cannot fail anymore.  The removal must happen before
 1672          * the listq iterator is tainted.
 1673          */
 1674         m->pindex = opidx;
 1675         vm_page_lock(m);
 1676         (void)vm_page_remove(m);
 1677 
 1678         /* Return back to the new pindex to complete vm_page_insert(). */
 1679         m->pindex = new_pindex;
 1680         m->object = new_object;
 1681         vm_page_unlock(m);
 1682         vm_page_insert_radixdone(m, new_object, mpred);
 1683         vm_page_dirty(m);
 1684         return (0);
 1685 }
 1686 
 1687 /*
 1688  *      vm_page_alloc:
 1689  *
 1690  *      Allocate and return a page that is associated with the specified
 1691  *      object and offset pair.  By default, this page is exclusive busied.
 1692  *
 1693  *      The caller must always specify an allocation class.
 1694  *
 1695  *      allocation classes:
 1696  *      VM_ALLOC_NORMAL         normal process request
 1697  *      VM_ALLOC_SYSTEM         system *really* needs a page
 1698  *      VM_ALLOC_INTERRUPT      interrupt time request
 1699  *
 1700  *      optional allocation flags:
 1701  *      VM_ALLOC_COUNT(number)  the number of additional pages that the caller
 1702  *                              intends to allocate
 1703  *      VM_ALLOC_NOBUSY         do not exclusive busy the page
 1704  *      VM_ALLOC_NODUMP         do not include the page in a kernel core dump
 1705  *      VM_ALLOC_NOOBJ          page is not associated with an object and
 1706  *                              should not be exclusive busy
 1707  *      VM_ALLOC_SBUSY          shared busy the allocated page
 1708  *      VM_ALLOC_WIRED          wire the allocated page
 1709  *      VM_ALLOC_ZERO           prefer a zeroed page
 1710  */
 1711 vm_page_t
 1712 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
 1713 {
 1714 
 1715         return (vm_page_alloc_after(object, pindex, req, object != NULL ?
 1716             vm_radix_lookup_le(&object->rtree, pindex) : NULL));
 1717 }
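
/*
 * Example (illustrative sketch, not from the original source): with the
 * object write-locked, a caller commonly requests a busied page, zeroes it
 * if the allocator did not hand back a pre-zeroed one, and sleeps for free
 * memory before retrying on failure.  The function name is hypothetical,
 * the re-lookup that real callers perform after dropping the lock is
 * omitted, and a similar pattern is provided by vm_page_grab().
 */
static vm_page_t
example_alloc_retry(vm_object_t object, vm_pindex_t pindex)
{
        vm_page_t m;

        VM_OBJECT_ASSERT_WLOCKED(object);
        while ((m = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL |
            VM_ALLOC_ZERO)) == NULL) {
                VM_OBJECT_WUNLOCK(object);
                vm_wait(object);
                VM_OBJECT_WLOCK(object);
        }
        if ((m->flags & PG_ZERO) == 0)
                pmap_zero_page(m);
        return (m);
}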
 1718 
 1719 vm_page_t
 1720 vm_page_alloc_domain(vm_object_t object, vm_pindex_t pindex, int domain,
 1721     int req)
 1722 {
 1723 
 1724         return (vm_page_alloc_domain_after(object, pindex, domain, req,
 1725             object != NULL ? vm_radix_lookup_le(&object->rtree, pindex) :
 1726             NULL));
 1727 }
 1728 
 1729 /*
 1730  * Allocate a page in the specified object with the given page index.  To
 1731  * optimize insertion of the page into the object, the caller must also specify
 1732  * the resident page in the object with the largest index smaller than the given
 1733  * page index, or NULL if no such page exists.
 1734  */
 1735 vm_page_t
 1736 vm_page_alloc_after(vm_object_t object, vm_pindex_t pindex,
 1737     int req, vm_page_t mpred)
 1738 {
 1739         struct vm_domainset_iter di;
 1740         vm_page_t m;
 1741         int domain;
 1742 
 1743         vm_domainset_iter_page_init(&di, object, pindex, &domain, &req);
 1744         do {
 1745                 m = vm_page_alloc_domain_after(object, pindex, domain, req,
 1746                     mpred);
 1747                 if (m != NULL)
 1748                         break;
 1749         } while (vm_domainset_iter_page(&di, object, &domain) == 0);
 1750 
 1751         return (m);
 1752 }
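
/*
 * Example (illustrative sketch, not from the original source): when
 * populating a run of consecutive, previously unpopulated pindices, a
 * caller can look up the predecessor once and then carry each newly
 * allocated page forward as the "mpred" hint for the next insertion.  The
 * function name is hypothetical.
 */
static int
example_alloc_run(vm_object_t object, vm_pindex_t pindex, int count)
{
        vm_page_t m, mpred;
        int i;

        VM_OBJECT_ASSERT_WLOCKED(object);
        mpred = vm_radix_lookup_le(&object->rtree, pindex);
        for (i = 0; i < count; i++) {
                m = vm_page_alloc_after(object, pindex + i, VM_ALLOC_NORMAL,
                    mpred);
                if (m == NULL)
                        break;
                mpred = m;
        }
        /* Return the number of pages actually allocated. */
        return (i);
}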
 1753 
 1754 /*
 1755  * Returns nonzero if "npages" free pages were reserved without the free page
 1756  * count dropping below the minimum for the request class, and zero otherwise.
 1757  */
 1758 static int
 1759 _vm_domain_allocate(struct vm_domain *vmd, int req_class, int npages)
 1760 {
 1761         u_int limit, old, new;
 1762 
 1763         if (req_class == VM_ALLOC_INTERRUPT)
 1764                 limit = 0;
 1765         else if (req_class == VM_ALLOC_SYSTEM)
 1766                 limit = vmd->vmd_interrupt_free_min;
 1767         else
 1768                 limit = vmd->vmd_free_reserved;
 1769 
 1770         /*
 1771          * Attempt to reserve the pages.  Fail if that would drop us below the limit.
 1772          */
 1773         limit += npages;
 1774         old = vmd->vmd_free_count;
 1775         do {
 1776                 if (old < limit)
 1777                         return (0);
 1778                 new = old - npages;
 1779         } while (atomic_fcmpset_int(&vmd->vmd_free_count, &old, new) == 0);
 1780 
 1781         /* Wake the page daemon if we've crossed the threshold. */
 1782         if (vm_paging_needed(vmd, new) && !vm_paging_needed(vmd, old))
 1783                 pagedaemon_wakeup(vmd->vmd_domain);
 1784 
 1785         /* Only update bitsets on transitions. */
 1786         if ((old >= vmd->vmd_free_min && new < vmd->vmd_free_min) ||
 1787             (old >= vmd->vmd_free_severe && new < vmd->vmd_free_severe))
 1788                 vm_domain_set(vmd);
 1789 
 1790         return (1);
 1791 }
 1792 
 1793 int
 1794 vm_domain_allocate(struct vm_domain *vmd, int req, int npages)
 1795 {
 1796         int req_class;
 1797 
 1798         /*
 1799          * The page daemon is allowed to dig deeper into the free page list.
 1800          */
 1801         req_class = req & VM_ALLOC_CLASS_MASK;
 1802         if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
 1803                 req_class = VM_ALLOC_SYSTEM;
 1804         return (_vm_domain_allocate(vmd, req_class, npages));
 1805 }
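
/*
 * Note (illustrative restatement of the pattern used by the callers below):
 * a successful vm_domain_allocate() only reserves space in the free-page
 * count; the caller still performs the physical allocation and must return
 * the reservation with vm_domain_freecnt_inc() if that allocation fails:
 *
 *      if (vm_domain_allocate(vmd, req, 1)) {
 *              vm_domain_free_lock(vmd);
 *              m = vm_phys_alloc_pages(domain, pool, 0);
 *              vm_domain_free_unlock(vmd);
 *              if (m == NULL)
 *                      vm_domain_freecnt_inc(vmd, 1);
 *      }
 */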
 1806 
 1807 vm_page_t
 1808 vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain,
 1809     int req, vm_page_t mpred)
 1810 {
 1811         struct vm_domain *vmd;
 1812         vm_page_t m;
 1813         int flags, pool;
 1814 
 1815         KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
 1816             (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
 1817             ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
 1818             (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
 1819             ("inconsistent object(%p)/req(%x)", object, req));
 1820         KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0,
 1821             ("Can't sleep and retry object insertion."));
 1822         KASSERT(mpred == NULL || mpred->pindex < pindex,
 1823             ("mpred %p doesn't precede pindex 0x%jx", mpred,
 1824             (uintmax_t)pindex));
 1825         if (object != NULL)
 1826                 VM_OBJECT_ASSERT_WLOCKED(object);
 1827 
 1828         flags = 0;
 1829         m = NULL;
 1830         pool = object != NULL ? VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT;
 1831 again:
 1832 #if VM_NRESERVLEVEL > 0
 1833         /*
 1834          * Can we allocate the page from a reservation?
 1835          */
 1836         if (vm_object_reserv(object) &&
 1837             ((m = vm_reserv_extend(req, object, pindex, domain, mpred)) != NULL ||
 1838             (m = vm_reserv_alloc_page(req, object, pindex, domain, mpred)) != NULL)) {
 1839                 domain = vm_phys_domain(m);
 1840                 vmd = VM_DOMAIN(domain);
 1841                 goto found;
 1842         }
 1843 #endif
 1844         vmd = VM_DOMAIN(domain);
 1845         if (vmd->vmd_pgcache[pool].zone != NULL) {
 1846                 m = uma_zalloc(vmd->vmd_pgcache[pool].zone, M_NOWAIT);
 1847                 if (m != NULL) {
 1848                         flags |= PG_PCPU_CACHE;
 1849                         goto found;
 1850                 }
 1851         }
 1852         if (vm_domain_allocate(vmd, req, 1)) {
 1853                 /*
 1854                  * Neither path above yielded a page; take one from the free queues.
 1855                  */
 1856                 vm_domain_free_lock(vmd);
 1857                 m = vm_phys_alloc_pages(domain, pool, 0);
 1858                 vm_domain_free_unlock(vmd);
 1859                 if (m == NULL) {
 1860                         vm_domain_freecnt_inc(vmd, 1);
 1861 #if VM_NRESERVLEVEL > 0
 1862                         if (vm_reserv_reclaim_inactive(domain))
 1863                                 goto again;
 1864 #endif
 1865                 }
 1866         }
 1867         if (m == NULL) {
 1868                 /*
 1869                  * Not allocatable, give up.
 1870                  */
 1871                 if (vm_domain_alloc_fail(vmd, object, req))
 1872                         goto again;
 1873                 return (NULL);
 1874         }
 1875 
 1876         /*
 1877          * At this point we had better have found a good page.
 1878          */
 1879 found:
 1880         vm_page_dequeue(m);
 1881         vm_page_alloc_check(m);
 1882 
 1883         /*
 1884          * Initialize the page.  Only the PG_ZERO flag is inherited.
 1885          */
 1886         if ((req & VM_ALLOC_ZERO) != 0)
 1887                 flags |= (m->flags & PG_ZERO);
 1888         if ((req & VM_ALLOC_NODUMP) != 0)
 1889                 flags |= PG_NODUMP;
 1890         m->flags = flags;
 1891         m->aflags = 0;
 1892         m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
 1893             VPO_UNMANAGED : 0;
 1894         m->busy_lock = VPB_UNBUSIED;
 1895         if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0)
 1896                 m->busy_lock = VPB_SINGLE_EXCLUSIVER;
 1897         if ((req & VM_ALLOC_SBUSY) != 0)
 1898                 m->busy_lock = VPB_SHARERS_WORD(1);
 1899         if (req & VM_ALLOC_WIRED) {
 1900                 /*
 1901                  * The page lock is not required for wiring a page until that
 1902                  * page is inserted into the object.
 1903                  */
 1904                 vm_wire_add(1);
 1905                 m->wire_count = 1;
 1906         }
 1907         m->act_count = 0;
 1908 
 1909         if (object != NULL) {
 1910                 if (vm_page_insert_after(m, object, pindex, mpred)) {
 1911                         if (req & VM_ALLOC_WIRED) {
 1912                                 vm_wire_sub(1);
 1913                                 m->wire_count = 0;
 1914                         }
 1915                         KASSERT(m->object == NULL, ("page %p has object", m));
 1916                         m->oflags = VPO_UNMANAGED;
 1917                         m->busy_lock = VPB_UNBUSIED;
 1918                         /* Don't change PG_ZERO. */
 1919                         vm_page_free_toq(m);
 1920                         if (req & VM_ALLOC_WAITFAIL) {
 1921                                 VM_OBJECT_WUNLOCK(object);
 1922                                 vm_radix_wait();
 1923                                 VM_OBJECT_WLOCK(object);
 1924                         }
 1925                         return (NULL);
 1926                 }
 1927 
 1928                 /* Ignore device objects; the pager sets "memattr" for them. */
 1929                 if (object->memattr != VM_MEMATTR_DEFAULT &&
 1930                     (object->flags & OBJ_FICTITIOUS) == 0)
 1931                         pmap_page_set_memattr(m, object->memattr);
 1932         } else
 1933                 m->pindex = pindex;
 1934 
 1935         return (m);
 1936 }
 1937 
 1938 /*
 1939  *      vm_page_alloc_contig:
 1940  *
 1941  *      Allocate a contiguous set of physical pages of the given size "npages"
 1942  *      from the free lists.  All of the physical pages must be at or above
 1943  *      the given physical address "low" and below the given physical address
 1944  *      "high".  The given value "alignment" determines the alignment of the
 1945  *      first physical page in the set.  If the given value "boundary" is
 1946  *      non-zero, then the set of physical pages cannot cross any physical
 1947  *      address boundary that is a multiple of that value.  Both "alignment"
 1948  *      and "boundary" must be a power of two.
 1949  *
 1950  *      If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT,
 1951  *      then the memory attribute setting for the physical pages is configured
 1952  *      to the object's memory attribute setting.  Otherwise, the memory
 1953  *      attribute setting for the physical pages is configured to "memattr",
 1954  *      overriding the object's memory attribute setting.  However, if the
 1955  *      object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the
 1956  *      memory attribute setting for the physical pages cannot be configured
 1957  *      to VM_MEMATTR_DEFAULT.
 1958  *
 1959  *      The specified object may not contain fictitious pages.
 1960  *
 1961  *      The caller must always specify an allocation class.
 1962  *
 1963  *      allocation classes:
 1964  *      VM_ALLOC_NORMAL         normal process request
 1965  *      VM_ALLOC_SYSTEM         system *really* needs a page
 1966  *      VM_ALLOC_INTERRUPT      interrupt time request
 1967  *
 1968  *      optional allocation flags:
 1969  *      VM_ALLOC_NOBUSY         do not exclusive busy the page
 1970  *      VM_ALLOC_NODUMP         do not include the page in a kernel core dump
 1971  *      VM_ALLOC_NOOBJ          page is not associated with an object and
 1972  *                              should not be exclusive busy
 1973  *      VM_ALLOC_SBUSY          shared busy the allocated page
 1974  *      VM_ALLOC_WIRED          wire the allocated page
 1975  *      VM_ALLOC_ZERO           prefer a zeroed page
 1976  */
 1977 vm_page_t
 1978 vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
 1979     u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
 1980     vm_paddr_t boundary, vm_memattr_t memattr)
 1981 {
 1982         struct vm_domainset_iter di;
 1983         vm_page_t m;
 1984         int domain;
 1985 
 1986         vm_domainset_iter_page_init(&di, object, pindex, &domain, &req);
 1987         do {
 1988                 m = vm_page_alloc_contig_domain(object, pindex, domain, req,
 1989                     npages, low, high, alignment, boundary, memattr);
 1990                 if (m != NULL)
 1991                         break;
 1992         } while (vm_domainset_iter_page(&di, object, &domain) == 0);
 1993 
 1994         return (m);
 1995 }
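
/*
 * Example (illustrative sketch, not from the original source): a wired,
 * physically contiguous run with no associated object can be requested by
 * passing a NULL object together with VM_ALLOC_NOOBJ; here "low" is 0 and
 * "high" is ~0, so no physical address constraint is imposed beyond page
 * alignment.  The function name is hypothetical.
 */
static vm_page_t
example_alloc_contig_run(u_long npages)
{

        return (vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL |
            VM_ALLOC_NOOBJ | VM_ALLOC_WIRED, npages, 0, ~(vm_paddr_t)0,
            PAGE_SIZE, 0, VM_MEMATTR_DEFAULT));
}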
 1996 
 1997 vm_page_t
 1998 vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain,
 1999     int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
 2000     vm_paddr_t boundary, vm_memattr_t memattr)
 2001 {
 2002         struct vm_domain *vmd;
 2003         vm_page_t m, m_ret, mpred;
 2004         u_int busy_lock, flags, oflags;
 2005 
 2006         mpred = NULL;   /* XXX: pacify gcc */
 2007         KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
 2008             (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
 2009             ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
 2010             (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
 2011             ("vm_page_alloc_contig: inconsistent object(%p)/req(%x)", object,
 2012             req));
 2013         KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0,
 2014             ("Can't sleep and retry object insertion."));
 2015         if (object != NULL) {
 2016                 VM_OBJECT_ASSERT_WLOCKED(object);
 2017                 KASSERT((object->flags & OBJ_FICTITIOUS) == 0,
 2018                     ("vm_page_alloc_contig: object %p has fictitious pages",
 2019                     object));
 2020         }
 2021         KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero"));
 2022 
 2023         if (object != NULL) {
 2024                 mpred = vm_radix_lookup_le(&object->rtree, pindex);
 2025                 KASSERT(mpred == NULL || mpred->pindex != pindex,
 2026                     ("vm_page_alloc_contig: pindex already allocated"));
 2027         }
 2028 
 2029         /*
 2030          * Can we allocate the pages without the number of free pages falling
 2031          * below the lower bound for the allocation class?
 2032          */
 2033         m_ret = NULL;
 2034 again:
 2035 #if VM_NRESERVLEVEL > 0
 2036         /*
 2037          * Can we allocate the pages from a reservation?
 2038          */
 2039         if (vm_object_reserv(object) &&
 2040             ((m_ret = vm_reserv_extend_contig(req, object, pindex, domain,
 2041             npages, low, high, alignment, boundary, mpred)) != NULL ||
 2042             (m_ret = vm_reserv_alloc_contig(req, object, pindex, domain,
 2043             npages, low, high, alignment, boundary, mpred)) != NULL)) {
 2044                 domain = vm_phys_domain(m_ret);
 2045                 vmd = VM_DOMAIN(domain);
 2046                 goto found;
 2047         }
 2048 #endif
 2049         vmd = VM_DOMAIN(domain);
 2050         if (vm_domain_allocate(vmd, req, npages)) {
 2051                 /*
 2052                  * Allocate them from the free page queues.
 2053                  */
 2054                 vm_domain_free_lock(vmd);
 2055                 m_ret = vm_phys_alloc_contig(domain, npages, low, high,
 2056                     alignment, boundary);
 2057                 vm_domain_free_unlock(vmd);
 2058                 if (m_ret == NULL) {
 2059                         vm_domain_freecnt_inc(vmd, npages);
 2060 #if VM_NRESERVLEVEL > 0
 2061                         if (vm_reserv_reclaim_contig(domain, npages, low,
 2062                             high, alignment, boundary))
 2063                                 goto again;
 2064 #endif
 2065                 }
 2066         }
 2067         if (m_ret == NULL) {
 2068                 if (vm_domain_alloc_fail(vmd, object, req))
 2069                         goto again;
 2070                 return (NULL);
 2071         }
 2072 #if VM_NRESERVLEVEL > 0
 2073 found:
 2074 #endif
 2075         for (m = m_ret; m < &m_ret[npages]; m++) {
 2076                 vm_page_dequeue(m);
 2077                 vm_page_alloc_check(m);
 2078         }
 2079 
 2080         /*
 2081          * Initialize the pages.  Only the PG_ZERO flag is inherited.
 2082          */
 2083         flags = 0;
 2084         if ((req & VM_ALLOC_ZERO) != 0)
 2085                 flags = PG_ZERO;
 2086         if ((req & VM_ALLOC_NODUMP) != 0)
 2087                 flags |= PG_NODUMP;
 2088         oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
 2089             VPO_UNMANAGED : 0;
 2090         busy_lock = VPB_UNBUSIED;
 2091         if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0)
 2092                 busy_lock = VPB_SINGLE_EXCLUSIVER;
 2093         if ((req & VM_ALLOC_SBUSY) != 0)
 2094                 busy_lock = VPB_SHARERS_WORD(1);
 2095         if ((req & VM_ALLOC_WIRED) != 0)
 2096                 vm_wire_add(npages);
 2097         if (object != NULL) {
 2098                 if (object->memattr != VM_MEMATTR_DEFAULT &&
 2099                     memattr == VM_MEMATTR_DEFAULT)
 2100                         memattr = object->memattr;
 2101         }
 2102         for (m = m_ret; m < &m_ret[npages]; m++) {
 2103                 m->aflags = 0;
 2104                 m->flags = (m->flags | PG_NODUMP) & flags;
 2105                 m->busy_lock = busy_lock;
 2106                 if ((req & VM_ALLOC_WIRED) != 0)
 2107                         m->wire_count = 1;
 2108                 m->act_count = 0;
 2109                 m->oflags = oflags;
 2110                 if (object != NULL) {
 2111                         if (vm_page_insert_after(m, object, pindex, mpred)) {
 2112                                 if ((req & VM_ALLOC_WIRED) != 0)
 2113                                         vm_wire_sub(npages);
 2114                                 KASSERT(m->object == NULL,
 2115                                     ("page %p has object", m));
 2116                                 mpred = m;
 2117                                 for (m = m_ret; m < &m_ret[npages]; m++) {
 2118                                         if (m <= mpred &&
 2119                                             (req & VM_ALLOC_WIRED) != 0)
 2120                                                 m->wire_count = 0;
 2121                                         m->oflags = VPO_UNMANAGED;
 2122                                         m->busy_lock = VPB_UNBUSIED;
 2123                                         /* Don't change PG_ZERO. */
 2124                                         vm_page_free_toq(m);
 2125                                 }
 2126                                 if (req & VM_ALLOC_WAITFAIL) {
 2127                                         VM_OBJECT_WUNLOCK(object);
 2128                                         vm_radix_wait();
 2129                                         VM_OBJECT_WLOCK(object);
 2130                                 }
 2131                                 return (NULL);
 2132                         }
 2133                         mpred = m;
 2134                 } else
 2135                         m->pindex = pindex;
 2136                 if (memattr != VM_MEMATTR_DEFAULT)
 2137                         pmap_page_set_memattr(m, memattr);
 2138                 pindex++;
 2139         }
 2140         return (m_ret);
 2141 }
 2142 
 2143 /*
 2144  * Check a page that has been freshly dequeued from a freelist.
 2145  */
 2146 static void
 2147 vm_page_alloc_check(vm_page_t m)
 2148 {
 2149 
 2150         KASSERT(m->object == NULL, ("page %p has object", m));
 2151         KASSERT(m->queue == PQ_NONE && (m->aflags & PGA_QUEUE_STATE_MASK) == 0,
 2152             ("page %p has unexpected queue %d, flags %#x",
 2153             m, m->queue, (m->aflags & PGA_QUEUE_STATE_MASK)));
 2154         KASSERT(!vm_page_held(m), ("page %p is held", m));
 2155         KASSERT(!vm_page_busied(m), ("page %p is busy", m));
 2156         KASSERT(m->dirty == 0, ("page %p is dirty", m));
 2157         KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
 2158             ("page %p has unexpected memattr %d",
 2159             m, pmap_page_get_memattr(m)));
 2160         KASSERT(m->valid == 0, ("free page %p is valid", m));
 2161 }
 2162 
 2163 /*
 2164  *      vm_page_alloc_freelist:
 2165  *
 2166  *      Allocate a physical page from the specified free page list.
 2167  *
 2168  *      The caller must always specify an allocation class.
 2169  *
 2170  *      allocation classes:
 2171  *      VM_ALLOC_NORMAL         normal process request
 2172  *      VM_ALLOC_SYSTEM         system *really* needs a page
 2173  *      VM_ALLOC_INTERRUPT      interrupt time request
 2174  *
 2175  *      optional allocation flags:
 2176  *      VM_ALLOC_COUNT(number)  the number of additional pages that the caller
 2177  *                              intends to allocate
 2178  *      VM_ALLOC_WIRED          wire the allocated page
 2179  *      VM_ALLOC_ZERO           prefer a zeroed page
 2180  */
 2181 vm_page_t
 2182 vm_page_alloc_freelist(int freelist, int req)
 2183 {
 2184         struct vm_domainset_iter di;
 2185         vm_page_t m;
 2186         int domain;
 2187 
 2188         vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req);
 2189         do {
 2190                 m = vm_page_alloc_freelist_domain(domain, freelist, req);
 2191                 if (m != NULL)
 2192                         break;
 2193         } while (vm_domainset_iter_page(&di, NULL, &domain) == 0);
 2194 
 2195         return (m);
 2196 }
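
/*
 * Example (illustrative sketch, not from the original source): platform
 * code that must take pages from a particular freelist can request a wired
 * page and zero it when the allocator did not return a pre-zeroed one.
 * VM_FREELIST_DEFAULT and the function name are used for illustration only.
 */
static vm_page_t
example_alloc_freelist_page(void)
{
        vm_page_t m;

        m = vm_page_alloc_freelist(VM_FREELIST_DEFAULT, VM_ALLOC_NORMAL |
            VM_ALLOC_WIRED | VM_ALLOC_ZERO);
        if (m != NULL && (m->flags & PG_ZERO) == 0)
                pmap_zero_page(m);
        return (m);
}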
 2197 
 2198 vm_page_t
 2199 vm_page_alloc_freelist_domain(int domain, int freelist, int req)
 2200 {
 2201         struct vm_domain *vmd;
 2202         vm_page_t m;
 2203         u_int flags;
 2204 
 2205         m = NULL;
 2206         vmd = VM_DOMAIN(domain);
 2207 again:
 2208         if (vm_domain_allocate(vmd, req, 1)) {
 2209                 vm_domain_free_lock(vmd);
 2210                 m = vm_phys_alloc_freelist_pages(domain, freelist,
 2211                     VM_FREEPOOL_DIRECT, 0);
 2212                 vm_domain_free_unlock(vmd);
 2213                 if (m == NULL)
 2214                         vm_domain_freecnt_inc(vmd, 1);
 2215         }
 2216         if (m == NULL) {
 2217                 if (vm_domain_alloc_fail(vmd, NULL, req))
 2218                         goto again;
 2219                 return (NULL);
 2220         }
 2221         vm_page_dequeue(m);
 2222         vm_page_alloc_check(m);
 2223 
 2224         /*
 2225          * Initialize the page.  Only the PG_ZERO flag is inherited.
 2226          */
 2227         m->aflags = 0;
 2228         flags = 0;
 2229         if ((req & VM_ALLOC_ZERO) != 0)
 2230                 flags = PG_ZERO;
 2231         m->flags &= flags;
 2232         if ((req & VM_ALLOC_WIRED) != 0) {
 2233                 /*
 2234                  * The page lock is not required for wiring a page that does
 2235                  * not belong to an object.
 2236                  */
 2237                 vm_wire_add(1);
 2238                 m->wire_count = 1;
 2239         }
 2240         /* Unmanaged pages don't use "act_count". */
 2241         m->oflags = VPO_UNMANAGED;
 2242         return (m);
 2243 }
 2244 
 2245 static int
 2246 vm_page_zone_import(void *arg, void **store, int cnt, int domain, int flags)
 2247 {
 2248         struct vm_domain *vmd;
 2249         struct vm_pgcache *pgcache;
 2250         int i;
 2251 
 2252         pgcache = arg;
 2253         vmd = VM_DOMAIN(pgcache->domain);
 2254 
 2255         /*
 2256          * The page daemon should avoid creating extra memory pressure since its
 2257          * main purpose is to replenish the store of free pages.
 2258          */
 2259         if (vmd->vmd_severeset || curproc == pageproc ||
 2260             !_vm_domain_allocate(vmd, VM_ALLOC_NORMAL, cnt))
 2261                 return (0);
 2262         domain = vmd->vmd_domain;
 2263         vm_domain_free_lock(vmd);
 2264         i = vm_phys_alloc_npages(domain, pgcache->pool, cnt,
 2265             (vm_page_t *)store);
 2266         vm_domain_free_unlock(vmd);
 2267         if (cnt != i)
 2268                 vm_domain_freecnt_inc(vmd, cnt - i);
 2269 
 2270         return (i);
 2271 }
 2272 
 2273 static void
 2274 vm_page_zone_release(void *arg, void **store, int cnt)
 2275 {
 2276         struct vm_domain *vmd;
 2277         struct vm_pgcache *pgcache;
 2278         vm_page_t m;
 2279         int i;
 2280 
 2281         pgcache = arg;
 2282         vmd = VM_DOMAIN(pgcache->domain);
 2283         vm_domain_free_lock(vmd);
 2284         for (i = 0; i < cnt; i++) {
 2285                 m = (vm_page_t)store[i];
 2286                 vm_phys_free_pages(m, 0);
 2287         }
 2288         vm_domain_free_unlock(vmd);
 2289         vm_domain_freecnt_inc(vmd, cnt);
 2290 }
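
/*
 * Note (illustrative sketch, assuming the standard UMA cache-zone
 * interface): vm_page_zone_import() and vm_page_zone_release() serve as
 * the backing-store hooks of a per-domain, per-pool page cache zone.  The
 * call below shows how such hooks are typically wired up with
 * uma_zcache_create(); the zone name and flags are illustrative, not this
 * file's actual initialization code.
 *
 *      pgcache->zone = uma_zcache_create("vm pgcache", PAGE_SIZE,
 *          NULL, NULL, NULL, NULL, vm_page_zone_import,
 *          vm_page_zone_release, pgcache, 0);
 */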
 2291 
 2292 #define VPSC_ANY        0       /* No restrictions. */
 2293 #define VPSC_NORESERV   1       /* Skip reservations; implies VPSC_NOSUPER. */
 2294 #define VPSC_NOSUPER    2       /* Skip superpages. */
 2295 
 2296 /*
 2297  *      vm_page_scan_contig:
 2298  *
 2299  *      Scan vm_page_array[] between the specified entries "m_start" and
 2300  *      "m_end" for a run of contiguous physical pages that satisfy the
 2301  *      specified conditions, and return the lowest page in the run.  The
 2302  *      specified "alignment" determines the alignment of the lowest physical
 2303  *      page in the run.  If the specified "boundary" is non-zero, then the
 2304  *      run of physical pages cannot span a physical address that is a
 2305  *      multiple of "boundary".
 2306  *
 2307  *      "m_end" is never dereferenced, so it need not point to a vm_page
 2308  *      structure within vm_page_array[].
 2309  *
 2310  *      "npages" must be greater than zero.  "m_start" and "m_end" must not
 2311  *      span a hole (or discontiguity) in the physical address space.  Both
 2312  *      "alignment" and "boundary" must be a power of two.
 2313  */
 2314 vm_page_t
 2315 vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end,
 2316     u_long alignment, vm_paddr_t boundary, int options)
 2317 {
 2318         struct mtx *m_mtx;
 2319         vm_object_t object;
 2320         vm_paddr_t pa;
 2321         vm_page_t m, m_run;
 2322 #if VM_NRESERVLEVEL > 0
 2323         int level;
 2324 #endif
 2325         int m_inc, order, run_ext, run_len;
 2326 
 2327         KASSERT(npages > 0, ("npages is 0"));
 2328         KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
 2329         KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
 2330         m_run = NULL;
 2331         run_len = 0;
 2332         m_mtx = NULL;
 2333         for (m = m_start; m < m_end && run_len < npages; m += m_inc) {
 2334                 KASSERT((m->flags & PG_MARKER) == 0,
 2335                     ("page %p is PG_MARKER", m));
 2336                 KASSERT((m->flags & PG_FICTITIOUS) == 0 || m->wire_count == 1,
 2337                     ("fictitious page %p has invalid wire count", m));
 2338 
 2339                 /*
 2340                  * If the current page would be the start of a run, check its
 2341                  * physical address against the end, alignment, and boundary
 2342                  * conditions.  If it doesn't satisfy these conditions, either
 2343                  * terminate the scan or advance to the next page that
 2344                  * satisfies the failed condition.
 2345                  */
 2346                 if (run_len == 0) {
 2347                         KASSERT(m_run == NULL, ("m_run != NULL"));
 2348                         if (m + npages > m_end)
 2349                                 break;
 2350                         pa = VM_PAGE_TO_PHYS(m);
 2351                         if ((pa & (alignment - 1)) != 0) {
 2352                                 m_inc = atop(roundup2(pa, alignment) - pa);
 2353                                 continue;
 2354                         }
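                        /*
                         * Worked example for the boundary test below
                         * (illustrative values): with boundary = 0x10000
                         * and npages = 4, a candidate run starting at
                         * pa = 0xf000 would span 0xf000-0x12fff.  Then
                         * pa ^ (pa + ptoa(npages) - 1) = 0xf000 ^ 0x12fff =
                         * 0x1dfff, and rounddown2(0x1dfff, 0x10000) =
                         * 0x10000 != 0, so the run would cross a 64KB
                         * boundary and the scan advances to the boundary
                         * at 0x10000.
                         */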
 2355                         if (rounddown2(pa ^ (pa + ptoa(npages) - 1),
 2356                             boundary) != 0) {
 2357                                 m_inc = atop(roundup2(pa, boundary) - pa);
 2358                                 continue;
 2359                         }
 2360                 } else
 2361                         KASSERT(m_run != NULL, ("m_run == NULL"));
 2362 
 2363                 vm_page_change_lock(m, &m_mtx);
 2364                 m_inc = 1;
 2365 retry:
 2366                 if (vm_page_held(m))
 2367                         run_ext = 0;
 2368 #if VM_NRESERVLEVEL > 0
 2369                 else if ((level = vm_reserv_level(m)) >= 0 &&
 2370                     (options & VPSC_NORESERV) != 0) {
 2371                         run_ext = 0;
 2372                         /* Advance to the end of the reservation. */
 2373                         pa = VM_PAGE_TO_PHYS(m);
 2374                         m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) -
 2375                             pa);
 2376                 }
 2377 #endif
 2378                 else if ((object = m->object) != NULL) {
 2379                         /*
 2380                          * The page is considered eligible for relocation if
 2381                          * and only if it could be laundered or reclaimed by
 2382                          * the page daemon.
 2383                          */
 2384                         if (!VM_OBJECT_TRYRLOCK(object)) {
 2385                                 mtx_unlock(m_mtx);
 2386                                 VM_OBJECT_RLOCK(object);
 2387                                 mtx_lock(m_mtx);
 2388                                 if (m->object != object) {
 2389                                         /*
 2390                                          * The page may have been freed.
 2391                                          */
 2392                                         VM_OBJECT_RUNLOCK(object);
 2393                                         goto retry;
 2394                                 } else if (vm_page_held(m)) {
 2395                                         run_ext = 0;
 2396                                         goto unlock;
 2397                                 }
 2398                         }
 2399                         KASSERT((m->flags & PG_UNHOLDFREE) == 0,
 2400                             ("page %p is PG_UNHOLDFREE", m));
 2401                         /* Don't care: PG_NODUMP, PG_ZERO. */
 2402                         if (object->type != OBJT_DEFAULT &&
 2403                             object->type != OBJT_SWAP &&
 2404                             object->type != OBJT_VNODE) {
 2405                                 run_ext = 0;
 2406 #if VM_NRESERVLEVEL > 0
 2407                         } else if ((options & VPSC_NOSUPER) != 0 &&
 2408                             (level = vm_reserv_level_iffullpop(m)) >= 0) {
 2409                                 run_ext = 0;
 2410                                 /* Advance to the end of the superpage. */
 2411                                 pa = VM_PAGE_TO_PHYS(m);
 2412                                 m_inc = atop(roundup2(pa + 1,
 2413                                     vm_reserv_size(level)) - pa);
 2414 #endif
 2415                         } else if (object->memattr == VM_MEMATTR_DEFAULT &&
 2416                             vm_page_queue(m) != PQ_NONE && !vm_page_busied(m)) {
 2417                                 /*
 2418                                  * The page is allocated but eligible for
 2419                                  * relocation.  Extend the current run by one
 2420                                  * page.
 2421                                  */
 2422                                 KASSERT(pmap_page_get_memattr(m) ==
 2423                                     VM_MEMATTR_DEFAULT,
 2424                                     ("page %p has an unexpected memattr", m));
 2425                                 KASSERT((m->oflags & (VPO_SWAPINPROG |
 2426                                     VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0,
 2427                                     ("page %p has unexpected oflags", m));
 2428                                 /* Don't care: VPO_NOSYNC. */
 2429                                 run_ext = 1;
 2430                         } else
 2431                                 run_ext = 0;
 2432 unlock:
 2433                         VM_OBJECT_RUNLOCK(object);
 2434 #if VM_NRESERVLEVEL > 0
 2435                 } else if (level >= 0) {
 2436                         /*
 2437                          * The page is reserved but not yet allocated.  In
 2438                          * other words, it is still free.  Extend the current
 2439                          * run by one page.
 2440                          */
 2441                         run_ext = 1;
 2442 #endif
 2443                 } else if ((order = m->order) < VM_NFREEORDER) {
 2444                         /*
 2445                          * The page is enqueued in the physical memory
 2446                          * allocator's free page queues.  Moreover, it is the
 2447                          * first page in a power-of-two-sized run of
 2448                          * contiguous free pages.  Add these pages to the end
 2449                          * of the current run, and jump ahead.
 2450                          */
 2451                         run_ext = 1 << order;
 2452                         m_inc = 1 << order;
 2453                 } else {
 2454                         /*
 2455                          * Skip the page for one of the following reasons: (1)
 2456                          * It is enqueued in the physical memory allocator's
 2457                          * free page queues.  However, it is not the first
 2458                          * page in a run of contiguous free pages.  (This case
 2459                          * rarely occurs because the scan is performed in
 2460                          * ascending order.) (2) It is not reserved, and it is
 2461                          * transitioning from free to allocated.  (Conversely,
 2462                          * the transition from allocated to free for managed
 2463                          * pages is blocked by the page lock.) (3) It is
 2464                          * allocated but not contained by an object and not
 2465                          * wired, e.g., allocated by Xen's balloon driver.
 2466                          */
 2467                         run_ext = 0;
 2468                 }
 2469 
 2470                 /*
 2471                  * Extend or reset the current run of pages.
 2472                  */
 2473                 if (run_ext > 0) {
 2474                         if (run_len == 0)
 2475                                 m_run = m;
 2476                         run_len += run_ext;
 2477                 } else {
 2478                         if (run_len > 0) {
 2479                                 m_run = NULL;
 2480                                 run_len = 0;
 2481                         }
 2482                 }
 2483         }
 2484         if (m_mtx != NULL)
 2485                 mtx_unlock(m_mtx);
 2486         if (run_len >= npages)
 2487                 return (m_run);
 2488         return (NULL);
 2489 }
 2490 
 2491 /*
 2492  *      vm_page_reclaim_run:
 2493  *
 2494  *      Try to relocate each of the allocated virtual pages within the
 2495  *      specified run of physical pages to a new physical address.  Free the
 2496  *      physical pages underlying the relocated virtual pages.  A virtual page
 2497  *      is relocatable if and only if it could be laundered or reclaimed by
 2498  *      the page daemon.  Whenever possible, a virtual page is relocated to a
 2499  *      physical address above "high".
 2500  *
 2501  *      Returns 0 if every physical page within the run was already free or
 2502  *      just freed by a successful relocation.  Otherwise, returns a non-zero
 2503  *      value indicating why the last attempt to relocate a virtual page was
 2504  *      unsuccessful.
 2505  *
 2506  *      "req_class" must be an allocation class.
 2507  */
 2508 static int
 2509 vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run,
 2510     vm_paddr_t high)
 2511 {
 2512         struct vm_domain *vmd;
 2513         struct mtx *m_mtx;
 2514         struct spglist free;
 2515         vm_object_t object;
 2516         vm_paddr_t pa;
 2517         vm_page_t m, m_end, m_new;
 2518         int error, order, req;
 2519 
 2520         KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class,
 2521             ("req_class is not an allocation class"));
 2522         SLIST_INIT(&free);
 2523         error = 0;
 2524         m = m_run;
 2525         m_end = m_run + npages;
 2526         m_mtx = NULL;
 2527         for (; error == 0 && m < m_end; m++) {
 2528                 KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0,
 2529                     ("page %p is PG_FICTITIOUS or PG_MARKER", m));
 2530 
 2531                 /*
 2532                  * Avoid releasing and reacquiring the same page lock.
 2533                  */
 2534                 vm_page_change_lock(m, &m_mtx);
 2535 retry:
 2536                 if (vm_page_held(m))
 2537                         error = EBUSY;
 2538                 else if ((object = m->object) != NULL) {
 2539                         /*
 2540                          * The page is relocated if and only if it could be
 2541                          * laundered or reclaimed by the page daemon.
 2542                          */
 2543                         if (!VM_OBJECT_TRYWLOCK(object)) {
 2544                                 mtx_unlock(m_mtx);
 2545                                 VM_OBJECT_WLOCK(object);
 2546                                 mtx_lock(m_mtx);
 2547                                 if (m->object != object) {
 2548                                         /*
 2549                                          * The page may have been freed.
 2550                                          */
 2551                                         VM_OBJECT_WUNLOCK(object);
 2552                                         goto retry;
 2553                                 } else if (vm_page_held(m)) {
 2554                                         error = EBUSY;
 2555                                         goto unlock;
 2556                                 }
 2557                         }
 2558                         KASSERT((m->flags & PG_UNHOLDFREE) == 0,
 2559                             ("page %p is PG_UNHOLDFREE", m));
 2560                         /* Don't care: PG_NODUMP, PG_ZERO. */
 2561                         if (object->type != OBJT_DEFAULT &&
 2562                             object->type != OBJT_SWAP &&
 2563                             object->type != OBJT_VNODE)
 2564                                 error = EINVAL;
 2565                         else if (object->memattr != VM_MEMATTR_DEFAULT)
 2566                                 error = EINVAL;
 2567                         else if (vm_page_queue(m) != PQ_NONE &&
 2568                             !vm_page_busied(m)) {
 2569                                 KASSERT(pmap_page_get_memattr(m) ==
 2570                                     VM_MEMATTR_DEFAULT,
 2571                                     ("page %p has an unexpected memattr", m));
 2572                                 KASSERT((m->oflags & (VPO_SWAPINPROG |
 2573                                     VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0,
 2574                                     ("page %p has unexpected oflags", m));
 2575                                 /* Don't care: VPO_NOSYNC. */
 2576                                 if (m->valid != 0) {
 2577                                         /*
 2578                                          * First, try to allocate a new page
 2579                                          * that is above "high".  Failing
 2580                                          * that, try to allocate a new page
 2581                                          * that is below "m_run".  Allocate
 2582                                          * the new page between the end of
 2583                                          * "m_run" and "high" only as a last
 2584                                          * resort.
 2585                                          */
 2586                                         req = req_class | VM_ALLOC_NOOBJ;
 2587                                         if ((m->flags & PG_NODUMP) != 0)
 2588                                                 req |= VM_ALLOC_NODUMP;
 2589                                         if (trunc_page(high) !=
 2590                                             ~(vm_paddr_t)PAGE_MASK) {
 2591                                                 m_new = vm_page_alloc_contig(
 2592                                                     NULL, 0, req, 1,
 2593                                                     round_page(high),
 2594                                                     ~(vm_paddr_t)0,
 2595                                                     PAGE_SIZE, 0,
 2596                                                     VM_MEMATTR_DEFAULT);
 2597                                         } else
 2598                                                 m_new = NULL;
 2599                                         if (m_new == NULL) {
 2600                                                 pa = VM_PAGE_TO_PHYS(m_run);
 2601                                                 m_new = vm_page_alloc_contig(
 2602                                                     NULL, 0, req, 1,
 2603                                                     0, pa - 1, PAGE_SIZE, 0,
 2604                                                     VM_MEMATTR_DEFAULT);
 2605                                         }
 2606                                         if (m_new == NULL) {
 2607                                                 pa += ptoa(npages);
 2608                                                 m_new = vm_page_alloc_contig(
 2609                                                     NULL, 0, req, 1,
 2610                                                     pa, high, PAGE_SIZE, 0,
 2611                                                     VM_MEMATTR_DEFAULT);
 2612                                         }
 2613                                         if (m_new == NULL) {
 2614                                                 error = ENOMEM;
 2615                                                 goto unlock;
 2616                                         }
 2617                                         KASSERT(!vm_page_wired(m_new),
 2618                                             ("page %p is wired", m_new));
 2619 
 2620                                         /*
 2621                                          * Replace "m" with the new page.  For
 2622                                          * vm_page_replace(), "m" must be busy
 2623                                          * and dequeued.  Finally, change "m"
 2624                                          * as if vm_page_free() was called.
 2625                                          */
 2626                                         if (object->ref_count != 0)
 2627                                                 pmap_remove_all(m);
 2628                                         m_new->aflags = m->aflags &
 2629                                             ~PGA_QUEUE_STATE_MASK;
 2630                                         KASSERT(m_new->oflags == VPO_UNMANAGED,
 2631                                             ("page %p is managed", m_new));
 2632                                         m_new->oflags = m->oflags & VPO_NOSYNC;
 2633                                         pmap_copy_page(m, m_new);
 2634                                         m_new->valid = m->valid;
 2635                                         m_new->dirty = m->dirty;
 2636                                         m->flags &= ~PG_ZERO;
 2637                                         vm_page_xbusy(m);
 2638                                         vm_page_dequeue(m);
 2639                                         vm_page_replace_checked(m_new, object,
 2640                                             m->pindex, m);
 2641                                         if (vm_page_free_prep(m))
 2642                                                 SLIST_INSERT_HEAD(&free, m,
 2643                                                     plinks.s.ss);
 2644 
 2645                                         /*
 2646                                          * The new page must be deactivated
 2647                                          * before the object is unlocked.
 2648                                          */
 2649                                         vm_page_change_lock(m_new, &m_mtx);
 2650                                         vm_page_deactivate(m_new);
 2651                                 } else {
 2652                                         m->flags &= ~PG_ZERO;
 2653                                         vm_page_dequeue(m);
 2654                                         if (vm_page_free_prep(m))
 2655                                                 SLIST_INSERT_HEAD(&free, m,
 2656                                                     plinks.s.ss);
 2657                                         KASSERT(m->dirty == 0,
 2658                                             ("page %p is dirty", m));
 2659                                 }
 2660                         } else
 2661                                 error = EBUSY;
 2662 unlock:
 2663                         VM_OBJECT_WUNLOCK(object);
 2664                 } else {
 2665                         MPASS(vm_phys_domain(m) == domain);
 2666                         vmd = VM_DOMAIN(domain);
 2667                         vm_domain_free_lock(vmd);
 2668                         order = m->order;
 2669                         if (order < VM_NFREEORDER) {
 2670                                 /*
 2671                                  * The page is enqueued in the physical memory
 2672                                  * allocator's free page queues.  Moreover, it
 2673                                  * is the first page in a power-of-two-sized
 2674                                  * run of contiguous free pages.  Jump ahead
 2675                                  * to the last page within that run, and
 2676                                  * continue from there.
 2677                                  */
 2678                                 m += (1 << order) - 1;
 2679                         }
 2680 #if VM_NRESERVLEVEL > 0
 2681                         else if (vm_reserv_is_page_free(m))
 2682                                 order = 0;
 2683 #endif
 2684                         vm_domain_free_unlock(vmd);
 2685                         if (order == VM_NFREEORDER)
 2686                                 error = EINVAL;
 2687                 }
 2688         }
 2689         if (m_mtx != NULL)
 2690                 mtx_unlock(m_mtx);
 2691         if ((m = SLIST_FIRST(&free)) != NULL) {
 2692                 int cnt;
 2693 
 2694                 vmd = VM_DOMAIN(domain);
 2695                 cnt = 0;
 2696                 vm_domain_free_lock(vmd);
 2697                 do {
 2698                         MPASS(vm_phys_domain(m) == domain);
 2699                         SLIST_REMOVE_HEAD(&free, plinks.s.ss);
 2700                         vm_phys_free_pages(m, 0);
 2701                         cnt++;
 2702                 } while ((m = SLIST_FIRST(&free)) != NULL);
 2703                 vm_domain_free_unlock(vmd);
 2704                 vm_domain_freecnt_inc(vmd, cnt);
 2705         }
 2706         return (error);
 2707 }
 2708 
 2709 #define NRUNS   16
 2710 
 2711 CTASSERT(powerof2(NRUNS));
 2712 
 2713 #define RUN_INDEX(count)        ((count) & (NRUNS - 1))
 2714 
 2715 #define MIN_RECLAIM     8
 2716 
 2717 /*
 2718  *      vm_page_reclaim_contig:
 2719  *
 2720  *      Reclaim allocated, contiguous physical memory satisfying the specified
 2721  *      conditions by relocating the virtual pages using that physical memory.
 2722  *      Returns true if reclamation is successful and false otherwise.  Since
 2723  *      relocation requires the allocation of physical pages, reclamation may
 2724  *      fail due to a shortage of free pages.  When reclamation fails, callers
 2725  *      are expected to perform vm_wait() before retrying a failed allocation
 2726  *      operation, e.g., vm_page_alloc_contig().
 2727  *
 2728  *      The caller must always specify an allocation class through "req".
 2729  *
 2730  *      allocation classes:
 2731  *      VM_ALLOC_NORMAL         normal process request
 2732  *      VM_ALLOC_SYSTEM         system *really* needs a page
 2733  *      VM_ALLOC_INTERRUPT      interrupt time request
 2734  *
 2735  *      The optional allocation flags are ignored.
 2736  *
 2737  *      "npages" must be greater than zero.  Both "alignment" and "boundary"
 2738  *      must be a power of two.
 2739  */
 2740 bool
 2741 vm_page_reclaim_contig_domain(int domain, int req, u_long npages,
 2742     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
 2743 {
 2744         struct vm_domain *vmd;
 2745         vm_paddr_t curr_low;
 2746         vm_page_t m_run, m_runs[NRUNS];
 2747         u_long count, reclaimed;
 2748         int error, i, options, req_class;
 2749 
 2750         KASSERT(npages > 0, ("npages is 0"));
 2751         KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
 2752         KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
 2753         req_class = req & VM_ALLOC_CLASS_MASK;
 2754 
 2755         /*
 2756          * The page daemon is allowed to dig deeper into the free page list.
 2757          */
 2758         if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
 2759                 req_class = VM_ALLOC_SYSTEM;
 2760 
 2761         /*
 2762          * Return if the number of free pages cannot satisfy the requested
 2763          * allocation.
 2764          */
 2765         vmd = VM_DOMAIN(domain);
 2766         count = vmd->vmd_free_count;
 2767         if (count < npages + vmd->vmd_free_reserved || (count < npages +
 2768             vmd->vmd_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) ||
 2769             (count < npages && req_class == VM_ALLOC_INTERRUPT))
 2770                 return (false);
 2771 
 2772         /*
 2773          * Scan up to three times, relaxing the restrictions ("options") on
 2774          * the reclamation of reservations and superpages each time.
 2775          */
 2776         for (options = VPSC_NORESERV;;) {
 2777                 /*
 2778                  * Find the highest runs that satisfy the given constraints
 2779                  * and restrictions, and record them in "m_runs".
 2780                  */
 2781                 curr_low = low;
 2782                 count = 0;
 2783                 for (;;) {
 2784                         m_run = vm_phys_scan_contig(domain, npages, curr_low,
 2785                             high, alignment, boundary, options);
 2786                         if (m_run == NULL)
 2787                                 break;
 2788                         curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages);
 2789                         m_runs[RUN_INDEX(count)] = m_run;
 2790                         count++;
 2791                 }
 2792 
 2793                 /*
 2794                  * Reclaim the highest runs in LIFO (descending) order until
 2795                  * the number of reclaimed pages, "reclaimed", is at least
 2796                  * MIN_RECLAIM.  Reset "reclaimed" each time because each
 2797                  * reclamation is idempotent, and runs will (likely) recur
 2798                  * from one scan to the next as restrictions are relaxed.
 2799                  */
 2800                 reclaimed = 0;
 2801                 for (i = 0; count > 0 && i < NRUNS; i++) {
 2802                         count--;
 2803                         m_run = m_runs[RUN_INDEX(count)];
 2804                         error = vm_page_reclaim_run(req_class, domain, npages,
 2805                             m_run, high);
 2806                         if (error == 0) {
 2807                                 reclaimed += npages;
 2808                                 if (reclaimed >= MIN_RECLAIM)
 2809                                         return (true);
 2810                         }
 2811                 }
 2812 
 2813                 /*
 2814                  * Either relax the restrictions on the next scan or return if
 2815                  * the last scan had no restrictions.
 2816                  */
 2817                 if (options == VPSC_NORESERV)
 2818                         options = VPSC_NOSUPER;
 2819                 else if (options == VPSC_NOSUPER)
 2820                         options = VPSC_ANY;
 2821                 else if (options == VPSC_ANY)
 2822                         return (reclaimed != 0);
 2823         }
 2824 }
 2825 
 2826 bool
 2827 vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
 2828     u_long alignment, vm_paddr_t boundary)
 2829 {
 2830         struct vm_domainset_iter di;
 2831         int domain;
 2832         bool ret;
 2833 
 2834         vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req);
 2835         do {
 2836                 ret = vm_page_reclaim_contig_domain(domain, req, npages, low,
 2837                     high, alignment, boundary);
 2838                 if (ret)
 2839                         break;
 2840         } while (vm_domainset_iter_page(&di, NULL, &domain) == 0);
 2841 
 2842         return (ret);
 2843 }
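/*
 * Illustrative sketch, not part of the original source: one way a caller
 * might follow the retry protocol described above for
 * vm_page_reclaim_contig().  The constraints "npages", "low", "high",
 * "alignment", and "boundary" are assumed to be caller-supplied;
 * VM_ALLOC_NOOBJ keeps the example free of object locking.
 */
static vm_page_t
example_alloc_contig_retry(u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
        vm_page_t m;

        for (;;) {
                /* The allocation may succeed outright. */
                m = vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL |
                    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED, npages, low, high,
                    alignment, boundary, VM_MEMATTR_DEFAULT);
                if (m != NULL)
                        return (m);
                /*
                 * Otherwise try to reclaim a suitable run.  Reclamation
                 * itself needs free pages for relocation, so when it
                 * fails, sleep before retrying, as the comment above
                 * prescribes.
                 */
                if (!vm_page_reclaim_contig(VM_ALLOC_NORMAL, npages, low,
                    high, alignment, boundary))
                        vm_wait(NULL);
        }
}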
 2844 
 2845 /*
 2846  * Set the domain in the appropriate page level domainset.
 2847  */
 2848 void
 2849 vm_domain_set(struct vm_domain *vmd)
 2850 {
 2851 
 2852         mtx_lock(&vm_domainset_lock);
 2853         if (!vmd->vmd_minset && vm_paging_min(vmd)) {
 2854                 vmd->vmd_minset = 1;
 2855                 DOMAINSET_SET(vmd->vmd_domain, &vm_min_domains);
 2856         }
 2857         if (!vmd->vmd_severeset && vm_paging_severe(vmd)) {
 2858                 vmd->vmd_severeset = 1;
 2859                 DOMAINSET_SET(vmd->vmd_domain, &vm_severe_domains);
 2860         }
 2861         mtx_unlock(&vm_domainset_lock);
 2862 }
 2863 
 2864 /*
 2865  * Clear the domain from the appropriate page level domainset.
 2866  */
 2867 void
 2868 vm_domain_clear(struct vm_domain *vmd)
 2869 {
 2870 
 2871         mtx_lock(&vm_domainset_lock);
 2872         if (vmd->vmd_minset && !vm_paging_min(vmd)) {
 2873                 vmd->vmd_minset = 0;
 2874                 DOMAINSET_CLR(vmd->vmd_domain, &vm_min_domains);
 2875                 if (vm_min_waiters != 0) {
 2876                         vm_min_waiters = 0;
 2877                         wakeup(&vm_min_domains);
 2878                 }
 2879         }
 2880         if (vmd->vmd_severeset && !vm_paging_severe(vmd)) {
 2881                 vmd->vmd_severeset = 0;
 2882                 DOMAINSET_CLR(vmd->vmd_domain, &vm_severe_domains);
 2883                 if (vm_severe_waiters != 0) {
 2884                         vm_severe_waiters = 0;
 2885                         wakeup(&vm_severe_domains);
 2886                 }
 2887         }
 2888 
 2889         /*
 2890          * If the pageout daemon needs pages, then tell it that there are
 2891          * some free.
 2892          */
 2893         if (vmd->vmd_pageout_pages_needed &&
 2894             vmd->vmd_free_count >= vmd->vmd_pageout_free_min) {
 2895                 wakeup(&vmd->vmd_pageout_pages_needed);
 2896                 vmd->vmd_pageout_pages_needed = 0;
 2897         }
 2898 
 2899         /* See comments in vm_wait_doms(). */
 2900         if (vm_pageproc_waiters) {
 2901                 vm_pageproc_waiters = 0;
 2902                 wakeup(&vm_pageproc_waiters);
 2903         }
 2904         mtx_unlock(&vm_domainset_lock);
 2905 }
 2906 
 2907 /*
 2908  * Wait for free pages to exceed the min threshold globally.
 2909  */
 2910 void
 2911 vm_wait_min(void)
 2912 {
 2913 
 2914         mtx_lock(&vm_domainset_lock);
 2915         while (vm_page_count_min()) {
 2916                 vm_min_waiters++;
 2917                 msleep(&vm_min_domains, &vm_domainset_lock, PVM, "vmwait", 0);
 2918         }
 2919         mtx_unlock(&vm_domainset_lock);
 2920 }
 2921 
 2922 /*
 2923  * Wait for free pages to exceed the severe threshold globally.
 2924  */
 2925 void
 2926 vm_wait_severe(void)
 2927 {
 2928 
 2929         mtx_lock(&vm_domainset_lock);
 2930         while (vm_page_count_severe()) {
 2931                 vm_severe_waiters++;
 2932                 msleep(&vm_severe_domains, &vm_domainset_lock, PVM,
 2933                     "vmwait", 0);
 2934         }
 2935         mtx_unlock(&vm_domainset_lock);
 2936 }
 2937 
 2938 u_int
 2939 vm_wait_count(void)
 2940 {
 2941 
 2942         return (vm_severe_waiters + vm_min_waiters + vm_pageproc_waiters);
 2943 }
 2944 
 2945 int
 2946 vm_wait_doms(const domainset_t *wdoms, int mflags)
 2947 {
 2948         int error;
 2949 
 2950         error = 0;
 2951 
 2952         /*
 2953          * We use racy wakeup synchronization to avoid expensive global
 2954          * locking for the pageproc when sleeping with a non-specific vm_wait.
 2955          * To handle this, we only sleep for one tick in this instance.  It
 2956          * is expected that most allocations for the pageproc will come from
 2957          * kmem or vm_page_grab* which will use the more specific and
 2958          * race-free vm_wait_domain().
 2959          */
 2960         if (curproc == pageproc) {
 2961                 mtx_lock(&vm_domainset_lock);
 2962                 vm_pageproc_waiters++;
 2963                 error = msleep(&vm_pageproc_waiters, &vm_domainset_lock,
 2964                     PVM | PDROP | mflags, "pageprocwait", 1);
 2965         } else {
 2966                 /*
 2967                  * XXX Ideally we would wait only until the allocation could
 2968                  * be satisfied.  This condition can cause new allocators to
 2969                  * consume all freed pages while old allocators wait.
 2970                  */
 2971                 mtx_lock(&vm_domainset_lock);
 2972                 if (vm_page_count_min_set(wdoms)) {
 2973                         vm_min_waiters++;
 2974                         error = msleep(&vm_min_domains, &vm_domainset_lock,
 2975                             PVM | PDROP | mflags, "vmwait", 0);
 2976                 } else
 2977                         mtx_unlock(&vm_domainset_lock);
 2978         }
 2979         return (error);
 2980 }
 2981 
 2982 /*
 2983  *      vm_wait_domain:
 2984  *
 2985  *      Sleep until free pages are available for allocation.
 2986  *      - Called in various places after failed memory allocations.
 2987  */
 2988 void
 2989 vm_wait_domain(int domain)
 2990 {
 2991         struct vm_domain *vmd;
 2992         domainset_t wdom;
 2993 
 2994         vmd = VM_DOMAIN(domain);
 2995         vm_domain_free_assert_unlocked(vmd);
 2996 
 2997         if (curproc == pageproc) {
 2998                 mtx_lock(&vm_domainset_lock);
 2999                 if (vmd->vmd_free_count < vmd->vmd_pageout_free_min) {
 3000                         vmd->vmd_pageout_pages_needed = 1;
 3001                         msleep(&vmd->vmd_pageout_pages_needed,
 3002                             &vm_domainset_lock, PDROP | PSWP, "VMWait", 0);
 3003                 } else
 3004                         mtx_unlock(&vm_domainset_lock);
 3005         } else {
 3006                 if (pageproc == NULL)
 3007                         panic("vm_wait in early boot");
 3008                 DOMAINSET_ZERO(&wdom);
 3009                 DOMAINSET_SET(vmd->vmd_domain, &wdom);
 3010                 vm_wait_doms(&wdom, 0);
 3011         }
 3012 }
 3013 
 3014 static int
 3015 vm_wait_flags(vm_object_t obj, int mflags)
 3016 {
 3017         struct domainset *d;
 3018 
 3019         d = NULL;
 3020 
 3021         /*
 3022          * Carefully fetch pointers only once: the struct domainset
 3023          * itself is immutable but the pointer might change.
 3024          */
 3025         if (obj != NULL)
 3026                 d = obj->domain.dr_policy;
 3027         if (d == NULL)
 3028                 d = curthread->td_domain.dr_policy;
 3029 
 3030         return (vm_wait_doms(&d->ds_mask, mflags));
 3031 }
 3032 
 3033 /*
 3034  *      vm_wait:
 3035  *
 3036  *      Sleep until free pages are available for allocation in the
 3037  *      affinity domains of the obj.  If obj is NULL, the domain set
 3038  *      for the calling thread is used.
 3039  *      Called in various places after failed memory allocations.
 3040  */
 3041 void
 3042 vm_wait(vm_object_t obj)
 3043 {
 3044         (void)vm_wait_flags(obj, 0);
 3045 }
 3046 
 3047 int
 3048 vm_wait_intr(vm_object_t obj)
 3049 {
 3050         return (vm_wait_flags(obj, PCATCH));
 3051 }
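/*
 * Illustrative sketch, not part of the original source: the caller
 * pattern that the comment above describes.  "object" and "pindex" are
 * assumed caller-supplied, with "object" write-locked on entry and no
 * other thread instantiating the same index concurrently (vm_page_grab()
 * below handles that more general case).
 */
static vm_page_t
example_alloc_retry(vm_object_t object, vm_pindex_t pindex)
{
        vm_page_t m;

        VM_OBJECT_ASSERT_WLOCKED(object);
        while ((m = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL)) == NULL) {
                /* Drop the object lock while sleeping for free pages. */
                VM_OBJECT_WUNLOCK(object);
                vm_wait(object);
                VM_OBJECT_WLOCK(object);
        }
        return (m);     /* Exclusive-busied by vm_page_alloc(). */
}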
 3052 
 3053 /*
 3054  *      vm_domain_alloc_fail:
 3055  *
 3056  *      Called when a page allocation function fails.  Informs the
 3057  *      page daemon and performs the requested wait.  The domain free
 3058  *      lock must not be held on entry.  If the object lock is held on
 3059  *      entry, it is dropped across the wait and reacquired before
 3060  *      returning.  Returns an error when a retry is necessary.
 3061  *
 3062  */
 3063 static int
 3064 vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req)
 3065 {
 3066 
 3067         vm_domain_free_assert_unlocked(vmd);
 3068 
 3069         atomic_add_int(&vmd->vmd_pageout_deficit,
 3070             max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
 3071         if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) {
 3072                 if (object != NULL) 
 3073                         VM_OBJECT_WUNLOCK(object);
 3074                 vm_wait_domain(vmd->vmd_domain);
 3075                 if (object != NULL) 
 3076                         VM_OBJECT_WLOCK(object);
 3077                 if (req & VM_ALLOC_WAITOK)
 3078                         return (EAGAIN);
 3079         }
 3080 
 3081         return (0);
 3082 }
 3083 
 3084 /*
 3085  *      vm_waitpfault:
 3086  *
 3087  *      Sleep until free pages are available for allocation.
 3088  *      - Called only in vm_fault so that processes page faulting
 3089  *        can be easily tracked.
 3090  *      - Sleeps at a lower priority than vm_wait() so that vm_wait()ing
 3091  *        processes will be able to grab memory first.  Do not change
 3092  *        this balance without careful testing first.
 3093  */
 3094 void
 3095 vm_waitpfault(struct domainset *dset, int timo)
 3096 {
 3097 
 3098         /*
 3099          * XXX Ideally we would wait only until the allocation could
 3100          * be satisfied.  This condition can cause new allocators to
 3101          * consume all freed pages while old allocators wait.
 3102          */
 3103         mtx_lock(&vm_domainset_lock);
 3104         if (vm_page_count_min_set(&dset->ds_mask)) {
 3105                 vm_min_waiters++;
 3106                 msleep(&vm_min_domains, &vm_domainset_lock, PUSER | PDROP,
 3107                     "pfault", timo);
 3108         } else
 3109                 mtx_unlock(&vm_domainset_lock);
 3110 }
 3111 
 3112 static struct vm_pagequeue *
 3113 vm_page_pagequeue(vm_page_t m)
 3114 {
 3115 
 3116         uint8_t queue;
 3117 
 3118         if ((queue = atomic_load_8(&m->queue)) == PQ_NONE)
 3119                 return (NULL);
 3120         return (&vm_pagequeue_domain(m)->vmd_pagequeues[queue]);
 3121 }
 3122 
 3123 static inline void
 3124 vm_pqbatch_process_page(struct vm_pagequeue *pq, vm_page_t m)
 3125 {
 3126         struct vm_domain *vmd;
 3127         uint8_t qflags;
 3128 
 3129         CRITICAL_ASSERT(curthread);
 3130         vm_pagequeue_assert_locked(pq);
 3131 
 3132         /*
 3133          * The page daemon is allowed to set m->queue = PQ_NONE without
 3134          * the page queue lock held.  In this case it is about to free the page,
 3135          * which must not have any queue state.
 3136          */
 3137         qflags = atomic_load_8(&m->aflags);
 3138         KASSERT(pq == vm_page_pagequeue(m) ||
 3139             (qflags & PGA_QUEUE_STATE_MASK) == 0,
 3140             ("page %p doesn't belong to queue %p but has aflags %#x",
 3141             m, pq, qflags));
 3142 
 3143         if ((qflags & PGA_DEQUEUE) != 0) {
 3144                 if (__predict_true((qflags & PGA_ENQUEUED) != 0))
 3145                         vm_pagequeue_remove(pq, m);
 3146                 vm_page_dequeue_complete(m);
 3147         } else if ((qflags & (PGA_REQUEUE | PGA_REQUEUE_HEAD)) != 0) {
 3148                 if ((qflags & PGA_ENQUEUED) != 0)
 3149                         TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
 3150                 else {
 3151                         vm_pagequeue_cnt_inc(pq);
 3152                         vm_page_aflag_set(m, PGA_ENQUEUED);
 3153                 }
 3154 
 3155                 /*
 3156                  * Give PGA_REQUEUE_HEAD precedence over PGA_REQUEUE.
 3157                  * In particular, if both flags are set in close succession,
 3158                  * only PGA_REQUEUE_HEAD will be applied, even if it was set
 3159                  * first.
 3160                  */
 3161                 if ((qflags & PGA_REQUEUE_HEAD) != 0) {
 3162                         KASSERT(m->queue == PQ_INACTIVE,
 3163                             ("head enqueue not supported for page %p", m));
 3164                         vmd = vm_pagequeue_domain(m);
 3165                         TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q);
 3166                 } else
 3167                         TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
 3168 
 3169                 vm_page_aflag_clear(m, qflags & (PGA_REQUEUE |
 3170                     PGA_REQUEUE_HEAD));
 3171         }
 3172 }
 3173 
 3174 static void
 3175 vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_batchqueue *bq,
 3176     uint8_t queue)
 3177 {
 3178         vm_page_t m;
 3179         int i;
 3180 
 3181         for (i = 0; i < bq->bq_cnt; i++) {
 3182                 m = bq->bq_pa[i];
 3183                 if (__predict_false(m->queue != queue))
 3184                         continue;
 3185                 vm_pqbatch_process_page(pq, m);
 3186         }
 3187         vm_batchqueue_init(bq);
 3188 }
 3189 
 3190 static void
 3191 vm_pqbatch_submit_page(vm_page_t m, uint8_t queue)
 3192 {
 3193         struct vm_batchqueue *bq;
 3194         struct vm_pagequeue *pq;
 3195         int domain;
 3196 
 3197         vm_page_assert_locked(m);
 3198         KASSERT(queue < PQ_COUNT, ("invalid queue %d", queue));
 3199 
 3200         domain = vm_phys_domain(m);
 3201         pq = &vm_pagequeue_domain(m)->vmd_pagequeues[queue];
 3202 
 3203         critical_enter();
 3204         bq = DPCPU_PTR(pqbatch[domain][queue]);
 3205         if (vm_batchqueue_insert(bq, m)) {
 3206                 critical_exit();
 3207                 return;
 3208         }
 3209         if (!vm_pagequeue_trylock(pq)) {
 3210                 critical_exit();
 3211                 vm_pagequeue_lock(pq);
 3212                 critical_enter();
 3213                 bq = DPCPU_PTR(pqbatch[domain][queue]);
 3214         }
 3215         vm_pqbatch_process(pq, bq, queue);
 3216 
 3217         /*
 3218          * The page may have been logically dequeued before we acquired the
 3219          * page queue lock.  In this case, the page lock prevents the page
 3220          * from being logically enqueued elsewhere.
 3221          */
 3222         if (__predict_true(m->queue == queue))
 3223                 vm_pqbatch_process_page(pq, m);
 3224         else {
 3225                 KASSERT(m->queue == PQ_NONE,
 3226                     ("invalid queue transition for page %p", m));
 3227                 KASSERT((m->aflags & PGA_ENQUEUED) == 0,
 3228                     ("page %p is enqueued with invalid queue index", m));
 3229                 vm_page_aflag_clear(m, PGA_QUEUE_STATE_MASK);
 3230         }
 3231         vm_pagequeue_unlock(pq);
 3232         critical_exit();
 3233 }
 3234 
 3235 /*
 3236  *      vm_page_drain_pqbatch:          [ internal use only ]
 3237  *
 3238  *      Force all per-CPU page queue batch queues to be drained.  This is
 3239  *      intended for use in severe memory shortages, to ensure that pages
 3240  *      do not remain stuck in the batch queues.
 3241  */
 3242 void
 3243 vm_page_drain_pqbatch(void)
 3244 {
 3245         struct thread *td;
 3246         struct vm_domain *vmd;
 3247         struct vm_pagequeue *pq;
 3248         int cpu, domain, queue;
 3249 
 3250         td = curthread;
 3251         CPU_FOREACH(cpu) {
 3252                 thread_lock(td);
 3253                 sched_bind(td, cpu);
 3254                 thread_unlock(td);
 3255 
 3256                 for (domain = 0; domain < vm_ndomains; domain++) {
 3257                         vmd = VM_DOMAIN(domain);
 3258                         for (queue = 0; queue < PQ_COUNT; queue++) {
 3259                                 pq = &vmd->vmd_pagequeues[queue];
 3260                                 vm_pagequeue_lock(pq);
 3261                                 critical_enter();
 3262                                 vm_pqbatch_process(pq,
 3263                                     DPCPU_PTR(pqbatch[domain][queue]), queue);
 3264                                 critical_exit();
 3265                                 vm_pagequeue_unlock(pq);
 3266                         }
 3267                 }
 3268         }
 3269         thread_lock(td);
 3270         sched_unbind(td);
 3271         thread_unlock(td);
 3272 }
 3273 
 3274 /*
 3275  * Complete the logical removal of a page from a page queue.  We must be
 3276  * careful to synchronize with the page daemon, which may be concurrently
 3277  * examining the page with only the page lock held.  The page must not be
 3278  * in a state where it appears to be logically enqueued.
 3279  */
 3280 static void
 3281 vm_page_dequeue_complete(vm_page_t m)
 3282 {
 3283 
 3284         m->queue = PQ_NONE;
 3285         atomic_thread_fence_rel();
 3286         vm_page_aflag_clear(m, PGA_QUEUE_STATE_MASK);
 3287 }
 3288 
 3289 /*
 3290  *      vm_page_dequeue_deferred:       [ internal use only ]
 3291  *
 3292  *      Request removal of the given page from its current page
 3293  *      queue.  Physical removal from the queue may be deferred
 3294  *      indefinitely.
 3295  *
 3296  *      The page must be locked.
 3297  */
 3298 void
 3299 vm_page_dequeue_deferred(vm_page_t m)
 3300 {
 3301         uint8_t queue;
 3302 
 3303         vm_page_assert_locked(m);
 3304 
 3305         if ((queue = vm_page_queue(m)) == PQ_NONE)
 3306                 return;
 3307         vm_page_aflag_set(m, PGA_DEQUEUE);
 3308         vm_pqbatch_submit_page(m, queue);
 3309 }
 3310 
 3311 /*
 3312  *      vm_page_dequeue:
 3313  *
 3314  *      Remove the page from whichever page queue it's in, if any.
 3315  *      The page must either be locked or unallocated.  This constraint
 3316  *      ensures that the queue state of the page will remain consistent
 3317  *      after this function returns.
 3318  */
 3319 void
 3320 vm_page_dequeue(vm_page_t m)
 3321 {
 3322         struct vm_pagequeue *pq, *pq1;
 3323         uint8_t aflags;
 3324 
 3325         KASSERT(mtx_owned(vm_page_lockptr(m)) || m->object == NULL,
 3326             ("page %p is allocated and unlocked", m));
 3327 
 3328         for (pq = vm_page_pagequeue(m);; pq = pq1) {
 3329                 if (pq == NULL) {
 3330                         /*
 3331                          * A thread may be concurrently executing
 3332                          * vm_page_dequeue_complete().  Ensure that all queue
 3333                          * state is cleared before we return.
 3334                          */
 3335                         aflags = atomic_load_8(&m->aflags);
 3336                         if ((aflags & PGA_QUEUE_STATE_MASK) == 0)
 3337                                 return;
 3338                         KASSERT((aflags & PGA_DEQUEUE) != 0,
 3339                             ("page %p has unexpected queue state flags %#x",
 3340                             m, aflags));
 3341 
 3342                         /*
 3343                          * Busy wait until the thread updating queue state is
 3344                          * finished.  Such a thread must be executing in a
 3345                          * critical section.
 3346                          */
 3347                         cpu_spinwait();
 3348                         pq1 = vm_page_pagequeue(m);
 3349                         continue;
 3350                 }
 3351                 vm_pagequeue_lock(pq);
 3352                 if ((pq1 = vm_page_pagequeue(m)) == pq)
 3353                         break;
 3354                 vm_pagequeue_unlock(pq);
 3355         }
 3356         KASSERT(pq == vm_page_pagequeue(m),
 3357             ("%s: page %p migrated directly between queues", __func__, m));
 3358         KASSERT((m->aflags & PGA_DEQUEUE) != 0 ||
 3359             mtx_owned(vm_page_lockptr(m)),
 3360             ("%s: queued unlocked page %p", __func__, m));
 3361 
 3362         if ((m->aflags & PGA_ENQUEUED) != 0)
 3363                 vm_pagequeue_remove(pq, m);
 3364         vm_page_dequeue_complete(m);
 3365         vm_pagequeue_unlock(pq);
 3366 }
 3367 
 3368 /*
 3369  * Schedule the given page for insertion into the specified page queue.
 3370  * Physical insertion of the page may be deferred indefinitely.
 3371  */
 3372 static void
 3373 vm_page_enqueue(vm_page_t m, uint8_t queue)
 3374 {
 3375 
 3376         vm_page_assert_locked(m);
 3377         KASSERT(m->queue == PQ_NONE && (m->aflags & PGA_QUEUE_STATE_MASK) == 0,
 3378             ("%s: page %p is already enqueued", __func__, m));
 3379 
 3380         m->queue = queue;
 3381         if ((m->aflags & PGA_REQUEUE) == 0)
 3382                 vm_page_aflag_set(m, PGA_REQUEUE);
 3383         vm_pqbatch_submit_page(m, queue);
 3384 }
 3385 
 3386 /*
 3387  *      vm_page_requeue:                [ internal use only ]
 3388  *
 3389  *      Schedule a requeue of the given page.
 3390  *
 3391  *      The page must be locked.
 3392  */
 3393 void
 3394 vm_page_requeue(vm_page_t m)
 3395 {
 3396 
 3397         vm_page_assert_locked(m);
 3398         KASSERT(vm_page_queue(m) != PQ_NONE,
 3399             ("%s: page %p is not logically enqueued", __func__, m));
 3400 
 3401         if ((m->aflags & PGA_REQUEUE) == 0)
 3402                 vm_page_aflag_set(m, PGA_REQUEUE);
 3403         vm_pqbatch_submit_page(m, atomic_load_8(&m->queue));
 3404 }
 3405 
 3406 /*
 3407  *      vm_page_free_prep:
 3408  *
 3409  *      Prepares the given page to be put on the free list,
 3410  *      disassociating it from any VM object. The caller may return
 3411  *      the page to the free list only if this function returns true.
 3412  *
 3413  *      The object must be locked.  The page must be locked if it is
 3414  *      managed.
 3415  */
 3416 bool
 3417 vm_page_free_prep(vm_page_t m)
 3418 {
 3419 
 3420 #if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP)
 3421         if (PMAP_HAS_DMAP && (m->flags & PG_ZERO) != 0) {
 3422                 uint64_t *p;
 3423                 int i;
 3424                 p = (uint64_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 3425                 for (i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++, p++)
 3426                         KASSERT(*p == 0, ("vm_page_free_prep %p PG_ZERO %d %jx",
 3427                             m, i, (uintmax_t)*p));
 3428         }
 3429 #endif
 3430         if ((m->oflags & VPO_UNMANAGED) == 0) {
 3431                 vm_page_lock_assert(m, MA_OWNED);
 3432                 KASSERT(!pmap_page_is_mapped(m),
 3433                     ("vm_page_free_prep: freeing mapped page %p", m));
 3434                 KASSERT((m->aflags & (PGA_EXECUTABLE | PGA_WRITEABLE)) == 0,
 3435                     ("vm_page_free_prep: mapping flags set in page %p", m));
 3436         } else {
 3437                 KASSERT(m->queue == PQ_NONE,
 3438                     ("vm_page_free_prep: unmanaged page %p is queued", m));
 3439         }
 3440         VM_CNT_INC(v_tfree);
 3441 
 3442         if (vm_page_sbusied(m))
 3443                 panic("vm_page_free_prep: freeing busy page %p", m);
 3444 
 3445         if (m->object != NULL)
 3446                 (void)vm_page_remove(m);
 3447 
 3448         /*
 3449          * A fictitious page is never placed in the free lists, so
 3450          * there is nothing more to do here.
 3451          */
 3452         if ((m->flags & PG_FICTITIOUS) != 0) {
 3453                 KASSERT(m->wire_count == 1,
 3454                     ("fictitious page %p is not wired", m));
 3455                 KASSERT(m->queue == PQ_NONE,
 3456                     ("fictitious page %p is queued", m));
 3457                 return (false);
 3458         }
 3459 
 3460         /*
 3461          * Pages need not be dequeued before they are returned to the physical
 3462          * memory allocator, but they must at least be marked for a deferred
 3463          * dequeue.
 3464          */
 3465         if ((m->oflags & VPO_UNMANAGED) == 0)
 3466                 vm_page_dequeue_deferred(m);
 3467 
 3468         m->valid = 0;
 3469         vm_page_undirty(m);
 3470 
 3471         if (vm_page_wired(m) != 0)
 3472                 panic("vm_page_free_prep: freeing wired page %p", m);
 3473         if (m->hold_count != 0) {
 3474                 m->flags &= ~PG_ZERO;
 3475                 KASSERT((m->flags & PG_UNHOLDFREE) == 0,
 3476                     ("vm_page_free_prep: freeing PG_UNHOLDFREE page %p", m));
 3477                 m->flags |= PG_UNHOLDFREE;
 3478                 return (false);
 3479         }
 3480 
 3481         /*
 3482          * Restore the default memory attribute to the page.
 3483          */
 3484         if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
 3485                 pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
 3486 
 3487 #if VM_NRESERVLEVEL > 0
 3488         /*
 3489          * Determine whether the page belongs to a reservation.  If the page was
 3490          * allocated from a per-CPU cache, it cannot belong to a reservation, so
 3491          * as an optimization, we avoid the check in that case.
 3492          */
 3493         if ((m->flags & PG_PCPU_CACHE) == 0 && vm_reserv_free_page(m))
 3494                 return (false);
 3495 #endif
 3496 
 3497         return (true);
 3498 }
 3499 
 3500 /*
 3501  *      vm_page_free_toq:
 3502  *
 3503  *      Returns the given page to the free list, disassociating it
 3504  *      from any VM object.
 3505  *
 3506  *      The object must be locked.  The page must be locked if it is
 3507  *      managed.
 3508  */
 3509 void
 3510 vm_page_free_toq(vm_page_t m)
 3511 {
 3512         struct vm_domain *vmd;
 3513         uma_zone_t zone;
 3514 
 3515         if (!vm_page_free_prep(m))
 3516                 return;
 3517 
 3518         vmd = vm_pagequeue_domain(m);
 3519         zone = vmd->vmd_pgcache[m->pool].zone;
 3520         if ((m->flags & PG_PCPU_CACHE) != 0 && zone != NULL) {
 3521                 uma_zfree(zone, m);
 3522                 return;
 3523         }
 3524         vm_domain_free_lock(vmd);
 3525         vm_phys_free_pages(m, 0);
 3526         vm_domain_free_unlock(vmd);
 3527         vm_domain_freecnt_inc(vmd, 1);
 3528 }
 3529 
 3530 /*
 3531  *      vm_page_free_pages_toq:
 3532  *
 3533  *      Returns a list of pages to the free list, disassociating each
 3534  *      page from any VM object.  In other words, this is equivalent to
 3535  *      calling vm_page_free_toq() for each page on the list.
 3536  *
 3537  *      The objects must be locked.  The pages must be locked if they are
 3538  *      managed.
 3539  */
 3540 void
 3541 vm_page_free_pages_toq(struct spglist *free, bool update_wire_count)
 3542 {
 3543         vm_page_t m;
 3544         int count;
 3545 
 3546         if (SLIST_EMPTY(free))
 3547                 return;
 3548 
 3549         count = 0;
 3550         while ((m = SLIST_FIRST(free)) != NULL) {
 3551                 count++;
 3552                 SLIST_REMOVE_HEAD(free, plinks.s.ss);
 3553                 vm_page_free_toq(m);
 3554         }
 3555 
 3556         if (update_wire_count)
 3557                 vm_wire_sub(count);
 3558 }
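/*
 * Illustrative sketch, not part of the original source: batching pages
 * onto an spglist and freeing them in a single call, as pmap code
 * commonly does.  "pages" and "npages" are assumed caller-supplied, with
 * each page already prepared for freeing (unwired, unmapped, and with
 * the required object and page locks held).
 */
static void
example_free_page_list(vm_page_t *pages, int npages)
{
        struct spglist free;
        int i;

        SLIST_INIT(&free);
        for (i = 0; i < npages; i++)
                SLIST_INSERT_HEAD(&free, pages[i], plinks.s.ss);
        /*
         * Pass true for the second argument only if the freed pages'
         * wirings must still be subtracted from the global wired count.
         */
        vm_page_free_pages_toq(&free, false);
}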
 3559 
 3560 /*
 3561  *      vm_page_wire:
 3562  *
 3563  * Mark this page as wired down.  If the page is fictitious, then
 3564  * its wire count must remain one.
 3565  *
 3566  * The page must be locked.
 3567  */
 3568 void
 3569 vm_page_wire(vm_page_t m)
 3570 {
 3571 
 3572         vm_page_assert_locked(m);
 3573         if ((m->flags & PG_FICTITIOUS) != 0) {
 3574                 KASSERT(m->wire_count == 1,
 3575                     ("vm_page_wire: fictitious page %p's wire count isn't one",
 3576                     m));
 3577                 return;
 3578         }
 3579         if (!vm_page_wired(m)) {
 3580                 KASSERT((m->oflags & VPO_UNMANAGED) == 0 ||
 3581                     m->queue == PQ_NONE,
 3582                     ("vm_page_wire: unmanaged page %p is queued", m));
 3583                 vm_wire_add(1);
 3584         }
 3585         m->wire_count++;
 3586         KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m));
 3587 }
 3588 
 3589 /*
 3590  * vm_page_unwire:
 3591  *
 3592  * Release one wiring of the specified page, potentially allowing it to be
 3593  * paged out.  Returns TRUE if the number of wirings transitions to zero and
 3594  * FALSE otherwise.
 3595  *
 3596  * Only managed pages belonging to an object can be paged out.  If the number
 3597  * of wirings transitions to zero and the page is eligible for page out, then
 3598  * the page is added to the specified paging queue (unless PQ_NONE is
 3599  * specified, in which case the page is dequeued if it belongs to a paging
 3600  * queue).
 3601  *
 3602  * If a page is fictitious, then its wire count must always be one.
 3603  *
 3604  * A managed page must be locked.
 3605  */
 3606 bool
 3607 vm_page_unwire(vm_page_t m, uint8_t queue)
 3608 {
 3609         bool unwired;
 3610 
 3611         KASSERT(queue < PQ_COUNT || queue == PQ_NONE,
 3612             ("vm_page_unwire: invalid queue %u request for page %p",
 3613             queue, m));
 3614         if ((m->oflags & VPO_UNMANAGED) == 0)
 3615                 vm_page_assert_locked(m);
 3616 
 3617         unwired = vm_page_unwire_noq(m);
 3618         if (!unwired || (m->oflags & VPO_UNMANAGED) != 0 || m->object == NULL)
 3619                 return (unwired);
 3620 
 3621         if (vm_page_queue(m) == queue) {
 3622                 if (queue == PQ_ACTIVE)
 3623                         vm_page_reference(m);
 3624                 else if (queue != PQ_NONE)
 3625                         vm_page_requeue(m);
 3626         } else {
 3627                 vm_page_dequeue(m);
 3628                 if (queue != PQ_NONE) {
 3629                         vm_page_enqueue(m, queue);
 3630                         if (queue == PQ_ACTIVE)
 3631                                 /* Initialize act_count. */
 3632                                 vm_page_activate(m);
 3633                 }
 3634         }
 3635         return (unwired);
 3636 }
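/*
 * Illustrative sketch, not part of the original source: pinning a
 * managed page across a copy or I/O and then releasing the wiring,
 * following the locking rules stated above.  "m" is assumed to be a
 * managed page.
 */
static void
example_wire_for_io(vm_page_t m)
{
        vm_page_lock(m);
        vm_page_wire(m);                /* The page cannot be paged out. */
        vm_page_unlock(m);

        /* ... use the page ... */

        vm_page_lock(m);
        /* Requeue onto the inactive queue if this was the last wiring. */
        (void)vm_page_unwire(m, PQ_INACTIVE);
        vm_page_unlock(m);
}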
 3637 
 3638 /*
 3639  *
 3640  * vm_page_unwire_noq:
 3641  *
 3642  * Unwire a page without (re-)inserting it into a page queue.  It is up
 3643  * to the caller to enqueue, requeue, or free the page as appropriate.
 3644  * In most cases, vm_page_unwire() should be used instead.
 3645  */
 3646 bool
 3647 vm_page_unwire_noq(vm_page_t m)
 3648 {
 3649 
 3650         if ((m->oflags & VPO_UNMANAGED) == 0)
 3651                 vm_page_assert_locked(m);
 3652         if ((m->flags & PG_FICTITIOUS) != 0) {
 3653                 KASSERT(m->wire_count == 1,
 3654             ("vm_page_unwire: fictitious page %p's wire count isn't one", m));
 3655                 return (false);
 3656         }
 3657         if (!vm_page_wired(m))
 3658                 panic("vm_page_unwire: page %p's wire count is zero", m);
 3659         m->wire_count--;
 3660         if (m->wire_count == 0) {
 3661                 vm_wire_sub(1);
 3662                 return (true);
 3663         } else
 3664                 return (false);
 3665 }
 3666 
 3667 /*
 3668  *      vm_page_activate:
 3669  *
 3670  *      Put the specified page on the active list (if appropriate).
 3671  *      Ensure that act_count is at least ACT_INIT but do not otherwise
 3672  *      mess with it.
 3673  *
 3674  *      The page must be locked.
 3675  */
 3676 void
 3677 vm_page_activate(vm_page_t m)
 3678 {
 3679 
 3680         vm_page_assert_locked(m);
 3681 
 3682         if (vm_page_wired(m) || (m->oflags & VPO_UNMANAGED) != 0)
 3683                 return;
 3684         if (vm_page_queue(m) == PQ_ACTIVE) {
 3685                 if (m->act_count < ACT_INIT)
 3686                         m->act_count = ACT_INIT;
 3687                 return;
 3688         }
 3689 
 3690         vm_page_dequeue(m);
 3691         if (m->act_count < ACT_INIT)
 3692                 m->act_count = ACT_INIT;
 3693         vm_page_enqueue(m, PQ_ACTIVE);
 3694 }
 3695 
 3696 /*
 3697  * Move the specified page to the tail of the inactive queue, or requeue
 3698  * the page if it is already in the inactive queue.
 3699  *
 3700  * The page must be locked.
 3701  */
 3702 void
 3703 vm_page_deactivate(vm_page_t m)
 3704 {
 3705 
 3706         vm_page_assert_locked(m);
 3707 
 3708         if (vm_page_wired(m) || (m->oflags & VPO_UNMANAGED) != 0)
 3709                 return;
 3710 
 3711         if (!vm_page_inactive(m)) {
 3712                 vm_page_dequeue(m);
 3713                 vm_page_enqueue(m, PQ_INACTIVE);
 3714         } else
 3715                 vm_page_requeue(m);
 3716 }
 3717 
 3718 /*
 3719  * Move the specified page close to the head of the inactive queue,
 3720  * bypassing LRU.  A marker page is used to maintain FIFO ordering.
 3721  * As with regular enqueues, we use a per-CPU batch queue to reduce
 3722  * contention on the page queue lock.
 3723  *
 3724  * The page must be locked.
 3725  */
 3726 void
 3727 vm_page_deactivate_noreuse(vm_page_t m)
 3728 {
 3729 
 3730         vm_page_assert_locked(m);
 3731 
 3732         if (vm_page_wired(m) || (m->oflags & VPO_UNMANAGED) != 0)
 3733                 return;
 3734 
 3735         if (!vm_page_inactive(m)) {
 3736                 vm_page_dequeue(m);
 3737                 m->queue = PQ_INACTIVE;
 3738         }
 3739         if ((m->aflags & PGA_REQUEUE_HEAD) == 0)
 3740                 vm_page_aflag_set(m, PGA_REQUEUE_HEAD);
 3741         vm_pqbatch_submit_page(m, PQ_INACTIVE);
 3742 }
 3743 
 3744 /*
 3745  * vm_page_launder
 3746  *
 3747  *      Put a page in the laundry, or requeue it if it is already there.
 3748  */
 3749 void
 3750 vm_page_launder(vm_page_t m)
 3751 {
 3752 
 3753         vm_page_assert_locked(m);
 3754         if (vm_page_wired(m) || (m->oflags & VPO_UNMANAGED) != 0)
 3755                 return;
 3756 
 3757         if (vm_page_in_laundry(m))
 3758                 vm_page_requeue(m);
 3759         else {
 3760                 vm_page_dequeue(m);
 3761                 vm_page_enqueue(m, PQ_LAUNDRY);
 3762         }
 3763 }
 3764 
 3765 /*
 3766  * vm_page_unswappable
 3767  *
 3768  *      Put a page in the PQ_UNSWAPPABLE holding queue.
 3769  */
 3770 void
 3771 vm_page_unswappable(vm_page_t m)
 3772 {
 3773 
 3774         vm_page_assert_locked(m);
 3775         KASSERT(!vm_page_wired(m) && (m->oflags & VPO_UNMANAGED) == 0,
 3776             ("page %p already unswappable", m));
 3777 
 3778         vm_page_dequeue(m);
 3779         vm_page_enqueue(m, PQ_UNSWAPPABLE);
 3780 }
 3781 
 3782 static void
 3783 vm_page_release_toq(vm_page_t m, int flags)
 3784 {
 3785 
 3786         /*
 3787          * Use a check of the valid bits to determine whether we should
 3788          * accelerate reclamation of the page.  The object lock might not be
 3789          * held here, in which case the check is racy.  At worst we will either
 3790          * accelerate reclamation of a valid page and violate LRU, or
 3791          * unnecessarily defer reclamation of an invalid page.
 3792          *
 3793          * If we were asked to not cache the page, place it near the head of the
 3794          * inactive queue so that it is reclaimed sooner.
 3795          */
 3796         if ((flags & (VPR_TRYFREE | VPR_NOREUSE)) != 0 || m->valid == 0)
 3797                 vm_page_deactivate_noreuse(m);
 3798         else if (vm_page_active(m))
 3799                 vm_page_reference(m);
 3800         else
 3801                 vm_page_deactivate(m);
 3802 }
 3803 
 3804 /*
 3805  * Unwire a page and either attempt to free it or re-add it to the page queues.
 3806  */
 3807 void
 3808 vm_page_release(vm_page_t m, int flags)
 3809 {
 3810         vm_object_t object;
 3811         bool freed;
 3812 
 3813         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 3814             ("vm_page_release: page %p is unmanaged", m));
 3815 
 3816         vm_page_lock(m);
 3817         if (m->object != NULL)
 3818                 VM_OBJECT_ASSERT_UNLOCKED(m->object);
 3819         if (vm_page_unwire_noq(m)) {
 3820                 if ((object = m->object) == NULL) {
 3821                         vm_page_free(m);
 3822                 } else {
 3823                         freed = false;
 3824                         if ((flags & VPR_TRYFREE) != 0 && !vm_page_busied(m) &&
 3825                             /* Depends on type stability. */
 3826                             VM_OBJECT_TRYWLOCK(object)) {
 3827                                 /*
 3828                                  * Only free unmapped pages.  The busy test from
 3829                                  * before the object was locked cannot be relied
 3830                                  * upon.
 3831                                  */
 3832                                 if ((object->ref_count == 0 ||
 3833                                     !pmap_page_is_mapped(m)) && m->dirty == 0 &&
 3834                                     !vm_page_busied(m)) {
 3835                                         vm_page_free(m);
 3836                                         freed = true;
 3837                                 }
 3838                                 VM_OBJECT_WUNLOCK(object);
 3839                         }
 3840 
 3841                         if (!freed)
 3842                                 vm_page_release_toq(m, flags);
 3843                 }
 3844         }
 3845         vm_page_unlock(m);
 3846 }
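/*
 * Illustrative sketch, not part of the original source: releasing a
 * borrowed, wired page when an I/O completes.  "m" is assumed to be a
 * managed page whose object is not locked by the caller.
 */
static void
example_release_after_io(vm_page_t m)
{
        /*
         * With VPR_TRYFREE the page is freed immediately when it is
         * clean, unmapped, and unbusied; otherwise it is placed near the
         * head of the inactive queue for early reclamation.
         */
        vm_page_release(m, VPR_TRYFREE);
}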
 3847 
 3848 /* See vm_page_release(). */
 3849 void
 3850 vm_page_release_locked(vm_page_t m, int flags)
 3851 {
 3852 
 3853         VM_OBJECT_ASSERT_WLOCKED(m->object);
 3854         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 3855             ("vm_page_release_locked: page %p is unmanaged", m));
 3856 
 3857         vm_page_lock(m);
 3858         if (vm_page_unwire_noq(m)) {
 3859                 if ((flags & VPR_TRYFREE) != 0 &&
 3860                     (m->object->ref_count == 0 || !pmap_page_is_mapped(m)) &&
 3861                     m->dirty == 0 && !vm_page_busied(m)) {
 3862                         vm_page_free(m);
 3863                 } else {
 3864                         vm_page_release_toq(m, flags);
 3865                 }
 3866         }
 3867         vm_page_unlock(m);
 3868 }
 3869 
 3870 /*
 3871  * vm_page_advise
 3872  *
 3873  *      Apply the specified advice to the given page.
 3874  *
 3875  *      The object and page must be locked.
 3876  */
 3877 void
 3878 vm_page_advise(vm_page_t m, int advice)
 3879 {
 3880 
 3881         vm_page_assert_locked(m);
 3882         VM_OBJECT_ASSERT_WLOCKED(m->object);
 3883         if (advice == MADV_FREE)
 3884                 /*
 3885                  * Mark the page clean.  This will allow the page to be freed
 3886                  * without first paging it out.  MADV_FREE pages are often
 3887                  * quickly reused by malloc(3), so we do not do anything that
 3888                  * would result in a page fault on a later access.
 3889                  */
 3890                 vm_page_undirty(m);
 3891         else if (advice != MADV_DONTNEED) {
 3892                 if (advice == MADV_WILLNEED)
 3893                         vm_page_activate(m);
 3894                 return;
 3895         }
 3896 
 3897         /*
 3898          * Clear any references to the page.  Otherwise, the page daemon will
 3899          * immediately reactivate the page.
 3900          */
 3901         vm_page_aflag_clear(m, PGA_REFERENCED);
 3902 
 3903         if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m))
 3904                 vm_page_dirty(m);
 3905 
 3906         /*
 3907          * Place clean pages near the head of the inactive queue rather than
 3908          * the tail, thus defeating the queue's LRU operation and ensuring that
 3909          * the page will be reused quickly.  Dirty pages not already in the
 3910          * laundry are moved there.
 3911          */
 3912         if (m->dirty == 0)
 3913                 vm_page_deactivate_noreuse(m);
 3914         else if (!vm_page_in_laundry(m))
 3915                 vm_page_launder(m);
 3916 }
 3917 
 3918 /*
 3919  * Grab a page, waiting until we are woken up due to the page
 3920  * changing state.  We keep on waiting while the page continues to be
 3921  * busy in the object.  If the page doesn't exist, first allocate it
 3922  * and then conditionally zero it.
 3923  *
 3924  * This routine may sleep.
 3925  *
 3926  * The object must be locked on entry.  The lock will, however, be released
 3927  * and reacquired if the routine sleeps.
 3928  */
 3929 vm_page_t
 3930 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
 3931 {
 3932         vm_page_t m;
 3933         int sleep;
 3934         int pflags;
 3935 
 3936         VM_OBJECT_ASSERT_WLOCKED(object);
 3937         KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
 3938             (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
 3939             ("vm_page_grab: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch"));
 3940         pflags = allocflags &
 3941             ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
 3942         if ((allocflags & VM_ALLOC_NOWAIT) == 0)
 3943                 pflags |= VM_ALLOC_WAITFAIL;
 3944 retrylookup:
 3945         if ((m = vm_page_lookup(object, pindex)) != NULL) {
 3946                 sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ?
 3947                     vm_page_xbusied(m) : vm_page_busied(m);
 3948                 if (sleep) {
 3949                         if ((allocflags & VM_ALLOC_NOWAIT) != 0)
 3950                                 return (NULL);
 3951                         /*
 3952                          * Reference the page before unlocking and
 3953                          * sleeping so that the page daemon is less
 3954                          * likely to reclaim it.
 3955                          */
 3956                         vm_page_aflag_set(m, PGA_REFERENCED);
 3957                         vm_page_lock(m);
 3958                         VM_OBJECT_WUNLOCK(object);
 3959                         vm_page_busy_sleep(m, "pgrbwt", (allocflags &
 3960                             VM_ALLOC_IGN_SBUSY) != 0);
 3961                         VM_OBJECT_WLOCK(object);
 3962                         goto retrylookup;
 3963                 } else {
 3964                         if ((allocflags & VM_ALLOC_WIRED) != 0) {
 3965                                 vm_page_lock(m);
 3966                                 vm_page_wire(m);
 3967                                 vm_page_unlock(m);
 3968                         }
 3969                         if ((allocflags &
 3970                             (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0)
 3971                                 vm_page_xbusy(m);
 3972                         if ((allocflags & VM_ALLOC_SBUSY) != 0)
 3973                                 vm_page_sbusy(m);
 3974                         return (m);
 3975                 }
 3976         }
 3977         m = vm_page_alloc(object, pindex, pflags);
 3978         if (m == NULL) {
 3979                 if ((allocflags & VM_ALLOC_NOWAIT) != 0)
 3980                         return (NULL);
 3981                 goto retrylookup;
 3982         }
 3983         if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
 3984                 pmap_zero_page(m);
 3985         return (m);
 3986 }
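/*
 * Illustrative sketch, not part of the original source: grabbing a wired
 * page at a given index, per the comment above.  "object" is assumed
 * write-locked.  VM_ALLOC_ZERO only zeroes a newly allocated page; an
 * existing page's contents, valid bits, and dirty bits are untouched.
 */
static vm_page_t
example_grab_wired(vm_object_t object, vm_pindex_t pindex)
{
        vm_page_t m;

        VM_OBJECT_ASSERT_WLOCKED(object);
        /*
         * Without VM_ALLOC_NOWAIT the grab sleeps until it succeeds, so
         * NULL is never returned here.  The page comes back wired and
         * exclusive-busied; the caller unbusies it when finished.
         */
        m = vm_page_grab(object, pindex,
            VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
        return (m);
}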
 3987 
 3988 /*
 3989  * Return the specified range of pages from the given object.  For each
 3990  * page offset within the range, if a page already exists within the object
 3991  * at that offset and it is busy, then wait for it to change state.  If,
 3992  * instead, the page doesn't exist, then allocate it.
 3993  *
 3994  * The caller must always specify an allocation class.
 3995  *
 3996  * allocation classes:
 3997  *      VM_ALLOC_NORMAL         normal process request
 3998  *      VM_ALLOC_SYSTEM         system *really* needs the pages
 3999  *
 4000  * The caller must always specify that the pages are to be busied and/or
 4001  * wired.
 4002  *
 4003  * optional allocation flags:
 4004  *      VM_ALLOC_IGN_SBUSY      do not sleep on soft busy pages
 4005  *      VM_ALLOC_NOBUSY         do not exclusive busy the page
 4006  *      VM_ALLOC_NOWAIT         do not sleep
 4007  *      VM_ALLOC_SBUSY          set page to sbusy state
 4008  *      VM_ALLOC_WIRED          wire the pages
 4009  *      VM_ALLOC_ZERO           zero and validate any invalid pages
 4010  *
 4011  * If VM_ALLOC_NOWAIT is not specified, this routine may sleep.  Otherwise, it
 4012  * may return a partial prefix of the requested range.
 4013  */
 4014 int
 4015 vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags,
 4016     vm_page_t *ma, int count)
 4017 {
 4018         vm_page_t m, mpred;
 4019         int pflags;
 4020         int i;
 4021         bool sleep;
 4022 
 4023         VM_OBJECT_ASSERT_WLOCKED(object);
 4024         KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0,
 4025             ("vm_page_grab_pages: VM_ALLOC_COUNT() is not allowed"));
 4026         KASSERT((allocflags & VM_ALLOC_NOBUSY) == 0 ||
 4027             (allocflags & VM_ALLOC_WIRED) != 0,
 4028             ("vm_page_grab_pages: the pages must be busied or wired"));
 4029         KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
 4030             (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
 4031             ("vm_page_grab_pages: VM_ALLOC_SBUSY/IGN_SBUSY mismatch"));
 4032         if (count == 0)
 4033                 return (0);
 4034         pflags = allocflags & ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK |
 4035             VM_ALLOC_WAITFAIL | VM_ALLOC_IGN_SBUSY);
 4036         if ((allocflags & VM_ALLOC_NOWAIT) == 0)
 4037                 pflags |= VM_ALLOC_WAITFAIL;
 4038         i = 0;
 4039 retrylookup:
 4040         m = vm_radix_lookup_le(&object->rtree, pindex + i);
 4041         if (m == NULL || m->pindex != pindex + i) {
 4042                 mpred = m;
 4043                 m = NULL;
 4044         } else
 4045                 mpred = TAILQ_PREV(m, pglist, listq);
 4046         for (; i < count; i++) {
 4047                 if (m != NULL) {
 4048                         sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ?
 4049                             vm_page_xbusied(m) : vm_page_busied(m);
 4050                         if (sleep) {
 4051                                 if ((allocflags & VM_ALLOC_NOWAIT) != 0)
 4052                                         break;
 4053                                 /*
 4054                                  * Reference the page before unlocking and
 4055                                  * sleeping so that the page daemon is less
 4056                                  * likely to reclaim it.
 4057                                  */
 4058                                 vm_page_aflag_set(m, PGA_REFERENCED);
 4059                                 vm_page_lock(m);
 4060                                 VM_OBJECT_WUNLOCK(object);
 4061                                 vm_page_busy_sleep(m, "grbmaw", (allocflags &
 4062                                     VM_ALLOC_IGN_SBUSY) != 0);
 4063                                 VM_OBJECT_WLOCK(object);
 4064                                 goto retrylookup;
 4065                         }
 4066                         if ((allocflags & VM_ALLOC_WIRED) != 0) {
 4067                                 vm_page_lock(m);
 4068                                 vm_page_wire(m);
 4069                                 vm_page_unlock(m);
 4070                         }
 4071                         if ((allocflags & (VM_ALLOC_NOBUSY |
 4072                             VM_ALLOC_SBUSY)) == 0)
 4073                                 vm_page_xbusy(m);
 4074                         if ((allocflags & VM_ALLOC_SBUSY) != 0)
 4075                                 vm_page_sbusy(m);
 4076                 } else {
 4077                         m = vm_page_alloc_after(object, pindex + i,
 4078                             pflags | VM_ALLOC_COUNT(count - i), mpred);
 4079                         if (m == NULL) {
 4080                                 if ((allocflags & VM_ALLOC_NOWAIT) != 0)
 4081                                         break;
 4082                                 goto retrylookup;
 4083                         }
 4084                 }
 4085                 if (m->valid == 0 && (allocflags & VM_ALLOC_ZERO) != 0) {
 4086                         if ((m->flags & PG_ZERO) == 0)
 4087                                 pmap_zero_page(m);
 4088                         m->valid = VM_PAGE_BITS_ALL;
 4089                 }
 4090                 ma[i] = mpred = m;
 4091                 m = vm_page_next(m);
 4092         }
 4093         return (i);
 4094 }
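/*
 * Illustrative sketch, not part of the original source: grabbing a run
 * of pages without sleeping and coping with the partial prefix that
 * VM_ALLOC_NOWAIT permits, per the comment above.  "object" is assumed
 * write-locked and "ma" is assumed to have room for "count" pages; the
 * returned pages are wired and exclusive-busied.
 */
static int
example_grab_run(vm_object_t object, vm_pindex_t pindex, vm_page_t *ma,
    int count)
{
        int got;

        VM_OBJECT_ASSERT_WLOCKED(object);
        got = vm_page_grab_pages(object, pindex,
            VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_NOWAIT, ma, count);
        /* "got" may be any prefix length from 0 to "count". */
        return (got);
}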
 4095 
 4096 /*
 4097  * Mapping function for valid or dirty bits in a page.
 4098  *
 4099  * Inputs are required to range within a page.
 4100  */
 4101 vm_page_bits_t
 4102 vm_page_bits(int base, int size)
 4103 {
 4104         int first_bit;
 4105         int last_bit;
 4106 
 4107         KASSERT(
 4108             base + size <= PAGE_SIZE,
 4109             ("vm_page_bits: illegal base/size %d/%d", base, size)
 4110         );
 4111 
 4112         if (size == 0)          /* handle degenerate case */
 4113                 return (0);
 4114 
 4115         first_bit = base >> DEV_BSHIFT;
 4116         last_bit = (base + size - 1) >> DEV_BSHIFT;
 4117 
 4118         return (((vm_page_bits_t)2 << last_bit) -
 4119             ((vm_page_bits_t)1 << first_bit));
 4120 }
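/*
 * Editor's note: a worked example of the mapping above, assuming the
 * common DEV_BSIZE of 512 bytes (DEV_BSHIFT == 9).  For base = 512 and
 * size = 1024, first_bit = 512 >> 9 = 1 and last_bit = 1535 >> 9 = 2, so
 * the result is (2 << 2) - (1 << 1) = 6: bits 1 and 2 are set, covering
 * the second and third 512-byte blocks of the page.
 */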
 4121 
 4122 /*
 4123  *      vm_page_set_valid_range:
 4124  *
 4125  *      Sets portions of a page valid.  The arguments are expected
 4126  *      to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
 4127  *      of any partial chunks touched by the range.  The invalid portion of
 4128  *      such chunks will be zeroed.
 4129  *
 4130  *      (base + size) must be less than or equal to PAGE_SIZE.
 4131  */
 4132 void
 4133 vm_page_set_valid_range(vm_page_t m, int base, int size)
 4134 {
 4135         int endoff, frag;
 4136 
 4137         VM_OBJECT_ASSERT_WLOCKED(m->object);
 4138         if (size == 0)  /* handle degenerate case */
 4139                 return;
 4140 
 4141         /*
 4142          * If the base is not DEV_BSIZE aligned and the valid
 4143          * bit is clear, we have to zero out a portion of the
 4144          * first block.
 4145          */
 4146         if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
 4147             (m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
 4148                 pmap_zero_page_area(m, frag, base - frag);
 4149 
 4150         /*
 4151          * If the ending offset is not DEV_BSIZE aligned and the
 4152          * valid bit is clear, we have to zero out a portion of
 4153          * the last block.
 4154          */
 4155         endoff = base + size;
 4156         if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
 4157             (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
 4158                 pmap_zero_page_area(m, endoff,
 4159                     DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
 4160 
 4161         /*
 4162          * Assert that no previously invalid block that is now being validated
 4163          * is already dirty.
 4164          */
 4165         KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
 4166             ("vm_page_set_valid_range: page %p is dirty", m));
 4167 
 4168         /*
 4169          * Set valid bits inclusive of any overlap.
 4170          */
 4171         m->valid |= vm_page_bits(base, size);
 4172 }
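/*
 * Editor's note: a worked example of the partial-chunk handling above,
 * assuming DEV_BSIZE == 512.  For base = 100 and size = 300 the range
 * ends at 400, so both edges fall inside the first 512-byte block.  If
 * that block's valid bit is clear, bytes [0, 100) and [400, 512) are
 * zeroed, and vm_page_bits(100, 300) yields 0x1, marking the entire
 * first block valid.
 */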
 4173 
 4174 /*
 4175  * Clear the given bits from the specified page's dirty field.
 4176  */
 4177 static __inline void
 4178 vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits)
 4179 {
 4180         uintptr_t addr;
 4181 #if PAGE_SIZE < 16384
 4182         int shift;
 4183 #endif
 4184 
 4185         /*
 4186          * If the object is locked and the page is neither exclusive busy nor
 4187          * write mapped, then the page's dirty field cannot possibly be
 4188          * set by a concurrent pmap operation.
 4189          */
 4190         VM_OBJECT_ASSERT_WLOCKED(m->object);
 4191         if (!vm_page_xbusied(m) && !pmap_page_is_write_mapped(m))
 4192                 m->dirty &= ~pagebits;
 4193         else {
 4194                 /*
 4195                  * The pmap layer can call vm_page_dirty() without
 4196                  * holding a distinguished lock.  The combination of
 4197                  * the object's lock and an atomic operation suffices
 4198                  * to guarantee consistency of the page dirty field.
 4199                  *
 4200                  * For the PAGE_SIZE == 32768 case, the compiler already
 4201                  * properly aligns the dirty field, so no forcible
 4202                  * alignment is needed.  Only the existence of
 4203                  * atomic_clear_64 is required when the page size is 32768.
 4204                  */
 4205                 addr = (uintptr_t)&m->dirty;
 4206 #if PAGE_SIZE == 32768
 4207                 atomic_clear_64((uint64_t *)addr, pagebits);
 4208 #elif PAGE_SIZE == 16384
 4209                 atomic_clear_32((uint32_t *)addr, pagebits);
 4210 #else           /* PAGE_SIZE <= 8192 */
 4211                 /*
 4212                  * Use a trick: perform a 32-bit atomic operation on the
 4213                  * containing aligned word, so as not to depend on the
 4214                  * existence of atomic_clear_{8, 16}.
 4215                  */
 4216                 shift = addr & (sizeof(uint32_t) - 1);
 4217 #if BYTE_ORDER == BIG_ENDIAN
 4218                 shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY;
 4219 #else
 4220                 shift *= NBBY;
 4221 #endif
 4222                 addr &= ~(sizeof(uint32_t) - 1);
 4223                 atomic_clear_32((uint32_t *)addr, pagebits << shift);
 4224 #endif          /* PAGE_SIZE */
 4225         }
 4226 }
 4227 
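/*
 * Editor's note (not part of the original source): a minimal sketch of the
 * containing-word trick used above, assuming a little-endian machine and an
 * 8-bit dirty field (PAGE_SIZE <= 8192).  The helper name and its use are
 * hypothetical.
 */
#if 0   /* illustration only */
static void
clear_bits_in_subword(uint8_t *field, uint8_t bits)
{
        uintptr_t addr;
        int shift;

        /*
         * Find the naturally aligned 32-bit word that contains *field,
         * shift the mask to the byte's position inside that word, and
         * clear the bits with atomic_clear_32(), avoiding any need for
         * an atomic_clear_8() primitive.
         */
        addr = (uintptr_t)field;
        shift = (addr & (sizeof(uint32_t) - 1)) * NBBY;
        addr &= ~(uintptr_t)(sizeof(uint32_t) - 1);
        atomic_clear_32((uint32_t *)addr, (uint32_t)bits << shift);
}
#endif
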
 4228 /*
 4229  *      vm_page_set_validclean:
 4230  *
 4231  *      Sets portions of a page valid and clean.  The arguments are expected
 4232  *      to be DEV_BSIZE aligned, but if they aren't, the bitmap is inclusive
 4233  *      of any partial chunks touched by the range.  The invalid portion of
 4234  *      such chunks will be zeroed.
 4235  *
 4236  *      (base + size) must be less than or equal to PAGE_SIZE.
 4237  */
 4238 void
 4239 vm_page_set_validclean(vm_page_t m, int base, int size)
 4240 {
 4241         vm_page_bits_t oldvalid, pagebits;
 4242         int endoff, frag;
 4243 
 4244         VM_OBJECT_ASSERT_WLOCKED(m->object);
 4245         if (size == 0)  /* handle degenerate case */
 4246                 return;
 4247 
 4248         /*
 4249          * If the base is not DEV_BSIZE aligned and the valid
 4250          * bit is clear, we have to zero out a portion of the
 4251          * first block.
 4252          */
 4253         if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
 4254             (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
 4255                 pmap_zero_page_area(m, frag, base - frag);
 4256 
 4257         /*
 4258          * If the ending offset is not DEV_BSIZE aligned and the
 4259          * valid bit is clear, we have to zero out a portion of
 4260          * the last block.
 4261          */
 4262         endoff = base + size;
 4263         if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
 4264             (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
 4265                 pmap_zero_page_area(m, endoff,
 4266                     DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
 4267 
 4268         /*
 4269          * Set valid, clear dirty bits.  If validating the entire
 4270          * page we can safely clear the pmap modify bit.  We also
 4271          * use this opportunity to clear the VPO_NOSYNC flag.  If a process
 4272          * takes a write fault on a MAP_NOSYNC memory area the flag will
 4273          * be set again.
 4274          *
 4275          * We set valid bits inclusive of any overlap, but we can only
 4276          * clear dirty bits for DEV_BSIZE chunks that are fully within
 4277          * the range.
 4278          */
 4279         oldvalid = m->valid;
 4280         pagebits = vm_page_bits(base, size);
 4281         m->valid |= pagebits;
 4282 #if 0   /* NOT YET */
 4283         if ((frag = base & (DEV_BSIZE - 1)) != 0) {
 4284                 frag = DEV_BSIZE - frag;
 4285                 base += frag;
 4286                 size -= frag;
 4287                 if (size < 0)
 4288                         size = 0;
 4289         }
 4290         pagebits = vm_page_bits(base, rounddown2(size, DEV_BSIZE));
 4291 #endif
 4292         if (base == 0 && size == PAGE_SIZE) {
 4293                 /*
 4294                  * The page can only be modified within the pmap if it is
 4295                  * mapped, and it can only be mapped if it was previously
 4296                  * fully valid.
 4297                  */
 4298                 if (oldvalid == VM_PAGE_BITS_ALL)
 4299                         /*
 4300                          * Perform the pmap_clear_modify() first.  Otherwise,
 4301                          * a concurrent pmap operation, such as
 4302                          * pmap_protect(), could clear a modification in the
 4303                          * pmap and set the dirty field on the page before
 4304                          * pmap_clear_modify() had begun and after the dirty
 4305                          * field was cleared here.
 4306                          */
 4307                         pmap_clear_modify(m);
 4308                 m->dirty = 0;
 4309                 m->oflags &= ~VPO_NOSYNC;
 4310         } else if (oldvalid != VM_PAGE_BITS_ALL)
 4311                 m->dirty &= ~pagebits;
 4312         else
 4313                 vm_page_clear_dirty_mask(m, pagebits);
 4314 }
 4315 
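/*
 * Editor's note (not part of the original source): a hedged usage sketch for
 * vm_page_set_validclean().  When the whole page is validated, the pmap
 * modify bit and the VPO_NOSYNC flag are cleared as well.
 */
#if 0   /* illustration only */
static void
vm_page_set_validclean_example(vm_page_t m)
{

        VM_OBJECT_WLOCK(m->object);
        /* After a full-page read completes, mark it entirely valid and clean. */
        vm_page_set_validclean(m, 0, PAGE_SIZE);
        KASSERT(m->valid == VM_PAGE_BITS_ALL && m->dirty == 0,
            ("page should be valid and clean"));
        VM_OBJECT_WUNLOCK(m->object);
}
#endif
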
 4316 void
 4317 vm_page_clear_dirty(vm_page_t m, int base, int size)
 4318 {
 4319 
 4320         vm_page_clear_dirty_mask(m, vm_page_bits(base, size));
 4321 }
 4322 
 4323 /*
 4324  *      vm_page_set_invalid:
 4325  *
 4326  *      Invalidates DEV_BSIZE'd chunks within a page.  Both the
 4327  *      valid and dirty bits for the affected areas are cleared.
 4328  */
 4329 void
 4330 vm_page_set_invalid(vm_page_t m, int base, int size)
 4331 {
 4332         vm_page_bits_t bits;
 4333         vm_object_t object;
 4334 
 4335         object = m->object;
 4336         VM_OBJECT_ASSERT_WLOCKED(object);
 4337         if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) +
 4338             size >= object->un_pager.vnp.vnp_size)
 4339                 bits = VM_PAGE_BITS_ALL;
 4340         else
 4341                 bits = vm_page_bits(base, size);
 4342         if (object->ref_count != 0 && m->valid == VM_PAGE_BITS_ALL &&
 4343             bits != 0)
 4344                 pmap_remove_all(m);
 4345         KASSERT((bits == 0 && m->valid == VM_PAGE_BITS_ALL) ||
 4346             !pmap_page_is_mapped(m),
 4347             ("vm_page_set_invalid: page %p is mapped", m));
 4348         m->valid &= ~bits;
 4349         m->dirty &= ~bits;
 4350 }
 4351 
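/*
 * Editor's note (not part of the original source): a hedged sketch of
 * vm_page_set_invalid(), assuming PAGE_SIZE == 4096 and DEV_BSIZE == 512.
 * For a vnode-backed object, a range that starts at base 0 and reaches the
 * end of the file invalidates the whole page instead of just the named
 * chunks.
 */
#if 0   /* illustration only */
static void
vm_page_set_invalid_example(vm_page_t m)
{

        VM_OBJECT_WLOCK(m->object);
        /* Invalidate chunks 1 and 2 (bytes 512..1535): mask 0x06. */
        vm_page_set_invalid(m, 512, 1024);
        KASSERT((m->valid & 0x06) == 0 && (m->dirty & 0x06) == 0,
            ("chunks 1-2 should be invalid and clean"));
        VM_OBJECT_WUNLOCK(m->object);
}
#endif
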
 4352 /*
 4353  * vm_page_zero_invalid()
 4354  *
 4355  *      The kernel assumes that the invalid portions of a page contain
 4356  *      garbage, but such pages can be mapped into memory by user code.
 4357  *      When this occurs, we must zero out the non-valid portions of the
 4358  *      page so user code sees what it expects.
 4359  *
 4360  *      Pages are most often semi-valid when the end of a file is mapped
 4361  *      into memory and the file's size is not page aligned.
 4362  */
 4363 void
 4364 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
 4365 {
 4366         int b;
 4367         int i;
 4368 
 4369         VM_OBJECT_ASSERT_WLOCKED(m->object);
 4370         /*
 4371          * Scan the valid bits looking for invalid sections that
 4372          * must be zeroed.  Invalid sub-DEV_BSIZE'd areas (where the
 4373          * valid bit may be set) have already been zeroed by
 4374          * vm_page_set_validclean().
 4375          */
 4376         for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
 4377                 if (i == (PAGE_SIZE / DEV_BSIZE) ||
 4378                     (m->valid & ((vm_page_bits_t)1 << i))) {
 4379                         if (i > b) {
 4380                                 pmap_zero_page_area(m,
 4381                                     b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
 4382                         }
 4383                         b = i + 1;
 4384                 }
 4385         }
 4386 
 4387         /*
 4388          * setvalid is TRUE when we can safely set the zeroed areas
 4389          * as being valid.  We can do this if there are no cache consistency
 4390          * issues, e.g., it is OK to do with UFS, but not OK to do with NFS.
 4391          */
 4392         if (setvalid)
 4393                 m->valid = VM_PAGE_BITS_ALL;
 4394 }
 4395 
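/*
 * Editor's note (not part of the original source): a worked sketch of the
 * scan above, assuming PAGE_SIZE == 4096 and DEV_BSIZE == 512.
 */
#if 0   /* illustration only */
static void
vm_page_zero_invalid_example(vm_page_t m)
{

        VM_OBJECT_WLOCK(m->object);
        /*
         * With m->valid == 0x8f (chunks 0-3 and 7 valid), the loop finds a
         * single invalid run covering chunks 4-6 and zeroes bytes
         * 2048..3583 with one pmap_zero_page_area() call.  Passing TRUE
         * then marks the whole page valid, which is safe for e.g. UFS but
         * not for NFS.
         */
        vm_page_zero_invalid(m, TRUE);
        KASSERT(m->valid == VM_PAGE_BITS_ALL, ("page should be fully valid"));
        VM_OBJECT_WUNLOCK(m->object);
}
#endif
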
 4396 /*
 4397  *      vm_page_is_valid:
 4398  *
 4399  *      Is the (partial) page valid?  Note that in the degenerate
 4400  *      case where size == 0, the result is FALSE if the page is
 4401  *      entirely invalid and TRUE otherwise.
 4402  */
 4403 int
 4404 vm_page_is_valid(vm_page_t m, int base, int size)
 4405 {
 4406         vm_page_bits_t bits;
 4407 
 4408         VM_OBJECT_ASSERT_LOCKED(m->object);
 4409         bits = vm_page_bits(base, size);
 4410         return (m->valid != 0 && (m->valid & bits) == bits);
 4411 }
 4412 
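/*
 * Editor's note (not part of the original source): a brief sketch of typical
 * calls, assuming PAGE_SIZE == 4096 and DEV_BSIZE == 512; the wrapper is
 * hypothetical.
 */
#if 0   /* illustration only */
static void
vm_page_is_valid_example(vm_page_t m)
{

        VM_OBJECT_RLOCK(m->object);
        /* TRUE only if chunks 0 and 1 are both valid. */
        (void)vm_page_is_valid(m, 0, 1024);
        /* size == 0 degenerates to "is any part of the page valid?". */
        (void)vm_page_is_valid(m, 0, 0);
        VM_OBJECT_RUNLOCK(m->object);
}
#endif
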
 4413 /*
 4414  * Returns true if all of the specified predicates are true for the entire
 4415  * (super)page and false otherwise.
 4416  */
 4417 bool
 4418 vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m)
 4419 {
 4420         vm_object_t object;
 4421         int i, npages;
 4422 
 4423         object = m->object;
 4424         if (skip_m != NULL && skip_m->object != object)
 4425                 return (false);
 4426         VM_OBJECT_ASSERT_LOCKED(object);
 4427         npages = atop(pagesizes[m->psind]);
 4428 
 4429         /*
 4430          * The physically contiguous pages that make up a superpage, i.e., a
 4431          * page with a page size index ("psind") greater than zero, will
 4432          * occupy adjacent entries in vm_page_array[].
 4433          */
 4434         for (i = 0; i < npages; i++) {
 4435                 /* Always test object consistency, including "skip_m". */
 4436                 if (m[i].object != object)
 4437                         return (false);
 4438                 if (&m[i] == skip_m)
 4439                         continue;
 4440                 if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(&m[i]))
 4441                         return (false);
 4442                 if ((flags & PS_ALL_DIRTY) != 0) {
 4443                         /*
 4444                          * Calling vm_page_test_dirty() or pmap_is_modified()
 4445                          * might stop this case from spuriously returning
 4446                          * "false".  However, that would require a write lock
 4447                          * on the object containing "m[i]".
 4448                          */
 4449                         if (m[i].dirty != VM_PAGE_BITS_ALL)
 4450                                 return (false);
 4451                 }
 4452                 if ((flags & PS_ALL_VALID) != 0 &&
 4453                     m[i].valid != VM_PAGE_BITS_ALL)
 4454                         return (false);
 4455         }
 4456         return (true);
 4457 }
 4458 
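/*
 * Editor's note (not part of the original source): a hedged sketch of how a
 * caller might use vm_page_ps_test(), assuming "m" is the first base page of
 * a fully populated 2 MB superpage (psind 1) as on amd64; the helper name is
 * hypothetical.
 */
#if 0   /* illustration only */
static bool
superpage_promotable(vm_page_t m)
{

        VM_OBJECT_ASSERT_LOCKED(m->object);
        /*
         * All of the constituent base pages occupy adjacent vm_page_array[]
         * entries; require every one of them to be unbusied and fully valid.
         */
        return (vm_page_ps_test(m, PS_ALL_VALID | PS_NONE_BUSY, NULL));
}
#endif
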
 4459 /*
 4460  * Set the page's dirty bits if the page is modified.
 4461  */
 4462 void
 4463 vm_page_test_dirty(vm_page_t m)
 4464 {
 4465 
 4466         VM_OBJECT_ASSERT_WLOCKED(m->object);
 4467         if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m))
 4468                 vm_page_dirty(m);
 4469 }
 4470 
 4471 void
 4472 vm_page_lock_KBI(vm_page_t m, const char *file, int line)
 4473 {
 4474 
 4475         mtx_lock_flags_(vm_page_lockptr(m), 0, file, line);
 4476 }
 4477 
 4478 void
 4479 vm_page_unlock_KBI(vm_page_t m, const char *file, int line)
 4480 {
 4481 
 4482         mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line);
 4483 }
 4484 
 4485 int
 4486 vm_page_trylock_KBI(vm_page_t m, const char *file, int line)
 4487 {
 4488 
 4489         return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line));
 4490 }
 4491 
 4492 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 4493 void
 4494 vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line)
 4495 {
 4496 
 4497         vm_page_lock_assert_KBI(m, MA_OWNED, file, line);
 4498 }
 4499 
 4500 void
 4501 vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line)
 4502 {
 4503 
 4504         mtx_assert_(vm_page_lockptr(m), a, file, line);
 4505 }
 4506 #endif
 4507 
 4508 #ifdef INVARIANTS
 4509 void
 4510 vm_page_object_lock_assert(vm_page_t m)
 4511 {
 4512 
 4513         /*
 4514          * Certain of the page's fields may only be modified by the
 4515          * holder of the containing object's lock or the exclusive busy
 4516          * holder.  Unfortunately, the holder of the exclusive busy is
 4517          * not recorded, and thus cannot be checked here.
 4518          */
 4519         if (m->object != NULL && !vm_page_xbusied(m))
 4520                 VM_OBJECT_ASSERT_WLOCKED(m->object);
 4521 }
 4522 
 4523 void
 4524 vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits)
 4525 {
 4526 
 4527         if ((bits & PGA_WRITEABLE) == 0)
 4528                 return;
 4529 
 4530         /*
 4531          * The PGA_WRITEABLE flag can only be set if the page is
 4532          * managed, is exclusively busied or the object is locked.
 4533          * Currently, this flag is only set by pmap_enter().
 4534          */
 4535         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 4536             ("PGA_WRITEABLE on unmanaged page"));
 4537         if (!vm_page_xbusied(m))
 4538                 VM_OBJECT_ASSERT_LOCKED(m->object);
 4539 }
 4540 #endif
 4541 
 4542 #include "opt_ddb.h"
 4543 #ifdef DDB
 4544 #include <sys/kernel.h>
 4545 
 4546 #include <ddb/ddb.h>
 4547 
 4548 DB_SHOW_COMMAND(page, vm_page_print_page_info)
 4549 {
 4550 
 4551         db_printf("vm_cnt.v_free_count: %d\n", vm_free_count());
 4552         db_printf("vm_cnt.v_inactive_count: %d\n", vm_inactive_count());
 4553         db_printf("vm_cnt.v_active_count: %d\n", vm_active_count());
 4554         db_printf("vm_cnt.v_laundry_count: %d\n", vm_laundry_count());
 4555         db_printf("vm_cnt.v_wire_count: %d\n", vm_wire_count());
 4556         db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved);
 4557         db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min);
 4558         db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target);
 4559         db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target);
 4560 }
 4561 
 4562 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
 4563 {
 4564         int dom;
 4565 
 4566         db_printf("pq_free %d\n", vm_free_count());
 4567         for (dom = 0; dom < vm_ndomains; dom++) {
 4568                 db_printf(
 4569     "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d pq_unsw %d\n",
 4570                     dom,
 4571                     vm_dom[dom].vmd_page_count,
 4572                     vm_dom[dom].vmd_free_count,
 4573                     vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt,
 4574                     vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt,
 4575                     vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt,
 4576                     vm_dom[dom].vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt);
 4577         }
 4578 }
 4579 
 4580 DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo)
 4581 {
 4582         vm_page_t m;
 4583         boolean_t phys, virt;
 4584 
 4585         if (!have_addr) {
 4586                 db_printf("show pginfo addr\n");
 4587                 return;
 4588         }
 4589 
 4590         phys = strchr(modif, 'p') != NULL;
 4591         virt = strchr(modif, 'v') != NULL;
 4592         if (virt)
 4593                 m = PHYS_TO_VM_PAGE(pmap_kextract(addr));
 4594         else if (phys)
 4595                 m = PHYS_TO_VM_PAGE(addr);
 4596         else
 4597                 m = (vm_page_t)addr;
 4598         db_printf(
 4599     "page %p obj %p pidx 0x%jx phys 0x%jx q %d hold %d wire %d\n"
 4600     "  af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n",
 4601             m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr,
 4602             m->queue, m->hold_count, m->wire_count, m->aflags, m->oflags,
 4603             m->flags, m->act_count, m->busy_lock, m->valid, m->dirty);
 4604 }
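
/*
 * Editor's usage note (not part of the original source): from the ddb prompt
 * the commands above might be invoked roughly as follows; the modifier after
 * the slash selects how the "show pginfo" address is interpreted, and the
 * exact syntax may vary between releases.
 *
 *      db> show page                    global page counters
 *      db> show pageq                   per-domain page queue lengths
 *      db> show pginfo <addr>           addr is a struct vm_page pointer
 *      db> show pginfo/p <addr>         addr is a physical address
 *      db> show pginfo/v <addr>         addr is a kernel virtual address
 */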
 4605 #endif /* DDB */
