FreeBSD/Linux Kernel Cross Reference
sys/uvm/uvm_page.c


    1 /*      $NetBSD: uvm_page.c,v 1.251 2022/10/26 23:38:09 riastradh Exp $ */
    2 
    3 /*-
    4  * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
    5  * All rights reserved.
    6  *
    7  * This code is derived from software contributed to The NetBSD Foundation
    8  * by Andrew Doran.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  *
   19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
   23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   29  * POSSIBILITY OF SUCH DAMAGE.
   30  */
   31 
   32 /*
   33  * Copyright (c) 1997 Charles D. Cranor and Washington University.
   34  * Copyright (c) 1991, 1993, The Regents of the University of California.
   35  *
   36  * All rights reserved.
   37  *
   38  * This code is derived from software contributed to Berkeley by
   39  * The Mach Operating System project at Carnegie-Mellon University.
   40  *
   41  * Redistribution and use in source and binary forms, with or without
   42  * modification, are permitted provided that the following conditions
   43  * are met:
   44  * 1. Redistributions of source code must retain the above copyright
   45  *    notice, this list of conditions and the following disclaimer.
   46  * 2. Redistributions in binary form must reproduce the above copyright
   47  *    notice, this list of conditions and the following disclaimer in the
   48  *    documentation and/or other materials provided with the distribution.
   49  * 3. Neither the name of the University nor the names of its contributors
   50  *    may be used to endorse or promote products derived from this software
   51  *    without specific prior written permission.
   52  *
   53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   63  * SUCH DAMAGE.
   64  *
   65  *      @(#)vm_page.c   8.3 (Berkeley) 3/21/94
   66  * from: Id: uvm_page.c,v 1.1.2.18 1998/02/06 05:24:42 chs Exp
   67  *
   68  *
   69  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
   70  * All rights reserved.
   71  *
   72  * Permission to use, copy, modify and distribute this software and
   73  * its documentation is hereby granted, provided that both the copyright
   74  * notice and this permission notice appear in all copies of the
   75  * software, derivative works or modified versions, and any portions
   76  * thereof, and that both notices appear in supporting documentation.
   77  *
   78  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
   79  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
   80  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
   81  *
   82  * Carnegie Mellon requests users of this software to return to
   83  *
   84  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
   85  *  School of Computer Science
   86  *  Carnegie Mellon University
   87  *  Pittsburgh PA 15213-3890
   88  *
   89  * any improvements or extensions that they make and grant Carnegie the
   90  * rights to redistribute these changes.
   91  */
   92 
   93 /*
   94  * uvm_page.c: page ops.
   95  */
   96 
   97 #include <sys/cdefs.h>
   98 __KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.251 2022/10/26 23:38:09 riastradh Exp $");
   99 
  100 #include "opt_ddb.h"
  101 #include "opt_uvm.h"
  102 #include "opt_uvmhist.h"
  103 #include "opt_readahead.h"
  104 
  105 #include <sys/param.h>
  106 #include <sys/systm.h>
  107 #include <sys/sched.h>
  108 #include <sys/kernel.h>
  109 #include <sys/vnode.h>
  110 #include <sys/proc.h>
  111 #include <sys/radixtree.h>
  112 #include <sys/atomic.h>
  113 #include <sys/cpu.h>
  114 
  115 #include <ddb/db_active.h>
  116 
  117 #include <uvm/uvm.h>
  118 #include <uvm/uvm_ddb.h>
  119 #include <uvm/uvm_pdpolicy.h>
  120 #include <uvm/uvm_pgflcache.h>
  121 
  122 /*
  123  * number of pages per-CPU to reserve for the kernel.
  124  */
  125 #ifndef UVM_RESERVED_PAGES_PER_CPU
  126 #define UVM_RESERVED_PAGES_PER_CPU      5
  127 #endif
  128 int vm_page_reserve_kernel = UVM_RESERVED_PAGES_PER_CPU;
  129 
  130 /*
  131  * physical memory size.
  132  */
  133 psize_t physmem;
  134 
  135 /*
  136  * local variables
  137  */
  138 
  139 /*
  140  * these variables record the values returned by vm_page_bootstrap,
  141  * for debugging purposes.  The implementation of uvm_pageboot_alloc
  142  * and pmap_startup here also uses them internally.
  143  */
  144 
  145 static vaddr_t      virtual_space_start;
  146 static vaddr_t      virtual_space_end;
  147 
  148 /*
  149  * we allocate an initial number of page colors in uvm_page_init(),
  150  * and remember them.  We may re-color pages as cache sizes are
  151  * discovered during the autoconfiguration phase.  But we can never
  152  * free the initial set of buckets, since they are allocated using
  153  * uvm_pageboot_alloc().
  154  */
  155 
  156 static size_t recolored_pages_memsize /* = 0 */;
  157 static char *recolored_pages_mem;
  158 
  159 /*
  160  * freelist locks - one per bucket.
  161  */
  162 
  163 union uvm_freelist_lock uvm_freelist_locks[PGFL_MAX_BUCKETS]
  164     __cacheline_aligned;
  165 
  166 /*
  167  * basic NUMA information.
  168  */
  169 
  170 static struct uvm_page_numa_region {
  171         struct uvm_page_numa_region     *next;
  172         paddr_t                         start;
  173         paddr_t                         size;
  174         u_int                           numa_id;
  175 } *uvm_page_numa_region;
  176 
  177 #ifdef DEBUG
  178 kmutex_t uvm_zerochecklock __cacheline_aligned;
  179 vaddr_t uvm_zerocheckkva;
  180 #endif /* DEBUG */
  181 
  182 /*
  183  * These functions are reserved for uvm(9) internal use and are not
  184  * exported in the header file uvm_physseg.h
  185  *
  186  * Thus they are redefined here.
  187  */
  188 void uvm_physseg_init_seg(uvm_physseg_t, struct vm_page *);
  189 void uvm_physseg_seg_chomp_slab(uvm_physseg_t, struct vm_page *, size_t);
  190 
  191 /* returns a pgs array */
  192 struct vm_page *uvm_physseg_seg_alloc_from_slab(uvm_physseg_t, size_t);
  193 
  194 /*
  195  * inline functions
  196  */
  197 
  198 /*
  199  * uvm_pageinsert: insert a page in the object.
  200  *
  201  * => caller must lock object
  202  * => caller should have already set pg's object and offset pointers
  203  *    and bumped the version counter
  204  */
  205 
  206 static inline void
  207 uvm_pageinsert_object(struct uvm_object *uobj, struct vm_page *pg)
  208 {
  209 
  210         KASSERT(uobj == pg->uobject);
  211         KASSERT(rw_write_held(uobj->vmobjlock));
  212         KASSERT((pg->flags & PG_TABLED) == 0);
  213 
  214         if ((pg->flags & PG_STAT) != 0) {
  215                 /* Cannot use uvm_pagegetdirty(): not yet in radix tree. */
  216                 const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);
  217 
  218                 if ((pg->flags & PG_FILE) != 0) {
  219                         if (uobj->uo_npages == 0) {
  220                                 struct vnode *vp = (struct vnode *)uobj;
  221                                 mutex_enter(vp->v_interlock);
  222                                 KASSERT((vp->v_iflag & VI_PAGES) == 0);
  223                                 vp->v_iflag |= VI_PAGES;
  224                                 vholdl(vp);
  225                                 mutex_exit(vp->v_interlock);
  226                         }
  227                         if (UVM_OBJ_IS_VTEXT(uobj)) {
  228                                 cpu_count(CPU_COUNT_EXECPAGES, 1);
  229                         }
  230                         cpu_count(CPU_COUNT_FILEUNKNOWN + status, 1);
  231                 } else {
  232                         cpu_count(CPU_COUNT_ANONUNKNOWN + status, 1);
  233                 }
  234         }
  235         pg->flags |= PG_TABLED;
  236         uobj->uo_npages++;
  237 }
  238 
  239 static inline int
  240 uvm_pageinsert_tree(struct uvm_object *uobj, struct vm_page *pg)
  241 {
  242         const uint64_t idx = pg->offset >> PAGE_SHIFT;
  243         int error;
  244 
  245         KASSERT(rw_write_held(uobj->vmobjlock));
  246 
  247         error = radix_tree_insert_node(&uobj->uo_pages, idx, pg);
  248         if (error != 0) {
  249                 return error;
  250         }
  251         if ((pg->flags & PG_CLEAN) == 0) {
  252                 uvm_obj_page_set_dirty(pg);
  253         }
  254         KASSERT(((pg->flags & PG_CLEAN) == 0) ==
  255                 uvm_obj_page_dirty_p(pg));
  256         return 0;
  257 }
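
/*
 * Illustrative sketch, not part of the original file: pages are keyed
 * in the object's radix tree by page index rather than byte offset, so
 * with 4 KB pages an offset of 0x3000 lands at index 3.  The helper
 * name below is hypothetical; it only restates the conversion used in
 * uvm_pageinsert_tree() above.
 */
static inline uint64_t __unused
uvm_pageinsert_idx_sketch(voff_t offset)
{

        return (uint64_t)(offset >> PAGE_SHIFT);
}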
  258 
  259 /*
  260  * uvm_page_remove: remove page from object.
  261  *
  262  * => caller must lock object
  263  */
  264 
  265 static inline void
  266 uvm_pageremove_object(struct uvm_object *uobj, struct vm_page *pg)
  267 {
  268 
  269         KASSERT(uobj == pg->uobject);
  270         KASSERT(rw_write_held(uobj->vmobjlock));
  271         KASSERT(pg->flags & PG_TABLED);
  272 
  273         if ((pg->flags & PG_STAT) != 0) {
  274                 /* Cannot use uvm_pagegetdirty(): no longer in radix tree. */
  275                 const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);
  276 
  277                 if ((pg->flags & PG_FILE) != 0) {
  278                         if (uobj->uo_npages == 1) {
  279                                 struct vnode *vp = (struct vnode *)uobj;
  280                                 mutex_enter(vp->v_interlock);
  281                                 KASSERT((vp->v_iflag & VI_PAGES) != 0);
  282                                 vp->v_iflag &= ~VI_PAGES;
  283                                 holdrelel(vp);
  284                                 mutex_exit(vp->v_interlock);
  285                         }
  286                         if (UVM_OBJ_IS_VTEXT(uobj)) {
  287                                 cpu_count(CPU_COUNT_EXECPAGES, -1);
  288                         }
  289                         cpu_count(CPU_COUNT_FILEUNKNOWN + status, -1);
  290                 } else {
  291                         cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
  292                 }
  293         }
  294         uobj->uo_npages--;
  295         pg->flags &= ~PG_TABLED;
  296         pg->uobject = NULL;
  297 }
  298 
  299 static inline void
  300 uvm_pageremove_tree(struct uvm_object *uobj, struct vm_page *pg)
  301 {
  302         struct vm_page *opg __unused;
  303 
  304         KASSERT(rw_write_held(uobj->vmobjlock));
  305 
  306         opg = radix_tree_remove_node(&uobj->uo_pages, pg->offset >> PAGE_SHIFT);
  307         KASSERT(pg == opg);
  308 }
  309 
  310 static void
  311 uvm_page_init_bucket(struct pgfreelist *pgfl, struct pgflbucket *pgb, int num)
  312 {
  313         int i;
  314 
  315         pgb->pgb_nfree = 0;
  316         for (i = 0; i < uvmexp.ncolors; i++) {
  317                 LIST_INIT(&pgb->pgb_colors[i]);
  318         }
  319         pgfl->pgfl_buckets[num] = pgb;
  320 }
  321 
  322 /*
  323  * uvm_page_init: init the page system.   called from uvm_init().
  324  *
  325  * => we return the range of kernel virtual memory in kvm_startp/kvm_endp
  326  */
  327 
  328 void
  329 uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp)
  330 {
  331         static struct uvm_cpu boot_cpu __cacheline_aligned;
  332         psize_t freepages, pagecount, bucketsize, n;
  333         struct pgflbucket *pgb;
  334         struct vm_page *pagearray;
  335         char *bucketarray;
  336         uvm_physseg_t bank;
  337         int fl, b;
  338 
  339         KASSERT(ncpu <= 1);
  340 
  341         /*
  342          * init the page queues and free page queue locks, except the
  343          * free list; we allocate that later (with the initial vm_page
  344          * structures).
  345          */
  346 
  347         curcpu()->ci_data.cpu_uvm = &boot_cpu;
  348         uvmpdpol_init();
  349         for (b = 0; b < __arraycount(uvm_freelist_locks); b++) {
  350                 mutex_init(&uvm_freelist_locks[b].lock, MUTEX_DEFAULT, IPL_VM);
  351         }
  352 
  353         /*
  354          * allocate vm_page structures.
  355          */
  356 
  357         /*
  358          * sanity check:
  359          * before calling this function the MD code is expected to register
  360          * some free RAM with the uvm_page_physload() function.   our job
  361          * now is to allocate vm_page structures for this memory.
  362          */
  363 
  364         if (uvm_physseg_get_last() == UVM_PHYSSEG_TYPE_INVALID)
  365                 panic("uvm_page_bootstrap: no memory pre-allocated");
  366 
  367         /*
  368          * first calculate the number of free pages...
  369          *
  370          * note that we use start/end rather than avail_start/avail_end.
  371          * this allows us to allocate extra vm_page structures in case we
  372          * want to return some memory to the pool after booting.
  373          */
  374 
  375         freepages = 0;
  376 
  377         for (bank = uvm_physseg_get_first();
  378              uvm_physseg_valid_p(bank) ;
  379              bank = uvm_physseg_get_next(bank)) {
  380                 freepages += (uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank));
  381         }
  382 
  383         /*
  384          * Let MD code initialize the number of colors, or default
  385          * to 1 color if MD code doesn't care.
  386          */
  387         if (uvmexp.ncolors == 0)
  388                 uvmexp.ncolors = 1;
  389         uvmexp.colormask = uvmexp.ncolors - 1;
  390         KASSERT((uvmexp.colormask & uvmexp.ncolors) == 0);
  391 
  392         /* We always start with only 1 bucket. */
  393         uvm.bucketcount = 1;
  394 
  395         /*
  396          * we now know we have (PAGE_SIZE * freepages) bytes of memory we can
  397          * use.   for each page of memory we use we need a vm_page structure.
  398          * thus, the total number of pages we can use is the total size of
  399          * the memory divided by the PAGE_SIZE plus the size of the vm_page
  400          * structure.   we add one to freepages as a fudge factor to avoid
  401          * truncation errors (since we can only allocate in terms of whole
  402          * pages).
  403          */
  404         pagecount = ((freepages + 1) << PAGE_SHIFT) /
  405             (PAGE_SIZE + sizeof(struct vm_page));
  406         bucketsize = offsetof(struct pgflbucket, pgb_colors[uvmexp.ncolors]);
  407         bucketsize = roundup2(bucketsize, coherency_unit);
  408         bucketarray = (void *)uvm_pageboot_alloc(
  409             bucketsize * VM_NFREELIST +
  410             pagecount * sizeof(struct vm_page));
  411         pagearray = (struct vm_page *)
  412             (bucketarray + bucketsize * VM_NFREELIST);
  413 
  414         for (fl = 0; fl < VM_NFREELIST; fl++) {
  415                 pgb = (struct pgflbucket *)(bucketarray + bucketsize * fl);
  416                 uvm_page_init_bucket(&uvm.page_free[fl], pgb, 0);
  417         }
  418         memset(pagearray, 0, pagecount * sizeof(struct vm_page));
  419 
  420         /*
  421          * init the freelist cache in the disabled state.
  422          */
  423         uvm_pgflcache_init();
  424 
  425         /*
  426          * init the vm_page structures and put them in the correct place.
  427          */
  428         /* First init the extent */
  429 
  430         for (bank = uvm_physseg_get_first(),
  431                  uvm_physseg_seg_chomp_slab(bank, pagearray, pagecount);
  432              uvm_physseg_valid_p(bank);
  433              bank = uvm_physseg_get_next(bank)) {
  434 
  435                 n = uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank);
  436                 uvm_physseg_seg_alloc_from_slab(bank, n);
  437                 uvm_physseg_init_seg(bank, pagearray);
  438 
  439                 /* set up page array pointers */
  440                 pagearray += n;
  441                 pagecount -= n;
  442         }
  443 
  444         /*
  445          * pass up the values of virtual_space_start and
  446          * virtual_space_end (obtained by uvm_pageboot_alloc) to the upper
  447          * layers of the VM.
  448          */
  449 
  450         *kvm_startp = round_page(virtual_space_start);
  451         *kvm_endp = trunc_page(virtual_space_end);
  452 
  453         /*
  454          * init various thresholds.
  455          */
  456 
  457         uvmexp.reserve_pagedaemon = 1;
  458         uvmexp.reserve_kernel = vm_page_reserve_kernel;
  459 
  460         /*
  461          * done!
  462          */
  463 
  464         uvm.page_init_done = true;
  465 }
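
/*
 * Illustrative sketch, not part of the original file: the bootstrap
 * page count computed in uvm_page_init() solves
 *
 *      pagecount * (PAGE_SIZE + sizeof(struct vm_page))
 *          <= (freepages + 1) * PAGE_SIZE
 *
 * i.e. every page handed to the allocator must also pay for its own
 * vm_page structure, with one page of slack against truncation.  The
 * helper name below is hypothetical.
 */
static inline psize_t __unused
uvm_page_init_pagecount_sketch(psize_t freepages)
{

        return ((freepages + 1) << PAGE_SHIFT) /
            (PAGE_SIZE + sizeof(struct vm_page));
}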
  466 
  467 /*
  468  * uvm_pgfl_lock: lock all freelist buckets
  469  */
  470 
  471 void
  472 uvm_pgfl_lock(void)
  473 {
  474         int i;
  475 
  476         for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
  477                 mutex_spin_enter(&uvm_freelist_locks[i].lock);
  478         }
  479 }
  480 
  481 /*
  482  * uvm_pgfl_unlock: unlock all freelist buckets
  483  */
  484 
  485 void
  486 uvm_pgfl_unlock(void)
  487 {
  488         int i;
  489 
  490         for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
  491                 mutex_spin_exit(&uvm_freelist_locks[i].lock);
  492         }
  493 }
  494 
  495 /*
  496  * uvm_setpagesize: set the page size
  497  *
  498  * => sets page_shift and page_mask from uvmexp.pagesize.
  499  */
  500 
  501 void
  502 uvm_setpagesize(void)
  503 {
  504 
  505         /*
  506          * If uvmexp.pagesize is 0 at this point, we expect PAGE_SIZE
  507          * to be a constant (indicated by being a non-zero value).
  508          */
  509         if (uvmexp.pagesize == 0) {
  510                 if (PAGE_SIZE == 0)
  511                         panic("uvm_setpagesize: uvmexp.pagesize not set");
  512                 uvmexp.pagesize = PAGE_SIZE;
  513         }
  514         uvmexp.pagemask = uvmexp.pagesize - 1;
  515         if ((uvmexp.pagemask & uvmexp.pagesize) != 0)
  516                 panic("uvm_setpagesize: page size %u (%#x) not a power of two",
  517                     uvmexp.pagesize, uvmexp.pagesize);
  518         for (uvmexp.pageshift = 0; ; uvmexp.pageshift++)
  519                 if ((1 << uvmexp.pageshift) == uvmexp.pagesize)
  520                         break;
  521 }
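
/*
 * Illustrative sketch, not part of the original file: for the
 * power-of-two page sizes enforced above, pagemask = pagesize - 1 and
 * pageshift satisfies pagesize == 1 << pageshift; e.g. a 4096 byte
 * page gives pagemask 0xfff and pageshift 12.  A hypothetical helper
 * restating the shift search in uvm_setpagesize():
 */
static inline int __unused
uvm_pageshift_sketch(u_int pagesize)
{
        int shift;

        /* assumes pagesize is a power of two, as checked above */
        for (shift = 0; (1U << shift) != pagesize; shift++)
                continue;
        return shift;
}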
  522 
  523 /*
  524  * uvm_pageboot_alloc: steal memory from physmem for bootstrapping
  525  */
  526 
  527 vaddr_t
  528 uvm_pageboot_alloc(vsize_t size)
  529 {
  530         static bool initialized = false;
  531         vaddr_t addr;
  532 #if !defined(PMAP_STEAL_MEMORY)
  533         vaddr_t vaddr;
  534         paddr_t paddr;
  535 #endif
  536 
  537         /*
  538          * on first call to this function, initialize ourselves.
  539          */
  540         if (initialized == false) {
  541                 pmap_virtual_space(&virtual_space_start, &virtual_space_end);
  542 
  543                 /* round it the way we like it */
  544                 virtual_space_start = round_page(virtual_space_start);
  545                 virtual_space_end = trunc_page(virtual_space_end);
  546 
  547                 initialized = true;
  548         }
  549 
  550         /* round to page size */
  551         size = round_page(size);
  552         uvmexp.bootpages += atop(size);
  553 
  554 #if defined(PMAP_STEAL_MEMORY)
  555 
  556         /*
  557          * defer bootstrap allocation to MD code (it may want to allocate
  558          * from a direct-mapped segment).  pmap_steal_memory should adjust
  559          * virtual_space_start/virtual_space_end if necessary.
  560          */
  561 
  562         addr = pmap_steal_memory(size, &virtual_space_start,
  563             &virtual_space_end);
  564 
  565         return addr;
  566 
  567 #else /* !PMAP_STEAL_MEMORY */
  568 
  569         /*
  570          * allocate virtual memory for this request
  571          */
  572         if (virtual_space_start == virtual_space_end ||
  573             (virtual_space_end - virtual_space_start) < size)
  574                 panic("uvm_pageboot_alloc: out of virtual space");
  575 
  576         addr = virtual_space_start;
  577 
  578 #ifdef PMAP_GROWKERNEL
  579         /*
  580          * If the kernel pmap can't map the requested space,
  581          * then allocate more resources for it.
  582          */
  583         if (uvm_maxkaddr < (addr + size)) {
  584                 uvm_maxkaddr = pmap_growkernel(addr + size);
  585                 if (uvm_maxkaddr < (addr + size))
  586                         panic("uvm_pageboot_alloc: pmap_growkernel() failed");
  587         }
  588 #endif
  589 
  590         virtual_space_start += size;
  591 
  592         /*
  593          * allocate and mapin physical pages to back new virtual pages
  594          */
  595 
  596         for (vaddr = round_page(addr) ; vaddr < addr + size ;
  597             vaddr += PAGE_SIZE) {
  598 
  599                 if (!uvm_page_physget(&paddr))
  600                         panic("uvm_pageboot_alloc: out of memory");
  601 
  602                 /*
  603                  * Note this memory is no longer managed, so using
  604                  * pmap_kenter is safe.
  605                  */
  606                 pmap_kenter_pa(vaddr, paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
  607         }
  608         pmap_update(pmap_kernel());
  609         return addr;
  610 #endif  /* PMAP_STEAL_MEMORY */
  611 }
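
/*
 * Illustrative sketch, not part of the original file: bootstrap
 * requests are rounded up to whole pages before being charged to
 * uvmexp.bootpages and carved out of the boot-time virtual space.  A
 * hypothetical helper equivalent to round_page() for the power-of-two
 * page sizes used here:
 */
static inline vsize_t __unused
uvm_pageboot_roundsize_sketch(vsize_t size)
{

        return (size + PAGE_SIZE - 1) & ~((vsize_t)PAGE_SIZE - 1);
}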
  612 
  613 #if !defined(PMAP_STEAL_MEMORY)
  614 /*
  615  * uvm_page_physget: "steal" one page from the vm_physmem structure.
  616  *
  617  * => attempt to allocate it off the end of a segment in which the "avail"
  618  *    values match the start/end values.   if we can't do that, then we
  619  *    will advance both values (making them equal, and removing some
  620  *    vm_page structures from the non-avail area).
  621  * => return false if out of memory.
  622  */
  623 
  624 /* subroutine: try to allocate from memory chunks on the specified freelist */
  625 static bool uvm_page_physget_freelist(paddr_t *, int);
  626 
  627 static bool
  628 uvm_page_physget_freelist(paddr_t *paddrp, int freelist)
  629 {
  630         uvm_physseg_t lcv;
  631 
  632         /* pass 1: try allocating from a matching end */
  633 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
  634         for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
  635 #else
  636         for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
  637 #endif
  638         {
  639                 if (uvm.page_init_done == true)
  640                         panic("uvm_page_physget: called _after_ bootstrap");
  641 
  642                 /* Try to match at front or back on unused segment */
  643                 if (uvm_page_physunload(lcv, freelist, paddrp))
  644                         return true;
  645         }
  646 
  647         /* pass 2: forget about matching ends, just allocate something */
  648 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
  649         for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
  650 #else
  651         for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
  652 #endif
  653         {
  654                 /* Try the front regardless. */
  655                 if (uvm_page_physunload_force(lcv, freelist, paddrp))
  656                         return true;
  657         }
  658         return false;
  659 }
  660 
  661 bool
  662 uvm_page_physget(paddr_t *paddrp)
  663 {
  664         int i;
  665 
  666         /* try in the order of freelist preference */
  667         for (i = 0; i < VM_NFREELIST; i++)
  668                 if (uvm_page_physget_freelist(paddrp, i) == true)
  669                         return (true);
  670         return (false);
  671 }
  672 #endif /* PMAP_STEAL_MEMORY */
  673 
  674 /*
  675  * PHYS_TO_VM_PAGE: find vm_page for a PA.   used by MI code to get vm_pages
  676  * back from an I/O mapping (ugh!).   used in some MD code as well.
  677  */
  678 struct vm_page *
  679 uvm_phys_to_vm_page(paddr_t pa)
  680 {
  681         paddr_t pf = atop(pa);
  682         paddr_t off;
  683         uvm_physseg_t   upm;
  684 
  685         upm = uvm_physseg_find(pf, &off);
  686         if (upm != UVM_PHYSSEG_TYPE_INVALID)
  687                 return uvm_physseg_get_pg(upm, off);
  688         return(NULL);
  689 }
  690 
  691 paddr_t
  692 uvm_vm_page_to_phys(const struct vm_page *pg)
  693 {
  694 
  695         return pg->phys_addr & ~(PAGE_SIZE - 1);
  696 }
  697 
  698 /*
  699  * uvm_page_numa_load: load NUMA range description.
  700  */
  701 void
  702 uvm_page_numa_load(paddr_t start, paddr_t size, u_int numa_id)
  703 {
  704         struct uvm_page_numa_region *d;
  705 
  706         KASSERT(numa_id < PGFL_MAX_BUCKETS);
  707 
  708         d = kmem_alloc(sizeof(*d), KM_SLEEP);
  709         d->start = start;
  710         d->size = size;
  711         d->numa_id = numa_id;
  712         d->next = uvm_page_numa_region;
  713         uvm_page_numa_region = d;
  714 }
  715 
  716 /*
  717  * uvm_page_numa_lookup: lookup NUMA node for the given page.
  718  */
  719 static u_int
  720 uvm_page_numa_lookup(struct vm_page *pg)
  721 {
  722         struct uvm_page_numa_region *d;
  723         static bool warned;
  724         paddr_t pa;
  725 
  726         KASSERT(uvm_page_numa_region != NULL);
  727 
  728         pa = VM_PAGE_TO_PHYS(pg);
  729         for (d = uvm_page_numa_region; d != NULL; d = d->next) {
  730                 if (pa >= d->start && pa < d->start + d->size) {
  731                         return d->numa_id;
  732                 }
  733         }
  734 
  735         if (!warned) {
  736                 printf("uvm_page_numa_lookup: failed, first pg=%p pa=%#"
  737                     PRIxPADDR "\n", pg, VM_PAGE_TO_PHYS(pg));
  738                 warned = true;
  739         }
  740 
  741         return 0;
  742 }
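
/*
 * Illustrative sketch, not part of the original file: a page belongs
 * to a NUMA region loaded with uvm_page_numa_load() when its physical
 * address falls in [start, start + size).  A hypothetical helper
 * restating the range test in uvm_page_numa_lookup() above:
 */
static inline bool __unused
uvm_page_numa_match_sketch(const struct uvm_page_numa_region *d, paddr_t pa)
{

        return pa >= d->start && pa < d->start + d->size;
}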
  743 
  744 /*
  745  * uvm_page_redim: adjust freelist dimensions if they have changed.
  746  */
  747 
  748 static void
  749 uvm_page_redim(int newncolors, int newnbuckets)
  750 {
  751         struct pgfreelist npgfl;
  752         struct pgflbucket *opgb, *npgb;
  753         struct pgflist *ohead, *nhead;
  754         struct vm_page *pg;
  755         size_t bucketsize, bucketmemsize, oldbucketmemsize;
  756         int fl, ob, oc, nb, nc, obuckets, ocolors;
  757         char *bucketarray, *oldbucketmem, *bucketmem;
  758 
  759         KASSERT(((newncolors - 1) & newncolors) == 0);
  760 
  761         /* Anything to do? */
  762         if (newncolors <= uvmexp.ncolors &&
  763             newnbuckets == uvm.bucketcount) {
  764                 return;
  765         }
  766         if (uvm.page_init_done == false) {
  767                 uvmexp.ncolors = newncolors;
  768                 return;
  769         }
  770 
  771         bucketsize = offsetof(struct pgflbucket, pgb_colors[newncolors]);
  772         bucketsize = roundup2(bucketsize, coherency_unit);
  773         bucketmemsize = bucketsize * newnbuckets * VM_NFREELIST +
  774             coherency_unit - 1;
  775         bucketmem = kmem_zalloc(bucketmemsize, KM_SLEEP);
  776         bucketarray = (char *)roundup2((uintptr_t)bucketmem, coherency_unit);
  777 
  778         ocolors = uvmexp.ncolors;
  779         obuckets = uvm.bucketcount;
  780 
  781         /* Freelist cache mustn't be enabled. */
  782         uvm_pgflcache_pause();
  783 
  784         /* Make sure we should still do this. */
  785         uvm_pgfl_lock();
  786         if (newncolors <= uvmexp.ncolors &&
  787             newnbuckets == uvm.bucketcount) {
  788                 uvm_pgfl_unlock();
  789                 uvm_pgflcache_resume();
  790                 kmem_free(bucketmem, bucketmemsize);
  791                 return;
  792         }
  793 
  794         uvmexp.ncolors = newncolors;
  795         uvmexp.colormask = uvmexp.ncolors - 1;
  796         uvm.bucketcount = newnbuckets;
  797 
  798         for (fl = 0; fl < VM_NFREELIST; fl++) {
  799                 /* Init new buckets in new freelist. */
  800                 memset(&npgfl, 0, sizeof(npgfl));
  801                 for (nb = 0; nb < newnbuckets; nb++) {
  802                         npgb = (struct pgflbucket *)bucketarray;
  803                         uvm_page_init_bucket(&npgfl, npgb, nb);
  804                         bucketarray += bucketsize;
  805                 }
  806                 /* Now transfer pages from the old freelist. */
  807                 for (nb = ob = 0; ob < obuckets; ob++) {
  808                         opgb = uvm.page_free[fl].pgfl_buckets[ob];
  809                         for (oc = 0; oc < ocolors; oc++) {
  810                                 ohead = &opgb->pgb_colors[oc];
  811                                 while ((pg = LIST_FIRST(ohead)) != NULL) {
  812                                         LIST_REMOVE(pg, pageq.list);
  813                                         /*
  814                                          * Here we decide on the NEW color &
  815                                          * bucket for the page.  For NUMA
  816                                          * we'll use the info that the
  817                                          * hardware gave us.  For non-NUMA
  818                                          * we take the physical page frame
  819                                          * number and cache color into
  820                                          * account.  We do this to try and
  821                                          * avoid defeating any memory
  822                                          * interleaving in the hardware.
  823                                          */
  824                                         KASSERT(
  825                                             uvm_page_get_bucket(pg) == ob);
  826                                         KASSERT(fl ==
  827                                             uvm_page_get_freelist(pg));
  828                                         if (uvm_page_numa_region != NULL) {
  829                                                 nb = uvm_page_numa_lookup(pg);
  830                                         } else {
  831                                                 nb = atop(VM_PAGE_TO_PHYS(pg))
  832                                                     / uvmexp.ncolors / 8
  833                                                     % newnbuckets;
  834                                         }
  835                                         uvm_page_set_bucket(pg, nb);
  836                                         npgb = npgfl.pgfl_buckets[nb];
  837                                         npgb->pgb_nfree++;
  838                                         nc = VM_PGCOLOR(pg);
  839                                         nhead = &npgb->pgb_colors[nc];
  840                                         LIST_INSERT_HEAD(nhead, pg, pageq.list);
  841                                 }
  842                         }
  843                 }
  844                 /* Install the new freelist. */
  845                 memcpy(&uvm.page_free[fl], &npgfl, sizeof(npgfl));
  846         }
  847 
  848         /* Unlock and free the old memory. */
  849         oldbucketmemsize = recolored_pages_memsize;
  850         oldbucketmem = recolored_pages_mem;
  851         recolored_pages_memsize = bucketmemsize;
  852         recolored_pages_mem = bucketmem;
  853 
  854         uvm_pgfl_unlock();
  855         uvm_pgflcache_resume();
  856 
  857         if (oldbucketmemsize) {
  858                 kmem_free(oldbucketmem, oldbucketmemsize);
  859         }
  860 
  861         /*
  862          * this calls uvm_km_alloc() which may want to hold
  863          * uvm_freelist_lock.
  864          */
  865         uvm_pager_realloc_emerg();
  866 }
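
/*
 * Illustrative sketch, not part of the original file: the non-NUMA
 * bucket choice in uvm_page_redim() keeps runs of ncolors * 8
 * physically contiguous pages in one bucket and round-robins those
 * runs across buckets, which helps avoid defeating hardware memory
 * interleaving.  A hypothetical helper restating the formula:
 */
static inline int __unused
uvm_page_redim_bucket_sketch(paddr_t pa, int ncolors, int nbuckets)
{

        return (int)(atop(pa) / ncolors / 8 % nbuckets);
}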
  867 
  868 /*
  869  * uvm_page_recolor: Recolor the pages if the new color count is
  870  * larger than the old one.
  871  */
  872 
  873 void
  874 uvm_page_recolor(int newncolors)
  875 {
  876 
  877         uvm_page_redim(newncolors, uvm.bucketcount);
  878 }
  879 
  880 /*
  881  * uvm_page_rebucket: Determine a bucket structure and redim the free
  882  * lists to match.
  883  */
  884 
  885 void
  886 uvm_page_rebucket(void)
  887 {
  888         u_int min_numa, max_numa, npackage, shift;
  889         struct cpu_info *ci, *ci2, *ci3;
  890         CPU_INFO_ITERATOR cii;
  891 
  892         /*
  893          * If we have more than one NUMA node, and the maximum NUMA node ID
  894          * is less than PGFL_MAX_BUCKETS, then we'll use NUMA distribution
  895          * for free pages.
  896          */
  897         min_numa = (u_int)-1;
  898         max_numa = 0;
  899         for (CPU_INFO_FOREACH(cii, ci)) {
  900                 if (ci->ci_numa_id < min_numa) {
  901                         min_numa = ci->ci_numa_id;
  902                 }
  903                 if (ci->ci_numa_id > max_numa) {
  904                         max_numa = ci->ci_numa_id;
  905                 }
  906         }
  907         if (min_numa != max_numa && max_numa < PGFL_MAX_BUCKETS) {
  908                 aprint_debug("UVM: using NUMA allocation scheme\n");
  909                 for (CPU_INFO_FOREACH(cii, ci)) {
  910                         ci->ci_data.cpu_uvm->pgflbucket = ci->ci_numa_id;
  911                 }
  912                 uvm_page_redim(uvmexp.ncolors, max_numa + 1);
  913                 return;
  914         }
  915 
  916         /*
  917          * Otherwise we'll go with a scheme to maximise L2/L3 cache locality
  918          * and minimise lock contention.  Count the total number of CPU
  919          * packages, and then try to distribute the buckets among CPU
  920          * packages evenly.
  921          */
  922         npackage = curcpu()->ci_nsibling[CPUREL_PACKAGE1ST];
  923 
  924         /*
  925          * Figure out how to arrange the packages & buckets, and the total
  926          * number of buckets we need.  XXX 2 may not be the best factor.
  927          */
  928         for (shift = 0; npackage > PGFL_MAX_BUCKETS; shift++) {
  929                 npackage >>= 1;
  930         }
  931         uvm_page_redim(uvmexp.ncolors, npackage);
  932 
  933         /*
  934          * Now tell each CPU which bucket to use.  In the outer loop, scroll
  935          * through all CPU packages.
  936          */
  937         npackage = 0;
  938         ci = curcpu();
  939         ci2 = ci->ci_sibling[CPUREL_PACKAGE1ST];
  940         do {
  941                 /*
  942                  * In the inner loop, scroll through all CPUs in the package
  943                  * and assign the same bucket ID.
  944                  */
  945                 ci3 = ci2;
  946                 do {
  947                         ci3->ci_data.cpu_uvm->pgflbucket = npackage >> shift;
  948                         ci3 = ci3->ci_sibling[CPUREL_PACKAGE];
  949                 } while (ci3 != ci2);
  950                 npackage++;
  951                 ci2 = ci2->ci_sibling[CPUREL_PACKAGE1ST];
  952         } while (ci2 != ci->ci_sibling[CPUREL_PACKAGE1ST]);
  953 
  954         aprint_debug("UVM: using package allocation scheme, "
  955             "%d package(s) per bucket\n", 1 << shift);
  956 }
  957 
  958 /*
  959  * uvm_cpu_attach: initialize per-CPU data structures.
  960  */
  961 
  962 void
  963 uvm_cpu_attach(struct cpu_info *ci)
  964 {
  965         struct uvm_cpu *ucpu;
  966 
  967         /* Already done in uvm_page_init(). */
  968         if (!CPU_IS_PRIMARY(ci)) {
  969                 /* Add more reserve pages for this CPU. */
  970                 uvmexp.reserve_kernel += vm_page_reserve_kernel;
  971 
  972                 /* Allocate per-CPU data structures. */
  973                 ucpu = kmem_zalloc(sizeof(struct uvm_cpu) + coherency_unit - 1,
  974                     KM_SLEEP);
  975                 ucpu = (struct uvm_cpu *)roundup2((uintptr_t)ucpu,
  976                     coherency_unit);
  977                 ci->ci_data.cpu_uvm = ucpu;
  978         } else {
  979                 ucpu = ci->ci_data.cpu_uvm;
  980         }
  981 
  982         uvmpdpol_init_cpu(ucpu);
  983 
  984         /*
  985          * Attach RNG source for this CPU's VM events
  986          */
  987         rnd_attach_source(&ucpu->rs, ci->ci_data.cpu_name, RND_TYPE_VM,
  988             RND_FLAG_COLLECT_TIME|RND_FLAG_COLLECT_VALUE|
  989             RND_FLAG_ESTIMATE_VALUE);
  990 }
  991 
  992 /*
  993  * uvm_availmem: fetch the total amount of free memory in pages.  this can
  994  * have a detrimental effect on performance due to false sharing; don't call
  995  * unless needed.
  996  *
  997  * some users can request the amount of free memory so often that it begins
  998  * to impact upon performance.  if calling frequently and an inexact value
  999  * is okay, call with cached = true.
 1000  */
 1001 
 1002 int
 1003 uvm_availmem(bool cached)
 1004 {
 1005         int64_t fp;
 1006 
 1007         cpu_count_sync(cached);
 1008         if ((fp = cpu_count_get(CPU_COUNT_FREEPAGES)) < 0) {
 1009                 /*
 1010                  * XXXAD could briefly go negative because it's impossible
 1011                  * to get a clean snapshot.  address this for other counters
 1012                  * used as running totals before NetBSD 10 although less
 1013                  * important for those.
 1014                  */
 1015                 fp = 0;
 1016         }
 1017         return (int)fp;
 1018 }
 1019 
 1020 /*
 1021  * uvm_pagealloc_pgb: helper routine that tries to allocate any color from a
 1022  * specific freelist and specific bucket only.
 1023  *
 1024  * => must be at IPL_VM or higher to protect per-CPU data structures.
 1025  */
 1026 
 1027 static struct vm_page *
 1028 uvm_pagealloc_pgb(struct uvm_cpu *ucpu, int f, int b, int *trycolorp, int flags)
 1029 {
 1030         int c, trycolor, colormask;
 1031         struct pgflbucket *pgb;
 1032         struct vm_page *pg;
 1033         kmutex_t *lock;
 1034         bool fill;
 1035 
 1036         /*
 1037          * Skip the bucket if empty, no lock needed.  There could be many
 1038          * empty freelists/buckets.
 1039          */
 1040         pgb = uvm.page_free[f].pgfl_buckets[b];
 1041         if (pgb->pgb_nfree == 0) {
 1042                 return NULL;
 1043         }
 1044 
 1045         /* Skip bucket if low on memory. */
 1046         lock = &uvm_freelist_locks[b].lock;
 1047         mutex_spin_enter(lock);
 1048         if (__predict_false(pgb->pgb_nfree <= uvmexp.reserve_kernel)) {
 1049                 if ((flags & UVM_PGA_USERESERVE) == 0 ||
 1050                     (pgb->pgb_nfree <= uvmexp.reserve_pagedaemon &&
 1051                      curlwp != uvm.pagedaemon_lwp)) {
 1052                         mutex_spin_exit(lock);
 1053                         return NULL;
 1054                 }
 1055                 fill = false;
 1056         } else {
 1057                 fill = true;
 1058         }
 1059 
 1060         /* Try all page colors as needed. */
 1061         c = trycolor = *trycolorp;
 1062         colormask = uvmexp.colormask;
 1063         do {
 1064                 pg = LIST_FIRST(&pgb->pgb_colors[c]);
 1065                 if (__predict_true(pg != NULL)) {
 1066                         /*
 1067                          * Got a free page!  PG_FREE must be cleared under
 1068                          * lock because of uvm_pglistalloc().
 1069                          */
 1070                         LIST_REMOVE(pg, pageq.list);
 1071                         KASSERT(pg->flags == PG_FREE);
 1072                         pg->flags = PG_BUSY | PG_CLEAN | PG_FAKE;
 1073                         pgb->pgb_nfree--;
 1074                         CPU_COUNT(CPU_COUNT_FREEPAGES, -1);
 1075 
 1076                         /*
 1077                          * While we have the bucket locked and our data
 1078                          * structures fresh in L1 cache, we have an ideal
 1079                          * opportunity to grab some pages for the freelist
 1080                          * cache without causing extra contention.  Only do
 1081                          * so if we found pages in this CPU's preferred
 1082                          * bucket.
 1083                          */
 1084                         if (__predict_true(b == ucpu->pgflbucket && fill)) {
 1085                                 uvm_pgflcache_fill(ucpu, f, b, c);
 1086                         }
 1087                         mutex_spin_exit(lock);
 1088                         KASSERT(uvm_page_get_bucket(pg) == b);
 1089                         CPU_COUNT(c == trycolor ?
 1090                             CPU_COUNT_COLORHIT : CPU_COUNT_COLORMISS, 1);
 1091                         CPU_COUNT(CPU_COUNT_CPUMISS, 1);
 1092                         *trycolorp = c;
 1093                         return pg;
 1094                 }
 1095                 c = (c + 1) & colormask;
 1096         } while (c != trycolor);
 1097         mutex_spin_exit(lock);
 1098 
 1099         return NULL;
 1100 }
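
/*
 * Illustrative sketch, not part of the original file: the color scan
 * in uvm_pagealloc_pgb() starts at the requested color and wraps
 * through every color exactly once, e.g. with 4 colors and a starting
 * color of 2 the order tried is 2, 3, 0, 1.  A hypothetical helper
 * giving the i-th color visited:
 */
static inline int __unused
uvm_pagealloc_color_sketch(int trycolor, int i)
{

        /* uvmexp.colormask == uvmexp.ncolors - 1: a power-of-two mask */
        return (trycolor + i) & uvmexp.colormask;
}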
 1101 
 1102 /*
 1103  * uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat that allocates
 1104  * any color from any bucket, in a specific freelist.
 1105  *
 1106  * => must be at IPL_VM or higher to protect per-CPU data structures.
 1107  */
 1108 
 1109 static struct vm_page *
 1110 uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int f, int *trycolorp, int flags)
 1111 {
 1112         int b, trybucket, bucketcount;
 1113         struct vm_page *pg;
 1114 
 1115         /* Try for the exact thing in the per-CPU cache. */
 1116         if ((pg = uvm_pgflcache_alloc(ucpu, f, *trycolorp)) != NULL) {
 1117                 CPU_COUNT(CPU_COUNT_CPUHIT, 1);
 1118                 CPU_COUNT(CPU_COUNT_COLORHIT, 1);
 1119                 return pg;
 1120         }
 1121 
 1122         /* Walk through all buckets, trying our preferred bucket first. */
 1123         trybucket = ucpu->pgflbucket;
 1124         b = trybucket;
 1125         bucketcount = uvm.bucketcount;
 1126         do {
 1127                 pg = uvm_pagealloc_pgb(ucpu, f, b, trycolorp, flags);
 1128                 if (pg != NULL) {
 1129                         return pg;
 1130                 }
 1131                 b = (b + 1 == bucketcount ? 0 : b + 1);
 1132         } while (b != trybucket);
 1133 
 1134         return NULL;
 1135 }
 1136 
 1137 /*
 1138  * uvm_pagealloc_strat: allocate vm_page from a particular free list.
 1139  *
 1140  * => return null if no pages free
 1141  * => wake up pagedaemon if number of free pages drops below low water mark
 1142  * => if obj != NULL, obj must be locked (to put in obj's tree)
 1143  * => if anon != NULL, anon must be locked (to put in anon)
 1144  * => only one of obj or anon can be non-null
 1145  * => caller must activate/deactivate page if it is not wired.
 1146  * => free_list is ignored if strat == UVM_PGA_STRAT_NORMAL.
 1147  * => policy decision: it is more important to pull a page off of the
 1148  *      appropriate priority free list than it is to get a page from the
 1149  *      correct bucket or color bin.  This is because we live with the
 1150  *      consequences of a bad free list decision for the entire
 1151  *      lifetime of the page, e.g. if the page comes from memory that
 1152  *      is slower to access.
 1153  */
 1154 
 1155 struct vm_page *
 1156 uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon,
 1157     int flags, int strat, int free_list)
 1158 {
 1159         int color, lcv, error, s;
 1160         struct uvm_cpu *ucpu;
 1161         struct vm_page *pg;
 1162         lwp_t *l;
 1163 
 1164         KASSERT(obj == NULL || anon == NULL);
 1165         KASSERT(anon == NULL || (flags & UVM_FLAG_COLORMATCH) || off == 0);
 1166         KASSERT(off == trunc_page(off));
 1167         KASSERT(obj == NULL || rw_write_held(obj->vmobjlock));
 1168         KASSERT(anon == NULL || anon->an_lock == NULL ||
 1169             rw_write_held(anon->an_lock));
 1170 
 1171         /*
 1172          * This implements a global round-robin page coloring
 1173          * algorithm.
 1174          */
 1175 
 1176         s = splvm();
 1177         ucpu = curcpu()->ci_data.cpu_uvm;
 1178         if (flags & UVM_FLAG_COLORMATCH) {
 1179                 color = atop(off) & uvmexp.colormask;
 1180         } else {
 1181                 color = ucpu->pgflcolor;
 1182         }
 1183 
 1184         /*
 1185          * fail if any of these conditions is true:
 1186          * [1]  there really are no free pages, or
 1187          * [2]  only kernel "reserved" pages remain and
 1188          *        reserved pages have not been requested, or
 1189          * [3]  only pagedaemon "reserved" pages remain and
 1190          *        the requestor isn't the pagedaemon.
 1191          * we make kernel reserve pages available if called by a
 1192          * kernel thread.
 1193          */
 1194         l = curlwp;
 1195         if (__predict_true(l != NULL) && (l->l_flag & LW_SYSTEM) != 0) {
 1196                 flags |= UVM_PGA_USERESERVE;
 1197         }
 1198 
 1199  again:
 1200         switch (strat) {
 1201         case UVM_PGA_STRAT_NORMAL:
 1202                 /* Check freelists: descending priority (ascending id) order. */
 1203                 for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
 1204                         pg = uvm_pagealloc_pgfl(ucpu, lcv, &color, flags);
 1205                         if (pg != NULL) {
 1206                                 goto gotit;
 1207                         }
 1208                 }
 1209 
 1210                 /* No pages free!  Have pagedaemon free some memory. */
 1211                 splx(s);
 1212                 uvm_kick_pdaemon();
 1213                 return NULL;
 1214 
 1215         case UVM_PGA_STRAT_ONLY:
 1216         case UVM_PGA_STRAT_FALLBACK:
 1217                 /* Attempt to allocate from the specified free list. */
 1218                 KASSERT(free_list >= 0 && free_list < VM_NFREELIST);
 1219                 pg = uvm_pagealloc_pgfl(ucpu, free_list, &color, flags);
 1220                 if (pg != NULL) {
 1221                         goto gotit;
 1222                 }
 1223 
 1224                 /* Fall back, if possible. */
 1225                 if (strat == UVM_PGA_STRAT_FALLBACK) {
 1226                         strat = UVM_PGA_STRAT_NORMAL;
 1227                         goto again;
 1228                 }
 1229 
 1230                 /* No pages free!  Have pagedaemon free some memory. */
 1231                 splx(s);
 1232                 uvm_kick_pdaemon();
 1233                 return NULL;
 1234 
 1235         case UVM_PGA_STRAT_NUMA:
 1236                 /*
 1237                  * NUMA strategy (experimental): allocating from the correct
 1238                  * bucket is more important than observing freelist
 1239                  * priority.  Look only to the current NUMA node; if that
 1240                  * fails, we need to look to other NUMA nodes, so retry with
 1241                  * the normal strategy.
 1242                  */
 1243                 for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
 1244                         pg = uvm_pgflcache_alloc(ucpu, lcv, color);
 1245                         if (pg != NULL) {
 1246                                 CPU_COUNT(CPU_COUNT_CPUHIT, 1);
 1247                                 CPU_COUNT(CPU_COUNT_COLORHIT, 1);
 1248                                 goto gotit;
 1249                         }
 1250                         pg = uvm_pagealloc_pgb(ucpu, lcv,
 1251                             ucpu->pgflbucket, &color, flags);
 1252                         if (pg != NULL) {
 1253                                 goto gotit;
 1254                         }
 1255                 }
 1256                 strat = UVM_PGA_STRAT_NORMAL;
 1257                 goto again;
 1258 
 1259         default:
 1260                 panic("uvm_pagealloc_strat: bad strat %d", strat);
 1261                 /* NOTREACHED */
 1262         }
 1263 
 1264  gotit:
 1265         /*
 1266          * We now know which color we actually allocated from; set
 1267          * the next color accordingly.
 1268          */
 1269 
 1270         ucpu->pgflcolor = (color + 1) & uvmexp.colormask;
 1271 
 1272         /*
 1273          * while still at IPL_VM, update allocation statistics.
 1274          */
 1275 
 1276         if (anon) {
 1277                 CPU_COUNT(CPU_COUNT_ANONCLEAN, 1);
 1278         }
 1279         splx(s);
 1280         KASSERT(pg->flags == (PG_BUSY|PG_CLEAN|PG_FAKE));
 1281 
 1282         /*
 1283          * assign the page to the object.  as the page was free, we know
 1284          * that pg->uobject and pg->uanon are NULL.  we only need to take
 1285          * the page's interlock if we are changing the values.
 1286          */
 1287         if (anon != NULL || obj != NULL) {
 1288                 mutex_enter(&pg->interlock);
 1289         }
 1290         pg->offset = off;
 1291         pg->uobject = obj;
 1292         pg->uanon = anon;
 1293         KASSERT(uvm_page_owner_locked_p(pg, true));
 1294         if (anon) {
 1295                 anon->an_page = pg;
 1296                 pg->flags |= PG_ANON;
 1297                 mutex_exit(&pg->interlock);
 1298         } else if (obj) {
 1299                 /*
 1300                  * set PG_FILE|PG_AOBJ before the first uvm_pageinsert.
 1301                  */
 1302                 if (UVM_OBJ_IS_VNODE(obj)) {
 1303                         pg->flags |= PG_FILE;
 1304                 } else if (UVM_OBJ_IS_AOBJ(obj)) {
 1305                         pg->flags |= PG_AOBJ;
 1306                 }
 1307                 uvm_pageinsert_object(obj, pg);
 1308                 mutex_exit(&pg->interlock);
 1309                 error = uvm_pageinsert_tree(obj, pg);
 1310                 if (error != 0) {
 1311                         mutex_enter(&pg->interlock);
 1312                         uvm_pageremove_object(obj, pg);
 1313                         mutex_exit(&pg->interlock);
 1314                         uvm_pagefree(pg);
 1315                         return NULL;
 1316                 }
 1317         }
 1318 
 1319 #if defined(UVM_PAGE_TRKOWN)
 1320         pg->owner_tag = NULL;
 1321 #endif
 1322         UVM_PAGE_OWN(pg, "new alloc");
 1323 
 1324         if (flags & UVM_PGA_ZERO) {
 1325                 /* A zero'd page is not clean. */
 1326                 if (obj != NULL || anon != NULL) {
 1327                         uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
 1328                 }
 1329                 pmap_zero_page(VM_PAGE_TO_PHYS(pg));
 1330         }
 1331 
 1332         return(pg);
 1333 }
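
/*
 * Illustrative sketch, not part of the original file: a typical caller
 * allocates through the uvm_pagealloc() wrapper (normal strategy, free
 * list ignored) with the object locked, and sleeps in uvm_wait() when
 * no page is available.  The helper name is hypothetical and omits the
 * re-check that real callers do after relocking, since the offset may
 * have been filled by someone else while the lock was dropped.
 */
static struct vm_page * __unused
uvm_pagealloc_wait_sketch(struct uvm_object *uobj, voff_t off)
{
        struct vm_page *pg;

        KASSERT(rw_write_held(uobj->vmobjlock));
        while ((pg = uvm_pagealloc(uobj, off, NULL, UVM_PGA_ZERO)) == NULL) {
                rw_exit(uobj->vmobjlock);
                uvm_wait("sketchpg");           /* let the pagedaemon work */
                rw_enter(uobj->vmobjlock, RW_WRITER);
        }
        /* the page comes back PG_BUSY and zeroed; the caller unbusies it */
        return pg;
}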
 1334 
 1335 /*
 1336  * uvm_pagereplace: replace a page with another
 1337  *
 1338  * => object must be locked
 1339  * => page interlocks must be held
 1340  */
 1341 
 1342 void
 1343 uvm_pagereplace(struct vm_page *oldpg, struct vm_page *newpg)
 1344 {
 1345         struct uvm_object *uobj = oldpg->uobject;
 1346         struct vm_page *pg __diagused;
 1347         uint64_t idx;
 1348 
 1349         KASSERT((oldpg->flags & PG_TABLED) != 0);
 1350         KASSERT(uobj != NULL);
 1351         KASSERT((newpg->flags & PG_TABLED) == 0);
 1352         KASSERT(newpg->uobject == NULL);
 1353         KASSERT(rw_write_held(uobj->vmobjlock));
 1354         KASSERT(mutex_owned(&oldpg->interlock));
 1355         KASSERT(mutex_owned(&newpg->interlock));
 1356 
 1357         newpg->uobject = uobj;
 1358         newpg->offset = oldpg->offset;
 1359         idx = newpg->offset >> PAGE_SHIFT;
 1360         pg = radix_tree_replace_node(&uobj->uo_pages, idx, newpg);
 1361         KASSERT(pg == oldpg);
 1362         if (((oldpg->flags ^ newpg->flags) & PG_CLEAN) != 0) {
 1363                 if ((newpg->flags & PG_CLEAN) != 0) {
 1364                         uvm_obj_page_clear_dirty(newpg);
 1365                 } else {
 1366                         uvm_obj_page_set_dirty(newpg);
 1367                 }
 1368         }
 1369         /*
 1370          * oldpg's PG_STAT is stable.  newpg is not reachable by others yet.
 1371          */
 1372         newpg->flags |=
 1373             (newpg->flags & ~PG_STAT) | (oldpg->flags & PG_STAT);
 1374         uvm_pageinsert_object(uobj, newpg);
 1375         uvm_pageremove_object(uobj, oldpg);
 1376 }
 1377 
 1378 /*
 1379  * uvm_pagerealloc: reallocate a page from one object to another
 1380  *
 1381  * => both objects must be locked
 1382  */
 1383 
 1384 int
 1385 uvm_pagerealloc(struct vm_page *pg, struct uvm_object *newobj, voff_t newoff)
 1386 {
 1387         int error = 0;
 1388 
 1389         /*
 1390          * remove it from the old object
 1391          */
 1392 
 1393         if (pg->uobject) {
 1394                 uvm_pageremove_tree(pg->uobject, pg);
 1395                 uvm_pageremove_object(pg->uobject, pg);
 1396         }
 1397 
 1398         /*
 1399          * put it in the new object
 1400          */
 1401 
 1402         if (newobj) {
 1403                 mutex_enter(&pg->interlock);
 1404                 pg->uobject = newobj;
 1405                 pg->offset = newoff;
 1406                 if (UVM_OBJ_IS_VNODE(newobj)) {
 1407                         pg->flags |= PG_FILE;
 1408                 } else if (UVM_OBJ_IS_AOBJ(newobj)) {
 1409                         pg->flags |= PG_AOBJ;
 1410                 }
 1411                 uvm_pageinsert_object(newobj, pg);
 1412                 mutex_exit(&pg->interlock);
 1413                 error = uvm_pageinsert_tree(newobj, pg);
 1414                 if (error != 0) {
 1415                         mutex_enter(&pg->interlock);
 1416                         uvm_pageremove_object(newobj, pg);
 1417                         mutex_exit(&pg->interlock);
 1418                 }
 1419         }
 1420 
 1421         return error;
 1422 }
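
Editor's sketch: moving a page between two objects with uvm_pagerealloc(), assuming "oldobj" and "newobj" are both write-locked by the caller; the error path mirrors the radix-tree insertion failure handled in the function above.

        KASSERT(rw_write_held(oldobj->vmobjlock));
        KASSERT(rw_write_held(newobj->vmobjlock));

        error = uvm_pagerealloc(pg, newobj, newoff);
        if (error != 0) {
                /*
                 * The radix-tree insert into newobj failed (no memory).
                 * pg has been taken out of oldobj and is not in newobj's
                 * page tree, so the caller must recover or free it.
                 */
        }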
 1423 
 1424 /*
 1425  * uvm_pagefree: free page
 1426  *
 1427  * => erase page's identity (i.e. remove from object)
 1428  * => put page on free list
 1429  * => caller must lock owning object (either anon or uvm_object)
 1430  * => assumes all valid mappings of pg are gone
 1431  */
 1432 
 1433 void
 1434 uvm_pagefree(struct vm_page *pg)
 1435 {
 1436         struct pgfreelist *pgfl;
 1437         struct pgflbucket *pgb;
 1438         struct uvm_cpu *ucpu;
 1439         kmutex_t *lock;
 1440         int bucket, s;
 1441         bool locked;
 1442 
 1443 #ifdef DEBUG
 1444         if (pg->uobject == (void *)0xdeadbeef &&
 1445             pg->uanon == (void *)0xdeadbeef) {
 1446                 panic("uvm_pagefree: freeing free page %p", pg);
 1447         }
 1448 #endif /* DEBUG */
 1449 
 1450         KASSERT((pg->flags & PG_PAGEOUT) == 0);
 1451         KASSERT(!(pg->flags & PG_FREE));
 1452         KASSERT(pg->uobject == NULL || rw_write_held(pg->uobject->vmobjlock));
 1453         KASSERT(pg->uobject != NULL || pg->uanon == NULL ||
 1454                 rw_write_held(pg->uanon->an_lock));
 1455 
 1456         /*
 1457          * remove the page from the object's tree before acquiring any page
 1458          * interlocks: this can acquire locks to free radixtree nodes.
 1459          */
 1460         if (pg->uobject != NULL) {
 1461                 uvm_pageremove_tree(pg->uobject, pg);
 1462         }
 1463 
 1464         /*
 1465          * if the page is loaned, resolve the loan instead of freeing.
 1466          */
 1467 
 1468         if (pg->loan_count) {
 1469                 KASSERT(pg->wire_count == 0);
 1470 
 1471                 /*
 1472                  * if the page is owned by an anon then we just want to
 1473                  * drop anon ownership.  the kernel will free the page when
 1474                  * it is done with it.  if the page is owned by an object,
 1475                  * remove it from the object and mark it dirty for the benefit
 1476                  * of possible anon owners.
 1477                  *
 1478                  * regardless of previous ownership, wakeup any waiters,
 1479                  * unbusy the page, and we're done.
 1480                  */
 1481 
 1482                 uvm_pagelock(pg);
 1483                 locked = true;
 1484                 if (pg->uobject != NULL) {
 1485                         uvm_pageremove_object(pg->uobject, pg);
 1486                         pg->flags &= ~(PG_FILE|PG_AOBJ);
 1487                 } else if (pg->uanon != NULL) {
 1488                         if ((pg->flags & PG_ANON) == 0) {
 1489                                 pg->loan_count--;
 1490                         } else {
 1491                                 const unsigned status = uvm_pagegetdirty(pg);
 1492                                 pg->flags &= ~PG_ANON;
 1493                                 cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
 1494                         }
 1495                         pg->uanon->an_page = NULL;
 1496                         pg->uanon = NULL;
 1497                 }
 1498                 if (pg->pqflags & PQ_WANTED) {
 1499                         wakeup(pg);
 1500                 }
 1501                 pg->pqflags &= ~PQ_WANTED;
 1502                 pg->flags &= ~(PG_BUSY|PG_RELEASED|PG_PAGER1);
 1503 #ifdef UVM_PAGE_TRKOWN
 1504                 pg->owner_tag = NULL;
 1505 #endif
 1506                 KASSERT((pg->flags & PG_STAT) == 0);
 1507                 if (pg->loan_count) {
 1508                         KASSERT(pg->uobject == NULL);
 1509                         if (pg->uanon == NULL) {
 1510                                 uvm_pagedequeue(pg);
 1511                         }
 1512                         uvm_pageunlock(pg);
 1513                         return;
 1514                 }
 1515         } else if (pg->uobject != NULL || pg->uanon != NULL ||
 1516                    pg->wire_count != 0) {
 1517                 uvm_pagelock(pg);
 1518                 locked = true;
 1519         } else {
 1520                 locked = false;
 1521         }
 1522 
 1523         /*
 1524          * remove page from its object or anon.
 1525          */
 1526         if (pg->uobject != NULL) {
 1527                 uvm_pageremove_object(pg->uobject, pg);
 1528         } else if (pg->uanon != NULL) {
 1529                 const unsigned int status = uvm_pagegetdirty(pg);
 1530                 pg->uanon->an_page = NULL;
 1531                 pg->uanon = NULL;
 1532                 cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
 1533         }
 1534 
 1535         /*
 1536          * if the page was wired, unwire it now.
 1537          */
 1538 
 1539         if (pg->wire_count) {
 1540                 pg->wire_count = 0;
 1541                 atomic_dec_uint(&uvmexp.wired);
 1542         }
 1543         if (locked) {
 1544                 /*
 1545                  * wake anyone waiting on the page.
 1546                  */
 1547                 if ((pg->pqflags & PQ_WANTED) != 0) {
 1548                         pg->pqflags &= ~PQ_WANTED;
 1549                         wakeup(pg);
 1550                 }
 1551 
 1552                 /*
 1553                  * now remove the page from the queues.
 1554                  */
 1555                 uvm_pagedequeue(pg);
 1556                 uvm_pageunlock(pg);
 1557         } else {
 1558                 KASSERT(!uvmpdpol_pageisqueued_p(pg));
 1559         }
 1560 
 1561         /*
 1562          * and put on free queue
 1563          */
 1564 
 1565 #ifdef DEBUG
 1566         pg->uobject = (void *)0xdeadbeef;
 1567         pg->uanon = (void *)0xdeadbeef;
 1568 #endif /* DEBUG */
 1569 
 1570         /* Try to send the page to the per-CPU cache. */
 1571         s = splvm();
 1572         ucpu = curcpu()->ci_data.cpu_uvm;
 1573         bucket = uvm_page_get_bucket(pg);
 1574         if (bucket == ucpu->pgflbucket && uvm_pgflcache_free(ucpu, pg)) {
 1575                 splx(s);
 1576                 return;
 1577         }
 1578 
 1579         /* Didn't work.  Never mind, send it to a global bucket. */
 1580         pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
 1581         pgb = pgfl->pgfl_buckets[bucket];
 1582         lock = &uvm_freelist_locks[bucket].lock;
 1583 
 1584         mutex_spin_enter(lock);
 1585         /* PG_FREE must be set under lock because of uvm_pglistalloc(). */
 1586         pg->flags = PG_FREE;
 1587         LIST_INSERT_HEAD(&pgb->pgb_colors[VM_PGCOLOR(pg)], pg, pageq.list);
 1588         pgb->pgb_nfree++;
 1589         CPU_COUNT(CPU_COUNT_FREEPAGES, 1);
 1590         mutex_spin_exit(lock);
 1591         splx(s);
 1592 }
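
Editor's sketch: one way a caller might discard a resident, unbusy page from an object, assuming "uobj" and "off" are supplied by the caller. uvm_pagefree() assumes all mappings of the page are gone, hence the pmap_page_protect() call first.

        rw_enter(uobj->vmobjlock, RW_WRITER);
        pg = uvm_pagelookup(uobj, trunc_page(off));
        if (pg != NULL && (pg->flags & PG_BUSY) == 0) {
                pmap_page_protect(pg, VM_PROT_NONE);    /* remove all mappings */
                uvm_pagefree(pg);                       /* erase identity, onto the free list */
        }
        rw_exit(uobj->vmobjlock);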
 1593 
 1594 /*
 1595  * uvm_page_unbusy: unbusy an array of pages.
 1596  *
 1597  * => pages must either all belong to the same object, or all belong to anons.
 1598  * => if pages are object-owned, object must be locked.
 1599  * => if pages are anon-owned, anons must be locked.
 1600  * => caller must make sure that anon-owned pages are not PG_RELEASED.
 1601  */
 1602 
 1603 void
 1604 uvm_page_unbusy(struct vm_page **pgs, int npgs)
 1605 {
 1606         struct vm_page *pg;
 1607         int i, pageout_done;
 1608         UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
 1609 
 1610         pageout_done = 0;
 1611         for (i = 0; i < npgs; i++) {
 1612                 pg = pgs[i];
 1613                 if (pg == NULL || pg == PGO_DONTCARE) {
 1614                         continue;
 1615                 }
 1616 
 1617                 KASSERT(uvm_page_owner_locked_p(pg, true));
 1618                 KASSERT(pg->flags & PG_BUSY);
 1619 
 1620                 if (pg->flags & PG_PAGEOUT) {
 1621                         pg->flags &= ~PG_PAGEOUT;
 1622                         pg->flags |= PG_RELEASED;
 1623                         pageout_done++;
 1624                         atomic_inc_uint(&uvmexp.pdfreed);
 1625                 }
 1626                 if (pg->flags & PG_RELEASED) {
 1627                         UVMHIST_LOG(ubchist, "releasing pg %#jx",
 1628                             (uintptr_t)pg, 0, 0, 0);
 1629                         KASSERT(pg->uobject != NULL ||
 1630                             (pg->uanon != NULL && pg->uanon->an_ref > 0));
 1631                         pg->flags &= ~PG_RELEASED;
 1632                         uvm_pagefree(pg);
 1633                 } else {
 1634                         UVMHIST_LOG(ubchist, "unbusying pg %#jx",
 1635                             (uintptr_t)pg, 0, 0, 0);
 1636                         KASSERT((pg->flags & PG_FAKE) == 0);
 1637                         pg->flags &= ~PG_BUSY;
 1638                         uvm_pagelock(pg);
 1639                         uvm_pagewakeup(pg);
 1640                         uvm_pageunlock(pg);
 1641                         UVM_PAGE_OWN(pg, NULL);
 1642                 }
 1643         }
 1644         if (pageout_done != 0) {
 1645                 uvm_pageout_done(pageout_done);
 1646         }
 1647 }
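
Editor's sketch: an I/O completion path unbusying a batch of object-owned pages in one call, assuming "pgs"/"npgs" were busied earlier by the same caller and "uobj" owns them all.

        rw_enter(uobj->vmobjlock, RW_WRITER);
        uvm_page_unbusy(pgs, npgs);     /* also frees any PG_RELEASED pages */
        rw_exit(uobj->vmobjlock);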
 1648 
 1649 /*
 1650  * uvm_pagewait: wait for a busy page
 1651  *
 1652  * => page must be known PG_BUSY
 1653  * => object must be read or write locked
 1654  * => object will be unlocked on return
 1655  */
 1656 
 1657 void
 1658 uvm_pagewait(struct vm_page *pg, krwlock_t *lock, const char *wmesg)
 1659 {
 1660 
 1661         KASSERT(rw_lock_held(lock));
 1662         KASSERT((pg->flags & PG_BUSY) != 0);
 1663         KASSERT(uvm_page_owner_locked_p(pg, false));
 1664 
 1665         mutex_enter(&pg->interlock);
 1666         pg->pqflags |= PQ_WANTED;
 1667         rw_exit(lock);
 1668         UVM_UNLOCK_AND_WAIT(pg, &pg->interlock, false, wmesg, 0);
 1669 }
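
Editor's sketch of the classic lookup/wait/retry loop built on uvm_pagewait(): because the function drops the object lock before sleeping, the lock must be retaken and the lookup redone afterwards. "uobj", "off" and the wait message are illustrative.

        struct vm_page *pg;

        for (;;) {
                rw_enter(uobj->vmobjlock, RW_WRITER);
                pg = uvm_pagelookup(uobj, trunc_page(off));
                if (pg == NULL || (pg->flags & PG_BUSY) == 0)
                        break;
                /* releases vmobjlock and sleeps until the page is woken */
                uvm_pagewait(pg, uobj->vmobjlock, "pgwait");
        }
        /* ... use pg (or handle pg == NULL) with uobj still locked ... */
        rw_exit(uobj->vmobjlock);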
 1670 
 1671 /*
 1672  * uvm_pagewakeup: wake anyone waiting on a page
 1673  *
 1674  * => page interlock must be held
 1675  */
 1676 
 1677 void
 1678 uvm_pagewakeup(struct vm_page *pg)
 1679 {
 1680         UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
 1681 
 1682         KASSERT(mutex_owned(&pg->interlock));
 1683 
 1684         UVMHIST_LOG(ubchist, "waking pg %#jx", (uintptr_t)pg, 0, 0, 0);
 1685 
 1686         if ((pg->pqflags & PQ_WANTED) != 0) {
 1687                 wakeup(pg);
 1688                 pg->pqflags &= ~PQ_WANTED;
 1689         }
 1690 }
 1691 
 1692 /*
 1693  * uvm_pagewanted_p: return true if someone is waiting on the page
 1694  *
 1695  * => object must be write locked (lock out all concurrent access)
 1696  */
 1697 
 1698 bool
 1699 uvm_pagewanted_p(struct vm_page *pg)
 1700 {
 1701 
 1702         KASSERT(uvm_page_owner_locked_p(pg, true));
 1703 
 1704         return (atomic_load_relaxed(&pg->pqflags) & PQ_WANTED) != 0;
 1705 }
 1706 
 1707 #if defined(UVM_PAGE_TRKOWN)
 1708 /*
 1709  * uvm_page_own: set or release page ownership
 1710  *
 1711  * => this is a debugging function that keeps track of who sets PG_BUSY
 1712  *      and where they do it.   it can be used to track down problems
 1713  *      such as a process setting "PG_BUSY" and never releasing it.
 1714  * => page's object [if any] must be locked
 1715  * => if "tag" is NULL then we are releasing page ownership
 1716  */
 1717 void
 1718 uvm_page_own(struct vm_page *pg, const char *tag)
 1719 {
 1720 
 1721         KASSERT((pg->flags & (PG_PAGEOUT|PG_RELEASED)) == 0);
 1722         KASSERT(uvm_page_owner_locked_p(pg, true));
 1723 
 1724         /* gain ownership? */
 1725         if (tag) {
 1726                 KASSERT((pg->flags & PG_BUSY) != 0);
 1727                 if (pg->owner_tag) {
 1728                         printf("uvm_page_own: page %p already owned "
 1729                             "by proc %d.%d [%s]\n", pg,
 1730                             pg->owner, pg->lowner, pg->owner_tag);
 1731                         panic("uvm_page_own");
 1732                 }
 1733                 pg->owner = curproc->p_pid;
 1734                 pg->lowner = curlwp->l_lid;
 1735                 pg->owner_tag = tag;
 1736                 return;
 1737         }
 1738 
 1739         /* drop ownership */
 1740         KASSERT((pg->flags & PG_BUSY) == 0);
 1741         if (pg->owner_tag == NULL) {
 1742                 printf("uvm_page_own: dropping ownership of a non-owned "
 1743                     "page (%p)\n", pg);
 1744                 panic("uvm_page_own");
 1745         }
 1746         pg->owner_tag = NULL;
 1747 }
 1748 #endif
 1749 
 1750 /*
 1751  * uvm_pagelookup: look up a page
 1752  *
 1753  * => caller should lock object to keep someone from pulling the page
 1754  *      out from under it
 1755  */
 1756 
 1757 struct vm_page *
 1758 uvm_pagelookup(struct uvm_object *obj, voff_t off)
 1759 {
 1760         struct vm_page *pg;
 1761 
 1762         KASSERT(db_active || rw_lock_held(obj->vmobjlock));
 1763 
 1764         pg = radix_tree_lookup_node(&obj->uo_pages, off >> PAGE_SHIFT);
 1765 
 1766         KASSERT(pg == NULL || obj->uo_npages != 0);
 1767         KASSERT(pg == NULL || (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
 1768                 (pg->flags & PG_BUSY) != 0);
 1769         return pg;
 1770 }
 1771 
 1772 /*
 1773  * uvm_pagewire: wire the page, thus removing it from the daemon's grasp
 1774  *
 1775  * => caller must lock objects
 1776  * => caller must hold pg->interlock
 1777  */
 1778 
 1779 void
 1780 uvm_pagewire(struct vm_page *pg)
 1781 {
 1782 
 1783         KASSERT(uvm_page_owner_locked_p(pg, true));
 1784         KASSERT(mutex_owned(&pg->interlock));
 1785 #if defined(READAHEAD_STATS)
 1786         if ((pg->flags & PG_READAHEAD) != 0) {
 1787                 uvm_ra_hit.ev_count++;
 1788                 pg->flags &= ~PG_READAHEAD;
 1789         }
 1790 #endif /* defined(READAHEAD_STATS) */
 1791         if (pg->wire_count == 0) {
 1792                 uvm_pagedequeue(pg);
 1793                 atomic_inc_uint(&uvmexp.wired);
 1794         }
 1795         pg->wire_count++;
 1796         KASSERT(pg->wire_count > 0);    /* detect wraparound */
 1797 }
 1798 
 1799 /*
 1800  * uvm_pageunwire: unwire the page.
 1801  *
 1802  * => activate if wire count goes to zero.
 1803  * => caller must lock objects
 1804  * => caller must hold pg->interlock
 1805  */
 1806 
 1807 void
 1808 uvm_pageunwire(struct vm_page *pg)
 1809 {
 1810 
 1811         KASSERT(uvm_page_owner_locked_p(pg, true));
 1812         KASSERT(pg->wire_count != 0);
 1813         KASSERT(!uvmpdpol_pageisqueued_p(pg));
 1814         KASSERT(mutex_owned(&pg->interlock));
 1815         pg->wire_count--;
 1816         if (pg->wire_count == 0) {
 1817                 uvm_pageactivate(pg);
 1818                 KASSERT(uvmexp.wired != 0);
 1819                 atomic_dec_uint(&uvmexp.wired);
 1820         }
 1821 }
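
Editor's sketch: pinning a page across an operation and then releasing it. Both uvm_pagewire() and uvm_pageunwire() expect the owner write-locked and the page interlock held; "uobj" and "pg" are illustrative.

        rw_enter(uobj->vmobjlock, RW_WRITER);
        uvm_pagelock(pg);
        uvm_pagewire(pg);               /* off the paging queues while wired */
        uvm_pageunlock(pg);
        rw_exit(uobj->vmobjlock);

        /* ... pg stays resident here ... */

        rw_enter(uobj->vmobjlock, RW_WRITER);
        uvm_pagelock(pg);
        uvm_pageunwire(pg);             /* reactivated once the count reaches zero */
        uvm_pageunlock(pg);
        rw_exit(uobj->vmobjlock);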
 1822 
 1823 /*
 1824  * uvm_pagedeactivate: deactivate page
 1825  *
 1826  * => caller must lock objects
 1827  * => caller must check to make sure page is not wired
 1828  * => object that page belongs to must be locked (so we can adjust pg->flags)
 1829  * => caller must clear the reference on the page before calling
 1830  * => caller must hold pg->interlock
 1831  */
 1832 
 1833 void
 1834 uvm_pagedeactivate(struct vm_page *pg)
 1835 {
 1836 
 1837         KASSERT(uvm_page_owner_locked_p(pg, false));
 1838         KASSERT(mutex_owned(&pg->interlock));
 1839         if (pg->wire_count == 0) {
 1840                 KASSERT(uvmpdpol_pageisqueued_p(pg));
 1841                 uvmpdpol_pagedeactivate(pg);
 1842         }
 1843 }
 1844 
 1845 /*
 1846  * uvm_pageactivate: activate page
 1847  *
 1848  * => caller must lock objects
 1849  * => caller must hold pg->interlock
 1850  */
 1851 
 1852 void
 1853 uvm_pageactivate(struct vm_page *pg)
 1854 {
 1855 
 1856         KASSERT(uvm_page_owner_locked_p(pg, false));
 1857         KASSERT(mutex_owned(&pg->interlock));
 1858 #if defined(READAHEAD_STATS)
 1859         if ((pg->flags & PG_READAHEAD) != 0) {
 1860                 uvm_ra_hit.ev_count++;
 1861                 pg->flags &= ~PG_READAHEAD;
 1862         }
 1863 #endif /* defined(READAHEAD_STATS) */
 1864         if (pg->wire_count == 0) {
 1865                 uvmpdpol_pageactivate(pg);
 1866         }
 1867 }
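
Editor's sketch: recording a reference to a page (for example after servicing a fault), with the owner at least read-locked. The activation intent may be queued on the page and made real later by uvm_pageunlock().

        uvm_pagelock(pg);
        uvm_pageactivate(pg);
        uvm_pageunlock(pg);     /* may hand a queued intent to uvmpdpol */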
 1868 
 1869 /*
 1870  * uvm_pagedequeue: remove a page from any paging queue
 1871  *
 1872  * => caller must lock objects
 1873  * => caller must hold pg->interlock
 1874  */
 1875 void
 1876 uvm_pagedequeue(struct vm_page *pg)
 1877 {
 1878 
 1879         KASSERT(uvm_page_owner_locked_p(pg, true));
 1880         KASSERT(mutex_owned(&pg->interlock));
 1881         if (uvmpdpol_pageisqueued_p(pg)) {
 1882                 uvmpdpol_pagedequeue(pg);
 1883         }
 1884 }
 1885 
 1886 /*
 1887  * uvm_pageenqueue: add a page to a paging queue without activating.
 1888  * used where a page is not really demanded (yet), e.g. read-ahead.
 1889  *
 1890  * => caller must lock objects
 1891  * => caller must hold pg->interlock
 1892  */
 1893 void
 1894 uvm_pageenqueue(struct vm_page *pg)
 1895 {
 1896 
 1897         KASSERT(uvm_page_owner_locked_p(pg, false));
 1898         KASSERT(mutex_owned(&pg->interlock));
 1899         if (pg->wire_count == 0 && !uvmpdpol_pageisqueued_p(pg)) {
 1900                 uvmpdpol_pageenqueue(pg);
 1901         }
 1902 }
 1903 
 1904 /*
 1905  * uvm_pagelock: acquire page interlock
 1906  */
 1907 void
 1908 uvm_pagelock(struct vm_page *pg)
 1909 {
 1910 
 1911         mutex_enter(&pg->interlock);
 1912 }
 1913 
 1914 /*
 1915  * uvm_pagelock2: acquire two page interlocks
 1916  */
 1917 void
 1918 uvm_pagelock2(struct vm_page *pg1, struct vm_page *pg2)
 1919 {
 1920 
 1921         if (pg1 < pg2) {
 1922                 mutex_enter(&pg1->interlock);
 1923                 mutex_enter(&pg2->interlock);
 1924         } else {
 1925                 mutex_enter(&pg2->interlock);
 1926                 mutex_enter(&pg1->interlock);
 1927         }
 1928 }
 1929 
 1930 /*
 1931  * uvm_pageunlock: release page interlock, and if a page replacement intent
 1932  * is set on the page, pass it to uvmpdpol to make real.
 1933  *
 1934  * => caller must hold pg->interlock
 1935  */
 1936 void
 1937 uvm_pageunlock(struct vm_page *pg)
 1938 {
 1939 
 1940         if ((pg->pqflags & PQ_INTENT_SET) == 0 ||
 1941             (pg->pqflags & PQ_INTENT_QUEUED) != 0) {
 1942                 mutex_exit(&pg->interlock);
 1943                 return;
 1944         }
 1945         pg->pqflags |= PQ_INTENT_QUEUED;
 1946         mutex_exit(&pg->interlock);
 1947         uvmpdpol_pagerealize(pg);
 1948 }
 1949 
 1950 /*
 1951  * uvm_pageunlock2: release two page interlocks, and for both pages if a
 1952  * page replacement intent is set on the page, pass it to uvmpdpol to make
 1953  * real.
 1954  *
 1955  * => caller must hold pg->interlock
 1956  */
 1957 void
 1958 uvm_pageunlock2(struct vm_page *pg1, struct vm_page *pg2)
 1959 {
 1960 
 1961         if ((pg1->pqflags & PQ_INTENT_SET) == 0 ||
 1962             (pg1->pqflags & PQ_INTENT_QUEUED) != 0) {
 1963                 mutex_exit(&pg1->interlock);
 1964                 pg1 = NULL;
 1965         } else {
 1966                 pg1->pqflags |= PQ_INTENT_QUEUED;
 1967                 mutex_exit(&pg1->interlock);
 1968         }
 1969 
 1970         if ((pg2->pqflags & PQ_INTENT_SET) == 0 ||
 1971             (pg2->pqflags & PQ_INTENT_QUEUED) != 0) {
 1972                 mutex_exit(&pg2->interlock);
 1973                 pg2 = NULL;
 1974         } else {
 1975                 pg2->pqflags |= PQ_INTENT_QUEUED;
 1976                 mutex_exit(&pg2->interlock);
 1977         }
 1978 
 1979         if (pg1 != NULL) {
 1980                 uvmpdpol_pagerealize(pg1);
 1981         }
 1982         if (pg2 != NULL) {
 1983                 uvmpdpol_pagerealize(pg2);
 1984         }
 1985 }
 1986 
 1987 /*
 1988  * uvm_pagezero: zero fill a page
 1989  *
 1990  * => if page is part of an object then the object should be locked
 1991  *      to protect pg->flags.
 1992  */
 1993 
 1994 void
 1995 uvm_pagezero(struct vm_page *pg)
 1996 {
 1997 
 1998         uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
 1999         pmap_zero_page(VM_PAGE_TO_PHYS(pg));
 2000 }
 2001 
 2002 /*
 2003  * uvm_pagecopy: copy a page
 2004  *
 2005  * => if page is part of an object then the object should be locked
 2006  *      to protect pg->flags.
 2007  */
 2008 
 2009 void
 2010 uvm_pagecopy(struct vm_page *src, struct vm_page *dst)
 2011 {
 2012 
 2013         uvm_pagemarkdirty(dst, UVM_PAGE_STATUS_DIRTY);
 2014         pmap_copy_page(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst));
 2015 }
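
Editor's sketch of a copy-on-write style promotion, loosely modelled on the fault path: a fresh anon-owned page is allocated and the shared page copied into it. "anon" (write-locked, with no resident page yet) and "opg" are assumptions made for illustration.

        struct vm_page *npg;

        npg = uvm_pagealloc(NULL, 0, anon, 0);
        if (npg == NULL) {
                /* out of memory: a real caller would unlock, wait and retry */
        } else {
                uvm_pagecopy(opg, npg); /* marks npg dirty, then copies the data */
        }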
 2016 
 2017 /*
 2018  * uvm_pageismanaged: test to see whether a page (specified by its PA) is managed.
 2019  */
 2020 
 2021 bool
 2022 uvm_pageismanaged(paddr_t pa)
 2023 {
 2024 
 2025         return (uvm_physseg_find(atop(pa), NULL) != UVM_PHYSSEG_TYPE_INVALID);
 2026 }
 2027 
 2028 /*
 2029  * uvm_page_lookup_freelist: look up the free list for the specified page
 2030  */
 2031 
 2032 int
 2033 uvm_page_lookup_freelist(struct vm_page *pg)
 2034 {
 2035         uvm_physseg_t upm;
 2036 
 2037         upm = uvm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), NULL);
 2038         KASSERT(upm != UVM_PHYSSEG_TYPE_INVALID);
 2039         return uvm_physseg_get_free_list(upm);
 2040 }
 2041 
 2042 /*
 2043  * uvm_page_owner_locked_p: return true if object associated with page is
 2044  * locked.  this is a weak check for runtime assertions only.
 2045  */
 2046 
 2047 bool
 2048 uvm_page_owner_locked_p(struct vm_page *pg, bool exclusive)
 2049 {
 2050 
 2051         if (pg->uobject != NULL) {
 2052                 return exclusive
 2053                     ? rw_write_held(pg->uobject->vmobjlock)
 2054                     : rw_lock_held(pg->uobject->vmobjlock);
 2055         }
 2056         if (pg->uanon != NULL) {
 2057                 return exclusive
 2058                     ? rw_write_held(pg->uanon->an_lock)
 2059                     : rw_lock_held(pg->uanon->an_lock);
 2060         }
 2061         return true;
 2062 }
 2063 
 2064 /*
 2065  * uvm_pagereadonly_p: return true if the page should be mapped read-only
 2066  */
 2067 
 2068 bool
 2069 uvm_pagereadonly_p(struct vm_page *pg)
 2070 {
 2071         struct uvm_object * const uobj = pg->uobject;
 2072 
 2073         KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock));
 2074         KASSERT(uobj != NULL || rw_lock_held(pg->uanon->an_lock));
 2075         if ((pg->flags & PG_RDONLY) != 0) {
 2076                 return true;
 2077         }
 2078         if (uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) {
 2079                 return true;
 2080         }
 2081         if (uobj == NULL) {
 2082                 return false;
 2083         }
 2084         return UVM_OBJ_NEEDS_WRITEFAULT(uobj);
 2085 }
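
Editor's sketch: using uvm_pagereadonly_p() when entering a mapping, so that a clean or read-only page is mapped without write permission and the first write faults. "pmap" and "va" are illustrative; PMAP_CANFAIL lets pmap_enter() fail gracefully instead of sleeping.

        vm_prot_t prot = VM_PROT_READ | VM_PROT_WRITE;
        int error;

        if (uvm_pagereadonly_p(pg))
                prot &= ~VM_PROT_WRITE;
        error = pmap_enter(pmap, va, VM_PAGE_TO_PHYS(pg), prot, PMAP_CANFAIL);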
 2086 
 2087 #ifdef PMAP_DIRECT
 2088 /*
 2089  * Call pmap to translate each physical address into a virtual one and run a
 2090  * callback on it.  Used to avoid actually mapping the pages; the pmap most
 2091  * likely uses a direct map or an equivalent mechanism.
 2092  */
 2093 int
 2094 uvm_direct_process(struct vm_page **pgs, u_int npages, voff_t off, vsize_t len,
 2095             int (*process)(void *, size_t, void *), void *arg)
 2096 {
 2097         int error = 0;
 2098         paddr_t pa;
 2099         size_t todo;
 2100         voff_t pgoff = (off & PAGE_MASK);
 2101         struct vm_page *pg;
 2102 
 2103         KASSERT(npages > 0 && len > 0);
 2104 
 2105         for (int i = 0; i < npages; i++) {
 2106                 pg = pgs[i];
 2107 
 2108                 KASSERT(len > 0);
 2109 
 2110                 /*
 2111                  * Caller is responsible for ensuring all the pages are
 2112                  * available.
 2113                  */
 2114                 KASSERT(pg != NULL && pg != PGO_DONTCARE);
 2115 
 2116                 pa = VM_PAGE_TO_PHYS(pg);
 2117                 todo = MIN(len, PAGE_SIZE - pgoff);
 2118 
 2119                 error = pmap_direct_process(pa, pgoff, todo, process, arg);
 2120                 if (error)
 2121                         break;
 2122 
 2123                 pgoff = 0;
 2124                 len -= todo;
 2125         }
 2126 
 2127         KASSERTMSG(error != 0 || len == 0, "len %lu != 0 for non-error", len);
 2128         return error;
 2129 }
 2130 #endif /* PMAP_DIRECT */
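
Editor's sketch: a hypothetical callback for uvm_direct_process() that copies each directly mapped chunk into a flat kernel buffer. "copyout_direct_cb", "buf" and the cursor variable are invented for illustration and assume PMAP_DIRECT is configured.

#ifdef PMAP_DIRECT
/* Copy each direct-mapped chunk into the buffer that "arg" points into. */
static int
copyout_direct_cb(void *kva, size_t len, void *arg)
{
        char **cursor = arg;

        memcpy(*cursor, kva, len);
        *cursor += len;
        return 0;
}

        /* Usage: copy "len" bytes starting at byte offset "off" within pgs[]. */
        char *cursor = buf;
        error = uvm_direct_process(pgs, npages, off, len, copyout_direct_cb, &cursor);
#endif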
 2131 
 2132 #if defined(DDB) || defined(DEBUGPRINT)
 2133 
 2134 /*
 2135  * uvm_page_printit: actually print the page
 2136  */
 2137 
 2138 static const char page_flagbits[] = UVM_PGFLAGBITS;
 2139 static const char page_pqflagbits[] = UVM_PQFLAGBITS;
 2140 
 2141 void
 2142 uvm_page_printit(struct vm_page *pg, bool full,
 2143     void (*pr)(const char *, ...))
 2144 {
 2145         struct vm_page *tpg;
 2146         struct uvm_object *uobj;
 2147         struct pgflbucket *pgb;
 2148         struct pgflist *pgl;
 2149         char pgbuf[128];
 2150 
 2151         (*pr)("PAGE %p:\n", pg);
 2152         snprintb(pgbuf, sizeof(pgbuf), page_flagbits, pg->flags);
 2153         (*pr)("  flags=%s\n", pgbuf);
 2154         snprintb(pgbuf, sizeof(pgbuf), page_pqflagbits, pg->pqflags);
 2155         (*pr)("  pqflags=%s\n", pgbuf);
 2156         (*pr)("  uobject=%p, uanon=%p, offset=0x%llx\n",
 2157             pg->uobject, pg->uanon, (long long)pg->offset);
 2158         (*pr)("  loan_count=%d wire_count=%d bucket=%d freelist=%d\n",
 2159             pg->loan_count, pg->wire_count, uvm_page_get_bucket(pg),
 2160             uvm_page_get_freelist(pg));
 2161         (*pr)("  pa=0x%lx\n", (long)VM_PAGE_TO_PHYS(pg));
 2162 #if defined(UVM_PAGE_TRKOWN)
 2163         if (pg->flags & PG_BUSY)
 2164                 (*pr)("  owning process = %d.%d, tag=%s\n",
 2165                     pg->owner, pg->lowner, pg->owner_tag);
 2166         else
 2167                 (*pr)("  page not busy, no owner\n");
 2168 #else
 2169         (*pr)("  [page ownership tracking disabled]\n");
 2170 #endif
 2171 
 2172         if (!full)
 2173                 return;
 2174 
 2175         /* cross-verify object/anon */
 2176         if ((pg->flags & PG_FREE) == 0) {
 2177                 if (pg->flags & PG_ANON) {
 2178                         if (pg->uanon == NULL || pg->uanon->an_page != pg)
 2179                             (*pr)("  >>> ANON DOES NOT POINT HERE <<< (%p)\n",
 2180                                 (pg->uanon) ? pg->uanon->an_page : NULL);
 2181                         else
 2182                                 (*pr)("  anon backpointer is OK\n");
 2183                 } else {
 2184                         uobj = pg->uobject;
 2185                         if (uobj) {
 2186                                 (*pr)("  checking object list\n");
 2187                                 tpg = uvm_pagelookup(uobj, pg->offset);
 2188                                 if (tpg)
 2189                                         (*pr)("  page found on object list\n");
 2190                                 else
 2191                         (*pr)("  >>> PAGE NOT FOUND ON OBJECT LIST! <<<\n");
 2192                         }
 2193                 }
 2194         }
 2195 
 2196         /* cross-verify page queue */
 2197         if (pg->flags & PG_FREE) {
 2198                 int fl = uvm_page_get_freelist(pg);
 2199                 int b = uvm_page_get_bucket(pg);
 2200                 pgb = uvm.page_free[fl].pgfl_buckets[b];
 2201                 pgl = &pgb->pgb_colors[VM_PGCOLOR(pg)];
 2202                 (*pr)("  checking pageq list\n");
 2203                 LIST_FOREACH(tpg, pgl, pageq.list) {
 2204                         if (tpg == pg) {
 2205                                 break;
 2206                         }
 2207                 }
 2208                 if (tpg)
 2209                         (*pr)("  page found on pageq list\n");
 2210                 else
 2211                         (*pr)("  >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
 2212         }
 2213 }
 2214 
 2215 /*
 2216  * uvm_page_printall - print a summary of all managed pages
 2217  */
 2218 
 2219 void
 2220 uvm_page_printall(void (*pr)(const char *, ...))
 2221 {
 2222         uvm_physseg_t i;
 2223         paddr_t pfn;
 2224         struct vm_page *pg;
 2225 
 2226         (*pr)("%18s %4s %4s %18s %18s"
 2227 #ifdef UVM_PAGE_TRKOWN
 2228             " OWNER"
 2229 #endif
 2230             "\n", "PAGE", "FLAG", "PQ", "UOBJECT", "UANON");
 2231         for (i = uvm_physseg_get_first();
 2232              uvm_physseg_valid_p(i);
 2233              i = uvm_physseg_get_next(i)) {
 2234                 for (pfn = uvm_physseg_get_start(i);
 2235                      pfn < uvm_physseg_get_end(i);
 2236                      pfn++) {
 2237                         pg = PHYS_TO_VM_PAGE(ptoa(pfn));
 2238 
 2239                         (*pr)("%18p %04x %08x %18p %18p",
 2240                             pg, pg->flags, pg->pqflags, pg->uobject,
 2241                             pg->uanon);
 2242 #ifdef UVM_PAGE_TRKOWN
 2243                         if (pg->flags & PG_BUSY)
 2244                                 (*pr)(" %d [%s]", pg->owner, pg->owner_tag);
 2245 #endif
 2246                         (*pr)("\n");
 2247                 }
 2248         }
 2249 }
 2250 
 2251 /*
 2252  * uvm_page_print_freelists - print a summary of the free lists
 2253  */
 2254 
 2255 void
 2256 uvm_page_print_freelists(void (*pr)(const char *, ...))
 2257 {
 2258         struct pgfreelist *pgfl;
 2259         struct pgflbucket *pgb;
 2260         int fl, b, c;
 2261 
 2262         (*pr)("There are %d freelists with %d buckets of %d colors.\n\n",
 2263             VM_NFREELIST, uvm.bucketcount, uvmexp.ncolors);
 2264 
 2265         for (fl = 0; fl < VM_NFREELIST; fl++) {
 2266                 pgfl = &uvm.page_free[fl];
 2267                 (*pr)("freelist(%d) @ %p\n", fl, pgfl);
 2268                 for (b = 0; b < uvm.bucketcount; b++) {
 2269                         pgb = uvm.page_free[fl].pgfl_buckets[b];
 2270                         (*pr)("    bucket(%d) @ %p, nfree = %d, lock @ %p:\n",
 2271                             b, pgb, pgb->pgb_nfree,
 2272                             &uvm_freelist_locks[b].lock);
 2273                         for (c = 0; c < uvmexp.ncolors; c++) {
 2274                                 (*pr)("        color(%d) @ %p, ", c,
 2275                                     &pgb->pgb_colors[c]);
 2276                                 (*pr)("first page = %p\n",
 2277                                     LIST_FIRST(&pgb->pgb_colors[c]));
 2278                         }
 2279                 }
 2280         }
 2281 }
 2282 
 2283 #endif /* DDB || DEBUGPRINT */
