FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_phys.c
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Physical memory system implementation
 *
 * Any external functions defined by this module are only to be used by the
 * virtual memory system.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domainset.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/vmmeter.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>

_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
    "Too many physsegs.");

#ifdef NUMA
struct mem_affinity __read_mostly *mem_affinity;
int __read_mostly *mem_locality;
#endif

int __read_mostly vm_ndomains = 1;
domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);

struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
int __read_mostly vm_phys_nsegs;
static struct vm_phys_seg vm_phys_early_segs[8];
static int vm_phys_early_nsegs;

struct vm_phys_fictitious_seg;
static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
    struct vm_phys_fictitious_seg *);

RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
    RB_INITIALIZER(&vm_phys_fictitious_tree);

struct vm_phys_fictitious_seg {
	RB_ENTRY(vm_phys_fictitious_seg) node;
	/* Memory region data */
	vm_paddr_t	start;
	vm_paddr_t	end;
	vm_page_t	first_page;
};

RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
    vm_phys_fictitious_cmp);

static struct rwlock_padalign vm_phys_fictitious_reg_lock;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");

static struct vm_freelist __aligned(CACHE_LINE_SIZE)
    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
    [VM_NFREEORDER_MAX];

static int __read_mostly vm_nfreelists;

/*
 * These "avail lists" are globals used to communicate boot-time physical
 * memory layout to other parts of the kernel.  Each physically contiguous
 * region of memory is defined by a start address at an even index and an
 * end address at the following odd index.  Each list is terminated by a
 * pair of zero entries.
 *
 * dump_avail tells the dump code what regions to include in a crash dump, and
 * phys_avail is all of the remaining physical memory that is available for
 * the vm system.
 *
 * Initially dump_avail and phys_avail are identical.  Boot time memory
 * allocations remove extents from phys_avail that may still be included
 * in dumps.
 */
vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];
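
/*
 * For example (all values hypothetical, not from any real machine), a
 * machine with two usable regions of RAM might have:
 *
 *	phys_avail[0] = 0x0000000000001000	start of the first region
 *	phys_avail[1] = 0x0000000000100000	end of the first region
 *	phys_avail[2] = 0x0000000000200000	start of the second region
 *	phys_avail[3] = 0x00000000c0200000	end of the second region
 *	phys_avail[4] = 0			terminating pair
 *	phys_avail[5] = 0
 *
 * vm_phys_avail_size(2) (defined below) would then return 0xc0000000,
 * i.e., 3 GB.
 */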

/*
 * Provides the mapping from VM_FREELIST_* to free list indices (flind).
 */
static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];

CTASSERT(VM_FREELIST_DEFAULT == 0);

#ifdef VM_FREELIST_DMA32
#define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
#endif

/*
 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
 * the ordering of the free list boundaries.
 */
#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
#endif

static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_free, "A",
    "Phys Free Info");

static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_segs, "A",
    "Phys Seg Info");

#ifdef NUMA
static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_locality,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_locality, "A",
    "Phys Locality Info");
#endif

SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
    &vm_ndomains, 0, "Number of physical memory domains available.");

static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order, int tail);

/*
 * Red-black tree helpers for vm fictitious range management.
 */
static inline int
vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
    struct vm_phys_fictitious_seg *range)
{

	KASSERT(range->start != 0 && range->end != 0,
	    ("Invalid range passed on search for vm_fictitious page"));
	if (p->start >= range->end)
		return (1);
	if (p->start < range->start)
		return (-1);

	return (0);
}

static int
vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
    struct vm_phys_fictitious_seg *p2)
{

	/* Check if this is a search for a page */
	if (p1->end == 0)
		return (vm_phys_fictitious_in_range(p1, p2));

	KASSERT(p2->end != 0,
	    ("Invalid range passed as second parameter to vm fictitious comparison"));

	/* Searching to add a new range */
	if (p1->end <= p2->start)
		return (-1);
	if (p1->start >= p2->end)
		return (1);

	panic("Trying to add overlapping vm fictitious ranges:\n"
	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
}
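
/*
 * Illustrative sketch: a point lookup in the fictitious tree is expressed
 * as a degenerate range whose "end" is zero, which the comparison routine
 * above treats as a search key rather than an insertion candidate.
 * vm_phys_fictitious_to_vm_page() below does exactly this:
 *
 *	struct vm_phys_fictitious_seg tmp;
 *
 *	tmp.start = pa;		physical address being looked up
 *	tmp.end = 0;		marks "tmp" as a point query
 *	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
 */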

int
vm_phys_domain_match(int prefer, vm_paddr_t low, vm_paddr_t high)
{
#ifdef NUMA
	domainset_t mask;
	int i;

	if (vm_ndomains == 1 || mem_affinity == NULL)
		return (0);

	DOMAINSET_ZERO(&mask);
	/*
	 * Check for any memory that overlaps low, high.
	 */
	for (i = 0; mem_affinity[i].end != 0; i++)
		if (mem_affinity[i].start <= high &&
		    mem_affinity[i].end >= low)
			DOMAINSET_SET(mem_affinity[i].domain, &mask);
	if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask))
		return (prefer);
	if (DOMAINSET_EMPTY(&mask))
		panic("vm_phys_domain_match: Impossible constraint");
	return (DOMAINSET_FFS(&mask) - 1);
#else
	return (0);
#endif
}
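
/*
 * Worked example (domain layout hypothetical): with domain 0 covering
 * [0, 4G) and domain 1 covering [4G, 8G), asking for memory in [3G, 5G)
 * with prefer == 1 returns 1, because both domains overlap the range and
 * the preferred one is among them.  With prefer == -1, the lowest-numbered
 * matching domain, 0, is returned instead.
 */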

/*
 * Outputs the state of the physical memory allocator, specifically,
 * the amount of physical memory in each free list.
 */
static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_freelist *fl;
	int dom, error, flind, oind, pind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
	for (dom = 0; dom < vm_ndomains; dom++) {
		sbuf_printf(&sbuf, "\nDOMAIN %d:\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "  |  POOL %d", pind);
			sbuf_printf(&sbuf, "\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "-- --      ");
			sbuf_printf(&sbuf, "--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					sbuf_printf(&sbuf, "  |  %6d",
					    fl[oind].lcnt);
				}
				sbuf_printf(&sbuf, "\n");
			}
		}
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Outputs the set of physical memory segments.
 */
static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_phys_seg *seg;
	int error, segind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
		seg = &vm_phys_segs[segind];
		sbuf_printf(&sbuf, "start:     %#jx\n",
		    (uintmax_t)seg->start);
		sbuf_printf(&sbuf, "end:       %#jx\n",
		    (uintmax_t)seg->end);
		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Return affinity, or -1 if there's no affinity information.
 */
int
vm_phys_mem_affinity(int f, int t)
{

#ifdef NUMA
	if (mem_locality == NULL)
		return (-1);
	if (f >= vm_ndomains || t >= vm_ndomains)
		return (-1);
	return (mem_locality[f * vm_ndomains + t]);
#else
	return (-1);
#endif
}

#ifdef NUMA
/*
 * Outputs the VM locality table.
 */
static int
sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	int error, i, j;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);

	sbuf_printf(&sbuf, "\n");

	for (i = 0; i < vm_ndomains; i++) {
		sbuf_printf(&sbuf, "%d: ", i);
		for (j = 0; j < vm_ndomains; j++) {
			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
		}
		sbuf_printf(&sbuf, "\n");
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
#endif

static void
vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
{

	m->order = order;
	if (tail)
		TAILQ_INSERT_TAIL(&fl[order].pl, m, listq);
	else
		TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
	fl[order].lcnt++;
}

static void
vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
{

	TAILQ_REMOVE(&fl[order].pl, m, listq);
	fl[order].lcnt--;
	m->order = VM_NFREEORDER;
}

/*
 * Create a physical memory segment.
 */
static void
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
{
	struct vm_phys_seg *seg;

	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_create_seg: invalid domain provided"));
	seg = &vm_phys_segs[vm_phys_nsegs++];
	while (seg > vm_phys_segs && (seg - 1)->start >= end) {
		*seg = *(seg - 1);
		seg--;
	}
	seg->start = start;
	seg->end = end;
	seg->domain = domain;
}

static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
{
#ifdef NUMA
	int i;

	if (mem_affinity == NULL) {
		_vm_phys_create_seg(start, end, 0);
		return;
	}

	for (i = 0;; i++) {
		if (mem_affinity[i].end == 0)
			panic("Reached end of affinity info");
		if (mem_affinity[i].end <= start)
			continue;
		if (mem_affinity[i].start > start)
			panic("No affinity info for start %jx",
			    (uintmax_t)start);
		if (mem_affinity[i].end >= end) {
			_vm_phys_create_seg(start, end,
			    mem_affinity[i].domain);
			break;
		}
		_vm_phys_create_seg(start, mem_affinity[i].end,
		    mem_affinity[i].domain);
		start = mem_affinity[i].end;
	}
#else
	_vm_phys_create_seg(start, end, 0);
#endif
}

/*
 * Add a physical memory segment.
 */
void
vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	vm_paddr_t paddr;

	KASSERT((start & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: start is not page aligned"));
	KASSERT((end & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: end is not page aligned"));

	/*
	 * Split the physical memory segment if it spans two or more free
	 * list boundaries.
	 */
	paddr = start;
#ifdef	VM_FREELIST_LOWMEM
	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
		paddr = VM_LOWMEM_BOUNDARY;
	}
#endif
#ifdef	VM_FREELIST_DMA32
	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
		paddr = VM_DMA32_BOUNDARY;
	}
#endif
	vm_phys_create_seg(paddr, end);
}
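
/*
 * For example (hypothetical layout, assuming 2M lies below
 * VM_LOWMEM_BOUNDARY): on a platform with both boundaries defined,
 * registering a segment [2M, 5G) records three segments,
 * [2M, VM_LOWMEM_BOUNDARY), [VM_LOWMEM_BOUNDARY, 4G), and [4G, 5G),
 * so that no segment ever straddles a free list boundary.
 */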

/*
 * Initialize the physical memory allocator.
 *
 * Requires that vm_page_array is initialized!
 */
void
vm_phys_init(void)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg;
#if defined(VM_DMA32_NPAGES_THRESHOLD) || defined(VM_PHYSSEG_SPARSE)
	u_long npages;
#endif
	int dom, flind, freelist, oind, pind, segind;

	/*
	 * Compute the number of free lists, and generate the mapping from the
	 * manifest constants VM_FREELIST_* to the free list indices.
	 *
	 * Initially, the entries of vm_freelist_to_flind[] are set to either
	 * 0 or 1 to indicate which free lists should be created.
	 */
#ifdef	VM_DMA32_NPAGES_THRESHOLD
	npages = 0;
#endif
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
#ifdef	VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
		else
#endif
#ifdef	VM_FREELIST_DMA32
		if (
#ifdef	VM_DMA32_NPAGES_THRESHOLD
		    /*
		     * Create the DMA32 free list only if the amount of
		     * physical memory above physical address 4G exceeds the
		     * given threshold.
		     */
		    npages > VM_DMA32_NPAGES_THRESHOLD &&
#endif
		    seg->end <= VM_DMA32_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
		else
#endif
		{
#ifdef	VM_DMA32_NPAGES_THRESHOLD
			npages += atop(seg->end - seg->start);
#endif
			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
		}
	}
	/* Change each entry into a running total of the free lists. */
	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
		vm_freelist_to_flind[freelist] +=
		    vm_freelist_to_flind[freelist - 1];
	}
	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
	/* Change each entry into a free list index. */
	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
		vm_freelist_to_flind[freelist]--;

	/*
	 * Initialize the first_page and free_queues fields of each physical
	 * memory segment.
	 */
#ifdef VM_PHYSSEG_SPARSE
	npages = 0;
#endif
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
#ifdef VM_PHYSSEG_SPARSE
		seg->first_page = &vm_page_array[npages];
		npages += atop(seg->end - seg->start);
#else
		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
#endif
#ifdef	VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
			KASSERT(flind >= 0,
			    ("vm_phys_init: LOWMEM flind < 0"));
		} else
#endif
#ifdef	VM_FREELIST_DMA32
		if (seg->end <= VM_DMA32_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DMA32 flind < 0"));
		} else
#endif
		{
			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DEFAULT flind < 0"));
		}
		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
	}

	/*
	 * Coalesce physical memory segments that are contiguous and share the
	 * same per-domain free queues.
	 */
	prev_seg = vm_phys_segs;
	seg = &vm_phys_segs[1];
	end_seg = &vm_phys_segs[vm_phys_nsegs];
	while (seg < end_seg) {
		if (prev_seg->end == seg->start &&
		    prev_seg->free_queues == seg->free_queues) {
			prev_seg->end = seg->end;
			KASSERT(prev_seg->domain == seg->domain,
			    ("vm_phys_init: free queues cannot span domains"));
			vm_phys_nsegs--;
			end_seg--;
			for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++)
				*tmp_seg = *(tmp_seg + 1);
		} else {
			prev_seg = seg;
			seg++;
		}
	}

	/*
	 * Initialize the free queues.
	 */
	for (dom = 0; dom < vm_ndomains; dom++) {
		for (flind = 0; flind < vm_nfreelists; flind++) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
				for (oind = 0; oind < VM_NFREEORDER; oind++)
					TAILQ_INIT(&fl[oind].pl);
			}
		}
	}

	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}

/*
 * Register info about the NUMA topology of the system.
 *
 * Invoked by platform-dependent code prior to vm_phys_init().
 */
void
vm_phys_register_domains(int ndomains, struct mem_affinity *affinity,
    int *locality)
{
#ifdef NUMA
	int d, i;

	/*
	 * For now the only override value that we support is 1, which
	 * effectively disables NUMA-awareness in the allocators.
	 */
	d = 0;
	TUNABLE_INT_FETCH("vm.numa.disabled", &d);
	if (d)
		ndomains = 1;

	if (ndomains > 1) {
		vm_ndomains = ndomains;
		mem_affinity = affinity;
		mem_locality = locality;
	}

	for (i = 0; i < vm_ndomains; i++)
		DOMAINSET_SET(i, &all_domains);
#else
	(void)ndomains;
	(void)affinity;
	(void)locality;
#endif
}

/*
 * Split a contiguous, power of two-sized set of physical pages.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the order [order, oind) queues
 * are known to be empty.  The objective is to reduce the likelihood of
 * long-term fragmentation by promoting contemporaneous allocation and
 * (hopefully) deallocation.
 */
static __inline void
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order,
    int tail)
{
	vm_page_t m_buddy;

	while (oind > order) {
		oind--;
		m_buddy = &m[1 << oind];
		KASSERT(m_buddy->order == VM_NFREEORDER,
		    ("vm_phys_split_pages: page %p has unexpected order %d",
		    m_buddy, m_buddy->order));
		vm_freelist_add(fl, m_buddy, oind, tail);
	}
}
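
/*
 * Worked example (illustrative): satisfying an order-1 request from an
 * order-3 block splits off the upper half one order at a time.  The
 * block's order-2 buddy (pages 4-7) and order-1 buddy (pages 2-3) are
 * returned to "fl", and pages 0-1 remain as the order-1 allocation.
 */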

/*
 * Add the physical pages [m, m + npages) at the end of a power-of-two aligned
 * and sized set to the specified free list.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the lower-order queues are
 * known to be empty.  The objective is to reduce the likelihood of long-
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
 * deallocation.
 *
 * The physical page m's buddy must not be free.
 */
static void
vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail)
{
	u_int n;
	int order;

	KASSERT(npages > 0, ("vm_phys_enq_range: npages is 0"));
	KASSERT(((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) &
	    ((PAGE_SIZE << (fls(npages) - 1)) - 1)) == 0,
	    ("vm_phys_enq_range: page %p and npages %u are misaligned",
	    m, npages));
	do {
		KASSERT(m->order == VM_NFREEORDER,
		    ("vm_phys_enq_range: page %p has unexpected order %d",
		    m, m->order));
		order = ffs(npages) - 1;
		KASSERT(order < VM_NFREEORDER,
		    ("vm_phys_enq_range: order %d is out of range", order));
		vm_freelist_add(fl, m, order, tail);
		n = 1 << order;
		m += n;
		npages -= n;
	} while (npages > 0);
}
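
/*
 * For example (illustrative): with npages == 5, the loop frees an order-0
 * block (ffs(5) - 1 == 0) and then an order-2 block (ffs(4) - 1 == 2),
 * walking from the low-order bits of npages upward so that each block is
 * naturally aligned within the enclosing power-of-two set.
 */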

/*
 * Set the pool for a contiguous, power of two-sized set of physical pages.
 */
static void
vm_phys_set_pool(int pool, vm_page_t m, int order)
{
	vm_page_t m_tmp;

	for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
		m_tmp->pool = pool;
}

/*
 * Tries to allocate the specified number of pages from the specified pool
 * within the specified domain.  Returns the actual number of allocated pages
 * and a pointer to each page through the array ma[].
 *
 * The returned pages may not be physically contiguous.  However, in contrast
 * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0),
 * calling this function once to allocate the desired number of pages will
 * avoid wasted time in vm_phys_split_pages().
 *
 * The free page queues for the specified domain must be locked.
 */
int
vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
{
	struct vm_freelist *alt, *fl;
	vm_page_t m;
	int avail, end, flind, freelist, i, need, oind, pind;

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_alloc_npages: domain %d is out of range", domain));
	KASSERT(pool < VM_NFREEPOOL,
	    ("vm_phys_alloc_npages: pool %d is out of range", pool));
	KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
	    ("vm_phys_alloc_npages: npages %d is out of range", npages));
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	i = 0;
	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
		flind = vm_freelist_to_flind[freelist];
		if (flind < 0)
			continue;
		fl = vm_phys_free_queues[domain][flind][pool];
		for (oind = 0; oind < VM_NFREEORDER; oind++) {
			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
				vm_freelist_rem(fl, m, oind);
				avail = 1 << oind;
				need = imin(npages - i, avail);
				for (end = i + need; i < end;)
					ma[i++] = m++;
				if (need < avail) {
					/*
					 * Return excess pages to fl.  Its
					 * order [0, oind) queues are empty.
					 */
					vm_phys_enq_range(m, avail - need, fl,
					    1);
					return (npages);
				} else if (i == npages)
					return (npages);
			}
		}
		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				alt = vm_phys_free_queues[domain][flind][pind];
				while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
				    NULL) {
					vm_freelist_rem(alt, m, oind);
					vm_phys_set_pool(pool, m, oind);
					avail = 1 << oind;
					need = imin(npages - i, avail);
					for (end = i + need; i < end;)
						ma[i++] = m++;
					if (need < avail) {
						/*
						 * Return excess pages to fl.
						 * Its order [0, oind) queues
						 * are empty.
						 */
						vm_phys_enq_range(m, avail -
						    need, fl, 1);
						return (npages);
					} else if (i == npages)
						return (npages);
				}
			}
		}
	}
	return (i);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages
 * from the free lists.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_pages(int domain, int pool, int order)
{
	vm_page_t m;
	int freelist;

	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
		m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order);
		if (m != NULL)
			return (m);
	}
	return (NULL);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages from the
 * specified free list.  The free list must be specified using one of the
 * manifest constants VM_FREELIST_*.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
{
	struct vm_freelist *alt, *fl;
	vm_page_t m;
	int oind, pind, flind;

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_alloc_freelist_pages: domain %d is out of range",
	    domain));
	KASSERT(freelist < VM_NFREELIST,
	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
	    freelist));
	KASSERT(pool < VM_NFREEPOOL,
	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));

	flind = vm_freelist_to_flind[freelist];
	/* Check if freelist is present */
	if (flind < 0)
		return (NULL);

	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	fl = &vm_phys_free_queues[domain][flind][pool][0];
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		m = TAILQ_FIRST(&fl[oind].pl);
		if (m != NULL) {
			vm_freelist_rem(fl, m, oind);
			/* The order [order, oind) queues are empty. */
			vm_phys_split_pages(m, oind, fl, order, 1);
			return (m);
		}
	}

	/*
	 * The given pool was empty.  Find the largest
	 * contiguous, power-of-two-sized set of pages in any
	 * pool.  Transfer these pages to the given pool, and
	 * use them to satisfy the allocation.
	 */
	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
			alt = &vm_phys_free_queues[domain][flind][pind][0];
			m = TAILQ_FIRST(&alt[oind].pl);
			if (m != NULL) {
				vm_freelist_rem(alt, m, oind);
				vm_phys_set_pool(pool, m, oind);
				/* The order [order, oind) queues are empty. */
				vm_phys_split_pages(m, oind, fl, order, 1);
				return (m);
			}
		}
	}
	return (NULL);
}

/*
 * Find the vm_page corresponding to the given physical address.
 */
vm_page_t
vm_phys_paddr_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;
	int segind;

	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		if (pa >= seg->start && pa < seg->end)
			return (&seg->first_page[atop(pa - seg->start)]);
	}
	return (NULL);
}

vm_page_t
vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_fictitious_seg tmp, *seg;
	vm_page_t m;

	m = NULL;
	tmp.start = pa;
	tmp.end = 0;

	rw_rlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	rw_runlock(&vm_phys_fictitious_reg_lock);
	if (seg == NULL)
		return (NULL);

	m = &seg->first_page[atop(pa - seg->start)];
	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));

	return (m);
}

static inline void
vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
    long page_count, vm_memattr_t memattr)
{
	long i;

	bzero(range, page_count * sizeof(*range));
	for (i = 0; i < page_count; i++) {
		vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
		range[i].oflags &= ~VPO_UNMANAGED;
		range[i].busy_lock = VPB_UNBUSIED;
	}
}

int
vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
    vm_memattr_t memattr)
{
	struct vm_phys_fictitious_seg *seg;
	vm_page_t fp;
	long page_count;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
	long dpage_count;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

	page_count = (end - start) / PAGE_SIZE;

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		fp = &vm_page_array[pi - first_page];
		if ((pe - first_page) > vm_page_array_size) {
			/*
			 * We have a segment that starts inside
			 * of vm_page_array, but ends outside of it.
			 *
			 * Use vm_page_array pages for those that are
			 * inside of the vm_page_array range, and
			 * allocate the remaining ones.
			 */
			dpage_count = vm_page_array_size - (pi - first_page);
			vm_phys_fictitious_init_range(fp, start, dpage_count,
			    memattr);
			page_count -= dpage_count;
			start += ptoa(dpage_count);
			goto alloc;
		}
		/*
		 * We can allocate the full range from vm_page_array,
		 * so there's no need to register the range in the tree.
		 */
		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
		return (0);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		fp = &vm_page_array[0];
		dpage_count = pe - first_page;
		vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
		    memattr);
		end -= ptoa(dpage_count);
		page_count -= dpage_count;
		goto alloc;
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/*
		 * Trying to register a fictitious range that expands before
		 * and after vm_page_array.
		 */
		return (EINVAL);
	} else {
alloc:
#endif
		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
		    M_WAITOK);
#ifdef VM_PHYSSEG_DENSE
	}
#endif
	vm_phys_fictitious_init_range(fp, start, page_count, memattr);

	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
	seg->start = start;
	seg->end = end;
	seg->first_page = fp;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);

	return (0);
}

void
vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_fictitious_seg *seg, tmp;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		if ((pe - first_page) <= vm_page_array_size) {
			/*
			 * This segment was allocated using vm_page_array
			 * only, there's nothing to do since those pages
			 * were never added to the tree.
			 */
			return;
		}
		/*
		 * We have a segment that starts inside
		 * of vm_page_array, but ends outside of it.
		 *
		 * Calculate how many pages were added to the
		 * tree and free them.
		 */
		start = ptoa(first_page + vm_page_array_size);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		end = ptoa(first_page);
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/* Since it's not possible to register such a range, panic. */
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
#endif
	tmp.start = start;
	tmp.end = 0;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	if (seg == NULL || seg->start != start || seg->end != end) {
		rw_wunlock(&vm_phys_fictitious_reg_lock);
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);
	free(seg->first_page, M_FICT_PAGES);
	free(seg, M_FICT_PAGES);
}

/*
 * Free a contiguous, power of two-sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_pages(vm_page_t m, int order)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa;
	vm_page_t m_buddy;

	KASSERT(m->order == VM_NFREEORDER,
	    ("vm_phys_free_pages: page %p has unexpected order %d",
	    m, m->order));
	KASSERT(m->pool < VM_NFREEPOOL,
	    ("vm_phys_free_pages: page %p has unexpected pool %d",
	    m, m->pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_free_pages: order %d is out of range", order));
	seg = &vm_phys_segs[m->segind];
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
	if (order < VM_NFREEORDER - 1) {
		pa = VM_PAGE_TO_PHYS(m);
		do {
			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
			if (pa < seg->start || pa >= seg->end)
				break;
			m_buddy = &seg->first_page[atop(pa - seg->start)];
			if (m_buddy->order != order)
				break;
			fl = (*seg->free_queues)[m_buddy->pool];
			vm_freelist_rem(fl, m_buddy, order);
			if (m_buddy->pool != m->pool)
				vm_phys_set_pool(m->pool, m_buddy, order);
			order++;
			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
			m = &seg->first_page[atop(pa - seg->start)];
		} while (order < VM_NFREEORDER - 1);
	}
	fl = (*seg->free_queues)[m->pool];
	vm_freelist_add(fl, m, order, 1);
}
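
/*
 * Worked example (addresses hypothetical, 4K pages): freeing the order-0
 * page at 0x5000 computes its buddy as 0x5000 ^ 0x1000 == 0x4000.  If the
 * page at 0x4000 is free at order 0, the pair coalesces into an order-1
 * block at 0x4000 (the XOR'ed address masked down to order-1 alignment),
 * and the search repeats with the order-1 buddy at 0x6000.
 */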

/*
 * Return the largest possible order of a set of pages starting at m.
 */
static int
max_order(vm_page_t m)
{

	/*
	 * Unsigned "min" is used here so that "order" is assigned
	 * "VM_NFREEORDER - 1" when "m"'s physical address is zero
	 * or the low-order bits of its physical address are zero
	 * because the size of a physical address exceeds the size of
	 * a long.
	 */
	return (min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1,
	    VM_NFREEORDER - 1));
}
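
/*
 * For example (hypothetical address, 4K pages): a page at 0x230000 has
 * page frame number 0x230, whose least significant set bit is bit 4, so
 * ffsl() returns 5 and max_order() returns 4; a block starting there can
 * be naturally aligned only up to 2^4 == 16 pages.
 */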

/*
 * Free a contiguous, arbitrarily sized set of physical pages, without
 * merging across set boundaries.
 *
 * The free page queues must be locked.
 */
void
vm_phys_enqueue_contig(vm_page_t m, u_long npages)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_page_t m_end;
	int order;

	/*
	 * Avoid unnecessary coalescing by freeing the pages in the largest
	 * possible power-of-two-sized subsets.
	 */
	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
	seg = &vm_phys_segs[m->segind];
	fl = (*seg->free_queues)[m->pool];
	m_end = m + npages;
	/* Free blocks of increasing size. */
	while ((order = max_order(m)) < VM_NFREEORDER - 1 &&
	    m + (1 << order) <= m_end) {
		KASSERT(seg == &vm_phys_segs[m->segind],
		    ("%s: page range [%p,%p) spans multiple segments",
		    __func__, m_end - npages, m));
		vm_freelist_add(fl, m, order, 1);
		m += 1 << order;
	}
	/* Free blocks of maximum size. */
	while (m + (1 << order) <= m_end) {
		KASSERT(seg == &vm_phys_segs[m->segind],
		    ("%s: page range [%p,%p) spans multiple segments",
		    __func__, m_end - npages, m));
		vm_freelist_add(fl, m, order, 1);
		m += 1 << order;
	}
	/* Free blocks of diminishing size. */
	while (m < m_end) {
		KASSERT(seg == &vm_phys_segs[m->segind],
		    ("%s: page range [%p,%p) spans multiple segments",
		    __func__, m_end - npages, m));
		order = flsl(m_end - m) - 1;
		vm_freelist_add(fl, m, order, 1);
		m += 1 << order;
	}
}
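
/*
 * Worked example (page frame numbers hypothetical): enqueueing 13 pages
 * starting at page frame 1 frees blocks of 1, 2, and 4 pages while
 * alignment limits the order, finds that the next 8-page block would
 * overrun the range, and finishes with diminishing blocks of 4 and 2
 * pages.
 */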

/*
 * Free a contiguous, arbitrarily sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_contig(vm_page_t m, u_long npages)
{
	int order_start, order_end;
	vm_page_t m_start, m_end;

	vm_domain_free_assert_locked(vm_pagequeue_domain(m));

	m_start = m;
	order_start = max_order(m_start);
	if (order_start < VM_NFREEORDER - 1)
		m_start += 1 << order_start;
	m_end = m + npages;
	order_end = max_order(m_end);
	if (order_end < VM_NFREEORDER - 1)
		m_end -= 1 << order_end;
	/*
	 * Avoid unnecessary coalescing by freeing the pages at the start and
	 * end of the range last.
	 */
	if (m_start < m_end)
		vm_phys_enqueue_contig(m_start, m_end - m_start);
	if (order_start < VM_NFREEORDER - 1)
		vm_phys_free_pages(m, order_start);
	if (order_end < VM_NFREEORDER - 1)
		vm_phys_free_pages(m_end, order_end);
}

/*
 * Scan physical memory between the specified addresses "low" and "high" for a
 * run of contiguous physical pages that satisfy the specified conditions, and
 * return the lowest page in the run.  The specified "alignment" determines
 * the alignment of the lowest physical page in the run.  If the specified
 * "boundary" is non-zero, then the run of physical pages cannot span a
 * physical address that is a multiple of "boundary".
 *
 * "npages" must be greater than zero.  Both "alignment" and "boundary" must
 * be a power of two.
 */
vm_page_t
vm_phys_scan_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary, int options)
{
	vm_paddr_t pa_end;
	vm_page_t m_end, m_run, m_start;
	struct vm_phys_seg *seg;
	int segind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	if (low >= high)
		return (NULL);
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		if (seg->domain != domain)
			continue;
		if (seg->start >= high)
			break;
		if (low >= seg->end)
			continue;
		if (low <= seg->start)
			m_start = seg->first_page;
		else
			m_start = &seg->first_page[atop(low - seg->start)];
		if (high < seg->end)
			pa_end = high;
		else
			pa_end = seg->end;
		if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages))
			continue;
		m_end = &seg->first_page[atop(pa_end - seg->start)];
		m_run = vm_page_scan_contig(npages, m_start, m_end,
		    alignment, boundary, options);
		if (m_run != NULL)
			return (m_run);
	}
	return (NULL);
}

/*
 * Search for the given physical page "m" in the free lists.  If the search
 * succeeds, remove "m" from the free lists and return TRUE.  Otherwise, return
 * FALSE, indicating that "m" is not in the free lists.
 *
 * The free page queues must be locked.
 */
boolean_t
vm_phys_unfree_page(vm_page_t m)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa, pa_half;
	vm_page_t m_set, m_tmp;
	int order;

	/*
	 * First, find the contiguous, power of two-sized set of free
	 * physical pages containing the given physical page "m" and
	 * assign it to "m_set".
	 */
	seg = &vm_phys_segs[m->segind];
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
	    order < VM_NFREEORDER - 1; ) {
		order++;
		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
		if (pa >= seg->start)
			m_set = &seg->first_page[atop(pa - seg->start)];
		else
			return (FALSE);
	}
	if (m_set->order < order)
		return (FALSE);
	if (m_set->order == VM_NFREEORDER)
		return (FALSE);
	KASSERT(m_set->order < VM_NFREEORDER,
	    ("vm_phys_unfree_page: page %p has unexpected order %d",
	    m_set, m_set->order));

	/*
	 * Next, remove "m_set" from the free lists.  Finally, extract
	 * "m" from "m_set" using an iterative algorithm: While "m_set"
	 * is larger than a page, shrink "m_set" by returning the half
	 * of "m_set" that does not contain "m" to the free lists.
	 */
	fl = (*seg->free_queues)[m_set->pool];
	order = m_set->order;
	vm_freelist_rem(fl, m_set, order);
	while (order > 0) {
		order--;
		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
		if (m->phys_addr < pa_half)
			m_tmp = &seg->first_page[atop(pa_half - seg->start)];
		else {
			m_tmp = m_set;
			m_set = &seg->first_page[atop(pa_half - seg->start)];
		}
		vm_freelist_add(fl, m_tmp, order, 0);
	}
	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
	return (TRUE);
}

/*
 * Find a run of contiguous physical pages from the specified page list.
 */
static vm_page_t
vm_phys_find_freelist_contig(struct vm_freelist *fl, int oind, u_long npages,
    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
{
	struct vm_phys_seg *seg;
	vm_paddr_t frag, lbound, pa, page_size, pa_end, pa_pre, size;
	vm_page_t m, m_listed, m_ret;
	int order;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	/* Search for a run satisfying the specified conditions. */
	page_size = PAGE_SIZE;
	size = npages << PAGE_SHIFT;
	frag = (npages & ~(~0UL << oind)) << PAGE_SHIFT;
	TAILQ_FOREACH(m_listed, &fl[oind].pl, listq) {
		/*
		 * Determine if the address range starting at pa is
		 * too low.
		 */
		pa = VM_PAGE_TO_PHYS(m_listed);
		if (pa < low)
			continue;

		/*
		 * If this is not the first free oind-block in this range, bail
		 * out.  We have seen the first free block already, or will see
		 * it before failing to find an appropriate range.
		 */
		seg = &vm_phys_segs[m_listed->segind];
		lbound = low > seg->start ? low : seg->start;
		pa_pre = pa - (page_size << oind);
		m = &seg->first_page[atop(pa_pre - seg->start)];
		if (pa != 0 && pa_pre >= lbound && m->order == oind)
			continue;

		if (!vm_addr_align_ok(pa, alignment))
			/* Advance to satisfy alignment condition. */
			pa = roundup2(pa, alignment);
		else if (frag != 0 && lbound + frag <= pa) {
			/*
			 * Back up to the first aligned free block in this
			 * range, without moving below lbound.
			 */
			pa_end = pa;
			for (order = oind - 1; order >= 0; order--) {
				pa_pre = pa_end - (page_size << order);
				if (!vm_addr_align_ok(pa_pre, alignment))
					break;
				m = &seg->first_page[atop(pa_pre - seg->start)];
				if (pa_pre >= lbound && m->order == order)
					pa_end = pa_pre;
			}
			/*
			 * If the extra small blocks are enough to complete the
			 * fragment, use them.  Otherwise, look to allocate the
			 * fragment at the other end.
			 */
			if (pa_end + frag <= pa)
				pa = pa_end;
		}

		/* Advance as necessary to satisfy boundary conditions. */
		if (!vm_addr_bound_ok(pa, size, boundary))
			pa = roundup2(pa + 1, boundary);
		pa_end = pa + size;

		/*
		 * Determine if the address range is valid (without overflow in
		 * pa_end calculation), and fits within the segment.
		 */
		if (pa_end < pa || seg->end < pa_end)
			continue;

		m_ret = &seg->first_page[atop(pa - seg->start)];

		/*
		 * Determine whether there are enough free oind-blocks here to
		 * satisfy the allocation request.
		 */
		pa = VM_PAGE_TO_PHYS(m_listed);
		do {
			pa += page_size << oind;
			if (pa >= pa_end)
				return (m_ret);
			m = &seg->first_page[atop(pa - seg->start)];
		} while (oind == m->order);

		/*
		 * Determine if an additional series of free blocks of
		 * diminishing size can help to satisfy the allocation request.
		 */
		while (m->order < oind &&
		    pa + 2 * (page_size << m->order) > pa_end) {
			pa += page_size << m->order;
			if (pa >= pa_end)
				return (m_ret);
			m = &seg->first_page[atop(pa - seg->start)];
		}
	}
	return (NULL);
}

/*
 * Find a run of contiguous physical pages from the specified free list
 * table.
 */
static vm_page_t
vm_phys_find_queues_contig(
    struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX],
    u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	struct vm_freelist *fl;
	vm_page_t m_ret;
	vm_paddr_t pa, pa_end, size;
	int oind, order, pind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	/* Compute the queue that is the best fit for npages. */
	order = flsl(npages - 1);
	/* Search for a large enough free block. */
	size = npages << PAGE_SHIFT;
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
			fl = (*queues)[pind];
			TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
				/*
				 * Determine if the address range starting at pa
				 * is within the given range, satisfies the
				 * given alignment, and does not cross the given
				 * boundary.
				 */
				pa = VM_PAGE_TO_PHYS(m_ret);
				pa_end = pa + size;
				if (low <= pa && pa_end <= high &&
				    vm_addr_ok(pa, size, alignment, boundary))
					return (m_ret);
			}
		}
	}
	if (order < VM_NFREEORDER)
		return (NULL);
	/* Search for a long-enough sequence of small blocks. */
	oind = VM_NFREEORDER - 1;
	for (pind = 0; pind < VM_NFREEPOOL; pind++) {
		fl = (*queues)[pind];
		m_ret = vm_phys_find_freelist_contig(fl, oind, npages,
		    low, high, alignment, boundary);
		if (m_ret != NULL)
			return (m_ret);
	}
	return (NULL);
}

/*
 * Allocate a contiguous set of physical pages of the given size
 * "npages" from the free lists.  All of the physical pages must be at
 * or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.  If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross
 * any physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.
 */
vm_page_t
vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	vm_paddr_t pa_end, pa_start;
	struct vm_freelist *fl;
	vm_page_t m, m_run;
	struct vm_phys_seg *seg;
	struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX];
	int oind, segind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	if (low >= high)
		return (NULL);
	queues = NULL;
	m_run = NULL;
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
		if (seg->start >= high || seg->domain != domain)
			continue;
		if (low >= seg->end)
			break;
		if (low <= seg->start)
			pa_start = seg->start;
		else
			pa_start = low;
		if (high < seg->end)
			pa_end = high;
		else
			pa_end = seg->end;
		if (pa_end - pa_start < ptoa(npages))
			continue;
		/*
		 * If a previous segment led to a search using
		 * the same free lists as would this segment, then
		 * we've actually already searched within this
		 * too.  So skip it.
		 */
		if (seg->free_queues == queues)
			continue;
		queues = seg->free_queues;
		m_run = vm_phys_find_queues_contig(queues, npages,
		    low, high, alignment, boundary);
		if (m_run != NULL)
			break;
	}
	if (m_run == NULL)
		return (NULL);

	/* Allocate pages from the page-range found. */
	for (m = m_run; m < &m_run[npages]; m = &m[1 << oind]) {
		fl = (*queues)[m->pool];
		oind = m->order;
		vm_freelist_rem(fl, m, oind);
		if (m->pool != VM_FREEPOOL_DEFAULT)
			vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind);
	}
	/* Return excess pages to the free lists. */
	if (&m_run[npages] < m) {
		fl = (*queues)[VM_FREEPOOL_DEFAULT];
		vm_phys_enq_range(&m_run[npages], m - &m_run[npages], fl, 0);
	}
	return (m_run);
}

/*
 * Return the index of the first unused slot which may be the terminating
 * entry.
 */
static int
vm_phys_avail_count(void)
{
	int i;

	for (i = 0; phys_avail[i + 1]; i += 2)
		continue;
	if (i > PHYS_AVAIL_ENTRIES)
		panic("Improperly terminated phys_avail %d entries", i);

	return (i);
}

/*
 * Assert that a phys_avail entry is valid.
 */
static void
vm_phys_avail_check(int i)
{
	if (phys_avail[i] & PAGE_MASK)
		panic("Unaligned phys_avail[%d]: %#jx", i,
		    (intmax_t)phys_avail[i]);
	if (phys_avail[i + 1] & PAGE_MASK)
		panic("Unaligned phys_avail[%d + 1]: %#jx", i,
		    (intmax_t)phys_avail[i + 1]);
	if (phys_avail[i + 1] < phys_avail[i])
		panic("phys_avail[%d] end %#jx < start %#jx", i,
		    (intmax_t)phys_avail[i + 1], (intmax_t)phys_avail[i]);
}

/*
 * Return the index of an overlapping phys_avail entry or -1.
 */
#ifdef NUMA
static int
vm_phys_avail_find(vm_paddr_t pa)
{
	int i;

	for (i = 0; phys_avail[i + 1]; i += 2)
		if (phys_avail[i] <= pa && phys_avail[i + 1] > pa)
			return (i);
	return (-1);
}
#endif

/*
 * Return the index of the largest entry.
 */
int
vm_phys_avail_largest(void)
{
	vm_paddr_t sz, largesz;
	int largest;
	int i;

	largest = 0;
	largesz = 0;
	for (i = 0; phys_avail[i + 1]; i += 2) {
		sz = vm_phys_avail_size(i);
		if (sz > largesz) {
			largesz = sz;
			largest = i;
		}
	}

	return (largest);
}

vm_paddr_t
vm_phys_avail_size(int i)
{

	return (phys_avail[i + 1] - phys_avail[i]);
}

/*
 * Split an entry at the address 'pa'.  Return zero on success or errno.
 */
static int
vm_phys_avail_split(vm_paddr_t pa, int i)
{
	int cnt;

	vm_phys_avail_check(i);
	if (pa <= phys_avail[i] || pa >= phys_avail[i + 1])
		panic("vm_phys_avail_split: invalid address");
	cnt = vm_phys_avail_count();
	if (cnt >= PHYS_AVAIL_ENTRIES)
		return (ENOSPC);
	memmove(&phys_avail[i + 2], &phys_avail[i],
	    (cnt - i) * sizeof(phys_avail[0]));
	phys_avail[i + 1] = pa;
	phys_avail[i + 2] = pa;
	vm_phys_avail_check(i);
	vm_phys_avail_check(i + 2);

	return (0);
}
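
/*
 * For example (values hypothetical): splitting the entry [0x400000,
 * 0xc00000) at pa == 0x800000 shifts the remaining entries up by two
 * slots and leaves [0x400000, 0x800000) at index i and [0x800000,
 * 0xc00000) at index i + 2, with both halves sharing the split address.
 */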

/*
 * Check if a given physical address can be included as part of a crash dump.
 */
bool
vm_phys_is_dumpable(vm_paddr_t pa)
{
	vm_page_t m;
	int i;

	if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
		return ((m->flags & PG_NODUMP) == 0);

	for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
		if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
			return (true);
	}
	return (false);
}

void
vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_seg *seg;

	if (vm_phys_early_nsegs == -1)
		panic("%s: called after initialization", __func__);
	if (vm_phys_early_nsegs == nitems(vm_phys_early_segs))
		panic("%s: ran out of early segments", __func__);

	seg = &vm_phys_early_segs[vm_phys_early_nsegs++];
	seg->start = start;
	seg->end = end;
}

/*
 * This routine allocates NUMA node specific memory before the page
 * allocator is bootstrapped.
 */
vm_paddr_t
vm_phys_early_alloc(int domain, size_t alloc_size)
{
#ifdef NUMA
	int mem_index;
#endif
	int i, biggestone;
	vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align;

	KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains),
	    ("%s: invalid domain index %d", __func__, domain));

	/*
	 * Search the mem_affinity array for the biggest address
	 * range in the desired domain.  This is used to constrain
	 * the phys_avail selection below.
	 */
	biggestsize = 0;
	mem_start = 0;
	mem_end = -1;
#ifdef NUMA
	mem_index = 0;
	if (mem_affinity != NULL) {
		for (i = 0;; i++) {
			size = mem_affinity[i].end - mem_affinity[i].start;
			if (size == 0)
				break;
			if (domain != -1 && mem_affinity[i].domain != domain)
				continue;
			if (size > biggestsize) {
				mem_index = i;
				biggestsize = size;
			}
		}
		mem_start = mem_affinity[mem_index].start;
		mem_end = mem_affinity[mem_index].end;
	}
#endif

	/*
	 * Now find the biggest physical segment within the desired
	 * NUMA domain.
	 */
	biggestsize = 0;
	biggestone = 0;
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		/* skip regions that are out of range */
		if (phys_avail[i + 1] - alloc_size < mem_start ||
		    phys_avail[i + 1] > mem_end)
			continue;
		size = vm_phys_avail_size(i);
		if (size > biggestsize) {
			biggestone = i;
			biggestsize = size;
		}
	}
	alloc_size = round_page(alloc_size);

	/*
	 * Grab single pages from the front to reduce fragmentation.
	 */
	if (alloc_size == PAGE_SIZE) {
		pa = phys_avail[biggestone];
		phys_avail[biggestone] += PAGE_SIZE;
		vm_phys_avail_check(biggestone);
		return (pa);
	}

	/*
	 * Naturally align large allocations.
	 */
	align = phys_avail[biggestone + 1] & (alloc_size - 1);
	if (alloc_size + align > biggestsize)
		panic("cannot find a large enough size");
	if (align != 0 &&
	    vm_phys_avail_split(phys_avail[biggestone + 1] - align,
	    biggestone) != 0)
		/* Wasting memory. */
		phys_avail[biggestone + 1] -= align;

	phys_avail[biggestone + 1] -= alloc_size;
	vm_phys_avail_check(biggestone);
	pa = phys_avail[biggestone + 1];
	return (pa);
}

void
vm_phys_early_startup(void)
{
	struct vm_phys_seg *seg;
	int i;

	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		phys_avail[i] = round_page(phys_avail[i]);
		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
	}

	for (i = 0; i < vm_phys_early_nsegs; i++) {
		seg = &vm_phys_early_segs[i];
		vm_phys_add_seg(seg->start, seg->end);
	}
	vm_phys_early_nsegs = -1;

#ifdef NUMA
	/* Force phys_avail to be split by domain. */
	if (mem_affinity != NULL) {
		int idx;

		for (i = 0; mem_affinity[i].end != 0; i++) {
			idx = vm_phys_avail_find(mem_affinity[i].start);
			if (idx != -1 &&
			    phys_avail[idx] != mem_affinity[i].start)
				vm_phys_avail_split(mem_affinity[i].start, idx);
			idx = vm_phys_avail_find(mem_affinity[i].end);
			if (idx != -1 &&
			    phys_avail[idx] != mem_affinity[i].end)
				vm_phys_avail_split(mem_affinity[i].end, idx);
		}
	}
#endif
}

#ifdef DDB
/*
 * Show the number of physical pages in each of the free lists.
 */
DB_SHOW_COMMAND_FLAGS(freepages, db_show_freepages, DB_CMD_MEMSAFE)
{
	struct vm_freelist *fl;
	int flind, oind, pind, dom;

	for (dom = 0; dom < vm_ndomains; dom++) {
		db_printf("DOMAIN: %d\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			db_printf("FREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("  |  POOL %d", pind);
			db_printf("\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("-- --      ");
			db_printf("--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				db_printf("  %2.2d (%6.6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					db_printf("  |  %6.6d", fl[oind].lcnt);
				}
				db_printf("\n");
			}
			db_printf("\n");
		}
		db_printf("\n");
	}
}
#endif