FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_phys.c
1 /*-
2 * Copyright (c) 2002-2006 Rice University
3 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
4 * All rights reserved.
5 *
6 * This software was developed for the FreeBSD Project by Alan L. Cox,
7 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
25 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
26 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
28 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Physical memory system implementation
34 *
35 * Any external functions defined by this module are only to be used by the
36 * virtual memory system.
37 */
38
39 #include <sys/cdefs.h>
40 __FBSDID("$FreeBSD$");
41
42 #include "opt_ddb.h"
43 #include "opt_vm.h"
44
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/lock.h>
48 #include <sys/kernel.h>
49 #include <sys/malloc.h>
50 #include <sys/mutex.h>
51 #include <sys/proc.h>
52 #include <sys/queue.h>
53 #include <sys/rwlock.h>
54 #include <sys/sbuf.h>
55 #include <sys/sysctl.h>
56 #include <sys/tree.h>
57 #include <sys/vmmeter.h>
58 #include <sys/seq.h>
59
60 #include <ddb/ddb.h>
61
62 #include <vm/vm.h>
63 #include <vm/vm_param.h>
64 #include <vm/vm_kern.h>
65 #include <vm/vm_object.h>
66 #include <vm/vm_page.h>
67 #include <vm/vm_phys.h>
68
69 #include <vm/vm_domain.h>
70
71 _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
72 "Too many physsegs.");
73
74 #ifdef VM_NUMA_ALLOC
75 struct mem_affinity *mem_affinity;
76 int *mem_locality;
77 #endif
78
79 int vm_ndomains = 1;
80
81 struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
82 int vm_phys_nsegs;
83
84 struct vm_phys_fictitious_seg;
85 static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
86 struct vm_phys_fictitious_seg *);
87
88 RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
89 RB_INITIALIZER(_vm_phys_fictitious_tree);
90
91 struct vm_phys_fictitious_seg {
92 RB_ENTRY(vm_phys_fictitious_seg) node;
93 /* Memory region data */
94 vm_paddr_t start;
95 vm_paddr_t end;
96 vm_page_t first_page;
97 };
98
99 RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
100 vm_phys_fictitious_cmp);
101
102 static struct rwlock vm_phys_fictitious_reg_lock;
103 MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");
104
105 static struct vm_freelist
106 vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
107
108 static int vm_nfreelists;
109
110 /*
111 * Provides the mapping from VM_FREELIST_* to free list indices (flind).
112 */
113 static int vm_freelist_to_flind[VM_NFREELIST];
114
115 CTASSERT(VM_FREELIST_DEFAULT == 0);
116
117 #ifdef VM_FREELIST_ISADMA
118 #define VM_ISADMA_BOUNDARY 16777216
119 #endif
120 #ifdef VM_FREELIST_DMA32
121 #define VM_DMA32_BOUNDARY ((vm_paddr_t)1 << 32)
122 #endif
123
124 /*
125 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
126 * the ordering of the free list boundaries.
127 */
128 #if defined(VM_ISADMA_BOUNDARY) && defined(VM_LOWMEM_BOUNDARY)
129 CTASSERT(VM_ISADMA_BOUNDARY < VM_LOWMEM_BOUNDARY);
130 #endif
131 #if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
132 CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
133 #endif
134
135 static int cnt_prezero;
136 SYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD,
137 &cnt_prezero, 0, "The number of physical pages prezeroed at idle time");
138
139 static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
140 SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD,
141 NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info");
142
143 static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
144 SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
145 NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");
146
147 #ifdef VM_NUMA_ALLOC
148 static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
149 SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD,
150 NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info");
151 #endif
152
153 SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
154 &vm_ndomains, 0, "Number of physical memory domains available.");
155
156 /*
157 * Default to first-touch + round-robin.
158 */
159 static struct mtx vm_default_policy_mtx;
160 MTX_SYSINIT(vm_default_policy, &vm_default_policy_mtx, "default policy mutex",
161 MTX_DEF);
162 #ifdef VM_NUMA_ALLOC
163 static struct vm_domain_policy vm_default_policy =
164 VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0);
165 #else
166 /* Use round-robin so the domain policy code will only try once per allocation */
167 static struct vm_domain_policy vm_default_policy =
168 VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_ROUND_ROBIN, 0);
169 #endif
170
171 static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool,
172 int order);
173 static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg,
174 u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
175 vm_paddr_t boundary);
176 static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
177 static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
178 static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
179 int order);
180
181 static int
182 sysctl_vm_default_policy(SYSCTL_HANDLER_ARGS)
183 {
184 char policy_name[32];
185 int error;
186
187 mtx_lock(&vm_default_policy_mtx);
188
189 /* Map policy to output string */
190 switch (vm_default_policy.p.policy) {
191 case VM_POLICY_FIRST_TOUCH:
192 strcpy(policy_name, "first-touch");
193 break;
194 case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
195 strcpy(policy_name, "first-touch-rr");
196 break;
197 case VM_POLICY_ROUND_ROBIN:
198 default:
199 strcpy(policy_name, "rr");
200 break;
201 }
202 mtx_unlock(&vm_default_policy_mtx);
203
204 error = sysctl_handle_string(oidp, &policy_name[0],
205 sizeof(policy_name), req);
206 if (error != 0 || req->newptr == NULL)
207 return (error);
208
209 mtx_lock(&vm_default_policy_mtx);
210 /* Set: match on the subset of policies that make sense as a default */
211 if (strcmp("first-touch-rr", policy_name) == 0) {
212 vm_domain_policy_set(&vm_default_policy,
213 VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0);
214 } else if (strcmp("first-touch", policy_name) == 0) {
215 vm_domain_policy_set(&vm_default_policy,
216 VM_POLICY_FIRST_TOUCH, 0);
217 } else if (strcmp("rr", policy_name) == 0) {
218 vm_domain_policy_set(&vm_default_policy,
219 VM_POLICY_ROUND_ROBIN, 0);
220 } else {
221 error = EINVAL;
222 goto finish;
223 }
224
225 error = 0;
226 finish:
227 mtx_unlock(&vm_default_policy_mtx);
228 return (error);
229 }
230
231 SYSCTL_PROC(_vm, OID_AUTO, default_policy, CTLTYPE_STRING | CTLFLAG_RW,
232 0, 0, sysctl_vm_default_policy, "A",
233 "Default policy (rr, first-touch, first-touch-rr");
234
235 /*
236 * Red-black tree helpers for vm fictitious range management.
237 */
238 static inline int
239 vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
240 struct vm_phys_fictitious_seg *range)
241 {
242
243 KASSERT(range->start != 0 && range->end != 0,
244 ("Invalid range passed on search for vm_fictitious page"));
245 if (p->start >= range->end)
246 return (1);
247 if (p->start < range->start)
248 return (-1);
249
250 return (0);
251 }
252
253 static int
254 vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
255 struct vm_phys_fictitious_seg *p2)
256 {
257
258 /* Check if this is a search for a page */
259 if (p1->end == 0)
260 return (vm_phys_fictitious_in_range(p1, p2));
261
262 KASSERT(p2->end != 0,
263 ("Invalid range passed as second parameter to vm fictitious comparison"));
264
265 /* Searching to add a new range */
266 if (p1->end <= p2->start)
267 return (-1);
268 if (p1->start >= p2->end)
269 return (1);
270
271 panic("Trying to add overlapping vm fictitious ranges:\n"
272 "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
273 (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
274 }
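/*
 * Editorial sketch (not part of the original file): a point lookup is
 * encoded as a degenerate key whose end is 0, so RB_FIND can reuse the
 * comparator above and land inside the containing range, exactly as
 * vm_phys_fictitious_to_vm_page() does further below:
 */
#if 0	/* illustration only */
	struct vm_phys_fictitious_seg tmp, *seg;

	tmp.start = pa;		/* physical address being looked up */
	tmp.end = 0;		/* end == 0 marks this as a search key */
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
#endif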
275
276 static __inline int
277 vm_rr_selectdomain(void)
278 {
279 #ifdef VM_NUMA_ALLOC
280 struct thread *td;
281
282 td = curthread;
283
284 td->td_dom_rr_idx++;
285 td->td_dom_rr_idx %= vm_ndomains;
286 return (td->td_dom_rr_idx);
287 #else
288 return (0);
289 #endif
290 }
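/*
 * Editorial sketch (not part of the original file): the per-thread counter
 * above simply walks 0, 1, ..., vm_ndomains - 1 and wraps.  A minimal
 * stand-alone analogue of the same round-robin step:
 */
#if 0	/* illustration only */
static int rr_idx;

static int
rr_next(int ndomains)
{

	rr_idx = (rr_idx + 1) % ndomains;	/* advance, then wrap */
	return (rr_idx);
}
#endif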
291
292 /*
293 * Initialise a VM domain iterator.
294 *
295 * Check the thread policy, then the proc policy,
296 * then default to the system policy.
297 *
298 * Later on the various layers will have this logic
299 * plumbed into them and the phys code will be explicitly
300 * handed a VM domain policy to use.
301 */
302 static void
303 vm_policy_iterator_init(struct vm_domain_iterator *vi)
304 {
305 #ifdef VM_NUMA_ALLOC
306 struct vm_domain_policy lcl;
307 #endif
308
309 vm_domain_iterator_init(vi);
310
311 #ifdef VM_NUMA_ALLOC
312 /* Copy out the thread policy */
313 vm_domain_policy_localcopy(&lcl, &curthread->td_vm_dom_policy);
314 if (lcl.p.policy != VM_POLICY_NONE) {
315 /* Thread policy is present; use it */
316 vm_domain_iterator_set_policy(vi, &lcl);
317 return;
318 }
319
320 vm_domain_policy_localcopy(&lcl,
321 &curthread->td_proc->p_vm_dom_policy);
322 if (lcl.p.policy != VM_POLICY_NONE) {
323 /* Process policy is present; use it */
324 vm_domain_iterator_set_policy(vi, &lcl);
325 return;
326 }
327 #endif
328 /* Use system default policy */
329 vm_domain_iterator_set_policy(vi, &vm_default_policy);
330 }
331
332 static void
333 vm_policy_iterator_finish(struct vm_domain_iterator *vi)
334 {
335
336 vm_domain_iterator_cleanup(vi);
337 }
338
339 boolean_t
340 vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high)
341 {
342 struct vm_phys_seg *s;
343 int idx;
344
345 while ((idx = ffsl(mask)) != 0) {
346 idx--; /* ffsl counts from 1 */
347 mask &= ~(1UL << idx);
348 s = &vm_phys_segs[idx];
349 if (low < s->end && high > s->start)
350 return (TRUE);
351 }
352 return (FALSE);
353 }
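/*
 * Editorial note (not part of the original file): the loop above is the
 * usual "iterate over set bits" idiom: ffsl() yields the 1-based index of
 * the lowest set bit, which is then cleared from the mask.  For example,
 * mask = 0x5 visits vm_phys_segs[0] and vm_phys_segs[2], then stops once
 * the mask is empty.
 */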
354
355 /*
356 * Outputs the state of the physical memory allocator, specifically,
357 * the amount of physical memory in each free list.
358 */
359 static int
360 sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
361 {
362 struct sbuf sbuf;
363 struct vm_freelist *fl;
364 int dom, error, flind, oind, pind;
365
366 error = sysctl_wire_old_buffer(req, 0);
367 if (error != 0)
368 return (error);
369 sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
370 for (dom = 0; dom < vm_ndomains; dom++) {
371 sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom);
372 for (flind = 0; flind < vm_nfreelists; flind++) {
373 sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
374 "\n ORDER (SIZE) | NUMBER"
375 "\n ", flind);
376 for (pind = 0; pind < VM_NFREEPOOL; pind++)
377 sbuf_printf(&sbuf, " | POOL %d", pind);
378 sbuf_printf(&sbuf, "\n-- ");
379 for (pind = 0; pind < VM_NFREEPOOL; pind++)
380 sbuf_printf(&sbuf, "-- -- ");
381 sbuf_printf(&sbuf, "--\n");
382 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
383 sbuf_printf(&sbuf, " %2d (%6dK)", oind,
384 1 << (PAGE_SHIFT - 10 + oind));
385 for (pind = 0; pind < VM_NFREEPOOL; pind++) {
386 fl = vm_phys_free_queues[dom][flind][pind];
387 sbuf_printf(&sbuf, " | %6d",
388 fl[oind].lcnt);
389 }
390 sbuf_printf(&sbuf, "\n");
391 }
392 }
393 }
394 error = sbuf_finish(&sbuf);
395 sbuf_delete(&sbuf);
396 return (error);
397 }
398
399 /*
400 * Outputs the set of physical memory segments.
401 */
402 static int
403 sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
404 {
405 struct sbuf sbuf;
406 struct vm_phys_seg *seg;
407 int error, segind;
408
409 error = sysctl_wire_old_buffer(req, 0);
410 if (error != 0)
411 return (error);
412 sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
413 for (segind = 0; segind < vm_phys_nsegs; segind++) {
414 sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
415 seg = &vm_phys_segs[segind];
416 sbuf_printf(&sbuf, "start: %#jx\n",
417 (uintmax_t)seg->start);
418 sbuf_printf(&sbuf, "end: %#jx\n",
419 (uintmax_t)seg->end);
420 sbuf_printf(&sbuf, "domain: %d\n", seg->domain);
421 sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
422 }
423 error = sbuf_finish(&sbuf);
424 sbuf_delete(&sbuf);
425 return (error);
426 }
427
428 /*
429 * Return affinity, or -1 if there's no affinity information.
430 */
431 int
432 vm_phys_mem_affinity(int f, int t)
433 {
434
435 #ifdef VM_NUMA_ALLOC
436 if (mem_locality == NULL)
437 return (-1);
438 if (f >= vm_ndomains || t >= vm_ndomains)
439 return (-1);
440 return (mem_locality[f * vm_ndomains + t]);
441 #else
442 return (-1);
443 #endif
444 }
445
446 #ifdef VM_NUMA_ALLOC
447 /*
448 * Outputs the VM locality table.
449 */
450 static int
451 sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
452 {
453 struct sbuf sbuf;
454 int error, i, j;
455
456 error = sysctl_wire_old_buffer(req, 0);
457 if (error != 0)
458 return (error);
459 sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
460
461 sbuf_printf(&sbuf, "\n");
462
463 for (i = 0; i < vm_ndomains; i++) {
464 sbuf_printf(&sbuf, "%d: ", i);
465 for (j = 0; j < vm_ndomains; j++) {
466 sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
467 }
468 sbuf_printf(&sbuf, "\n");
469 }
470 error = sbuf_finish(&sbuf);
471 sbuf_delete(&sbuf);
472 return (error);
473 }
474 #endif
475
476 static void
477 vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
478 {
479
480 m->order = order;
481 if (tail)
482 TAILQ_INSERT_TAIL(&fl[order].pl, m, plinks.q);
483 else
484 TAILQ_INSERT_HEAD(&fl[order].pl, m, plinks.q);
485 fl[order].lcnt++;
486 }
487
488 static void
489 vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
490 {
491
492 TAILQ_REMOVE(&fl[order].pl, m, plinks.q);
493 fl[order].lcnt--;
494 m->order = VM_NFREEORDER;
495 }
496
497 /*
498 * Create a physical memory segment.
499 */
500 static void
501 _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
502 {
503 struct vm_phys_seg *seg;
504
505 KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
506 ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
507 KASSERT(domain < vm_ndomains,
508 ("vm_phys_create_seg: invalid domain provided"));
509 seg = &vm_phys_segs[vm_phys_nsegs++];
510 while (seg > vm_phys_segs && (seg - 1)->start >= end) {
511 *seg = *(seg - 1);
512 seg--;
513 }
514 seg->start = start;
515 seg->end = end;
516 seg->domain = domain;
517 }
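/*
 * Editorial note (not part of the original file): the while loop above is
 * one step of an insertion sort: entries whose start is >= the new end are
 * shifted up a slot so vm_phys_segs[] stays sorted by address.  E.g.
 * inserting [2G, 3G) into { [0, 1G), [3G, 4G) } shifts [3G, 4G) right and
 * places the new segment between the two.
 */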
518
519 static void
520 vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
521 {
522 #ifdef VM_NUMA_ALLOC
523 int i;
524
525 if (mem_affinity == NULL) {
526 _vm_phys_create_seg(start, end, 0);
527 return;
528 }
529
530 for (i = 0;; i++) {
531 if (mem_affinity[i].end == 0)
532 panic("Reached end of affinity info");
533 if (mem_affinity[i].end <= start)
534 continue;
535 if (mem_affinity[i].start > start)
536 panic("No affinity info for start %jx",
537 (uintmax_t)start);
538 if (mem_affinity[i].end >= end) {
539 _vm_phys_create_seg(start, end,
540 mem_affinity[i].domain);
541 break;
542 }
543 _vm_phys_create_seg(start, mem_affinity[i].end,
544 mem_affinity[i].domain);
545 start = mem_affinity[i].end;
546 }
547 #else
548 _vm_phys_create_seg(start, end, 0);
549 #endif
550 }
551
552 /*
553 * Add a physical memory segment.
554 */
555 void
556 vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
557 {
558 vm_paddr_t paddr;
559
560 KASSERT((start & PAGE_MASK) == 0,
561 ("vm_phys_add_seg: start is not page aligned"));
562 KASSERT((end & PAGE_MASK) == 0,
563 ("vm_phys_add_seg: end is not page aligned"));
564
565 /*
566 * Split the physical memory segment if it spans two or more free
567 * list boundaries.
568 */
569 paddr = start;
570 #ifdef VM_FREELIST_ISADMA
571 if (paddr < VM_ISADMA_BOUNDARY && end > VM_ISADMA_BOUNDARY) {
572 vm_phys_create_seg(paddr, VM_ISADMA_BOUNDARY);
573 paddr = VM_ISADMA_BOUNDARY;
574 }
575 #endif
576 #ifdef VM_FREELIST_LOWMEM
577 if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
578 vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
579 paddr = VM_LOWMEM_BOUNDARY;
580 }
581 #endif
582 #ifdef VM_FREELIST_DMA32
583 if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
584 vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
585 paddr = VM_DMA32_BOUNDARY;
586 }
587 #endif
588 vm_phys_create_seg(paddr, end);
589 }
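/*
 * Editorial note (not part of the original file): on a platform with, say,
 * VM_LOWMEM_BOUNDARY at 16M and VM_DMA32_BOUNDARY at 4G, a call such as
 * vm_phys_add_seg(0, 8G) is recorded as three segments, [0, 16M),
 * [16M, 4G) and [4G, 8G), so no segment ever straddles a free list
 * boundary.
 */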
590
591 /*
592 * Initialize the physical memory allocator.
593 *
594 * Requires that vm_page_array is initialized!
595 */
596 void
597 vm_phys_init(void)
598 {
599 struct vm_freelist *fl;
600 struct vm_phys_seg *seg;
601 u_long npages;
602 int dom, flind, freelist, oind, pind, segind;
603
604 /*
605 * Compute the number of free lists, and generate the mapping from the
606 * manifest constants VM_FREELIST_* to the free list indices.
607 *
608 * Initially, the entries of vm_freelist_to_flind[] are set to either
609 * 0 or 1 to indicate which free lists should be created.
610 */
611 npages = 0;
612 for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
613 seg = &vm_phys_segs[segind];
614 #ifdef VM_FREELIST_ISADMA
615 if (seg->end <= VM_ISADMA_BOUNDARY)
616 vm_freelist_to_flind[VM_FREELIST_ISADMA] = 1;
617 else
618 #endif
619 #ifdef VM_FREELIST_LOWMEM
620 if (seg->end <= VM_LOWMEM_BOUNDARY)
621 vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
622 else
623 #endif
624 #ifdef VM_FREELIST_DMA32
625 if (
626 #ifdef VM_DMA32_NPAGES_THRESHOLD
627 /*
628 * Create the DMA32 free list only if the amount of
629 * physical memory above physical address 4G exceeds the
630 * given threshold.
631 */
632 npages > VM_DMA32_NPAGES_THRESHOLD &&
633 #endif
634 seg->end <= VM_DMA32_BOUNDARY)
635 vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
636 else
637 #endif
638 {
639 npages += atop(seg->end - seg->start);
640 vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
641 }
642 }
643 /* Change each entry into a running total of the free lists. */
644 for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
645 vm_freelist_to_flind[freelist] +=
646 vm_freelist_to_flind[freelist - 1];
647 }
648 vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
649 KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
650 /* Change each entry into a free list index. */
651 for (freelist = 0; freelist < VM_NFREELIST; freelist++)
652 vm_freelist_to_flind[freelist]--;
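	/*
	 * Editorial note (not part of the original file): the two loops
	 * above turn the 0/1 "create this list" flags into indices via a
	 * running total.  E.g. flags {1, 0, 1} become totals {1, 1, 2} and,
	 * after the decrement, indices {0, 0, 1}: the first list gets
	 * flind 0, the third gets flind 1, and the never-created second
	 * list's entry is simply unused.
	 */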
653
654 /*
655 * Initialize the first_page and free_queues fields of each physical
656 * memory segment.
657 */
658 #ifdef VM_PHYSSEG_SPARSE
659 npages = 0;
660 #endif
661 for (segind = 0; segind < vm_phys_nsegs; segind++) {
662 seg = &vm_phys_segs[segind];
663 #ifdef VM_PHYSSEG_SPARSE
664 seg->first_page = &vm_page_array[npages];
665 npages += atop(seg->end - seg->start);
666 #else
667 seg->first_page = PHYS_TO_VM_PAGE(seg->start);
668 #endif
669 #ifdef VM_FREELIST_ISADMA
670 if (seg->end <= VM_ISADMA_BOUNDARY) {
671 flind = vm_freelist_to_flind[VM_FREELIST_ISADMA];
672 KASSERT(flind >= 0,
673 ("vm_phys_init: ISADMA flind < 0"));
674 } else
675 #endif
676 #ifdef VM_FREELIST_LOWMEM
677 if (seg->end <= VM_LOWMEM_BOUNDARY) {
678 flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
679 KASSERT(flind >= 0,
680 ("vm_phys_init: LOWMEM flind < 0"));
681 } else
682 #endif
683 #ifdef VM_FREELIST_DMA32
684 if (seg->end <= VM_DMA32_BOUNDARY) {
685 flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
686 KASSERT(flind >= 0,
687 ("vm_phys_init: DMA32 flind < 0"));
688 } else
689 #endif
690 {
691 flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
692 KASSERT(flind >= 0,
693 ("vm_phys_init: DEFAULT flind < 0"));
694 }
695 seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
696 }
697
698 /*
699 * Initialize the free queues.
700 */
701 for (dom = 0; dom < vm_ndomains; dom++) {
702 for (flind = 0; flind < vm_nfreelists; flind++) {
703 for (pind = 0; pind < VM_NFREEPOOL; pind++) {
704 fl = vm_phys_free_queues[dom][flind][pind];
705 for (oind = 0; oind < VM_NFREEORDER; oind++)
706 TAILQ_INIT(&fl[oind].pl);
707 }
708 }
709 }
710
711 rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
712 }
713
714 /*
715 * Split a contiguous, power of two-sized set of physical pages.
716 */
717 static __inline void
718 vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order)
719 {
720 vm_page_t m_buddy;
721
722 while (oind > order) {
723 oind--;
724 m_buddy = &m[1 << oind];
725 KASSERT(m_buddy->order == VM_NFREEORDER,
726 ("vm_phys_split_pages: page %p has unexpected order %d",
727 m_buddy, m_buddy->order));
728 vm_freelist_add(fl, m_buddy, oind, 0);
729 }
730 }
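/*
 * Editorial note (not part of the original file): each iteration peels the
 * upper buddy off the block and frees it at the next lower order, e.g.
 * splitting an order-3 block (pages 0-7) for an order-0 request frees
 * pages 4-7 at order 2, pages 2-3 at order 1 and page 1 at order 0,
 * leaving page 0 for the caller.
 */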
731
732 /*
733 * Allocate a contiguous, power of two-sized set of physical pages
734 * from the free lists.
735 *
736 * The free page queues must be locked.
737 */
738 vm_page_t
739 vm_phys_alloc_pages(int pool, int order)
740 {
741 vm_page_t m;
742 int domain, flind;
743 struct vm_domain_iterator vi;
744
745 KASSERT(pool < VM_NFREEPOOL,
746 ("vm_phys_alloc_pages: pool %d is out of range", pool));
747 KASSERT(order < VM_NFREEORDER,
748 ("vm_phys_alloc_pages: order %d is out of range", order));
749
750 vm_policy_iterator_init(&vi);
751
752 while ((vm_domain_iterator_run(&vi, &domain)) == 0) {
753 for (flind = 0; flind < vm_nfreelists; flind++) {
754 m = vm_phys_alloc_domain_pages(domain, flind, pool,
755 order);
756 if (m != NULL)
757 return (m);
758 }
759 }
760
761 vm_policy_iterator_finish(&vi);
762 return (NULL);
763 }
764
765 /*
766 * Allocate a contiguous, power of two-sized set of physical pages from the
767 * specified free list. The free list must be specified using one of the
768 * manifest constants VM_FREELIST_*.
769 *
770 * The free page queues must be locked.
771 */
772 vm_page_t
773 vm_phys_alloc_freelist_pages(int freelist, int pool, int order)
774 {
775 vm_page_t m;
776 struct vm_domain_iterator vi;
777 int domain;
778
779 KASSERT(freelist < VM_NFREELIST,
780 ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
781 freelist));
782 KASSERT(pool < VM_NFREEPOOL,
783 ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
784 KASSERT(order < VM_NFREEORDER,
785 ("vm_phys_alloc_freelist_pages: order %d is out of range", order));
786
787 vm_policy_iterator_init(&vi);
788
789 while ((vm_domain_iterator_run(&vi, &domain)) == 0) {
790 m = vm_phys_alloc_domain_pages(domain,
791 vm_freelist_to_flind[freelist], pool, order);
792 if (m != NULL)
793 return (m);
794 }
795
796 vm_policy_iterator_finish(&vi);
797 return (NULL);
798 }
799
800 static vm_page_t
801 vm_phys_alloc_domain_pages(int domain, int flind, int pool, int order)
802 {
803 struct vm_freelist *fl;
804 struct vm_freelist *alt;
805 int oind, pind;
806 vm_page_t m;
807
808 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
809 fl = &vm_phys_free_queues[domain][flind][pool][0];
810 for (oind = order; oind < VM_NFREEORDER; oind++) {
811 m = TAILQ_FIRST(&fl[oind].pl);
812 if (m != NULL) {
813 vm_freelist_rem(fl, m, oind);
814 vm_phys_split_pages(m, oind, fl, order);
815 return (m);
816 }
817 }
818
819 /*
820 * The given pool was empty. Find the largest
821 * contiguous, power-of-two-sized set of pages in any
822 * pool. Transfer these pages to the given pool, and
823 * use them to satisfy the allocation.
824 */
825 for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
826 for (pind = 0; pind < VM_NFREEPOOL; pind++) {
827 alt = &vm_phys_free_queues[domain][flind][pind][0];
828 m = TAILQ_FIRST(&alt[oind].pl);
829 if (m != NULL) {
830 vm_freelist_rem(alt, m, oind);
831 vm_phys_set_pool(pool, m, oind);
832 vm_phys_split_pages(m, oind, fl, order);
833 return (m);
834 }
835 }
836 }
837 return (NULL);
838 }
839
840 /*
841 * Find the vm_page corresponding to the given physical address.
842 */
843 vm_page_t
844 vm_phys_paddr_to_vm_page(vm_paddr_t pa)
845 {
846 struct vm_phys_seg *seg;
847 int segind;
848
849 for (segind = 0; segind < vm_phys_nsegs; segind++) {
850 seg = &vm_phys_segs[segind];
851 if (pa >= seg->start && pa < seg->end)
852 return (&seg->first_page[atop(pa - seg->start)]);
853 }
854 return (NULL);
855 }
856
857 vm_page_t
858 vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
859 {
860 struct vm_phys_fictitious_seg tmp, *seg;
861 vm_page_t m;
862
863 m = NULL;
864 tmp.start = pa;
865 tmp.end = 0;
866
867 rw_rlock(&vm_phys_fictitious_reg_lock);
868 seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
869 rw_runlock(&vm_phys_fictitious_reg_lock);
870 if (seg == NULL)
871 return (NULL);
872
873 m = &seg->first_page[atop(pa - seg->start)];
874 KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));
875
876 return (m);
877 }
878
879 static inline void
880 vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
881 long page_count, vm_memattr_t memattr)
882 {
883 long i;
884
885 bzero(range, page_count * sizeof(*range));
886 for (i = 0; i < page_count; i++) {
887 vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
888 range[i].oflags &= ~VPO_UNMANAGED;
889 range[i].busy_lock = VPB_UNBUSIED;
890 }
891 }
892
893 int
894 vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
895 vm_memattr_t memattr)
896 {
897 struct vm_phys_fictitious_seg *seg;
898 vm_page_t fp;
899 long page_count;
900 #ifdef VM_PHYSSEG_DENSE
901 long pi, pe;
902 long dpage_count;
903 #endif
904
905 KASSERT(start < end,
906 ("Start of segment isn't less than end (start: %jx end: %jx)",
907 (uintmax_t)start, (uintmax_t)end));
908
909 page_count = (end - start) / PAGE_SIZE;
910
911 #ifdef VM_PHYSSEG_DENSE
912 pi = atop(start);
913 pe = atop(end);
914 if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
915 fp = &vm_page_array[pi - first_page];
916 if ((pe - first_page) > vm_page_array_size) {
917 /*
918 * We have a segment that starts inside
919 * of vm_page_array, but ends outside of it.
920 *
921 * Use vm_page_array pages for those that are
922 * inside of the vm_page_array range, and
923 * allocate the remaining ones.
924 */
925 dpage_count = vm_page_array_size - (pi - first_page);
926 vm_phys_fictitious_init_range(fp, start, dpage_count,
927 memattr);
928 page_count -= dpage_count;
929 start += ptoa(dpage_count);
930 goto alloc;
931 }
932 /*
933 * We can allocate the full range from vm_page_array,
934 * so there's no need to register the range in the tree.
935 */
936 vm_phys_fictitious_init_range(fp, start, page_count, memattr);
937 return (0);
938 } else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
939 /*
940 * We have a segment that ends inside of vm_page_array,
941 * but starts outside of it.
942 */
943 fp = &vm_page_array[0];
944 dpage_count = pe - first_page;
945 vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
946 memattr);
947 end -= ptoa(dpage_count);
948 page_count -= dpage_count;
949 goto alloc;
950 } else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
951 /*
952 * Trying to register a fictitious range that expands before
953 * and after vm_page_array.
954 */
955 return (EINVAL);
956 } else {
957 alloc:
958 #endif
959 fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
960 M_WAITOK);
961 #ifdef VM_PHYSSEG_DENSE
962 }
963 #endif
964 vm_phys_fictitious_init_range(fp, start, page_count, memattr);
965
966 seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
967 seg->start = start;
968 seg->end = end;
969 seg->first_page = fp;
970
971 rw_wlock(&vm_phys_fictitious_reg_lock);
972 RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
973 rw_wunlock(&vm_phys_fictitious_reg_lock);
974
975 return (0);
976 }
977
978 void
979 vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
980 {
981 struct vm_phys_fictitious_seg *seg, tmp;
982 #ifdef VM_PHYSSEG_DENSE
983 long pi, pe;
984 #endif
985
986 KASSERT(start < end,
987 ("Start of segment isn't less than end (start: %jx end: %jx)",
988 (uintmax_t)start, (uintmax_t)end));
989
990 #ifdef VM_PHYSSEG_DENSE
991 pi = atop(start);
992 pe = atop(end);
993 if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
994 if ((pe - first_page) <= vm_page_array_size) {
995 /*
996 * This segment was allocated using vm_page_array
997 * only, there's nothing to do since those pages
998 * were never added to the tree.
999 */
1000 return;
1001 }
1002 /*
1003 * We have a segment that starts inside
1004 * of vm_page_array, but ends outside of it.
1005 *
1006 * Calculate how many pages were added to the
1007 * tree and free them.
1008 */
1009 start = ptoa(first_page + vm_page_array_size);
1010 } else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
1011 /*
1012 * We have a segment that ends inside of vm_page_array,
1013 * but starts outside of it.
1014 */
1015 end = ptoa(first_page);
1016 } else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
1017 /* Since it's not possible to register such a range, panic. */
1018 panic(
1019 "Unregistering not registered fictitious range [%#jx:%#jx]",
1020 (uintmax_t)start, (uintmax_t)end);
1021 }
1022 #endif
1023 tmp.start = start;
1024 tmp.end = 0;
1025
1026 rw_wlock(&vm_phys_fictitious_reg_lock);
1027 seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
1028 if (seg->start != start || seg->end != end) {
1029 rw_wunlock(&vm_phys_fictitious_reg_lock);
1030 panic(
1031 "Unregistering not registered fictitious range [%#jx:%#jx]",
1032 (uintmax_t)start, (uintmax_t)end);
1033 }
1034 RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
1035 rw_wunlock(&vm_phys_fictitious_reg_lock);
1036 free(seg->first_page, M_FICT_PAGES);
1037 free(seg, M_FICT_PAGES);
1038 }
1039
1040 /*
1041 * Free a contiguous, power of two-sized set of physical pages.
1042 *
1043 * The free page queues must be locked.
1044 */
1045 void
1046 vm_phys_free_pages(vm_page_t m, int order)
1047 {
1048 struct vm_freelist *fl;
1049 struct vm_phys_seg *seg;
1050 vm_paddr_t pa;
1051 vm_page_t m_buddy;
1052
1053 KASSERT(m->order == VM_NFREEORDER,
1054 ("vm_phys_free_pages: page %p has unexpected order %d",
1055 m, m->order));
1056 KASSERT(m->pool < VM_NFREEPOOL,
1057 ("vm_phys_free_pages: page %p has unexpected pool %d",
1058 m, m->pool));
1059 KASSERT(order < VM_NFREEORDER,
1060 ("vm_phys_free_pages: order %d is out of range", order));
1061 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1062 seg = &vm_phys_segs[m->segind];
1063 if (order < VM_NFREEORDER - 1) {
1064 pa = VM_PAGE_TO_PHYS(m);
1065 do {
1066 pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
1067 if (pa < seg->start || pa >= seg->end)
1068 break;
1069 m_buddy = &seg->first_page[atop(pa - seg->start)];
1070 if (m_buddy->order != order)
1071 break;
1072 fl = (*seg->free_queues)[m_buddy->pool];
1073 vm_freelist_rem(fl, m_buddy, order);
1074 if (m_buddy->pool != m->pool)
1075 vm_phys_set_pool(m->pool, m_buddy, order);
1076 order++;
1077 pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
1078 m = &seg->first_page[atop(pa - seg->start)];
1079 } while (order < VM_NFREEORDER - 1);
1080 }
1081 fl = (*seg->free_queues)[m->pool];
1082 vm_freelist_add(fl, m, order, 1);
1083 }
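/*
 * Editorial sketch (not part of the original file): the coalescing loop
 * above finds the buddy of a 2^order-page block by flipping bit
 * (PAGE_SHIFT + order) of its physical address; the merged block then
 * starts at the address with that bit cleared.  Assuming 4K pages:
 */
#if 0	/* illustration only; PAGE_SHIFT == 12 assumed */
	vm_paddr_t pa = 0x5000;					/* order-0 block */
	vm_paddr_t buddy = pa ^ ((vm_paddr_t)1 << 12);		/* 0x4000 */
	vm_paddr_t merged = pa & ~(((vm_paddr_t)1 << 13) - 1);	/* 0x4000 */
#endif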
1084
1085 /*
1086 * Free a contiguous, arbitrarily sized set of physical pages.
1087 *
1088 * The free page queues must be locked.
1089 */
1090 void
1091 vm_phys_free_contig(vm_page_t m, u_long npages)
1092 {
1093 u_int n;
1094 int order;
1095
1096 /*
1097 * Avoid unnecessary coalescing by freeing the pages in the largest
1098 * possible power-of-two-sized subsets.
1099 */
1100 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1101 for (;; npages -= n) {
1102 /*
1103 * Unsigned "min" is used here so that "order" is assigned
1104 * "VM_NFREEORDER - 1" when "m"'s physical address is zero
1105 * or the low-order bits of its physical address are zero
1106 * because the size of a physical address exceeds the size of
1107 * a long.
1108 */
1109 order = min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1,
1110 VM_NFREEORDER - 1);
1111 n = 1 << order;
1112 if (npages < n)
1113 break;
1114 vm_phys_free_pages(m, order);
1115 m += n;
1116 }
1117 /* The residual "npages" is less than "1 << (VM_NFREEORDER - 1)". */
1118 for (; npages > 0; npages -= n) {
1119 order = flsl(npages) - 1;
1120 n = 1 << order;
1121 vm_phys_free_pages(m, order);
1122 m += n;
1123 }
1124 }
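/*
 * Editorial note (not part of the original file): the first loop is limited
 * by the address alignment of "m", the second by the pages remaining.  E.g.
 * freeing 13 pages starting at page 4 frees an order-2 block at page 4
 * (alignment-limited), an order-3 block at page 8, and finally an order-0
 * block at page 16 (count-limited): 4 + 8 + 1 = 13 pages.
 */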
1125
1126 /*
1127 * Scan physical memory between the specified addresses "low" and "high" for a
1128 * run of contiguous physical pages that satisfy the specified conditions, and
1129 * return the lowest page in the run. The specified "alignment" determines
1130 * the alignment of the lowest physical page in the run. If the specified
1131 * "boundary" is non-zero, then the run of physical pages cannot span a
1132 * physical address that is a multiple of "boundary".
1133 *
1134 * "npages" must be greater than zero. Both "alignment" and "boundary" must
1135 * be a power of two.
1136 */
1137 vm_page_t
1138 vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
1139 u_long alignment, vm_paddr_t boundary, int options)
1140 {
1141 vm_paddr_t pa_end;
1142 vm_page_t m_end, m_run, m_start;
1143 struct vm_phys_seg *seg;
1144 int segind;
1145
1146 KASSERT(npages > 0, ("npages is 0"));
1147 KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1148 KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1149 if (low >= high)
1150 return (NULL);
1151 for (segind = 0; segind < vm_phys_nsegs; segind++) {
1152 seg = &vm_phys_segs[segind];
1153 if (seg->start >= high)
1154 break;
1155 if (low >= seg->end)
1156 continue;
1157 if (low <= seg->start)
1158 m_start = seg->first_page;
1159 else
1160 m_start = &seg->first_page[atop(low - seg->start)];
1161 if (high < seg->end)
1162 pa_end = high;
1163 else
1164 pa_end = seg->end;
1165 if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages))
1166 continue;
1167 m_end = &seg->first_page[atop(pa_end - seg->start)];
1168 m_run = vm_page_scan_contig(npages, m_start, m_end,
1169 alignment, boundary, options);
1170 if (m_run != NULL)
1171 return (m_run);
1172 }
1173 return (NULL);
1174 }
1175
1176 /*
1177 * Set the pool for a contiguous, power of two-sized set of physical pages.
1178 */
1179 void
1180 vm_phys_set_pool(int pool, vm_page_t m, int order)
1181 {
1182 vm_page_t m_tmp;
1183
1184 for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
1185 m_tmp->pool = pool;
1186 }
1187
1188 /*
1189 * Search for the given physical page "m" in the free lists. If the search
1190 * succeeds, remove "m" from the free lists and return TRUE. Otherwise, return
1191 * FALSE, indicating that "m" is not in the free lists.
1192 *
1193 * The free page queues must be locked.
1194 */
1195 boolean_t
1196 vm_phys_unfree_page(vm_page_t m)
1197 {
1198 struct vm_freelist *fl;
1199 struct vm_phys_seg *seg;
1200 vm_paddr_t pa, pa_half;
1201 vm_page_t m_set, m_tmp;
1202 int order;
1203
1204 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1205
1206 /*
1207 * First, find the contiguous, power of two-sized set of free
1208 * physical pages containing the given physical page "m" and
1209 * assign it to "m_set".
1210 */
1211 seg = &vm_phys_segs[m->segind];
1212 for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
1213 order < VM_NFREEORDER - 1; ) {
1214 order++;
1215 pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
1216 if (pa >= seg->start)
1217 m_set = &seg->first_page[atop(pa - seg->start)];
1218 else
1219 return (FALSE);
1220 }
1221 if (m_set->order < order)
1222 return (FALSE);
1223 if (m_set->order == VM_NFREEORDER)
1224 return (FALSE);
1225 KASSERT(m_set->order < VM_NFREEORDER,
1226 ("vm_phys_unfree_page: page %p has unexpected order %d",
1227 m_set, m_set->order));
1228
1229 /*
1230 * Next, remove "m_set" from the free lists. Finally, extract
1231 * "m" from "m_set" using an iterative algorithm: While "m_set"
1232 * is larger than a page, shrink "m_set" by returning the half
1233 * of "m_set" that does not contain "m" to the free lists.
1234 */
1235 fl = (*seg->free_queues)[m_set->pool];
1236 order = m_set->order;
1237 vm_freelist_rem(fl, m_set, order);
1238 while (order > 0) {
1239 order--;
1240 pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
1241 if (m->phys_addr < pa_half)
1242 m_tmp = &seg->first_page[atop(pa_half - seg->start)];
1243 else {
1244 m_tmp = m_set;
1245 m_set = &seg->first_page[atop(pa_half - seg->start)];
1246 }
1247 vm_freelist_add(fl, m_tmp, order, 0);
1248 }
1249 KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
1250 return (TRUE);
1251 }
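/*
 * Editorial note (not part of the original file): the halving loop above
 * carves "m" out of its free block.  E.g. extracting page 5 from a free
 * order-3 block at page 0: pages 0-3 are returned to the free lists at
 * order 2, pages 6-7 at order 1, page 4 at order 0, and "m_set" converges
 * on page 5 == "m".
 */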
1252
1253 /*
1254 * Try to zero one physical page. Used by an idle priority thread.
1255 */
1256 boolean_t
1257 vm_phys_zero_pages_idle(void)
1258 {
1259 static struct vm_freelist *fl;
1260 static int flind, oind, pind;
1261 vm_page_t m, m_tmp;
1262 int domain;
1263
1264 domain = vm_rr_selectdomain();
1265 fl = vm_phys_free_queues[domain][0][0];
1266 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1267 for (;;) {
1268 TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, plinks.q) {
1269 for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) {
1270 if ((m_tmp->flags & PG_ZERO) == 0) {
1271 vm_phys_unfree_page(m_tmp);
1272 vm_phys_freecnt_adj(m, -1);
1273 mtx_unlock(&vm_page_queue_free_mtx);
1274 pmap_zero_page_idle(m_tmp);
1275 m_tmp->flags |= PG_ZERO;
1276 mtx_lock(&vm_page_queue_free_mtx);
1277 vm_phys_freecnt_adj(m, 1);
1278 vm_phys_free_pages(m_tmp, 0);
1279 vm_page_zero_count++;
1280 cnt_prezero++;
1281 return (TRUE);
1282 }
1283 }
1284 }
1285 oind++;
1286 if (oind == VM_NFREEORDER) {
1287 oind = 0;
1288 pind++;
1289 if (pind == VM_NFREEPOOL) {
1290 pind = 0;
1291 flind++;
1292 if (flind == vm_nfreelists)
1293 flind = 0;
1294 }
1295 fl = vm_phys_free_queues[domain][flind][pind];
1296 }
1297 }
1298 }
1299
1300 /*
1301 * Allocate a contiguous set of physical pages of the given size
1302 * "npages" from the free lists. All of the physical pages must be at
1303 * or above the given physical address "low" and below the given
1304 * physical address "high". The given value "alignment" determines the
1305 * alignment of the first physical page in the set. If the given value
1306 * "boundary" is non-zero, then the set of physical pages cannot cross
1307 * any physical address boundary that is a multiple of that value. Both
1308 * "alignment" and "boundary" must be a power of two.
1309 */
1310 vm_page_t
1311 vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
1312 u_long alignment, vm_paddr_t boundary)
1313 {
1314 vm_paddr_t pa_end, pa_start;
1315 vm_page_t m_run;
1316 struct vm_domain_iterator vi;
1317 struct vm_phys_seg *seg;
1318 int domain, segind;
1319
1320 KASSERT(npages > 0, ("npages is 0"));
1321 KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1322 KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1323 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1324 if (low >= high)
1325 return (NULL);
1326 vm_policy_iterator_init(&vi);
1327 restartdom:
1328 if (vm_domain_iterator_run(&vi, &domain) != 0) {
1329 vm_policy_iterator_finish(&vi);
1330 return (NULL);
1331 }
1332 m_run = NULL;
1333 for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
1334 seg = &vm_phys_segs[segind];
1335 if (seg->start >= high || seg->domain != domain)
1336 continue;
1337 if (low >= seg->end)
1338 break;
1339 if (low <= seg->start)
1340 pa_start = seg->start;
1341 else
1342 pa_start = low;
1343 if (high < seg->end)
1344 pa_end = high;
1345 else
1346 pa_end = seg->end;
1347 if (pa_end - pa_start < ptoa(npages))
1348 continue;
1349 m_run = vm_phys_alloc_seg_contig(seg, npages, low, high,
1350 alignment, boundary);
1351 if (m_run != NULL)
1352 break;
1353 }
1354 if (m_run == NULL && !vm_domain_iterator_isdone(&vi))
1355 goto restartdom;
1356 vm_policy_iterator_finish(&vi);
1357 return (m_run);
1358 }
1359
1360 /*
1361 * Allocate a run of contiguous physical pages from the free list for the
1362 * specified segment.
1363 */
1364 static vm_page_t
1365 vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages,
1366 vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
1367 {
1368 struct vm_freelist *fl;
1369 vm_paddr_t pa, pa_end, size;
1370 vm_page_t m, m_ret;
1371 u_long npages_end;
1372 int oind, order, pind;
1373
1374 KASSERT(npages > 0, ("npages is 0"));
1375 KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1376 KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1377 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1378 /* Compute the queue that is the best fit for npages. */
1379 for (order = 0; (1 << order) < npages; order++);
1380 /* Search for a run satisfying the specified conditions. */
1381 size = npages << PAGE_SHIFT;
1382 for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER;
1383 oind++) {
1384 for (pind = 0; pind < VM_NFREEPOOL; pind++) {
1385 fl = (*seg->free_queues)[pind];
1386 TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
1387 /*
1388 * Is the size of this allocation request
1389 * larger than the largest block size?
1390 */
1391 if (order >= VM_NFREEORDER) {
1392 /*
1393 * Determine if a sufficient number of
1394 * subsequent blocks to satisfy the
1395 * allocation request are free.
1396 */
1397 pa = VM_PAGE_TO_PHYS(m_ret);
1398 pa_end = pa + size;
1399 if (pa_end < pa)
1400 continue;
1401 for (;;) {
1402 pa += 1 << (PAGE_SHIFT +
1403 VM_NFREEORDER - 1);
1404 if (pa >= pa_end ||
1405 pa < seg->start ||
1406 pa >= seg->end)
1407 break;
1408 m = &seg->first_page[atop(pa -
1409 seg->start)];
1410 if (m->order != VM_NFREEORDER -
1411 1)
1412 break;
1413 }
1414 /* If not, go to the next block. */
1415 if (pa < pa_end)
1416 continue;
1417 }
1418
1419 /*
1420 * Determine if the blocks are within the
1421 * given range, satisfy the given alignment,
1422 * and do not cross the given boundary.
1423 */
1424 pa = VM_PAGE_TO_PHYS(m_ret);
1425 pa_end = pa + size;
1426 if (pa >= low && pa_end <= high &&
1427 (pa & (alignment - 1)) == 0 &&
1428 rounddown2(pa ^ (pa_end - 1), boundary) == 0)
1429 goto done;
1430 }
1431 }
1432 }
1433 return (NULL);
1434 done:
1435 for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
1436 fl = (*seg->free_queues)[m->pool];
1437 vm_freelist_rem(fl, m, m->order);
1438 }
1439 if (m_ret->pool != VM_FREEPOOL_DEFAULT)
1440 vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind);
1441 fl = (*seg->free_queues)[m_ret->pool];
1442 vm_phys_split_pages(m_ret, oind, fl, order);
1443 /* Return excess pages to the free lists. */
1444 npages_end = roundup2(npages, 1 << imin(oind, order));
1445 if (npages < npages_end)
1446 vm_phys_free_contig(&m_ret[npages], npages_end - npages);
1447 return (m_ret);
1448 }
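/*
 * Editorial note (not part of the original file): the boundary test in the
 * search above exploits the fact that a run crosses a multiple of
 * "boundary" exactly when its first and last byte addresses differ in some
 * bit at or above log2(boundary); rounddown2(pa ^ (pa_end - 1), boundary)
 * keeps only those bits, so a zero result means the run fits in one
 * boundary-sized window.  E.g. an 8K run at 0x3000 against a 16K boundary:
 * 0x3000 ^ 0x4fff == 0x7fff, and rounddown2(0x7fff, 0x4000) == 0x4000 != 0,
 * so that run is rejected.
 */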
1449
1450 #ifdef DDB
1451 /*
1452 * Show the number of physical pages in each of the free lists.
1453 */
1454 DB_SHOW_COMMAND(freepages, db_show_freepages)
1455 {
1456 struct vm_freelist *fl;
1457 int flind, oind, pind, dom;
1458
1459 for (dom = 0; dom < vm_ndomains; dom++) {
1460 db_printf("DOMAIN: %d\n", dom);
1461 for (flind = 0; flind < vm_nfreelists; flind++) {
1462 db_printf("FREE LIST %d:\n"
1463 "\n ORDER (SIZE) | NUMBER"
1464 "\n ", flind);
1465 for (pind = 0; pind < VM_NFREEPOOL; pind++)
1466 db_printf(" | POOL %d", pind);
1467 db_printf("\n-- ");
1468 for (pind = 0; pind < VM_NFREEPOOL; pind++)
1469 db_printf("-- -- ");
1470 db_printf("--\n");
1471 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
1472 db_printf(" %2.2d (%6.6dK)", oind,
1473 1 << (PAGE_SHIFT - 10 + oind));
1474 for (pind = 0; pind < VM_NFREEPOOL; pind++) {
1475 fl = vm_phys_free_queues[dom][flind][pind];
1476 db_printf(" | %6.6d", fl[oind].lcnt);
1477 }
1478 db_printf("\n");
1479 }
1480 db_printf("\n");
1481 }
1482 db_printf("\n");
1483 }
1484 }
1485 #endif