FreeBSD/Linux Kernel Cross Reference
sys/amd64/amd64/pmap.c

    1 /*-
    2  * Copyright (c) 1991 Regents of the University of California.
    3  * All rights reserved.
    4  * Copyright (c) 1994 John S. Dyson
    5  * All rights reserved.
    6  * Copyright (c) 1994 David Greenman
    7  * All rights reserved.
    8  * Copyright (c) 2003 Peter Wemm
    9  * All rights reserved.
   10  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
   11  * All rights reserved.
   12  *
   13  * This code is derived from software contributed to Berkeley by
   14  * the Systems Programming Group of the University of Utah Computer
   15  * Science Department and William Jolitz of UUNET Technologies Inc.
   16  *
   17  * Redistribution and use in source and binary forms, with or without
   18  * modification, are permitted provided that the following conditions
   19  * are met:
   20  * 1. Redistributions of source code must retain the above copyright
   21  *    notice, this list of conditions and the following disclaimer.
   22  * 2. Redistributions in binary form must reproduce the above copyright
   23  *    notice, this list of conditions and the following disclaimer in the
   24  *    documentation and/or other materials provided with the distribution.
   25  * 3. All advertising materials mentioning features or use of this software
   26  *    must display the following acknowledgement:
   27  *      This product includes software developed by the University of
   28  *      California, Berkeley and its contributors.
   29  * 4. Neither the name of the University nor the names of its contributors
   30  *    may be used to endorse or promote products derived from this software
   31  *    without specific prior written permission.
   32  *
   33  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   34  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   35  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   36  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   37  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   38  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   39  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   40  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   41  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   42  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   43  * SUCH DAMAGE.
   44  *
   45  *      from:   @(#)pmap.c      7.7 (Berkeley)  5/12/91
   46  */
   47 /*-
   48  * Copyright (c) 2003 Networks Associates Technology, Inc.
   49  * All rights reserved.
   50  *
   51  * This software was developed for the FreeBSD Project by Jake Burkholder,
   52  * Safeport Network Services, and Network Associates Laboratories, the
   53  * Security Research Division of Network Associates, Inc. under
   54  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
   55  * CHATS research program.
   56  *
   57  * Redistribution and use in source and binary forms, with or without
   58  * modification, are permitted provided that the following conditions
   59  * are met:
   60  * 1. Redistributions of source code must retain the above copyright
   61  *    notice, this list of conditions and the following disclaimer.
   62  * 2. Redistributions in binary form must reproduce the above copyright
   63  *    notice, this list of conditions and the following disclaimer in the
   64  *    documentation and/or other materials provided with the distribution.
   65  *
   66  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   67  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   68  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   69  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   70  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   71  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   72  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   73  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   74  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   75  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   76  * SUCH DAMAGE.
   77  */
   78 
   79 #include <sys/cdefs.h>
   80 __FBSDID("$FreeBSD: releng/8.4/sys/amd64/amd64/pmap.c 247548 2013-03-01 14:54:37Z jhb $");
   81 
   82 /*
   83  *      Manages physical address maps.
   84  *
   85  *      In addition to hardware address maps, this
   86  *      module is called upon to provide software-use-only
   87  *      maps which may or may not be stored in the same
   88  *      form as hardware maps.  These pseudo-maps are
   89  *      used to store intermediate results from copy
   90  *      operations to and from address spaces.
   91  *
   92  *      Since the information managed by this module is
   93  *      also stored by the logical address mapping module,
   94  *      this module may throw away valid virtual-to-physical
   95  *      mappings at almost any time.  However, invalidations
   96  *      of virtual-to-physical mappings must be done as
   97  *      requested.
   98  *
    99  *      In order to cope with hardware architectures that
   100  *      make virtual-to-physical map invalidations expensive,
   101  *      this module may delay invalidation or reduced-protection
   102  *      operations until they are actually necessary.  This
   103  *      module is given full information as to which processors
   104  *      are currently using which maps, and when physical
   105  *      maps must be made correct.
  106  */
  107 
  108 #include "opt_pmap.h"
  109 #include "opt_vm.h"
  110 
  111 #include <sys/param.h>
  112 #include <sys/bus.h>
  113 #include <sys/systm.h>
  114 #include <sys/kernel.h>
  115 #include <sys/ktr.h>
  116 #include <sys/lock.h>
  117 #include <sys/malloc.h>
  118 #include <sys/mman.h>
  119 #include <sys/mutex.h>
  120 #include <sys/proc.h>
  121 #include <sys/sx.h>
  122 #include <sys/vmmeter.h>
  123 #include <sys/sched.h>
  124 #include <sys/sysctl.h>
  125 #ifdef SMP
  126 #include <sys/smp.h>
  127 #endif
  128 
  129 #include <vm/vm.h>
  130 #include <vm/vm_param.h>
  131 #include <vm/vm_kern.h>
  132 #include <vm/vm_page.h>
  133 #include <vm/vm_map.h>
  134 #include <vm/vm_object.h>
  135 #include <vm/vm_extern.h>
  136 #include <vm/vm_pageout.h>
  137 #include <vm/vm_pager.h>
  138 #include <vm/vm_reserv.h>
  139 #include <vm/uma.h>
  140 
  141 #include <machine/intr_machdep.h>
  142 #include <machine/apicvar.h>
  143 #include <machine/cpu.h>
  144 #include <machine/cputypes.h>
  145 #include <machine/md_var.h>
  146 #include <machine/pcb.h>
  147 #include <machine/specialreg.h>
  148 #ifdef SMP
  149 #include <machine/smp.h>
  150 #endif
  151 
  152 #if !defined(DIAGNOSTIC)
  153 #define PMAP_INLINE     __gnu89_inline
  154 #else
  155 #define PMAP_INLINE
  156 #endif
  157 
  158 #ifdef PV_STATS
  159 #define PV_STAT(x)      do { x ; } while (0)
  160 #else
  161 #define PV_STAT(x)      do { } while (0)
  162 #endif
  163 
  164 #define pa_index(pa)    ((pa) >> PDRSHIFT)
  165 #define pa_to_pvh(pa)   (&pv_table[pa_index(pa)])
  166 
  167 struct pmap kernel_pmap_store;
  168 
  169 vm_offset_t virtual_avail;      /* VA of first avail page (after kernel bss) */
  170 vm_offset_t virtual_end;        /* VA of last avail page (end of kernel AS) */
  171 
  172 static int ndmpdp;
  173 static vm_paddr_t dmaplimit;
  174 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
  175 pt_entry_t pg_nx;
  176 
  177 SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
  178 
  179 static int pat_works = 1;
  180 SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
  181     "Is page attribute table fully functional?");
  182 
  183 static int pg_ps_enabled = 1;
  184 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
  185     "Are large page mappings enabled?");
  186 
  187 #define PAT_INDEX_SIZE  8
  188 static int pat_index[PAT_INDEX_SIZE];   /* cache mode to PAT index conversion */
  189 
  190 static u_int64_t        KPTphys;        /* phys addr of kernel level 1 */
  191 static u_int64_t        KPDphys;        /* phys addr of kernel level 2 */
  192 u_int64_t               KPDPphys;       /* phys addr of kernel level 3 */
  193 u_int64_t               KPML4phys;      /* phys addr of kernel level 4 */
  194 
  195 static u_int64_t        DMPDphys;       /* phys addr of direct mapped level 2 */
  196 static u_int64_t        DMPDPphys;      /* phys addr of direct mapped level 3 */
  197 
  198 /*
  199  * Data for the pv entry allocation mechanism
  200  */
  201 static int pv_entry_count;
  202 static struct md_page *pv_table;
  203 
  204 /*
  205  * All those kernel PT submaps that BSD is so fond of
  206  */
  207 pt_entry_t *CMAP1 = 0;
  208 caddr_t CADDR1 = 0;
  209 
  210 /*
  211  * Crashdump maps.
  212  */
  213 static caddr_t crashdumpmap;
  214 
  215 static void     free_pv_entry(pmap_t pmap, pv_entry_t pv);
  216 static pv_entry_t get_pv_entry(pmap_t locked_pmap, boolean_t try);
  217 static void     pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
  218 static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
  219 static void     pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
  220 static void     pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
  221 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
  222                     vm_offset_t va);
  223 static int      pmap_pvh_wired_mappings(struct md_page *pvh, int count);
  224 
  225 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
  226 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
  227 static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
  228     vm_offset_t va);
  229 static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
  230     vm_prot_t prot);
  231 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
  232     vm_page_t m, vm_prot_t prot, vm_page_t mpte);
  233 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
  234 static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
  235 static void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva);
  236 static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
  237 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
  238 static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
  239 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
  240 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
  241 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
  242     vm_prot_t prot);
  243 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
  244 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
  245                 vm_page_t *free);
  246 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq,
  247                 vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free);
  248 static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
  249 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
  250     vm_page_t *free);
  251 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
  252                 vm_offset_t va);
  253 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
  254 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
  255     vm_page_t m);
  256 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
  257     pd_entry_t newpde);
  258 static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
  259 
  260 static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags);
  261 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
  262 
  263 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags);
  264 static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
  265                 vm_page_t* free);
  266 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, vm_page_t *);
  267 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
  268 
  269 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
  270 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
  271 
  272 /*
  273  * Move the kernel virtual free pointer to the next
  274  * 2MB.  This is used to help improve performance
  275  * by using a large (2MB) page for much of the kernel
  276  * (.text, .data, .bss)
  277  */
  278 static vm_offset_t
  279 pmap_kmem_choose(vm_offset_t addr)
  280 {
  281         vm_offset_t newaddr = addr;
  282 
  283         newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
  284         return newaddr;
  285 }
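
A worked example of the rounding above, assuming the usual amd64 value NBPDR == 2MB (0x200000); the address chosen is illustrative only:

        /*
         * (addr + (NBPDR - 1)) & ~(NBPDR - 1), with addr = 0xffffffff80211000:
         *     0xffffffff80211000 + 0x1fffff  = 0xffffffff80410fff
         *     0xffffffff80410fff & ~0x1fffff = 0xffffffff80400000
         */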
  286 
  287 /********************/
  288 /* Inline functions */
  289 /********************/
  290 
  291 /* Return a non-clipped PD index for a given VA */
  292 static __inline vm_pindex_t
  293 pmap_pde_pindex(vm_offset_t va)
  294 {
  295         return va >> PDRSHIFT;
  296 }
  297 
  298 
  299 /* Return various clipped indexes for a given VA */
  300 static __inline vm_pindex_t
  301 pmap_pte_index(vm_offset_t va)
  302 {
  303 
  304         return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
  305 }
  306 
  307 static __inline vm_pindex_t
  308 pmap_pde_index(vm_offset_t va)
  309 {
  310 
  311         return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
  312 }
  313 
  314 static __inline vm_pindex_t
  315 pmap_pdpe_index(vm_offset_t va)
  316 {
  317 
  318         return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
  319 }
  320 
  321 static __inline vm_pindex_t
  322 pmap_pml4e_index(vm_offset_t va)
  323 {
  324 
  325         return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
  326 }
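
The four index functions above slice a canonical amd64 virtual address into 9-bit fields sitting on top of the 12-bit page offset.  The stand-alone sketch below mirrors that arithmetic with the conventional amd64 shift values (12, 21, 30, 39) rather than the kernel headers, and is illustrative only:

        #include <stdint.h>
        #include <stdio.h>

        int
        main(void)
        {
                uint64_t va = 0xffffffff80000000ull;    /* KERNBASE */

                printf("pml4 %llu pdp %llu pd %llu pt %llu\n",
                    (unsigned long long)((va >> 39) & 0x1ff),
                    (unsigned long long)((va >> 30) & 0x1ff),
                    (unsigned long long)((va >> 21) & 0x1ff),
                    (unsigned long long)((va >> 12) & 0x1ff));
                /*
                 * Prints "pml4 511 pdp 510 pd 0 pt 0": the KPML4I and KPDPI
                 * slots that create_pagetables() wires up below.
                 */
                return (0);
        }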
  327 
  328 /* Return a pointer to the PML4 slot that corresponds to a VA */
  329 static __inline pml4_entry_t *
  330 pmap_pml4e(pmap_t pmap, vm_offset_t va)
  331 {
  332 
  333         return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
  334 }
  335 
  336 /* Return a pointer to the PDP slot that corresponds to a VA */
  337 static __inline pdp_entry_t *
  338 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
  339 {
  340         pdp_entry_t *pdpe;
  341 
  342         pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
  343         return (&pdpe[pmap_pdpe_index(va)]);
  344 }
  345 
  346 /* Return a pointer to the PDP slot that corresponds to a VA */
  347 static __inline pdp_entry_t *
  348 pmap_pdpe(pmap_t pmap, vm_offset_t va)
  349 {
  350         pml4_entry_t *pml4e;
  351 
  352         pml4e = pmap_pml4e(pmap, va);
  353         if ((*pml4e & PG_V) == 0)
  354                 return NULL;
  355         return (pmap_pml4e_to_pdpe(pml4e, va));
  356 }
  357 
  358 /* Return a pointer to the PD slot that corresponds to a VA */
  359 static __inline pd_entry_t *
  360 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
  361 {
  362         pd_entry_t *pde;
  363 
  364         pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
  365         return (&pde[pmap_pde_index(va)]);
  366 }
  367 
  368 /* Return a pointer to the PD slot that corresponds to a VA */
  369 static __inline pd_entry_t *
  370 pmap_pde(pmap_t pmap, vm_offset_t va)
  371 {
  372         pdp_entry_t *pdpe;
  373 
  374         pdpe = pmap_pdpe(pmap, va);
  375         if (pdpe == NULL || (*pdpe & PG_V) == 0)
  376                  return NULL;
  377         return (pmap_pdpe_to_pde(pdpe, va));
  378 }
  379 
  380 /* Return a pointer to the PT slot that corresponds to a VA */
  381 static __inline pt_entry_t *
  382 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
  383 {
  384         pt_entry_t *pte;
  385 
  386         pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
  387         return (&pte[pmap_pte_index(va)]);
  388 }
  389 
  390 /* Return a pointer to the PT slot that corresponds to a VA */
  391 static __inline pt_entry_t *
  392 pmap_pte(pmap_t pmap, vm_offset_t va)
  393 {
  394         pd_entry_t *pde;
  395 
  396         pde = pmap_pde(pmap, va);
  397         if (pde == NULL || (*pde & PG_V) == 0)
  398                 return NULL;
  399         if ((*pde & PG_PS) != 0)        /* compat with i386 pmap_pte() */
  400                 return ((pt_entry_t *)pde);
  401         return (pmap_pde_to_pte(pde, va));
  402 }
  403 
  404 
  405 PMAP_INLINE pt_entry_t *
  406 vtopte(vm_offset_t va)
  407 {
  408         u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
  409 
  410         return (PTmap + ((va >> PAGE_SHIFT) & mask));
  411 }
  412 
  413 static __inline pd_entry_t *
  414 vtopde(vm_offset_t va)
  415 {
  416         u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
  417 
  418         return (PDmap + ((va >> PDRSHIFT) & mask));
  419 }
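
Both vtopte() and vtopde() depend on the recursive PML4 slot that create_pagetables() installs below (the PML4PML4I entry pointing back at KPML4phys).  With that slot in place, PTmap is a contiguous virtual window onto every page-table entry and PDmap onto every page-directory entry, so vtopte() only has to shift the VA right by PAGE_SHIFT and apply the 36-bit mask (four 9-bit index fields), and vtopde() likewise with PDRSHIFT and a 27-bit mask.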
  420 
  421 static u_int64_t
  422 allocpages(vm_paddr_t *firstaddr, int n)
  423 {
  424         u_int64_t ret;
  425 
  426         ret = *firstaddr;
  427         bzero((void *)ret, n * PAGE_SIZE);
  428         *firstaddr += n * PAGE_SIZE;
  429         return (ret);
  430 }
  431 
  432 static void
  433 create_pagetables(vm_paddr_t *firstaddr)
  434 {
  435         int i, j, ndm1g;
  436 
  437         /* Allocate pages */
  438         KPTphys = allocpages(firstaddr, NKPT);
  439         KPML4phys = allocpages(firstaddr, 1);
  440         KPDPphys = allocpages(firstaddr, NKPML4E);
  441         KPDphys = allocpages(firstaddr, NKPDPE);
  442 
  443         ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
  444         if (ndmpdp < 4)         /* Minimum 4GB of dirmap */
  445                 ndmpdp = 4;
  446         DMPDPphys = allocpages(firstaddr, NDMPML4E);
  447         ndm1g = 0;
  448         if ((amd_feature & AMDID_PAGE1GB) != 0)
  449                 ndm1g = ptoa(Maxmem) >> PDPSHIFT;
  450         if (ndm1g < ndmpdp)
  451                 DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
  452         dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
  453 
  454         /* Fill in the underlying page table pages */
  455         /* Read-only from zero to physfree */
  456         /* XXX not fully used, underneath 2M pages */
  457         for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
  458                 ((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
  459                 ((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
  460         }
  461 
  462         /* Now map the page tables at their location within PTmap */
  463         for (i = 0; i < NKPT; i++) {
  464                 ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
  465                 ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
  466         }
  467 
  468         /* Map from zero to end of allocations under 2M pages */
  469         /* This replaces some of the KPTphys entries above */
  470         for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
  471                 ((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
  472                 ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
  473         }
  474 
  475         /* And connect up the PD to the PDP */
  476         for (i = 0; i < NKPDPE; i++) {
  477                 ((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys +
  478                     (i << PAGE_SHIFT);
  479                 ((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
  480         }
  481 
  482         /*
  483          * Now, set up the direct map region using 2MB and/or 1GB pages.  If
  484          * the end of physical memory is not aligned to a 1GB page boundary,
  485          * then the residual physical memory is mapped with 2MB pages.  Later,
  486          * if pmap_mapdev{_attr}() uses the direct map for non-write-back
  487          * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
  488          * that are partially used. 
  489          */
  490         for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
  491                 ((pd_entry_t *)DMPDphys)[j] = (vm_paddr_t)i << PDRSHIFT;
  492                 /* Preset PG_M and PG_A because demotion expects it. */
  493                 ((pd_entry_t *)DMPDphys)[j] |= PG_RW | PG_V | PG_PS | PG_G |
  494                     PG_M | PG_A;
  495         }
  496         for (i = 0; i < ndm1g; i++) {
  497                 ((pdp_entry_t *)DMPDPphys)[i] = (vm_paddr_t)i << PDPSHIFT;
  498                 /* Preset PG_M and PG_A because demotion expects it. */
  499                 ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS | PG_G |
  500                     PG_M | PG_A;
  501         }
  502         for (j = 0; i < ndmpdp; i++, j++) {
  503                 ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (j << PAGE_SHIFT);
  504                 ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
  505         }
  506 
  507         /* And recursively map PML4 to itself in order to get PTmap */
  508         ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
  509         ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
  510 
  511         /* Connect the Direct Map slot up to the PML4 */
  512         ((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys;
  513         ((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U;
  514 
  515         /* Connect the KVA slot up to the PML4 */
  516         ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
  517         ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
  518 }
  519 
  520 /*
  521  *      Bootstrap the system enough to run with virtual memory.
  522  *
  523  *      On amd64 this is called after mapping has already been enabled
  524  *      and just syncs the pmap module with what has already been done.
  525  *      [We can't call it easily with mapping off since the kernel is not
  526  *      mapped with PA == VA, hence we would have to relocate every address
  527  *      from the linked base (virtual) address "KERNBASE" to the actual
  528  *      (physical) address starting relative to 0]
  529  */
  530 void
  531 pmap_bootstrap(vm_paddr_t *firstaddr)
  532 {
  533         vm_offset_t va;
  534         pt_entry_t *pte, *unused;
  535 
  536         /*
  537          * Create an initial set of page tables to run the kernel in.
  538          */
  539         create_pagetables(firstaddr);
  540 
  541         virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
  542         virtual_avail = pmap_kmem_choose(virtual_avail);
  543 
  544         virtual_end = VM_MAX_KERNEL_ADDRESS;
  545 
  546 
  547         /* XXX do %cr0 as well */
  548         load_cr4(rcr4() | CR4_PGE | CR4_PSE);
  549         load_cr3(KPML4phys);
  550 
  551         /*
  552          * Initialize the kernel pmap (which is statically allocated).
  553          */
  554         PMAP_LOCK_INIT(kernel_pmap);
  555         kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
  556         kernel_pmap->pm_root = NULL;
  557         kernel_pmap->pm_active = -1;    /* don't allow deactivation */
  558         TAILQ_INIT(&kernel_pmap->pm_pvchunk);
  559 
  560         /*
  561          * Reserve some special page table entries/VA space for temporary
  562          * mapping of pages.
  563          */
  564 #define SYSMAP(c, p, v, n)      \
  565         v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
  566 
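        /*
         * For example, the SYSMAP(caddr_t, CMAP1, CADDR1, 1) invocation below
         * expands to:
         *
         *  CADDR1 = (caddr_t)va; va += ((1)*PAGE_SIZE); CMAP1 = pte; pte += (1);
         */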
  567         va = virtual_avail;
  568         pte = vtopte(va);
  569 
  570         /*
  571          * CMAP1 is only used for the memory test.
  572          */
  573         SYSMAP(caddr_t, CMAP1, CADDR1, 1)
  574 
  575         /*
  576          * Crashdump maps.
  577          */
  578         SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
  579 
  580         virtual_avail = va;
  581 
  582         *CMAP1 = 0;
  583 
  584         invltlb();
  585 
  586         /* Initialize the PAT MSR. */
  587         pmap_init_pat();
  588 }
  589 
  590 /*
  591  * Setup the PAT MSR.
  592  */
  593 void
  594 pmap_init_pat(void)
  595 {
  596         int pat_table[PAT_INDEX_SIZE];
  597         uint64_t pat_msr;
  598         u_long cr0, cr4;
  599         int i;
  600 
  601         /* Bail if this CPU doesn't implement PAT. */
  602         if ((cpu_feature & CPUID_PAT) == 0)
  603                 panic("no PAT??");
  604 
  605         /* Set default PAT index table. */
  606         for (i = 0; i < PAT_INDEX_SIZE; i++)
  607                 pat_table[i] = -1;
  608         pat_table[PAT_WRITE_BACK] = 0;
  609         pat_table[PAT_WRITE_THROUGH] = 1;
  610         pat_table[PAT_UNCACHEABLE] = 3;
  611         pat_table[PAT_WRITE_COMBINING] = 3;
  612         pat_table[PAT_WRITE_PROTECTED] = 3;
  613         pat_table[PAT_UNCACHED] = 3;
  614 
  615         /* Initialize default PAT entries. */
  616         pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
  617             PAT_VALUE(1, PAT_WRITE_THROUGH) |
  618             PAT_VALUE(2, PAT_UNCACHED) |
  619             PAT_VALUE(3, PAT_UNCACHEABLE) |
  620             PAT_VALUE(4, PAT_WRITE_BACK) |
  621             PAT_VALUE(5, PAT_WRITE_THROUGH) |
  622             PAT_VALUE(6, PAT_UNCACHED) |
  623             PAT_VALUE(7, PAT_UNCACHEABLE);
  624 
  625         if (pat_works) {
  626                 /*
  627                  * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
  628                  * Program 5 and 6 as WP and WC.
  629                  * Leave 4 and 7 as WB and UC.
  630                  */
  631                 pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
  632                 pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
  633                     PAT_VALUE(6, PAT_WRITE_COMBINING);
  634                 pat_table[PAT_UNCACHED] = 2;
  635                 pat_table[PAT_WRITE_PROTECTED] = 5;
  636                 pat_table[PAT_WRITE_COMBINING] = 6;
  637         } else {
  638                 /*
  639                  * Just replace PAT Index 2 with WC instead of UC-.
  640                  */
  641                 pat_msr &= ~PAT_MASK(2);
  642                 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
  643                 pat_table[PAT_WRITE_COMBINING] = 2;
  644         }
  645 
  646         /* Disable PGE. */
  647         cr4 = rcr4();
  648         load_cr4(cr4 & ~CR4_PGE);
  649 
  650         /* Disable caches (CD = 1, NW = 0). */
  651         cr0 = rcr0();
  652         load_cr0((cr0 & ~CR0_NW) | CR0_CD);
  653 
  654         /* Flushes caches and TLBs. */
  655         wbinvd();
  656         invltlb();
  657 
  658         /* Update PAT and index table. */
  659         wrmsr(MSR_PAT, pat_msr);
  660         for (i = 0; i < PAT_INDEX_SIZE; i++)
  661                 pat_index[i] = pat_table[i];
  662 
  663         /* Flush caches and TLBs again. */
  664         wbinvd();
  665         invltlb();
  666 
  667         /* Restore caches and PGE. */
  668         load_cr0(cr0);
  669         load_cr4(cr4);
  670 }
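
For reference, when the pat_works branch is taken the PAT ends up programmed as follows, which is what the pat_index[] table above encodes:

        /* PAT index:   0    1    2    3    4    5    6    7        */
        /* memory type: WB   WT   UC-  UC   WB   WP   WC   UC       */
        /* pat_index[]: WB->0, WT->1, UNCACHED(UC-)->2, UC->3, WP->5, WC->6 */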
  671 
  672 /*
  673  *      Initialize a vm_page's machine-dependent fields.
  674  */
  675 void
  676 pmap_page_init(vm_page_t m)
  677 {
  678 
  679         TAILQ_INIT(&m->md.pv_list);
  680         m->md.pat_mode = PAT_WRITE_BACK;
  681 }
  682 
  683 /*
  684  *      Initialize the pmap module.
  685  *      Called by vm_init, to initialize any structures that the pmap
  686  *      system needs to map virtual memory.
  687  */
  688 void
  689 pmap_init(void)
  690 {
  691         vm_page_t mpte;
  692         vm_size_t s;
  693         int i, pv_npg;
  694 
  695         /*
  696          * Initialize the vm page array entries for the kernel pmap's
  697          * page table pages.
  698          */ 
  699         for (i = 0; i < NKPT; i++) {
  700                 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
  701                 KASSERT(mpte >= vm_page_array &&
  702                     mpte < &vm_page_array[vm_page_array_size],
  703                     ("pmap_init: page table page is out of range"));
  704                 mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
  705                 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
  706         }
  707 
  708         /*
  709          * If the kernel is running in a virtual machine on an AMD Family 10h
  710          * processor, then it must assume that MCA is enabled by the virtual
  711          * machine monitor.
  712          */
  713         if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD &&
  714             CPUID_TO_FAMILY(cpu_id) == 0x10)
  715                 workaround_erratum383 = 1;
  716 
  717         /*
  718          * Are large page mappings enabled?
  719          */
  720         TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
  721         if (pg_ps_enabled) {
  722                 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
  723                     ("pmap_init: can't assign to pagesizes[1]"));
  724                 pagesizes[1] = NBPDR;
  725         }
  726 
  727         /*
  728          * Calculate the size of the pv head table for superpages.
  729          */
  730         for (i = 0; phys_avail[i + 1]; i += 2);
  731         pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR;
  732 
  733         /*
  734          * Allocate memory for the pv head table for superpages.
  735          */
  736         s = (vm_size_t)(pv_npg * sizeof(struct md_page));
  737         s = round_page(s);
  738         pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
  739         for (i = 0; i < pv_npg; i++)
  740                 TAILQ_INIT(&pv_table[i].pv_list);
  741 }
  742 
  743 SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
  744     "2MB page mapping counters");
  745 
  746 static u_long pmap_pde_demotions;
  747 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
  748     &pmap_pde_demotions, 0, "2MB page demotions");
  749 
  750 static u_long pmap_pde_mappings;
  751 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
  752     &pmap_pde_mappings, 0, "2MB page mappings");
  753 
  754 static u_long pmap_pde_p_failures;
  755 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
  756     &pmap_pde_p_failures, 0, "2MB page promotion failures");
  757 
  758 static u_long pmap_pde_promotions;
  759 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
  760     &pmap_pde_promotions, 0, "2MB page promotions");
  761 
  762 SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
  763     "1GB page mapping counters");
  764 
  765 static u_long pmap_pdpe_demotions;
  766 SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
  767     &pmap_pdpe_demotions, 0, "1GB page demotions");
  768 
  769 
  770 /***************************************************
  771  * Low level helper routines.....
  772  ***************************************************/
  773 
  774 /*
  775  * Determine the appropriate bits to set in a PTE or PDE for a specified
  776  * caching mode.
  777  */
  778 static int
  779 pmap_cache_bits(int mode, boolean_t is_pde)
  780 {
  781         int cache_bits, pat_flag, pat_idx;
  782 
  783         if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
  784                 panic("Unknown caching mode %d\n", mode);
  785 
  786         /* The PAT bit is different for PTE's and PDE's. */
  787         pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
  788 
  789         /* Map the caching mode to a PAT index. */
  790         pat_idx = pat_index[mode];
  791 
  792         /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
  793         cache_bits = 0;
  794         if (pat_idx & 0x4)
  795                 cache_bits |= pat_flag;
  796         if (pat_idx & 0x2)
  797                 cache_bits |= PG_NC_PCD;
  798         if (pat_idx & 0x1)
  799                 cache_bits |= PG_NC_PWT;
  800         return (cache_bits);
  801 }
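
A concrete case, following the pat_works path set up in pmap_init_pat() above:

        /*
         * pat_index[PAT_WRITE_COMBINING] == 6 (binary 110), so
         * pmap_cache_bits(PAT_WRITE_COMBINING, 0) returns PG_PTE_PAT | PG_NC_PCD
         * and pmap_cache_bits(PAT_WRITE_COMBINING, 1) returns PG_PDE_PAT | PG_NC_PCD.
         */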
  802 
  803 /*
  804  * After changing the page size for the specified virtual address in the page
  805  * table, flush the corresponding entries from the processor's TLB.  Only the
  806  * calling processor's TLB is affected.
  807  *
  808  * The calling thread must be pinned to a processor.
  809  */
  810 static void
  811 pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
  812 {
  813         u_long cr4;
  814 
  815         if ((newpde & PG_PS) == 0)
  816                 /* Demotion: flush a specific 2MB page mapping. */
  817                 invlpg(va);
  818         else if ((newpde & PG_G) == 0)
  819                 /*
  820                  * Promotion: flush every 4KB page mapping from the TLB
  821                  * because there are too many to flush individually.
  822                  */
  823                 invltlb();
  824         else {
  825                 /*
  826                  * Promotion: flush every 4KB page mapping from the TLB,
  827                  * including any global (PG_G) mappings.
  828                  */
  829                 cr4 = rcr4();
  830                 load_cr4(cr4 & ~CR4_PGE);
  831                 /*
  832                  * Although preemption at this point could be detrimental to
  833                  * performance, it would not lead to an error.  PG_G is simply
  834                  * ignored if CR4.PGE is clear.  Moreover, in case this block
  835                  * is re-entered, the load_cr4() either above or below will
  836                  * modify CR4.PGE flushing the TLB.
  837                  */
  838                 load_cr4(cr4 | CR4_PGE);
  839         }
  840 }
  841 #ifdef SMP
  842 /*
  843  * For SMP, these functions have to use the IPI mechanism for coherence.
  844  *
  845  * N.B.: Before calling any of the following TLB invalidation functions,
  846  * the calling processor must ensure that all stores updating a non-
  847  * kernel page table are globally performed.  Otherwise, another
  848  * processor could cache an old, pre-update entry without being
  849  * invalidated.  This can happen one of two ways: (1) The pmap becomes
  850  * active on another processor after its pm_active field is checked by
  851  * one of the following functions but before a store updating the page
  852  * table is globally performed. (2) The pmap becomes active on another
  853  * processor before its pm_active field is checked but due to
   854  * speculative loads one of the following functions still reads the
  855  * pmap as inactive on the other processor.
  856  * 
  857  * The kernel page table is exempt because its pm_active field is
  858  * immutable.  The kernel page table is always active on every
  859  * processor.
  860  */
  861 void
  862 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
  863 {
  864         cpumask_t cpumask, other_cpus;
  865 
  866         sched_pin();
  867         if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
  868                 invlpg(va);
  869                 smp_invlpg(va);
  870         } else {
  871                 cpumask = PCPU_GET(cpumask);
  872                 other_cpus = PCPU_GET(other_cpus);
  873                 if (pmap->pm_active & cpumask)
  874                         invlpg(va);
  875                 if (pmap->pm_active & other_cpus)
  876                         smp_masked_invlpg(pmap->pm_active & other_cpus, va);
  877         }
  878         sched_unpin();
  879 }
  880 
  881 void
  882 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
  883 {
  884         cpumask_t cpumask, other_cpus;
  885         vm_offset_t addr;
  886 
  887         sched_pin();
  888         if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
  889                 for (addr = sva; addr < eva; addr += PAGE_SIZE)
  890                         invlpg(addr);
  891                 smp_invlpg_range(sva, eva);
  892         } else {
  893                 cpumask = PCPU_GET(cpumask);
  894                 other_cpus = PCPU_GET(other_cpus);
  895                 if (pmap->pm_active & cpumask)
  896                         for (addr = sva; addr < eva; addr += PAGE_SIZE)
  897                                 invlpg(addr);
  898                 if (pmap->pm_active & other_cpus)
  899                         smp_masked_invlpg_range(pmap->pm_active & other_cpus,
  900                             sva, eva);
  901         }
  902         sched_unpin();
  903 }
  904 
  905 void
  906 pmap_invalidate_all(pmap_t pmap)
  907 {
  908         cpumask_t cpumask, other_cpus;
  909 
  910         sched_pin();
  911         if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
  912                 invltlb();
  913                 smp_invltlb();
  914         } else {
  915                 cpumask = PCPU_GET(cpumask);
  916                 other_cpus = PCPU_GET(other_cpus);
  917                 if (pmap->pm_active & cpumask)
  918                         invltlb();
  919                 if (pmap->pm_active & other_cpus)
  920                         smp_masked_invltlb(pmap->pm_active & other_cpus);
  921         }
  922         sched_unpin();
  923 }
  924 
  925 void
  926 pmap_invalidate_cache(void)
  927 {
  928 
  929         sched_pin();
  930         wbinvd();
  931         smp_cache_flush();
  932         sched_unpin();
  933 }
  934 
  935 struct pde_action {
  936         cpumask_t store;        /* processor that updates the PDE */
  937         cpumask_t invalidate;   /* processors that invalidate their TLB */
  938         vm_offset_t va;
  939         pd_entry_t *pde;
  940         pd_entry_t newpde;
  941 };
  942 
  943 static void
  944 pmap_update_pde_action(void *arg)
  945 {
  946         struct pde_action *act = arg;
  947 
  948         if (act->store == PCPU_GET(cpumask))
  949                 pde_store(act->pde, act->newpde);
  950 }
  951 
  952 static void
  953 pmap_update_pde_teardown(void *arg)
  954 {
  955         struct pde_action *act = arg;
  956 
  957         if ((act->invalidate & PCPU_GET(cpumask)) != 0)
  958                 pmap_update_pde_invalidate(act->va, act->newpde);
  959 }
  960 
  961 /*
  962  * Change the page size for the specified virtual address in a way that
  963  * prevents any possibility of the TLB ever having two entries that map the
  964  * same virtual address using different page sizes.  This is the recommended
  965  * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
  966  * machine check exception for a TLB state that is improperly diagnosed as a
  967  * hardware error.
  968  */
  969 static void
  970 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
  971 {
  972         struct pde_action act;
  973         cpumask_t active, cpumask;
  974 
  975         sched_pin();
  976         cpumask = PCPU_GET(cpumask);
  977         if (pmap == kernel_pmap)
  978                 active = all_cpus;
  979         else
  980                 active = pmap->pm_active;
  981         if ((active & PCPU_GET(other_cpus)) != 0) {
  982                 act.store = cpumask;
  983                 act.invalidate = active;
  984                 act.va = va;
  985                 act.pde = pde;
  986                 act.newpde = newpde;
  987                 smp_rendezvous_cpus(cpumask | active,
  988                     smp_no_rendevous_barrier, pmap_update_pde_action,
  989                     pmap_update_pde_teardown, &act);
  990         } else {
  991                 pde_store(pde, newpde);
  992                 if ((active & cpumask) != 0)
  993                         pmap_update_pde_invalidate(va, newpde);
  994         }
  995         sched_unpin();
  996 }
  997 #else /* !SMP */
  998 /*
  999  * Normal, non-SMP, invalidation functions.
 1000  * We inline these within pmap.c for speed.
 1001  */
 1002 PMAP_INLINE void
 1003 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 1004 {
 1005 
 1006         if (pmap == kernel_pmap || pmap->pm_active)
 1007                 invlpg(va);
 1008 }
 1009 
 1010 PMAP_INLINE void
 1011 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 1012 {
 1013         vm_offset_t addr;
 1014 
 1015         if (pmap == kernel_pmap || pmap->pm_active)
 1016                 for (addr = sva; addr < eva; addr += PAGE_SIZE)
 1017                         invlpg(addr);
 1018 }
 1019 
 1020 PMAP_INLINE void
 1021 pmap_invalidate_all(pmap_t pmap)
 1022 {
 1023 
 1024         if (pmap == kernel_pmap || pmap->pm_active)
 1025                 invltlb();
 1026 }
 1027 
 1028 PMAP_INLINE void
 1029 pmap_invalidate_cache(void)
 1030 {
 1031 
 1032         wbinvd();
 1033 }
 1034 
 1035 static void
 1036 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 1037 {
 1038 
 1039         pde_store(pde, newpde);
 1040         if (pmap == kernel_pmap || pmap->pm_active)
 1041                 pmap_update_pde_invalidate(va, newpde);
 1042 }
 1043 #endif /* !SMP */
 1044 
 1045 static void
 1046 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
 1047 {
 1048 
 1049         KASSERT((sva & PAGE_MASK) == 0,
 1050             ("pmap_invalidate_cache_range: sva not page-aligned"));
 1051         KASSERT((eva & PAGE_MASK) == 0,
 1052             ("pmap_invalidate_cache_range: eva not page-aligned"));
 1053 
 1054         if (cpu_feature & CPUID_SS)
 1055                 ; /* If "Self Snoop" is supported, do nothing. */
 1056         else if ((cpu_feature & CPUID_CLFSH) != 0 &&
 1057                  eva - sva < 2 * 1024 * 1024) {
 1058 
 1059                 /*
 1060                  * XXX: Some CPUs fault, hang, or trash the local APIC
 1061                  * registers if we use CLFLUSH on the local APIC
 1062                  * range.  The local APIC is always uncached, so we
 1063                  * don't need to flush for that range anyway.
 1064                  */
 1065                 if (pmap_kextract(sva) == lapic_paddr)
 1066                         return;
 1067 
 1068                 /*
 1069                  * Otherwise, do per-cache line flush.  Use the mfence
  1070                  * instruction to ensure that previous stores are
 1071                  * included in the write-back.  The processor
 1072                  * propagates flush to other processors in the cache
 1073                  * coherence domain.
 1074                  */
 1075                 mfence();
 1076                 for (; sva < eva; sva += cpu_clflush_line_size)
 1077                         clflush(sva);
 1078                 mfence();
 1079         } else {
 1080 
 1081                 /*
 1082                  * No targeted cache flush methods are supported by CPU,
 1083                  * or the supplied range is bigger than 2MB.
 1084                  * Globally invalidate cache.
 1085                  */
 1086                 pmap_invalidate_cache();
 1087         }
 1088 }
 1089 
 1090 /*
 1091  * Are we current address space or kernel?
 1092  */
 1093 static __inline int
 1094 pmap_is_current(pmap_t pmap)
 1095 {
 1096         return (pmap == kernel_pmap ||
 1097             (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME));
 1098 }
 1099 
 1100 /*
 1101  *      Routine:        pmap_extract
 1102  *      Function:
 1103  *              Extract the physical page address associated
 1104  *              with the given map/virtual_address pair.
 1105  */
 1106 vm_paddr_t 
 1107 pmap_extract(pmap_t pmap, vm_offset_t va)
 1108 {
 1109         pdp_entry_t *pdpe;
 1110         pd_entry_t *pde;
 1111         pt_entry_t *pte;
 1112         vm_paddr_t pa;
 1113 
 1114         pa = 0;
 1115         PMAP_LOCK(pmap);
 1116         pdpe = pmap_pdpe(pmap, va);
 1117         if (pdpe != NULL && (*pdpe & PG_V) != 0) {
 1118                 if ((*pdpe & PG_PS) != 0)
 1119                         pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
 1120                 else {
 1121                         pde = pmap_pdpe_to_pde(pdpe, va);
 1122                         if ((*pde & PG_V) != 0) {
 1123                                 if ((*pde & PG_PS) != 0) {
 1124                                         pa = (*pde & PG_PS_FRAME) |
 1125                                             (va & PDRMASK);
 1126                                 } else {
 1127                                         pte = pmap_pde_to_pte(pde, va);
 1128                                         pa = (*pte & PG_FRAME) |
 1129                                             (va & PAGE_MASK);
 1130                                 }
 1131                         }
 1132                 }
 1133         }
 1134         PMAP_UNLOCK(pmap);
 1135         return (pa);
 1136 }
 1137 
 1138 /*
 1139  *      Routine:        pmap_extract_and_hold
 1140  *      Function:
 1141  *              Atomically extract and hold the physical page
 1142  *              with the given pmap and virtual address pair
 1143  *              if that mapping permits the given protection.
 1144  */
 1145 vm_page_t
 1146 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 1147 {
 1148         pd_entry_t pde, *pdep;
 1149         pt_entry_t pte;
 1150         vm_page_t m;
 1151 
 1152         m = NULL;
 1153         vm_page_lock_queues();
 1154         PMAP_LOCK(pmap);
 1155         pdep = pmap_pde(pmap, va);
 1156         if (pdep != NULL && (pde = *pdep)) {
 1157                 if (pde & PG_PS) {
 1158                         if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
 1159                                 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
 1160                                     (va & PDRMASK));
 1161                                 vm_page_hold(m);
 1162                         }
 1163                 } else {
 1164                         pte = *pmap_pde_to_pte(pdep, va);
 1165                         if ((pte & PG_V) &&
 1166                             ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
 1167                                 m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
 1168                                 vm_page_hold(m);
 1169                         }
 1170                 }
 1171         }
 1172         vm_page_unlock_queues();
 1173         PMAP_UNLOCK(pmap);
 1174         return (m);
 1175 }
 1176 
 1177 vm_paddr_t
 1178 pmap_kextract(vm_offset_t va)
 1179 {
 1180         pd_entry_t pde;
 1181         vm_paddr_t pa;
 1182 
 1183         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
 1184                 pa = DMAP_TO_PHYS(va);
 1185         } else {
 1186                 pde = *vtopde(va);
 1187                 if (pde & PG_PS) {
 1188                         pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
 1189                 } else {
 1190                         /*
 1191                          * Beware of a concurrent promotion that changes the
 1192                          * PDE at this point!  For example, vtopte() must not
 1193                          * be used to access the PTE because it would use the
 1194                          * new PDE.  It is, however, safe to use the old PDE
 1195                          * because the page table page is preserved by the
 1196                          * promotion.
 1197                          */
 1198                         pa = *pmap_pde_to_pte(&pde, va);
 1199                         pa = (pa & PG_FRAME) | (va & PAGE_MASK);
 1200                 }
 1201         }
 1202         return pa;
 1203 }
 1204 
 1205 /***************************************************
 1206  * Low level mapping routines.....
 1207  ***************************************************/
 1208 
 1209 /*
 1210  * Add a wired page to the kva.
 1211  * Note: not SMP coherent.
 1212  */
 1213 PMAP_INLINE void 
 1214 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
 1215 {
 1216         pt_entry_t *pte;
 1217 
 1218         pte = vtopte(va);
 1219         pte_store(pte, pa | PG_RW | PG_V | PG_G);
 1220 }
 1221 
 1222 static __inline void
 1223 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
 1224 {
 1225         pt_entry_t *pte;
 1226 
 1227         pte = vtopte(va);
 1228         pte_store(pte, pa | PG_RW | PG_V | PG_G | pmap_cache_bits(mode, 0));
 1229 }
 1230 
 1231 /*
 1232  * Remove a page from the kernel pagetables.
 1233  * Note: not SMP coherent.
 1234  */
 1235 PMAP_INLINE void
 1236 pmap_kremove(vm_offset_t va)
 1237 {
 1238         pt_entry_t *pte;
 1239 
 1240         pte = vtopte(va);
 1241         pte_clear(pte);
 1242 }
 1243 
 1244 /*
 1245  *      Used to map a range of physical addresses into kernel
 1246  *      virtual address space.
 1247  *
 1248  *      The value passed in '*virt' is a suggested virtual address for
 1249  *      the mapping. Architectures which can support a direct-mapped
 1250  *      physical to virtual region can return the appropriate address
 1251  *      within that region, leaving '*virt' unchanged. Other
 1252  *      architectures should map the pages starting at '*virt' and
 1253  *      update '*virt' with the first usable address after the mapped
 1254  *      region.
 1255  */
 1256 vm_offset_t
 1257 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
 1258 {
 1259         return PHYS_TO_DMAP(start);
 1260 }
 1261 
 1262 
 1263 /*
 1264  * Add a list of wired pages to the kva
 1265  * this routine is only used for temporary
 1266  * kernel mappings that do not need to have
 1267  * page modification or references recorded.
 1268  * Note that old mappings are simply written
 1269  * over.  The page *must* be wired.
 1270  * Note: SMP coherent.  Uses a ranged shootdown IPI.
 1271  */
 1272 void
 1273 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
 1274 {
 1275         pt_entry_t *endpte, oldpte, pa, *pte;
 1276         vm_page_t m;
 1277 
 1278         oldpte = 0;
 1279         pte = vtopte(sva);
 1280         endpte = pte + count;
 1281         while (pte < endpte) {
 1282                 m = *ma++;
 1283                 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
 1284                 if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
 1285                         oldpte |= *pte;
 1286                         pte_store(pte, pa | PG_G | PG_RW | PG_V);
 1287                 }
 1288                 pte++;
 1289         }
 1290         if (__predict_false((oldpte & PG_V) != 0))
 1291                 pmap_invalidate_range(kernel_pmap, sva, sva + count *
 1292                     PAGE_SIZE);
 1293 }
 1294 
 1295 /*
 1296  * This routine tears out page mappings from the
 1297  * kernel -- it is meant only for temporary mappings.
 1298  * Note: SMP coherent.  Uses a ranged shootdown IPI.
 1299  */
 1300 void
 1301 pmap_qremove(vm_offset_t sva, int count)
 1302 {
 1303         vm_offset_t va;
 1304 
 1305         va = sva;
 1306         while (count-- > 0) {
 1307                 pmap_kremove(va);
 1308                 va += PAGE_SIZE;
 1309         }
 1310         pmap_invalidate_range(kernel_pmap, sva, va);
 1311 }
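
A typical caller pairs pmap_qenter() with pmap_qremove() over a KVA range it owns and does not need to invalidate the TLB itself.  The sketch below is illustrative only; kmem_alloc_nofault()/kmem_free() are simply the stock FreeBSD 8-era KVA allocators, and 'ma' is assumed to hold 'n' wired vm_page_t pointers:

        vm_offset_t va;

        va = kmem_alloc_nofault(kernel_map, n * PAGE_SIZE);
        pmap_qenter(va, ma, n);                 /* map the wired pages */
        /* ... access the pages through va ... */
        pmap_qremove(va, n);                    /* tear the mappings down */
        kmem_free(kernel_map, va, n * PAGE_SIZE);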
 1312 
 1313 /***************************************************
 1314  * Page table page management routines.....
 1315  ***************************************************/
 1316 static __inline void
 1317 pmap_free_zero_pages(vm_page_t free)
 1318 {
 1319         vm_page_t m;
 1320 
 1321         while (free != NULL) {
 1322                 m = free;
 1323                 free = m->right;
 1324                 /* Preserve the page's PG_ZERO setting. */
 1325                 vm_page_free_toq(m);
 1326         }
 1327 }
 1328 
 1329 /*
 1330  * Schedule the specified unused page table page to be freed.  Specifically,
 1331  * add the page to the specified list of pages that will be released to the
 1332  * physical memory manager after the TLB has been updated.
 1333  */
 1334 static __inline void
 1335 pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
 1336 {
 1337 
 1338         if (set_PG_ZERO)
 1339                 m->flags |= PG_ZERO;
 1340         else
 1341                 m->flags &= ~PG_ZERO;
 1342         m->right = *free;
 1343         *free = m;
 1344 }
 1345         
 1346 /*
 1347  * Inserts the specified page table page into the specified pmap's collection
 1348  * of idle page table pages.  Each of a pmap's page table pages is responsible
 1349  * for mapping a distinct range of virtual addresses.  The pmap's collection is
 1350  * ordered by this virtual address range.
 1351  */
 1352 static void
 1353 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
 1354 {
 1355         vm_page_t root;
 1356 
 1357         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 1358         root = pmap->pm_root;
 1359         if (root == NULL) {
 1360                 mpte->left = NULL;
 1361                 mpte->right = NULL;
 1362         } else {
 1363                 root = vm_page_splay(mpte->pindex, root);
 1364                 if (mpte->pindex < root->pindex) {
 1365                         mpte->left = root->left;
 1366                         mpte->right = root;
 1367                         root->left = NULL;
 1368                 } else if (mpte->pindex == root->pindex)
 1369                         panic("pmap_insert_pt_page: pindex already inserted");
 1370                 else {
 1371                         mpte->right = root->right;
 1372                         mpte->left = root;
 1373                         root->right = NULL;
 1374                 }
 1375         }
 1376         pmap->pm_root = mpte;
 1377 }
 1378 
 1379 /*
 1380  * Looks for a page table page mapping the specified virtual address in the
 1381  * specified pmap's collection of idle page table pages.  Returns NULL if there
 1382  * is no page table page corresponding to the specified virtual address.
 1383  */
 1384 static vm_page_t
 1385 pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
 1386 {
 1387         vm_page_t mpte;
 1388         vm_pindex_t pindex = pmap_pde_pindex(va);
 1389 
 1390         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 1391         if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
 1392                 mpte = vm_page_splay(pindex, mpte);
 1393                 if ((pmap->pm_root = mpte)->pindex != pindex)
 1394                         mpte = NULL;
 1395         }
 1396         return (mpte);
 1397 }
 1398 
 1399 /*
 1400  * Removes the specified page table page from the specified pmap's collection
 1401  * of idle page table pages.  The specified page table page must be a member of
 1402  * the pmap's collection.
 1403  */
 1404 static void
 1405 pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
 1406 {
 1407         vm_page_t root;
 1408 
 1409         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 1410         if (mpte != pmap->pm_root) {
 1411                 root = vm_page_splay(mpte->pindex, pmap->pm_root);
 1412                 KASSERT(mpte == root,
 1413                     ("pmap_remove_pt_page: mpte %p is missing from pmap %p",
 1414                     mpte, pmap));
 1415         }
 1416         if (mpte->left == NULL)
 1417                 root = mpte->right;
 1418         else {
 1419                 root = vm_page_splay(mpte->pindex, mpte->left);
 1420                 root->right = mpte->right;
 1421         }
 1422         pmap->pm_root = root;
 1423 }
 1424 
 1425 /*
 1426  * This routine unholds page table pages, and if the hold count
 1427  * drops to zero, then it decrements the wire count.
 1428  */
 1429 static __inline int
 1430 pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free)
 1431 {
 1432 
 1433         --m->wire_count;
 1434         if (m->wire_count == 0)
 1435                 return _pmap_unwire_pte_hold(pmap, va, m, free);
 1436         else
 1437                 return 0;
 1438 }
 1439 
 1440 static int 
 1441 _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, 
 1442     vm_page_t *free)
 1443 {
 1444 
 1445         /*
 1446          * unmap the page table page
 1447          */
 1448         if (m->pindex >= (NUPDE + NUPDPE)) {
 1449                 /* PDP page */
 1450                 pml4_entry_t *pml4;
 1451                 pml4 = pmap_pml4e(pmap, va);
 1452                 *pml4 = 0;
 1453         } else if (m->pindex >= NUPDE) {
 1454                 /* PD page */
 1455                 pdp_entry_t *pdp;
 1456                 pdp = pmap_pdpe(pmap, va);
 1457                 *pdp = 0;
 1458         } else {
 1459                 /* PTE page */
 1460                 pd_entry_t *pd;
 1461                 pd = pmap_pde(pmap, va);
 1462                 *pd = 0;
 1463         }
 1464         --pmap->pm_stats.resident_count;
 1465         if (m->pindex < NUPDE) {
 1466                 /* We just released a PT, unhold the matching PD */
 1467                 vm_page_t pdpg;
 1468 
 1469                 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
 1470                 pmap_unwire_pte_hold(pmap, va, pdpg, free);
 1471         }
 1472         if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
 1473                 /* We just released a PD, unhold the matching PDP */
 1474                 vm_page_t pdppg;
 1475 
 1476                 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
 1477                 pmap_unwire_pte_hold(pmap, va, pdppg, free);
 1478         }
 1479 
 1480         /*
 1481          * This is a release store so that the ordinary store unmapping
 1482          * the page table page is globally performed before TLB shoot-
 1483          * down is begun.
 1484          */
 1485         atomic_subtract_rel_int(&cnt.v_wire_count, 1);
 1486 
 1487         /* 
 1488          * Put page on a list so that it is released after
 1489          * *ALL* TLB shootdown is done
 1490          */
 1491         pmap_add_delayed_free_list(m, free, TRUE);
 1492         
 1493         return 1;
 1494 }
 1495 
 1496 /*
 1497  * After removing a page table entry, this routine is used to
 1498  * conditionally free the page, and manage the hold/wire counts.
 1499  */
 1500 static int
 1501 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, vm_page_t *free)
 1502 {
 1503         vm_page_t mpte;
 1504 
 1505         if (va >= VM_MAXUSER_ADDRESS)
 1506                 return 0;
 1507         KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
 1508         mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
 1509         return pmap_unwire_pte_hold(pmap, va, mpte, free);
 1510 }
 1511 
 1512 void
 1513 pmap_pinit0(pmap_t pmap)
 1514 {
 1515 
 1516         PMAP_LOCK_INIT(pmap);
 1517         pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
 1518         pmap->pm_root = NULL;
 1519         pmap->pm_active = 0;
 1520         PCPU_SET(curpmap, pmap);
 1521         TAILQ_INIT(&pmap->pm_pvchunk);
 1522         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 1523 }
 1524 
 1525 /*
 1526  * Initialize a preallocated and zeroed pmap structure,
 1527  * such as one in a vmspace structure.
 1528  */
 1529 int
 1530 pmap_pinit(pmap_t pmap)
 1531 {
 1532         vm_page_t pml4pg;
 1533         static vm_pindex_t color;
 1534 
 1535         PMAP_LOCK_INIT(pmap);
 1536 
 1537         /*
 1538          * allocate the page directory page
 1539          */
 1540         while ((pml4pg = vm_page_alloc(NULL, color++, VM_ALLOC_NOOBJ |
 1541             VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
 1542                 VM_WAIT;
 1543 
 1544         pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
 1545 
 1546         if ((pml4pg->flags & PG_ZERO) == 0)
 1547                 pagezero(pmap->pm_pml4);
 1548 
 1549         /* Wire in kernel global address entries. */
 1550         pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
 1551         pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U;
 1552 
 1553         /* install the self-referential address mapping entry */
 1554         pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
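              /*
               * The entry installed above maps the PML4 page onto itself.
               * Through this recursive slot, every page table page that
               * belongs to the pmap appears at a fixed virtual address,
               * which is what vtopte() relies on.
               */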
 1555 
 1556         pmap->pm_root = NULL;
 1557         pmap->pm_active = 0;
 1558         TAILQ_INIT(&pmap->pm_pvchunk);
 1559         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 1560 
 1561         return (1);
 1562 }
 1563 
 1564 /*
 1565  * This routine is called when the page table page for the given index is
 1566  * not mapped and must be allocated.
 1567  *
 1568  * Note: If a page allocation fails at page table level two or three,
 1569  * one or two pages may be held during the wait, only to be released
 1570  * afterwards.  This conservative approach is easily argued to avoid
 1571  * race conditions.
 1572  */
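      /*
       * The page table page index ("ptepindex") encodes which level of the
       * paging hierarchy is being allocated:
       *
       *      ptepindex < NUPDE                     a page table (PT) page
       *      NUPDE <= ptepindex < NUPDE + NUPDPE   a page directory (PD) page
       *      NUPDE + NUPDPE <= ptepindex           a page directory pointer
       *                                            (PDP) page
       *
       * The three arms of the conditional below correspond to these ranges.
       */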
 1573 static vm_page_t
 1574 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags)
 1575 {
 1576         vm_page_t m, pdppg, pdpg;
 1577 
 1578         KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
 1579             (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
 1580             ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
 1581 
 1582         /*
 1583          * Allocate a page table page.
 1584          */
 1585         if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
 1586             VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
 1587                 if (flags & M_WAITOK) {
 1588                         PMAP_UNLOCK(pmap);
 1589                         vm_page_unlock_queues();
 1590                         VM_WAIT;
 1591                         vm_page_lock_queues();
 1592                         PMAP_LOCK(pmap);
 1593                 }
 1594 
 1595                 /*
 1596                  * Indicate the need to retry.  While waiting, the page table
 1597                  * page may have been allocated.
 1598                  */
 1599                 return (NULL);
 1600         }
 1601         if ((m->flags & PG_ZERO) == 0)
 1602                 pmap_zero_page(m);
 1603 
 1604         /*
 1605          * Map the pagetable page into the process address space, if
 1606          * it isn't already there.
 1607          */
 1608 
 1609         if (ptepindex >= (NUPDE + NUPDPE)) {
 1610                 pml4_entry_t *pml4;
 1611                 vm_pindex_t pml4index;
 1612 
 1613                 /* Wire up a new PDPE page */
 1614                 pml4index = ptepindex - (NUPDE + NUPDPE);
 1615                 pml4 = &pmap->pm_pml4[pml4index];
 1616                 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 1617 
 1618         } else if (ptepindex >= NUPDE) {
 1619                 vm_pindex_t pml4index;
 1620                 vm_pindex_t pdpindex;
 1621                 pml4_entry_t *pml4;
 1622                 pdp_entry_t *pdp;
 1623 
 1624                 /* Wire up a new PDE page */
 1625                 pdpindex = ptepindex - NUPDE;
 1626                 pml4index = pdpindex >> NPML4EPGSHIFT;
 1627 
 1628                 pml4 = &pmap->pm_pml4[pml4index];
 1629                 if ((*pml4 & PG_V) == 0) {
 1630                         /* Have to allocate a new pdp, recurse */
 1631                         if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
 1632                             flags) == NULL) {
 1633                                 --m->wire_count;
 1634                                 atomic_subtract_int(&cnt.v_wire_count, 1);
 1635                                 vm_page_free_zero(m);
 1636                                 return (NULL);
 1637                         }
 1638                 } else {
 1639                         /* Add reference to pdp page */
 1640                         pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
 1641                         pdppg->wire_count++;
 1642                 }
 1643                 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 1644 
 1645                 /* Now find the pdp page */
 1646                 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 1647                 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 1648 
 1649         } else {
 1650                 vm_pindex_t pml4index;
 1651                 vm_pindex_t pdpindex;
 1652                 pml4_entry_t *pml4;
 1653                 pdp_entry_t *pdp;
 1654                 pd_entry_t *pd;
 1655 
 1656                 /* Wire up a new PTE page */
 1657                 pdpindex = ptepindex >> NPDPEPGSHIFT;
 1658                 pml4index = pdpindex >> NPML4EPGSHIFT;
 1659 
 1660                 /* First, find the pdp and check that it's valid. */
 1661                 pml4 = &pmap->pm_pml4[pml4index];
 1662                 if ((*pml4 & PG_V) == 0) {
 1663                         /* Have to allocate a new pd, recurse */
 1664                         if (_pmap_allocpte(pmap, NUPDE + pdpindex,
 1665                             flags) == NULL) {
 1666                                 --m->wire_count;
 1667                                 atomic_subtract_int(&cnt.v_wire_count, 1);
 1668                                 vm_page_free_zero(m);
 1669                                 return (NULL);
 1670                         }
 1671                         pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 1672                         pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 1673                 } else {
 1674                         pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 1675                         pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 1676                         if ((*pdp & PG_V) == 0) {
 1677                                 /* Have to allocate a new pd, recurse */
 1678                                 if (_pmap_allocpte(pmap, NUPDE + pdpindex,
 1679                                     flags) == NULL) {
 1680                                         --m->wire_count;
 1681                                         atomic_subtract_int(&cnt.v_wire_count,
 1682                                             1);
 1683                                         vm_page_free_zero(m);
 1684                                         return (NULL);
 1685                                 }
 1686                         } else {
 1687                                 /* Add reference to the pd page */
 1688                                 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
 1689                                 pdpg->wire_count++;
 1690                         }
 1691                 }
 1692                 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
 1693 
 1694                 /* Now we know where the page directory page is */
 1695                 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
 1696                 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 1697         }
 1698 
 1699         pmap->pm_stats.resident_count++;
 1700 
 1701         return m;
 1702 }
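      #if 0
      /*
       * Illustrative sketch: the index arithmetic performed by
       * _pmap_allocpte() above, pulled out into a standalone helper.  The
       * function name and the "level" out-parameter are hypothetical;
       * NUPDE, NUPDPE, NPDPEPGSHIFT and NPDEPGSHIFT are the kernel's own
       * constants.  The return value is the slot in the parent table that
       * will point at the newly allocated page table page.
       */
      static vm_pindex_t
      ptp_parent_slot(vm_pindex_t ptepindex, int *level)
      {
      
              if (ptepindex >= NUPDE + NUPDPE) {
                      /* A PDP page; its parent slot is a PML4 entry. */
                      *level = 3;
                      return (ptepindex - (NUPDE + NUPDPE));
              } else if (ptepindex >= NUPDE) {
                      /* A PD page; its parent slot is a PDP entry. */
                      *level = 2;
                      return ((ptepindex - NUPDE) & ((1ul << NPDPEPGSHIFT) - 1));
              } else {
                      /* A PT page; its parent slot is a PD entry. */
                      *level = 1;
                      return (ptepindex & ((1ul << NPDEPGSHIFT) - 1));
              }
      }
      #endif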
 1703 
 1704 static vm_page_t
 1705 pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags)
 1706 {
 1707         vm_pindex_t pdpindex, ptepindex;
 1708         pdp_entry_t *pdpe;
 1709         vm_page_t pdpg;
 1710 
 1711         KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
 1712             (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
 1713             ("pmap_allocpde: flags is neither M_NOWAIT nor M_WAITOK"));
 1714 retry:
 1715         pdpe = pmap_pdpe(pmap, va);
 1716         if (pdpe != NULL && (*pdpe & PG_V) != 0) {
 1717                 /* Add a reference to the pd page. */
 1718                 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
 1719                 pdpg->wire_count++;
 1720         } else {
 1721                 /* Allocate a pd page. */
 1722                 ptepindex = pmap_pde_pindex(va);
 1723                 pdpindex = ptepindex >> NPDPEPGSHIFT;
 1724                 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, flags);
 1725                 if (pdpg == NULL && (flags & M_WAITOK))
 1726                         goto retry;
 1727         }
 1728         return (pdpg);
 1729 }
 1730 
 1731 static vm_page_t
 1732 pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
 1733 {
 1734         vm_pindex_t ptepindex;
 1735         pd_entry_t *pd;
 1736         vm_page_t m;
 1737 
 1738         KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
 1739             (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
 1740             ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
 1741 
 1742         /*
 1743          * Calculate pagetable page index
 1744          */
 1745         ptepindex = pmap_pde_pindex(va);
 1746 retry:
 1747         /*
 1748          * Get the page directory entry
 1749          */
 1750         pd = pmap_pde(pmap, va);
 1751 
 1752         /*
 1753          * This supports switching from a 2MB page to a
 1754          * normal 4K page.
 1755          */
 1756         if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
 1757                 if (!pmap_demote_pde(pmap, pd, va)) {
 1758                         /*
 1759                          * Invalidation of the 2MB page mapping may have caused
 1760                          * the deallocation of the underlying PD page.
 1761                          */
 1762                         pd = NULL;
 1763                 }
 1764         }
 1765 
 1766         /*
 1767          * If the page table page is mapped, we just increment the
 1768          * hold count, and activate it.
 1769          */
 1770         if (pd != NULL && (*pd & PG_V) != 0) {
 1771                 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
 1772                 m->wire_count++;
 1773         } else {
 1774                 /*
 1775                  * Here if the pte page isn't mapped, or if it has been
 1776                  * deallocated.
 1777                  */
 1778                 m = _pmap_allocpte(pmap, ptepindex, flags);
 1779                 if (m == NULL && (flags & M_WAITOK))
 1780                         goto retry;
 1781         }
 1782         return (m);
 1783 }
 1784 
 1785 
 1786 /***************************************************
 1787  * Pmap allocation/deallocation routines.
 1788  ***************************************************/
 1789 
 1790 /*
 1791  * Release any resources held by the given physical map.
 1792  * Called when a pmap initialized by pmap_pinit is being released.
 1793  * Should only be called if the map contains no valid mappings.
 1794  */
 1795 void
 1796 pmap_release(pmap_t pmap)
 1797 {
 1798         vm_page_t m;
 1799 
 1800         KASSERT(pmap->pm_stats.resident_count == 0,
 1801             ("pmap_release: pmap resident count %ld != 0",
 1802             pmap->pm_stats.resident_count));
 1803         KASSERT(pmap->pm_root == NULL,
 1804             ("pmap_release: pmap has reserved page table page(s)"));
 1805 
 1806         m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
 1807 
 1808         pmap->pm_pml4[KPML4I] = 0;      /* KVA */
 1809         pmap->pm_pml4[DMPML4I] = 0;     /* Direct Map */
 1810         pmap->pm_pml4[PML4PML4I] = 0;   /* Recursive Mapping */
 1811 
 1812         m->wire_count--;
 1813         atomic_subtract_int(&cnt.v_wire_count, 1);
 1814         vm_page_free_zero(m);
 1815         PMAP_LOCK_DESTROY(pmap);
 1816 }
 1817 
 1818 static int
 1819 kvm_size(SYSCTL_HANDLER_ARGS)
 1820 {
 1821         unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
 1822 
 1823         return sysctl_handle_long(oidp, &ksize, 0, req);
 1824 }
 1825 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 
 1826     0, 0, kvm_size, "LU", "Size of KVM");
 1827 
 1828 static int
 1829 kvm_free(SYSCTL_HANDLER_ARGS)
 1830 {
 1831         unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
 1832 
 1833         return sysctl_handle_long(oidp, &kfree, 0, req);
 1834 }
 1835 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 
 1836     0, 0, kvm_free, "LU", "Amount of KVM free");
 1837 
 1838 /*
 1839  * grow the number of kernel page table entries, if needed
 1840  */
 1841 void
 1842 pmap_growkernel(vm_offset_t addr)
 1843 {
 1844         vm_paddr_t paddr;
 1845         vm_page_t nkpg;
 1846         pd_entry_t *pde, newpdir;
 1847         pdp_entry_t *pdpe;
 1848 
 1849         mtx_assert(&kernel_map->system_mtx, MA_OWNED);
 1850 
 1851         /*
 1852          * Return if "addr" is within the range of kernel page table pages
 1853          * that were preallocated during pmap bootstrap.  Moreover, leave
 1854          * "kernel_vm_end" and the kernel page table as they were.
 1855          *
 1856          * The correctness of this action is based on the following
 1857          * argument: vm_map_findspace() allocates contiguous ranges of the
 1858          * kernel virtual address space.  It calls this function if a range
 1859          * ends after "kernel_vm_end".  If the kernel is mapped between
 1860          * "kernel_vm_end" and "addr", then the range cannot begin at
 1861          * "kernel_vm_end".  In fact, its beginning address cannot be less
 1862          * than the kernel.  Thus, there is no immediate need to allocate
 1863          * any new kernel page table pages between "kernel_vm_end" and
 1864          * "KERNBASE".
 1865          */
 1866         if (KERNBASE < addr && addr <= KERNBASE + NKPT * NBPDR)
 1867                 return;
 1868 
 1869         addr = roundup2(addr, NBPDR);
 1870         if (addr - 1 >= kernel_map->max_offset)
 1871                 addr = kernel_map->max_offset;
 1872         while (kernel_vm_end < addr) {
 1873                 pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
 1874                 if ((*pdpe & PG_V) == 0) {
 1875                         /* We need a new PDP entry */
 1876                         nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
 1877                             VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
 1878                             VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 1879                         if (nkpg == NULL)
 1880                                 panic("pmap_growkernel: no memory to grow kernel");
 1881                         if ((nkpg->flags & PG_ZERO) == 0)
 1882                                 pmap_zero_page(nkpg);
 1883                         paddr = VM_PAGE_TO_PHYS(nkpg);
 1884                         *pdpe = (pdp_entry_t)
 1885                                 (paddr | PG_V | PG_RW | PG_A | PG_M);
 1886                         continue; /* try again */
 1887                 }
 1888                 pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
 1889                 if ((*pde & PG_V) != 0) {
 1890                         kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
 1891                         if (kernel_vm_end - 1 >= kernel_map->max_offset) {
 1892                                 kernel_vm_end = kernel_map->max_offset;
 1893                                 break;                       
 1894                         }
 1895                         continue;
 1896                 }
 1897 
 1898                 nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
 1899                     VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
 1900                     VM_ALLOC_ZERO);
 1901                 if (nkpg == NULL)
 1902                         panic("pmap_growkernel: no memory to grow kernel");
 1903                 if ((nkpg->flags & PG_ZERO) == 0)
 1904                         pmap_zero_page(nkpg);
 1905                 paddr = VM_PAGE_TO_PHYS(nkpg);
 1906                 newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M);
 1907                 pde_store(pde, newpdir);
 1908 
 1909                 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
 1910                 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
 1911                         kernel_vm_end = kernel_map->max_offset;
 1912                         break;                       
 1913                 }
 1914         }
 1915 }
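      /*
       * Note: on amd64, NBPDR is 2MB, so roundup2(addr, NBPDR) evaluates to
       * (addr + 0x1fffff) & ~0x1fffff and the loop above grows the kernel
       * page table one page directory entry (one 2MB step) at a time,
       * allocating a new page directory page first whenever it crosses into
       * a 1GB region whose PDP entry is not yet valid.
       */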
 1916 
 1917 
 1918 /***************************************************
 1919  * Page management routines.
 1920  ***************************************************/
 1921 
 1922 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
 1923 CTASSERT(_NPCM == 3);
 1924 CTASSERT(_NPCPV == 168);
 1925 
 1926 static __inline struct pv_chunk *
 1927 pv_to_chunk(pv_entry_t pv)
 1928 {
 1929 
 1930         return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK);
 1931 }
 1932 
 1933 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
 1934 
 1935 #define PC_FREE0        0xfffffffffffffffful
 1936 #define PC_FREE1        0xfffffffffffffffful
 1937 #define PC_FREE2        0x000000fffffffffful
 1938 
 1939 static uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
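      /*
       * A pv chunk occupies a single page and holds _NPCPV (168) pv entries.
       * Their availability is tracked by _NPCM (3) 64-bit bitmap words:
       * 168 = 64 + 64 + 40, so a fully free chunk has its first two words
       * all ones and only the low 40 bits of the third word set, which is
       * exactly the PC_FREE2 value above.
       */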
 1940 
 1941 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
 1942         "Current number of pv entries");
 1943 
 1944 #ifdef PV_STATS
 1945 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
 1946 
 1947 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
 1948         "Current number of pv entry chunks");
 1949 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
 1950         "Current number of pv entry chunks allocated");
 1951 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
 1952         "Current number of pv entry chunk frees");
 1953 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
 1954         "Number of times tried to get a chunk page but failed.");
 1955 
 1956 static long pv_entry_frees, pv_entry_allocs;
 1957 static int pv_entry_spare;
 1958 
 1959 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
 1960         "Current number of pv entry frees");
 1961 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
 1962         "Current number of pv entry allocs");
 1963 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
 1964         "Current number of spare pv entries");
 1965 
 1966 static int pmap_collect_inactive, pmap_collect_active;
 1967 
 1968 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
 1969         "Current number of times pmap_collect called on inactive queue");
 1970 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
 1971         "Current number of times pmap_collect called on active queue");
 1972 #endif
 1973 
 1974 /*
 1975  * We are in a serious low memory condition.  Resort to
 1976  * drastic measures to free some pages so we can allocate
 1977  * another pv entry chunk.  This is normally called to
 1978  * unmap inactive pages, and if necessary, active pages.
 1979  *
 1980  * We do not, however, unmap 2mpages because subsequent accesses will
 1981  * allocate per-page pv entries until repromotion occurs, thereby
 1982  * exacerbating the shortage of free pv entries.
 1983  */
 1984 static void
 1985 pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
 1986 {
 1987         struct md_page *pvh;
 1988         pd_entry_t *pde;
 1989         pmap_t pmap;
 1990         pt_entry_t *pte, tpte;
 1991         pv_entry_t next_pv, pv;
 1992         vm_offset_t va;
 1993         vm_page_t m, free;
 1994 
 1995         TAILQ_FOREACH(m, &vpq->pl, pageq) {
 1996                 if ((m->flags & PG_MARKER) != 0 || m->hold_count || m->busy)
 1997                         continue;
 1998                 TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
 1999                         va = pv->pv_va;
 2000                         pmap = PV_PMAP(pv);
 2001                         /* Avoid deadlock and lock recursion. */
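                              /*
                               * Locks are taken in ascending pmap address
                               * order: a pmap that sorts above the lock
                               * already held is locked unconditionally, the
                               * already-held pmap is not relocked, and a
                               * pmap that sorts below it is only try-locked,
                               * with the pv entry skipped on failure.
                               */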
 2002                         if (pmap > locked_pmap)
 2003                                 PMAP_LOCK(pmap);
 2004                         else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
 2005                                 continue;
 2006                         pmap->pm_stats.resident_count--;
 2007                         pde = pmap_pde(pmap, va);
 2008                         KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found"
 2009                             " a 2mpage in page %p's pv list", m));
 2010                         pte = pmap_pde_to_pte(pde, va);
 2011                         tpte = pte_load_clear(pte);
 2012                         KASSERT((tpte & PG_W) == 0,
 2013                             ("pmap_collect: wired pte %#lx", tpte));
 2014                         if (tpte & PG_A)
 2015                                 vm_page_flag_set(m, PG_REFERENCED);
 2016                         if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 2017                                 vm_page_dirty(m);
 2018                         free = NULL;
 2019                         pmap_unuse_pt(pmap, va, *pde, &free);
 2020                         pmap_invalidate_page(pmap, va);
 2021                         pmap_free_zero_pages(free);
 2022                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 2023                         if (TAILQ_EMPTY(&m->md.pv_list)) {
 2024                                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 2025                                 if (TAILQ_EMPTY(&pvh->pv_list))
 2026                                         vm_page_flag_clear(m, PG_WRITEABLE);
 2027                         }
 2028                         free_pv_entry(pmap, pv);
 2029                         if (pmap != locked_pmap)
 2030                                 PMAP_UNLOCK(pmap);
 2031                 }
 2032         }
 2033 }
 2034 
 2035 
 2036 /*
 2037  * free the pv_entry back to the free list
 2038  */
 2039 static void
 2040 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 2041 {
 2042         vm_page_t m;
 2043         struct pv_chunk *pc;
 2044         int idx, field, bit;
 2045 
 2046         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2047         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2048         PV_STAT(pv_entry_frees++);
 2049         PV_STAT(pv_entry_spare++);
 2050         pv_entry_count--;
 2051         pc = pv_to_chunk(pv);
 2052         idx = pv - &pc->pc_pventry[0];
 2053         field = idx / 64;
 2054         bit = idx % 64;
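              /* For example, pv entry index 100 yields field 1, bit 36 (100 = 64 + 36). */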
 2055         pc->pc_map[field] |= 1ul << bit;
 2056         /* move to head of list */
 2057         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 2058         if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
 2059             pc->pc_map[2] != PC_FREE2) {
 2060                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 2061                 return;
 2062         }
 2063         PV_STAT(pv_entry_spare -= _NPCPV);
 2064         PV_STAT(pc_chunk_count--);
 2065         PV_STAT(pc_chunk_frees++);
 2066         /* entire chunk is free, return it */
 2067         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 2068         dump_drop_page(m->phys_addr);
 2069         vm_page_unwire(m, 0);
 2070         vm_page_free(m);
 2071 }
 2072 
 2073 /*
 2074  * get a new pv_entry, allocating a block from the system
 2075  * when needed.
 2076  */
 2077 static pv_entry_t
 2078 get_pv_entry(pmap_t pmap, boolean_t try)
 2079 {
 2080         static vm_pindex_t colour;
 2081         struct vpgqueues *pq;
 2082         int bit, field;
 2083         pv_entry_t pv;
 2084         struct pv_chunk *pc;
 2085         vm_page_t m;
 2086 
 2087         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2088         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2089         PV_STAT(pv_entry_allocs++);
 2090         pq = NULL;
 2091 retry:
 2092         pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 2093         if (pc != NULL) {
 2094                 for (field = 0; field < _NPCM; field++) {
 2095                         if (pc->pc_map[field]) {
 2096                                 bit = bsfq(pc->pc_map[field]);
 2097                                 break;
 2098                         }
 2099                 }
 2100                 if (field < _NPCM) {
 2101                         pv = &pc->pc_pventry[field * 64 + bit];
 2102                         pc->pc_map[field] &= ~(1ul << bit);
 2103                         /* If this was the last item, move it to tail */
 2104                         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
 2105                             pc->pc_map[2] == 0) {
 2106                                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 2107                                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
 2108                                     pc_list);
 2109                         }
 2110                         pv_entry_count++;
 2111                         PV_STAT(pv_entry_spare--);
 2112                         return (pv);
 2113                 }
 2114         }
 2115         /* No free items, allocate another chunk */
 2116         m = vm_page_alloc(NULL, colour, (pq == &vm_page_queues[PQ_ACTIVE] ?
 2117             VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ |
 2118             VM_ALLOC_WIRED);
 2119         if (m == NULL) {
 2120                 if (try) {
 2121                         PV_STAT(pc_chunk_tryfail++);
 2122                         return (NULL);
 2123                 }
 2124                 /*
 2125                  * Reclaim pv entries: At first, destroy mappings to inactive
 2126                  * pages.  After that, if a pv chunk entry is still needed,
 2127                  * destroy mappings to active pages.
 2128                  */
 2129                 if (pq == NULL) {
 2130                         PV_STAT(pmap_collect_inactive++);
 2131                         pq = &vm_page_queues[PQ_INACTIVE];
 2132                 } else if (pq == &vm_page_queues[PQ_INACTIVE]) {
 2133                         PV_STAT(pmap_collect_active++);
 2134                         pq = &vm_page_queues[PQ_ACTIVE];
 2135                 } else
 2136                         panic("get_pv_entry: allocation failed");
 2137                 pmap_collect(pmap, pq);
 2138                 goto retry;
 2139         }
 2140         PV_STAT(pc_chunk_count++);
 2141         PV_STAT(pc_chunk_allocs++);
 2142         colour++;
 2143         dump_add_page(m->phys_addr);
 2144         pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 2145         pc->pc_pmap = pmap;
 2146         pc->pc_map[0] = PC_FREE0 & ~1ul;        /* preallocated bit 0 */
 2147         pc->pc_map[1] = PC_FREE1;
 2148         pc->pc_map[2] = PC_FREE2;
 2149         pv = &pc->pc_pventry[0];
 2150         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 2151         pv_entry_count++;
 2152         PV_STAT(pv_entry_spare += _NPCPV - 1);
 2153         return (pv);
 2154 }
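      /*
       * A pv chunk page is never mapped anywhere special: it is reached
       * through the direct map (PHYS_TO_DMAP() above), and pv_to_chunk()
       * recovers the containing chunk simply by masking the low bits of a
       * pv entry's address.  A chunk that becomes full is moved to the tail
       * of pm_pvchunk, so the chunk at the head, if it has free entries, is
       * used before vm_page_alloc() is asked for a new page.
       */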
 2155 
 2156 /*
 2157  * First find and then remove the pv entry for the specified pmap and virtual
 2158  * address from the specified pv list.  Returns the pv entry if found and NULL
 2159  * otherwise.  This operation can be performed on pv lists for either 4KB or
 2160  * 2MB page mappings.
 2161  */
 2162 static __inline pv_entry_t
 2163 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 2164 {
 2165         pv_entry_t pv;
 2166 
 2167         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2168         TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
 2169                 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
 2170                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
 2171                         break;
 2172                 }
 2173         }
 2174         return (pv);
 2175 }
 2176 
 2177 /*
 2178  * After demotion from a 2MB page mapping to 512 4KB page mappings,
 2179  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
 2180  * entries for each of the 4KB page mappings.
 2181  */
 2182 static void
 2183 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
 2184 {
 2185         struct md_page *pvh;
 2186         pv_entry_t pv;
 2187         vm_offset_t va_last;
 2188         vm_page_t m;
 2189 
 2190         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2191         KASSERT((pa & PDRMASK) == 0,
 2192             ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
 2193 
 2194         /*
 2195          * Transfer the 2mpage's pv entry for this mapping to the first
 2196          * page's pv list.
 2197          */
 2198         pvh = pa_to_pvh(pa);
 2199         va = trunc_2mpage(va);
 2200         pv = pmap_pvh_remove(pvh, pmap, va);
 2201         KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
 2202         m = PHYS_TO_VM_PAGE(pa);
 2203         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 2204         /* Instantiate the remaining NPTEPG - 1 pv entries. */
 2205         va_last = va + NBPDR - PAGE_SIZE;
 2206         do {
 2207                 m++;
 2208                 KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
 2209                     ("pmap_pv_demote_pde: page %p is not managed", m));
 2210                 va += PAGE_SIZE;
 2211                 pmap_insert_entry(pmap, va, m);
 2212         } while (va < va_last);
 2213 }
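      /*
       * Note that only NPTEPG - 1 (511) new pv entries are created above;
       * the first 4KB mapping reuses the pv entry that was transferred from
       * the 2mpage's pv list.
       */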
 2214 
 2215 /*
 2216  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
 2217  * replace the many pv entries for the 4KB page mappings by a single pv entry
 2218  * for the 2MB page mapping.
 2219  */
 2220 static void
 2221 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
 2222 {
 2223         struct md_page *pvh;
 2224         pv_entry_t pv;
 2225         vm_offset_t va_last;
 2226         vm_page_t m;
 2227 
 2228         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2229         KASSERT((pa & PDRMASK) == 0,
 2230             ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
 2231 
 2232         /*
 2233          * Transfer the first page's pv entry for this mapping to the
 2234          * 2mpage's pv list.  Aside from avoiding the cost of a call
 2235          * to get_pv_entry(), a transfer avoids the possibility that
 2236          * get_pv_entry() calls pmap_collect() and that pmap_collect()
 2237          * removes one of the mappings that is being promoted.
 2238          */
 2239         m = PHYS_TO_VM_PAGE(pa);
 2240         va = trunc_2mpage(va);
 2241         pv = pmap_pvh_remove(&m->md, pmap, va);
 2242         KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
 2243         pvh = pa_to_pvh(pa);
 2244         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
 2245         /* Free the remaining NPTEPG - 1 pv entries. */
 2246         va_last = va + NBPDR - PAGE_SIZE;
 2247         do {
 2248                 m++;
 2249                 va += PAGE_SIZE;
 2250                 pmap_pvh_free(&m->md, pmap, va);
 2251         } while (va < va_last);
 2252 }
 2253 
 2254 /*
 2255  * First find and then destroy the pv entry for the specified pmap and virtual
 2256  * address.  This operation can be performed on pv lists for either 4KB or 2MB
 2257  * page mappings.
 2258  */
 2259 static void
 2260 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 2261 {
 2262         pv_entry_t pv;
 2263 
 2264         pv = pmap_pvh_remove(pvh, pmap, va);
 2265         KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
 2266         free_pv_entry(pmap, pv);
 2267 }
 2268 
 2269 static void
 2270 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
 2271 {
 2272         struct md_page *pvh;
 2273 
 2274         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2275         pmap_pvh_free(&m->md, pmap, va);
 2276         if (TAILQ_EMPTY(&m->md.pv_list)) {
 2277                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 2278                 if (TAILQ_EMPTY(&pvh->pv_list))
 2279                         vm_page_flag_clear(m, PG_WRITEABLE);
 2280         }
 2281 }
 2282 
 2283 /*
 2284  * Create a pv entry for page at pa for
 2285  * (pmap, va).
 2286  */
 2287 static void
 2288 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
 2289 {
 2290         pv_entry_t pv;
 2291 
 2292         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2293         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2294         pv = get_pv_entry(pmap, FALSE);
 2295         pv->pv_va = va;
 2296         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 2297 }
 2298 
 2299 /*
 2300  * Conditionally create a pv entry.
 2301  */
 2302 static boolean_t
 2303 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
 2304 {
 2305         pv_entry_t pv;
 2306 
 2307         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2308         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2309         if ((pv = get_pv_entry(pmap, TRUE)) != NULL) {
 2310                 pv->pv_va = va;
 2311                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 2312                 return (TRUE);
 2313         } else
 2314                 return (FALSE);
 2315 }
 2316 
 2317 /*
 2318  * Create the pv entry for a 2MB page mapping.
 2319  */
 2320 static boolean_t
 2321 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
 2322 {
 2323         struct md_page *pvh;
 2324         pv_entry_t pv;
 2325 
 2326         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2327         if ((pv = get_pv_entry(pmap, TRUE)) != NULL) {
 2328                 pv->pv_va = va;
 2329                 pvh = pa_to_pvh(pa);
 2330                 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
 2331                 return (TRUE);
 2332         } else
 2333                 return (FALSE);
 2334 }
 2335 
 2336 /*
 2337  * Fills a page table page with mappings to consecutive physical pages.
 2338  */
 2339 static void
 2340 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
 2341 {
 2342         pt_entry_t *pte;
 2343 
 2344         for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
 2345                 *pte = newpte;
 2346                 newpte += PAGE_SIZE;
 2347         }
 2348 }
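      /*
       * For example, if "newpte" maps physical address 0x200000, the loop
       * above leaves firstpte[0] pointing at 0x200000, firstpte[1] at
       * 0x201000, and so on up to firstpte[511] at 0x3ff000, all with the
       * same permission bits.
       */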
 2349 
 2350 /*
 2351  * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
 2352  * mapping is invalidated.
 2353  */
 2354 static boolean_t
 2355 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 2356 {
 2357         pd_entry_t newpde, oldpde;
 2358         pt_entry_t *firstpte, newpte;
 2359         vm_paddr_t mptepa;
 2360         vm_page_t free, mpte;
 2361 
 2362         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2363         oldpde = *pde;
 2364         KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
 2365             ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
 2366         mpte = pmap_lookup_pt_page(pmap, va);
 2367         if (mpte != NULL)
 2368                 pmap_remove_pt_page(pmap, mpte);
 2369         else {
 2370                 KASSERT((oldpde & PG_W) == 0,
 2371                     ("pmap_demote_pde: page table page for a wired mapping"
 2372                     " is missing"));
 2373 
 2374                 /*
 2375                  * Invalidate the 2MB page mapping and return "failure" if the
 2376                  * mapping was never accessed or the allocation of the new
 2377                  * page table page fails.  If the 2MB page mapping belongs to
 2378                  * the direct map region of the kernel's address space, then
 2379                  * the page allocation request specifies the highest possible
 2380                  * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
 2381                  * normal.  Page table pages are preallocated for every other
 2382                  * part of the kernel address space, so the direct map region
 2383                  * is the only part of the kernel address space that must be
 2384                  * handled here.
 2385                  */
 2386                 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
 2387                     pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
 2388                     DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
 2389                     VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 2390                         free = NULL;
 2391                         pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free);
 2392                         pmap_invalidate_page(pmap, trunc_2mpage(va));
 2393                         pmap_free_zero_pages(free);
 2394                         CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
 2395                             " in pmap %p", va, pmap);
 2396                         return (FALSE);
 2397                 }
 2398                 if (va < VM_MAXUSER_ADDRESS)
 2399                         pmap->pm_stats.resident_count++;
 2400         }
 2401         mptepa = VM_PAGE_TO_PHYS(mpte);
 2402         firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
 2403         newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
 2404         KASSERT((oldpde & PG_A) != 0,
 2405             ("pmap_demote_pde: oldpde is missing PG_A"));
 2406         KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
 2407             ("pmap_demote_pde: oldpde is missing PG_M"));
 2408         newpte = oldpde & ~PG_PS;
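              /*
               * The PAT bit occupies bit 12 in a 2MB PDE (PG_PDE_PAT) but
               * bit 7 in a 4KB PTE (PG_PTE_PAT), where bit 7 of a PDE is
               * PG_PS.  The exclusive-or below moves a set PAT bit from the
               * PDE position to the PTE position.
               */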
 2409         if ((newpte & PG_PDE_PAT) != 0)
 2410                 newpte ^= PG_PDE_PAT | PG_PTE_PAT;
 2411 
 2412         /*
 2413          * If the page table page is new, initialize it.
 2414          */
 2415         if (mpte->wire_count == 1) {
 2416                 mpte->wire_count = NPTEPG;
 2417                 pmap_fill_ptp(firstpte, newpte);
 2418         }
 2419         KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
 2420             ("pmap_demote_pde: firstpte and newpte map different physical"
 2421             " addresses"));
 2422 
 2423         /*
 2424          * If the mapping has changed attributes, update the page table
 2425          * entries.
 2426          */
 2427         if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
 2428                 pmap_fill_ptp(firstpte, newpte);
 2429 
 2430         /*
 2431          * Demote the mapping.  This pmap is locked.  The old PDE has
 2432          * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
 2433          * set.  Thus, there is no danger of a race with another
 2434          * processor changing the setting of PG_A and/or PG_M between
 2435          * the read above and the store below. 
 2436          */
 2437         if (workaround_erratum383)
 2438                 pmap_update_pde(pmap, va, pde, newpde);
 2439         else
 2440                 pde_store(pde, newpde);
 2441 
 2442         /*
 2443          * Invalidate a stale recursive mapping of the page table page.
 2444          */
 2445         if (va >= VM_MAXUSER_ADDRESS)
 2446                 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
 2447 
 2448         /*
 2449          * Demote the pv entry.  This depends on the earlier demotion
 2450          * of the mapping.  Specifically, the (re)creation of a per-
 2451          * page pv entry might trigger the execution of pmap_collect(),
 2452          * which might reclaim a newly (re)created per-page pv entry
 2453          * and destroy the associated mapping.  In order to destroy
 2454          * the mapping, the PDE must have already changed from mapping
 2455          * the 2mpage to referencing the page table page.
 2456          */
 2457         if ((oldpde & PG_MANAGED) != 0)
 2458                 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
 2459 
 2460         pmap_pde_demotions++;
 2461         CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
 2462             " in pmap %p", va, pmap);
 2463         return (TRUE);
 2464 }
 2465 
 2466 /*
 2467  * pmap_remove_pde: do the things to unmap a superpage in a process
 2468  */
 2469 static int
 2470 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
 2471     vm_page_t *free)
 2472 {
 2473         struct md_page *pvh;
 2474         pd_entry_t oldpde;
 2475         vm_offset_t eva, va;
 2476         vm_page_t m, mpte;
 2477 
 2478         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2479         KASSERT((sva & PDRMASK) == 0,
 2480             ("pmap_remove_pde: sva is not 2mpage aligned"));
 2481         oldpde = pte_load_clear(pdq);
 2482         if (oldpde & PG_W)
 2483                 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
 2484 
 2485         /*
 2486          * Machines that don't support invlpg also don't support
 2487          * PG_G.
 2488          */
 2489         if (oldpde & PG_G)
 2490                 pmap_invalidate_page(kernel_pmap, sva);
 2491         pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
 2492         if (oldpde & PG_MANAGED) {
 2493                 pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
 2494                 pmap_pvh_free(pvh, pmap, sva);
 2495                 eva = sva + NBPDR;
 2496                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 2497                     va < eva; va += PAGE_SIZE, m++) {
 2498                         if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
 2499                                 vm_page_dirty(m);
 2500                         if (oldpde & PG_A)
 2501                                 vm_page_flag_set(m, PG_REFERENCED);
 2502                         if (TAILQ_EMPTY(&m->md.pv_list) &&
 2503                             TAILQ_EMPTY(&pvh->pv_list))
 2504                                 vm_page_flag_clear(m, PG_WRITEABLE);
 2505                 }
 2506         }
 2507         if (pmap == kernel_pmap) {
 2508                 if (!pmap_demote_pde(pmap, pdq, sva))
 2509                         panic("pmap_remove_pde: failed demotion");
 2510         } else {
 2511                 mpte = pmap_lookup_pt_page(pmap, sva);
 2512                 if (mpte != NULL) {
 2513                         pmap_remove_pt_page(pmap, mpte);
 2514                         pmap->pm_stats.resident_count--;
 2515                         KASSERT(mpte->wire_count == NPTEPG,
 2516                             ("pmap_remove_pde: pte page wire count error"));
 2517                         mpte->wire_count = 0;
 2518                         pmap_add_delayed_free_list(mpte, free, FALSE);
 2519                         atomic_subtract_int(&cnt.v_wire_count, 1);
 2520                 }
 2521         }
 2522         return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
 2523 }
 2524 
 2525 /*
 2526  * pmap_remove_pte: do the things to unmap a page in a process
 2527  */
 2528 static int
 2529 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 
 2530     pd_entry_t ptepde, vm_page_t *free)
 2531 {
 2532         pt_entry_t oldpte;
 2533         vm_page_t m;
 2534 
 2535         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2536         oldpte = pte_load_clear(ptq);
 2537         if (oldpte & PG_W)
 2538                 pmap->pm_stats.wired_count -= 1;
 2539         pmap->pm_stats.resident_count -= 1;
 2540         if (oldpte & PG_MANAGED) {
 2541                 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
 2542                 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 2543                         vm_page_dirty(m);
 2544                 if (oldpte & PG_A)
 2545                         vm_page_flag_set(m, PG_REFERENCED);
 2546                 pmap_remove_entry(pmap, m, va);
 2547         }
 2548         return (pmap_unuse_pt(pmap, va, ptepde, free));
 2549 }
 2550 
 2551 /*
 2552  * Remove a single page from a process address space
 2553  */
 2554 static void
 2555 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free)
 2556 {
 2557         pt_entry_t *pte;
 2558 
 2559         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2560         if ((*pde & PG_V) == 0)
 2561                 return;
 2562         pte = pmap_pde_to_pte(pde, va);
 2563         if ((*pte & PG_V) == 0)
 2564                 return;
 2565         pmap_remove_pte(pmap, pte, va, *pde, free);
 2566         pmap_invalidate_page(pmap, va);
 2567 }
 2568 
 2569 /*
 2570  *      Remove the given range of addresses from the specified map.
 2571  *
 2572  *      It is assumed that the start and end are properly
 2573  *      rounded to the page size.
 2574  */
 2575 void
 2576 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 2577 {
 2578         vm_offset_t va, va_next;
 2579         pml4_entry_t *pml4e;
 2580         pdp_entry_t *pdpe;
 2581         pd_entry_t ptpaddr, *pde;
 2582         pt_entry_t *pte;
 2583         vm_page_t free = NULL;
 2584         int anyvalid;
 2585 
 2586         /*
 2587          * Perform an unsynchronized read.  This is, however, safe.
 2588          */
 2589         if (pmap->pm_stats.resident_count == 0)
 2590                 return;
 2591 
 2592         anyvalid = 0;
 2593 
 2594         vm_page_lock_queues();
 2595         PMAP_LOCK(pmap);
 2596 
 2597         /*
 2598          * Special handling for the removal of a single page.  This is a
 2599          * very common operation, and it is easy to short-circuit some
 2600          * code for it.
 2601          */
 2602         if (sva + PAGE_SIZE == eva) {
 2603                 pde = pmap_pde(pmap, sva);
 2604                 if (pde && (*pde & PG_PS) == 0) {
 2605                         pmap_remove_page(pmap, sva, pde, &free);
 2606                         goto out;
 2607                 }
 2608         }
 2609 
 2610         for (; sva < eva; sva = va_next) {
 2611 
 2612                 if (pmap->pm_stats.resident_count == 0)
 2613                         break;
 2614 
 2615                 pml4e = pmap_pml4e(pmap, sva);
 2616                 if ((*pml4e & PG_V) == 0) {
 2617                         va_next = (sva + NBPML4) & ~PML4MASK;
 2618                         if (va_next < sva)
 2619                                 va_next = eva;
 2620                         continue;
 2621                 }
 2622 
 2623                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 2624                 if ((*pdpe & PG_V) == 0) {
 2625                         va_next = (sva + NBPDP) & ~PDPMASK;
 2626                         if (va_next < sva)
 2627                                 va_next = eva;
 2628                         continue;
 2629                 }
 2630 
 2631                 /*
 2632                  * Calculate index for next page table.
 2633                  */
 2634                 va_next = (sva + NBPDR) & ~PDRMASK;
 2635                 if (va_next < sva)
 2636                         va_next = eva;
 2637 
 2638                 pde = pmap_pdpe_to_pde(pdpe, sva);
 2639                 ptpaddr = *pde;
 2640 
 2641                 /*
 2642                  * Weed out invalid mappings.
 2643                  */
 2644                 if (ptpaddr == 0)
 2645                         continue;
 2646 
 2647                 /*
 2648                  * Check for large page.
 2649                  */
 2650                 if ((ptpaddr & PG_PS) != 0) {
 2651                         /*
 2652                          * Are we removing the entire large page?  If not,
 2653                          * demote the mapping and fall through.
 2654                          */
 2655                         if (sva + NBPDR == va_next && eva >= va_next) {
 2656                                 /*
 2657                                  * The TLB entry for a PG_G mapping is
 2658                                  * invalidated by pmap_remove_pde().
 2659                                  */
 2660                                 if ((ptpaddr & PG_G) == 0)
 2661                                         anyvalid = 1;
 2662                                 pmap_remove_pde(pmap, pde, sva, &free);
 2663                                 continue;
 2664                         } else if (!pmap_demote_pde(pmap, pde, sva)) {
 2665                                 /* The large page mapping was destroyed. */
 2666                                 continue;
 2667                         } else
 2668                                 ptpaddr = *pde;
 2669                 }
 2670 
 2671                 /*
 2672                  * Limit our scan to either the end of the va represented
 2673                  * by the current page table page, or to the end of the
 2674                  * range being removed.
 2675                  */
 2676                 if (va_next > eva)
 2677                         va_next = eva;
 2678 
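                      /*
                       * "va" marks the start of a run of removed global
                       * (PG_G) mappings.  Such mappings are not necessarily
                       * flushed by the full invalidation triggered when
                       * "anyvalid" is set, so each run is invalidated
                       * explicitly with pmap_invalidate_range() below.
                       */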
 2679                 va = va_next;
 2680                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 2681                     sva += PAGE_SIZE) {
 2682                         if (*pte == 0) {
 2683                                 if (va != va_next) {
 2684                                         pmap_invalidate_range(pmap, va, sva);
 2685                                         va = va_next;
 2686                                 }
 2687                                 continue;
 2688                         }
 2689                         if ((*pte & PG_G) == 0)
 2690                                 anyvalid = 1;
 2691                         else if (va == va_next)
 2692                                 va = sva;
 2693                         if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free)) {
 2694                                 sva += PAGE_SIZE;
 2695                                 break;
 2696                         }
 2697                 }
 2698                 if (va != va_next)
 2699                         pmap_invalidate_range(pmap, va, sva);
 2700         }
 2701 out:
 2702         if (anyvalid)
 2703                 pmap_invalidate_all(pmap);
 2704         vm_page_unlock_queues();        
 2705         PMAP_UNLOCK(pmap);
 2706         pmap_free_zero_pages(free);
 2707 }
 2708 
 2709 /*
 2710  *      Routine:        pmap_remove_all
 2711  *      Function:
 2712  *              Removes this physical page from
 2713  *              all physical maps in which it resides.
 2714  *              Reflects back modify bits to the pager.
 2715  *
 2716  *      Notes:
 2717  *              Original versions of this routine were very
 2718  *              inefficient because they iteratively called
 2719  *              pmap_remove (slow...)
 2720  */
 2721 
 2722 void
 2723 pmap_remove_all(vm_page_t m)
 2724 {
 2725         struct md_page *pvh;
 2726         pv_entry_t pv;
 2727         pmap_t pmap;
 2728         pt_entry_t *pte, tpte;
 2729         pd_entry_t *pde;
 2730         vm_offset_t va;
 2731         vm_page_t free;
 2732 
 2733         KASSERT((m->flags & PG_FICTITIOUS) == 0,
 2734             ("pmap_remove_all: page %p is fictitious", m));
 2735         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2736         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 2737         while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
 2738                 va = pv->pv_va;
 2739                 pmap = PV_PMAP(pv);
 2740                 PMAP_LOCK(pmap);
 2741                 pde = pmap_pde(pmap, va);
 2742                 (void)pmap_demote_pde(pmap, pde, va);
 2743                 PMAP_UNLOCK(pmap);
 2744         }
 2745         while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 2746                 pmap = PV_PMAP(pv);
 2747                 PMAP_LOCK(pmap);
 2748                 pmap->pm_stats.resident_count--;
 2749                 pde = pmap_pde(pmap, pv->pv_va);
 2750                 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
 2751                     " a 2mpage in page %p's pv list", m));
 2752                 pte = pmap_pde_to_pte(pde, pv->pv_va);
 2753                 tpte = pte_load_clear(pte);
 2754                 if (tpte & PG_W)
 2755                         pmap->pm_stats.wired_count--;
 2756                 if (tpte & PG_A)
 2757                         vm_page_flag_set(m, PG_REFERENCED);
 2758 
 2759                 /*
 2760                  * Update the vm_page_t clean and reference bits.
 2761                  */
 2762                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 2763                         vm_page_dirty(m);
 2764                 free = NULL;
 2765                 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
 2766                 pmap_invalidate_page(pmap, pv->pv_va);
 2767                 pmap_free_zero_pages(free);
 2768                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 2769                 free_pv_entry(pmap, pv);
 2770                 PMAP_UNLOCK(pmap);
 2771         }
 2772         vm_page_flag_clear(m, PG_WRITEABLE);
 2773 }
 2774 
 2775 /*
 2776  * pmap_protect_pde: do the things to protect a 2mpage in a process
 2777  */
 2778 static boolean_t
 2779 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
 2780 {
 2781         pd_entry_t newpde, oldpde;
 2782         vm_offset_t eva, va;
 2783         vm_page_t m;
 2784         boolean_t anychanged;
 2785 
 2786         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2787         KASSERT((sva & PDRMASK) == 0,
 2788             ("pmap_protect_pde: sva is not 2mpage aligned"));
 2789         anychanged = FALSE;
 2790 retry:
 2791         oldpde = newpde = *pde;
 2792         if (oldpde & PG_MANAGED) {
 2793                 eva = sva + NBPDR;
 2794                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 2795                     va < eva; va += PAGE_SIZE, m++) {
 2796                         /*
 2797                          * In contrast to the analogous operation on a 4KB page
 2798                          * mapping, the mapping's PG_A flag is not cleared and
 2799                          * the page's PG_REFERENCED flag is not set.  The
 2800                          * reason is that pmap_demote_pde() expects that a 2MB
 2801                          * page mapping with a stored page table page has PG_A
 2802                          * set.
 2803                          */
 2804                         if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
 2805                                 vm_page_dirty(m);
 2806                 }
 2807         }
 2808         if ((prot & VM_PROT_WRITE) == 0)
 2809                 newpde &= ~(PG_RW | PG_M);
 2810         if ((prot & VM_PROT_EXECUTE) == 0)
 2811                 newpde |= pg_nx;
 2812         if (newpde != oldpde) {
 2813                 if (!atomic_cmpset_long(pde, oldpde, newpde))
 2814                         goto retry;
 2815                 if (oldpde & PG_G)
 2816                         pmap_invalidate_page(pmap, sva);
 2817                 else
 2818                         anychanged = TRUE;
 2819         }
 2820         return (anychanged);
 2821 }
 2822 
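/*
 * pmap_protect_pde() above modifies a live PDE with the lock-free
 * compare-and-swap retry idiom: reread the entry and retry whenever
 * another CPU changed it between the load and the swap.  A minimal
 * sketch of the same idiom on a generic 64-bit entry, using a
 * hypothetical helper name; it assumes only that atomic_cmpset_long()
 * returns non-zero when the swap succeeded.
 */
static __inline void
sk_clear_entry_bits(volatile u_long *entry, u_long bits)
{
        u_long old;

        do {
                old = *entry;           /* snapshot the current entry */
        } while (!atomic_cmpset_long(entry, old, old & ~bits));
}
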
 2823 /*
 2824  *      Set the physical protection on the
 2825  *      specified range of this map as requested.
 2826  */
 2827 void
 2828 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 2829 {
 2830         vm_offset_t va_next;
 2831         pml4_entry_t *pml4e;
 2832         pdp_entry_t *pdpe;
 2833         pd_entry_t ptpaddr, *pde;
 2834         pt_entry_t *pte;
 2835         int anychanged;
 2836 
 2837         if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
 2838                 pmap_remove(pmap, sva, eva);
 2839                 return;
 2840         }
 2841 
 2842         if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
 2843             (VM_PROT_WRITE|VM_PROT_EXECUTE))
 2844                 return;
 2845 
 2846         anychanged = 0;
 2847 
 2848         vm_page_lock_queues();
 2849         PMAP_LOCK(pmap);
 2850         for (; sva < eva; sva = va_next) {
 2851 
 2852                 pml4e = pmap_pml4e(pmap, sva);
 2853                 if ((*pml4e & PG_V) == 0) {
 2854                         va_next = (sva + NBPML4) & ~PML4MASK;
 2855                         if (va_next < sva)
 2856                                 va_next = eva;
 2857                         continue;
 2858                 }
 2859 
 2860                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 2861                 if ((*pdpe & PG_V) == 0) {
 2862                         va_next = (sva + NBPDP) & ~PDPMASK;
 2863                         if (va_next < sva)
 2864                                 va_next = eva;
 2865                         continue;
 2866                 }
 2867 
 2868                 va_next = (sva + NBPDR) & ~PDRMASK;
 2869                 if (va_next < sva)
 2870                         va_next = eva;
 2871 
 2872                 pde = pmap_pdpe_to_pde(pdpe, sva);
 2873                 ptpaddr = *pde;
 2874 
 2875                 /*
 2876                  * Weed out invalid mappings.
 2877                  */
 2878                 if (ptpaddr == 0)
 2879                         continue;
 2880 
 2881                 /*
 2882                  * Check for large page.
 2883                  */
 2884                 if ((ptpaddr & PG_PS) != 0) {
 2885                         /*
 2886                          * Are we protecting the entire large page?  If not,
 2887                          * demote the mapping and fall through.
 2888                          */
 2889                         if (sva + NBPDR == va_next && eva >= va_next) {
 2890                                 /*
 2891                                  * The TLB entry for a PG_G mapping is
 2892                                  * invalidated by pmap_protect_pde().
 2893                                  */
 2894                                 if (pmap_protect_pde(pmap, pde, sva, prot))
 2895                                         anychanged = 1;
 2896                                 continue;
 2897                         } else if (!pmap_demote_pde(pmap, pde, sva)) {
 2898                                 /* The large page mapping was destroyed. */
 2899                                 continue;
 2900                         }
 2901                 }
 2902 
 2903                 if (va_next > eva)
 2904                         va_next = eva;
 2905 
 2906                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 2907                     sva += PAGE_SIZE) {
 2908                         pt_entry_t obits, pbits;
 2909                         vm_page_t m;
 2910 
 2911 retry:
 2912                         obits = pbits = *pte;
 2913                         if ((pbits & PG_V) == 0)
 2914                                 continue;
 2915                         if (pbits & PG_MANAGED) {
 2916                                 m = NULL;
 2917                                 if (pbits & PG_A) {
 2918                                         m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
 2919                                         vm_page_flag_set(m, PG_REFERENCED);
 2920                                         pbits &= ~PG_A;
 2921                                 }
 2922                                 if ((pbits & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 2923                                         if (m == NULL)
 2924                                                 m = PHYS_TO_VM_PAGE(pbits &
 2925                                                     PG_FRAME);
 2926                                         vm_page_dirty(m);
 2927                                 }
 2928                         }
 2929 
 2930                         if ((prot & VM_PROT_WRITE) == 0)
 2931                                 pbits &= ~(PG_RW | PG_M);
 2932                         if ((prot & VM_PROT_EXECUTE) == 0)
 2933                                 pbits |= pg_nx;
 2934 
 2935                         if (pbits != obits) {
 2936                                 if (!atomic_cmpset_long(pte, obits, pbits))
 2937                                         goto retry;
 2938                                 if (obits & PG_G)
 2939                                         pmap_invalidate_page(pmap, sva);
 2940                                 else
 2941                                         anychanged = 1;
 2942                         }
 2943                 }
 2944         }
 2945         if (anychanged)
 2946                 pmap_invalidate_all(pmap);
 2947         vm_page_unlock_queues();
 2948         PMAP_UNLOCK(pmap);
 2949 }
 2950 
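/*
 * pmap_protect() above (like pmap_remove() and pmap_copy()) walks the
 * address space one page-directory slot at a time: "va_next" is the
 * next 2MB boundary, and a wrap past the top of the address space is
 * clamped to "eva".  The same arithmetic as a standalone sketch,
 * assuming NBPDR is 2MB and PDRMASK is NBPDR - 1:
 */
#include <stdint.h>

#define SK_NBPDR        (1UL << 21)             /* 2MB page-directory span */
#define SK_PDRMASK      (SK_NBPDR - 1)

static uint64_t
sk_next_2m_boundary(uint64_t sva, uint64_t eva)
{
        uint64_t va_next;

        va_next = (sva + SK_NBPDR) & ~SK_PDRMASK;
        if (va_next < sva)              /* wrapped past the address-space top */
                va_next = eva;
        return (va_next);
}
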
 2951 /*
 2952  * Tries to promote the 512, contiguous 4KB page mappings that are within a
 2953  * single page table page (PTP) to a single 2MB page mapping.  For promotion
 2954  * to occur, two conditions must be met: (1) the 4KB page mappings must map
 2955  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
 2956  * identical characteristics. 
 2957  */
 2958 static void
 2959 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 2960 {
 2961         pd_entry_t newpde;
 2962         pt_entry_t *firstpte, oldpte, pa, *pte;
 2963         vm_offset_t oldpteva;
 2964         vm_page_t mpte;
 2965 
 2966         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2967 
 2968         /*
 2969          * Examine the first PTE in the specified PTP.  Abort if this PTE is
 2970          * either invalid, unused, or does not map the first 4KB physical page
 2971          * within a 2MB page. 
 2972          */
 2973         firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
 2974 setpde:
 2975         newpde = *firstpte;
 2976         if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
 2977                 pmap_pde_p_failures++;
 2978                 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 2979                     " in pmap %p", va, pmap);
 2980                 return;
 2981         }
 2982         if ((newpde & (PG_M | PG_RW)) == PG_RW) {
 2983                 /*
 2984                  * When PG_M is already clear, PG_RW can be cleared without
 2985                  * a TLB invalidation.
 2986                  */
 2987                 if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
 2988                         goto setpde;
 2989                 newpde &= ~PG_RW;
 2990         }
 2991 
 2992         /*
 2993          * Examine each of the other PTEs in the specified PTP.  Abort if this
 2994          * PTE maps an unexpected 4KB physical page or does not have identical
 2995          * characteristics to the first PTE.
 2996          */
 2997         pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
 2998         for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
 2999 setpte:
 3000                 oldpte = *pte;
 3001                 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
 3002                         pmap_pde_p_failures++;
 3003                         CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 3004                             " in pmap %p", va, pmap);
 3005                         return;
 3006                 }
 3007                 if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
 3008                         /*
 3009                          * When PG_M is already clear, PG_RW can be cleared
 3010                          * without a TLB invalidation.
 3011                          */
 3012                         if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
 3013                                 goto setpte;
 3014                         oldpte &= ~PG_RW;
 3015                         oldpteva = (oldpte & PG_FRAME & PDRMASK) |
 3016                             (va & ~PDRMASK);
 3017                         CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
 3018                             " in pmap %p", oldpteva, pmap);
 3019                 }
 3020                 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
 3021                         pmap_pde_p_failures++;
 3022                         CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 3023                             " in pmap %p", va, pmap);
 3024                         return;
 3025                 }
 3026                 pa -= PAGE_SIZE;
 3027         }
 3028 
 3029         /*
 3030          * Save the page table page in its current state until the PDE
 3031          * mapping the superpage is demoted by pmap_demote_pde() or
 3032          * destroyed by pmap_remove_pde(). 
 3033          */
 3034         mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 3035         KASSERT(mpte >= vm_page_array &&
 3036             mpte < &vm_page_array[vm_page_array_size],
 3037             ("pmap_promote_pde: page table page is out of range"));
 3038         KASSERT(mpte->pindex == pmap_pde_pindex(va),
 3039             ("pmap_promote_pde: page table page's pindex is wrong"));
 3040         pmap_insert_pt_page(pmap, mpte);
 3041 
 3042         /*
 3043          * Promote the pv entries.
 3044          */
 3045         if ((newpde & PG_MANAGED) != 0)
 3046                 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
 3047 
 3048         /*
 3049          * Propagate the PAT index to its proper position.
 3050          */
 3051         if ((newpde & PG_PTE_PAT) != 0)
 3052                 newpde ^= PG_PDE_PAT | PG_PTE_PAT;
 3053 
 3054         /*
 3055          * Map the superpage.
 3056          */
 3057         if (workaround_erratum383)
 3058                 pmap_update_pde(pmap, va, pde, PG_PS | newpde);
 3059         else
 3060                 pde_store(pde, PG_PS | newpde);
 3061 
 3062         pmap_pde_promotions++;
 3063         CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
 3064             " in pmap %p", va, pmap);
 3065 }
 3066 
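/*
 * The "newpde ^= PG_PDE_PAT | PG_PTE_PAT" step in pmap_promote_pde()
 * above relocates the PAT selector: in a 4KB PTE the PAT bit is bit 7,
 * but in a 2MB PDE bit 7 becomes PG_PS and the PAT bit moves to bit 12.
 * When the PTE-format PAT bit is set, XORing with both masks clears
 * bit 7 and sets bit 12 in one step.  A sketch with the architectural
 * bit positions spelled out:
 */
#include <stdint.h>

static uint64_t
sk_pte_pat_to_pde_pat(uint64_t newpde)
{
        const uint64_t sk_pte_pat = 1ULL << 7;  /* PAT bit in a 4KB PTE */
        const uint64_t sk_pde_pat = 1ULL << 12; /* PAT bit in a 2MB PDE */

        if (newpde & sk_pte_pat)
                newpde ^= sk_pde_pat | sk_pte_pat;
        return (newpde);
}
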
 3067 /*
 3068  *      Insert the given physical page (p) at
 3069  *      the specified virtual address (v) in the
 3070  *      target physical map with the protection requested.
 3071  *
 3072  *      If specified, the page will be wired down, meaning
 3073  *      that the related pte cannot be reclaimed.
 3074  *
 3075  *      NB:  This is the only routine which MAY NOT lazy-evaluate
 3076  *      or lose information.  That is, this routine must actually
 3077  *      insert this page into the given map NOW.
 3078  */
 3079 void
 3080 pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
 3081     vm_prot_t prot, boolean_t wired)
 3082 {
 3083         vm_paddr_t pa;
 3084         pd_entry_t *pde;
 3085         pt_entry_t *pte;
 3086         vm_paddr_t opa;
 3087         pt_entry_t origpte, newpte;
 3088         vm_page_t mpte, om;
 3089         boolean_t invlva;
 3090 
 3091         va = trunc_page(va);
 3092         KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
 3093         KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
 3094             ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va));
 3095 
 3096         mpte = NULL;
 3097 
 3098         vm_page_lock_queues();
 3099         PMAP_LOCK(pmap);
 3100 
 3101         /*
 3102          * In the case that a page table page is not
 3103          * resident, we are creating it here.
 3104          */
 3105         if (va < VM_MAXUSER_ADDRESS) {
 3106                 mpte = pmap_allocpte(pmap, va, M_WAITOK);
 3107         }
 3108 
 3109         pde = pmap_pde(pmap, va);
 3110         if (pde != NULL && (*pde & PG_V) != 0) {
 3111                 if ((*pde & PG_PS) != 0)
 3112                         panic("pmap_enter: attempted pmap_enter on 2MB page");
 3113                 pte = pmap_pde_to_pte(pde, va);
 3114         } else
 3115                 panic("pmap_enter: invalid page directory va=%#lx", va);
 3116 
 3117         pa = VM_PAGE_TO_PHYS(m);
 3118         om = NULL;
 3119         origpte = *pte;
 3120         opa = origpte & PG_FRAME;
 3121 
 3122         /*
 3123          * Mapping has not changed, must be protection or wiring change.
 3124          */
 3125         if (origpte && (opa == pa)) {
 3126                 /*
 3127                  * Wiring change, just update stats. We don't worry about
 3128                  * wiring PT pages as they remain resident as long as there
 3129                  * are valid mappings in them. Hence, if a user page is wired,
 3130                  * the PT page will be also.
 3131                  */
 3132                 if (wired && ((origpte & PG_W) == 0))
 3133                         pmap->pm_stats.wired_count++;
 3134                 else if (!wired && (origpte & PG_W))
 3135                         pmap->pm_stats.wired_count--;
 3136 
 3137                 /*
 3138                  * Remove extra pte reference
 3139                  */
 3140                 if (mpte)
 3141                         mpte->wire_count--;
 3142 
 3143                 /*
 3144                  * We might be turning off write access to the page,
 3145                  * so we go ahead and sense modify status.
 3146                  */
 3147                 if (origpte & PG_MANAGED) {
 3148                         om = m;
 3149                         pa |= PG_MANAGED;
 3150                 }
 3151                 goto validate;
 3152         } 
 3153         /*
 3154          * Mapping has changed, invalidate old range and fall through to
 3155          * handle validating new mapping.
 3156          */
 3157         if (opa) {
 3158                 if (origpte & PG_W)
 3159                         pmap->pm_stats.wired_count--;
 3160                 if (origpte & PG_MANAGED) {
 3161                         om = PHYS_TO_VM_PAGE(opa);
 3162                         pmap_remove_entry(pmap, om, va);
 3163                 }
 3164                 if (mpte != NULL) {
 3165                         mpte->wire_count--;
 3166                         KASSERT(mpte->wire_count > 0,
 3167                             ("pmap_enter: missing reference to page table page,"
 3168                              " va: 0x%lx", va));
 3169                 }
 3170         } else
 3171                 pmap->pm_stats.resident_count++;
 3172 
 3173         /*
 3174          * Enter on the PV list if part of our managed memory.
 3175          */
 3176         if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
 3177                 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
 3178                     ("pmap_enter: managed mapping within the clean submap"));
 3179                 pmap_insert_entry(pmap, va, m);
 3180                 pa |= PG_MANAGED;
 3181         }
 3182 
 3183         /*
 3184          * Increment counters
 3185          */
 3186         if (wired)
 3187                 pmap->pm_stats.wired_count++;
 3188 
 3189 validate:
 3190         /*
 3191          * Now validate mapping with desired protection/wiring.
 3192          */
 3193         newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
 3194         if ((prot & VM_PROT_WRITE) != 0) {
 3195                 newpte |= PG_RW;
 3196                 vm_page_flag_set(m, PG_WRITEABLE);
 3197         }
 3198         if ((prot & VM_PROT_EXECUTE) == 0)
 3199                 newpte |= pg_nx;
 3200         if (wired)
 3201                 newpte |= PG_W;
 3202         if (va < VM_MAXUSER_ADDRESS)
 3203                 newpte |= PG_U;
 3204         if (pmap == kernel_pmap)
 3205                 newpte |= PG_G;
 3206 
 3207         /*
 3208          * if the mapping or permission bits are different, we need
 3209          * to update the pte.
 3210          */
 3211         if ((origpte & ~(PG_M|PG_A)) != newpte) {
 3212                 newpte |= PG_A;
 3213                 if ((access & VM_PROT_WRITE) != 0)
 3214                         newpte |= PG_M;
 3215                 if (origpte & PG_V) {
 3216                         invlva = FALSE;
 3217                         origpte = pte_load_store(pte, newpte);
 3218                         if (origpte & PG_A) {
 3219                                 if (origpte & PG_MANAGED)
 3220                                         vm_page_flag_set(om, PG_REFERENCED);
 3221                                 if (opa != VM_PAGE_TO_PHYS(m) || ((origpte &
 3222                                     PG_NX) == 0 && (newpte & PG_NX)))
 3223                                         invlva = TRUE;
 3224                         }
 3225                         if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 3226                                 if ((origpte & PG_MANAGED) != 0)
 3227                                         vm_page_dirty(om);
 3228                                 if ((newpte & PG_RW) == 0)
 3229                                         invlva = TRUE;
 3230                         }
 3231                         if (invlva)
 3232                                 pmap_invalidate_page(pmap, va);
 3233                 } else
 3234                         pte_store(pte, newpte);
 3235         }
 3236 
 3237         /*
 3238          * If both the page table page and the reservation are fully
 3239          * populated, then attempt promotion.
 3240          */
 3241         if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
 3242             pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0)
 3243                 pmap_promote_pde(pmap, pde, va);
 3244 
 3245         vm_page_unlock_queues();
 3246         PMAP_UNLOCK(pmap);
 3247 }
 3248 
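/*
 * pmap_enter() above translates (prot, wired, user/kernel) into PTE
 * bits: PG_RW for write access, the NX bit when execute permission is
 * denied, PG_U for user addresses, PG_G for kernel_pmap mappings, and
 * a software-defined wired bit.  A condensed sketch of that mapping;
 * SK_PG_W stands in for whichever software-available bit the pmap
 * reserves for wiring, the rest are architectural positions.
 */
#include <stdbool.h>
#include <stdint.h>

#define SK_PG_V         (1UL << 0)      /* valid */
#define SK_PG_RW        (1UL << 1)      /* writable */
#define SK_PG_U         (1UL << 2)      /* user accessible */
#define SK_PG_G         (1UL << 8)      /* global (kernel) mapping */
#define SK_PG_W         (1UL << 9)      /* assumed software "wired" bit */
#define SK_PG_NX        (1ULL << 63)    /* no-execute */

static uint64_t
sk_prot_to_pte_bits(bool writable, bool executable, bool user, bool global,
    bool wired)
{
        uint64_t bits = SK_PG_V;

        if (writable)
                bits |= SK_PG_RW;
        if (!executable)
                bits |= SK_PG_NX;
        if (wired)
                bits |= SK_PG_W;
        if (user)
                bits |= SK_PG_U;
        if (global)
                bits |= SK_PG_G;
        return (bits);
}
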
 3249 /*
 3250  * Tries to create a 2MB page mapping.  Returns TRUE if successful and FALSE
 3251  * otherwise.  Fails if (1) a page table page cannot be allocated without
 3252  * blocking, (2) a mapping already exists at the specified virtual address, or
 3253  * (3) a pv entry cannot be allocated without reclaiming another pv entry. 
 3254  */
 3255 static boolean_t
 3256 pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 3257 {
 3258         pd_entry_t *pde, newpde;
 3259         vm_page_t free, mpde;
 3260 
 3261         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 3262         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3263         if ((mpde = pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) {
 3264                 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 3265                     " in pmap %p", va, pmap);
 3266                 return (FALSE);
 3267         }
 3268         pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde));
 3269         pde = &pde[pmap_pde_index(va)];
 3270         if ((*pde & PG_V) != 0) {
 3271                 KASSERT(mpde->wire_count > 1,
 3272                     ("pmap_enter_pde: mpde's wire count is too low"));
 3273                 mpde->wire_count--;
 3274                 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 3275                     " in pmap %p", va, pmap);
 3276                 return (FALSE);
 3277         }
 3278         newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
 3279             PG_PS | PG_V;
 3280         if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
 3281                 newpde |= PG_MANAGED;
 3282 
 3283                 /*
 3284                  * Abort this mapping if its PV entry could not be created.
 3285                  */
 3286                 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
 3287                         free = NULL;
 3288                         if (pmap_unwire_pte_hold(pmap, va, mpde, &free)) {
 3289                                 pmap_invalidate_page(pmap, va);
 3290                                 pmap_free_zero_pages(free);
 3291                         }
 3292                         CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 3293                             " in pmap %p", va, pmap);
 3294                         return (FALSE);
 3295                 }
 3296         }
 3297         if ((prot & VM_PROT_EXECUTE) == 0)
 3298                 newpde |= pg_nx;
 3299         if (va < VM_MAXUSER_ADDRESS)
 3300                 newpde |= PG_U;
 3301 
 3302         /*
 3303          * Increment counters.
 3304          */
 3305         pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
 3306 
 3307         /*
 3308          * Map the superpage.
 3309          */
 3310         pde_store(pde, newpde);
 3311 
 3312         pmap_pde_mappings++;
 3313         CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
 3314             " in pmap %p", va, pmap);
 3315         return (TRUE);
 3316 }
 3317 
 3318 /*
 3319  * Maps a sequence of resident pages belonging to the same object.
 3320  * The sequence begins with the given page m_start.  This page is
 3321  * mapped at the given virtual address start.  Each subsequent page is
 3322  * mapped at a virtual address that is offset from start by the same
 3323  * amount as the page is offset from m_start within the object.  The
 3324  * last page in the sequence is the page with the largest offset from
 3325  * m_start that can be mapped at a virtual address less than the given
 3326  * virtual address end.  Not every virtual page between start and end
 3327  * is mapped; only those for which a resident page exists with the
 3328  * corresponding offset from m_start are mapped.
 3329  */
 3330 void
 3331 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
 3332     vm_page_t m_start, vm_prot_t prot)
 3333 {
 3334         vm_offset_t va;
 3335         vm_page_t m, mpte;
 3336         vm_pindex_t diff, psize;
 3337 
 3338         VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
 3339         psize = atop(end - start);
 3340         mpte = NULL;
 3341         m = m_start;
 3342         PMAP_LOCK(pmap);
 3343         while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
 3344                 va = start + ptoa(diff);
 3345                 if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
 3346                     (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
 3347                     pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
 3348                     pmap_enter_pde(pmap, va, m, prot))
 3349                         m = &m[NBPDR / PAGE_SIZE - 1];
 3350                 else
 3351                         mpte = pmap_enter_quick_locked(pmap, va, m, prot,
 3352                             mpte);
 3353                 m = TAILQ_NEXT(m, listq);
 3354         }
 3355         PMAP_UNLOCK(pmap);
 3356 }
 3357 
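/*
 * The superpage fast path in pmap_enter_object() above is taken only
 * when the virtual address is 2MB aligned, the full 2MB run still fits
 * below "end", and the physical address is equally aligned (the
 * remaining policy checks, pg_ps_enabled and a fully populated
 * reservation, are omitted here).  A sketch of just those alignment
 * and fit tests, assuming a 2MB superpage size:
 */
#include <stdbool.h>
#include <stdint.h>

static bool
sk_fits_2m_mapping(uint64_t va, uint64_t pa, uint64_t end)
{
        const uint64_t sk_nbpdr = 1ULL << 21;   /* 2MB */
        const uint64_t sk_pdrmask = sk_nbpdr - 1;

        return ((va & sk_pdrmask) == 0 && va + sk_nbpdr <= end &&
            (pa & sk_pdrmask) == 0);
}
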
 3358 /*
 3359  * This code makes some *MAJOR* assumptions:
 3360  * 1. The current pmap and the given pmap exist.
 3361  * 2. Not wired.
 3362  * 3. Read access.
 3363  * 4. No page table pages.
 3364  * It is, however, *MUCH* faster than pmap_enter...
 3365  */
 3366 
 3367 void
 3368 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 3369 {
 3370 
 3371         PMAP_LOCK(pmap);
 3372         (void) pmap_enter_quick_locked(pmap, va, m, prot, NULL);
 3373         PMAP_UNLOCK(pmap);
 3374 }
 3375 
 3376 static vm_page_t
 3377 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
 3378     vm_prot_t prot, vm_page_t mpte)
 3379 {
 3380         vm_page_t free;
 3381         pt_entry_t *pte;
 3382         vm_paddr_t pa;
 3383 
 3384         KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
 3385             (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0,
 3386             ("pmap_enter_quick_locked: managed mapping within the clean submap"));
 3387         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 3388         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3389 
 3390         /*
 3391          * In the case that a page table page is not
 3392          * resident, we are creating it here.
 3393          */
 3394         if (va < VM_MAXUSER_ADDRESS) {
 3395                 vm_pindex_t ptepindex;
 3396                 pd_entry_t *ptepa;
 3397 
 3398                 /*
 3399                  * Calculate pagetable page index
 3400                  */
 3401                 ptepindex = pmap_pde_pindex(va);
 3402                 if (mpte && (mpte->pindex == ptepindex)) {
 3403                         mpte->wire_count++;
 3404                 } else {
 3405                         /*
 3406                          * Get the page directory entry
 3407                          */
 3408                         ptepa = pmap_pde(pmap, va);
 3409 
 3410                         /*
 3411                          * If the page table page is mapped, we just increment
 3412                          * the hold count, and activate it.
 3413                          */
 3414                         if (ptepa && (*ptepa & PG_V) != 0) {
 3415                                 if (*ptepa & PG_PS)
 3416                                         return (NULL);
 3417                                 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
 3418                                 mpte->wire_count++;
 3419                         } else {
 3420                                 mpte = _pmap_allocpte(pmap, ptepindex,
 3421                                     M_NOWAIT);
 3422                                 if (mpte == NULL)
 3423                                         return (mpte);
 3424                         }
 3425                 }
 3426                 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
 3427                 pte = &pte[pmap_pte_index(va)];
 3428         } else {
 3429                 mpte = NULL;
 3430                 pte = vtopte(va);
 3431         }
 3432         if (*pte) {
 3433                 if (mpte != NULL) {
 3434                         mpte->wire_count--;
 3435                         mpte = NULL;
 3436                 }
 3437                 return (mpte);
 3438         }
 3439 
 3440         /*
 3441          * Enter on the PV list if part of our managed memory.
 3442          */
 3443         if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 &&
 3444             !pmap_try_insert_pv_entry(pmap, va, m)) {
 3445                 if (mpte != NULL) {
 3446                         free = NULL;
 3447                         if (pmap_unwire_pte_hold(pmap, va, mpte, &free)) {
 3448                                 pmap_invalidate_page(pmap, va);
 3449                                 pmap_free_zero_pages(free);
 3450                         }
 3451                         mpte = NULL;
 3452                 }
 3453                 return (mpte);
 3454         }
 3455 
 3456         /*
 3457          * Increment counters
 3458          */
 3459         pmap->pm_stats.resident_count++;
 3460 
 3461         pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
 3462         if ((prot & VM_PROT_EXECUTE) == 0)
 3463                 pa |= pg_nx;
 3464 
 3465         /*
 3466          * Now validate mapping with RO protection
 3467          */
 3468         if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
 3469                 pte_store(pte, pa | PG_V | PG_U);
 3470         else
 3471                 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
 3472         return (mpte);
 3473 }
 3474 
 3475 /*
 3476  * Make a temporary mapping for a physical address.  This is only intended
 3477  * to be used for panic dumps.
 3478  */
 3479 void *
 3480 pmap_kenter_temporary(vm_paddr_t pa, int i)
 3481 {
 3482         vm_offset_t va;
 3483 
 3484         va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
 3485         pmap_kenter(va, pa);
 3486         invlpg(va);
 3487         return ((void *)crashdumpmap);
 3488 }
 3489 
 3490 /*
 3491  * This code maps large physical mmap regions into the
 3492  * processor address space.  Note that some shortcuts
 3493  * are taken, but the code works.
 3494  */
 3495 void
 3496 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
 3497     vm_pindex_t pindex, vm_size_t size)
 3498 {
 3499         pd_entry_t *pde;
 3500         vm_paddr_t pa, ptepa;
 3501         vm_page_t p, pdpg;
 3502         int pat_mode;
 3503 
 3504         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 3505         KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
 3506             ("pmap_object_init_pt: non-device object"));
 3507         if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
 3508                 if (!vm_object_populate(object, pindex, pindex + atop(size)))
 3509                         return;
 3510                 p = vm_page_lookup(object, pindex);
 3511                 KASSERT(p->valid == VM_PAGE_BITS_ALL,
 3512                     ("pmap_object_init_pt: invalid page %p", p));
 3513                 pat_mode = p->md.pat_mode;
 3514 
 3515                 /*
 3516                  * Abort the mapping if the first page is not physically
 3517                  * aligned to a 2MB page boundary.
 3518                  */
 3519                 ptepa = VM_PAGE_TO_PHYS(p);
 3520                 if (ptepa & (NBPDR - 1))
 3521                         return;
 3522 
 3523                 /*
 3524                  * Skip the first page.  Abort the mapping if the rest of
 3525                  * the pages are not physically contiguous or have differing
 3526                  * memory attributes.
 3527                  */
 3528                 p = TAILQ_NEXT(p, listq);
 3529                 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
 3530                     pa += PAGE_SIZE) {
 3531                         KASSERT(p->valid == VM_PAGE_BITS_ALL,
 3532                             ("pmap_object_init_pt: invalid page %p", p));
 3533                         if (pa != VM_PAGE_TO_PHYS(p) ||
 3534                             pat_mode != p->md.pat_mode)
 3535                                 return;
 3536                         p = TAILQ_NEXT(p, listq);
 3537                 }
 3538 
 3539                 /*
 3540                  * Map using 2MB pages.  Since "ptepa" is 2M aligned and
 3541                  * "size" is a multiple of 2M, adding the PAT setting to "pa"
 3542                  * will not affect the termination of this loop.
 3543                  */ 
 3544                 PMAP_LOCK(pmap);
 3545                 for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
 3546                     size; pa += NBPDR) {
 3547                         pdpg = pmap_allocpde(pmap, addr, M_NOWAIT);
 3548                         if (pdpg == NULL) {
 3549                                 /*
 3550                                  * The creation of mappings below is only an
 3551                                  * optimization.  If a page directory page
 3552                                  * cannot be allocated without blocking,
 3553                                  * continue on to the next mapping rather than
 3554                                  * blocking.
 3555                                  */
 3556                                 addr += NBPDR;
 3557                                 continue;
 3558                         }
 3559                         pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
 3560                         pde = &pde[pmap_pde_index(addr)];
 3561                         if ((*pde & PG_V) == 0) {
 3562                                 pde_store(pde, pa | PG_PS | PG_M | PG_A |
 3563                                     PG_U | PG_RW | PG_V);
 3564                                 pmap->pm_stats.resident_count += NBPDR /
 3565                                     PAGE_SIZE;
 3566                                 pmap_pde_mappings++;
 3567                         } else {
 3568                                 /* Continue on if the PDE is already valid. */
 3569                                 pdpg->wire_count--;
 3570                                 KASSERT(pdpg->wire_count > 0,
 3571                                     ("pmap_object_init_pt: missing reference "
 3572                                     "to page directory page, va: 0x%lx", addr));
 3573                         }
 3574                         addr += NBPDR;
 3575                 }
 3576                 PMAP_UNLOCK(pmap);
 3577         }
 3578 }
 3579 
 3580 /*
 3581  *      Routine:        pmap_change_wiring
 3582  *      Function:       Change the wiring attribute for a map/virtual-address
 3583  *                      pair.
 3584  *      In/out conditions:
 3585  *                      The mapping must already exist in the pmap.
 3586  */
 3587 void
 3588 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
 3589 {
 3590         pd_entry_t *pde;
 3591         pt_entry_t *pte;
 3592         boolean_t are_queues_locked;
 3593 
 3594         are_queues_locked = FALSE;
 3595 
 3596         /*
 3597          * Wiring is not a hardware characteristic so there is no need to
 3598          * invalidate TLB.
 3599          */
 3600 retry:
 3601         PMAP_LOCK(pmap);
 3602         pde = pmap_pde(pmap, va);
 3603         if ((*pde & PG_PS) != 0) {
 3604                 if (!wired != ((*pde & PG_W) == 0)) {
 3605                         if (!are_queues_locked) {
 3606                                 are_queues_locked = TRUE;
 3607                                 if (!mtx_trylock(&vm_page_queue_mtx)) {
 3608                                         PMAP_UNLOCK(pmap);
 3609                                         vm_page_lock_queues();
 3610                                         goto retry;
 3611                                 }
 3612                         }
 3613                         if (!pmap_demote_pde(pmap, pde, va))
 3614                                 panic("pmap_change_wiring: demotion failed");
 3615                 } else
 3616                         goto out;
 3617         }
 3618         pte = pmap_pde_to_pte(pde, va);
 3619         if (wired && (*pte & PG_W) == 0) {
 3620                 pmap->pm_stats.wired_count++;
 3621                 atomic_set_long(pte, PG_W);
 3622         } else if (!wired && (*pte & PG_W) != 0) {
 3623                 pmap->pm_stats.wired_count--;
 3624                 atomic_clear_long(pte, PG_W);
 3625         }
 3626 out:
 3627         if (are_queues_locked)
 3628                 vm_page_unlock_queues();
 3629         PMAP_UNLOCK(pmap);
 3630 }
 3631 
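/*
 * The retry loop in pmap_change_wiring() above exists because of lock
 * ordering: the page queues mutex must be taken before the pmap lock,
 * so once the pmap lock is held the queues lock may only be acquired
 * with a trylock.  If the trylock fails, the pmap lock is dropped, the
 * queues lock is taken blocking, and the lookup restarts.  A sketch of
 * that pattern with hypothetical need_outer_lock() / do_locked_work()
 * helpers; "outer" plays the role of the page queues lock and "inner"
 * the pmap lock.
 */
static void
sk_ordered_lock_retry(struct mtx *outer, struct mtx *inner)
{
        int have_outer;

        have_outer = 0;
retry:
        mtx_lock(inner);
        if (need_outer_lock() && !have_outer) {
                if (!mtx_trylock(outer)) {
                        /* Wrong order: back out and take "outer" first. */
                        mtx_unlock(inner);
                        mtx_lock(outer);
                        have_outer = 1;
                        goto retry;
                }
                have_outer = 1;
        }
        do_locked_work();
        mtx_unlock(inner);
        if (have_outer)
                mtx_unlock(outer);
}
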
 3632 
 3633 
 3634 /*
 3635  *      Copy the range specified by src_addr/len
 3636  *      from the source map to the range dst_addr/len
 3637  *      in the destination map.
 3638  *
 3639  *      This routine is only advisory and need not do anything.
 3640  */
 3641 
 3642 void
 3643 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
 3644     vm_offset_t src_addr)
 3645 {
 3646         vm_page_t   free;
 3647         vm_offset_t addr;
 3648         vm_offset_t end_addr = src_addr + len;
 3649         vm_offset_t va_next;
 3650 
 3651         if (dst_addr != src_addr)
 3652                 return;
 3653 
 3654         vm_page_lock_queues();
 3655         if (dst_pmap < src_pmap) {
 3656                 PMAP_LOCK(dst_pmap);
 3657                 PMAP_LOCK(src_pmap);
 3658         } else {
 3659                 PMAP_LOCK(src_pmap);
 3660                 PMAP_LOCK(dst_pmap);
 3661         }
 3662         for (addr = src_addr; addr < end_addr; addr = va_next) {
 3663                 pt_entry_t *src_pte, *dst_pte;
 3664                 vm_page_t dstmpde, dstmpte, srcmpte;
 3665                 pml4_entry_t *pml4e;
 3666                 pdp_entry_t *pdpe;
 3667                 pd_entry_t srcptepaddr, *pde;
 3668 
 3669                 KASSERT(addr < UPT_MIN_ADDRESS,
 3670                     ("pmap_copy: invalid to pmap_copy page tables"));
 3671 
 3672                 pml4e = pmap_pml4e(src_pmap, addr);
 3673                 if ((*pml4e & PG_V) == 0) {
 3674                         va_next = (addr + NBPML4) & ~PML4MASK;
 3675                         if (va_next < addr)
 3676                                 va_next = end_addr;
 3677                         continue;
 3678                 }
 3679 
 3680                 pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
 3681                 if ((*pdpe & PG_V) == 0) {
 3682                         va_next = (addr + NBPDP) & ~PDPMASK;
 3683                         if (va_next < addr)
 3684                                 va_next = end_addr;
 3685                         continue;
 3686                 }
 3687 
 3688                 va_next = (addr + NBPDR) & ~PDRMASK;
 3689                 if (va_next < addr)
 3690                         va_next = end_addr;
 3691 
 3692                 pde = pmap_pdpe_to_pde(pdpe, addr);
 3693                 srcptepaddr = *pde;
 3694                 if (srcptepaddr == 0)
 3695                         continue;
 3696                         
 3697                 if (srcptepaddr & PG_PS) {
 3698                         dstmpde = pmap_allocpde(dst_pmap, addr, M_NOWAIT);
 3699                         if (dstmpde == NULL)
 3700                                 break;
 3701                         pde = (pd_entry_t *)
 3702                             PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
 3703                         pde = &pde[pmap_pde_index(addr)];
 3704                         if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
 3705                             pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
 3706                             PG_PS_FRAME))) {
 3707                                 *pde = srcptepaddr & ~PG_W;
 3708                                 dst_pmap->pm_stats.resident_count +=
 3709                                     NBPDR / PAGE_SIZE;
 3710                         } else
 3711                                 dstmpde->wire_count--;
 3712                         continue;
 3713                 }
 3714 
 3715                 srcptepaddr &= PG_FRAME;
 3716                 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
 3717                 KASSERT(srcmpte->wire_count > 0,
 3718                     ("pmap_copy: source page table page is unused"));
 3719 
 3720                 if (va_next > end_addr)
 3721                         va_next = end_addr;
 3722 
 3723                 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
 3724                 src_pte = &src_pte[pmap_pte_index(addr)];
 3725                 dstmpte = NULL;
 3726                 while (addr < va_next) {
 3727                         pt_entry_t ptetemp;
 3728                         ptetemp = *src_pte;
 3729                         /*
 3730                          * We only virtually copy managed pages.
 3731                          */
 3732                         if ((ptetemp & PG_MANAGED) != 0) {
 3733                                 if (dstmpte != NULL &&
 3734                                     dstmpte->pindex == pmap_pde_pindex(addr))
 3735                                         dstmpte->wire_count++;
 3736                                 else if ((dstmpte = pmap_allocpte(dst_pmap,
 3737                                     addr, M_NOWAIT)) == NULL)
 3738                                         goto out;
 3739                                 dst_pte = (pt_entry_t *)
 3740                                     PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
 3741                                 dst_pte = &dst_pte[pmap_pte_index(addr)];
 3742                                 if (*dst_pte == 0 &&
 3743                                     pmap_try_insert_pv_entry(dst_pmap, addr,
 3744                                     PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
 3745                                         /*
 3746                                          * Clear the wired, modified, and
 3747                                          * accessed (referenced) bits
 3748                                          * during the copy.
 3749                                          */
 3750                                         *dst_pte = ptetemp & ~(PG_W | PG_M |
 3751                                             PG_A);
 3752                                         dst_pmap->pm_stats.resident_count++;
 3753                                 } else {
 3754                                         free = NULL;
 3755                                         if (pmap_unwire_pte_hold(dst_pmap,
 3756                                             addr, dstmpte, &free)) {
 3757                                                 pmap_invalidate_page(dst_pmap,
 3758                                                     addr);
 3759                                                 pmap_free_zero_pages(free);
 3760                                         }
 3761                                         goto out;
 3762                                 }
 3763                                 if (dstmpte->wire_count >= srcmpte->wire_count)
 3764                                         break;
 3765                         }
 3766                         addr += PAGE_SIZE;
 3767                         src_pte++;
 3768                 }
 3769         }
 3770 out:
 3771         vm_page_unlock_queues();
 3772         PMAP_UNLOCK(src_pmap);
 3773         PMAP_UNLOCK(dst_pmap);
 3774 }       
 3775 
 3776 /*
 3777  *      pmap_zero_page zeros the specified hardware page by mapping 
 3778  *      the page into KVM and using bzero to clear its contents.
 3779  */
 3780 void
 3781 pmap_zero_page(vm_page_t m)
 3782 {
 3783         vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 3784 
 3785         pagezero((void *)va);
 3786 }
 3787 
 3788 /*
 3789  *      pmap_zero_page_area zeros the specified hardware page by mapping 
 3790  *      the page into KVM and using bzero to clear its contents.
 3791  *
 3792  *      off and size may not cover an area beyond a single hardware page.
 3793  */
 3794 void
 3795 pmap_zero_page_area(vm_page_t m, int off, int size)
 3796 {
 3797         vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 3798 
 3799         if (off == 0 && size == PAGE_SIZE)
 3800                 pagezero((void *)va);
 3801         else
 3802                 bzero((char *)va + off, size);
 3803 }
 3804 
 3805 /*
 3806  *      pmap_zero_page_idle zeros the specified hardware page by mapping 
 3807  *      the page into KVM and using bzero to clear its contents.  This
 3808  *      is intended to be called from the vm_pagezero process only and
 3809  *      outside of Giant.
 3810  */
 3811 void
 3812 pmap_zero_page_idle(vm_page_t m)
 3813 {
 3814         vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 3815 
 3816         pagezero((void *)va);
 3817 }
 3818 
 3819 /*
 3820  *      pmap_copy_page copies the specified (machine independent)
 3821  *      page by mapping the page into virtual memory and using
 3822  *      bcopy to copy the page, one machine dependent page at a
 3823  *      time.
 3824  */
 3825 void
 3826 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
 3827 {
 3828         vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
 3829         vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
 3830 
 3831         pagecopy((void *)src, (void *)dst);
 3832 }
 3833 
 3834 /*
 3835  * Returns true if the pmap's pv is one of the first
 3836  * 16 pvs linked to from this page.  This count may
 3837  * be changed upwards or downwards in the future; it
 3838  * is only necessary that true be returned for a small
 3839  * subset of pmaps for proper page aging.
 3840  */
 3841 boolean_t
 3842 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 3843 {
 3844         struct md_page *pvh;
 3845         pv_entry_t pv;
 3846         int loops = 0;
 3847 
 3848         if (m->flags & PG_FICTITIOUS)
 3849                 return (FALSE);
 3850 
 3851         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 3852         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 3853                 if (PV_PMAP(pv) == pmap) {
 3854                         return (TRUE);
 3855                 }
 3856                 loops++;
 3857                 if (loops >= 16)
 3858                         break;
 3859         }
 3860         if (loops < 16) {
 3861                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 3862                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
 3863                         if (PV_PMAP(pv) == pmap)
 3864                                 return (TRUE);
 3865                         loops++;
 3866                         if (loops >= 16)
 3867                                 break;
 3868                 }
 3869         }
 3870         return (FALSE);
 3871 }
 3872 
 3873 /*
 3874  *      pmap_page_wired_mappings:
 3875  *
 3876  *      Return the number of managed mappings to the given physical page
 3877  *      that are wired.
 3878  */
 3879 int
 3880 pmap_page_wired_mappings(vm_page_t m)
 3881 {
 3882         int count;
 3883 
 3884         count = 0;
 3885         if ((m->flags & PG_FICTITIOUS) != 0)
 3886                 return (count);
 3887         count = pmap_pvh_wired_mappings(&m->md, count);
 3888         return (pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count));
 3889 }
 3890 
 3891 /*
 3892  *      pmap_pvh_wired_mappings:
 3893  *
 3894  *      Return the updated number "count" of managed mappings that are wired.
 3895  */
 3896 static int
 3897 pmap_pvh_wired_mappings(struct md_page *pvh, int count)
 3898 {
 3899         pmap_t pmap;
 3900         pt_entry_t *pte;
 3901         pv_entry_t pv;
 3902 
 3903         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 3904         TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
 3905                 pmap = PV_PMAP(pv);
 3906                 PMAP_LOCK(pmap);
 3907                 pte = pmap_pte(pmap, pv->pv_va);
 3908                 if ((*pte & PG_W) != 0)
 3909                         count++;
 3910                 PMAP_UNLOCK(pmap);
 3911         }
 3912         return (count);
 3913 }
 3914 
 3915 /*
 3916  * Returns TRUE if the given page is mapped individually or as part of
 3917  * a 2mpage.  Otherwise, returns FALSE.
 3918  */
 3919 boolean_t
 3920 pmap_page_is_mapped(vm_page_t m)
 3921 {
 3922         struct md_page *pvh;
 3923 
 3924         if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
 3925                 return (FALSE);
 3926         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 3927         if (TAILQ_EMPTY(&m->md.pv_list)) {
 3928                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 3929                 return (!TAILQ_EMPTY(&pvh->pv_list));
 3930         } else
 3931                 return (TRUE);
 3932 }
 3933 
 3934 /*
 3935  * Remove all pages from specified address space
 3936  * this aids process exit speeds.  Also, this code
 3937  * is special cased for current process only, but
 3938  * can have the more generic (and slightly slower)
 3939  * mode enabled.  This is much faster than pmap_remove
 3940  * in the case of running down an entire address space.
 3941  */
 3942 void
 3943 pmap_remove_pages(pmap_t pmap)
 3944 {
 3945         pd_entry_t ptepde;
 3946         pt_entry_t *pte, tpte;
 3947         vm_page_t free = NULL;
 3948         vm_page_t m, mpte, mt;
 3949         pv_entry_t pv;
 3950         struct md_page *pvh;
 3951         struct pv_chunk *pc, *npc;
 3952         int field, idx;
 3953         int64_t bit;
 3954         uint64_t inuse, bitmask;
 3955         int allfree;
 3956 
 3957         if (pmap != PCPU_GET(curpmap)) {
 3958                 printf("warning: pmap_remove_pages called with non-current pmap\n");
 3959                 return;
 3960         }
 3961         vm_page_lock_queues();
 3962         PMAP_LOCK(pmap);
 3963         TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 3964                 allfree = 1;
 3965                 for (field = 0; field < _NPCM; field++) {
 3966                         inuse = (~(pc->pc_map[field])) & pc_freemask[field];
 3967                         while (inuse != 0) {
 3968                                 bit = bsfq(inuse);
 3969                                 bitmask = 1UL << bit;
 3970                                 idx = field * 64 + bit;
 3971                                 pv = &pc->pc_pventry[idx];
 3972                                 inuse &= ~bitmask;
 3973 
 3974                                 pte = pmap_pdpe(pmap, pv->pv_va);
 3975                                 ptepde = *pte;
 3976                                 pte = pmap_pdpe_to_pde(pte, pv->pv_va);
 3977                                 tpte = *pte;
 3978                                 if ((tpte & (PG_PS | PG_V)) == PG_V) {
 3979                                         ptepde = tpte;
 3980                                         pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
 3981                                             PG_FRAME);
 3982                                         pte = &pte[pmap_pte_index(pv->pv_va)];
 3983                                         tpte = *pte & ~PG_PTE_PAT;
 3984                                 }
 3985                                 if ((tpte & PG_V) == 0)
 3986                                         panic("bad pte");
 3987 
 3988 /*
 3989  * We cannot remove wired pages from a process' mapping at this time
 3990  */
 3991                                 if (tpte & PG_W) {
 3992                                         allfree = 0;
 3993                                         continue;
 3994                                 }
 3995 
 3996                                 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
 3997                                 KASSERT(m->phys_addr == (tpte & PG_FRAME),
 3998                                     ("vm_page_t %p phys_addr mismatch %016jx %016jx",
 3999                                     m, (uintmax_t)m->phys_addr,
 4000                                     (uintmax_t)tpte));
 4001 
 4002                                 KASSERT(m < &vm_page_array[vm_page_array_size],
 4003                                         ("pmap_remove_pages: bad tpte %#jx",
 4004                                         (uintmax_t)tpte));
 4005 
 4006                                 pte_clear(pte);
 4007 
 4008                                 /*
 4009                                  * Update the vm_page_t clean/reference bits.
 4010                                  */
 4011                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 4012                                         if ((tpte & PG_PS) != 0) {
 4013                                                 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 4014                                                         vm_page_dirty(mt);
 4015                                         } else
 4016                                                 vm_page_dirty(m);
 4017                                 }
 4018 
 4019                                 /* Mark free */
 4020                                 PV_STAT(pv_entry_frees++);
 4021                                 PV_STAT(pv_entry_spare++);
 4022                                 pv_entry_count--;
 4023                                 pc->pc_map[field] |= bitmask;
 4024                                 if ((tpte & PG_PS) != 0) {
 4025                                         pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
 4026                                         pvh = pa_to_pvh(tpte & PG_PS_FRAME);
 4027                                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
 4028                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
 4029                                                 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 4030                                                         if (TAILQ_EMPTY(&mt->md.pv_list))
 4031                                                                 vm_page_flag_clear(mt, PG_WRITEABLE);
 4032                                         }
 4033                                         mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
 4034                                         if (mpte != NULL) {
 4035                                                 pmap_remove_pt_page(pmap, mpte);
 4036                                                 pmap->pm_stats.resident_count--;
 4037                                                 KASSERT(mpte->wire_count == NPTEPG,
 4038                                                     ("pmap_remove_pages: pte page wire count error"));
 4039                                                 mpte->wire_count = 0;
 4040                                                 pmap_add_delayed_free_list(mpte, &free, FALSE);
 4041                                                 atomic_subtract_int(&cnt.v_wire_count, 1);
 4042                                         }
 4043                                 } else {
 4044                                         pmap->pm_stats.resident_count--;
 4045                                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 4046                                         if (TAILQ_EMPTY(&m->md.pv_list)) {
 4047                                                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 4048                                                 if (TAILQ_EMPTY(&pvh->pv_list))
 4049                                                         vm_page_flag_clear(m, PG_WRITEABLE);
 4050                                         }
 4051                                 }
 4052                                 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
 4053                         }
 4054                 }
 4055                 if (allfree) {
 4056                         PV_STAT(pv_entry_spare -= _NPCPV);
 4057                         PV_STAT(pc_chunk_count--);
 4058                         PV_STAT(pc_chunk_frees++);
 4059                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 4060                         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 4061                         dump_drop_page(m->phys_addr);
 4062                         vm_page_unwire(m, 0);
 4063                         vm_page_free(m);
 4064                 }
 4065         }
 4066         pmap_invalidate_all(pmap);
 4067         vm_page_unlock_queues();
 4068         PMAP_UNLOCK(pmap);
 4069         pmap_free_zero_pages(free);
 4070 }
 4071 
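/*
 * The chunk walk in pmap_remove_pages() above derives the allocated pv
 * entries from the inverted free bitmap ("~pc_map & pc_freemask") and
 * visits them lowest-index-first with bsfq().  The same scan as a
 * standalone sketch, with __builtin_ctzll() standing in for bsfq() and
 * a hypothetical visit() callback for the per-entry work:
 */
#include <stdint.h>

static void
sk_scan_inuse_entries(uint64_t pc_map, uint64_t freemask,
    void (*visit)(int idx))
{
        uint64_t inuse, bitmask;
        int bit;

        inuse = ~pc_map & freemask;     /* set bits == allocated pv entries */
        while (inuse != 0) {
                bit = __builtin_ctzll(inuse);   /* lowest set bit, like bsfq */
                bitmask = 1ULL << bit;
                visit(bit);
                inuse &= ~bitmask;              /* this entry is done */
        }
}
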
 4072 /*
 4073  *      pmap_is_modified:
 4074  *
 4075  *      Return whether or not the specified physical page was modified
 4076  *      in any physical maps.
 4077  */
 4078 boolean_t
 4079 pmap_is_modified(vm_page_t m)
 4080 {
 4081 
 4082         if (m->flags & PG_FICTITIOUS)
 4083                 return (FALSE);
 4084         if (pmap_is_modified_pvh(&m->md))
 4085                 return (TRUE);
 4086         return (pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
 4087 }
 4088 
 4089 /*
 4090  * Returns TRUE if any of the given mappings were used to modify
 4091  * physical memory.  Otherwise, returns FALSE.  Both page and 2mpage
 4092  * mappings are supported.
 4093  */
 4094 static boolean_t
 4095 pmap_is_modified_pvh(struct md_page *pvh)
 4096 {
 4097         pv_entry_t pv;
 4098         pt_entry_t *pte;
 4099         pmap_t pmap;
 4100         boolean_t rv;
 4101 
 4102         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 4103         rv = FALSE;
 4104         TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
 4105                 pmap = PV_PMAP(pv);
 4106                 PMAP_LOCK(pmap);
 4107                 pte = pmap_pte(pmap, pv->pv_va);
 4108                 rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
 4109                 PMAP_UNLOCK(pmap);
 4110                 if (rv)
 4111                         break;
 4112         }
 4113         return (rv);
 4114 }
 4115 
 4116 /*
 4117  *      pmap_is_prefaultable:
 4118  *
 4119  *      Return whether or not the specified virtual address is eligible
 4120  *      for prefault.
 4121  */
 4122 boolean_t
 4123 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 4124 {
 4125         pd_entry_t *pde;
 4126         pt_entry_t *pte;
 4127         boolean_t rv;
 4128 
 4129         rv = FALSE;
 4130         PMAP_LOCK(pmap);
 4131         pde = pmap_pde(pmap, addr);
 4132         if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
 4133                 pte = pmap_pde_to_pte(pde, addr);
 4134                 rv = (*pte & PG_V) == 0;
 4135         }
 4136         PMAP_UNLOCK(pmap);
 4137         return (rv);
 4138 }
 4139 
 4140 /*
 4141  * Clear the write and modified bits in each of the given page's mappings.
 4142  */
 4143 void
 4144 pmap_remove_write(vm_page_t m)
 4145 {
 4146         struct md_page *pvh;
 4147         pmap_t pmap;
 4148         pv_entry_t next_pv, pv;
 4149         pd_entry_t *pde;
 4150         pt_entry_t oldpte, *pte;
 4151         vm_offset_t va;
 4152 
 4153         if ((m->flags & PG_FICTITIOUS) != 0 ||
 4154             (m->flags & PG_WRITEABLE) == 0)
 4155                 return;
 4156         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 4157         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 4158         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
 4159                 va = pv->pv_va;
 4160                 pmap = PV_PMAP(pv);
 4161                 PMAP_LOCK(pmap);
 4162                 pde = pmap_pde(pmap, va);
 4163                 if ((*pde & PG_RW) != 0)
 4164                         (void)pmap_demote_pde(pmap, pde, va);
 4165                 PMAP_UNLOCK(pmap);
 4166         }
 4167         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 4168                 pmap = PV_PMAP(pv);
 4169                 PMAP_LOCK(pmap);
 4170                 pde = pmap_pde(pmap, pv->pv_va);
 4171                 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
 4172                     " a 2mpage in page %p's pv list", m));
 4173                 pte = pmap_pde_to_pte(pde, pv->pv_va);
 4174 retry:
 4175                 oldpte = *pte;
 4176                 if (oldpte & PG_RW) {
 4177                         if (!atomic_cmpset_long(pte, oldpte, oldpte &
 4178                             ~(PG_RW | PG_M)))
 4179                                 goto retry;
 4180                         if ((oldpte & PG_M) != 0)
 4181                                 vm_page_dirty(m);
 4182                         pmap_invalidate_page(pmap, pv->pv_va);
 4183                 }
 4184                 PMAP_UNLOCK(pmap);
 4185         }
 4186         vm_page_flag_clear(m, PG_WRITEABLE);
 4187 }
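
/*
 * [Editor's note: the block below is an illustrative sketch, not part of the
 * original pmap.c.]  The mtx_assert() above documents that callers must hold
 * the page queues lock around pmap_remove_write().  A minimal, hypothetical
 * caller therefore looks roughly like this, for a managed page "m":
 *
 *	vm_page_lock_queues();
 *	pmap_remove_write(m);
 *	vm_page_unlock_queues();
 *
 * On return PG_WRITEABLE has been cleared, and any mapping that was both
 * writable and hardware-dirty has had its modification folded into the
 * page's dirty state via vm_page_dirty().
 */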
 4188 
 4189 /*
 4190  *      pmap_ts_referenced:
 4191  *
 4192  *      Return a count of reference bits for a page, clearing those bits.
 4193  *      It is not necessary for every reference bit to be cleared, but it
 4194  *      is necessary that 0 only be returned when there are truly no
 4195  *      reference bits set.
 4196  *
 4197  *      XXX: The exact number of bits to check and clear is a matter that
 4198  *      should be tested and standardized at some point in the future for
 4199  *      optimal aging of shared pages.
 4200  */
 4201 int
 4202 pmap_ts_referenced(vm_page_t m)
 4203 {
 4204         struct md_page *pvh;
 4205         pv_entry_t pv, pvf, pvn;
 4206         pmap_t pmap;
 4207         pd_entry_t oldpde, *pde;
 4208         pt_entry_t *pte;
 4209         vm_offset_t va;
 4210         int rtval = 0;
 4211 
 4212         if (m->flags & PG_FICTITIOUS)
 4213                 return (rtval);
 4214         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 4215         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 4216         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
 4217                 va = pv->pv_va;
 4218                 pmap = PV_PMAP(pv);
 4219                 PMAP_LOCK(pmap);
 4220                 pde = pmap_pde(pmap, va);
 4221                 oldpde = *pde;
 4222                 if ((oldpde & PG_A) != 0) {
 4223                         if (pmap_demote_pde(pmap, pde, va)) {
 4224                                 if ((oldpde & PG_W) == 0) {
 4225                                         /*
 4226                                          * Remove the mapping to a single page
 4227                                          * so that a subsequent access may
 4228                                          * repromote.  Since the underlying
 4229                                          * page table page is fully populated,
 4230                                          * this removal never frees a page
 4231                                          * table page.
 4232                                          */
 4233                                         va += VM_PAGE_TO_PHYS(m) - (oldpde &
 4234                                             PG_PS_FRAME);
 4235                                         pmap_remove_page(pmap, va, pde, NULL);
 4236                                         rtval++;
 4237                                         if (rtval > 4) {
 4238                                                 PMAP_UNLOCK(pmap);
 4239                                                 return (rtval);
 4240                                         }
 4241                                 }
 4242                         }
 4243                 }
 4244                 PMAP_UNLOCK(pmap);
 4245         }
 4246         if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 4247                 pvf = pv;
 4248                 do {
 4249                         pvn = TAILQ_NEXT(pv, pv_list);
 4250                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 4251                         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 4252                         pmap = PV_PMAP(pv);
 4253                         PMAP_LOCK(pmap);
 4254                         pde = pmap_pde(pmap, pv->pv_va);
 4255                         KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:"
 4256                             " found a 2mpage in page %p's pv list", m));
 4257                         pte = pmap_pde_to_pte(pde, pv->pv_va);
 4258                         if ((*pte & PG_A) != 0) {
 4259                                 atomic_clear_long(pte, PG_A);
 4260                                 pmap_invalidate_page(pmap, pv->pv_va);
 4261                                 rtval++;
 4262                                 if (rtval > 4)
 4263                                         pvn = NULL;
 4264                         }
 4265                         PMAP_UNLOCK(pmap);
 4266                 } while ((pv = pvn) != NULL && pv != pvf);
 4267         }
 4268         return (rtval);
 4269 }
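
/*
 * [Editor's note: illustrative sketch, not part of the original pmap.c.]
 * A page-aging consumer (for example, a pageout scan) might use the count
 * returned above along these lines; "refs" is a local int and ACT_ADVANCE/
 * ACT_MAX are the usual vm_page activity constants:
 *
 *	vm_page_lock_queues();
 *	refs = pmap_ts_referenced(m);
 *	if (refs != 0) {
 *		m->act_count += ACT_ADVANCE * refs;
 *		if (m->act_count > ACT_MAX)
 *			m->act_count = ACT_MAX;
 *	} else if (m->act_count > 0)
 *		m->act_count--;
 *	vm_page_unlock_queues();
 *
 * Because the loops above stop once a handful of reference bits have been
 * cleared, the return value is a lower bound, not an exact count of the
 * referencing mappings.
 */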
 4270 
 4271 /*
 4272  *      Clear the modify bits on the specified physical page.
 4273  */
 4274 void
 4275 pmap_clear_modify(vm_page_t m)
 4276 {
 4277         struct md_page *pvh;
 4278         pmap_t pmap;
 4279         pv_entry_t next_pv, pv;
 4280         pd_entry_t oldpde, *pde;
 4281         pt_entry_t oldpte, *pte;
 4282         vm_offset_t va;
 4283 
 4284         if ((m->flags & PG_FICTITIOUS) != 0)
 4285                 return;
 4286         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 4287         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 4288         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
 4289                 va = pv->pv_va;
 4290                 pmap = PV_PMAP(pv);
 4291                 PMAP_LOCK(pmap);
 4292                 pde = pmap_pde(pmap, va);
 4293                 oldpde = *pde;
 4294                 if ((oldpde & PG_RW) != 0) {
 4295                         if (pmap_demote_pde(pmap, pde, va)) {
 4296                                 if ((oldpde & PG_W) == 0) {
 4297                                         /*
 4298                                          * Write protect the mapping to a
 4299                                          * single page so that a subsequent
 4300                                          * write access may repromote.
 4301                                          */
 4302                                         va += VM_PAGE_TO_PHYS(m) - (oldpde &
 4303                                             PG_PS_FRAME);
 4304                                         pte = pmap_pde_to_pte(pde, va);
 4305                                         oldpte = *pte;
 4306                                         if ((oldpte & PG_V) != 0) {
 4307                                                 while (!atomic_cmpset_long(pte,
 4308                                                     oldpte,
 4309                                                     oldpte & ~(PG_M | PG_RW)))
 4310                                                         oldpte = *pte;
 4311                                                 vm_page_dirty(m);
 4312                                                 pmap_invalidate_page(pmap, va);
 4313                                         }
 4314                                 }
 4315                         }
 4316                 }
 4317                 PMAP_UNLOCK(pmap);
 4318         }
 4319         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 4320                 pmap = PV_PMAP(pv);
 4321                 PMAP_LOCK(pmap);
 4322                 pde = pmap_pde(pmap, pv->pv_va);
 4323                 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
 4324                     " a 2mpage in page %p's pv list", m));
 4325                 pte = pmap_pde_to_pte(pde, pv->pv_va);
 4326                 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 4327                         atomic_clear_long(pte, PG_M);
 4328                         pmap_invalidate_page(pmap, pv->pv_va);
 4329                 }
 4330                 PMAP_UNLOCK(pmap);
 4331         }
 4332 }
 4333 
 4334 /*
 4335  *      pmap_clear_reference:
 4336  *
 4337  *      Clear the reference bit on the specified physical page.
 4338  */
 4339 void
 4340 pmap_clear_reference(vm_page_t m)
 4341 {
 4342         struct md_page *pvh;
 4343         pmap_t pmap;
 4344         pv_entry_t next_pv, pv;
 4345         pd_entry_t oldpde, *pde;
 4346         pt_entry_t *pte;
 4347         vm_offset_t va;
 4348 
 4349         if ((m->flags & PG_FICTITIOUS) != 0)
 4350                 return;
 4351         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 4352         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 4353         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
 4354                 va = pv->pv_va;
 4355                 pmap = PV_PMAP(pv);
 4356                 PMAP_LOCK(pmap);
 4357                 pde = pmap_pde(pmap, va);
 4358                 oldpde = *pde;
 4359                 if ((oldpde & PG_A) != 0) {
 4360                         if (pmap_demote_pde(pmap, pde, va)) {
 4361                                 /*
 4362                                  * Remove the mapping to a single page so
 4363                                  * that a subsequent access may repromote.
 4364                                  * Since the underlying page table page is
 4365                                  * fully populated, this removal never frees
 4366                                  * a page table page.
 4367                                  */
 4368                                 va += VM_PAGE_TO_PHYS(m) - (oldpde &
 4369                                     PG_PS_FRAME);
 4370                                 pmap_remove_page(pmap, va, pde, NULL);
 4371                         }
 4372                 }
 4373                 PMAP_UNLOCK(pmap);
 4374         }
 4375         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 4376                 pmap = PV_PMAP(pv);
 4377                 PMAP_LOCK(pmap);
 4378                 pde = pmap_pde(pmap, pv->pv_va);
 4379                 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
 4380                     " a 2mpage in page %p's pv list", m));
 4381                 pte = pmap_pde_to_pte(pde, pv->pv_va);
 4382                 if (*pte & PG_A) {
 4383                         atomic_clear_long(pte, PG_A);
 4384                         pmap_invalidate_page(pmap, pv->pv_va);
 4385                 }
 4386                 PMAP_UNLOCK(pmap);
 4387         }
 4388 }
 4389 
 4390 /*
 4391  * Miscellaneous support routines follow
 4392  */
 4393 
 4394 /* Adjust the cache mode for a 4KB page mapped via a PTE. */
 4395 static __inline void
 4396 pmap_pte_attr(pt_entry_t *pte, int cache_bits)
 4397 {
 4398         u_int opte, npte;
 4399 
 4400         /*
 4401          * The cache mode bits are all in the low 32-bits of the
 4402          * PTE, so we can just spin on updating the low 32-bits.
 4403          */
 4404         do {
 4405                 opte = *(u_int *)pte;
 4406                 npte = opte & ~PG_PTE_CACHE;
 4407                 npte |= cache_bits;
 4408         } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
 4409 }
 4410 
 4411 /* Adjust the cache mode for a 2MB page mapped via a PDE. */
 4412 static __inline void
 4413 pmap_pde_attr(pd_entry_t *pde, int cache_bits)
 4414 {
 4415         u_int opde, npde;
 4416 
 4417         /*
 4418          * The cache mode bits are all in the low 32-bits of the
 4419          * PDE, so we can just spin on updating the low 32-bits.
 4420          */
 4421         do {
 4422                 opde = *(u_int *)pde;
 4423                 npde = opde & ~PG_PDE_CACHE;
 4424                 npde |= cache_bits;
 4425         } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
 4426 }
 4427 
 4428 /*
 4429  * Map a set of physical memory pages into the kernel virtual
 4430  * address space. Return a pointer to where it is mapped. This
 4431  * routine is intended to be used for mapping device memory,
 4432  * NOT real memory.
 4433  */
 4434 void *
 4435 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
 4436 {
 4437         vm_offset_t va, offset;
 4438         vm_size_t tmpsize;
 4439 
 4440         /*
 4441          * If the specified range of physical addresses fits within the direct
 4442          * map window, use the direct map. 
 4443          */
 4444         if (pa < dmaplimit && pa + size < dmaplimit) {
 4445                 va = PHYS_TO_DMAP(pa);
 4446                 if (!pmap_change_attr(va, size, mode))
 4447                         return ((void *)va);
 4448         }
 4449         offset = pa & PAGE_MASK;
 4450         size = roundup(offset + size, PAGE_SIZE);
 4451         va = kmem_alloc_nofault(kernel_map, size);
 4452         if (!va)
 4453                 panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
 4454         pa = trunc_page(pa);
 4455         for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
 4456                 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
 4457         pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
 4458         pmap_invalidate_cache_range(va, va + tmpsize);
 4459         return ((void *)(va + offset));
 4460 }
 4461 
 4462 void *
 4463 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
 4464 {
 4465 
 4466         return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
 4467 }
 4468 
 4469 void *
 4470 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
 4471 {
 4472 
 4473         return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
 4474 }
 4475 
 4476 void
 4477 pmap_unmapdev(vm_offset_t va, vm_size_t size)
 4478 {
 4479         vm_offset_t base, offset, tmpva;
 4480 
 4481         /* If the address came from the direct map in pmap_mapdev_attr(), do nothing. */
 4482         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
 4483                 return;
 4484         base = trunc_page(va);
 4485         offset = va & PAGE_MASK;
 4486         size = roundup(offset + size, PAGE_SIZE);
 4487         for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
 4488                 pmap_kremove(tmpva);
 4489         pmap_invalidate_range(kernel_pmap, va, tmpva);
 4490         kmem_free(kernel_map, base, size);
 4491 }
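
/*
 * [Editor's note: illustrative sketch, not part of the original pmap.c.]
 * Typical use of the two routines above for a hypothetical device register
 * window; the physical address and size below are made up:
 *
 *	void *regs;
 *
 *	regs = pmap_mapdev(0xfed40000UL, 0x1000);
 *	...
 *	pmap_unmapdev((vm_offset_t)regs, 0x1000);
 *
 * When the physical range lies below dmaplimit and the attribute change
 * succeeds, pmap_mapdev_attr() hands back a direct map address, which is
 * why pmap_unmapdev() returns early for addresses inside the DMAP window.
 */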
 4492 
 4493 /*
 4494  * Tries to demote a 1GB page mapping.
 4495  */
 4496 static boolean_t
 4497 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
 4498 {
 4499         pdp_entry_t newpdpe, oldpdpe;
 4500         pd_entry_t *firstpde, newpde, *pde;
 4501         vm_paddr_t mpdepa;
 4502         vm_page_t mpde;
 4503 
 4504         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4505         oldpdpe = *pdpe;
 4506         KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
 4507             ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
 4508         if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
 4509             VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 4510                 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
 4511                     " in pmap %p", va, pmap);
 4512                 return (FALSE);
 4513         }
 4514         mpdepa = VM_PAGE_TO_PHYS(mpde);
 4515         firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa);
 4516         newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
 4517         KASSERT((oldpdpe & PG_A) != 0,
 4518             ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
 4519         KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
 4520             ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
 4521         newpde = oldpdpe;
 4522 
 4523         /*
 4524          * Initialize the page directory page.
 4525          */
 4526         for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
 4527                 *pde = newpde;
 4528                 newpde += NBPDR;
 4529         }
 4530 
 4531         /*
 4532          * Demote the mapping.
 4533          */
 4534         *pdpe = newpdpe;
 4535 
 4536         /*
 4537          * Invalidate a stale recursive mapping of the page directory page.
 4538          */
 4539         pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
 4540 
 4541         pmap_pdpe_demotions++;
 4542         CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
 4543             " in pmap %p", va, pmap);
 4544         return (TRUE);
 4545 }
 4546 
 4547 /*
 4548  * Sets the memory attribute for the specified page.
 4549  */
 4550 void
 4551 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
 4552 {
 4553 
 4554         m->md.pat_mode = ma;
 4555 
 4556         /*
 4557          * If "m" is a normal page, update its direct mapping.  This update
 4558          * can be relied upon to perform any cache operations that are
 4559          * required for data coherence.
 4560          */
 4561         if ((m->flags & PG_FICTITIOUS) == 0 &&
 4562             pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
 4563             m->md.pat_mode))
 4564                 panic("memory attribute change on the direct map failed");
 4565 }
 4566 
 4567 /*
 4568  * Changes the specified virtual address range's memory type to that given by
 4569  * the parameter "mode".  The specified virtual address range must be
 4570  * completely contained within either the direct map or the kernel map.  If
 4571  * the virtual address range is contained within the kernel map, then the
 4572  * memory type for each of the corresponding ranges of the direct map is also
 4573  * changed.  (The corresponding ranges of the direct map are those ranges that
 4574  * map the same physical pages as the specified virtual address range.)  These
 4575  * changes to the direct map are necessary because Intel describes the
 4576  * behavior of their processors as "undefined" if two or more mappings to the
 4577  * same physical page have different memory types.
 4578  *
 4579  * Returns zero if the change completed successfully, and either EINVAL or
 4580  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
 4581  * of the virtual address range was not mapped, and ENOMEM is returned if
 4582  * there was insufficient memory available to complete the change.  In the
 4583  * latter case, the memory type may have been changed on some part of the
 4584  * virtual address range or the direct map.
 4585  */
 4586 int
 4587 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
 4588 {
 4589         int error;
 4590 
 4591         PMAP_LOCK(kernel_pmap);
 4592         error = pmap_change_attr_locked(va, size, mode);
 4593         PMAP_UNLOCK(kernel_pmap);
 4594         return (error);
 4595 }
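
/*
 * [Editor's note: illustrative sketch, not part of the original pmap.c.]
 * A caller that wants write-combining stores to an already mapped,
 * hypothetical frame buffer at kernel VA "fb_va" of length "fb_size" might
 * do the following and interpret the result as described in the comment
 * above (EINVAL: part of the range was unmapped; ENOMEM: a demotion failed
 * and the change may be partial):
 *
 *	int error;
 *
 *	error = pmap_change_attr(fb_va, fb_size, PAT_WRITE_COMBINING);
 *	if (error != 0)
 *		printf("pmap_change_attr failed: %d\n", error);
 */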
 4596 
 4597 static int
 4598 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
 4599 {
 4600         vm_offset_t base, offset, tmpva;
 4601         vm_paddr_t pa_start, pa_end;
 4602         pdp_entry_t *pdpe;
 4603         pd_entry_t *pde;
 4604         pt_entry_t *pte;
 4605         int cache_bits_pte, cache_bits_pde, error;
 4606         boolean_t changed;
 4607 
 4608         PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
 4609         base = trunc_page(va);
 4610         offset = va & PAGE_MASK;
 4611         size = roundup(offset + size, PAGE_SIZE);
 4612 
 4613         /*
 4614          * Only supported on kernel virtual addresses, including the direct
 4615          * map but excluding the recursive map.
 4616          */
 4617         if (base < DMAP_MIN_ADDRESS)
 4618                 return (EINVAL);
 4619 
 4620         cache_bits_pde = pmap_cache_bits(mode, 1);
 4621         cache_bits_pte = pmap_cache_bits(mode, 0);
 4622         changed = FALSE;
 4623 
 4624         /*
 4625          * Pages that aren't mapped aren't supported.  Also break down 2MB pages
 4626          * into 4KB pages if required.
 4627          */
 4628         for (tmpva = base; tmpva < base + size; ) {
 4629                 pdpe = pmap_pdpe(kernel_pmap, tmpva);
 4630                 if (*pdpe == 0)
 4631                         return (EINVAL);
 4632                 if (*pdpe & PG_PS) {
 4633                         /*
 4634                          * If the current 1GB page already has the required
 4635                          * memory type, then we need not demote this page. Just
 4636                          * increment tmpva to the next 1GB page frame.
 4637                          */
 4638                         if ((*pdpe & PG_PDE_CACHE) == cache_bits_pde) {
 4639                                 tmpva = trunc_1gpage(tmpva) + NBPDP;
 4640                                 continue;
 4641                         }
 4642 
 4643                         /*
 4644                          * If the current offset aligns with a 1GB page frame
 4645                          * and there is at least 1GB left within the range, then
 4646                          * we need not break down this page into 2MB pages.
 4647                          */
 4648                         if ((tmpva & PDPMASK) == 0 &&
 4649                             tmpva + PDPMASK < base + size) {
 4650                                 tmpva += NBPDP;
 4651                                 continue;
 4652                         }
 4653                         if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
 4654                                 return (ENOMEM);
 4655                 }
 4656                 pde = pmap_pdpe_to_pde(pdpe, tmpva);
 4657                 if (*pde == 0)
 4658                         return (EINVAL);
 4659                 if (*pde & PG_PS) {
 4660                         /*
 4661                          * If the current 2MB page already has the required
 4662                          * memory type, then we need not demote this page. Just
 4663                          * increment tmpva to the next 2MB page frame.
 4664                          */
 4665                         if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
 4666                                 tmpva = trunc_2mpage(tmpva) + NBPDR;
 4667                                 continue;
 4668                         }
 4669 
 4670                         /*
 4671                          * If the current offset aligns with a 2MB page frame
 4672                          * and there is at least 2MB left within the range, then
 4673                          * we need not break down this page into 4KB pages.
 4674                          */
 4675                         if ((tmpva & PDRMASK) == 0 &&
 4676                             tmpva + PDRMASK < base + size) {
 4677                                 tmpva += NBPDR;
 4678                                 continue;
 4679                         }
 4680                         if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
 4681                                 return (ENOMEM);
 4682                 }
 4683                 pte = pmap_pde_to_pte(pde, tmpva);
 4684                 if (*pte == 0)
 4685                         return (EINVAL);
 4686                 tmpva += PAGE_SIZE;
 4687         }
 4688         error = 0;
 4689 
 4690         /*
 4691          * Ok, all the pages exist, so run through them updating their
 4692          * cache mode if required.
 4693          */
 4694         pa_start = pa_end = 0;
 4695         for (tmpva = base; tmpva < base + size; ) {
 4696                 pdpe = pmap_pdpe(kernel_pmap, tmpva);
 4697                 if (*pdpe & PG_PS) {
 4698                         if ((*pdpe & PG_PDE_CACHE) != cache_bits_pde) {
 4699                                 pmap_pde_attr(pdpe, cache_bits_pde);
 4700                                 changed = TRUE;
 4701                         }
 4702                         if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
 4703                                 if (pa_start == pa_end) {
 4704                                         /* Start physical address run. */
 4705                                         pa_start = *pdpe & PG_PS_FRAME;
 4706                                         pa_end = pa_start + NBPDP;
 4707                                 } else if (pa_end == (*pdpe & PG_PS_FRAME))
 4708                                         pa_end += NBPDP;
 4709                                 else {
 4710                                         /* Run ended, update direct map. */
 4711                                         error = pmap_change_attr_locked(
 4712                                             PHYS_TO_DMAP(pa_start),
 4713                                             pa_end - pa_start, mode);
 4714                                         if (error != 0)
 4715                                                 break;
 4716                                         /* Start physical address run. */
 4717                                         pa_start = *pdpe & PG_PS_FRAME;
 4718                                         pa_end = pa_start + NBPDP;
 4719                                 }
 4720                         }
 4721                         tmpva = trunc_1gpage(tmpva) + NBPDP;
 4722                         continue;
 4723                 }
 4724                 pde = pmap_pdpe_to_pde(pdpe, tmpva);
 4725                 if (*pde & PG_PS) {
 4726                         if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
 4727                                 pmap_pde_attr(pde, cache_bits_pde);
 4728                                 changed = TRUE;
 4729                         }
 4730                         if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
 4731                                 if (pa_start == pa_end) {
 4732                                         /* Start physical address run. */
 4733                                         pa_start = *pde & PG_PS_FRAME;
 4734                                         pa_end = pa_start + NBPDR;
 4735                                 } else if (pa_end == (*pde & PG_PS_FRAME))
 4736                                         pa_end += NBPDR;
 4737                                 else {
 4738                                         /* Run ended, update direct map. */
 4739                                         error = pmap_change_attr_locked(
 4740                                             PHYS_TO_DMAP(pa_start),
 4741                                             pa_end - pa_start, mode);
 4742                                         if (error != 0)
 4743                                                 break;
 4744                                         /* Start physical address run. */
 4745                                         pa_start = *pde & PG_PS_FRAME;
 4746                                         pa_end = pa_start + NBPDR;
 4747                                 }
 4748                         }
 4749                         tmpva = trunc_2mpage(tmpva) + NBPDR;
 4750                 } else {
 4751                         pte = pmap_pde_to_pte(pde, tmpva);
 4752                         if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
 4753                                 pmap_pte_attr(pte, cache_bits_pte);
 4754                                 changed = TRUE;
 4755                         }
 4756                         if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
 4757                                 if (pa_start == pa_end) {
 4758                                         /* Start physical address run. */
 4759                                         pa_start = *pte & PG_FRAME;
 4760                                         pa_end = pa_start + PAGE_SIZE;
 4761                                 } else if (pa_end == (*pte & PG_FRAME))
 4762                                         pa_end += PAGE_SIZE;
 4763                                 else {
 4764                                         /* Run ended, update direct map. */
 4765                                         error = pmap_change_attr_locked(
 4766                                             PHYS_TO_DMAP(pa_start),
 4767                                             pa_end - pa_start, mode);
 4768                                         if (error != 0)
 4769                                                 break;
 4770                                         /* Start physical address run. */
 4771                                         pa_start = *pte & PG_FRAME;
 4772                                         pa_end = pa_start + PAGE_SIZE;
 4773                                 }
 4774                         }
 4775                         tmpva += PAGE_SIZE;
 4776                 }
 4777         }
 4778         if (error == 0 && pa_start != pa_end)
 4779                 error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
 4780                     pa_end - pa_start, mode);
 4781 
 4782         /*
 4783          * Flush CPU caches if required to make sure any data isn't cached that
 4784          * shouldn't be, etc.
 4785          */
 4786         if (changed) {
 4787                 pmap_invalidate_range(kernel_pmap, base, tmpva);
 4788                 pmap_invalidate_cache_range(base, tmpva);
 4789         }
 4790         return (error);
 4791 }
 4792 
 4793 /*
 4794  * Demotes any mapping within the direct map region that covers more than the
 4795  * specified range of physical addresses.  This range's size must be a power
 4796  * of two and its starting address must be a multiple of its size.  Since the
 4797  * demotion does not change any attributes of the mapping, a TLB invalidation
 4798  * is not mandatory.  The caller may, however, request a TLB invalidation.
 4799  */
 4800 void
 4801 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
 4802 {
 4803         pdp_entry_t *pdpe;
 4804         pd_entry_t *pde;
 4805         vm_offset_t va;
 4806         boolean_t changed;
 4807 
 4808         if (len == 0)
 4809                 return;
 4810         KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
 4811         KASSERT((base & (len - 1)) == 0,
 4812             ("pmap_demote_DMAP: base is not a multiple of len"));
 4813         if (len < NBPDP && base < dmaplimit) {
 4814                 va = PHYS_TO_DMAP(base);
 4815                 changed = FALSE;
 4816                 PMAP_LOCK(kernel_pmap);
 4817                 pdpe = pmap_pdpe(kernel_pmap, va);
 4818                 if ((*pdpe & PG_V) == 0)
 4819                         panic("pmap_demote_DMAP: invalid PDPE");
 4820                 if ((*pdpe & PG_PS) != 0) {
 4821                         if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
 4822                                 panic("pmap_demote_DMAP: PDPE failed");
 4823                         changed = TRUE;
 4824                 }
 4825                 if (len < NBPDR) {
 4826                         pde = pmap_pdpe_to_pde(pdpe, va);
 4827                         if ((*pde & PG_V) == 0)
 4828                                 panic("pmap_demote_DMAP: invalid PDE");
 4829                         if ((*pde & PG_PS) != 0) {
 4830                                 if (!pmap_demote_pde(kernel_pmap, pde, va))
 4831                                         panic("pmap_demote_DMAP: PDE failed");
 4832                                 changed = TRUE;
 4833                         }
 4834                 }
 4835                 if (changed && invalidate)
 4836                         pmap_invalidate_page(kernel_pmap, va);
 4837                 PMAP_UNLOCK(kernel_pmap);
 4838         }
 4839 }
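
/*
 * [Editor's note: illustrative sketch, not part of the original pmap.c.]
 * The assertions above require a naturally aligned, power-of-two range.  A
 * hypothetical caller covering 64KB of physical memory at 4GB would pass:
 *
 *	pmap_demote_DMAP(0x100000000UL, 0x10000, TRUE);
 *
 * Since 0x10000 is smaller than both NBPDP and NBPDR, any 1GB and 2MB
 * direct map mappings covering that range are broken down to 4KB pages, so
 * that a later attribute change confined to those 64KB cannot alter a
 * larger mapping that also covers neighboring physical memory.
 */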
 4840 
 4841 /*
 4842  * perform the pmap work for mincore
 4843  */
 4844 int
 4845 pmap_mincore(pmap_t pmap, vm_offset_t addr)
 4846 {
 4847         pd_entry_t *pdep;
 4848         pt_entry_t pte;
 4849         vm_paddr_t pa;
 4850         vm_page_t m;
 4851         int val = 0;
 4852         
 4853         PMAP_LOCK(pmap);
 4854         pdep = pmap_pde(pmap, addr);
 4855         if (pdep != NULL && (*pdep & PG_V)) {
 4856                 if (*pdep & PG_PS) {
 4857                         pte = *pdep;
 4858                         val = MINCORE_SUPER;
 4859                         /* Compute the physical address of the 4KB page. */
 4860                         pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
 4861                             PG_FRAME;
 4862                 } else {
 4863                         pte = *pmap_pde_to_pte(pdep, addr);
 4864                         pa = pte & PG_FRAME;
 4865                 }
 4866         } else {
 4867                 pte = 0;
 4868                 pa = 0;
 4869         }
 4870         PMAP_UNLOCK(pmap);
 4871 
 4872         if (pte != 0) {
 4873                 val |= MINCORE_INCORE;
 4874                 if ((pte & PG_MANAGED) == 0)
 4875                         return (val);
 4876 
 4877                 m = PHYS_TO_VM_PAGE(pa);
 4878 
 4879                 /*
 4880                  * Modified by us
 4881                  */
 4882                 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 4883                         val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
 4884                 else {
 4885                         /*
 4886                          * Modified by someone else
 4887                          */
 4888                         vm_page_lock_queues();
 4889                         if (m->dirty || pmap_is_modified(m))
 4890                                 val |= MINCORE_MODIFIED_OTHER;
 4891                         vm_page_unlock_queues();
 4892                 }
 4893                 /*
 4894                  * Referenced by us
 4895                  */
 4896                 if (pte & PG_A)
 4897                         val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
 4898                 else {
 4899                         /*
 4900                          * Referenced by someone else
 4901                          */
 4902                         vm_page_lock_queues();
 4903                         if ((m->flags & PG_REFERENCED) ||
 4904                             pmap_ts_referenced(m)) {
 4905                                 val |= MINCORE_REFERENCED_OTHER;
 4906                                 vm_page_flag_set(m, PG_REFERENCED);
 4907                         }
 4908                         vm_page_unlock_queues();
 4909                 }
 4910         } 
 4911         return (val);
 4912 }
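
/*
 * [Editor's note: illustrative sketch, not part of the original pmap.c.]
 * The MINCORE_* bits assembled above reach userland through mincore(2) as
 * one status byte per page, e.g.:
 *
 *	char vec[1];
 *
 *	if (mincore(addr, getpagesize(), vec) == 0 &&
 *	    (vec[0] & MINCORE_INCORE) != 0)
 *		printf("resident%s\n",
 *		    (vec[0] & MINCORE_SUPER) != 0 ? " (2MB mapping)" : "");
 *
 * MINCORE_SUPER is reported here when the address is covered by a 2MB
 * page directory mapping.
 */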
 4913 
 4914 void
 4915 pmap_activate(struct thread *td)
 4916 {
 4917         pmap_t  pmap, oldpmap;
 4918         u_int64_t  cr3;
 4919 
 4920         critical_enter();
 4921         pmap = vmspace_pmap(td->td_proc->p_vmspace);
 4922         oldpmap = PCPU_GET(curpmap);
 4923 #ifdef SMP
 4924         atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
 4925         atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
 4926 #else
 4927         oldpmap->pm_active &= ~PCPU_GET(cpumask);
 4928         pmap->pm_active |= PCPU_GET(cpumask);
 4929 #endif
 4930         cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4);
 4931         td->td_pcb->pcb_cr3 = cr3;
 4932         load_cr3(cr3);
 4933         PCPU_SET(curpmap, pmap);
 4934         critical_exit();
 4935 }
 4936 
 4937 void
 4938 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
 4939 {
 4940 }
 4941 
 4942 /*
 4943  *      Increase the starting virtual address of the given mapping if a
 4944  *      different alignment might result in more superpage mappings.
 4945  */
 4946 void
 4947 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
 4948     vm_offset_t *addr, vm_size_t size)
 4949 {
 4950         vm_offset_t superpage_offset;
 4951 
 4952         if (size < NBPDR)
 4953                 return;
 4954         if (object != NULL && (object->flags & OBJ_COLORED) != 0)
 4955                 offset += ptoa(object->pg_color);
 4956         superpage_offset = offset & PDRMASK;
 4957         if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
 4958             (*addr & PDRMASK) == superpage_offset)
 4959                 return;
 4960         if ((*addr & PDRMASK) < superpage_offset)
 4961                 *addr = (*addr & ~PDRMASK) + superpage_offset;
 4962         else
 4963                 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
 4964 }
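
/*
 * [Editor's note: illustrative worked example, not part of the original
 * pmap.c.]  On amd64 NBPDR is 2MB and PDRMASK is 0x1fffff.  Suppose the
 * (color-adjusted) object offset is 0x345000, the caller proposed
 * *addr = 0x800612000, and size is several megabytes, so the early returns
 * above are not taken:
 *
 *	superpage_offset = 0x345000 & 0x1fffff = 0x145000
 *	*addr & PDRMASK  = 0x012000  (less than 0x145000)
 *	*addr            = (0x800612000 & ~0x1fffff) + 0x145000
 *	                 = 0x800745000
 *
 * After the adjustment the virtual address and the object offset agree
 * modulo 2MB, which is the alignment that later superpage promotion
 * relies on.
 */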
