The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/amd64/amd64/pmap.c

Version: -  FREEBSD  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-2  -  FREEBSD-11-1  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-4  -  FREEBSD-10-3  -  FREEBSD-10-2  -  FREEBSD-10-1  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-3  -  FREEBSD-9-2  -  FREEBSD-9-1  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-4  -  FREEBSD-8-3  -  FREEBSD-8-2  -  FREEBSD-8-1  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-4  -  FREEBSD-7-3  -  FREEBSD-7-2  -  FREEBSD-7-1  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-4  -  FREEBSD-6-3  -  FREEBSD-6-2  -  FREEBSD-6-1  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-5  -  FREEBSD-5-4  -  FREEBSD-5-3  -  FREEBSD-5-2  -  FREEBSD-5-1  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD5  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  xnu-1456.1.26  -  xnu-1699.24.8  -  xnu-2050.18.24  -  OPENSOLARIS  -  minix-3-1-1 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 1991 Regents of the University of California.
    3  * All rights reserved.
    4  * Copyright (c) 1994 John S. Dyson
    5  * All rights reserved.
    6  * Copyright (c) 1994 David Greenman
    7  * All rights reserved.
    8  * Copyright (c) 2003 Peter Wemm
    9  * All rights reserved.
   10  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
   11  * All rights reserved.
   12  *
   13  * This code is derived from software contributed to Berkeley by
   14  * the Systems Programming Group of the University of Utah Computer
   15  * Science Department and William Jolitz of UUNET Technologies Inc.
   16  *
   17  * Redistribution and use in source and binary forms, with or without
   18  * modification, are permitted provided that the following conditions
   19  * are met:
   20  * 1. Redistributions of source code must retain the above copyright
   21  *    notice, this list of conditions and the following disclaimer.
   22  * 2. Redistributions in binary form must reproduce the above copyright
   23  *    notice, this list of conditions and the following disclaimer in the
   24  *    documentation and/or other materials provided with the distribution.
   25  * 3. All advertising materials mentioning features or use of this software
   26  *    must display the following acknowledgement:
   27  *      This product includes software developed by the University of
   28  *      California, Berkeley and its contributors.
   29  * 4. Neither the name of the University nor the names of its contributors
   30  *    may be used to endorse or promote products derived from this software
   31  *    without specific prior written permission.
   32  *
   33  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   34  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   35  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   36  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   37  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   38  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   39  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   40  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   41  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   42  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   43  * SUCH DAMAGE.
   44  *
   45  *      from:   @(#)pmap.c      7.7 (Berkeley)  5/12/91
   46  */
   47 /*-
   48  * Copyright (c) 2003 Networks Associates Technology, Inc.
   49  * All rights reserved.
   50  *
   51  * This software was developed for the FreeBSD Project by Jake Burkholder,
   52  * Safeport Network Services, and Network Associates Laboratories, the
   53  * Security Research Division of Network Associates, Inc. under
   54  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
   55  * CHATS research program.
   56  *
   57  * Redistribution and use in source and binary forms, with or without
   58  * modification, are permitted provided that the following conditions
   59  * are met:
   60  * 1. Redistributions of source code must retain the above copyright
   61  *    notice, this list of conditions and the following disclaimer.
   62  * 2. Redistributions in binary form must reproduce the above copyright
   63  *    notice, this list of conditions and the following disclaimer in the
   64  *    documentation and/or other materials provided with the distribution.
   65  *
   66  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   67  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   68  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   69  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   70  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   71  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   72  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   73  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   74  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   75  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   76  * SUCH DAMAGE.
   77  */
   78 
   79 #define AMD64_NPT_AWARE
   80 
   81 #include <sys/cdefs.h>
   82 __FBSDID("$FreeBSD: releng/10.0/sys/amd64/amd64/pmap.c 267829 2014-06-24 19:05:08Z delphij $");
   83 
   84 /*
   85  *      Manages physical address maps.
   86  *
   87  *      Since the information managed by this module is
   88  *      also stored by the logical address mapping module,
   89  *      this module may throw away valid virtual-to-physical
   90  *      mappings at almost any time.  However, invalidations
   91  *      of virtual-to-physical mappings must be done as
   92  *      requested.
   93  *
   94  *      In order to cope with hardware architectures which
   95  *      make virtual-to-physical map invalidates expensive,
   96  *      this module may delay invalidate or reduced protection
   97  *      operations until such time as they are actually
   98  *      necessary.  This module is given full information as
   99  *      to which processors are currently using which maps,
  100  *      and to when physical maps must be made correct.
  101  */
  102 
  103 #include "opt_pmap.h"
  104 #include "opt_vm.h"
  105 
  106 #include <sys/param.h>
  107 #include <sys/bus.h>
  108 #include <sys/systm.h>
  109 #include <sys/kernel.h>
  110 #include <sys/ktr.h>
  111 #include <sys/lock.h>
  112 #include <sys/malloc.h>
  113 #include <sys/mman.h>
  114 #include <sys/mutex.h>
  115 #include <sys/proc.h>
  116 #include <sys/rwlock.h>
  117 #include <sys/sx.h>
  118 #include <sys/vmmeter.h>
  119 #include <sys/sched.h>
  120 #include <sys/sysctl.h>
  121 #include <sys/_unrhdr.h>
  122 #include <sys/smp.h>
  123 
  124 #include <vm/vm.h>
  125 #include <vm/vm_param.h>
  126 #include <vm/vm_kern.h>
  127 #include <vm/vm_page.h>
  128 #include <vm/vm_map.h>
  129 #include <vm/vm_object.h>
  130 #include <vm/vm_extern.h>
  131 #include <vm/vm_pageout.h>
  132 #include <vm/vm_pager.h>
  133 #include <vm/vm_radix.h>
  134 #include <vm/vm_reserv.h>
  135 #include <vm/uma.h>
  136 
  137 #include <machine/intr_machdep.h>
  138 #include <machine/apicvar.h>
  139 #include <machine/cpu.h>
  140 #include <machine/cputypes.h>
  141 #include <machine/md_var.h>
  142 #include <machine/pcb.h>
  143 #include <machine/specialreg.h>
  144 #ifdef SMP
  145 #include <machine/smp.h>
  146 #endif
  147 
  148 static __inline boolean_t
  149 pmap_emulate_ad_bits(pmap_t pmap)
  150 {
  151 
  152         return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
  153 }
  154 
  155 static __inline pt_entry_t
  156 pmap_valid_bit(pmap_t pmap)
  157 {
  158         pt_entry_t mask;
  159 
  160         switch (pmap->pm_type) {
  161         case PT_X86:
  162                 mask = X86_PG_V;
  163                 break;
  164         case PT_EPT:
  165                 if (pmap_emulate_ad_bits(pmap))
  166                         mask = EPT_PG_EMUL_V;
  167                 else
  168                         mask = EPT_PG_READ;
  169                 break;
  170         default:
  171                 panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
  172         }
  173 
  174         return (mask);
  175 }
  176 
  177 static __inline pt_entry_t
  178 pmap_rw_bit(pmap_t pmap)
  179 {
  180         pt_entry_t mask;
  181 
  182         switch (pmap->pm_type) {
  183         case PT_X86:
  184                 mask = X86_PG_RW;
  185                 break;
  186         case PT_EPT:
  187                 if (pmap_emulate_ad_bits(pmap))
  188                         mask = EPT_PG_EMUL_RW;
  189                 else
  190                         mask = EPT_PG_WRITE;
  191                 break;
  192         default:
  193                 panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
  194         }
  195 
  196         return (mask);
  197 }
  198 
  199 static __inline pt_entry_t
  200 pmap_global_bit(pmap_t pmap)
  201 {
  202         pt_entry_t mask;
  203 
  204         switch (pmap->pm_type) {
  205         case PT_X86:
  206                 mask = X86_PG_G;
  207                 break;
  208         case PT_EPT:
  209                 mask = 0;
  210                 break;
  211         default:
  212                 panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
  213         }
  214 
  215         return (mask);
  216 }
  217 
  218 static __inline pt_entry_t
  219 pmap_accessed_bit(pmap_t pmap)
  220 {
  221         pt_entry_t mask;
  222 
  223         switch (pmap->pm_type) {
  224         case PT_X86:
  225                 mask = X86_PG_A;
  226                 break;
  227         case PT_EPT:
  228                 if (pmap_emulate_ad_bits(pmap))
  229                         mask = EPT_PG_READ;
  230                 else
  231                         mask = EPT_PG_A;
  232                 break;
  233         default:
  234                 panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
  235         }
  236 
  237         return (mask);
  238 }
  239 
  240 static __inline pt_entry_t
  241 pmap_modified_bit(pmap_t pmap)
  242 {
  243         pt_entry_t mask;
  244 
  245         switch (pmap->pm_type) {
  246         case PT_X86:
  247                 mask = X86_PG_M;
  248                 break;
  249         case PT_EPT:
  250                 if (pmap_emulate_ad_bits(pmap))
  251                         mask = EPT_PG_WRITE;
  252                 else
  253                         mask = EPT_PG_M;
  254                 break;
  255         default:
  256                 panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
  257         }
  258 
  259         return (mask);
  260 }
  261 
  262 #if !defined(DIAGNOSTIC)
  263 #ifdef __GNUC_GNU_INLINE__
  264 #define PMAP_INLINE     __attribute__((__gnu_inline__)) inline
  265 #else
  266 #define PMAP_INLINE     extern inline
  267 #endif
  268 #else
  269 #define PMAP_INLINE
  270 #endif
  271 
  272 #ifdef PV_STATS
  273 #define PV_STAT(x)      do { x ; } while (0)
  274 #else
  275 #define PV_STAT(x)      do { } while (0)
  276 #endif
  277 
/*
 * pa_index(pa) maps a physical address to the index of the PDRSHIFT-sized
 * region that contains it; pa_to_pvh(pa) yields the pv_table entry for
 * that region.
 */
#define pa_index(pa)    ((pa) >> PDRSHIFT)
#define pa_to_pvh(pa)   (&pv_table[pa_index(pa)])

/* Number of entries in the pv_list_locks[] array. */
#define NPV_LIST_LOCKS  MAXCPU

/*
 * Select the pv list lock for a physical address: the region index is
 * hashed onto pv_list_locks[] by taking it modulo NPV_LIST_LOCKS.
 */
#define PHYS_TO_PV_LIST_LOCK(pa)        \
                        (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])

/*
 * Make *lockp refer to the write-locked pv list lock covering "pa".
 * If *lockp already is that lock nothing happens.  Otherwise any lock
 * currently recorded in *lockp is write-unlocked first, then the new lock
 * is stored in *lockp and write-locked.  *lockp may be NULL on entry.
 */
#define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)  do {    \
        struct rwlock **_lockp = (lockp);               \
        struct rwlock *_new_lock;                       \
                                                        \
        _new_lock = PHYS_TO_PV_LIST_LOCK(pa);           \
        if (_new_lock != *_lockp) {                     \
                if (*_lockp != NULL)                    \
                        rw_wunlock(*_lockp);            \
                *_lockp = _new_lock;                    \
                rw_wlock(*_lockp);                      \
        }                                               \
} while (0)

/* Same as CHANGE_PV_LIST_LOCK_TO_PHYS(), keyed by a vm_page's address. */
#define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)        \
                        CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

/*
 * Write-unlock the lock recorded in *lockp, if any, and reset *lockp to
 * NULL.
 */
#define RELEASE_PV_LIST_LOCK(lockp)             do {    \
        struct rwlock **_lockp = (lockp);               \
                                                        \
        if (*_lockp != NULL) {                          \
                rw_wunlock(*_lockp);                    \
                *_lockp = NULL;                         \
        }                                               \
} while (0)

/* Select the pv list lock for the physical address of vm_page "m". */
#define VM_PAGE_TO_PV_LIST_LOCK(m)      \
                        PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
  313 
/* NOTE(review): presumably the storage behind the kernel's pmap; confirm. */
struct pmap kernel_pmap_store;

vm_offset_t virtual_avail;      /* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;        /* VA of last avail page (end of kernel AS) */

/* Set by nkpt_init(); exported read-only as machdep.nkpt. */
int nkpt;
SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
    "Number of kernel page table pages allocated on bootup");

/* Number of direct map PDP entries; computed in create_pagetables(). */
static int ndmpdp;
/* ndmpdp << PDPSHIFT; computed in create_pagetables(). */
static vm_paddr_t dmaplimit;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
pt_entry_t pg_nx;

/* Parent node for the vm.pmap.* sysctls declared below. */
static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");

static int pat_works = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
    "Is page attribute table fully functional?");

static int pg_ps_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
    "Are large page mappings enabled?");

#define PAT_INDEX_SIZE  8
static int pat_index[PAT_INDEX_SIZE];   /* cache mode to PAT index conversion */

/*
 * Physical addresses of the bootstrap page table pages; all of these are
 * assigned in create_pagetables().
 */
static u_int64_t        KPTphys;        /* phys addr of kernel level 1 */
static u_int64_t        KPDphys;        /* phys addr of kernel level 2 */
u_int64_t               KPDPphys;       /* phys addr of kernel level 3 */
u_int64_t               KPML4phys;      /* phys addr of kernel level 4 */

static u_int64_t        DMPDphys;       /* phys addr of direct mapped level 2 */
static u_int64_t        DMPDPphys;      /* phys addr of direct mapped level 3 */
static int              ndmpdpphys;     /* number of DMPDPphys pages */

static struct rwlock_padalign pvh_global_lock;

/*
 * Data for the pv entry allocation mechanism
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx pv_chunks_mutex;
/* Lock array indexed through PHYS_TO_PV_LIST_LOCK(). */
static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
/* Array indexed through pa_to_pvh(). */
static struct md_page *pv_table;

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = 0;
caddr_t CADDR1 = 0;

static int pmap_flags = PMAP_PDE_SUPERPAGE;     /* flags for x86 pmaps */

/* PCID (TLB context ID) state and the sysctls reporting it. */
static struct unrhdr pcid_unr;
static struct mtx pcid_mtx;
int pmap_pcid_enabled = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN, &pmap_pcid_enabled,
    0, "Is TLB Context ID enabled ?");
int invpcid_works = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
    "Is the invpcid instruction available ?");
  377 static int
  378 pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
  379 {
  380         int i;
  381         uint64_t res;
  382 
  383         res = 0;
  384         CPU_FOREACH(i) {
  385                 res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
  386         }
  387         return (sysctl_handle_64(oidp, &res, 0, req));
  388 }
  389 SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
  390     CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
  391     "Count of saved TLB context on switch");
  392 
/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;

/*
 * Forward declarations for file-local helpers.
 *
 * pv entry / pv chunk management.  The "lockp" arguments are pointers to
 * a pv list lock pointer, the same shape the CHANGE_PV_LIST_LOCK_*() and
 * RELEASE_PV_LIST_LOCK() macros operate on.
 */
static void     free_pv_chunk(struct pv_chunk *pc);
static void     free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static int      popcnt_pc_map_elem(uint64_t elem);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void     reserve_pv_entries(pmap_t pmap, int needed,
                    struct rwlock **lockp);
static void     pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
                    struct rwlock **lockp);
static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
                    struct rwlock **lockp);
static void     pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
                    struct rwlock **lockp);
static void     pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
                    vm_offset_t va);

/* Mapping manipulation helpers. */
static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
    vm_offset_t va, struct rwlock **lockp);
static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
    vm_offset_t va);
static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot, struct rwlock **lockp);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
    struct rwlock **lockp);
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
    vm_prot_t prot);
static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    struct spglist *free, struct rwlock **lockp);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    struct spglist *free);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    pd_entry_t newpde);
static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);

/* Page table page allocation. */
static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
                struct rwlock **lockp);
static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
                struct rwlock **lockp);
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
                struct rwlock **lockp);

/* Page table page release, plus the KVA alignment helper defined below. */
static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
  459 
  460 /*
  461  * Move the kernel virtual free pointer to the next
  462  * 2MB.  This is used to help improve performance
  463  * by using a large (2MB) page for much of the kernel
  464  * (.text, .data, .bss)
  465  */
  466 static vm_offset_t
  467 pmap_kmem_choose(vm_offset_t addr)
  468 {
  469         vm_offset_t newaddr = addr;
  470 
  471         newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
  472         return (newaddr);
  473 }
  474 
  475 /********************/
  476 /* Inline functions */
  477 /********************/
  478 
  479 /* Return a non-clipped PD index for a given VA */
  480 static __inline vm_pindex_t
  481 pmap_pde_pindex(vm_offset_t va)
  482 {
  483         return (va >> PDRSHIFT);
  484 }
  485 
  486 
  487 /* Return various clipped indexes for a given VA */
  488 static __inline vm_pindex_t
  489 pmap_pte_index(vm_offset_t va)
  490 {
  491 
  492         return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
  493 }
  494 
  495 static __inline vm_pindex_t
  496 pmap_pde_index(vm_offset_t va)
  497 {
  498 
  499         return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
  500 }
  501 
  502 static __inline vm_pindex_t
  503 pmap_pdpe_index(vm_offset_t va)
  504 {
  505 
  506         return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
  507 }
  508 
  509 static __inline vm_pindex_t
  510 pmap_pml4e_index(vm_offset_t va)
  511 {
  512 
  513         return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
  514 }
  515 
  516 /* Return a pointer to the PML4 slot that corresponds to a VA */
  517 static __inline pml4_entry_t *
  518 pmap_pml4e(pmap_t pmap, vm_offset_t va)
  519 {
  520 
  521         return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
  522 }
  523 
  524 /* Return a pointer to the PDP slot that corresponds to a VA */
  525 static __inline pdp_entry_t *
  526 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
  527 {
  528         pdp_entry_t *pdpe;
  529 
  530         pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
  531         return (&pdpe[pmap_pdpe_index(va)]);
  532 }
  533 
  534 /* Return a pointer to the PDP slot that corresponds to a VA */
  535 static __inline pdp_entry_t *
  536 pmap_pdpe(pmap_t pmap, vm_offset_t va)
  537 {
  538         pml4_entry_t *pml4e;
  539         pt_entry_t PG_V;
  540 
  541         PG_V = pmap_valid_bit(pmap);
  542         pml4e = pmap_pml4e(pmap, va);
  543         if ((*pml4e & PG_V) == 0)
  544                 return (NULL);
  545         return (pmap_pml4e_to_pdpe(pml4e, va));
  546 }
  547 
  548 /* Return a pointer to the PD slot that corresponds to a VA */
  549 static __inline pd_entry_t *
  550 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
  551 {
  552         pd_entry_t *pde;
  553 
  554         pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
  555         return (&pde[pmap_pde_index(va)]);
  556 }
  557 
  558 /* Return a pointer to the PD slot that corresponds to a VA */
  559 static __inline pd_entry_t *
  560 pmap_pde(pmap_t pmap, vm_offset_t va)
  561 {
  562         pdp_entry_t *pdpe;
  563         pt_entry_t PG_V;
  564 
  565         PG_V = pmap_valid_bit(pmap);
  566         pdpe = pmap_pdpe(pmap, va);
  567         if (pdpe == NULL || (*pdpe & PG_V) == 0)
  568                 return (NULL);
  569         return (pmap_pdpe_to_pde(pdpe, va));
  570 }
  571 
  572 /* Return a pointer to the PT slot that corresponds to a VA */
  573 static __inline pt_entry_t *
  574 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
  575 {
  576         pt_entry_t *pte;
  577 
  578         pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
  579         return (&pte[pmap_pte_index(va)]);
  580 }
  581 
  582 /* Return a pointer to the PT slot that corresponds to a VA */
  583 static __inline pt_entry_t *
  584 pmap_pte(pmap_t pmap, vm_offset_t va)
  585 {
  586         pd_entry_t *pde;
  587         pt_entry_t PG_V;
  588 
  589         PG_V = pmap_valid_bit(pmap);
  590         pde = pmap_pde(pmap, va);
  591         if (pde == NULL || (*pde & PG_V) == 0)
  592                 return (NULL);
  593         if ((*pde & PG_PS) != 0)        /* compat with i386 pmap_pte() */
  594                 return ((pt_entry_t *)pde);
  595         return (pmap_pde_to_pte(pde, va));
  596 }
  597 
  598 static __inline void
  599 pmap_resident_count_inc(pmap_t pmap, int count)
  600 {
  601 
  602         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  603         pmap->pm_stats.resident_count += count;
  604 }
  605 
  606 static __inline void
  607 pmap_resident_count_dec(pmap_t pmap, int count)
  608 {
  609 
  610         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  611         pmap->pm_stats.resident_count -= count;
  612 }
  613 
  614 PMAP_INLINE pt_entry_t *
  615 vtopte(vm_offset_t va)
  616 {
  617         u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
  618 
  619         KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
  620 
  621         return (PTmap + ((va >> PAGE_SHIFT) & mask));
  622 }
  623 
  624 static __inline pd_entry_t *
  625 vtopde(vm_offset_t va)
  626 {
  627         u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
  628 
  629         KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
  630 
  631         return (PDmap + ((va >> PDRSHIFT) & mask));
  632 }
  633 
  634 static u_int64_t
  635 allocpages(vm_paddr_t *firstaddr, int n)
  636 {
  637         u_int64_t ret;
  638 
  639         ret = *firstaddr;
  640         bzero((void *)ret, n * PAGE_SIZE);
  641         *firstaddr += n * PAGE_SIZE;
  642         return (ret);
  643 }
  644 
/* Compile-time check: the number of direct map PML4 slots is a power of 2. */
CTASSERT(powerof2(NDMPML4E));

/*
 * number of kernel PDP slots: one per NPDEPG page table pages, rounded up.
 * create_pagetables() uses this both as the number of PD pages to allocate
 * and as the number of PDP entries that point at them.
 */
#define NKPDPE(ptpgs)           howmany((ptpgs), NPDEPG)
  649 
  650 static void
  651 nkpt_init(vm_paddr_t addr)
  652 {
  653         int pt_pages;
  654         
  655 #ifdef NKPT
  656         pt_pages = NKPT;
  657 #else
  658         pt_pages = howmany(addr, 1 << PDRSHIFT);
  659         pt_pages += NKPDPE(pt_pages);
  660 
  661         /*
  662          * Add some slop beyond the bare minimum required for bootstrapping
  663          * the kernel.
  664          *
  665          * This is quite important when allocating KVA for kernel modules.
  666          * The modules are required to be linked in the negative 2GB of
  667          * the address space.  If we run out of KVA in this region then
  668          * pmap_growkernel() will need to allocate page table pages to map
  669          * the entire 512GB of KVA space which is an unnecessary tax on
  670          * physical memory.
  671          */
  672         pt_pages += 8;          /* 16MB additional slop for kernel modules */
  673 #endif
  674         nkpt = pt_pages;
  675 }
  676 
  677 static void
  678 create_pagetables(vm_paddr_t *firstaddr)
  679 {
  680         int i, j, ndm1g, nkpdpe;
  681         pt_entry_t *pt_p;
  682         pd_entry_t *pd_p;
  683         pdp_entry_t *pdp_p;
  684         pml4_entry_t *p4_p;
  685 
  686         /* Allocate page table pages for the direct map */
  687         ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
  688         if (ndmpdp < 4)         /* Minimum 4GB of dirmap */
  689                 ndmpdp = 4;
  690         ndmpdpphys = howmany(ndmpdp, NPDPEPG);
  691         if (ndmpdpphys > NDMPML4E) {
  692                 /*
  693                  * Each NDMPML4E allows 512 GB, so limit to that,
  694                  * and then readjust ndmpdp and ndmpdpphys.
  695                  */
  696                 printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
  697                 Maxmem = atop(NDMPML4E * NBPML4);
  698                 ndmpdpphys = NDMPML4E;
  699                 ndmpdp = NDMPML4E * NPDEPG;
  700         }
  701         DMPDPphys = allocpages(firstaddr, ndmpdpphys);
  702         ndm1g = 0;
  703         if ((amd_feature & AMDID_PAGE1GB) != 0)
  704                 ndm1g = ptoa(Maxmem) >> PDPSHIFT;
  705         if (ndm1g < ndmpdp)
  706                 DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
  707         dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
  708 
  709         /* Allocate pages */
  710         KPML4phys = allocpages(firstaddr, 1);
  711         KPDPphys = allocpages(firstaddr, NKPML4E);
  712 
  713         /*
  714          * Allocate the initial number of kernel page table pages required to
  715          * bootstrap.  We defer this until after all memory-size dependent
  716          * allocations are done (e.g. direct map), so that we don't have to
  717          * build in too much slop in our estimate.
  718          *
  719          * Note that when NKPML4E > 1, we have an empty page underneath
  720          * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
  721          * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
  722          */
  723         nkpt_init(*firstaddr);
  724         nkpdpe = NKPDPE(nkpt);
  725 
  726         KPTphys = allocpages(firstaddr, nkpt);
  727         KPDphys = allocpages(firstaddr, nkpdpe);
  728 
  729         /* Fill in the underlying page table pages */
  730         /* Nominally read-only (but really R/W) from zero to physfree */
  731         /* XXX not fully used, underneath 2M pages */
  732         pt_p = (pt_entry_t *)KPTphys;
  733         for (i = 0; ptoa(i) < *firstaddr; i++)
  734                 pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;
  735 
  736         /* Now map the page tables at their location within PTmap */
  737         pd_p = (pd_entry_t *)KPDphys;
  738         for (i = 0; i < nkpt; i++)
  739                 pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
  740 
  741         /* Map from zero to end of allocations under 2M pages */
  742         /* This replaces some of the KPTphys entries above */
  743         for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
  744                 pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
  745                     X86_PG_G;
  746 
  747         /* And connect up the PD to the PDP (leaving room for L4 pages) */
  748         pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
  749         for (i = 0; i < nkpdpe; i++)
  750                 pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
  751                     PG_U;
  752 
  753         /*
  754          * Now, set up the direct map region using 2MB and/or 1GB pages.  If
  755          * the end of physical memory is not aligned to a 1GB page boundary,
  756          * then the residual physical memory is mapped with 2MB pages.  Later,
  757          * if pmap_mapdev{_attr}() uses the direct map for non-write-back
  758          * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
  759          * that are partially used. 
  760          */
  761         pd_p = (pd_entry_t *)DMPDphys;
  762         for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
  763                 pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
  764                 /* Preset PG_M and PG_A because demotion expects it. */
  765                 pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
  766                     X86_PG_M | X86_PG_A;
  767         }
  768         pdp_p = (pdp_entry_t *)DMPDPphys;
  769         for (i = 0; i < ndm1g; i++) {
  770                 pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
  771                 /* Preset PG_M and PG_A because demotion expects it. */
  772                 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
  773                     X86_PG_M | X86_PG_A;
  774         }
  775         for (j = 0; i < ndmpdp; i++, j++) {
  776                 pdp_p[i] = DMPDphys + ptoa(j);
  777                 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U;
  778         }
  779 
  780         /* And recursively map PML4 to itself in order to get PTmap */
  781         p4_p = (pml4_entry_t *)KPML4phys;
  782         p4_p[PML4PML4I] = KPML4phys;
  783         p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U;
  784 
  785         /* Connect the Direct Map slot(s) up to the PML4. */
  786         for (i = 0; i < ndmpdpphys; i++) {
  787                 p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
  788                 p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U;
  789         }
  790 
  791         /* Connect the KVA slots up to the PML4 */
  792         for (i = 0; i < NKPML4E; i++) {
  793                 p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
  794                 p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U;
  795         }
  796 }
  797 
  798 /*
  799  *      Bootstrap the system enough to run with virtual memory.
  800  *
  801  *      On amd64 this is called after mapping has already been enabled
  802  *      and just syncs the pmap module with what has already been done.
  803  *      [We can't call it easily with mapping off since the kernel is not
  804  *      mapped with PA == VA, hence we would have to relocate every address
  805  *      from the linked base (virtual) address "KERNBASE" to the actual
  806  *      (physical) address starting relative to 0]
       *
       *      firstaddr is passed through to create_pagetables() and then read
       *      back to derive the first free kernel virtual address
       *      (KERNBASE + *firstaddr).  Besides loading the new page tables,
       *      this routine initializes kernel_pmap, the global pv list lock,
       *      the CMAP1/crashdump scratch mappings, the PAT MSR and, when both
       *      the CPU and the vm.pmap.pcid_enabled tunable allow it, PCID
       *      support.
  807  */
  808 void
  809 pmap_bootstrap(vm_paddr_t *firstaddr)
  810 {
  811         vm_offset_t va;
  812         pt_entry_t *pte, *unused;
  813 
  814         /*
  815          * Create an initial set of page tables to run the kernel in.
  816          */
  817         create_pagetables(firstaddr);
  818 
              /*
               * Kernel VA allocation starts at KERNBASE + *firstaddr; the
               * value returned by pmap_kmem_choose() is what is actually used
               * (its policy is not visible in this part of the file).
               */
  819         virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
  820         virtual_avail = pmap_kmem_choose(virtual_avail);
  821 
  822         virtual_end = VM_MAX_KERNEL_ADDRESS;
  823 
  824 
              /*
               * Set CR4.PGE and CR4.PSE, then point %cr3 at the PML4 that
               * create_pagetables() filled in.  CR4.SMEP is set afterwards,
               * and only if the CPU advertises it.
               */
  825         /* XXX do %cr0 as well */
  826         load_cr4(rcr4() | CR4_PGE | CR4_PSE);
  827         load_cr3(KPML4phys);
  828         if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
  829                 load_cr4(rcr4() | CR4_SMEP);
  830 
  831         /*
  832          * Initialize the kernel pmap (which is statically allocated).
  833          */
  834         PMAP_LOCK_INIT(kernel_pmap);
  835         kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
  836         kernel_pmap->pm_cr3 = KPML4phys;
  837         CPU_FILL(&kernel_pmap->pm_active);      /* don't allow deactivation */
  838         CPU_ZERO(&kernel_pmap->pm_save);
  839         TAILQ_INIT(&kernel_pmap->pm_pvchunk);
  840         kernel_pmap->pm_flags = pmap_flags;
  841 
  842         /*
  843          * Initialize the global pv list lock.
  844          */
  845         rw_init(&pvh_global_lock, "pmap pv global");
  846 
  847         /*
  848          * Reserve some special page table entries/VA space for temporary
  849          * mapping of pages.
               *
               * SYSMAP(c, p, v, n) hands out the next n pages of VA: v gets
               * the current va cast to type c, p gets the matching PTE
               * pointer, and both cursors (va, pte) advance by n pages/entries.
  850          */
  851 #define SYSMAP(c, p, v, n)      \
  852         v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
  853 
  854         va = virtual_avail;
  855         pte = vtopte(va);
  856 
  857         /*
  858          * CMAP1 is only used for the memory test.
  859          */
  860         SYSMAP(caddr_t, CMAP1, CADDR1, 1)
  861 
  862         /*
  863          * Crashdump maps.
               *
               * The PTE pointer is not needed by this function, so it is
               * parked in "unused".
  864          */
  865         SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
  866 
              /* Everything handed out above is no longer available. */
  867         virtual_avail = va;
  868 
  869         /* Initialize the PAT MSR. */
  870         pmap_init_pat();
  871 
  872         /* Initialize TLB Context Id. */
              /*
               * PCID is turned on only when both the CPU feature bit and the
               * vm.pmap.pcid_enabled tunable are set.  The unit-number
               * allocator is initialized with the bounds 1 and (1 << 12) - 1,
               * and the kernel pmap takes PCID 0.
               * NOTE(review): on !SMP kernels pmap_pcid_enabled is cleared
               * again below even though CR4.PCIDE has already been set --
               * confirm this is intentional.
               */
  873         TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
  874         if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
  875                 load_cr4(rcr4() | CR4_PCIDE);
  876                 mtx_init(&pcid_mtx, "pcid", NULL, MTX_DEF);
  877                 init_unrhdr(&pcid_unr, 1, (1 << 12) - 1, &pcid_mtx);
  878                 /* Check for INVPCID support */
  879                 invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID)
  880                     != 0;
  881                 kernel_pmap->pm_pcid = 0;
  882 #ifndef SMP
  883                 pmap_pcid_enabled = 0;
  884 #endif
  885         } else
  886                 pmap_pcid_enabled = 0;
  887 }
  888 
  889 /*
  890  * Setup the PAT MSR.
       *
       * Builds the PAT MSR value together with pat_index[], the table that
       * maps each caching mode to the PAT slot implementing it (-1 when no
       * slot does).  The MSR is then written with PGE and the caches
       * disabled, and with the caches and TLB flushed both before and after
       * the write.  CR0 and CR4 are restored to their entry values at the
       * end.  Panics if the CPU does not advertise PAT.
  891  */
  892 void
  893 pmap_init_pat(void)
  894 {
  895         int pat_table[PAT_INDEX_SIZE];
  896         uint64_t pat_msr;
  897         u_long cr0, cr4;
  898         int i;
  899 
  900         /* PAT is mandatory: panic if this CPU doesn't implement it. */
  901         if ((cpu_feature & CPUID_PAT) == 0)
  902                 panic("no PAT??");
  903 
  904         /* Set default PAT index table. */
              /*
               * -1 marks a mode without a slot.  Until one of the branches
               * below says otherwise, UC-, WC and WP all fall back to slot 3,
               * which holds UC.
               */
  905         for (i = 0; i < PAT_INDEX_SIZE; i++)
  906                 pat_table[i] = -1;
  907         pat_table[PAT_WRITE_BACK] = 0;
  908         pat_table[PAT_WRITE_THROUGH] = 1;
  909         pat_table[PAT_UNCACHEABLE] = 3;
  910         pat_table[PAT_WRITE_COMBINING] = 3;
  911         pat_table[PAT_WRITE_PROTECTED] = 3;
  912         pat_table[PAT_UNCACHED] = 3;
  913 
  914         /* Initialize default PAT entries. */
  915         pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
  916             PAT_VALUE(1, PAT_WRITE_THROUGH) |
  917             PAT_VALUE(2, PAT_UNCACHED) |
  918             PAT_VALUE(3, PAT_UNCACHEABLE) |
  919             PAT_VALUE(4, PAT_WRITE_BACK) |
  920             PAT_VALUE(5, PAT_WRITE_THROUGH) |
  921             PAT_VALUE(6, PAT_UNCACHED) |
  922             PAT_VALUE(7, PAT_UNCACHEABLE);
  923 
  924         if (pat_works) {
  925                 /*
  926                  * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
  927                  * Program 5 and 6 as WP and WC.
  928                  * Leave 4 and 7 as WB and UC.
  929                  */
  930                 pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
  931                 pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
  932                     PAT_VALUE(6, PAT_WRITE_COMBINING);
  933                 pat_table[PAT_UNCACHED] = 2;
  934                 pat_table[PAT_WRITE_PROTECTED] = 5;
  935                 pat_table[PAT_WRITE_COMBINING] = 6;
  936         } else {
  937                 /*
  938                  * Just replace PAT Index 2 with WC instead of UC-.
                       * UC- and WP keep pointing at slot 3 (UC).
  939                  */
  940                 pat_msr &= ~PAT_MASK(2);
  941                 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
  942                 pat_table[PAT_WRITE_COMBINING] = 2;
  943         }
  944 
  945         /* Disable PGE. */
  946         cr4 = rcr4();
  947         load_cr4(cr4 & ~CR4_PGE);
  948 
  949         /* Disable caches (CD = 1, NW = 0). */
  950         cr0 = rcr0();
  951         load_cr0((cr0 & ~CR0_NW) | CR0_CD);
  952 
  953         /* Flushes caches and TLBs. */
  954         wbinvd();
  955         invltlb();
  956 
  957         /* Update PAT and index table. */
  958         wrmsr(MSR_PAT, pat_msr);
  959         for (i = 0; i < PAT_INDEX_SIZE; i++)
  960                 pat_index[i] = pat_table[i];
  961 
  962         /* Flush caches and TLBs again. */
  963         wbinvd();
  964         invltlb();
  965 
  966         /* Restore caches and PGE. */
  967         load_cr0(cr0);
  968         load_cr4(cr4);
  969 }
  970 
  971 /*
       *      Set up the machine-dependent part of a vm_page: the page starts
       *      out with an empty pv list and the write-back caching mode.
  973  */
  974 void
  975 pmap_page_init(vm_page_t m)
  976 {
              struct md_page *md;
  977 
              md = &m->md;
              md->pat_mode = PAT_WRITE_BACK;
              TAILQ_INIT(&md->pv_list);
  980 }
  981 
  982 /*
  983  *      Initialize the pmap module.
  984  *      Called by vm_init, to initialize any structures that the pmap
  985  *      system needs to map virtual memory.
       *
       *      In order: fills in the vm_page entries of the kernel's page
       *      table pages, arms the AMD Family 10h erratum 383 workaround
       *      when running as a guest, reads the vm.pmap.pg_ps_enabled
       *      tunable, initializes the pv chunk mutex and the pv list locks,
       *      and allocates the pv head table used for superpages.
  986  */
  987 void
  988 pmap_init(void)
  989 {
  990         vm_page_t mpte;
  991         vm_size_t s;
  992         int i, pv_npg;
  993 
  994         /*
  995          * Initialize the vm page array entries for the kernel pmap's
  996          * page table pages.
               *
               * The nkpt pages start at KPTphys and are physically
               * contiguous; their pindex values count up from
               * pmap_pde_pindex(KERNBASE).
  997          */ 
  998         for (i = 0; i < nkpt; i++) {
  999                 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
 1000                 KASSERT(mpte >= vm_page_array &&
 1001                     mpte < &vm_page_array[vm_page_array_size],
 1002                     ("pmap_init: page table page is out of range"));
 1003                 mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
 1004                 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
 1005         }
 1006 
 1007         /*
 1008          * If the kernel is running in a virtual machine on an AMD Family 10h
 1009          * processor, then it must assume that MCA is enabled by the virtual
 1010          * machine monitor.
 1011          */
 1012         if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD &&
 1013             CPUID_TO_FAMILY(cpu_id) == 0x10)
 1014                 workaround_erratum383 = 1;
 1015 
 1016         /*
 1017          * Are large page mappings enabled?
               * If so, advertise NBPDR as the second supported page size.
 1018          */
 1019         TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
 1020         if (pg_ps_enabled) {
 1021                 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
 1022                     ("pmap_init: can't assign to pagesizes[1]"));
 1023                 pagesizes[1] = NBPDR;
 1024         }
 1025 
 1026         /*
 1027          * Initialize the pv chunk list mutex.
 1028          */
 1029         mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
 1030 
 1031         /*
 1032          * Initialize the pool of pv list locks.
 1033          */
 1034         for (i = 0; i < NPV_LIST_LOCKS; i++)
 1035                 rw_init(&pv_list_locks[i], "pmap pv list");
 1036 
 1037         /*
 1038          * Calculate the size of the pv head table for superpages.
               *
               * phys_avail[] is walked two entries at a time until the entry
               * at i + 1 is zero; phys_avail[(i - 2) + 1] is then the second
               * entry of the last pair.  That address, rounded up to a 2MB
               * boundary and divided by NBPDR, is the number of table slots.
               * NOTE(review): this indexes phys_avail[-1] if the very first
               * pair is already the terminator -- presumably impossible at
               * this point; confirm.
 1039          */
 1040         for (i = 0; phys_avail[i + 1]; i += 2);
 1041         pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR;
 1042 
 1043         /*
 1044          * Allocate memory for the pv head table for superpages.
               * The table is zeroed by M_ZERO and each list head is then
               * initialized explicitly.
 1045          */
 1046         s = (vm_size_t)(pv_npg * sizeof(struct md_page));
 1047         s = round_page(s);
 1048         pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
 1049             M_WAITOK | M_ZERO);
 1050         for (i = 0; i < pv_npg; i++)
 1051                 TAILQ_INIT(&pv_table[i].pv_list);
 1052 }
 1053 
      /*
       * Read-only (CTLFLAG_RD) statistics counters exported through sysctl
       * under the "pde" and "pdpe" nodes of _vm_pmap.  The code that
       * increments them lies outside this part of the file.
       */
 1054 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
 1055     "2MB page mapping counters");
 1056 
      /* Number of 2MB page demotions. */
 1057 static u_long pmap_pde_demotions;
 1058 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
 1059     &pmap_pde_demotions, 0, "2MB page demotions");
 1060 
      /* Number of 2MB page mappings. */
 1061 static u_long pmap_pde_mappings;
 1062 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
 1063     &pmap_pde_mappings, 0, "2MB page mappings");
 1064 
      /* Number of failed 2MB page promotion attempts. */
 1065 static u_long pmap_pde_p_failures;
 1066 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
 1067     &pmap_pde_p_failures, 0, "2MB page promotion failures");
 1068 
      /* Number of 2MB page promotions. */
 1069 static u_long pmap_pde_promotions;
 1070 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
 1071     &pmap_pde_promotions, 0, "2MB page promotions");
 1072 
 1073 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
 1074     "1GB page mapping counters");
 1075 
      /* Number of 1GB page demotions. */
 1076 static u_long pmap_pdpe_demotions;
 1077 SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
 1078     &pmap_pdpe_demotions, 0, "1GB page demotions");
 1079 
 1080 /***************************************************
 1081  * Low level helper routines.....
 1082  ***************************************************/
 1083 
      /*
       * Swap the PAT bit of "entry" between its PTE and PDE positions and
       * return the result.
       *
       * X86 page tables keep the PAT bit in a different place for PTEs and
       * PDEs (X86_PG_PTE_PAT vs. X86_PG_PDE_PAT), so whichever of the two is
       * set is cleared and the other one is set; an entry with neither bit
       * set is returned unchanged.  EPT entries use one representation for
       * both, so they are returned unchanged as well.  Panics on any other
       * pmap type.
       */
 1084 static pt_entry_t
 1085 pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
 1086 {
 1087         int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
 1088 
 1089         switch (pmap->pm_type) {
 1090         case PT_X86:
 1091                 /* Verify that both PAT bits are not set at the same time */
 1092                 KASSERT((entry & x86_pat_bits) != x86_pat_bits,
 1093                     ("Invalid PAT bits in entry %#lx", entry));
 1094 
 1095                 /* Swap the PAT bits if one of them is set */
 1096                 if ((entry & x86_pat_bits) != 0)
 1097                         entry ^= x86_pat_bits;
 1098                 break;
 1099         case PT_EPT:
 1100                 /*
 1101                  * Nothing to do - the memory attributes are represented
 1102                  * the same way for regular pages and superpages.
 1103                  */
 1104                 break;
 1105         default:
                      /* Report the real function name in the panic message. */
 1106                 panic("pmap_swap_pat: bad pm_type %d", pmap->pm_type);
 1107         }
 1108 
 1109         return (entry);
 1110 }
 1111 
 1112 /*
       * Translate a caching mode into the page table entry bits that select
       * it for the given pmap type.  For X86 page tables the mode is looked
       * up in pat_index[] and the resulting 3-bit PAT index is spread over
       * the PAT, PCD and PWT bits; is_pde picks the PDE position of the PAT
       * bit instead of the PTE one.  For EPT the mode is encoded directly.
       * Panics on an unknown caching mode or an unsupported pmap type.
 1115  */
 1116 static int
 1117 pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
 1118 {
              int bits, slot;

 1121         if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
 1122                 panic("Unknown caching mode %d\n", mode);

              if (pmap->pm_type == PT_X86) {
                      /* PAT index bit 2 -> PAT, bit 1 -> PCD, bit 0 -> PWT. */
                      slot = pat_index[mode];
                      bits = 0;
                      if ((slot & 0x4) != 0)
                              bits |= is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
                      if ((slot & 0x2) != 0)
                              bits |= PG_NC_PCD;
                      if ((slot & 0x1) != 0)
                              bits |= PG_NC_PWT;
              } else if (pmap->pm_type == PT_EPT) {
                      bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
              } else {
                      panic("unsupported pmap type %d", pmap->pm_type);
              }

 1150         return (bits);
 1151 }
 1152 
      /*
       * Return the mask covering every caching-related bit of an entry for
       * the given pmap type.  For X86 page tables is_pde selects the PDE
       * mask over the PTE mask.  Panics on an invalid pmap type.
       */
 1153 static int
 1154 pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
 1155 {
              int cache_mask;

              if (pmap->pm_type == PT_X86) {
                      if (is_pde)
                              cache_mask = X86_PG_PDE_CACHE;
                      else
                              cache_mask = X86_PG_PTE_CACHE;
              } else if (pmap->pm_type == PT_EPT) {
                      cache_mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
              } else {
                      panic("pmap_cache_mask: invalid pm_type %d",
                          pmap->pm_type);
              }

 1169         return (cache_mask);
 1170 }
 1171 
      /*
       * Tell whether large page mappings may be used for this pmap: the
       * pg_ps_enabled switch must be on and the pmap must carry the
       * PMAP_PDE_SUPERPAGE flag.
       */
 1172 static __inline boolean_t
 1173 pmap_ps_enabled(pmap_t pmap)
 1174 {
 1175 
              /* The pmap is not looked at when large pages are off. */
              if (!pg_ps_enabled)
                      return (0);
              return ((pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
 1177 }
 1178 
      /*
       * Store newpde into *pde on behalf of pmap.  X86 pmaps need nothing
       * more; for EPT pmaps the pm_eptgen generation counter is bumped
       * before the store.  Panics on any other pmap type.
       */
 1179 static void
 1180 pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
 1181 {
 1182 
              if (pmap->pm_type == PT_EPT) {
                      /*
                       * XXX
                       * Bumping the generation number here is a little
                       * bogus: it is meant to change when a region of the
                       * address space is invalidated in the page tables,
                       * whereas here the old PDE is still valid and we only
                       * want mappings that use it gone from the TLB.
                       *
                       * It works as expected because "all" host cpus are
                       * rendezvoused, which forces any vcpu context to exit
                       * as a side-effect.
                       */
                      atomic_add_acq_long(&pmap->pm_eptgen, 1);
              } else if (pmap->pm_type != PT_X86) {
                      panic("pmap_update_pde_store: bad pm_type %d",
                          pmap->pm_type);
              }
 1206         pde_store(pde, newpde);
 1207 }
 1208 
 1209 /*
       * Flush the calling processor's TLB after the page size of the mapping
       * at va has been changed in the page table; newpde is the entry that
       * was installed.  No other processor's TLB is touched.  EPT pmaps are
       * ignored here.
       *
       * The calling thread must be pinned to a processor.
 1215  */
 1216 static void
 1217 pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
 1218 {
              pt_entry_t global_bit;

 1221         if (pmap->pm_type == PT_EPT)
 1222                 return;

 1224         KASSERT(pmap->pm_type == PT_X86,
 1225             ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));

              global_bit = pmap_global_bit(pmap);

              if ((newpde & PG_PS) != 0) {
                      /*
                       * Promotion: there are too many 4KB page mappings to
                       * flush individually, so flush them all.
                       */
                      if ((newpde & global_bit) != 0) {
                              /* Global (PG_G) mappings must go as well. */
                              invltlb_globpcid();
                      } else {
                              invltlb();
                      }
              } else {
                      /* Demotion: flush a specific 2MB page mapping. */
                      invlpg(va);
              }
 1245 }
 1246 #ifdef SMP
 1247 
      /*
       * Invalidate the local TLB entry for va that is tagged with pmap's
       * PCID.
       *
       * With a working INVPCID instruction a single address-type invpcid is
       * issued.  Otherwise %cr3 is temporarily switched to pmap's page table
       * inside a critical section, invlpg is executed, and the previous %cr3
       * is reloaded.
       */
 1248 static void
 1249 pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va)
 1250 {
 1251         struct invpcid_descr d;
 1252         uint64_t cr3;
 1253 
 1254         if (invpcid_works) {
 1255                 d.pcid = pmap->pm_pcid;
 1256                 d.pad = 0;
 1257                 d.addr = va;
 1258                 invpcid(&d, INVPCID_ADDR);
 1259                 return;
 1260         }
 1261 
              /*
               * CR3_PCID_SAVE is or'ed into both loads.
               * NOTE(review): presumably this keeps the %cr3 switches
               * themselves from flushing TLB entries -- confirm against the
               * definition of CR3_PCID_SAVE.
               */
 1262         cr3 = rcr3();
 1263         critical_enter();
 1264         load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
 1265         invlpg(va);
 1266         load_cr3(cr3 | CR3_PCID_SAVE);
 1267         critical_exit();
 1268 }
 1269 
 1270 /*
 1271  * For SMP, these functions have to use the IPI mechanism for coherence.
 1272  *
 1273  * N.B.: Before calling any of the following TLB invalidation functions,
 1274  * the calling processor must ensure that all stores updating a non-
 1275  * kernel page table are globally performed.  Otherwise, another
 1276  * processor could cache an old, pre-update entry without being
 1277  * invalidated.  This can happen one of two ways: (1) The pmap becomes
 1278  * active on another processor after its pm_active field is checked by
 1279  * one of the following functions but before a store updating the page
 1280  * table is globally performed. (2) The pmap becomes active on another
 1281  * processor before its pm_active field is checked but due to
 1282  * speculative loads one of the following functions still reads the
 1283  * pmap as inactive on the other processor.
 1284  * 
 1285  * The kernel page table is exempt because its pm_active field is
 1286  * immutable.  The kernel page table is always active on every
 1287  * processor.
 1288  */
 1289 
 1290 /*
 1291  * Interrupt the cpus that are executing in the guest context.
 1292  * This will force the vcpu to exit and the cached EPT mappings
 1293  * will be invalidated by the host before the next vmresume.
       *
       * The thread is pinned for the duration of the call, and the calling
       * cpu is asserted not to be a member of pmap->pm_active.
 1294  */
 1295 static __inline void
 1296 pmap_invalidate_ept(pmap_t pmap)
 1297 {
 1298 
 1299         sched_pin();
 1300         KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
 1301             ("pmap_invalidate_ept: absurd pm_active"));
 1302 
 1303         /*
 1304          * The TLB mappings associated with a vcpu context are not
 1305          * flushed each time a different vcpu is chosen to execute.
 1306          *
 1307          * This is in contrast with a process's vtop mappings that
 1308          * are flushed from the TLB on each context switch.
 1309          *
 1310          * Therefore we need to do more than just a TLB shootdown on
 1311          * the active cpus in 'pmap->pm_active'. To do this we keep
 1312          * track of the number of invalidations performed on this pmap.
 1313          *
 1314          * Each vcpu keeps a cache of this counter and compares it
 1315          * just before a vmresume. If the counter is out-of-date an
 1316          * invept will be done to flush stale mappings from the TLB.
 1317          */
 1318         atomic_add_acq_long(&pmap->pm_eptgen, 1);
 1319 
 1320         /*
 1321          * Force the vcpu to exit and trap back into the hypervisor.
 1322          *
 1323          * XXX this is not optimal because IPI_AST builds a trapframe
 1324          * whereas all we need is an 'eoi' followed by 'iret'.
 1325          */
              /* The counter is bumped before the IPI is sent. */
 1326         ipi_selected(pmap->pm_active, IPI_AST);
 1327         sched_unpin();
 1328 }
 1329 
      /*
       * Invalidate the TLB entry for va in pmap, both locally and on the
       * other processors selected below.  EPT pmaps are handed to
       * pmap_invalidate_ept().  For X86 pmaps the local TLB is flushed
       * directly and remote processors are reached through smp_invlpg() or
       * smp_masked_invlpg().  The thread is pinned for the duration.
       */
 1330 void
 1331 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 1332 {
 1333         cpuset_t other_cpus;
 1334         u_int cpuid;
 1335 
 1336         if (pmap->pm_type == PT_EPT) {
 1337                 pmap_invalidate_ept(pmap);
 1338                 return;
 1339         }
 1340 
 1341         KASSERT(pmap->pm_type == PT_X86,
 1342             ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
 1343 
 1344         sched_pin();
              /*
               * Kernel pmap, or pm_active compares equal to all_cpus: flush
               * locally, then notify every other processor via smp_invlpg().
               */
 1345         if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
 1346                 if (!pmap_pcid_enabled) {
 1347                         invlpg(va);
 1348                 } else {
                              /*
                               * A pm_pcid other than -1 and 0 gets a targeted
                               * flush: plain invlpg when pmap is the current
                               * one, a PCID-directed flush otherwise.  For -1
                               * or 0 the whole TLB, global entries included,
                               * is dropped instead.
                               * NOTE(review): -1 presumably means "no PCID
                               * assigned"; 0 is what pmap_bootstrap() gives
                               * the kernel pmap.
                               */
 1349                         if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
 1350                                 if (pmap == PCPU_GET(curpmap))
 1351                                         invlpg(va);
 1352                                 else
 1353                                         pmap_invalidate_page_pcid(pmap, va);
 1354                         } else {
 1355                                 invltlb_globpcid();
 1356                         }
 1357                 }
 1358                 smp_invlpg(pmap, va);
 1359         } else {
                      /*
                       * Local part: invlpg if this cpu is in pm_active;
                       * otherwise, with PCID enabled, flush by PCID (or
                       * everything when the pmap has PCID -1 or 0).  Without
                       * PCID nothing is flushed locally in that case.
                       */
 1360                 cpuid = PCPU_GET(cpuid);
 1361                 other_cpus = all_cpus;
 1362                 CPU_CLR(cpuid, &other_cpus);
 1363                 if (CPU_ISSET(cpuid, &pmap->pm_active))
 1364                         invlpg(va);
 1365                 else if (pmap_pcid_enabled) {
 1366                         if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
 1367                                 pmap_invalidate_page_pcid(pmap, va);
 1368                         else
 1369                                 invltlb_globpcid();
 1370                 }
                      /*
                       * Remote part: every cpu but this one, restricted to
                       * pm_save when PCID is in use and to pm_active
                       * otherwise.  No IPI is sent if the set is empty.
                       */
 1371                 if (pmap_pcid_enabled)
 1372                         CPU_AND(&other_cpus, &pmap->pm_save);
 1373                 else
 1374                         CPU_AND(&other_cpus, &pmap->pm_active);
 1375                 if (!CPU_EMPTY(&other_cpus))
 1376                         smp_masked_invlpg(other_cpus, pmap, va);
 1377         }
 1378         sched_unpin();
 1379 }
 1380 
      /*
       * Invalidate the local TLB entries tagged with pmap's PCID for every
       * page in [sva, eva), stepping by PAGE_SIZE.
       *
       * With a working INVPCID instruction one address-type invpcid is
       * issued per page.  Otherwise %cr3 is temporarily switched to pmap's
       * page table inside a critical section, invlpg is executed for each
       * page, and the previous %cr3 is reloaded.
       */
 1381 static void
 1382 pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 1383 {
 1384         struct invpcid_descr d;
 1385         uint64_t cr3;
 1386         vm_offset_t addr;
 1387 
 1388         if (invpcid_works) {
                      /* pcid and pad stay fixed; only addr changes per page. */
 1389                 d.pcid = pmap->pm_pcid;
 1390                 d.pad = 0;
 1391                 for (addr = sva; addr < eva; addr += PAGE_SIZE) {
 1392                         d.addr = addr;
 1393                         invpcid(&d, INVPCID_ADDR);
 1394                 }
 1395                 return;
 1396         }
 1397 
              /* A single %cr3 round trip covers the whole range. */
 1398         cr3 = rcr3();
 1399         critical_enter();
 1400         load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
 1401         for (addr = sva; addr < eva; addr += PAGE_SIZE)
 1402                 invlpg(addr);
 1403         load_cr3(cr3 | CR3_PCID_SAVE);
 1404         critical_exit();
 1405 }
 1406 
      /*
       * Invalidate the TLB entries for every page in [sva, eva) of pmap,
       * both locally and on the other processors selected below.  The
       * structure mirrors pmap_invalidate_page(), with each single-page
       * flush replaced by a page-by-page loop or the range variant of the
       * helper.  EPT pmaps are handed to pmap_invalidate_ept().
       */
 1407 void
 1408 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 1409 {
 1410         cpuset_t other_cpus;
 1411         vm_offset_t addr;
 1412         u_int cpuid;
 1413 
 1414         if (pmap->pm_type == PT_EPT) {
 1415                 pmap_invalidate_ept(pmap);
 1416                 return;
 1417         }
 1418 
 1419         KASSERT(pmap->pm_type == PT_X86,
 1420             ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
 1421 
 1422         sched_pin();
              /*
               * Kernel pmap, or pm_active compares equal to all_cpus: flush
               * locally, then notify every other processor via
               * smp_invlpg_range().
               */
 1423         if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
 1424                 if (!pmap_pcid_enabled) {
 1425                         for (addr = sva; addr < eva; addr += PAGE_SIZE)
 1426                                 invlpg(addr);
 1427                 } else {
                              /*
                               * A pm_pcid other than -1 and 0 gets a targeted
                               * flush; otherwise the whole TLB, global
                               * entries included, is dropped.
                               */
 1428                         if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
 1429                                 if (pmap == PCPU_GET(curpmap)) {
 1430                                         for (addr = sva; addr < eva;
 1431                                             addr += PAGE_SIZE)
 1432                                                 invlpg(addr);
 1433                                 } else {
 1434                                         pmap_invalidate_range_pcid(pmap,
 1435                                             sva, eva);
 1436                                 }
 1437                         } else {
 1438                                 invltlb_globpcid();
 1439                         }
 1440                 }
 1441                 smp_invlpg_range(pmap, sva, eva);
 1442         } else {
                      /*
                       * Local part: invlpg each page if this cpu is in
                       * pm_active; otherwise, with PCID enabled, flush by
                       * PCID (or everything when the pmap has PCID -1 or 0).
                       */
 1443                 cpuid = PCPU_GET(cpuid);
 1444                 other_cpus = all_cpus;
 1445                 CPU_CLR(cpuid, &other_cpus);
 1446                 if (CPU_ISSET(cpuid, &pmap->pm_active)) {
 1447                         for (addr = sva; addr < eva; addr += PAGE_SIZE)
 1448                                 invlpg(addr);
 1449                 } else if (pmap_pcid_enabled) {
 1450                         if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
 1451                                 pmap_invalidate_range_pcid(pmap, sva, eva);
 1452                         else
 1453                                 invltlb_globpcid();
 1454                 }
                      /*
                       * Remote part: every cpu but this one, restricted to
                       * pm_save when PCID is in use and to pm_active
                       * otherwise.  No IPI is sent if the set is empty.
                       */
 1455                 if (pmap_pcid_enabled)
 1456                         CPU_AND(&other_cpus, &pmap->pm_save);
 1457                 else
 1458                         CPU_AND(&other_cpus, &pmap->pm_active);
 1459                 if (!CPU_EMPTY(&other_cpus))
 1460                         smp_masked_invlpg_range(other_cpus, pmap, sva, eva);
 1461         }
 1462         sched_unpin();
 1463 }
 1464 
      /*
       * Invalidate all TLB entries belonging to pmap, both locally and on
       * the other processors selected below.  EPT pmaps are handed to
       * pmap_invalidate_ept().  In both X86 branches the calling cpu's bit
       * is atomically cleared from pmap->pm_save after the local flush.
       */
 1465 void
 1466 pmap_invalidate_all(pmap_t pmap)
 1467 {
 1468         cpuset_t other_cpus;
 1469         struct invpcid_descr d;
 1470         uint64_t cr3;
 1471         u_int cpuid;
 1472 
 1473         if (pmap->pm_type == PT_EPT) {
 1474                 pmap_invalidate_ept(pmap);
 1475                 return;
 1476         }
 1477 
 1478         KASSERT(pmap->pm_type == PT_X86,
 1479             ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
 1480 
 1481         sched_pin();
 1482         cpuid = PCPU_GET(cpuid);
              /*
               * Kernel pmap, or a pmap whose pm_save (with PCID enabled) or
               * pm_active compares equal to all_cpus: flush everything
               * locally, using an all-zero invpcid descriptor with
               * INVPCID_CTXGLOB when INVPCID works, and notify every other
               * processor via smp_invltlb().
               */
 1483         if (pmap == kernel_pmap ||
 1484             (pmap_pcid_enabled && !CPU_CMP(&pmap->pm_save, &all_cpus)) ||
 1485             !CPU_CMP(&pmap->pm_active, &all_cpus)) {
 1486                 if (invpcid_works) {
 1487                         bzero(&d, sizeof(d));
 1488                         invpcid(&d, INVPCID_CTXGLOB);
 1489                 } else {
 1490                         invltlb_globpcid();
 1491                 }
 1492                 CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
 1493                 smp_invltlb(pmap);
 1494         } else {
 1495                 other_cpus = all_cpus;
 1496                 CPU_CLR(cpuid, &other_cpus);
 1497 
 1498                 /*
 1499                  * This logic is duplicated in the Xinvltlb shootdown
 1500                  * IPI handler.
 1501                  */
 1502                 if (pmap_pcid_enabled) {
                              /*
                               * A pm_pcid other than -1 and 0: flush only the
                               * entries of that PCID, either with a
                               * context-type invpcid or by loading the
                               * pmap's %cr3 and switching back.  For -1 or 0
                               * the whole TLB is dropped.
                               */
 1503                         if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
 1504                                 if (invpcid_works) {
 1505                                         d.pcid = pmap->pm_pcid;
 1506                                         d.pad = 0;
 1507                                         d.addr = 0;
 1508                                         invpcid(&d, INVPCID_CTX);
 1509                                 } else {
 1510                                         cr3 = rcr3();
 1511                                         critical_enter();
 1512 
 1513                                         /*
 1514                                          * Bit 63 is clear, pcid TLB
 1515                                          * entries are invalidated.
 1516                                          */
 1517                                         load_cr3(pmap->pm_cr3);
 1518                                         load_cr3(cr3 | CR3_PCID_SAVE);
 1519                                         critical_exit();
 1520                                 }
 1521                         } else {
 1522                                 invltlb_globpcid();
 1523                         }
 1524                 } else if (CPU_ISSET(cpuid, &pmap->pm_active))
 1525                         invltlb();
 1526                 CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
                      /*
                       * Remote part: every cpu but this one, restricted to
                       * pm_save when PCID is in use and to pm_active
                       * otherwise.  No IPI is sent if the set is empty.
                       */
 1527                 if (pmap_pcid_enabled)
 1528                         CPU_AND(&other_cpus, &pmap->pm_save);
 1529                 else
 1530                         CPU_AND(&other_cpus, &pmap->pm_active);
 1531                 if (!CPU_EMPTY(&other_cpus))
 1532                         smp_masked_invltlb(other_cpus, pmap);
 1533         }
 1534         sched_unpin();
 1535 }
 1536 
      /*
       * Write back and invalidate the caches: wbinvd on the calling
       * processor, followed by smp_cache_flush() for the others.  The thread
       * stays pinned so that both steps run on the same processor.
       */
 1537 void
 1538 pmap_invalidate_cache(void)
 1539 {
 1540 
 1541         sched_pin();
 1542         wbinvd();
 1543         smp_cache_flush();
 1544         sched_unpin();
 1545 }
 1546 
      /*
       * Argument block that pmap_update_pde() passes to its rendezvous
       * callbacks, pmap_update_pde_action() and pmap_update_pde_teardown().
       */
 1547 struct pde_action {
 1548         cpuset_t invalidate;    /* processors that invalidate their TLB */
 1549         pmap_t pmap;            /* pmap that owns the PDE */
 1550         vm_offset_t va;         /* virtual address mapped by the PDE */
 1551         pd_entry_t *pde;        /* entry to be overwritten */
 1552         pd_entry_t newpde;      /* value to store into *pde */
 1553         u_int store;            /* processor that updates the PDE */
 1554 };
 1555 
      /*
       * Rendezvous action callback of pmap_update_pde(): the processor
       * designated in the request stores the new PDE; every other processor
       * does nothing here.
       */
 1556 static void
 1557 pmap_update_pde_action(void *arg)
 1558 {
              struct pde_action *request;

              request = arg;
              if (request->store != PCPU_GET(cpuid))
                      return;
              pmap_update_pde_store(request->pmap, request->pde,
                  request->newpde);
 1563 }
 1564 
      /*
       * Rendezvous teardown callback of pmap_update_pde(): processors listed
       * in the request's invalidate set flush their own TLB for the changed
       * mapping; the rest do nothing.
       */
 1565 static void
 1566 pmap_update_pde_teardown(void *arg)
 1567 {
              struct pde_action *request;

              request = arg;
              if (!CPU_ISSET(PCPU_GET(cpuid), &request->invalidate))
                      return;
              pmap_update_pde_invalidate(request->pmap, request->va,
                  request->newpde);
 1572 }
 1573 
 1574 /*
 1575  * Change the page size for the specified virtual address in a way that
 1576  * prevents any possibility of the TLB ever having two entries that map the
 1577  * same virtual address using different page sizes.  This is the recommended
 1578  * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
 1579  * machine check exception for a TLB state that is improperly diagnosed as a
 1580  * hardware error.
       *
       * If any processor other than the caller's is in the pmap's active set,
       * the store and the TLB flushes are carried out inside an
       * smp_rendezvous_cpus() call; otherwise both are done directly on the
       * calling processor.
 1581  */
 1582 static void
 1583 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 1584 {
 1585         struct pde_action act;
 1586         cpuset_t active, other_cpus;
 1587         u_int cpuid;
 1588 
 1589         sched_pin();
 1590         cpuid = PCPU_GET(cpuid);
 1591         other_cpus = all_cpus;
 1592         CPU_CLR(cpuid, &other_cpus);
              /*
               * Kernel and EPT pmaps are treated as active everywhere.  For
               * any other pmap the active set is pm_active, and pm_save is
               * atomically and'ed with that set.
               */
 1593         if (pmap == kernel_pmap || pmap->pm_type == PT_EPT)
 1594                 active = all_cpus;
 1595         else {
 1596                 active = pmap->pm_active;
 1597                 CPU_AND_ATOMIC(&pmap->pm_save, &active);
 1598         }
 1599         if (CPU_OVERLAP(&active, &other_cpus)) { 
                      /*
                       * act.invalidate is captured before the calling cpu is
                       * added to "active": the caller always joins the
                       * rendezvous (it performs the store) but flushes its
                       * own TLB only if it was in the active set to begin
                       * with.
                       */
 1600                 act.store = cpuid;
 1601                 act.invalidate = active;
 1602                 act.va = va;
 1603                 act.pmap = pmap;
 1604                 act.pde = pde;
 1605                 act.newpde = newpde;
 1606                 CPU_SET(cpuid, &active);
 1607                 smp_rendezvous_cpus(active,
 1608                     smp_no_rendevous_barrier, pmap_update_pde_action,
 1609                     pmap_update_pde_teardown, &act);
 1610         } else {
                      /* No other processor is involved: do it all here. */
 1611                 pmap_update_pde_store(pmap, pde, newpde);
 1612                 if (CPU_ISSET(cpuid, &active))
 1613                         pmap_update_pde_invalidate(pmap, va, newpde);
 1614         }
 1615         sched_unpin();
 1616 }
 1617 #else /* !SMP */
 1618 /*
 1619  * Normal, non-SMP, invalidation functions.
 1620  * We inline these within pmap.c for speed.
 1621  */
 1622 PMAP_INLINE void
 1623 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 1624 {
 1625 
 1626         switch (pmap->pm_type) {
 1627         case PT_X86:
 1628                 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
 1629                         invlpg(va);
 1630                 break;
 1631         case PT_EPT:
 1632                 pmap->pm_eptgen++;
 1633                 break;
 1634         default:
 1635                 panic("pmap_invalidate_page: unknown type: %d", pmap->pm_type);
 1636         }
 1637 }
 1638 
 1639 PMAP_INLINE void
 1640 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 1641 {
 1642         vm_offset_t addr;
 1643 
 1644         switch (pmap->pm_type) {
 1645         case PT_X86:
 1646                 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
 1647                         for (addr = sva; addr < eva; addr += PAGE_SIZE)
 1648                                 invlpg(addr);
 1649                 break;
 1650         case PT_EPT:
 1651                 pmap->pm_eptgen++;
 1652                 break;
 1653         default:
 1654                 panic("pmap_invalidate_range: unknown type: %d", pmap->pm_type);
 1655         }
 1656 }
 1657 
 1658 PMAP_INLINE void
 1659 pmap_invalidate_all(pmap_t pmap)
 1660 {
 1661 
 1662         switch (pmap->pm_type) {
 1663         case PT_X86:
 1664                 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
 1665                         invltlb();
 1666                 break;
 1667         case PT_EPT:
 1668                 pmap->pm_eptgen++;
 1669                 break;
 1670         default:
 1671                 panic("pmap_invalidate_all: unknown type %d", pmap->pm_type);
 1672         }
 1673 }
 1674 
 1675 PMAP_INLINE void
 1676 pmap_invalidate_cache(void)
 1677 {
 1678 
 1679         wbinvd();
 1680 }
 1681 
 1682 static void
 1683 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 1684 {
 1685 
 1686         pmap_update_pde_store(pmap, pde, newpde);
 1687         if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
 1688                 pmap_update_pde_invalidate(pmap, va, newpde);
 1689         else
 1690                 CPU_ZERO(&pmap->pm_save);
 1691 }
 1692 #endif /* !SMP */
 1693 
 1694 #define PMAP_CLFLUSH_THRESHOLD   (2 * 1024 * 1024)
 1695 
 1696 void
 1697 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
 1698 {
 1699 
 1700         KASSERT((sva & PAGE_MASK) == 0,
 1701             ("pmap_invalidate_cache_range: sva not page-aligned"));
 1702         KASSERT((eva & PAGE_MASK) == 0,
 1703             ("pmap_invalidate_cache_range: eva not page-aligned"));
 1704 
 1705         if (cpu_feature & CPUID_SS)
 1706                 ; /* If "Self Snoop" is supported, do nothing. */
 1707         else if ((cpu_feature & CPUID_CLFSH) != 0 &&
 1708             eva - sva < PMAP_CLFLUSH_THRESHOLD) {
 1709 
 1710                 /*
 1711                  * XXX: Some CPUs fault, hang, or trash the local APIC
 1712                  * registers if we use CLFLUSH on the local APIC
 1713                  * range.  The local APIC is always uncached, so we
 1714                  * don't need to flush for that range anyway.
 1715                  */
 1716                 if (pmap_kextract(sva) == lapic_paddr)
 1717                         return;
 1718 
 1719                 /*
 1720                  * Otherwise, do per-cache line flush.  Use the mfence
 1721                  * instruction to insure that previous stores are
 1722                  * included in the write-back.  The processor
 1723                  * propagates flush to other processors in the cache
 1724                  * coherence domain.
 1725                  */
 1726                 mfence();
 1727                 for (; sva < eva; sva += cpu_clflush_line_size)
 1728                         clflush(sva);
 1729                 mfence();
 1730         } else {
 1731 
 1732                 /*
 1733                  * No targeted cache flush methods are supported by CPU,
 1734                  * or the supplied range is bigger than 2MB.
 1735                  * Globally invalidate cache.
 1736                  */
 1737                 pmap_invalidate_cache();
 1738         }
 1739 }
 1740 
 1741 /*
 1742  * Remove the specified set of pages from the data and instruction caches.
 1743  *
 1744  * In contrast to pmap_invalidate_cache_range(), this function does not
 1745  * rely on the CPU's self-snoop feature, because it is intended for use
 1746  * when moving pages into a different cache domain.
 1747  */
 1748 void
 1749 pmap_invalidate_cache_pages(vm_page_t *pages, int count)
 1750 {
 1751         vm_offset_t daddr, eva;
 1752         int i;
 1753 
 1754         if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
 1755             (cpu_feature & CPUID_CLFSH) == 0)
 1756                 pmap_invalidate_cache();
 1757         else {
 1758                 mfence();
 1759                 for (i = 0; i < count; i++) {
 1760                         daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
 1761                         eva = daddr + PAGE_SIZE;
 1762                         for (; daddr < eva; daddr += cpu_clflush_line_size)
 1763                                 clflush(daddr);
 1764                 }
 1765                 mfence();
 1766         }
 1767 }
 1768 
 1769 /*
 1770  * Are we current address space or kernel?
 1771  */
 1772 static __inline int
 1773 pmap_is_current(pmap_t pmap)
 1774 {
 1775         return (pmap == kernel_pmap ||
 1776             (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME));
 1777 }
 1778 
 1779 /*
 1780  *      Routine:        pmap_extract
 1781  *      Function:
 1782  *              Extract the physical page address associated
 1783  *              with the given map/virtual_address pair.
 1784  */
 1785 vm_paddr_t 
 1786 pmap_extract(pmap_t pmap, vm_offset_t va)
 1787 {
 1788         pdp_entry_t *pdpe;
 1789         pd_entry_t *pde;
 1790         pt_entry_t *pte, PG_V;
 1791         vm_paddr_t pa;
 1792 
 1793         pa = 0;
 1794         PG_V = pmap_valid_bit(pmap);
 1795         PMAP_LOCK(pmap);
 1796         pdpe = pmap_pdpe(pmap, va);
 1797         if (pdpe != NULL && (*pdpe & PG_V) != 0) {
 1798                 if ((*pdpe & PG_PS) != 0)
 1799                         pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
 1800                 else {
 1801                         pde = pmap_pdpe_to_pde(pdpe, va);
 1802                         if ((*pde & PG_V) != 0) {
 1803                                 if ((*pde & PG_PS) != 0) {
 1804                                         pa = (*pde & PG_PS_FRAME) |
 1805                                             (va & PDRMASK);
 1806                                 } else {
 1807                                         pte = pmap_pde_to_pte(pde, va);
 1808                                         pa = (*pte & PG_FRAME) |
 1809                                             (va & PAGE_MASK);
 1810                                 }
 1811                         }
 1812                 }
 1813         }
 1814         PMAP_UNLOCK(pmap);
 1815         return (pa);
 1816 }
 1817 
 1818 /*
 1819  *      Routine:        pmap_extract_and_hold
 1820  *      Function:
 1821  *              Atomically extract and hold the physical page
 1822  *              with the given pmap and virtual address pair
 1823  *              if that mapping permits the given protection.
 1824  */
 1825 vm_page_t
 1826 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 1827 {
 1828         pd_entry_t pde, *pdep;
 1829         pt_entry_t pte, PG_RW, PG_V;
 1830         vm_paddr_t pa;
 1831         vm_page_t m;
 1832 
 1833         pa = 0;
 1834         m = NULL;
 1835         PG_RW = pmap_rw_bit(pmap);
 1836         PG_V = pmap_valid_bit(pmap);
 1837         PMAP_LOCK(pmap);
 1838 retry:
 1839         pdep = pmap_pde(pmap, va);
 1840         if (pdep != NULL && (pde = *pdep)) {
 1841                 if (pde & PG_PS) {
 1842                         if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
 1843                                 if (vm_page_pa_tryrelock(pmap, (pde &
 1844                                     PG_PS_FRAME) | (va & PDRMASK), &pa))
 1845                                         goto retry;
 1846                                 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
 1847                                     (va & PDRMASK));
 1848                                 vm_page_hold(m);
 1849                         }
 1850                 } else {
 1851                         pte = *pmap_pde_to_pte(pdep, va);
 1852                         if ((pte & PG_V) &&
 1853                             ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
 1854                                 if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
 1855                                     &pa))
 1856                                         goto retry;
 1857                                 m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
 1858                                 vm_page_hold(m);
 1859                         }
 1860                 }
 1861         }
 1862         PA_UNLOCK_COND(pa);
 1863         PMAP_UNLOCK(pmap);
 1864         return (m);
 1865 }
 1866 
 1867 vm_paddr_t
 1868 pmap_kextract(vm_offset_t va)
 1869 {
 1870         pd_entry_t pde;
 1871         vm_paddr_t pa;
 1872 
 1873         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
 1874                 pa = DMAP_TO_PHYS(va);
 1875         } else {
 1876                 pde = *vtopde(va);
 1877                 if (pde & PG_PS) {
 1878                         pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
 1879                 } else {
 1880                         /*
 1881                          * Beware of a concurrent promotion that changes the
 1882                          * PDE at this point!  For example, vtopte() must not
 1883                          * be used to access the PTE because it would use the
 1884                          * new PDE.  It is, however, safe to use the old PDE
 1885                          * because the page table page is preserved by the
 1886                          * promotion.
 1887                          */
 1888                         pa = *pmap_pde_to_pte(&pde, va);
 1889                         pa = (pa & PG_FRAME) | (va & PAGE_MASK);
 1890                 }
 1891         }
 1892         return (pa);
 1893 }
 1894 
 1895 /***************************************************
 1896  * Low level mapping routines.....
 1897  ***************************************************/
 1898 
 1899 /*
 1900  * Add a wired page to the kva.
 1901  * Note: not SMP coherent.
 1902  */
 1903 PMAP_INLINE void 
 1904 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
 1905 {
 1906         pt_entry_t *pte;
 1907 
 1908         pte = vtopte(va);
 1909         pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G);
 1910 }
 1911 
 1912 static __inline void
 1913 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
 1914 {
 1915         pt_entry_t *pte;
 1916         int cache_bits;
 1917 
 1918         pte = vtopte(va);
 1919         cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
 1920         pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits);
 1921 }
 1922 
 1923 /*
 1924  * Remove a page from the kernel pagetables.
 1925  * Note: not SMP coherent.
 1926  */
 1927 PMAP_INLINE void
 1928 pmap_kremove(vm_offset_t va)
 1929 {
 1930         pt_entry_t *pte;
 1931 
 1932         pte = vtopte(va);
 1933         pte_clear(pte);
 1934 }
 1935 
 1936 /*
 1937  *      Used to map a range of physical addresses into kernel
 1938  *      virtual address space.
 1939  *
 1940  *      The value passed in '*virt' is a suggested virtual address for
 1941  *      the mapping. Architectures which can support a direct-mapped
 1942  *      physical to virtual region can return the appropriate address
 1943  *      within that region, leaving '*virt' unchanged. Other
 1944  *      architectures should map the pages starting at '*virt' and
 1945  *      update '*virt' with the first usable address after the mapped
 1946  *      region.
 1947  */
 1948 vm_offset_t
 1949 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
 1950 {
 1951         return PHYS_TO_DMAP(start);
 1952 }
 1953 
 1954 
 1955 /*
 1956  * Add a list of wired pages to the kva
 1957  * this routine is only used for temporary
 1958  * kernel mappings that do not need to have
 1959  * page modification or references recorded.
 1960  * Note that old mappings are simply written
 1961  * over.  The page *must* be wired.
 1962  * Note: SMP coherent.  Uses a ranged shootdown IPI.
 1963  */
 1964 void
 1965 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
 1966 {
 1967         pt_entry_t *endpte, oldpte, pa, *pte;
 1968         vm_page_t m;
 1969         int cache_bits;
 1970 
 1971         oldpte = 0;
 1972         pte = vtopte(sva);
 1973         endpte = pte + count;
 1974         while (pte < endpte) {
 1975                 m = *ma++;
 1976                 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
 1977                 pa = VM_PAGE_TO_PHYS(m) | cache_bits;
 1978                 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
 1979                         oldpte |= *pte;
 1980                         pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V);
 1981                 }
 1982                 pte++;
 1983         }
 1984         if (__predict_false((oldpte & X86_PG_V) != 0))
 1985                 pmap_invalidate_range(kernel_pmap, sva, sva + count *
 1986                     PAGE_SIZE);
 1987 }
 1988 
 1989 /*
 1990  * This routine tears out page mappings from the
 1991  * kernel -- it is meant only for temporary mappings.
 1992  * Note: SMP coherent.  Uses a ranged shootdown IPI.
 1993  */
 1994 void
 1995 pmap_qremove(vm_offset_t sva, int count)
 1996 {
 1997         vm_offset_t va;
 1998 
 1999         va = sva;
 2000         while (count-- > 0) {
 2001                 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
 2002                 pmap_kremove(va);
 2003                 va += PAGE_SIZE;
 2004         }
 2005         pmap_invalidate_range(kernel_pmap, sva, va);
 2006 }
 2007 
 2008 /***************************************************
 2009  * Page table page management routines.....
 2010  ***************************************************/
 2011 static __inline void
 2012 pmap_free_zero_pages(struct spglist *free)
 2013 {
 2014         vm_page_t m;
 2015 
 2016         while ((m = SLIST_FIRST(free)) != NULL) {
 2017                 SLIST_REMOVE_HEAD(free, plinks.s.ss);
 2018                 /* Preserve the page's PG_ZERO setting. */
 2019                 vm_page_free_toq(m);
 2020         }
 2021 }
 2022 
 2023 /*
 2024  * Schedule the specified unused page table page to be freed.  Specifically,
 2025  * add the page to the specified list of pages that will be released to the
 2026  * physical memory manager after the TLB has been updated.
 2027  */
 2028 static __inline void
 2029 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
 2030     boolean_t set_PG_ZERO)
 2031 {
 2032 
 2033         if (set_PG_ZERO)
 2034                 m->flags |= PG_ZERO;
 2035         else
 2036                 m->flags &= ~PG_ZERO;
 2037         SLIST_INSERT_HEAD(free, m, plinks.s.ss);
 2038 }
 2039         
 2040 /*
 2041  * Inserts the specified page table page into the specified pmap's collection
 2042  * of idle page table pages.  Each of a pmap's page table pages is responsible
 2043  * for mapping a distinct range of virtual addresses.  The pmap's collection is
 2044  * ordered by this virtual address range.
 2045  */
 2046 static __inline int
 2047 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
 2048 {
 2049 
 2050         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2051         return (vm_radix_insert(&pmap->pm_root, mpte));
 2052 }
 2053 
 2054 /*
 2055  * Looks for a page table page mapping the specified virtual address in the
 2056  * specified pmap's collection of idle page table pages.  Returns NULL if there
 2057  * is no page table page corresponding to the specified virtual address.
 2058  */
 2059 static __inline vm_page_t
 2060 pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
 2061 {
 2062 
 2063         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2064         return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va)));
 2065 }
 2066 
 2067 /*
 2068  * Removes the specified page table page from the specified pmap's collection
 2069  * of idle page table pages.  The specified page table page must be a member of
 2070  * the pmap's collection.
 2071  */
 2072 static __inline void
 2073 pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
 2074 {
 2075 
 2076         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2077         vm_radix_remove(&pmap->pm_root, mpte->pindex);
 2078 }
 2079 
 2080 /*
 2081  * Decrements a page table page's wire count, which is used to record the
 2082  * number of valid page table entries within the page.  If the wire count
 2083  * drops to zero, then the page table page is unmapped.  Returns TRUE if the
 2084  * page table page was unmapped and FALSE otherwise.
 2085  */
 2086 static inline boolean_t
 2087 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 2088 {
 2089 
 2090         --m->wire_count;
 2091         if (m->wire_count == 0) {
 2092                 _pmap_unwire_ptp(pmap, va, m, free);
 2093                 return (TRUE);
 2094         } else
 2095                 return (FALSE);
 2096 }
 2097 
 2098 static void
 2099 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 2100 {
 2101 
 2102         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2103         /*
 2104          * unmap the page table page
 2105          */
 2106         if (m->pindex >= (NUPDE + NUPDPE)) {
 2107                 /* PDP page */
 2108                 pml4_entry_t *pml4;
 2109                 pml4 = pmap_pml4e(pmap, va);
 2110                 *pml4 = 0;
 2111         } else if (m->pindex >= NUPDE) {
 2112                 /* PD page */
 2113                 pdp_entry_t *pdp;
 2114                 pdp = pmap_pdpe(pmap, va);
 2115                 *pdp = 0;
 2116         } else {
 2117                 /* PTE page */
 2118                 pd_entry_t *pd;
 2119                 pd = pmap_pde(pmap, va);
 2120                 *pd = 0;
 2121         }
 2122         pmap_resident_count_dec(pmap, 1);
 2123         if (m->pindex < NUPDE) {
 2124                 /* We just released a PT, unhold the matching PD */
 2125                 vm_page_t pdpg;
 2126 
 2127                 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
 2128                 pmap_unwire_ptp(pmap, va, pdpg, free);
 2129         }
 2130         if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
 2131                 /* We just released a PD, unhold the matching PDP */
 2132                 vm_page_t pdppg;
 2133 
 2134                 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
 2135                 pmap_unwire_ptp(pmap, va, pdppg, free);
 2136         }
 2137 
 2138         /*
 2139          * This is a release store so that the ordinary store unmapping
 2140          * the page table page is globally performed before TLB shoot-
 2141          * down is begun.
 2142          */
 2143         atomic_subtract_rel_int(&cnt.v_wire_count, 1);
 2144 
 2145         /* 
 2146          * Put page on a list so that it is released after
 2147          * *ALL* TLB shootdown is done
 2148          */
 2149         pmap_add_delayed_free_list(m, free, TRUE);
 2150 }
 2151 
 2152 /*
 2153  * After removing a page table entry, this routine is used to
 2154  * conditionally free the page, and manage the hold/wire counts.
 2155  */
 2156 static int
 2157 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
 2158     struct spglist *free)
 2159 {
 2160         vm_page_t mpte;
 2161 
 2162         if (va >= VM_MAXUSER_ADDRESS)
 2163                 return (0);
 2164         KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
 2165         mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
 2166         return (pmap_unwire_ptp(pmap, va, mpte, free));
 2167 }
 2168 
 2169 void
 2170 pmap_pinit0(pmap_t pmap)
 2171 {
 2172 
 2173         PMAP_LOCK_INIT(pmap);
 2174         pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
 2175         pmap->pm_cr3 = KPML4phys;
 2176         pmap->pm_root.rt_root = 0;
 2177         CPU_ZERO(&pmap->pm_active);
 2178         CPU_ZERO(&pmap->pm_save);
 2179         PCPU_SET(curpmap, pmap);
 2180         TAILQ_INIT(&pmap->pm_pvchunk);
 2181         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 2182         pmap->pm_pcid = pmap_pcid_enabled ? 0 : -1;
 2183         pmap->pm_flags = pmap_flags;
 2184 }
 2185 
 2186 /*
 2187  * Initialize a preallocated and zeroed pmap structure,
 2188  * such as one in a vmspace structure.
 2189  */
 2190 int
 2191 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
 2192 {
 2193         vm_page_t pml4pg;
 2194         vm_paddr_t pml4phys;
 2195         int i;
 2196 
 2197         /*
 2198          * allocate the page directory page
 2199          */
 2200         while ((pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 2201             VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
 2202                 VM_WAIT;
 2203 
 2204         pml4phys = VM_PAGE_TO_PHYS(pml4pg);
 2205         pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
 2206         pmap->pm_pcid = -1;
 2207         pmap->pm_cr3 = ~0;      /* initialize to an invalid value */
 2208 
 2209         if ((pml4pg->flags & PG_ZERO) == 0)
 2210                 pagezero(pmap->pm_pml4);
 2211 
 2212         /*
 2213          * Do not install the host kernel mappings in the nested page
 2214          * tables. These mappings are meaningless in the guest physical
 2215          * address space.
 2216          */
 2217         if ((pmap->pm_type = pm_type) == PT_X86) {
 2218                 pmap->pm_cr3 = pml4phys;
 2219 
 2220                 /* Wire in kernel global address entries. */
 2221                 for (i = 0; i < NKPML4E; i++) {
 2222                         pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) |
 2223                             X86_PG_RW | X86_PG_V | PG_U;
 2224                 }
 2225                 for (i = 0; i < ndmpdpphys; i++) {
 2226                         pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) |
 2227                             X86_PG_RW | X86_PG_V | PG_U;
 2228                 }
 2229 
 2230                 /* install self-referential address mapping entry(s) */
 2231                 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) |
 2232                     X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
 2233 
 2234                 if (pmap_pcid_enabled) {
 2235                         pmap->pm_pcid = alloc_unr(&pcid_unr);
 2236                         if (pmap->pm_pcid != -1)
 2237                                 pmap->pm_cr3 |= pmap->pm_pcid;
 2238                 }
 2239         }
 2240 
 2241         pmap->pm_root.rt_root = 0;
 2242         CPU_ZERO(&pmap->pm_active);
 2243         TAILQ_INIT(&pmap->pm_pvchunk);
 2244         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 2245         pmap->pm_flags = flags;
 2246         pmap->pm_eptgen = 0;
 2247         CPU_ZERO(&pmap->pm_save);
 2248 
 2249         return (1);
 2250 }
 2251 
 2252 int
 2253 pmap_pinit(pmap_t pmap)
 2254 {
 2255 
 2256         return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
 2257 }
 2258 
 2259 /*
 2260  * This routine is called if the desired page table page does not exist.
 2261  *
 2262  * If page table page allocation fails, this routine may sleep before
 2263  * returning NULL.  It sleeps only if a lock pointer was given.
 2264  *
 2265  * Note: If a page allocation fails at page table level two or three,
 2266  * one or two pages may be held during the wait, only to be released
 2267  * afterwards.  This conservative approach is easily argued to avoid
 2268  * race conditions.
 2269  */
 2270 static vm_page_t
 2271 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
 2272 {
 2273         vm_page_t m, pdppg, pdpg;
 2274         pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 2275 
 2276         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2277 
 2278         PG_A = pmap_accessed_bit(pmap);
 2279         PG_M = pmap_modified_bit(pmap);
 2280         PG_V = pmap_valid_bit(pmap);
 2281         PG_RW = pmap_rw_bit(pmap);
 2282 
 2283         /*
 2284          * Allocate a page table page.
 2285          */
 2286         if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
 2287             VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
 2288                 if (lockp != NULL) {
 2289                         RELEASE_PV_LIST_LOCK(lockp);
 2290                         PMAP_UNLOCK(pmap);
 2291                         rw_runlock(&pvh_global_lock);
 2292                         VM_WAIT;
 2293                         rw_rlock(&pvh_global_lock);
 2294                         PMAP_LOCK(pmap);
 2295                 }
 2296 
 2297                 /*
 2298                  * Indicate the need to retry.  While waiting, the page table
 2299                  * page may have been allocated.
 2300                  */
 2301                 return (NULL);
 2302         }
 2303         if ((m->flags & PG_ZERO) == 0)
 2304                 pmap_zero_page(m);
 2305 
 2306         /*
 2307          * Map the pagetable page into the process address space, if
 2308          * it isn't already there.
 2309          */
 2310 
 2311         if (ptepindex >= (NUPDE + NUPDPE)) {
 2312                 pml4_entry_t *pml4;
 2313                 vm_pindex_t pml4index;
 2314 
 2315                 /* Wire up a new PDPE page */
 2316                 pml4index = ptepindex - (NUPDE + NUPDPE);
 2317                 pml4 = &pmap->pm_pml4[pml4index];
 2318                 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 2319 
 2320         } else if (ptepindex >= NUPDE) {
 2321                 vm_pindex_t pml4index;
 2322                 vm_pindex_t pdpindex;
 2323                 pml4_entry_t *pml4;
 2324                 pdp_entry_t *pdp;
 2325 
 2326                 /* Wire up a new PDE page */
 2327                 pdpindex = ptepindex - NUPDE;
 2328                 pml4index = pdpindex >> NPML4EPGSHIFT;
 2329 
 2330                 pml4 = &pmap->pm_pml4[pml4index];
 2331                 if ((*pml4 & PG_V) == 0) {
 2332                         /* Have to allocate a new pdp, recurse */
 2333                         if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
 2334                             lockp) == NULL) {
 2335                                 --m->wire_count;
 2336                                 atomic_subtract_int(&cnt.v_wire_count, 1);
 2337                                 vm_page_free_zero(m);
 2338                                 return (NULL);
 2339                         }
 2340                 } else {
 2341                         /* Add reference to pdp page */
 2342                         pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
 2343                         pdppg->wire_count++;
 2344                 }
 2345                 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 2346 
 2347                 /* Now find the pdp page */
 2348                 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 2349                 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 2350 
 2351         } else {
 2352                 vm_pindex_t pml4index;
 2353                 vm_pindex_t pdpindex;
 2354                 pml4_entry_t *pml4;
 2355                 pdp_entry_t *pdp;
 2356                 pd_entry_t *pd;
 2357 
 2358                 /* Wire up a new PTE page */
 2359                 pdpindex = ptepindex >> NPDPEPGSHIFT;
 2360                 pml4index = pdpindex >> NPML4EPGSHIFT;
 2361 
 2362                 /* First, find the pdp and check that its valid. */
 2363                 pml4 = &pmap->pm_pml4[pml4index];
 2364                 if ((*pml4 & PG_V) == 0) {
 2365                         /* Have to allocate a new pd, recurse */
 2366                         if (_pmap_allocpte(pmap, NUPDE + pdpindex,
 2367                             lockp) == NULL) {
 2368                                 --m->wire_count;
 2369                                 atomic_subtract_int(&cnt.v_wire_count, 1);
 2370                                 vm_page_free_zero(m);
 2371                                 return (NULL);
 2372                         }
 2373                         pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 2374                         pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 2375                 } else {
 2376                         pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 2377                         pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 2378                         if ((*pdp & PG_V) == 0) {
 2379                                 /* Have to allocate a new pd, recurse */
 2380                                 if (_pmap_allocpte(pmap, NUPDE + pdpindex,
 2381                                     lockp) == NULL) {
 2382                                         --m->wire_count;
 2383                                         atomic_subtract_int(&cnt.v_wire_count,
 2384                                             1);
 2385                                         vm_page_free_zero(m);
 2386                                         return (NULL);
 2387                                 }
 2388                         } else {
 2389                                 /* Add reference to the pd page */
 2390                                 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
 2391                                 pdpg->wire_count++;
 2392                         }
 2393                 }
 2394                 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
 2395 
 2396                 /* Now we know where the page directory page is */
 2397                 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
 2398                 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 2399         }
 2400 
 2401         pmap_resident_count_inc(pmap, 1);
 2402 
 2403         return (m);
 2404 }
 2405 
 2406 static vm_page_t
 2407 pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 2408 {
 2409         vm_pindex_t pdpindex, ptepindex;
 2410         pdp_entry_t *pdpe, PG_V;
 2411         vm_page_t pdpg;
 2412 
 2413         PG_V = pmap_valid_bit(pmap);
 2414 
 2415 retry:
 2416         pdpe = pmap_pdpe(pmap, va);
 2417         if (pdpe != NULL && (*pdpe & PG_V) != 0) {
 2418                 /* Add a reference to the pd page. */
 2419                 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
 2420                 pdpg->wire_count++;
 2421         } else {
 2422                 /* Allocate a pd page. */
 2423                 ptepindex = pmap_pde_pindex(va);
 2424                 pdpindex = ptepindex >> NPDPEPGSHIFT;
 2425                 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
 2426                 if (pdpg == NULL && lockp != NULL)
 2427                         goto retry;
 2428         }
 2429         return (pdpg);
 2430 }
 2431 
 2432 static vm_page_t
 2433 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 2434 {
 2435         vm_pindex_t ptepindex;
 2436         pd_entry_t *pd, PG_V;
 2437         vm_page_t m;
 2438 
 2439         PG_V = pmap_valid_bit(pmap);
 2440 
 2441         /*
 2442          * Calculate pagetable page index
 2443          */
 2444         ptepindex = pmap_pde_pindex(va);
 2445 retry:
 2446         /*
 2447          * Get the page directory entry
 2448          */
 2449         pd = pmap_pde(pmap, va);
 2450 
 2451         /*
 2452          * This supports switching from a 2MB page to a
 2453          * normal 4K page.
 2454          */
 2455         if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
 2456                 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
 2457                         /*
 2458                          * Invalidation of the 2MB page mapping may have caused
 2459                          * the deallocation of the underlying PD page.
 2460                          */
 2461                         pd = NULL;
 2462                 }
 2463         }
 2464 
 2465         /*
 2466          * If the page table page is mapped, we just increment the
 2467          * hold count, and activate it.
 2468          */
 2469         if (pd != NULL && (*pd & PG_V) != 0) {
 2470                 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
 2471                 m->wire_count++;
 2472         } else {
 2473                 /*
 2474                  * Here if the pte page isn't mapped, or if it has been
 2475                  * deallocated.
 2476                  */
 2477                 m = _pmap_allocpte(pmap, ptepindex, lockp);
 2478                 if (m == NULL && lockp != NULL)
 2479                         goto retry;
 2480         }
 2481         return (m);
 2482 }
 2483 
 2484 
 2485 /***************************************************
 2486  * Pmap allocation/deallocation routines.
 2487  ***************************************************/
 2488 
 2489 /*
 2490  * Release any resources held by the given physical map.
 2491  * Called when a pmap initialized by pmap_pinit is being released.
 2492  * Should only be called if the map contains no valid mappings.
 2493  */
 2494 void
 2495 pmap_release(pmap_t pmap)
 2496 {
 2497         vm_page_t m;
 2498         int i;
 2499 
 2500         KASSERT(pmap->pm_stats.resident_count == 0,
 2501             ("pmap_release: pmap resident count %ld != 0",
 2502             pmap->pm_stats.resident_count));
 2503         KASSERT(vm_radix_is_empty(&pmap->pm_root),
 2504             ("pmap_release: pmap has reserved page table page(s)"));
 2505 
 2506         if (pmap_pcid_enabled) {
 2507                 /*
 2508                  * Invalidate any left TLB entries, to allow the reuse
 2509                  * of the pcid.
 2510                  */
 2511                 pmap_invalidate_all(pmap);
 2512         }
 2513 
 2514         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
 2515 
 2516         for (i = 0; i < NKPML4E; i++)   /* KVA */
 2517                 pmap->pm_pml4[KPML4BASE + i] = 0;
 2518         for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
 2519                 pmap->pm_pml4[DMPML4I + i] = 0;
 2520         pmap->pm_pml4[PML4PML4I] = 0;   /* Recursive Mapping */
 2521 
 2522         m->wire_count--;
 2523         atomic_subtract_int(&cnt.v_wire_count, 1);
 2524         vm_page_free_zero(m);
 2525         if (pmap->pm_pcid != -1)
 2526                 free_unr(&pcid_unr, pmap->pm_pcid);
 2527 }
 2528 
 2529 static int
 2530 kvm_size(SYSCTL_HANDLER_ARGS)
 2531 {
 2532         unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
 2533 
 2534         return sysctl_handle_long(oidp, &ksize, 0, req);
 2535 }
 2536 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 
 2537     0, 0, kvm_size, "LU", "Size of KVM");
 2538 
 2539 static int
 2540 kvm_free(SYSCTL_HANDLER_ARGS)
 2541 {
 2542         unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
 2543 
 2544         return sysctl_handle_long(oidp, &kfree, 0, req);
 2545 }
 2546 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 
 2547     0, 0, kvm_free, "LU", "Amount of KVM free");
 2548 
 2549 /*
 2550  * grow the number of kernel page table entries, if needed
       *
       * Extends the kernel page tables until "kernel_vm_end" reaches "addr"
       * rounded up to a multiple of NBPDR, or kernel_map->max_offset when the
       * rounded address lies beyond it.  Each loop iteration looks at the
       * region that starts at "kernel_vm_end": a missing page directory page
       * or page table page is allocated, zeroed if the allocator did not
       * already zero it, and installed.  Panics when a page cannot be
       * allocated.  The caller must own kernel_map's system mutex.
 2551  */
 2552 void
 2553 pmap_growkernel(vm_offset_t addr)
 2554 {
 2555         vm_paddr_t paddr;
 2556         vm_page_t nkpg;
 2557         pd_entry_t *pde, newpdir;
 2558         pdp_entry_t *pdpe;
 2559 
 2560         mtx_assert(&kernel_map->system_mtx, MA_OWNED);
 2561 
 2562         /*
 2563          * Return if "addr" is within the range of kernel page table pages
 2564          * that were preallocated during pmap bootstrap.  Moreover, leave
 2565          * "kernel_vm_end" and the kernel page table as they were.
 2566          *
 2567          * The correctness of this action is based on the following
 2568          * argument: vm_map_findspace() allocates contiguous ranges of the
 2569          * kernel virtual address space.  It calls this function if a range
 2570          * ends after "kernel_vm_end".  If the kernel is mapped between
 2571          * "kernel_vm_end" and "addr", then the range cannot begin at
 2572          * "kernel_vm_end".  In fact, its beginning address cannot be less
 2573          * than the kernel.  Thus, there is no immediate need to allocate
 2574          * any new kernel page table pages between "kernel_vm_end" and
 2575          * "KERNBASE".
 2576          */
 2577         if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
 2578                 return;
 2579 
 2580         addr = roundup2(addr, NBPDR);
              /*
               * Clamp the target to the end of the kernel map.
               * NOTE(review): comparing "addr - 1" presumably also catches an
               * "addr" that wrapped around to zero in roundup2() -- confirm.
               */
 2581         if (addr - 1 >= kernel_map->max_offset)
 2582                 addr = kernel_map->max_offset;
 2583         while (kernel_vm_end < addr) {
 2584                 pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
 2585                 if ((*pdpe & X86_PG_V) == 0) {
 2586                         /* We need a new PDP entry */
 2587                         nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
 2588                             VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
 2589                             VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 2590                         if (nkpg == NULL)
 2591                                 panic("pmap_growkernel: no memory to grow kernel");
                              /* The allocator may hand back a page that is not zeroed. */
 2592                         if ((nkpg->flags & PG_ZERO) == 0)
 2593                                 pmap_zero_page(nkpg);
 2594                         paddr = VM_PAGE_TO_PHYS(nkpg);
 2595                         *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
 2596                             X86_PG_A | X86_PG_M);
 2597                         continue; /* try again */
 2598                 }
 2599                 pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
                      /*
                       * A page table page is already installed for this region:
                       * only advance "kernel_vm_end" to the next NBPDR boundary.
                       */
 2600                 if ((*pde & X86_PG_V) != 0) {
 2601                         kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
 2602                         if (kernel_vm_end - 1 >= kernel_map->max_offset) {
 2603                                 kernel_vm_end = kernel_map->max_offset;
 2604                                 break;                       
 2605                         }
 2606                         continue;
 2607                 }
 2608 
                      /* Allocate, zero, and install a new page table page. */
 2609                 nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
 2610                     VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
 2611                     VM_ALLOC_ZERO);
 2612                 if (nkpg == NULL)
 2613                         panic("pmap_growkernel: no memory to grow kernel");
 2614                 if ((nkpg->flags & PG_ZERO) == 0)
 2615                         pmap_zero_page(nkpg);
 2616                 paddr = VM_PAGE_TO_PHYS(nkpg);
 2617                 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
 2618                 pde_store(pde, newpdir);
 2619 
                      /* Advance past the region that was just covered. */
 2620                 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
 2621                 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
 2622                         kernel_vm_end = kernel_map->max_offset;
 2623                         break;                       
 2624                 }
 2625         }
 2626 }
 2627 
 2628 
 2629 /***************************************************
 2630  * page management routines.
 2631  ***************************************************/
 2632 
      /*
       * Compile-time checks of the pv chunk layout that the code below depends
       * on: a chunk is exactly one page in size, its free map has _NPCM == 3
       * elements, and it holds _NPCPV == 168 pv entries.
       */
 2633 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
 2634 CTASSERT(_NPCM == 3);
 2635 CTASSERT(_NPCPV == 168);
 2636 
 2637 static __inline struct pv_chunk *
 2638 pv_to_chunk(pv_entry_t pv)
 2639 {
 2640 
 2641         return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
 2642 }
 2643 
      /* The pmap that owns a pv entry, found through the entry's chunk. */
 2644 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
 2645 
      /*
       * Value of each pc_map[] element when every pv entry of the chunk is
       * free; a set bit marks a free entry.  The last element uses only its
       * low 40 bits: 64 + 64 + 40 = 168 entries per chunk.
       */
 2646 #define PC_FREE0        0xfffffffffffffffful
 2647 #define PC_FREE1        0xfffffffffffffffful
 2648 #define PC_FREE2        0x000000fffffffffful
 2649 
      /* The same masks in array form, indexed by pc_map[] element. */
 2650 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
 2651 
      /*
       * Optional pv entry statistics.  The counters exist only when PV_STATS
       * is defined and are exported read-only (CTLFLAG_RD) under the vm.pmap
       * sysctl node.
       */
 2652 #ifdef PV_STATS
      /* Counters for whole pv chunks. */
 2653 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
 2654 
 2655 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
 2656         "Current number of pv entry chunks");
 2657 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
 2658         "Current number of pv entry chunks allocated");
 2659 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
 2660         "Current number of pv entry chunks frees");
 2661 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
 2662         "Number of times tried to get a chunk page but failed.");
 2663 
      /* Counters for individual pv entries. */
 2664 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
 2665 static int pv_entry_spare;
 2666 
 2667 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
 2668         "Current number of pv entry frees");
 2669 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
 2670         "Current number of pv entry allocs");
 2671 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
 2672         "Current number of pv entries");
 2673 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
 2674         "Current number of spare pv entries");
 2675 #endif
 2676 
 2677 /*
 2678  * We are in a serious low memory condition.  Resort to
 2679  * drastic measures to free some pages so we can allocate
 2680  * another pv entry chunk.
 2681  *
 2682  * Returns NULL if PV entries were reclaimed from the specified pmap.
 2683  *
 2684  * We do not, however, unmap 2mpages because subsequent accesses will
 2685  * allocate per-page pv entries until repromotion occurs, thereby
 2686  * exacerbating the shortage of free pv entries.
       *
       * Otherwise the return value is a page that the caller can use for a new
       * pv chunk: either the page of a chunk that became entirely free, or a
       * page table page that was freed while mappings were destroyed (handed
       * back with a wire count of 1).
       *
       * The caller must hold pvh_global_lock and the lock of "locked_pmap",
       * and "lockp" must not be NULL.  The PV list lock referenced by "lockp"
       * may be released or switched to a different page's lock.
 2687  */
 2688 static vm_page_t
 2689 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
 2690 {
 2691         struct pch new_tail;
 2692         struct pv_chunk *pc;
 2693         struct md_page *pvh;
 2694         pd_entry_t *pde;
 2695         pmap_t pmap;
 2696         pt_entry_t *pte, tpte;
 2697         pt_entry_t PG_G, PG_A, PG_M, PG_RW;
 2698         pv_entry_t pv;
 2699         vm_offset_t va;
 2700         vm_page_t m, m_pc;
 2701         struct spglist free;
 2702         uint64_t inuse;
 2703         int bit, field, freed;
 2704 
 2705         rw_assert(&pvh_global_lock, RA_LOCKED);
 2706         PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
 2707         KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
 2708         pmap = NULL;
 2709         m_pc = NULL;
 2710         PG_G = PG_A = PG_M = PG_RW = 0;
 2711         SLIST_INIT(&free);
 2712         TAILQ_INIT(&new_tail);
 2713         mtx_lock(&pv_chunks_mutex);
              /*
               * Take chunks from the head of the global pv_chunks list until a
               * page table page has been freed or the list is empty.  Chunks
               * that are kept are parked on "new_tail" and appended back to
               * pv_chunks after the loop.  pv_chunks_mutex is held whenever the
               * loop condition is evaluated and is dropped while a chunk is
               * being processed.
               */
 2714         while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) {
 2715                 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 2716                 mtx_unlock(&pv_chunks_mutex);
                      /*
                       * Moving on to a chunk of a different pmap: finish with
                       * the previous pmap (invalidate, unlock) and lock the
                       * new one.
                       */
 2717                 if (pmap != pc->pc_pmap) {
 2718                         if (pmap != NULL) {
 2719                                 pmap_invalidate_all(pmap);
 2720                                 if (pmap != locked_pmap)
 2721                                         PMAP_UNLOCK(pmap);
 2722                         }
 2723                         pmap = pc->pc_pmap;
 2724                         /* Avoid deadlock and lock recursion. */
 2725                         if (pmap > locked_pmap) {
 2726                                 RELEASE_PV_LIST_LOCK(lockp);
 2727                                 PMAP_LOCK(pmap);
 2728                         } else if (pmap != locked_pmap &&
 2729                             !PMAP_TRYLOCK(pmap)) {
                                      /* Lock not available: skip this chunk. */
 2730                                 pmap = NULL;
 2731                                 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
 2732                                 mtx_lock(&pv_chunks_mutex);
 2733                                 continue;
 2734                         }
                              /* Fetch the PTE bit definitions of this pmap. */
 2735                         PG_G = pmap_global_bit(pmap);
 2736                         PG_A = pmap_accessed_bit(pmap);
 2737                         PG_M = pmap_modified_bit(pmap);
 2738                         PG_RW = pmap_rw_bit(pmap);
 2739                 }
 2740 
 2741                 /*
 2742                  * Destroy every non-wired, 4 KB page mapping in the chunk.
 2743                  */
 2744                 freed = 0;
 2745                 for (field = 0; field < _NPCM; field++) {
                              /*
                               * "inuse" has a bit set for every allocated pv
                               * entry of this map element; the loop step clears
                               * the bit that was just examined.
                               */
 2746                         for (inuse = ~pc->pc_map[field] & pc_freemask[field];
 2747                             inuse != 0; inuse &= ~(1UL << bit)) {
 2748                                 bit = bsfq(inuse);
 2749                                 pv = &pc->pc_pventry[field * 64 + bit];
 2750                                 va = pv->pv_va;
 2751                                 pde = pmap_pde(pmap, va);
                                      /* Leave 2MB page mappings alone. */
 2752                                 if ((*pde & PG_PS) != 0)
 2753                                         continue;
 2754                                 pte = pmap_pde_to_pte(pde, va);
                                      /* Leave wired mappings alone. */
 2755                                 if ((*pte & PG_W) != 0)
 2756                                         continue;
 2757                                 tpte = pte_load_clear(pte);
 2758                                 if ((tpte & PG_G) != 0)
 2759                                         pmap_invalidate_page(pmap, va);
                                      /*
                                       * Carry the modified and accessed state
                                       * of the destroyed mapping over to the
                                       * page.
                                       */
 2760                                 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
 2761                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 2762                                         vm_page_dirty(m);
 2763                                 if ((tpte & PG_A) != 0)
 2764                                         vm_page_aflag_set(m, PGA_REFERENCED);
                                      /* Unlink the pv entry from the page. */
 2765                                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 2766                                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 2767                                 m->md.pv_gen++;
                                      /*
                                       * When neither the page's own pv list nor
                                       * the pv list returned by pa_to_pvh() has
                                       * entries left, clear PGA_WRITEABLE.
                                       */
 2768                                 if (TAILQ_EMPTY(&m->md.pv_list) &&
 2769                                     (m->flags & PG_FICTITIOUS) == 0) {
 2770                                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 2771                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
 2772                                                 vm_page_aflag_clear(m,
 2773                                                     PGA_WRITEABLE);
 2774                                         }
 2775                                 }
                                      /* Mark the pv entry free in the chunk. */
 2776                                 pc->pc_map[field] |= 1UL << bit;
 2777                                 pmap_unuse_pt(pmap, va, *pde, &free);
 2778                                 freed++;
 2779                         }
 2780                 }
                      /* Nothing in this chunk could be reclaimed: keep it. */
 2781                 if (freed == 0) {
 2782                         TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
 2783                         mtx_lock(&pv_chunks_mutex);
 2784                         continue;
 2785                 }
 2786                 /* Every freed mapping is for a 4 KB page. */
 2787                 pmap_resident_count_dec(pmap, freed);
 2788                 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
 2789                 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
 2790                 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 2791                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 2792                 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
 2793                     pc->pc_map[2] == PC_FREE2) {
 2794                         PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
 2795                         PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
 2796                         PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
 2797                         /* Entire chunk is free; return it. */
 2798                         m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 2799                         dump_drop_page(m_pc->phys_addr);
                              /* Leave the loop with pv_chunks_mutex held. */
 2800                         mtx_lock(&pv_chunks_mutex);
 2801                         break;
 2802                 }
                      /*
                       * The chunk still has entries in use: put it back at the
                       * head of its pmap's chunk list, where its newly freed
                       * entries are found first, and keep it on the global list.
                       */
 2803                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 2804                 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
 2805                 mtx_lock(&pv_chunks_mutex);
 2806                 /* One freed pv entry in locked_pmap is sufficient. */
 2807                 if (pmap == locked_pmap)
 2808                         break;
 2809         }
              /* Append the chunks that were kept back onto the global list. */
 2810         TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
 2811         mtx_unlock(&pv_chunks_mutex);
              /* Finish with the last pmap that was processed. */
 2812         if (pmap != NULL) {
 2813                 pmap_invalidate_all(pmap);
 2814                 if (pmap != locked_pmap)
 2815                         PMAP_UNLOCK(pmap);
 2816         }
              /*
               * No whole chunk was freed, but page table pages were: hand one
               * of them to the caller and free the rest.
               */
 2817         if (m_pc == NULL && !SLIST_EMPTY(&free)) {
 2818                 m_pc = SLIST_FIRST(&free);
 2819                 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
 2820                 /* Recycle a freed page table page. */
 2821                 m_pc->wire_count = 1;
 2822                 atomic_add_int(&cnt.v_wire_count, 1);
 2823         }
 2824         pmap_free_zero_pages(&free);
 2825         return (m_pc);
 2826 }
 2827 
 2828 /*
 2829  * free the pv_entry back to the free list
 2830  */
 2831 static void
 2832 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 2833 {
 2834         struct pv_chunk *pc;
 2835         int idx, field, bit;
 2836 
 2837         rw_assert(&pvh_global_lock, RA_LOCKED);
 2838         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2839         PV_STAT(atomic_add_long(&pv_entry_frees, 1));
 2840         PV_STAT(atomic_add_int(&pv_entry_spare, 1));
 2841         PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
 2842         pc = pv_to_chunk(pv);
 2843         idx = pv - &pc->pc_pventry[0];
 2844         field = idx / 64;
 2845         bit = idx % 64;
 2846         pc->pc_map[field] |= 1ul << bit;
 2847         if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
 2848             pc->pc_map[2] != PC_FREE2) {
 2849                 /* 98% of the time, pc is already at the head of the list. */
 2850                 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
 2851                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 2852                         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 2853                 }
 2854                 return;
 2855         }
 2856         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 2857         free_pv_chunk(pc);
 2858 }
 2859 
 2860 static void
 2861 free_pv_chunk(struct pv_chunk *pc)
 2862 {
 2863         vm_page_t m;
 2864 
 2865         mtx_lock(&pv_chunks_mutex);
 2866         TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 2867         mtx_unlock(&pv_chunks_mutex);
 2868         PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
 2869         PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
 2870         PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
 2871         /* entire chunk is free, return it */
 2872         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 2873         dump_drop_page(m->phys_addr);
 2874         vm_page_unwire(m, 0);
 2875         vm_page_free(m);
 2876 }
 2877 
 2878 /*
 2879  * Returns a new PV entry, allocating a new PV chunk from the system when
 2880  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
 2881  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
 2882  * returned.
 2883  *
 2884  * The given PV list lock may be released.
       *
       * The caller must hold pvh_global_lock and the pmap lock.
 2885  */
 2886 static pv_entry_t
 2887 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
 2888 {
 2889         int bit, field;
 2890         pv_entry_t pv;
 2891         struct pv_chunk *pc;
 2892         vm_page_t m;
 2893 
 2894         rw_assert(&pvh_global_lock, RA_LOCKED);
 2895         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2896         PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
 2897 retry:
              /* Only the chunk at the head of the pmap's list is searched. */
 2898         pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 2899         if (pc != NULL) {
                      /* Find the first map element that has a free bit set. */
 2900                 for (field = 0; field < _NPCM; field++) {
 2901                         if (pc->pc_map[field]) {
 2902                                 bit = bsfq(pc->pc_map[field]);
 2903                                 break;
 2904                         }
 2905                 }
                      /* "bit" is valid only when the search stopped early. */
 2906                 if (field < _NPCM) {
 2907                         pv = &pc->pc_pventry[field * 64 + bit];
 2908                         pc->pc_map[field] &= ~(1ul << bit);
 2909                         /* If this was the last item, move it to tail */
 2910                         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
 2911                             pc->pc_map[2] == 0) {
 2912                                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 2913                                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
 2914                                     pc_list);
 2915                         }
 2916                         PV_STAT(atomic_add_long(&pv_entry_count, 1));
 2917                         PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
 2918                         return (pv);
 2919                 }
 2920         }
 2921         /* No free items, allocate another chunk */
 2922         m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 2923             VM_ALLOC_WIRED);
 2924         if (m == NULL) {
                      /* Without a PV list lock pointer no reclamation is tried. */
 2925                 if (lockp == NULL) {
 2926                         PV_STAT(pc_chunk_tryfail++);
 2927                         return (NULL);
 2928                 }
 2929                 m = reclaim_pv_chunk(pmap, lockp);
                      /*
                       * No page was handed back; search the pmap's chunks again
                       * for an entry that the reclamation may have freed.
                       */
 2930                 if (m == NULL)
 2931                         goto retry;
 2932         }
 2933         PV_STAT(atomic_add_int(&pc_chunk_count, 1));
 2934         PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
 2935         dump_add_page(m->phys_addr);
              /* The new chunk is accessed through the direct map. */
 2936         pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 2937         pc->pc_pmap = pmap;
 2938         pc->pc_map[0] = PC_FREE0 & ~1ul;        /* preallocated bit 0 */
 2939         pc->pc_map[1] = PC_FREE1;
 2940         pc->pc_map[2] = PC_FREE2;
 2941         mtx_lock(&pv_chunks_mutex);
 2942         TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 2943         mtx_unlock(&pv_chunks_mutex);
              /* Entry 0 was marked allocated above and is returned. */
 2944         pv = &pc->pc_pventry[0];
 2945         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 2946         PV_STAT(atomic_add_long(&pv_entry_count, 1));
 2947         PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
 2948         return (pv);
 2949 }
 2950 
 2951 /*
 2952  * Returns the number of one bits within the given PV chunk map element.
 2953  */
 2954 static int
 2955 popcnt_pc_map_elem(uint64_t elem)
 2956 {
 2957         int count;
 2958 
 2959         /*
 2960          * This simple method of counting the one bits performs well because
 2961          * the given element typically contains more zero bits than one bits.
 2962          */
 2963         count = 0;
 2964         for (; elem != 0; elem &= elem - 1)
 2965                 count++;
 2966         return (count);
 2967 }
 2968 
 2969 /*
 2970  * Ensure that the number of spare PV entries in the specified pmap meets or
 2971  * exceeds the given count, "needed".
 2972  *
 2973  * The given PV list lock may be released.
       *
       * The caller must hold pvh_global_lock and the pmap lock, and "lockp"
       * must not be NULL.
 2974  */
 2975 static void
 2976 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
 2977 {
 2978         struct pch new_tail;
 2979         struct pv_chunk *pc;
 2980         int avail, free;
 2981         vm_page_t m;
 2982 
 2983         rw_assert(&pvh_global_lock, RA_LOCKED);
 2984         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2985         KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
 2986 
 2987         /*
 2988          * Newly allocated PV chunks must be stored in a private list until
 2989          * the required number of PV chunks have been allocated.  Otherwise,
 2990          * reclaim_pv_chunk() could recycle one of these chunks.  In
 2991          * contrast, these chunks must be added to the pmap upon allocation.
 2992          */
 2993         TAILQ_INIT(&new_tail);
 2994 retry:
              /* Count the free pv entries that the pmap's chunks already hold. */
 2995         avail = 0;
 2996         TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
                      /* Use the popcnt instruction only when the CPU reports it. */
 2997                 if ((cpu_feature2 & CPUID2_POPCNT) == 0) {
 2998                         free = popcnt_pc_map_elem(pc->pc_map[0]);
 2999                         free += popcnt_pc_map_elem(pc->pc_map[1]);
 3000                         free += popcnt_pc_map_elem(pc->pc_map[2]);
 3001                 } else {
 3002                         free = popcntq(pc->pc_map[0]);
 3003                         free += popcntq(pc->pc_map[1]);
 3004                         free += popcntq(pc->pc_map[2]);
 3005                 }
                      /*
                       * NOTE(review): the scan stops at the first chunk without
                       * a free entry, presumably because exhausted chunks are
                       * kept at the tail of this list -- confirm.
                       */
 3006                 if (free == 0)
 3007                         break;
 3008                 avail += free;
 3009                 if (avail >= needed)
 3010                         break;
 3011         }
              /* Add whole chunks until the shortfall is covered. */
 3012         for (; avail < needed; avail += _NPCPV) {
 3013                 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 3014                     VM_ALLOC_WIRED);
 3015                 if (m == NULL) {
 3016                         m = reclaim_pv_chunk(pmap, lockp);
                              /*
                               * No page was handed back; recount, since the
                               * reclamation may have freed entries in this
                               * pmap.
                               */
 3017                         if (m == NULL)
 3018                                 goto retry;
 3019                 }
 3020                 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
 3021                 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
 3022                 dump_add_page(m->phys_addr);
 3023                 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 3024                 pc->pc_pmap = pmap;
                      /* Every entry of the new chunk starts out free. */
 3025                 pc->pc_map[0] = PC_FREE0;
 3026                 pc->pc_map[1] = PC_FREE1;
 3027                 pc->pc_map[2] = PC_FREE2;
 3028                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 3029                 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
 3030                 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
 3031         }
              /* Publish the new chunks on the global list in one step. */
 3032         if (!TAILQ_EMPTY(&new_tail)) {
 3033                 mtx_lock(&pv_chunks_mutex);
 3034                 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
 3035                 mtx_unlock(&pv_chunks_mutex);
 3036         }
 3037 }
 3038 
 3039 /*
 3040  * First find and then remove the pv entry for the specified pmap and virtual
 3041  * address from the specified pv list.  Returns the pv entry if found and NULL
 3042  * otherwise.  This operation can be performed on pv lists for either 4KB or
 3043  * 2MB page mappings.
 3044  */
 3045 static __inline pv_entry_t
 3046 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 3047 {
 3048         pv_entry_t pv;
 3049 
 3050         rw_assert(&pvh_global_lock, RA_LOCKED);
 3051         TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 3052                 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
 3053                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 3054                         pvh->pv_gen++;
 3055                         break;
 3056                 }
 3057         }
 3058         return (pv);
 3059 }
 3060 
 3061 /*
 3062  * After demotion from a 2MB page mapping to 512 4KB page mappings,
 3063  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
 3064  * entries for each of the 4KB page mappings.
       *
       * "pa" must be 2MB aligned; "va" is truncated to a 2MB boundary here.
       * The caller must hold pvh_global_lock and the pmap lock.  No pv chunk
       * is allocated by this function: the spare pv entries that it consumes
       * must already be present in the pmap's chunks.
       * NOTE(review): presumably the caller reserves them beforehand with
       * reserve_pv_entries() -- confirm against the callers.
 3065  */
 3066 static void
 3067 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 3068     struct rwlock **lockp)
 3069 {
 3070         struct md_page *pvh;
 3071         struct pv_chunk *pc;
 3072         pv_entry_t pv;
 3073         vm_offset_t va_last;
 3074         vm_page_t m;
 3075         int bit, field;
 3076 
 3077         rw_assert(&pvh_global_lock, RA_LOCKED);
 3078         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3079         KASSERT((pa & PDRMASK) == 0,
 3080             ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
 3081         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 3082 
 3083         /*
 3084          * Transfer the 2mpage's pv entry for this mapping to the first
 3085          * page's pv list.  Once this transfer begins, the pv list lock
 3086          * must not be released until the last pv entry is reinstantiated.
 3087          */
 3088         pvh = pa_to_pvh(pa);
 3089         va = trunc_2mpage(va);
 3090         pv = pmap_pvh_remove(pvh, pmap, va);
 3091         KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
 3092         m = PHYS_TO_VM_PAGE(pa);
 3093         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 3094         m->md.pv_gen++;
 3095         /* Instantiate the remaining NPTEPG - 1 pv entries. */
 3096         PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
              /* Address of the last 4KB page within the 2MB region. */
 3097         va_last = va + NBPDR - PAGE_SIZE;
 3098         for (;;) {
                      /* Consume free entries from the chunk at the list head. */
 3099                 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 3100                 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
 3101                     pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
 3102                 for (field = 0; field < _NPCM; field++) {
 3103                         while (pc->pc_map[field]) {
 3104                                 bit = bsfq(pc->pc_map[field]);
 3105                                 pc->pc_map[field] &= ~(1ul << bit);
 3106                                 pv = &pc->pc_pventry[field * 64 + bit];
                                      /*
                                       * Step to the next 4KB page; "va" and
                                       * "m" advance together.
                                       */
 3107                                 va += PAGE_SIZE;
 3108                                 pv->pv_va = va;
 3109                                 m++;
 3110                                 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 3111                             ("pmap_pv_demote_pde: page %p is not managed", m));
 3112                                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 3113                                 m->md.pv_gen++;
 3114                                 if (va == va_last)
 3115                                         goto out;
 3116                         }
 3117                 }
                      /*
                       * This chunk has no free entries left: move it to the
                       * tail and continue with the new head of the list.
                       */
 3118                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 3119                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 3120         }
 3121 out:
              /* Move the last chunk used to the tail if it is now exhausted. */
 3122         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
 3123                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 3124                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 3125         }
 3126         PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
 3127         PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
 3128 }
 3129 
 3130 /*
 3131  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
 3132  * replace the many pv entries for the 4KB page mappings by a single pv entry
 3133  * for the 2MB page mapping.
 3134  */
 3135 static void
 3136 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 3137     struct rwlock **lockp)
 3138 {
 3139         struct md_page *pvh;
 3140         pv_entry_t pv;
 3141         vm_offset_t va_last;
 3142         vm_page_t m;
 3143 
 3144         rw_assert(&pvh_global_lock, RA_LOCKED);
 3145         KASSERT((pa & PDRMASK) == 0,
 3146             ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
 3147         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 3148 
 3149         /*
 3150          * Transfer the first page's pv entry for this mapping to the 2mpage's
 3151          * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
 3152          * a transfer avoids the possibility that get_pv_entry() calls
 3153          * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
 3154          * mappings that is being promoted.
 3155          */
 3156         m = PHYS_TO_VM_PAGE(pa);
 3157         va = trunc_2mpage(va);
 3158         pv = pmap_pvh_remove(&m->md, pmap, va);
 3159         KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
 3160         pvh = pa_to_pvh(pa);
 3161         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 3162         pvh->pv_gen++;
 3163         /* Free the remaining NPTEPG - 1 pv entries. */
 3164         va_last = va + NBPDR - PAGE_SIZE;
 3165         do {
 3166                 m++;
 3167                 va += PAGE_SIZE;
 3168                 pmap_pvh_free(&m->md, pmap, va);
 3169         } while (va < va_last);
 3170 }
 3171 
 3172 /*
 3173  * First find and then destroy the pv entry for the specified pmap and virtual
 3174  * address.  This operation can be performed on pv lists for either 4KB or 2MB
 3175  * page mappings.
 3176  */
 3177 static void
 3178 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 3179 {
 3180         pv_entry_t pv;
 3181 
 3182         pv = pmap_pvh_remove(pvh, pmap, va);
 3183         KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
 3184         free_pv_entry(pmap, pv);
 3185 }
 3186 
 3187 /*
 3188  * Conditionally create the PV entry for a 4KB page mapping if the required
 3189  * memory can be allocated without resorting to reclamation.
 3190  */
 3191 static boolean_t
 3192 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
 3193     struct rwlock **lockp)
 3194 {
 3195         pv_entry_t pv;
 3196 
 3197         rw_assert(&pvh_global_lock, RA_LOCKED);
 3198         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3199         /* Pass NULL instead of the lock pointer to disable reclamation. */
 3200         if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
 3201                 pv->pv_va = va;
 3202                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 3203                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 3204                 m->md.pv_gen++;
 3205                 return (TRUE);
 3206         } else
 3207                 return (FALSE);
 3208 }
 3209 
 3210 /*
 3211  * Conditionally create the PV entry for a 2MB page mapping if the required
 3212  * memory can be allocated without resorting to reclamation.
 3213  */
 3214 static boolean_t
 3215 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 3216     struct rwlock **lockp)
 3217 {
 3218         struct md_page *pvh;
 3219         pv_entry_t pv;
 3220 
 3221         rw_assert(&pvh_global_lock, RA_LOCKED);
 3222         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3223         /* Pass NULL instead of the lock pointer to disable reclamation. */
 3224         if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
 3225                 pv->pv_va = va;
 3226                 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 3227                 pvh = pa_to_pvh(pa);
 3228                 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 3229                 pvh->pv_gen++;
 3230                 return (TRUE);
 3231         } else
 3232                 return (FALSE);
 3233 }
 3234 
 3235 /*
 3236  * Fills a page table page with mappings to consecutive physical pages.
 3237  */
 3238 static void
 3239 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
 3240 {
 3241         pt_entry_t *pte;
 3242 
 3243         for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
 3244                 *pte = newpte;
 3245                 newpte += PAGE_SIZE;
 3246         }
 3247 }
 3248 
 3249 /*
 3250  * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
 3251  * mapping is invalidated.
 3252  */
 3253 static boolean_t
 3254 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 3255 {
 3256         struct rwlock *lock;
 3257         boolean_t rv;
 3258 
 3259         lock = NULL;
 3260         rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
 3261         if (lock != NULL)
 3262                 rw_wunlock(lock);
 3263         return (rv);
 3264 }
 3265 
 3266 static boolean_t
 3267 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
 3268     struct rwlock **lockp)
 3269 {
 3270         pd_entry_t newpde, oldpde;
 3271         pt_entry_t *firstpte, newpte;
 3272         pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V;
 3273         vm_paddr_t mptepa;
 3274         vm_page_t mpte;
 3275         struct spglist free;
 3276         int PG_PTE_CACHE;
 3277 
 3278         PG_G = pmap_global_bit(pmap);
 3279         PG_A = pmap_accessed_bit(pmap);
 3280         PG_M = pmap_modified_bit(pmap);
 3281         PG_RW = pmap_rw_bit(pmap);
 3282         PG_V = pmap_valid_bit(pmap);
 3283         PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
 3284 
 3285         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3286         oldpde = *pde;
 3287         KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
 3288             ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
 3289         if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) !=
 3290             NULL)
 3291                 pmap_remove_pt_page(pmap, mpte);
 3292         else {
 3293                 KASSERT((oldpde & PG_W) == 0,
 3294                     ("pmap_demote_pde: page table page for a wired mapping"
 3295                     " is missing"));
 3296 
 3297                 /*
 3298                  * Invalidate the 2MB page mapping and return "failure" if the
 3299                  * mapping was never accessed or the allocation of the new
 3300                  * page table page fails.  If the 2MB page mapping belongs to
 3301                  * the direct map region of the kernel's address space, then
 3302                  * the page allocation request specifies the highest possible
 3303                  * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
 3304                  * normal.  Page table pages are preallocated for every other
 3305                  * part of the kernel address space, so the direct map region
 3306                  * is the only part of the kernel address space that must be
 3307                  * handled here.
 3308                  */
 3309                 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
 3310                     pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
 3311                     DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
 3312                     VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 3313                         SLIST_INIT(&free);
 3314                         pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free,
 3315                             lockp);
 3316                         pmap_invalidate_page(pmap, trunc_2mpage(va));
 3317                         pmap_free_zero_pages(&free);
 3318                         CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
 3319                             " in pmap %p", va, pmap);
 3320                         return (FALSE);
 3321                 }
 3322                 if (va < VM_MAXUSER_ADDRESS)
 3323                         pmap_resident_count_inc(pmap, 1);
 3324         }
 3325         mptepa = VM_PAGE_TO_PHYS(mpte);
 3326         firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
 3327         newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
 3328         KASSERT((oldpde & PG_A) != 0,
 3329             ("pmap_demote_pde: oldpde is missing PG_A"));
 3330         KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
 3331             ("pmap_demote_pde: oldpde is missing PG_M"));
 3332         newpte = oldpde & ~PG_PS;
 3333         newpte = pmap_swap_pat(pmap, newpte);
 3334 
 3335         /*
 3336          * If the page table page is new, initialize it.
 3337          */
 3338         if (mpte->wire_count == 1) {
 3339                 mpte->wire_count = NPTEPG;
 3340                 pmap_fill_ptp(firstpte, newpte);
 3341         }
 3342         KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
 3343             ("pmap_demote_pde: firstpte and newpte map different physical"
 3344             " addresses"));
 3345 
 3346         /*
 3347          * If the mapping has changed attributes, update the page table
 3348          * entries.
 3349          */
 3350         if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
 3351                 pmap_fill_ptp(firstpte, newpte);
 3352 
 3353         /*
 3354          * The spare PV entries must be reserved prior to demoting the
 3355          * mapping, that is, prior to changing the PDE.  Otherwise, the state
 3356          * of the PDE and the PV lists will be inconsistent, which can result
 3357          * in reclaim_pv_chunk() attempting to remove a PV entry from the
 3358          * wrong PV list and pmap_pv_demote_pde() failing to find the expected
 3359          * PV entry for the 2MB page mapping that is being demoted.
 3360          */
 3361         if ((oldpde & PG_MANAGED) != 0)
 3362                 reserve_pv_entries(pmap, NPTEPG - 1, lockp);
 3363 
 3364         /*
 3365          * Demote the mapping.  This pmap is locked.  The old PDE has
 3366          * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
 3367          * set.  Thus, there is no danger of a race with another
 3368          * processor changing the setting of PG_A and/or PG_M between
 3369          * the read above and the store below. 
 3370          */
 3371         if (workaround_erratum383)
 3372                 pmap_update_pde(pmap, va, pde, newpde);
 3373         else
 3374                 pde_store(pde, newpde);
 3375 
 3376         /*
 3377          * Invalidate a stale recursive mapping of the page table page.
 3378          */
 3379         if (va >= VM_MAXUSER_ADDRESS)
 3380                 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
 3381 
 3382         /*
 3383          * Demote the PV entry.
 3384          */
 3385         if ((oldpde & PG_MANAGED) != 0)
 3386                 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
 3387 
 3388         atomic_add_long(&pmap_pde_demotions, 1);
 3389         CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
 3390             " in pmap %p", va, pmap);
 3391         return (TRUE);
 3392 }
 3393 
 3394 /*
 3395  * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
 3396  */
 3397 static void
 3398 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 3399 {
 3400         pd_entry_t newpde;
 3401         vm_paddr_t mptepa;
 3402         vm_page_t mpte;
 3403 
 3404         KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
 3405         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3406         mpte = pmap_lookup_pt_page(pmap, va);
 3407         if (mpte == NULL)
 3408                 panic("pmap_remove_kernel_pde: Missing pt page.");
 3409 
 3410         pmap_remove_pt_page(pmap, mpte);
 3411         mptepa = VM_PAGE_TO_PHYS(mpte);
 3412         newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
 3413 
 3414         /*
 3415          * Initialize the page table page.
 3416          */
 3417         pagezero((void *)PHYS_TO_DMAP(mptepa));
 3418 
 3419         /*
 3420          * Demote the mapping.
 3421          */
 3422         if (workaround_erratum383)
 3423                 pmap_update_pde(pmap, va, pde, newpde);
 3424         else
 3425                 pde_store(pde, newpde);
 3426 
 3427         /*
 3428          * Invalidate a stale recursive mapping of the page table page.
 3429          */
 3430         pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
 3431 }
 3432 
 3433 /*
 3434  * pmap_remove_pde: do the things to unmap a superpage in a process
 3435  */
 3436 static int
 3437 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
 3438     struct spglist *free, struct rwlock **lockp)
 3439 {
 3440         struct md_page *pvh;
 3441         pd_entry_t oldpde;
 3442         vm_offset_t eva, va;
 3443         vm_page_t m, mpte;
 3444         pt_entry_t PG_G, PG_A, PG_M, PG_RW;
 3445 
 3446         PG_G = pmap_global_bit(pmap);
 3447         PG_A = pmap_accessed_bit(pmap);
 3448         PG_M = pmap_modified_bit(pmap);
 3449         PG_RW = pmap_rw_bit(pmap);
 3450 
 3451         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3452         KASSERT((sva & PDRMASK) == 0,
 3453             ("pmap_remove_pde: sva is not 2mpage aligned"));
 3454         oldpde = pte_load_clear(pdq);
 3455         if (oldpde & PG_W)
 3456                 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
 3457 
 3458         /*
 3459          * Machines that don't support invlpg, also don't support
 3460          * PG_G.
 3461          */
 3462         if (oldpde & PG_G)
 3463                 pmap_invalidate_page(kernel_pmap, sva);
 3464         pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
 3465         if (oldpde & PG_MANAGED) {
 3466                 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
 3467                 pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
 3468                 pmap_pvh_free(pvh, pmap, sva);
 3469                 eva = sva + NBPDR;
 3470                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 3471                     va < eva; va += PAGE_SIZE, m++) {
 3472                         if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
 3473                                 vm_page_dirty(m);
 3474                         if (oldpde & PG_A)
 3475                                 vm_page_aflag_set(m, PGA_REFERENCED);
 3476                         if (TAILQ_EMPTY(&m->md.pv_list) &&
 3477                             TAILQ_EMPTY(&pvh->pv_list))
 3478                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
 3479                 }
 3480         }
 3481         if (pmap == kernel_pmap) {
 3482                 pmap_remove_kernel_pde(pmap, pdq, sva);
 3483         } else {
 3484                 mpte = pmap_lookup_pt_page(pmap, sva);
 3485                 if (mpte != NULL) {
 3486                         pmap_remove_pt_page(pmap, mpte);
 3487                         pmap_resident_count_dec(pmap, 1);
 3488                         KASSERT(mpte->wire_count == NPTEPG,
 3489                             ("pmap_remove_pde: pte page wire count error"));
 3490                         mpte->wire_count = 0;
 3491                         pmap_add_delayed_free_list(mpte, free, FALSE);
 3492                         atomic_subtract_int(&cnt.v_wire_count, 1);
 3493                 }
 3494         }
 3495         return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
 3496 }
 3497 
 3498 /*
 3499  * pmap_remove_pte: do the things to unmap a page in a process
 3500  */
 3501 static int
 3502 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 
 3503     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
 3504 {
 3505         struct md_page *pvh;
 3506         pt_entry_t oldpte, PG_A, PG_M, PG_RW;
 3507         vm_page_t m;
 3508 
 3509         PG_A = pmap_accessed_bit(pmap);
 3510         PG_M = pmap_modified_bit(pmap);
 3511         PG_RW = pmap_rw_bit(pmap);
 3512 
 3513         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3514         oldpte = pte_load_clear(ptq);
 3515         if (oldpte & PG_W)
 3516                 pmap->pm_stats.wired_count -= 1;
 3517         pmap_resident_count_dec(pmap, 1);
 3518         if (oldpte & PG_MANAGED) {
 3519                 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
 3520                 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 3521                         vm_page_dirty(m);
 3522                 if (oldpte & PG_A)
 3523                         vm_page_aflag_set(m, PGA_REFERENCED);
 3524                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 3525                 pmap_pvh_free(&m->md, pmap, va);
 3526                 if (TAILQ_EMPTY(&m->md.pv_list) &&
 3527                     (m->flags & PG_FICTITIOUS) == 0) {
 3528                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 3529                         if (TAILQ_EMPTY(&pvh->pv_list))
 3530                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
 3531                 }
 3532         }
 3533         return (pmap_unuse_pt(pmap, va, ptepde, free));
 3534 }
 3535 
 3536 /*
 3537  * Remove a single page from a process address space
 3538  */
 3539 static void
 3540 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
 3541     struct spglist *free)
 3542 {
 3543         struct rwlock *lock;
 3544         pt_entry_t *pte, PG_V;
 3545 
 3546         PG_V = pmap_valid_bit(pmap);
 3547         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3548         if ((*pde & PG_V) == 0)
 3549                 return;
 3550         pte = pmap_pde_to_pte(pde, va);
 3551         if ((*pte & PG_V) == 0)
 3552                 return;
 3553         lock = NULL;
 3554         pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
 3555         if (lock != NULL)
 3556                 rw_wunlock(lock);
 3557         pmap_invalidate_page(pmap, va);
 3558 }
 3559 
 3560 /*
 3561  *      Remove the given range of addresses from the specified map.
 3562  *
 3563  *      It is assumed that the start and end are properly
 3564  *      rounded to the page size.
 3565  */
 3566 void
 3567 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 3568 {
 3569         struct rwlock *lock;
 3570         vm_offset_t va, va_next;
 3571         pml4_entry_t *pml4e;
 3572         pdp_entry_t *pdpe;
 3573         pd_entry_t ptpaddr, *pde;
 3574         pt_entry_t *pte, PG_G, PG_V;
 3575         struct spglist free;
 3576         int anyvalid;
 3577 
 3578         PG_G = pmap_global_bit(pmap);
 3579         PG_V = pmap_valid_bit(pmap);
 3580 
 3581         /*
 3582          * Perform an unsynchronized read.  This is, however, safe.
 3583          */
 3584         if (pmap->pm_stats.resident_count == 0)
 3585                 return;
 3586 
 3587         anyvalid = 0;
 3588         SLIST_INIT(&free);
 3589 
 3590         rw_rlock(&pvh_global_lock);
 3591         PMAP_LOCK(pmap);
 3592 
 3593         /*
 3594          * special handling of removing one page.  a very
 3595          * common operation and easy to short circuit some
 3596          * code.
 3597          */
 3598         if (sva + PAGE_SIZE == eva) {
 3599                 pde = pmap_pde(pmap, sva);
 3600                 if (pde && (*pde & PG_PS) == 0) {
 3601                         pmap_remove_page(pmap, sva, pde, &free);
 3602                         goto out;
 3603                 }
 3604         }
 3605 
 3606         lock = NULL;
 3607         for (; sva < eva; sva = va_next) {
 3608 
 3609                 if (pmap->pm_stats.resident_count == 0)
 3610                         break;
 3611 
 3612                 pml4e = pmap_pml4e(pmap, sva);
 3613                 if ((*pml4e & PG_V) == 0) {
 3614                         va_next = (sva + NBPML4) & ~PML4MASK;
 3615                         if (va_next < sva)
 3616                                 va_next = eva;
 3617                         continue;
 3618                 }
 3619 
 3620                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 3621                 if ((*pdpe & PG_V) == 0) {
 3622                         va_next = (sva + NBPDP) & ~PDPMASK;
 3623                         if (va_next < sva)
 3624                                 va_next = eva;
 3625                         continue;
 3626                 }
 3627 
 3628                 /*
 3629                  * Calculate index for next page table.
 3630                  */
 3631                 va_next = (sva + NBPDR) & ~PDRMASK;
 3632                 if (va_next < sva)
 3633                         va_next = eva;
 3634 
 3635                 pde = pmap_pdpe_to_pde(pdpe, sva);
 3636                 ptpaddr = *pde;
 3637 
 3638                 /*
 3639                  * Weed out invalid mappings.
 3640                  */
 3641                 if (ptpaddr == 0)
 3642                         continue;
 3643 
 3644                 /*
 3645                  * Check for large page.
 3646                  */
 3647                 if ((ptpaddr & PG_PS) != 0) {
 3648                         /*
 3649                          * Are we removing the entire large page?  If not,
 3650                          * demote the mapping and fall through.
 3651                          */
 3652                         if (sva + NBPDR == va_next && eva >= va_next) {
 3653                                 /*
 3654                                  * The TLB entry for a PG_G mapping is
 3655                                  * invalidated by pmap_remove_pde().
 3656                                  */
 3657                                 if ((ptpaddr & PG_G) == 0)
 3658                                         anyvalid = 1;
 3659                                 pmap_remove_pde(pmap, pde, sva, &free, &lock);
 3660                                 continue;
 3661                         } else if (!pmap_demote_pde_locked(pmap, pde, sva,
 3662                             &lock)) {
 3663                                 /* The large page mapping was destroyed. */
 3664                                 continue;
 3665                         } else
 3666                                 ptpaddr = *pde;
 3667                 }
 3668 
 3669                 /*
 3670                  * Limit our scan to either the end of the va represented
 3671                  * by the current page table page, or to the end of the
 3672                  * range being removed.
 3673                  */
 3674                 if (va_next > eva)
 3675                         va_next = eva;
 3676 
 3677                 va = va_next;
 3678                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 3679                     sva += PAGE_SIZE) {
 3680                         if (*pte == 0) {
 3681                                 if (va != va_next) {
 3682                                         pmap_invalidate_range(pmap, va, sva);
 3683                                         va = va_next;
 3684                                 }
 3685                                 continue;
 3686                         }
 3687                         if ((*pte & PG_G) == 0)
 3688                                 anyvalid = 1;
 3689                         else if (va == va_next)
 3690                                 va = sva;
 3691                         if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free,
 3692                             &lock)) {
 3693                                 sva += PAGE_SIZE;
 3694                                 break;
 3695                         }
 3696                 }
 3697                 if (va != va_next)
 3698                         pmap_invalidate_range(pmap, va, sva);
 3699         }
 3700         if (lock != NULL)
 3701                 rw_wunlock(lock);
 3702 out:
 3703         if (anyvalid)
 3704                 pmap_invalidate_all(pmap);
 3705         rw_runlock(&pvh_global_lock);   
 3706         PMAP_UNLOCK(pmap);
 3707         pmap_free_zero_pages(&free);
 3708 }
 3709 
 3710 /*
 3711  *      Routine:        pmap_remove_all
 3712  *      Function:
 3713  *              Removes this physical page from
 3714  *              all physical maps in which it resides.
 3715  *              Reflects back modify bits to the pager.
 3716  *
 3717  *      Notes:
 3718  *              Original versions of this routine were very
 3719  *              inefficient because they iteratively called
 3720  *              pmap_remove (slow...)
 3721  */
 3722 
 3723 void
 3724 pmap_remove_all(vm_page_t m)
 3725 {
 3726         struct md_page *pvh;
 3727         pv_entry_t pv;
 3728         pmap_t pmap;
 3729         pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
 3730         pd_entry_t *pde;
 3731         vm_offset_t va;
 3732         struct spglist free;
 3733 
 3734         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 3735             ("pmap_remove_all: page %p is not managed", m));
 3736         SLIST_INIT(&free);
 3737         rw_wlock(&pvh_global_lock);
 3738         if ((m->flags & PG_FICTITIOUS) != 0)
 3739                 goto small_mappings;
 3740         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 3741         while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
 3742                 pmap = PV_PMAP(pv);
 3743                 PMAP_LOCK(pmap);
 3744                 va = pv->pv_va;
 3745                 pde = pmap_pde(pmap, va);
 3746                 (void)pmap_demote_pde(pmap, pde, va);
 3747                 PMAP_UNLOCK(pmap);
 3748         }
 3749 small_mappings:
 3750         while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 3751                 pmap = PV_PMAP(pv);
 3752                 PMAP_LOCK(pmap);
 3753                 PG_A = pmap_accessed_bit(pmap);
 3754                 PG_M = pmap_modified_bit(pmap);
 3755                 PG_RW = pmap_rw_bit(pmap);
 3756                 pmap_resident_count_dec(pmap, 1);
 3757                 pde = pmap_pde(pmap, pv->pv_va);
 3758                 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
 3759                     " a 2mpage in page %p's pv list", m));
 3760                 pte = pmap_pde_to_pte(pde, pv->pv_va);
 3761                 tpte = pte_load_clear(pte);
 3762                 if (tpte & PG_W)
 3763                         pmap->pm_stats.wired_count--;
 3764                 if (tpte & PG_A)
 3765                         vm_page_aflag_set(m, PGA_REFERENCED);
 3766 
 3767                 /*
 3768                  * Update the vm_page_t clean and reference bits.
 3769                  */
 3770                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 3771                         vm_page_dirty(m);
 3772                 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
 3773                 pmap_invalidate_page(pmap, pv->pv_va);
 3774                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 3775                 m->md.pv_gen++;
 3776                 free_pv_entry(pmap, pv);
 3777                 PMAP_UNLOCK(pmap);
 3778         }
 3779         vm_page_aflag_clear(m, PGA_WRITEABLE);
 3780         rw_wunlock(&pvh_global_lock);
 3781         pmap_free_zero_pages(&free);
 3782 }
 3783 
 3784 /*
 3785  * pmap_protect_pde: do the things to protect a 2mpage in a process
 3786  */
 3787 static boolean_t
 3788 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
 3789 {
 3790         pd_entry_t newpde, oldpde;
 3791         vm_offset_t eva, va;
 3792         vm_page_t m;
 3793         boolean_t anychanged;
 3794         pt_entry_t PG_G, PG_M, PG_RW;
 3795 
 3796         PG_G = pmap_global_bit(pmap);
 3797         PG_M = pmap_modified_bit(pmap);
 3798         PG_RW = pmap_rw_bit(pmap);
 3799 
 3800         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3801         KASSERT((sva & PDRMASK) == 0,
 3802             ("pmap_protect_pde: sva is not 2mpage aligned"));
 3803         anychanged = FALSE;
 3804 retry:
 3805         oldpde = newpde = *pde;
 3806         if (oldpde & PG_MANAGED) {
 3807                 eva = sva + NBPDR;
 3808                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 3809                     va < eva; va += PAGE_SIZE, m++)
 3810                         if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
 3811                                 vm_page_dirty(m);
 3812         }
 3813         if ((prot & VM_PROT_WRITE) == 0)
 3814                 newpde &= ~(PG_RW | PG_M);
 3815         if ((prot & VM_PROT_EXECUTE) == 0)
 3816                 newpde |= pg_nx;
 3817         if (newpde != oldpde) {
 3818                 if (!atomic_cmpset_long(pde, oldpde, newpde))
 3819                         goto retry;
 3820                 if (oldpde & PG_G)
 3821                         pmap_invalidate_page(pmap, sva);
 3822                 else
 3823                         anychanged = TRUE;
 3824         }
 3825         return (anychanged);
 3826 }
 3827 
 3828 /*
 3829  *      Set the physical protection on the
 3830  *      specified range of this map as requested.
 3831  */
 3832 void
 3833 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 3834 {
              /*
               * Reduce the permissions of the mappings in [sva, eva) of "pmap"
               * to those in "prot".  Without VM_PROT_READ the range is simply
               * unmapped via pmap_remove().  Otherwise write permission is
               * revoked by clearing PG_RW and PG_M, and execute permission is
               * revoked by setting pg_nx; no permission is ever added here.
               * Nothing is returned.
               */
 3835         vm_offset_t va_next;
 3836         pml4_entry_t *pml4e;
 3837         pdp_entry_t *pdpe;
 3838         pd_entry_t ptpaddr, *pde;
 3839         pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
 3840         boolean_t anychanged, pv_lists_locked;
 3841 
              /* Without read access the mappings cannot remain: remove them. */
 3842         if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
 3843                 pmap_remove(pmap, sva, eva);
 3844                 return;
 3845         }
 3846 
              /* Write and execute both remain allowed: nothing to revoke. */
 3847         if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
 3848             (VM_PROT_WRITE|VM_PROT_EXECUTE))
 3849                 return;
 3850 
              /* Look up the PTE bit encodings used by this pmap. */
 3851         PG_G = pmap_global_bit(pmap);
 3852         PG_M = pmap_modified_bit(pmap);
 3853         PG_V = pmap_valid_bit(pmap);
 3854         PG_RW = pmap_rw_bit(pmap);
 3855         pv_lists_locked = FALSE;
      /*
       * Re-entry point used after the pmap lock was dropped in order to block
       * on pvh_global_lock.  "sva" keeps its current value, so the walk
       * continues with the 2MB region that triggered the restart, and
       * "anychanged" starts over because pending changes were flushed first.
       */
 3856 resume:
 3857         anychanged = FALSE;
 3858 
 3859         PMAP_LOCK(pmap);
 3860         for (; sva < eva; sva = va_next) {
 3861 
 3862                 pml4e = pmap_pml4e(pmap, sva);
 3863                 if ((*pml4e & PG_V) == 0) {
                              /*
                               * Nothing mapped at this level: skip to the next
                               * PML4 boundary.  va_next < sva means the address
                               * wrapped, so clamp to eva to end the loop.
                               */
 3864                         va_next = (sva + NBPML4) & ~PML4MASK;
 3865                         if (va_next < sva)
 3866                                 va_next = eva;
 3867                         continue;
 3868                 }
 3869 
 3870                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 3871                 if ((*pdpe & PG_V) == 0) {
                              /* Same skip, at page-directory-pointer granularity. */
 3872                         va_next = (sva + NBPDP) & ~PDPMASK;
 3873                         if (va_next < sva)
 3874                                 va_next = eva;
 3875                         continue;
 3876                 }
 3877 
                      /* va_next is the end of the 2MB region containing sva. */
 3878                 va_next = (sva + NBPDR) & ~PDRMASK;
 3879                 if (va_next < sva)
 3880                         va_next = eva;
 3881 
 3882                 pde = pmap_pdpe_to_pde(pdpe, sva);
 3883                 ptpaddr = *pde;
 3884 
 3885                 /*
 3886                  * Weed out invalid mappings.
 3887                  */
 3888                 if (ptpaddr == 0)
 3889                         continue;
 3890 
 3891                 /*
 3892                  * Check for large page.
 3893                  */
 3894                 if ((ptpaddr & PG_PS) != 0) {
 3895                         /*
 3896                          * Are we protecting the entire large page?  If not,
 3897                          * demote the mapping and fall through.
 3898                          */
 3899                         if (sva + NBPDR == va_next && eva >= va_next) {
 3900                                 /*
 3901                                  * The TLB entry for a PG_G mapping is
 3902                                  * invalidated by pmap_protect_pde().
 3903                                  */
 3904                                 if (pmap_protect_pde(pmap, pde, sva, prot))
 3905                                         anychanged = TRUE;
 3906                                 continue;
 3907                         } else {
                                      /*
                                       * pvh_global_lock is read-locked before
                                       * pmap_demote_pde() is called.  Try to get
                                       * it without blocking; on failure flush any
                                       * pending TLB changes, drop the pmap lock,
                                       * block for the global lock and restart.
                                       * pv_lists_locked is set beforehand so the
                                       * restarted pass skips this step and the
                                       * lock is released on exit.
                                       */
 3908                                 if (!pv_lists_locked) {
 3909                                         pv_lists_locked = TRUE;
 3910                                         if (!rw_try_rlock(&pvh_global_lock)) {
 3911                                                 if (anychanged)
 3912                                                         pmap_invalidate_all(
 3913                                                             pmap);
 3914                                                 PMAP_UNLOCK(pmap);
 3915                                                 rw_rlock(&pvh_global_lock);
 3916                                                 goto resume;
 3917                                         }
 3918                                 }
 3919                                 if (!pmap_demote_pde(pmap, pde, sva)) {
 3920                                         /*
 3921                                          * The large page mapping was
 3922                                          * destroyed.
 3923                                          */
 3924                                         continue;
 3925                                 }
 3926                         }
 3927                 }
 3928 
                      /* Do not run past eva inside a partially covered region. */
 3929                 if (va_next > eva)
 3930                         va_next = eva;
 3931 
 3932                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 3933                     sva += PAGE_SIZE) {
 3934                         pt_entry_t obits, pbits;
 3935                         vm_page_t m;
 3936 
                              /*
                               * Re-read the PTE whenever the compare-and-set
                               * below fails because it no longer equals obits.
                               */
 3937 retry:
 3938                         obits = pbits = *pte;
 3939                         if ((pbits & PG_V) == 0)
 3940                                 continue;
 3941 
 3942                         if ((prot & VM_PROT_WRITE) == 0) {
                                      /*
                                       * A managed, writable and modified mapping:
                                       * record the page as dirty before PG_M is
                                       * cleared along with PG_RW.
                                       */
 3943                                 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
 3944                                     (PG_MANAGED | PG_M | PG_RW)) {
 3945                                         m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
 3946                                         vm_page_dirty(m);
 3947                                 }
 3948                                 pbits &= ~(PG_RW | PG_M);
 3949                         }
 3950                         if ((prot & VM_PROT_EXECUTE) == 0)
 3951                                 pbits |= pg_nx;
 3952 
 3953                         if (pbits != obits) {
 3954                                 if (!atomic_cmpset_long(pte, obits, pbits))
 3955                                         goto retry;
                                      /*
                                       * PG_G mappings are invalidated one page at
                                       * a time; all other changes are batched
                                       * into the pmap_invalidate_all() below.
                                       */
 3956                                 if (obits & PG_G)
 3957                                         pmap_invalidate_page(pmap, sva);
 3958                                 else
 3959                                         anychanged = TRUE;
 3960                         }
 3961                 }
 3962         }
              /* Flush the batched changes, then release the locks taken above. */
 3963         if (anychanged)
 3964                 pmap_invalidate_all(pmap);
 3965         if (pv_lists_locked)
 3966                 rw_runlock(&pvh_global_lock);
 3967         PMAP_UNLOCK(pmap);
 3968 }
 3969 
 3970 /*
 3971  * Tries to promote the 512, contiguous 4KB page mappings that are within a
 3972  * single page table page (PTP) to a single 2MB page mapping.  For promotion
 3973  * to occur, two conditions must be met: (1) the 4KB page mappings must map
 3974  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
 3975  * identical characteristics. 
       *
       * "pde" is the page directory entry that currently points at the PTP, "va"
       * is a virtual address within the 2MB region, and "lockp" is handed to
       * pmap_pv_promote_pde() for managed mappings.  The pmap lock must be held
       * (asserted below).  There is no return value: on failure
       * pmap_pde_p_failures is incremented and the PDE is left unchanged; on
       * success pmap_pde_promotions is incremented.  As a side effect, PG_RW is
       * cleared on examined PTEs that have PG_RW set but PG_M clear, and this
       * is not undone when promotion subsequently fails.
 3976  */
 3977 static void
 3978 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
 3979     struct rwlock **lockp)
 3980 {
 3981         pd_entry_t newpde;
 3982         pt_entry_t *firstpte, oldpte, pa, *pte;
 3983         pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V;
 3984         vm_offset_t oldpteva;
 3985         vm_page_t mpte;
 3986         int PG_PTE_CACHE;
 3987 
              /*
               * Fetch the PTE bit encodings used by this pmap.
               * NOTE(review): PG_G and PG_PTE_CACHE are not named again in
               * this function; presumably the PG_PTE_PROMOTE macro expands to
               * them -- confirm against its definition.
               */
 3988         PG_A = pmap_accessed_bit(pmap);
 3989         PG_G = pmap_global_bit(pmap);
 3990         PG_M = pmap_modified_bit(pmap);
 3991         PG_V = pmap_valid_bit(pmap);
 3992         PG_RW = pmap_rw_bit(pmap);
 3993         PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
 3994 
 3995         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3996 
 3997         /*
 3998          * Examine the first PTE in the specified PTP.  Abort if this PTE is
 3999          * either invalid, unused, or does not map the first 4KB physical page
 4000          * within a 2MB page. 
 4001          */
 4002         firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
 4003 setpde:
 4004         newpde = *firstpte;
 4005         if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
 4006                 atomic_add_long(&pmap_pde_p_failures, 1);
 4007                 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 4008                     " in pmap %p", va, pmap);
 4009                 return;
 4010         }
 4011         if ((newpde & (PG_M | PG_RW)) == PG_RW) {
 4012                 /*
 4013                  * When PG_M is already clear, PG_RW can be cleared without
 4014                  * a TLB invalidation.
 4015                  */
 4016                 if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
 4017                         goto setpde;
 4018                 newpde &= ~PG_RW;
 4019         }
 4020 
 4021         /*
 4022          * Examine each of the other PTEs in the specified PTP.  Abort if this
 4023          * PTE maps an unexpected 4KB physical page or does not have identical
 4024          * characteristics to the first PTE.
 4025          */
              /*
               * "pa" holds the value (frame | PG_A | PG_V) expected in the last
               * PTE of the PTP; the loop walks from the last PTE back to the
               * second one, lowering the expected frame by one page each time.
               */
 4026         pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
 4027         for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
 4028 setpte:
 4029                 oldpte = *pte;
 4030                 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
 4031                         atomic_add_long(&pmap_pde_p_failures, 1);
 4032                         CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 4033                             " in pmap %p", va, pmap);
 4034                         return;
 4035                 }
 4036                 if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
 4037                         /*
 4038                          * When PG_M is already clear, PG_RW can be cleared
 4039                          * without a TLB invalidation.
 4040                          */
 4041                         if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
 4042                                 goto setpte;
 4043                         oldpte &= ~PG_RW;
                              /* oldpteva is computed only for the trace record. */
 4044                         oldpteva = (oldpte & PG_FRAME & PDRMASK) |
 4045                             (va & ~PDRMASK);
 4046                         CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
 4047                             " in pmap %p", oldpteva, pmap);
 4048                 }
                      /*
                       * Every attribute bit selected by PG_PTE_PROMOTE must
                       * match the first PTE.
                       */
 4049                 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
 4050                         atomic_add_long(&pmap_pde_p_failures, 1);
 4051                         CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 4052                             " in pmap %p", va, pmap);
 4053                         return;
 4054                 }
 4055                 pa -= PAGE_SIZE;
 4056         }
 4057 
 4058         /*
 4059          * Save the page table page in its current state until the PDE
 4060          * mapping the superpage is demoted by pmap_demote_pde() or
 4061          * destroyed by pmap_remove_pde(). 
 4062          */
 4063         mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 4064         KASSERT(mpte >= vm_page_array &&
 4065             mpte < &vm_page_array[vm_page_array_size],
 4066             ("pmap_promote_pde: page table page is out of range"));
 4067         KASSERT(mpte->pindex == pmap_pde_pindex(va),
 4068             ("pmap_promote_pde: page table page's pindex is wrong"));
              /* A non-zero return from pmap_insert_pt_page() aborts the promotion. */
 4069         if (pmap_insert_pt_page(pmap, mpte)) {
 4070                 atomic_add_long(&pmap_pde_p_failures, 1);
 4071                 CTR2(KTR_PMAP,
 4072                     "pmap_promote_pde: failure for va %#lx in pmap %p", va,
 4073                     pmap);
 4074                 return;
 4075         }
 4076 
 4077         /*
 4078          * Promote the pv entries.
 4079          */
 4080         if ((newpde & PG_MANAGED) != 0)
 4081                 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
 4082 
 4083         /*
 4084          * Propagate the PAT index to its proper position.
 4085          */
 4086         newpde = pmap_swap_pat(pmap, newpde);
 4087 
 4088         /*
 4089          * Map the superpage.
 4090          */
              /*
               * With workaround_erratum383 set the PDE is written through
               * pmap_update_pde() instead of a plain store.
               */
 4091         if (workaround_erratum383)
 4092                 pmap_update_pde(pmap, va, pde, PG_PS | newpde);
 4093         else
 4094                 pde_store(pde, PG_PS | newpde);
 4095 
 4096         atomic_add_long(&pmap_pde_promotions, 1);
 4097         CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
 4098             " in pmap %p", va, pmap);
 4099 }
 4100 
 4101 /*
 4102  *      Insert the given physical page (m) at
 4103  *      the specified virtual address (va) in the
 4104  *      target physical map with the protection requested.
 4105  *
 4106  *      If specified, the page will be wired down, meaning
 4107  *      that the related pte can not be reclaimed.
 4108  *
 4109  *      NB:  This is the only routine which MAY NOT lazy-evaluate
 4110  *      or lose information.  That is, this routine must actually
 4111  *      insert this page into the given map NOW.
       *
       *      When "access" includes VM_PROT_WRITE the new PTE is created
       *      with PG_M already set; "prot" must then include VM_PROT_WRITE
       *      as well (asserted).  "prot" selects PG_RW for write and pg_nx
       *      when execute is absent.  "va" is truncated to a page boundary.
       *      Nothing is returned.
 4112  */
 4113 void
 4114 pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
 4115     vm_prot_t prot, boolean_t wired)
 4116 {
 4117         struct rwlock *lock;
 4118         pd_entry_t *pde;
 4119         pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
 4120         pt_entry_t newpte, origpte;
 4121         pv_entry_t pv;
 4122         vm_paddr_t opa, pa;
 4123         vm_page_t mpte, om;
 4124 
 4125         PG_A = pmap_accessed_bit(pmap);
 4126         PG_G = pmap_global_bit(pmap);
 4127         PG_M = pmap_modified_bit(pmap);
 4128         PG_V = pmap_valid_bit(pmap);
 4129         PG_RW = pmap_rw_bit(pmap);
 4130 
 4131         va = trunc_page(va);
 4132         KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
 4133         KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
 4134             ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
 4135             va));
 4136         KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
 4137             va >= kmi.clean_eva,
 4138             ("pmap_enter: managed mapping within the clean submap"));
 4139         if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
 4140                 VM_OBJECT_ASSERT_WLOCKED(m->object);
              /*
               * Build the new PTE value from the arguments before any lock is
               * taken.
               */
 4141         pa = VM_PAGE_TO_PHYS(m);
 4142         newpte = (pt_entry_t)(pa | PG_A | PG_V);
 4143         if ((access & VM_PROT_WRITE) != 0)
 4144                 newpte |= PG_M;
 4145         if ((prot & VM_PROT_WRITE) != 0)
 4146                 newpte |= PG_RW;
 4147         KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
 4148             ("pmap_enter: access includes VM_PROT_WRITE but prot doesn't"));
 4149         if ((prot & VM_PROT_EXECUTE) == 0)
 4150                 newpte |= pg_nx;
 4151         if (wired)
 4152                 newpte |= PG_W;
 4153         if (va < VM_MAXUSER_ADDRESS)
 4154                 newpte |= PG_U;
 4155         if (pmap == kernel_pmap)
 4156                 newpte |= PG_G;
 4157         newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0);
 4158 
 4159         /*
 4160          * Set modified bit gratuitously for writeable mappings if
 4161          * the page is unmanaged. We do not want to take a fault
 4162          * to do the dirty bit accounting for these mappings.
 4163          */
 4164         if ((m->oflags & VPO_UNMANAGED) != 0) {
 4165                 if ((newpte & PG_RW) != 0)
 4166                         newpte |= PG_M;
 4167         }
 4168 
 4169         mpte = NULL;
 4170 
 4171         lock = NULL;
 4172         rw_rlock(&pvh_global_lock);
 4173         PMAP_LOCK(pmap);
 4174 
 4175         /*
 4176          * In the case that a page table page is not
 4177          * resident, we are creating it here.
 4178          */
 4179 retry:
 4180         pde = pmap_pde(pmap, va);
              /*
               * Usable PDE: it is valid and either not a 2MB mapping or one
               * that pmap_demote_pde_locked() successfully demoted.
               */
 4181         if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
 4182             pmap_demote_pde_locked(pmap, pde, va, &lock))) {
 4183                 pte = pmap_pde_to_pte(pde, va);
                      /*
                       * For a user address take one reference on the page
                       * table page, unless mpte was already set by the
                       * allocation path below (NOTE(review): presumably
                       * _pmap_allocpte() returns the page already referenced
                       * -- confirm).
                       */
 4184                 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
 4185                         mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 4186                         mpte->wire_count++;
 4187                 }
 4188         } else if (va < VM_MAXUSER_ADDRESS) {
 4189                 /*
 4190                  * Here if the pte page isn't mapped, or if it has been
 4191                  * deallocated.
 4192                  */
 4193                 mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), &lock);
 4194                 goto retry;
 4195         } else
 4196                 panic("pmap_enter: invalid page directory va=%#lx", va);
 4197 
 4198         origpte = *pte;
 4199 
 4200         /*
 4201          * Is the specified virtual address already mapped?
 4202          */
 4203         if ((origpte & PG_V) != 0) {
 4204                 /*
 4205                  * Wiring change, just update stats. We don't worry about
 4206                  * wiring PT pages as they remain resident as long as there
 4207                  * are valid mappings in them. Hence, if a user page is wired,
 4208                  * the PT page will be also.
 4209                  */
 4210                 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
 4211                         pmap->pm_stats.wired_count++;
 4212                 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
 4213                         pmap->pm_stats.wired_count--;
 4214 
 4215                 /*
 4216                  * Remove the extra PT page reference.
 4217                  */
 4218                 if (mpte != NULL) {
 4219                         mpte->wire_count--;
 4220                         KASSERT(mpte->wire_count > 0,
 4221                             ("pmap_enter: missing reference to page table page,"
 4222                              " va: 0x%lx", va));
 4223                 }
 4224 
 4225                 /*
 4226                  * Has the physical page changed?
 4227                  */
 4228                 opa = origpte & PG_FRAME;
 4229                 if (opa == pa) {
 4230                         /*
 4231                          * No, might be a protection or wiring change.
 4232                          */
 4233                         if ((origpte & PG_MANAGED) != 0) {
 4234                                 newpte |= PG_MANAGED;
 4235                                 if ((newpte & PG_RW) != 0)
 4236                                         vm_page_aflag_set(m, PGA_WRITEABLE);
 4237                         }
                              /*
                               * If old and new differ at most in PG_M/PG_A the
                               * PTE is left as it is; otherwise rewrite it
                               * without touching the PV list.
                               */
 4238                         if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
 4239                                 goto unchanged;
 4240                         goto validate;
 4241                 }
 4242         } else {
 4243                 /*
 4244                  * Increment the counters.
 4245                  */
 4246                 if ((newpte & PG_W) != 0)
 4247                         pmap->pm_stats.wired_count++;
 4248                 pmap_resident_count_inc(pmap, 1);
 4249         }
 4250 
 4251         /*
 4252          * Enter on the PV list if part of our managed memory.
 4253          */
 4254         if ((m->oflags & VPO_UNMANAGED) == 0) {
 4255                 newpte |= PG_MANAGED;
 4256                 pv = get_pv_entry(pmap, &lock);
 4257                 pv->pv_va = va;
 4258                 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
 4259                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 4260                 m->md.pv_gen++;
 4261                 if ((newpte & PG_RW) != 0)
 4262                         vm_page_aflag_set(m, PGA_WRITEABLE);
 4263         }
 4264 
 4265         /*
 4266          * Update the PTE.
 4267          */
 4268         if ((origpte & PG_V) != 0) {
              /*
               * "validate" is also reached by goto from the same-physical-page
               * case above.  Store the new PTE and examine the value it replaced.
               */
 4269 validate:
 4270                 origpte = pte_load_store(pte, newpte);
 4271                 opa = origpte & PG_FRAME;
 4272                 if (opa != pa) {
                              /*
                               * A different page was mapped here: carry its
                               * modified/referenced state over to the old
                               * vm_page and drop its pv entry.
                               */
 4273                         if ((origpte & PG_MANAGED) != 0) {
 4274                                 om = PHYS_TO_VM_PAGE(opa);
 4275                                 if ((origpte & (PG_M | PG_RW)) == (PG_M |
 4276                                     PG_RW))
 4277                                         vm_page_dirty(om);
 4278                                 if ((origpte & PG_A) != 0)
 4279                                         vm_page_aflag_set(om, PGA_REFERENCED);
 4280                                 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
 4281                                 pmap_pvh_free(&om->md, pmap, va);
 4282                                 if ((om->aflags & PGA_WRITEABLE) != 0 &&
 4283                                     TAILQ_EMPTY(&om->md.pv_list) &&
 4284                                     ((om->flags & PG_FICTITIOUS) != 0 ||
 4285                                     TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
 4286                                         vm_page_aflag_clear(om, PGA_WRITEABLE);
 4287                         }
 4288                 } else if ((newpte & PG_M) == 0 && (origpte & (PG_M |
 4289                     PG_RW)) == (PG_M | PG_RW)) {
 4290                         if ((origpte & PG_MANAGED) != 0)
 4291                                 vm_page_dirty(m);
 4292 
 4293                         /*
 4294                          * Although the PTE may still have PG_RW set, TLB
 4295                          * invalidation may nonetheless be required because
 4296                          * the PTE no longer has PG_M set.
 4297                          */
 4298                 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
 4299                         /*
 4300                          * This PTE change does not require TLB invalidation.
 4301                          */
 4302                         goto unchanged;
 4303                 }
                      /* Only a previously accessed PTE is invalidated here. */
 4304                 if ((origpte & PG_A) != 0)
 4305                         pmap_invalidate_page(pmap, va);
 4306         } else
 4307                 pte_store(pte, newpte);
 4308 
 4309 unchanged:
 4310 
 4311         /*
 4312          * If both the page table page and the reservation are fully
 4313          * populated, then attempt promotion.
 4314          */
 4315         if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
 4316             pmap_ps_enabled(pmap) &&
 4317             (m->flags & PG_FICTITIOUS) == 0 &&
 4318             vm_reserv_level_iffullpop(m) == 0)
 4319                 pmap_promote_pde(pmap, pde, va, &lock);
 4320 
              /*
               * Release the PV list lock left in "lock", if any, then the
               * global and pmap locks.
               */
 4321         if (lock != NULL)
 4322                 rw_wunlock(lock);
 4323         rw_runlock(&pvh_global_lock);
 4324         PMAP_UNLOCK(pmap);
 4325 }
 4326 
 4327 /*
 4328  * Tries to create a 2MB page mapping.  Returns TRUE if successful and FALSE
 4329  * otherwise.  Fails if (1) a page table page cannot be allocated without
 4330  * blocking, (2) a mapping already exists at the specified virtual address, or
 4331  * (3) a pv entry cannot be allocated without reclaiming another pv entry. 
       *
       * The caller must hold pvh_global_lock and the pmap lock (both asserted).
       * PG_RW is never set here, so the new mapping is read-only; "prot" only
       * decides whether pg_nx is set.  "lockp" is passed on to
       * pmap_pv_insert_pde() for managed pages.
 4332  */
 4333 static boolean_t
 4334 pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
 4335     struct rwlock **lockp)
 4336 {
 4337         pd_entry_t *pde, newpde;
 4338         pt_entry_t PG_V;
 4339         vm_page_t mpde;
 4340         struct spglist free;
 4341 
 4342         PG_V = pmap_valid_bit(pmap);
 4343         rw_assert(&pvh_global_lock, RA_LOCKED);
 4344         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4345 
              /*
               * Failure case (1).  NOTE(review): the NULL lock pointer presumably
               * is what keeps pmap_allocpde() from blocking -- confirm.
               */
 4346         if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) {
 4347                 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 4348                     " in pmap %p", va, pmap);
 4349                 return (FALSE);
 4350         }
              /* Locate va's entry inside the page directory page via the direct map. */
 4351         pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde));
 4352         pde = &pde[pmap_pde_index(va)];
 4353         if ((*pde & PG_V) != 0) {
                      /*
                       * Failure case (2): the entry is already valid.  Drop one
                       * reference on the page directory page (presumably the
                       * one obtained from pmap_allocpde() above).
                       */
 4354                 KASSERT(mpde->wire_count > 1,
 4355                     ("pmap_enter_pde: mpde's wire count is too low"));
 4356                 mpde->wire_count--;
 4357                 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 4358                     " in pmap %p", va, pmap);
 4359                 return (FALSE);
 4360         }
 4361         newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
 4362             PG_PS | PG_V;
 4363         if ((m->oflags & VPO_UNMANAGED) == 0) {
 4364                 newpde |= PG_MANAGED;
 4365 
 4366                 /*
 4367                  * Abort this mapping if its PV entry could not be created.
 4368                  */
 4369                 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m),
 4370                     lockp)) {
                              /*
                               * Failure case (3): give the page directory page
                               * reference back; when pmap_unwire_ptp() returns
                               * true, invalidate va and free the pages it
                               * collected on "free".
                               */
 4371                         SLIST_INIT(&free);
 4372                         if (pmap_unwire_ptp(pmap, va, mpde, &free)) {
 4373                                 pmap_invalidate_page(pmap, va);
 4374                                 pmap_free_zero_pages(&free);
 4375                         }
 4376                         CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 4377                             " in pmap %p", va, pmap);
 4378                         return (FALSE);
 4379                 }
 4380         }
 4381         if ((prot & VM_PROT_EXECUTE) == 0)
 4382                 newpde |= pg_nx;
 4383         if (va < VM_MAXUSER_ADDRESS)
 4384                 newpde |= PG_U;
 4385 
 4386         /*
 4387          * Increment counters.
 4388          */
              /* One 2MB mapping counts as NBPDR / PAGE_SIZE resident pages. */
 4389         pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
 4390 
 4391         /*
 4392          * Map the superpage.
 4393          */
 4394         pde_store(pde, newpde);
 4395 
 4396         atomic_add_long(&pmap_pde_mappings, 1);
 4397         CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
 4398             " in pmap %p", va, pmap);
 4399         return (TRUE);
 4400 }
 4401 
 4402 /*
 4403  * Maps a sequence of resident pages belonging to the same object.
 4404  * The sequence begins with the given page m_start.  This page is
 4405  * mapped at the given virtual address start.  Each subsequent page is
 4406  * mapped at a virtual address that is offset from start by the same
 4407  * amount as the page is offset from m_start within the object.  The
 4408  * last page in the sequence is the page with the largest offset from
 4409  * m_start that can be mapped at a virtual address less than the given
 4410  * virtual address end.  Not every virtual page between start and end
 4411  * is mapped; only those for which a resident page exists with the
 4412  * corresponding offset from m_start are mapped.
 4413  */
 4414 void
 4415 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
 4416     vm_page_t m_start, vm_prot_t prot)
 4417 {
 4418         struct rwlock *lock;
 4419         vm_offset_t va;
 4420         vm_page_t m, mpte;
 4421         vm_pindex_t diff, psize;
 4422 
 4423         VM_OBJECT_ASSERT_LOCKED(m_start->object);
 4424 
              /* psize is the number of pages in [start, end). */
 4425         psize = atop(end - start);
 4426         mpte = NULL;
 4427         m = m_start;
 4428         lock = NULL;
 4429         rw_rlock(&pvh_global_lock);
 4430         PMAP_LOCK(pmap);
              /*
               * Follow the object's page list while the page's offset from
               * m_start still falls inside the range.
               */
 4431         while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
 4432                 va = start + ptoa(diff);
                      /*
                       * Try a 2MB mapping when va and the physical address are
                       * 2MB aligned, the whole 2MB fits below "end", superpages
                       * are enabled and vm_reserv_level_iffullpop() returns 0.
                       * On success step to the last page of the run so that
                       * TAILQ_NEXT() below moves past it.  Otherwise map the
                       * single page, passing along the page table page
                       * returned by the previous call.
                       */
 4433                 if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
 4434                     (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
 4435                     pmap_ps_enabled(pmap) &&
 4436                     vm_reserv_level_iffullpop(m) == 0 &&
 4437                     pmap_enter_pde(pmap, va, m, prot, &lock))
 4438                         m = &m[NBPDR / PAGE_SIZE - 1];
 4439                 else
 4440                         mpte = pmap_enter_quick_locked(pmap, va, m, prot,
 4441                             mpte, &lock);
 4442                 m = TAILQ_NEXT(m, listq);
 4443         }
              /*
               * Release the PV list lock left in "lock", if any, then the
               * global and pmap locks.
               */
 4444         if (lock != NULL)
 4445                 rw_wunlock(lock);
 4446         rw_runlock(&pvh_global_lock);
 4447         PMAP_UNLOCK(pmap);
 4448 }
 4449 
 4450 /*
 4451  * This code makes some *MAJOR* assumptions:
 4452  * 1. Current pmap & pmap exists.
 4453  * 2. Not wired.
 4454  * 3. Read access.
 4455  * 4. No page table pages.
 4456  * but is *MUCH* faster than pmap_enter...
       *
       * pmap_enter_quick() is the locking wrapper: it read-locks
       * pvh_global_lock, locks the pmap, calls pmap_enter_quick_locked()
       * with no cached page table page (mpte == NULL), discards the returned
       * page table page, and then releases whatever PV list lock the callee
       * left in "lock" followed by the global and pmap locks.  Nothing is
       * returned, so the caller cannot tell whether a mapping was created.
 4457  */
 4458 
 4459 void
 4460 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 4461 {
 4462         struct rwlock *lock;
 4463 
 4464         lock = NULL;
 4465         rw_rlock(&pvh_global_lock);
 4466         PMAP_LOCK(pmap);
 4467         (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
 4468         if (lock != NULL)
 4469                 rw_wunlock(lock);
 4470         rw_runlock(&pvh_global_lock);
 4471         PMAP_UNLOCK(pmap);
 4472 }
 4473 
 4474 static vm_page_t
 4475 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
 4476     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
 4477 {
              /*
               * Creates a read-only, unwired 4KB mapping of page "m" at "va"
               * if that can be done without sleeping; PG_RW and PG_W are
               * never set.  "prot" only decides whether pg_nx is set.
               *
               * "mpte" is a hint: the page table page returned by a previous
               * call.  When its pindex matches va's page table page, its
               * wire_count is bumped directly instead of looking up the PDE.
               * "lockp" is passed to pmap_try_insert_pv_entry() for managed
               * pages.  The caller must hold pvh_global_lock and the pmap
               * lock (both asserted).
               *
               * Returns the page table page used for a user address so that
               * it can be passed to the next call.  Returns NULL for a kernel
               * address and whenever no mapping is created: a 2MB mapping
               * already covers va, no page table page could be allocated, the
               * PTE is already in use, or no pv entry could be obtained.
               */
 4478         struct spglist free;
 4479         pt_entry_t *pte, PG_V;
 4480         vm_paddr_t pa;
 4481 
 4482         KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
 4483             (m->oflags & VPO_UNMANAGED) != 0,
 4484             ("pmap_enter_quick_locked: managed mapping within the clean submap"));
 4485         PG_V = pmap_valid_bit(pmap);
 4486         rw_assert(&pvh_global_lock, RA_LOCKED);
 4487         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4488 
 4489         /*
 4490          * In the case that a page table page is not
 4491          * resident, we are creating it here.
 4492          */
 4493         if (va < VM_MAXUSER_ADDRESS) {
 4494                 vm_pindex_t ptepindex;
 4495                 pd_entry_t *ptepa;
 4496 
 4497                 /*
 4498                  * Calculate pagetable page index
 4499                  */
 4500                 ptepindex = pmap_pde_pindex(va);
 4501                 if (mpte && (mpte->pindex == ptepindex)) {
 4502                         mpte->wire_count++;
 4503                 } else {
 4504                         /*
 4505                          * Get the page directory entry
 4506                          */
 4507                         ptepa = pmap_pde(pmap, va);
 4508 
 4509                         /*
 4510                          * If the page table page is mapped, we just increment
 4511                          * the hold count, and activate it.  Otherwise, we
 4512                          * attempt to allocate a page table page.  If this
 4513                          * attempt fails, we don't retry.  Instead, we give up.
 4514                          */
 4515                         if (ptepa && (*ptepa & PG_V) != 0) {
                                      /* A 2MB mapping already covers va. */
 4516                                 if (*ptepa & PG_PS)
 4517                                         return (NULL);
 4518                                 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
 4519                                 mpte->wire_count++;
 4520                         } else {
 4521                                 /*
 4522                                  * Pass NULL instead of the PV list lock
 4523                                  * pointer, because we don't intend to sleep.
 4524                                  */
 4525                                 mpte = _pmap_allocpte(pmap, ptepindex, NULL);
 4526                                 if (mpte == NULL)
 4527                                         return (mpte);
 4528                         }
 4529                 }
 4530                 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
 4531                 pte = &pte[pmap_pte_index(va)];
 4532         } else {
 4533                 mpte = NULL;
 4534                 pte = vtopte(va);
 4535         }
              /*
               * The PTE is already in use: leave it alone and give back the
               * page table page reference taken above.
               */
 4536         if (*pte) {
 4537                 if (mpte != NULL) {
 4538                         mpte->wire_count--;
 4539                         mpte = NULL;
 4540                 }
 4541                 return (mpte);
 4542         }
 4543 
 4544         /*
 4545          * Enter on the PV list if part of our managed memory.
 4546          */
 4547         if ((m->oflags & VPO_UNMANAGED) == 0 &&
 4548             !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
 4549                 if (mpte != NULL) {
 4550                         SLIST_INIT(&free);
 4551                         if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
 4552                                 pmap_invalidate_page(pmap, va);
 4553                                 pmap_free_zero_pages(&free);
 4554                         }
 4555                         mpte = NULL;
 4556                 }
 4557                 return (mpte);
 4558         }
 4559 
 4560         /*
 4561          * Increment counters
 4562          */
 4563         pmap_resident_count_inc(pmap, 1);
 4564 
 4565         pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0);
 4566         if ((prot & VM_PROT_EXECUTE) == 0)
 4567                 pa |= pg_nx;
 4568 
 4569         /*
 4570          * Now validate mapping with RO protection
 4571          */
 4572         if ((m->oflags & VPO_UNMANAGED) == 0)
 4573                 pa |= PG_MANAGED;
              /*
               * Only mappings at user addresses may be user-accessible.
               * Setting PG_U unconditionally would expose a kernel-address
               * mapping created through the vtopte() branch above to user
               * mode; pmap_enter() and pmap_enter_pde() apply the same test.
               */
 4574         if (va < VM_MAXUSER_ADDRESS)
 4575                 pa |= PG_U;
              pte_store(pte, pa | PG_V);
 4576         return (mpte);
 4577 }
 4578 
 4579 /*
 4580  * Make a temporary mapping for a physical address.  This is only intended
 4581  * to be used for panic dumps.
       *
       * Maps "pa" at page slot "i" of crashdumpmap with pmap_kenter() and then
       * invalidates that virtual address with invlpg().  The value returned is
       * always the base of crashdumpmap, not the address of slot "i"; a caller
       * that wants slot i must add i * PAGE_SIZE itself.  No bound on "i" is
       * checked here.
 4582  */
 4583 void *
 4584 pmap_kenter_temporary(vm_paddr_t pa, int i)
 4585 {
 4586         vm_offset_t va;
 4587 
 4588         va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
 4589         pmap_kenter(va, pa);
 4590         invlpg(va);
 4591         return ((void *)crashdumpmap);
 4592 }
 4593 
 4594 /*
 4595  * This code maps large physical mmap regions into the
 4596  * processor address space.  Note that some shortcuts
 4597  * are taken, but the code works.
       *
       * The object must be write-locked and be a device (OBJT_DEVICE) or
       * scatter/gather (OBJT_SG) object.  Mappings are only created when
       * all of the following hold; otherwise the function returns without
       * touching the pmap:
       *  - "addr" and "size" are both multiples of NBPDR;
       *  - pmap_ps_enabled() reports that superpages may be used;
       *  - vm_object_populate() succeeds for the whole index range;
       *  - the first page is physically NBPDR-aligned and every following
       *    page is physically contiguous with the same md.pat_mode.
       *
       * Each NBPDR-sized step is then entered as one PG_PS page directory
       * entry.  Steps whose page directory page cannot be allocated, or
       * whose PDE is already valid, are skipped.
 4598  */
 4599 void
 4600 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
 4601     vm_pindex_t pindex, vm_size_t size)
 4602 {
 4603         pd_entry_t *pde;
 4604         pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 4605         vm_paddr_t pa, ptepa;
 4606         vm_page_t p, pdpg;
 4607         int pat_mode;
 4608 
              /* Look up this pmap's encodings of the A, M, V and RW bits. */
 4609         PG_A = pmap_accessed_bit(pmap);
 4610         PG_M = pmap_modified_bit(pmap);
 4611         PG_V = pmap_valid_bit(pmap);
 4612         PG_RW = pmap_rw_bit(pmap);
 4613 
 4614         VM_OBJECT_ASSERT_WLOCKED(object);
 4615         KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
 4616             ("pmap_object_init_pt: non-device object"));
 4617         if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
 4618                 if (!pmap_ps_enabled(pmap))
 4619                         return;
 4620                 if (!vm_object_populate(object, pindex, pindex + atop(size)))
 4621                         return;
                      /*
                       * NOTE(review): "p" and every TAILQ_NEXT() result below are
                       * dereferenced without a NULL check; this presumably relies
                       * on vm_object_populate() having left a page at every index
                       * of the range -- confirm.
                       */
 4622                 p = vm_page_lookup(object, pindex);
 4623                 KASSERT(p->valid == VM_PAGE_BITS_ALL,
 4624                     ("pmap_object_init_pt: invalid page %p", p));
 4625                 pat_mode = p->md.pat_mode;
 4626 
 4627                 /*
 4628                  * Abort the mapping if the first page is not physically
 4629                  * aligned to a 2MB page boundary.
 4630                  */
 4631                 ptepa = VM_PAGE_TO_PHYS(p);
 4632                 if (ptepa & (NBPDR - 1))
 4633                         return;
 4634 
 4635                 /*
 4636                  * Skip the first page.  Abort the mapping if the rest of
 4637                  * the pages are not physically contiguous or have differing
 4638                  * memory attributes.
 4639                  */
 4640                 p = TAILQ_NEXT(p, listq);
 4641                 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
 4642                     pa += PAGE_SIZE) {
 4643                         KASSERT(p->valid == VM_PAGE_BITS_ALL,
 4644                             ("pmap_object_init_pt: invalid page %p", p));
 4645                         if (pa != VM_PAGE_TO_PHYS(p) ||
 4646                             pat_mode != p->md.pat_mode)
 4647                                 return;
 4648                         p = TAILQ_NEXT(p, listq);
 4649                 }
 4650 
 4651                 /*
 4652                  * Map using 2MB pages.  Since "ptepa" is 2M aligned and
 4653                  * "size" is a multiple of 2M, adding the PAT setting to "pa"
 4654                  * will not affect the termination of this loop.
 4655                  */ 
 4656                 PMAP_LOCK(pmap);
 4657                 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
 4658                     pa < ptepa + size; pa += NBPDR) {
 4659                         pdpg = pmap_allocpde(pmap, addr, NULL);
 4660                         if (pdpg == NULL) {
 4661                                 /*
 4662                                  * The creation of mappings below is only an
 4663                                  * optimization.  If a page directory page
 4664                                  * cannot be allocated without blocking,
 4665                                  * continue on to the next mapping rather than
 4666                                  * blocking.
 4667                                  */
 4668                                 addr += NBPDR;
 4669                                 continue;
 4670                         }
                              /* Locate the PDE for "addr" inside the directory page. */
 4671                         pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
 4672                         pde = &pde[pmap_pde_index(addr)];
 4673                         if ((*pde & PG_V) == 0) {
                                      /*
                                       * Install a user-accessible, writable 2MB
                                       * mapping with the accessed and modified bits
                                       * already set, and account for NBPDR / PAGE_SIZE
                                       * resident pages.
                                       */
 4674                                 pde_store(pde, pa | PG_PS | PG_M | PG_A |
 4675                                     PG_U | PG_RW | PG_V);
 4676                                 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
 4677                                 atomic_add_long(&pmap_pde_mappings, 1);
 4678                         } else {
 4679                                 /* Continue on if the PDE is already valid. */
                                      /*
                                       * Give back the directory page reference that
                                       * was presumably taken by pmap_allocpde() above;
                                       * the existing mapping must still hold one.
                                       */
 4680                                 pdpg->wire_count--;
 4681                                 KASSERT(pdpg->wire_count > 0,
 4682                                     ("pmap_object_init_pt: missing reference "
 4683                                     "to page directory page, va: 0x%lx", addr));
 4684                         }
 4685                         addr += NBPDR;
 4686                 }
 4687                 PMAP_UNLOCK(pmap);
 4688         }
 4689 }
 4690 
 4691 /*
 4692  *      Routine:        pmap_change_wiring
 4693  *      Function:       Change the wiring attribute for a map/virtual-address
 4694  *                      pair.
 4695  *      In/out conditions:
 4696  *                      The mapping must already exist in the pmap.
       *      Notes:
       *                      A 2MB (PG_PS) mapping whose PG_W state differs
       *                      from "wired" is first demoted with
       *                      pmap_demote_pde(); a failed demotion panics.
       *                      A 2MB mapping already in the requested state is
       *                      left alone.  pm_stats.wired_count is adjusted
       *                      only when a PTE's PG_W bit actually changes.
 4697  */
 4698 void
 4699 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
 4700 {
 4701         pd_entry_t *pde;
 4702         pt_entry_t *pte;
 4703         boolean_t pv_lists_locked;
 4704 
 4705         pv_lists_locked = FALSE;
 4706 
 4707         /*
 4708          * Wiring is not a hardware characteristic so there is no need to
 4709          * invalidate TLB.
 4710          */
 4711 retry:
 4712         PMAP_LOCK(pmap);
 4713         pde = pmap_pde(pmap, va);
 4714         if ((*pde & PG_PS) != 0) {
                      /* True when the PDE's wired state differs from the request. */
 4715                 if (!wired != ((*pde & PG_W) == 0)) {
 4716                         if (!pv_lists_locked) {
                                      /*
                                       * The flag is set before the try-lock so that
                                       * the retry pass, entered with pvh_global_lock
                                       * already held, does not acquire it again.
                                       */
 4717                                 pv_lists_locked = TRUE;
 4718                                 if (!rw_try_rlock(&pvh_global_lock)) {
                                              /*
                                               * Do not block on pvh_global_lock while
                                               * holding the pmap lock: drop the pmap
                                               * lock, block, and start over.
                                               */
 4719                                         PMAP_UNLOCK(pmap);
 4720                                         rw_rlock(&pvh_global_lock);
 4721                                         goto retry;
 4722                                 }
 4723                         }
 4724                         if (!pmap_demote_pde(pmap, pde, va))
 4725                                 panic("pmap_change_wiring: demotion failed");
 4726                 } else
 4727                         goto out;
 4728         }
              /* Flip PG_W on the 4KB PTE and keep the wired statistic in step. */
 4729         pte = pmap_pde_to_pte(pde, va);
 4730         if (wired && (*pte & PG_W) == 0) {
 4731                 pmap->pm_stats.wired_count++;
 4732                 atomic_set_long(pte, PG_W);
 4733         } else if (!wired && (*pte & PG_W) != 0) {
 4734                 pmap->pm_stats.wired_count--;
 4735                 atomic_clear_long(pte, PG_W);
 4736         }
 4737 out:
 4738         if (pv_lists_locked)
 4739                 rw_runlock(&pvh_global_lock);
 4740         PMAP_UNLOCK(pmap);
 4741 }
 4742 
 4743 /*
 4744  *      Copy the range specified by src_addr/len
 4745  *      from the source map to the range dst_addr/len
 4746  *      in the destination map.
 4747  *
 4748  *      This routine is only advisory and need not do anything.
       *
       *      Nothing is copied when dst_addr differs from src_addr, when
       *      the two pmaps have different pm_type values, or when the
       *      destination pmap emulates accessed/dirty bits.  Copied
       *      entries never carry PG_W; 4KB entries additionally have
       *      PG_M and PG_A cleared.  Any allocation or pv entry failure
       *      simply ends the copy early.
 4749  */
 4750 
 4751 void
 4752 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
 4753     vm_offset_t src_addr)
 4754 {
 4755         struct rwlock *lock;
 4756         struct spglist free;
 4757         vm_offset_t addr;
 4758         vm_offset_t end_addr = src_addr + len;
 4759         vm_offset_t va_next;
 4760         pt_entry_t PG_A, PG_M, PG_V;
 4761 
 4762         if (dst_addr != src_addr)
 4763                 return;
 4764 
 4765         if (dst_pmap->pm_type != src_pmap->pm_type)
 4766                 return;
 4767 
 4768         /*
 4769          * EPT page table entries that require emulation of A/D bits are
 4770          * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
 4771          * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
 4772          * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
 4773          * implementations flag an EPT misconfiguration for exec-only
 4774          * mappings we skip this function entirely for emulated pmaps.
 4775          */
 4776         if (pmap_emulate_ad_bits(dst_pmap))
 4777                 return;
 4778 
 4779         lock = NULL;
 4780         rw_rlock(&pvh_global_lock);
              /* Take the two pmap locks in ascending address order. */
 4781         if (dst_pmap < src_pmap) {
 4782                 PMAP_LOCK(dst_pmap);
 4783                 PMAP_LOCK(src_pmap);
 4784         } else {
 4785                 PMAP_LOCK(src_pmap);
 4786                 PMAP_LOCK(dst_pmap);
 4787         }
 4788 
 4789         PG_A = pmap_accessed_bit(dst_pmap);
 4790         PG_M = pmap_modified_bit(dst_pmap);
 4791         PG_V = pmap_valid_bit(dst_pmap);
 4792 
              /* Walk the source range one page directory entry at a time. */
 4793         for (addr = src_addr; addr < end_addr; addr = va_next) {
 4794                 pt_entry_t *src_pte, *dst_pte;
 4795                 vm_page_t dstmpde, dstmpte, srcmpte;
 4796                 pml4_entry_t *pml4e;
 4797                 pdp_entry_t *pdpe;
 4798                 pd_entry_t srcptepaddr, *pde;
 4799 
 4800                 KASSERT(addr < UPT_MIN_ADDRESS,
 4801                     ("pmap_copy: invalid to pmap_copy page tables"));
 4802 
                      /*
                       * An invalid PML4 or PDP entry lets the walk jump to the
                       * next boundary of that level; if the addition wraps, the
                       * walk is ended by setting va_next to end_addr.
                       */
 4803                 pml4e = pmap_pml4e(src_pmap, addr);
 4804                 if ((*pml4e & PG_V) == 0) {
 4805                         va_next = (addr + NBPML4) & ~PML4MASK;
 4806                         if (va_next < addr)
 4807                                 va_next = end_addr;
 4808                         continue;
 4809                 }
 4810 
 4811                 pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
 4812                 if ((*pdpe & PG_V) == 0) {
 4813                         va_next = (addr + NBPDP) & ~PDPMASK;
 4814                         if (va_next < addr)
 4815                                 va_next = end_addr;
 4816                         continue;
 4817                 }
 4818 
 4819                 va_next = (addr + NBPDR) & ~PDRMASK;
 4820                 if (va_next < addr)
 4821                         va_next = end_addr;
 4822 
 4823                 pde = pmap_pdpe_to_pde(pdpe, addr);
 4824                 srcptepaddr = *pde;
 4825                 if (srcptepaddr == 0)
 4826                         continue;
 4827                         
 4828                 if (srcptepaddr & PG_PS) {
                              /*
                               * A 2MB mapping is copied only when it starts on a
                               * 2MB boundary and lies wholly inside the range.
                               */
 4829                         if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
 4830                                 continue;
 4831                         dstmpde = pmap_allocpde(dst_pmap, addr, NULL);
 4832                         if (dstmpde == NULL)
 4833                                 break;
 4834                         pde = (pd_entry_t *)
 4835                             PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
 4836                         pde = &pde[pmap_pde_index(addr)];
                              /*
                               * The destination slot must be empty.  An unmanaged
                               * mapping is copied as is; a managed one only if
                               * pmap_pv_insert_pde() succeeds.  Otherwise the
                               * directory page reference is dropped again.
                               */
 4837                         if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
 4838                             pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
 4839                             PG_PS_FRAME, &lock))) {
 4840                                 *pde = srcptepaddr & ~PG_W;
 4841                                 pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE);
 4842                         } else
 4843                                 dstmpde->wire_count--;
 4844                         continue;
 4845                 }
 4846 
                      /* The PDE points at a page table page: copy 4KB entries. */
 4847                 srcptepaddr &= PG_FRAME;
 4848                 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
 4849                 KASSERT(srcmpte->wire_count > 0,
 4850                     ("pmap_copy: source page table page is unused"));
 4851 
 4852                 if (va_next > end_addr)
 4853                         va_next = end_addr;
 4854 
 4855                 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
 4856                 src_pte = &src_pte[pmap_pte_index(addr)];
 4857                 dstmpte = NULL;
 4858                 while (addr < va_next) {
 4859                         pt_entry_t ptetemp;
 4860                         ptetemp = *src_pte;
 4861                         /*
 4862                          * we only virtual copy managed pages
 4863                          */
 4864                         if ((ptetemp & PG_MANAGED) != 0) {
                                      /*
                                       * Reuse the destination page table page from
                                       * the previous iteration when it still covers
                                       * "addr", taking one more reference on it;
                                       * otherwise allocate one, giving up on failure.
                                       */
 4865                                 if (dstmpte != NULL &&
 4866                                     dstmpte->pindex == pmap_pde_pindex(addr))
 4867                                         dstmpte->wire_count++;
 4868                                 else if ((dstmpte = pmap_allocpte(dst_pmap,
 4869                                     addr, NULL)) == NULL)
 4870                                         goto out;
 4871                                 dst_pte = (pt_entry_t *)
 4872                                     PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
 4873                                 dst_pte = &dst_pte[pmap_pte_index(addr)];
 4874                                 if (*dst_pte == 0 &&
 4875                                     pmap_try_insert_pv_entry(dst_pmap, addr,
 4876                                     PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
 4877                                     &lock)) {
 4878                                         /*
 4879                                          * Clear the wired, modified, and
 4880                                          * accessed (referenced) bits
 4881                                          * during the copy.
 4882                                          */
 4883                                         *dst_pte = ptetemp & ~(PG_W | PG_M |
 4884                                             PG_A);
 4885                                         pmap_resident_count_inc(dst_pmap, 1);
 4886                                 } else {
                                              /*
                                               * Slot occupied or no pv entry to be
                                               * had: release the reference taken
                                               * above and abandon the whole copy.
                                               */
 4887                                         SLIST_INIT(&free);
 4888                                         if (pmap_unwire_ptp(dst_pmap, addr,
 4889                                             dstmpte, &free)) {
 4890                                                 pmap_invalidate_page(dst_pmap,
 4891                                                     addr);
 4892                                                 pmap_free_zero_pages(&free);
 4893                                         }
 4894                                         goto out;
 4895                                 }
                                      /*
                                       * Stop scanning this page table page once the
                                       * destination holds at least as many references
                                       * as the source (presumably nothing is left to
                                       * copy at that point).
                                       */
 4896                                 if (dstmpte->wire_count >= srcmpte->wire_count)
 4897                                         break;
 4898                         }
 4899                         addr += PAGE_SIZE;
 4900                         src_pte++;
 4901                 }
 4902         }
 4903 out:
              /*
               * NOTE(review): "lock" is only ever set through the &lock
               * arguments passed above; it is released with rw_wunlock(), so
               * those helpers presumably return it write-locked -- confirm.
               */
 4904         if (lock != NULL)
 4905                 rw_wunlock(lock);
 4906         rw_runlock(&pvh_global_lock);
 4907         PMAP_UNLOCK(src_pmap);
 4908         PMAP_UNLOCK(dst_pmap);
 4909 }
 4910 
 4911 /*
 4912  *      pmap_zero_page clears the page m by handing its PHYS_TO_DMAP()
 4913  *      address to pagezero().
 4914  */
 4915 void
 4916 pmap_zero_page(vm_page_t m)
 4917 {
 4918         void *dmap_va = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 4919 
 4920         pagezero(dmap_va);
 4921 }
 4922 
 4923 /*
 4924  *      pmap_zero_page_area clears size bytes starting at byte offset off
 4925  *      within the page m, which is addressed through PHYS_TO_DMAP().
 4926  *
 4927  *      off and size may not cover an area beyond a single hardware page.
 4928  */
 4929 void
 4930 pmap_zero_page_area(vm_page_t m, int off, int size)
 4931 {
 4932         char *dmap_va = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 4933 
 4934         if (off != 0 || size != PAGE_SIZE)
 4935                 bzero(dmap_va + off, size);
 4936         else
 4937                 pagezero(dmap_va);
 4938 }
 4939 
 4940 /*
 4941  *      pmap_zero_page_idle clears the page m by handing its
 4942  *      PHYS_TO_DMAP() address to pagezero().  This
 4943  *      is intended to be called from the vm_pagezero process only and
 4944  *      outside of Giant.
 4945  */
 4946 void
 4947 pmap_zero_page_idle(vm_page_t m)
 4948 {
 4949         void *dmap_va = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 4950 
 4951         pagezero(dmap_va);
 4952 }
 4953 
 4954 /*
 4955  *      pmap_copy_page copies the (machine independent) page msrc
 4956  *      into the page mdst.  Both pages are addressed through
 4957  *      PHYS_TO_DMAP() and the transfer is done by one pagecopy()
 4958  *      call.
 4959  */
 4960 void
 4961 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
 4962 {
 4963         void *from = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
 4964         void *to = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
 4965 
 4966         pagecopy(from, to);
 4967 }
 4968 
      /*
       * NOTE(review): defined here with the value 1 and not read anywhere in
       * this part of the file; presumably consulted by the buffer code to
       * learn whether unmapped buffers may be used -- confirm with its users.
       */
 4969 int unmapped_buf_allowed = 1;
 4970 
 4971 /* Copy xfersize bytes from ma[] at a_offset to mb[] at b_offset via DMAP. */
 4972 void
 4973 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
 4974     vm_offset_t b_offset, int xfersize)
 4975 {
 4976         vm_offset_t src_off, dst_off;
 4977         char *src, *dst;
 4978         int chunk;
 4979 
 4980         for (; xfersize > 0; xfersize -= chunk) {
 4981                 src_off = a_offset & PAGE_MASK;
 4982                 dst_off = b_offset & PAGE_MASK;
 4983                 chunk = min(xfersize, PAGE_SIZE - src_off);
 4984                 chunk = min(chunk, PAGE_SIZE - dst_off);
 4985                 src = (char *)PHYS_TO_DMAP(ma[a_offset >> PAGE_SHIFT]->
 4986                     phys_addr) + src_off;
 4987                 dst = (char *)PHYS_TO_DMAP(mb[b_offset >> PAGE_SHIFT]->
 4988                     phys_addr) + dst_off;
 4989                 bcopy(src, dst, chunk);
 4990                 a_offset += chunk;
 4991                 b_offset += chunk;
 4992         }
 4993 }
 4994 
 4995 /*
 4996  * Reports whether "pmap" owns one of the first 16 pv entries found for
 4997  * the managed page "m".  The page's own pv list is scanned first; for a
 4998  * non-fictitious page the scan continues on the pa_to_pvh() list with
 4999  * the same budget.  The bound of 16 is arbitrary: answering TRUE for a
 5000  * small subset of pmaps is all that proper page aging requires.
 5001  */
 5002 boolean_t
 5003 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 5004 {
 5005         struct md_page *pvh;
 5006         struct rwlock *pv_lock;
 5007         pv_entry_t pv;
 5008         boolean_t found;
 5009         int examined;
 5010 
 5011         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 5012             ("pmap_page_exists_quick: page %p is not managed", m));
 5013         found = FALSE;
 5014         examined = 0;
 5015         rw_rlock(&pvh_global_lock);
 5016         pv_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 5017         rw_rlock(pv_lock);
 5018         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 5019                 if (PV_PMAP(pv) == pmap) {
 5020                         found = TRUE;
 5021                         goto done;
 5022                 }
 5023                 if (++examined >= 16)
 5024                         goto done;
 5025         }
 5026         if ((m->flags & PG_FICTITIOUS) != 0)
 5027                 goto done;
 5028         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 5029         TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 5030                 if (PV_PMAP(pv) == pmap) {
 5031                         found = TRUE;
 5032                         break;
 5033                 }
 5034                 if (++examined >= 16)
 5035                         break;
 5036         }
 5037 done:
 5038         rw_runlock(pv_lock);
 5039         rw_runlock(&pvh_global_lock);
 5040         return (found);
 5041 }
 5042 
 5043 /*
 5044  *      pmap_page_wired_mappings:
 5045  *
 5046  *      Return the number of managed mappings to the given physical page
 5047  *      that are wired.
       *
       *      An unmanaged page always yields 0.  Entries on the page's own
       *      pv list are checked through pmap_pte(); for a non-fictitious
       *      page, entries on its pa_to_pvh() list are checked through
       *      pmap_pde().  Each owning pmap is locked while its entry is
       *      read; if taking that lock requires dropping the pv list lock
       *      and a list changed meanwhile, the count starts over from zero.
 5048  */
 5049 int
 5050 pmap_page_wired_mappings(vm_page_t m)
 5051 {
 5052         struct rwlock *lock;
 5053         struct md_page *pvh;
 5054         pmap_t pmap;
 5055         pt_entry_t *pte;
 5056         pv_entry_t pv;
 5057         int count, md_gen, pvh_gen;
 5058 
 5059         if ((m->oflags & VPO_UNMANAGED) != 0)
 5060                 return (0);
 5061         rw_rlock(&pvh_global_lock);
 5062         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 5063         rw_rlock(lock);
 5064 restart:
              /* A restart discards whatever had been counted so far. */
 5065         count = 0;
 5066         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 5067                 pmap = PV_PMAP(pv);
 5068                 if (!PMAP_TRYLOCK(pmap)) {
                              /*
                               * The pmap lock is contended.  Remember the list
                               * generation, drop the pv list lock, block on the
                               * pmap lock, retake the pv list lock, and start
                               * over if the list was modified in between.
                               */
 5069                         md_gen = m->md.pv_gen;
 5070                         rw_runlock(lock);
 5071                         PMAP_LOCK(pmap);
 5072                         rw_rlock(lock);
 5073                         if (md_gen != m->md.pv_gen) {
 5074                                 PMAP_UNLOCK(pmap);
 5075                                 goto restart;
 5076                         }
 5077                 }
 5078                 pte = pmap_pte(pmap, pv->pv_va);
 5079                 if ((*pte & PG_W) != 0)
 5080                         count++;
 5081                 PMAP_UNLOCK(pmap);
 5082         }
 5083         if ((m->flags & PG_FICTITIOUS) == 0) {
 5084                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 5085                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 5086                         pmap = PV_PMAP(pv);
 5087                         if (!PMAP_TRYLOCK(pmap)) {
                                      /*
                                       * Same protocol as above, except that a
                                       * change to either list forces a restart.
                                       */
 5088                                 md_gen = m->md.pv_gen;
 5089                                 pvh_gen = pvh->pv_gen;
 5090                                 rw_runlock(lock);
 5091                                 PMAP_LOCK(pmap);
 5092                                 rw_rlock(lock);
 5093                                 if (md_gen != m->md.pv_gen ||
 5094                                     pvh_gen != pvh->pv_gen) {
 5095                                         PMAP_UNLOCK(pmap);
 5096                                         goto restart;
 5097                                 }
 5098                         }
                              /* For these entries PG_W is read from the PDE. */
 5099                         pte = pmap_pde(pmap, pv->pv_va);
 5100                         if ((*pte & PG_W) != 0)
 5101                                 count++;
 5102                         PMAP_UNLOCK(pmap);
 5103                 }
 5104         }
 5105         rw_runlock(lock);
 5106         rw_runlock(&pvh_global_lock);
 5107         return (count);
 5108 }
 5109 
 5110 /*
 5111  * Returns TRUE if m is mapped individually (own pv list non-empty) or as
 5112  * part of a 2mpage (pa_to_pvh() list non-empty); FALSE otherwise.
 5113  */
 5114 boolean_t
 5115 pmap_page_is_mapped(vm_page_t m)
 5116 {
 5117         struct rwlock *pv_lock;
 5118         boolean_t mapped;
 5119 
 5120         if ((m->oflags & VPO_UNMANAGED) != 0)
 5121                 return (FALSE);
 5122         rw_rlock(&pvh_global_lock);
 5123         pv_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 5124         rw_rlock(pv_lock);
 5125         mapped = !(TAILQ_EMPTY(&m->md.pv_list) &&
 5126             ((m->flags & PG_FICTITIOUS) != 0 ||
 5127             TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)));
 5128         rw_runlock(pv_lock);
 5129         rw_runlock(&pvh_global_lock);
 5130         return (mapped);
 5131 }
 5132 
 5133 /*
 5134  * Remove all pages from specified address space
 5135  * this aids process exit speeds.  Also, this code
 5136  * is special cased for current process only, but
 5137  * can have the more generic (and slightly slower)
 5138  * mode enabled.  This is much faster than pmap_remove
 5139  * in the case of running down an entire address space.
       *
       * The pmap's pv chunks are walked directly: every allocated pv entry
       * names one mapping, which is cleared unless it is wired.  A chunk is
       * released only if none of its entries had to be kept.  If "pmap" is
       * not the current CPU's pmap, a warning is printed and nothing else is
       * done.  The TLB is invalidated once, after all chunks are processed.
 5140  */
 5141 void
 5142 pmap_remove_pages(pmap_t pmap)
 5143 {
 5144         pd_entry_t ptepde;
 5145         pt_entry_t *pte, tpte;
 5146         pt_entry_t PG_M, PG_RW, PG_V;
 5147         struct spglist free;
 5148         vm_page_t m, mpte, mt;
 5149         pv_entry_t pv;
 5150         struct md_page *pvh;
 5151         struct pv_chunk *pc, *npc;
 5152         struct rwlock *lock;
 5153         int64_t bit;
 5154         uint64_t inuse, bitmask;
 5155         int allfree, field, freed, idx;
 5156         boolean_t superpage;
 5157         vm_paddr_t pa;
 5158 
 5159         if (pmap != PCPU_GET(curpmap)) {
 5160                 printf("warning: pmap_remove_pages called with non-current pmap\n");
 5161                 return;
 5162         }
 5163 
 5164         lock = NULL;
 5165         PG_M = pmap_modified_bit(pmap);
 5166         PG_V = pmap_valid_bit(pmap);
 5167         PG_RW = pmap_rw_bit(pmap);
 5168 
 5169         SLIST_INIT(&free);
 5170         rw_rlock(&pvh_global_lock);
 5171         PMAP_LOCK(pmap);
 5172         TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 5173                 allfree = 1;
 5174                 freed = 0;
 5175                 for (field = 0; field < _NPCM; field++) {
                              /*
                               * A clear bit in pc_map marks an allocated pv entry
                               * (the bit is set again below when the entry is
                               * freed); visit the allocated ones lowest bit first.
                               */
 5176                         inuse = ~pc->pc_map[field] & pc_freemask[field];
 5177                         while (inuse != 0) {
 5178                                 bit = bsfq(inuse);
 5179                                 bitmask = 1UL << bit;
 5180                                 idx = field * 64 + bit;
 5181                                 pv = &pc->pc_pventry[idx];
 5182                                 inuse &= ~bitmask;
 5183 
                                      /*
                                       * Walk down from the PDP entry.  "ptepde" ends
                                       * up holding the entry one level above the
                                       * mapping and "tpte" the mapping entry itself.
                                       */
 5184                                 pte = pmap_pdpe(pmap, pv->pv_va);
 5185                                 ptepde = *pte;
 5186                                 pte = pmap_pdpe_to_pde(pte, pv->pv_va);
 5187                                 tpte = *pte;
 5188                                 if ((tpte & (PG_PS | PG_V)) == PG_V) {
 5189                                         superpage = FALSE;
 5190                                         ptepde = tpte;
 5191                                         pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
 5192                                             PG_FRAME);
 5193                                         pte = &pte[pmap_pte_index(pv->pv_va)];
 5194                                         tpte = *pte;
 5195                                 } else {
 5196                                         /*
 5197                                          * Keep track whether 'tpte' is a
 5198                                          * superpage explicitly instead of
 5199                                          * relying on PG_PS being set.
 5200                                          *
 5201                                          * This is because PG_PS is numerically
 5202                                          * identical to PG_PTE_PAT and thus a
 5203                                          * regular page could be mistaken for
 5204                                          * a superpage.
 5205                                          */
 5206                                         superpage = TRUE;
 5207                                 }
 5208 
 5209                                 if ((tpte & PG_V) == 0) {
 5210                                         panic("bad pte va %lx pte %lx",
 5211                                             pv->pv_va, tpte);
 5212                                 }
 5213 
 5214 /*
 5215  * We cannot remove wired pages from a process' mapping at this time
 5216  */
 5217                                 if (tpte & PG_W) {
                                              /* The chunk still has a live entry. */
 5218                                         allfree = 0;
 5219                                         continue;
 5220                                 }
 5221 
 5222                                 if (superpage)
 5223                                         pa = tpte & PG_PS_FRAME;
 5224                                 else
 5225                                         pa = tpte & PG_FRAME;
 5226 
 5227                                 m = PHYS_TO_VM_PAGE(pa);
 5228                                 KASSERT(m->phys_addr == pa,
 5229                                     ("vm_page_t %p phys_addr mismatch %016jx %016jx",
 5230                                     m, (uintmax_t)m->phys_addr,
 5231                                     (uintmax_t)tpte));
 5232 
 5233                                 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
 5234                                     m < &vm_page_array[vm_page_array_size],
 5235                                     ("pmap_remove_pages: bad tpte %#jx",
 5236                                     (uintmax_t)tpte));
 5237 
 5238                                 pte_clear(pte);
 5239 
 5240                                 /*
 5241                                  * Update the vm_page_t clean/reference bits.
 5242                                  */
 5243                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 5244                                         if (superpage) {
 5245                                                 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 5246                                                         vm_page_dirty(mt);
 5247                                         } else
 5248                                                 vm_page_dirty(m);
 5249                                 }
 5250 
 5251                                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 5252 
 5253                                 /* Mark free */
 5254                                 pc->pc_map[field] |= bitmask;
 5255                                 if (superpage) {
 5256                                         pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
 5257                                         pvh = pa_to_pvh(tpte & PG_PS_FRAME);
 5258                                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 5259                                         pvh->pv_gen++;
                                              /*
                                               * With no 2MB mapping left, clear
                                               * PGA_WRITEABLE on each constituent
                                               * page that has no pv entry of its own.
                                               */
 5260                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
 5261                                                 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 5262                                                         if ((mt->aflags & PGA_WRITEABLE) != 0 &&
 5263                                                             TAILQ_EMPTY(&mt->md.pv_list))
 5264                                                                 vm_page_aflag_clear(mt, PGA_WRITEABLE);
 5265                                         }
                                              /*
                                               * NOTE(review): presumably the page
                                               * table page retained when this range
                                               * was promoted to a 2MB mapping; it is
                                               * expected to be fully populated
                                               * (wire_count == NPTEPG) and is queued
                                               * for freeing -- confirm.
                                               */
 5266                                         mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
 5267                                         if (mpte != NULL) {
 5268                                                 pmap_remove_pt_page(pmap, mpte);
 5269                                                 pmap_resident_count_dec(pmap, 1);
 5270                                                 KASSERT(mpte->wire_count == NPTEPG,
 5271                                                     ("pmap_remove_pages: pte page wire count error"));
 5272                                                 mpte->wire_count = 0;
 5273                                                 pmap_add_delayed_free_list(mpte, &free, FALSE);
 5274                                                 atomic_subtract_int(&cnt.v_wire_count, 1);
 5275                                         }
 5276                                 } else {
 5277                                         pmap_resident_count_dec(pmap, 1);
 5278                                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 5279                                         m->md.pv_gen++;
                                              /*
                                               * PGA_WRITEABLE is cleared only when
                                               * neither the page's own pv list nor
                                               * its pa_to_pvh() list has entries.
                                               */
 5280                                         if ((m->aflags & PGA_WRITEABLE) != 0 &&
 5281                                             TAILQ_EMPTY(&m->md.pv_list) &&
 5282                                             (m->flags & PG_FICTITIOUS) == 0) {
 5283                                                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 5284                                                 if (TAILQ_EMPTY(&pvh->pv_list))
 5285                                                         vm_page_aflag_clear(m, PGA_WRITEABLE);
 5286                                         }
 5287                                 }
 5288                                 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
 5289                                 freed++;
 5290                         }
 5291                 }
 5292                 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
 5293                 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
 5294                 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 5295                 if (allfree) {
 5296                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 5297                         free_pv_chunk(pc);
 5298                 }
 5299         }
 5300         if (lock != NULL)
 5301                 rw_wunlock(lock);
              /* One invalidation covers every mapping cleared above. */
 5302         pmap_invalidate_all(pmap);
 5303         rw_runlock(&pvh_global_lock);
 5304         PMAP_UNLOCK(pmap);
              /* Pages queued on "free" are released after the locks are dropped. */
 5305         pmap_free_zero_pages(&free);
 5306 }
 5307 
      /*
       *      pmap_page_test_mappings:
       *
       *      Scan the mappings of page "m" and return TRUE as soon as one is
       *      found whose page table entry has every requested bit set;
       *      return FALSE if no mapping qualifies.
       *
       *      "modified" requests the pmap's read/write and modified bits
       *      together; "accessed" requests the pmap's valid and accessed
       *      bits together.  If both are TRUE, all four bits must be set in
       *      the same entry.
       *
       *      NOTE(review): if both flags are FALSE the mask is 0, so the
       *      first mapping examined makes this function return TRUE.  The
       *      callers visible in this file always pass exactly one flag.
       *
       *      Entries on m->md.pv_list are examined through pmap_pte().
       *      Unless the page is fictitious, the entries on the list returned
       *      by pa_to_pvh() are then examined through pmap_pde()
       *      (presumably these are the 2MB superpage mappings).
       *
       *      Locking: pvh_global_lock and the page's pv list lock are held
       *      for reading for the whole scan, and each pmap is locked while
       *      its entry is tested.
       */
 5308 static boolean_t
 5309 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
 5310 {
 5311         struct rwlock *lock;
 5312         pv_entry_t pv;
 5313         struct md_page *pvh;
 5314         pt_entry_t *pte, mask;
 5315         pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 5316         pmap_t pmap;
 5317         int md_gen, pvh_gen;
 5318         boolean_t rv;
 5319 
 5320         rv = FALSE;
 5321         rw_rlock(&pvh_global_lock);
 5322         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 5323         rw_rlock(lock);
 5324 restart:
 5325         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 5326                 pmap = PV_PMAP(pv);
 5327                 if (!PMAP_TRYLOCK(pmap)) {
                              /*
                               * The pmap lock could not be taken without
                               * blocking.  Sample the list generation, drop
                               * the pv list lock, block on the pmap lock and
                               * retake the list lock.  If the generation
                               * moved in the meantime, "pv" can no longer be
                               * trusted, so start the scan over.
                               */
 5328                         md_gen = m->md.pv_gen;
 5329                         rw_runlock(lock);
 5330                         PMAP_LOCK(pmap);
 5331                         rw_rlock(lock);
 5332                         if (md_gen != m->md.pv_gen) {
 5333                                 PMAP_UNLOCK(pmap);
 5334                                 goto restart;
 5335                         }
 5336                 }
 5337                 pte = pmap_pte(pmap, pv->pv_va);
                      /*
                       * The bit values are looked up per pmap, so the mask
                       * is rebuilt for every mapping.
                       */
 5338                 mask = 0;
 5339                 if (modified) {
 5340                         PG_M = pmap_modified_bit(pmap);
 5341                         PG_RW = pmap_rw_bit(pmap);
 5342                         mask |= PG_RW | PG_M;
 5343                 }
 5344                 if (accessed) {
 5345                         PG_A = pmap_accessed_bit(pmap);
 5346                         PG_V = pmap_valid_bit(pmap);
 5347                         mask |= PG_V | PG_A;
 5348                 }
 5349                 rv = (*pte & mask) == mask;
 5350                 PMAP_UNLOCK(pmap);
 5351                 if (rv)
 5352                         goto out;
 5353         }
 5354         if ((m->flags & PG_FICTITIOUS) == 0) {
 5355                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 5356                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 5357                         pmap = PV_PMAP(pv);
 5358                         if (!PMAP_TRYLOCK(pmap)) {
                                      /*
                                       * Same protocol as above, but both
                                       * generation counts are checked because
                                       * a restart rescans both lists.
                                       */
 5359                                 md_gen = m->md.pv_gen;
 5360                                 pvh_gen = pvh->pv_gen;
 5361                                 rw_runlock(lock);
 5362                                 PMAP_LOCK(pmap);
 5363                                 rw_rlock(lock);
 5364                                 if (md_gen != m->md.pv_gen ||
 5365                                     pvh_gen != pvh->pv_gen) {
 5366                                         PMAP_UNLOCK(pmap);
 5367                                         goto restart;
 5368                                 }
 5369                         }
                              /* The entry tested here is the page directory entry. */
 5370                         pte = pmap_pde(pmap, pv->pv_va);
 5371                         mask = 0;
 5372                         if (modified) {
 5373                                 PG_M = pmap_modified_bit(pmap);
 5374                                 PG_RW = pmap_rw_bit(pmap);
 5375                                 mask |= PG_RW | PG_M;
 5376                         }
 5377                         if (accessed) {
 5378                                 PG_A = pmap_accessed_bit(pmap);
 5379                                 PG_V = pmap_valid_bit(pmap);
 5380                                 mask |= PG_V | PG_A;
 5381                         }
 5382                         rv = (*pte & mask) == mask;
 5383                         PMAP_UNLOCK(pmap);
 5384                         if (rv)
 5385                                 goto out;
 5386                 }
 5387         }
 5388 out:
 5389         rw_runlock(lock);
 5390         rw_runlock(&pvh_global_lock);
 5391         return (rv);
 5392 }
 5393 
 5394 /*
 5395  *      pmap_is_modified:
 5396  *
 5397  *      Return whether or not the specified physical page was modified
 5398  *      in any physical maps.
 5399  */
 5400 boolean_t
 5401 pmap_is_modified(vm_page_t m)
 5402 {
 5403 
 5404         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 5405             ("pmap_is_modified: page %p is not managed", m));
 5406 
 5407         /*
 5408          * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
 5409          * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
 5410          * is clear, no PTEs can have PG_M set.
 5411          */
 5412         VM_OBJECT_ASSERT_WLOCKED(m->object);
 5413         if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
 5414                 return (FALSE);
 5415         return (pmap_page_test_mappings(m, FALSE, TRUE));
 5416 }
 5417 
 5418 /*
 5419  *      pmap_is_prefaultable:
 5420  *
 5421  *      Return whether or not the specified virtual address is eligible
 5422  *      for prefault.
 5423  */
 5424 boolean_t
 5425 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 5426 {
 5427         pd_entry_t *pde;
 5428         pt_entry_t *pte, PG_V;
 5429         boolean_t rv;
 5430 
 5431         PG_V = pmap_valid_bit(pmap);
 5432         rv = FALSE;
 5433         PMAP_LOCK(pmap);
 5434         pde = pmap_pde(pmap, addr);
 5435         if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
 5436                 pte = pmap_pde_to_pte(pde, addr);
 5437                 rv = (*pte & PG_V) == 0;
 5438         }
 5439         PMAP_UNLOCK(pmap);
 5440         return (rv);
 5441 }
 5442 
 5443 /*
 5444  *      pmap_is_referenced:
 5445  *
 5446  *      Return whether or not the specified physical page was referenced
 5447  *      in any physical maps.
 5448  */
 5449 boolean_t
 5450 pmap_is_referenced(vm_page_t m)
 5451 {
 5452 
 5453         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 5454             ("pmap_is_referenced: page %p is not managed", m));
 5455         return (pmap_page_test_mappings(m, TRUE, FALSE));
 5456 }
 5457 
 5458 /*
 5459  * Clear the write and modified bits in each of the given page's mappings.
       *
       * The page must be managed and its object write-locked.  If a cleared
       * mapping had its modified bit set, the page is marked dirty so the
       * modification is not lost.  Once every mapping has been
       * write-protected, PGA_WRITEABLE is cleared on the page.
 5460  */
 5461 void
 5462 pmap_remove_write(vm_page_t m)
 5463 {
 5464         struct md_page *pvh;
 5465         pmap_t pmap;
 5466         struct rwlock *lock;
 5467         pv_entry_t next_pv, pv;
 5468         pd_entry_t *pde;
 5469         pt_entry_t oldpte, *pte, PG_M, PG_RW;
 5470         vm_offset_t va;
 5471         int pvh_gen, md_gen;
 5472 
 5473         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 5474             ("pmap_remove_write: page %p is not managed", m));
 5475 
 5476         /*
 5477          * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
 5478          * set by another thread while the object is locked.  Thus,
 5479          * if PGA_WRITEABLE is clear, no page table entries need updating.
 5480          */
 5481         VM_OBJECT_ASSERT_WLOCKED(m->object);
 5482         if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
 5483                 return;
 5484         rw_rlock(&pvh_global_lock);
 5485         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 5486         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 5487 retry_pv_loop:
 5488         rw_wlock(lock);
 5489         if ((m->flags & PG_FICTITIOUS) != 0)
 5490                 goto small_mappings;
              /*
               * First pass: demote every writeable page directory entry on
               * the pvh list, so that the second pass only has to deal with
               * 4KB page table entries.  The _SAFE iterator is used
               * presumably because demotion takes "pv" off this list.
               */
 5491         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 5492                 pmap = PV_PMAP(pv);
 5493                 if (!PMAP_TRYLOCK(pmap)) {
                              /*
                               * Drop the pv list lock before blocking on the
                               * pmap lock.  If the list changed while it was
                               * unlocked, restart from the top.
                               */
 5494                         pvh_gen = pvh->pv_gen;
 5495                         rw_wunlock(lock);
 5496                         PMAP_LOCK(pmap);
 5497                         rw_wlock(lock);
 5498                         if (pvh_gen != pvh->pv_gen) {
 5499                                 PMAP_UNLOCK(pmap);
 5500                                 rw_wunlock(lock);
 5501                                 goto retry_pv_loop;
 5502                         }
 5503                 }
 5504                 PG_RW = pmap_rw_bit(pmap);
 5505                 va = pv->pv_va;
 5506                 pde = pmap_pde(pmap, va);
 5507                 if ((*pde & PG_RW) != 0)
 5508                         (void)pmap_demote_pde_locked(pmap, pde, va, &lock);
 5509                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 5510                     ("inconsistent pv lock %p %p for page %p",
 5511                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 5512                 PMAP_UNLOCK(pmap);
 5513         }
 5514 small_mappings:
              /*
               * Second pass: write-protect every 4KB mapping of the page.
               */
 5515         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 5516                 pmap = PV_PMAP(pv);
 5517                 if (!PMAP_TRYLOCK(pmap)) {
 5518                         pvh_gen = pvh->pv_gen;
 5519                         md_gen = m->md.pv_gen;
 5520                         rw_wunlock(lock);
 5521                         PMAP_LOCK(pmap);
 5522                         rw_wlock(lock);
 5523                         if (pvh_gen != pvh->pv_gen ||
 5524                             md_gen != m->md.pv_gen) {
 5525                                 PMAP_UNLOCK(pmap);
 5526                                 rw_wunlock(lock);
 5527                                 goto retry_pv_loop;
 5528                         }
 5529                 }
 5530                 PG_M = pmap_modified_bit(pmap);
 5531                 PG_RW = pmap_rw_bit(pmap);
 5532                 pde = pmap_pde(pmap, pv->pv_va);
 5533                 KASSERT((*pde & PG_PS) == 0,
 5534                     ("pmap_remove_write: found a 2mpage in page %p's pv list",
 5535                     m));
 5536                 pte = pmap_pde_to_pte(pde, pv->pv_va);
 5537 retry:
                      /*
                       * Clear PG_RW and PG_M with a compare-and-set, and try
                       * again if the entry changed between the read and the
                       * update.
                       */
 5538                 oldpte = *pte;
 5539                 if (oldpte & PG_RW) {
 5540                         if (!atomic_cmpset_long(pte, oldpte, oldpte &
 5541                             ~(PG_RW | PG_M)))
 5542                                 goto retry;
                              /* Keep the modification that PG_M recorded. */
 5543                         if ((oldpte & PG_M) != 0)
 5544                                 vm_page_dirty(m);
 5545                         pmap_invalidate_page(pmap, pv->pv_va);
 5546                 }
 5547                 PMAP_UNLOCK(pmap);
 5548         }
 5549         rw_wunlock(lock);
 5550         vm_page_aflag_clear(m, PGA_WRITEABLE);
 5551         rw_runlock(&pvh_global_lock);
 5552 }
 5553 
 5554 static __inline boolean_t
 5555 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
 5556 {
 5557 
 5558         if (!pmap_emulate_ad_bits(pmap))
 5559                 return (TRUE);
 5560 
 5561         KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
 5562 
 5563         /*
 5564          * RWX = 010 or 110 will cause an unconditional EPT misconfiguration
 5565          * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared
 5566          * if the EPT_PG_WRITE bit is set.
 5567          */
 5568         if ((pte & EPT_PG_WRITE) != 0)
 5569                 return (FALSE);
 5570 
 5571         /*
 5572          * RWX = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set.
 5573          */
 5574         if ((pte & EPT_PG_EXECUTE) == 0 ||
 5575             ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
 5576                 return (TRUE);
 5577         else
 5578                 return (FALSE);
 5579 }
 5580 
 5581 #define PMAP_TS_REFERENCED_MAX  5
      /*
       * PMAP_TS_REFERENCED_MAX bounds the work done by one call to
       * pmap_ts_referenced(): scanning stops once cleared + not_cleared
       * reaches this value.
       */
 5582 
 5583 /*
 5584  *      pmap_ts_referenced:
 5585  *
 5586  *      Return a count of reference bits for a page, clearing those bits.
 5587  *      It is not necessary for every reference bit to be cleared, but it
 5588  *      is necessary that 0 only be returned when there are truly no
 5589  *      reference bits set.
 5590  *
 5591  *      XXX: The exact number of bits to check and clear is a matter that
 5592  *      should be tested and standardized at some point in the future for
 5593  *      optimal aging of shared pages.
       *
       *      The returned count is cleared + not_cleared: it includes
       *      reference bits that were found set but deliberately left set.
 5594  */
 5595 int
 5596 pmap_ts_referenced(vm_page_t m)
 5597 {
 5598         struct md_page *pvh;
 5599         pv_entry_t pv, pvf;
 5600         pmap_t pmap;
 5601         struct rwlock *lock;
 5602         pd_entry_t oldpde, *pde;
 5603         pt_entry_t *pte, PG_A;
 5604         vm_offset_t va;
 5605         vm_paddr_t pa;
 5606         int cleared, md_gen, not_cleared, pvh_gen;
 5607         struct spglist free;
 5608         boolean_t demoted;
 5609 
 5610         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 5611             ("pmap_ts_referenced: page %p is not managed", m));
 5612         SLIST_INIT(&free);
 5613         cleared = 0;
 5614         pa = VM_PAGE_TO_PHYS(m);
 5615         lock = PHYS_TO_PV_LIST_LOCK(pa);
 5616         pvh = pa_to_pvh(pa);
 5617         rw_rlock(&pvh_global_lock);
 5618         rw_wlock(lock);
 5619 retry:
              /*
               * Only not_cleared is reset on a retry.  Bits already cleared
               * stay cleared, so "cleared" keeps accumulating.
               */
 5620         not_cleared = 0;
 5621         if ((m->flags & PG_FICTITIOUS) != 0 ||
 5622             (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
 5623                 goto small_mappings;
 5624         pv = pvf;
              /*
               * "pvf" remembers the entry at which the scan started.  Every
               * visited entry is rotated to the tail, so the scan is complete
               * when "pvf" is back at the head of the list.  If the entry
               * named by "pvf" is destroyed, "pvf" is set to NULL and
               * re-seeded from the next entry visited.
               */
 5625         do {
 5626                 if (pvf == NULL)
 5627                         pvf = pv;
 5628                 pmap = PV_PMAP(pv);
 5629                 if (!PMAP_TRYLOCK(pmap)) {
 5630                         pvh_gen = pvh->pv_gen;
 5631                         rw_wunlock(lock);
 5632                         PMAP_LOCK(pmap);
 5633                         rw_wlock(lock);
 5634                         if (pvh_gen != pvh->pv_gen) {
 5635                                 PMAP_UNLOCK(pmap);
 5636                                 goto retry;
 5637                         }
 5638                 }
 5639                 PG_A = pmap_accessed_bit(pmap);
 5640                 va = pv->pv_va;
 5641                 pde = pmap_pde(pmap, pv->pv_va);
 5642                 oldpde = *pde;
 5643                 if ((*pde & PG_A) != 0) {
 5644                         /*
 5645                          * Since this reference bit is shared by 512 4KB
 5646                          * pages, it should not be cleared every time it is
 5647                          * tested.  Apply a simple "hash" function on the
 5648                          * physical page number, the virtual superpage number,
 5649                          * and the pmap address to select one 4KB page out of
 5650                          * the 512 on which testing the reference bit will
 5651                          * result in clearing that reference bit.  This
 5652                          * function is designed to avoid the selection of the
 5653                          * same 4KB page for every 2MB page mapping.
 5654                          *
 5655                          * On demotion, a mapping that hasn't been referenced
 5656                          * is simply destroyed.  To avoid the possibility of a
 5657                          * subsequent page fault on a demoted wired mapping,
 5658                          * always leave its reference bit set.  Moreover,
 5659                          * since the superpage is wired, the current state of
 5660                          * its reference bit won't affect page replacement.
 5661                          */
 5662                         if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
 5663                             (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
 5664                             (*pde & PG_W) == 0) {
 5665                                 if (safe_to_clear_referenced(pmap, oldpde)) {
 5666                                         atomic_clear_long(pde, PG_A);
 5667                                         pmap_invalidate_page(pmap, pv->pv_va);
 5668                                         demoted = FALSE;
 5669                                 } else if (pmap_demote_pde_locked(pmap, pde,
 5670                                     pv->pv_va, &lock)) {
 5671                                         /*
 5672                                          * Remove the mapping to a single page
 5673                                          * so that a subsequent access may
 5674                                          * repromote.  Since the underlying
 5675                                          * page table page is fully populated,
 5676                                          * this removal never frees a page
 5677                                          * table page.
 5678                                          */
 5679                                         demoted = TRUE;
                                              /*
                                               * Advance "va" from the start of
                                               * the superpage to the 4KB page
                                               * that maps "m".
                                               */
 5680                                         va += VM_PAGE_TO_PHYS(m) - (oldpde &
 5681                                             PG_PS_FRAME);
 5682                                         pte = pmap_pde_to_pte(pde, va);
 5683                                         pmap_remove_pte(pmap, pte, va, *pde,
 5684                                             NULL, &lock);
 5685                                         pmap_invalidate_page(pmap, va);
 5686                                 } else
 5687                                         demoted = TRUE;
 5688 
 5689                                 if (demoted) {
 5690                                         /*
 5691                                          * The superpage mapping was removed
 5692                                          * entirely and therefore 'pv' is no
 5693                                          * longer valid.
 5694                                          */
 5695                                         if (pvf == pv)
 5696                                                 pvf = NULL;
 5697                                         pv = NULL;
 5698                                 }
 5699                                 cleared++;
 5700                                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 5701                                     ("inconsistent pv lock %p %p for page %p",
 5702                                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 5703                         } else
 5704                                 not_cleared++;
 5705                 }
 5706                 PMAP_UNLOCK(pmap);
 5707                 /* Rotate the PV list if it has more than one entry. */
 5708                 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
 5709                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 5710                         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 5711                         pvh->pv_gen++;
 5712                 }
 5713                 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
 5714                         goto out;
 5715         } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
 5716 small_mappings:
              /*
               * Same rotating scan, now over the 4KB mappings on the page's
               * own pv list.
               */
 5717         if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
 5718                 goto out;
 5719         pv = pvf;
 5720         do {
 5721                 if (pvf == NULL)
 5722                         pvf = pv;
 5723                 pmap = PV_PMAP(pv);
 5724                 if (!PMAP_TRYLOCK(pmap)) {
 5725                         pvh_gen = pvh->pv_gen;
 5726                         md_gen = m->md.pv_gen;
 5727                         rw_wunlock(lock);
 5728                         PMAP_LOCK(pmap);
 5729                         rw_wlock(lock);
 5730                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 5731                                 PMAP_UNLOCK(pmap);
 5732                                 goto retry;
 5733                         }
 5734                 }
 5735                 PG_A = pmap_accessed_bit(pmap);
 5736                 pde = pmap_pde(pmap, pv->pv_va);
 5737                 KASSERT((*pde & PG_PS) == 0,
 5738                     ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
 5739                     m));
 5740                 pte = pmap_pde_to_pte(pde, pv->pv_va);
 5741                 if ((*pte & PG_A) != 0) {
 5742                         if (safe_to_clear_referenced(pmap, *pte)) {
 5743                                 atomic_clear_long(pte, PG_A);
 5744                                 pmap_invalidate_page(pmap, pv->pv_va);
 5745                                 cleared++;
 5746                         } else if ((*pte & PG_W) == 0) {
 5747                                 /*
 5748                                  * Wired pages cannot be paged out so
 5749                                  * doing accessed bit emulation for
 5750                                  * them is wasted effort. We do the
 5751                                  * hard work for unwired pages only.
 5752                                  */
 5753                                 pmap_remove_pte(pmap, pte, pv->pv_va,
 5754                                     *pde, &free, &lock);
 5755                                 pmap_invalidate_page(pmap, pv->pv_va);
 5756                                 cleared++;
                                      /*
                                       * The mapping is gone, so forget "pv"
                                       * the same way the superpage loop does.
                                       */
 5757                                 if (pvf == pv)
 5758                                         pvf = NULL;
 5759                                 pv = NULL;
 5760                                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 5761                                     ("inconsistent pv lock %p %p for page %p",
 5762                                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 5763                         } else
 5764                                 not_cleared++;
 5765                 }
 5766                 PMAP_UNLOCK(pmap);
 5767                 /* Rotate the PV list if it has more than one entry. */
 5768                 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
 5769                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 5770                         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 5771                         m->md.pv_gen++;
 5772                 }
 5773         } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
 5774             not_cleared < PMAP_TS_REFERENCED_MAX);
 5775 out:
 5776         rw_wunlock(lock);
 5777         rw_runlock(&pvh_global_lock);
              /*
               * Pages collected on "free" by pmap_remove_pte() are released
               * only after the locks have been dropped.
               */
 5778         pmap_free_zero_pages(&free);
 5779         return (cleared + not_cleared);
 5780 }
 5781 
 5782 /*
 5783  *      Apply the given advice to the specified range of addresses within the
 5784  *      given pmap.  Depending on the advice, clear the referenced and/or
 5785  *      modified flags in each mapping and set the mapped page's dirty field.
       *
       *      Only MADV_DONTNEED and MADV_FREE are acted upon; any other
       *      advice, and any pmap that emulates the accessed/dirty bits,
       *      makes this function return without doing anything.
 5786  */
 5787 void
 5788 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
 5789 {
 5790         struct rwlock *lock;
 5791         pml4_entry_t *pml4e;
 5792         pdp_entry_t *pdpe;
 5793         pd_entry_t oldpde, *pde;
 5794         pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
 5795         vm_offset_t va_next;
 5796         vm_page_t m;
 5797         boolean_t anychanged, pv_lists_locked;
 5798 
 5799         if (advice != MADV_DONTNEED && advice != MADV_FREE)
 5800                 return;
 5801 
 5802         /*
 5803          * A/D bit emulation requires an alternate code path when clearing
 5804          * the modified and accessed bits below. Since this function is
 5805          * advisory in nature we skip it entirely for pmaps that require
 5806          * A/D bit emulation.
 5807          */
 5808         if (pmap_emulate_ad_bits(pmap))
 5809                 return;
 5810 
 5811         PG_A = pmap_accessed_bit(pmap);
 5812         PG_G = pmap_global_bit(pmap);
 5813         PG_M = pmap_modified_bit(pmap);
 5814         PG_V = pmap_valid_bit(pmap);
 5815         PG_RW = pmap_rw_bit(pmap);
 5816 
 5817         pv_lists_locked = FALSE;
 5818 resume:
              /*
               * Control returns here after the pmap lock had to be dropped in
               * order to block on pvh_global_lock.  "sva" still holds the
               * address that was being processed, so the walk continues from
               * there.
               */
 5819         anychanged = FALSE;
 5820         PMAP_LOCK(pmap);
 5821         for (; sva < eva; sva = va_next) {
 5822                 pml4e = pmap_pml4e(pmap, sva);
 5823                 if ((*pml4e & PG_V) == 0) {
                              /*
                               * Skip to the next PML4 boundary.  If that
                               * computation wraps around, stop at eva.
                               */
 5824                         va_next = (sva + NBPML4) & ~PML4MASK;
 5825                         if (va_next < sva)
 5826                                 va_next = eva;
 5827                         continue;
 5828                 }
 5829                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 5830                 if ((*pdpe & PG_V) == 0) {
 5831                         va_next = (sva + NBPDP) & ~PDPMASK;
 5832                         if (va_next < sva)
 5833                                 va_next = eva;
 5834                         continue;
 5835                 }
 5836                 va_next = (sva + NBPDR) & ~PDRMASK;
 5837                 if (va_next < sva)
 5838                         va_next = eva;
 5839                 pde = pmap_pdpe_to_pde(pdpe, sva);
 5840                 oldpde = *pde;
 5841                 if ((oldpde & PG_V) == 0)
 5842                         continue;
 5843                 else if ((oldpde & PG_PS) != 0) {
 5844                         if ((oldpde & PG_MANAGED) == 0)
 5845                                 continue;
 5846                         if (!pv_lists_locked) {
                                      /*
                                       * pvh_global_lock is acquired lazily,
                                       * on the first managed large page.  The
                                       * flag is set before the attempt, so it
                                       * is already TRUE when the blocking path
                                       * below jumps back to "resume".
                                       */
 5847                                 pv_lists_locked = TRUE;
 5848                                 if (!rw_try_rlock(&pvh_global_lock)) {
                                              /*
                                               * Flush pending invalidations
                                               * before the pmap lock is
                                               * released.
                                               */
 5849                                         if (anychanged)
 5850                                                 pmap_invalidate_all(pmap);
 5851                                         PMAP_UNLOCK(pmap);
 5852                                         rw_rlock(&pvh_global_lock);
 5853                                         goto resume;
 5854                                 }
 5855                         }
 5856                         lock = NULL;
 5857                         if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
 5858                                 if (lock != NULL)
 5859                                         rw_wunlock(lock);
 5860 
 5861                                 /*
 5862                                  * The large page mapping was destroyed.
 5863                                  */
 5864                                 continue;
 5865                         }
 5866 
 5867                         /*
 5868                          * Unless the page mappings are wired, remove the
 5869                          * mapping to a single page so that a subsequent
 5870                          * access may repromote.  Since the underlying page
 5871                          * table page is fully populated, this removal never
 5872                          * frees a page table page.
 5873                          */
 5874                         if ((oldpde & PG_W) == 0) {
 5875                                 pte = pmap_pde_to_pte(pde, sva);
 5876                                 KASSERT((*pte & PG_V) != 0,
 5877                                     ("pmap_advise: invalid PTE"));
 5878                                 pmap_remove_pte(pmap, pte, sva, *pde, NULL,
 5879                                     &lock);
 5880                                 anychanged = TRUE;
 5881                         }
 5882                         if (lock != NULL)
 5883                                 rw_wunlock(lock);
 5884                 }
                      /* Do not walk past the end of the requested range. */
 5885                 if (va_next > eva)
 5886                         va_next = eva;
 5887                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 5888                     sva += PAGE_SIZE) {
                              /* Only valid, managed mappings are touched. */
 5889                         if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED |
 5890                             PG_V))
 5891                                 continue;
 5892                         else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 5893                                 if (advice == MADV_DONTNEED) {
 5894                                         /*
 5895                                          * Future calls to pmap_is_modified()
 5896                                          * can be avoided by making the page
 5897                                          * dirty now.
 5898                                          */
 5899                                         m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
 5900                                         vm_page_dirty(m);
 5901                                 }
 5902                                 atomic_clear_long(pte, PG_M | PG_A);
 5903                         } else if ((*pte & PG_A) != 0)
 5904                                 atomic_clear_long(pte, PG_A);
 5905                         else
 5906                                 continue;
                              /*
                               * An entry was changed.  Global mappings are
                               * invalidated one page at a time; all others
                               * are covered by the single
                               * pmap_invalidate_all() at the end.
                               */
 5907                         if ((*pte & PG_G) != 0)
 5908                                 pmap_invalidate_page(pmap, sva);
 5909                         else
 5910                                 anychanged = TRUE;
 5911                 }
 5912         }
 5913         if (anychanged)
 5914                 pmap_invalidate_all(pmap);
 5915         if (pv_lists_locked)
 5916                 rw_runlock(&pvh_global_lock);
 5917         PMAP_UNLOCK(pmap);
 5918 }
 5919 
 5920 /*
 5921  *      Clear the modify bits on the specified physical page.
 5922  */
 5923 void
 5924 pmap_clear_modify(vm_page_t m)
 5925 {
 5926         struct md_page *pvh;
 5927         pmap_t pmap;
 5928         pv_entry_t next_pv, pv;
 5929         pd_entry_t oldpde, *pde;
 5930         pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V;
 5931         struct rwlock *lock;
 5932         vm_offset_t va;
 5933         int md_gen, pvh_gen;
 5934 
 5935         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 5936             ("pmap_clear_modify: page %p is not managed", m));
 5937         VM_OBJECT_ASSERT_WLOCKED(m->object);
 5938         KASSERT(!vm_page_xbusied(m),