The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/amd64/amd64/pmap.c

Version: -  FREEBSD  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-2  -  FREEBSD-11-1  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-4  -  FREEBSD-10-3  -  FREEBSD-10-2  -  FREEBSD-10-1  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-3  -  FREEBSD-9-2  -  FREEBSD-9-1  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-4  -  FREEBSD-8-3  -  FREEBSD-8-2  -  FREEBSD-8-1  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-4  -  FREEBSD-7-3  -  FREEBSD-7-2  -  FREEBSD-7-1  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-4  -  FREEBSD-6-3  -  FREEBSD-6-2  -  FREEBSD-6-1  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-5  -  FREEBSD-5-4  -  FREEBSD-5-3  -  FREEBSD-5-2  -  FREEBSD-5-1  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD5  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  xnu-1456.1.26  -  xnu-1699.24.8  -  xnu-2050.18.24  -  OPENSOLARIS  -  minix-3-1-1 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 1991 Regents of the University of California.
    3  * All rights reserved.
    4  * Copyright (c) 1994 John S. Dyson
    5  * All rights reserved.
    6  * Copyright (c) 1994 David Greenman
    7  * All rights reserved.
    8  * Copyright (c) 2003 Peter Wemm
    9  * All rights reserved.
   10  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
   11  * All rights reserved.
   12  *
   13  * This code is derived from software contributed to Berkeley by
   14  * the Systems Programming Group of the University of Utah Computer
   15  * Science Department and William Jolitz of UUNET Technologies Inc.
   16  *
   17  * Redistribution and use in source and binary forms, with or without
   18  * modification, are permitted provided that the following conditions
   19  * are met:
   20  * 1. Redistributions of source code must retain the above copyright
   21  *    notice, this list of conditions and the following disclaimer.
   22  * 2. Redistributions in binary form must reproduce the above copyright
   23  *    notice, this list of conditions and the following disclaimer in the
   24  *    documentation and/or other materials provided with the distribution.
   25  * 3. All advertising materials mentioning features or use of this software
   26  *    must display the following acknowledgement:
   27  *      This product includes software developed by the University of
   28  *      California, Berkeley and its contributors.
   29  * 4. Neither the name of the University nor the names of its contributors
   30  *    may be used to endorse or promote products derived from this software
   31  *    without specific prior written permission.
   32  *
   33  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   34  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   35  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   36  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   37  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   38  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   39  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   40  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   41  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   42  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   43  * SUCH DAMAGE.
   44  *
   45  *      from:   @(#)pmap.c      7.7 (Berkeley)  5/12/91
   46  */
   47 /*-
   48  * Copyright (c) 2003 Networks Associates Technology, Inc.
   49  * All rights reserved.
   50  *
   51  * This software was developed for the FreeBSD Project by Jake Burkholder,
   52  * Safeport Network Services, and Network Associates Laboratories, the
   53  * Security Research Division of Network Associates, Inc. under
   54  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
   55  * CHATS research program.
   56  *
   57  * Redistribution and use in source and binary forms, with or without
   58  * modification, are permitted provided that the following conditions
   59  * are met:
   60  * 1. Redistributions of source code must retain the above copyright
   61  *    notice, this list of conditions and the following disclaimer.
   62  * 2. Redistributions in binary form must reproduce the above copyright
   63  *    notice, this list of conditions and the following disclaimer in the
   64  *    documentation and/or other materials provided with the distribution.
   65  *
   66  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   67  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   68  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   69  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   70  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   71  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   72  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   73  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   74  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   75  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   76  * SUCH DAMAGE.
   77  */
   78 
   79 #define AMD64_NPT_AWARE
   80 
   81 #include <sys/cdefs.h>
   82 __FBSDID("$FreeBSD: releng/10.1/sys/amd64/amd64/pmap.c 273832 2014-10-29 16:49:28Z neel $");
   83 
   84 /*
   85  *      Manages physical address maps.
   86  *
   87  *      Since the information managed by this module is
   88  *      also stored by the logical address mapping module,
   89  *      this module may throw away valid virtual-to-physical
   90  *      mappings at almost any time.  However, invalidations
   91  *      of virtual-to-physical mappings must be done as
   92  *      requested.
   93  *
   94  *      In order to cope with hardware architectures which
   95  *      make virtual-to-physical map invalidates expensive,
   96  *      this module may delay invalidate or reduced protection
   97  *      operations until such time as they are actually
   98  *      necessary.  This module is given full information as
   99  *      to which processors are currently using which maps,
  100  *      and to when physical maps must be made correct.
  101  */
  102 
  103 #include "opt_pmap.h"
  104 #include "opt_vm.h"
  105 
  106 #include <sys/param.h>
  107 #include <sys/bus.h>
  108 #include <sys/systm.h>
  109 #include <sys/kernel.h>
  110 #include <sys/ktr.h>
  111 #include <sys/lock.h>
  112 #include <sys/malloc.h>
  113 #include <sys/mman.h>
  114 #include <sys/mutex.h>
  115 #include <sys/proc.h>
  116 #include <sys/rwlock.h>
  117 #include <sys/sx.h>
  118 #include <sys/vmmeter.h>
  119 #include <sys/sched.h>
  120 #include <sys/sysctl.h>
  121 #include <sys/_unrhdr.h>
  122 #include <sys/smp.h>
  123 
  124 #include <vm/vm.h>
  125 #include <vm/vm_param.h>
  126 #include <vm/vm_kern.h>
  127 #include <vm/vm_page.h>
  128 #include <vm/vm_map.h>
  129 #include <vm/vm_object.h>
  130 #include <vm/vm_extern.h>
  131 #include <vm/vm_pageout.h>
  132 #include <vm/vm_pager.h>
  133 #include <vm/vm_radix.h>
  134 #include <vm/vm_reserv.h>
  135 #include <vm/uma.h>
  136 
  137 #include <machine/intr_machdep.h>
  138 #include <machine/apicvar.h>
  139 #include <machine/cpu.h>
  140 #include <machine/cputypes.h>
  141 #include <machine/md_var.h>
  142 #include <machine/pcb.h>
  143 #include <machine/specialreg.h>
  144 #ifdef SMP
  145 #include <machine/smp.h>
  146 #endif
  147 
  148 static __inline boolean_t
  149 pmap_emulate_ad_bits(pmap_t pmap)
  150 {
  151         /* Non-zero iff PMAP_EMULATE_AD_BITS is set in the pmap's pm_flags. */
  152         return (!!(pmap->pm_flags & PMAP_EMULATE_AD_BITS));
  153 }
  154 
  155 static __inline pt_entry_t
  156 pmap_valid_bit(pmap_t pmap)
  157 {
  158         pt_entry_t bit;
  159 
  160         /*
  161          * PT_X86 tables use X86_PG_V; PT_EPT tables use EPT_PG_READ, or
  162          * EPT_PG_EMUL_V when pmap_emulate_ad_bits() is true.
  163          */
  164         if (pmap->pm_type == PT_X86) {
  165                 bit = X86_PG_V;
  166         } else if (pmap->pm_type == PT_EPT) {
  167                 bit = pmap_emulate_ad_bits(pmap) ? EPT_PG_EMUL_V :
  168                     EPT_PG_READ;
  169         } else {
  170                 /* Any other pm_type is not handled here: panic. */
  171                 panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
  172         }
  173 
  174         return (bit);
  175 }
  176 
  177 static __inline pt_entry_t
  178 pmap_rw_bit(pmap_t pmap)
  179 {
  180         pt_entry_t bit;
  181 
  182         /*
  183          * PT_X86 tables use X86_PG_RW; PT_EPT tables use EPT_PG_WRITE, or
  184          * EPT_PG_EMUL_RW when pmap_emulate_ad_bits() is true.
  185          */
  186         if (pmap->pm_type == PT_X86) {
  187                 bit = X86_PG_RW;
  188         } else if (pmap->pm_type == PT_EPT) {
  189                 bit = pmap_emulate_ad_bits(pmap) ? EPT_PG_EMUL_RW :
  190                     EPT_PG_WRITE;
  191         } else {
  192                 /* Any other pm_type is not handled here: panic. */
  193                 panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
  194         }
  195 
  196         return (bit);
  197 }
  198 
  199 static __inline pt_entry_t
  200 pmap_global_bit(pmap_t pmap)
  201 {
  202         pt_entry_t bit;
  203 
  204         /* PT_X86 tables use X86_PG_G; PT_EPT tables get an empty mask. */
  205         if (pmap->pm_type == PT_X86) {
  206                 bit = X86_PG_G;
  207         } else if (pmap->pm_type == PT_EPT) {
  208                 bit = 0;
  209         } else {
  210                 /* Any other pm_type is not handled here: panic. */
  211                 panic("pmap_global_bit: invalid pm_type %d",
  212                     pmap->pm_type);
  213         }
  214 
  215         return (bit);
  216 }
  217 
  218 static __inline pt_entry_t
  219 pmap_accessed_bit(pmap_t pmap)
  220 {
  221         pt_entry_t bit;
  222 
  223         /*
  224          * PT_X86 tables use X86_PG_A; PT_EPT tables use EPT_PG_A, or
  225          * EPT_PG_READ when pmap_emulate_ad_bits() is true.
  226          */
  227         if (pmap->pm_type == PT_X86) {
  228                 bit = X86_PG_A;
  229         } else if (pmap->pm_type == PT_EPT) {
  230                 bit = pmap_emulate_ad_bits(pmap) ? EPT_PG_READ :
  231                     EPT_PG_A;
  232         } else {
  233                 /* Any other pm_type is not handled here: panic. */
  234                 panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
  235         }
  236 
  237         return (bit);
  238 }
  239 
  240 static __inline pt_entry_t
  241 pmap_modified_bit(pmap_t pmap)
  242 {
  243         pt_entry_t bit;
  244 
  245         /*
  246          * PT_X86 tables use X86_PG_M; PT_EPT tables use EPT_PG_M, or
  247          * EPT_PG_WRITE when pmap_emulate_ad_bits() is true.
  248          */
  249         if (pmap->pm_type == PT_X86) {
  250                 bit = X86_PG_M;
  251         } else if (pmap->pm_type == PT_EPT) {
  252                 bit = pmap_emulate_ad_bits(pmap) ? EPT_PG_WRITE :
  253                     EPT_PG_M;
  254         } else {
  255                 /* Any other pm_type is not handled here: panic. */
  256                 panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
  257         }
  258 
  259         return (bit);
  260 }
  261 /* PMAP_INLINE is an inline specifier unless DIAGNOSTIC is defined. */
  262 #if !defined(DIAGNOSTIC)
  263 #ifdef __GNUC_GNU_INLINE__
  264 #define PMAP_INLINE     __attribute__((__gnu_inline__)) inline
  265 #else
  266 #define PMAP_INLINE     extern inline
  267 #endif
  268 #else
  269 #define PMAP_INLINE
  270 #endif
  271 /* PV_STAT(x) executes x only when PV_STATS is defined. */
  272 #ifdef PV_STATS
  273 #define PV_STAT(x)      do { x ; } while (0)
  274 #else
  275 #define PV_STAT(x)      do { } while (0)
  276 #endif
  277 /* pa_index(): pa >> PDRSHIFT; pa_to_pvh(): the pv_table entry at that index. */
  278 #define pa_index(pa)    ((pa) >> PDRSHIFT)
  279 #define pa_to_pvh(pa)   (&pv_table[pa_index(pa)])
  280 /* Size of pv_list_locks[]; a lock is picked by pa_index() modulo this. */
  281 #define NPV_LIST_LOCKS  MAXCPU
  282 /* Map a physical address to its pv_list_locks[] entry. */
  283 #define PHYS_TO_PV_LIST_LOCK(pa)        \
  284                         (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
  285 /* Make *lockp the write-locked PV list lock for pa, unlocking any other. */
  286 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)  do {    \
  287         struct rwlock **_lockp = (lockp);               \
  288         struct rwlock *_new_lock;                       \
  289                                                         \
  290         _new_lock = PHYS_TO_PV_LIST_LOCK(pa);           \
  291         if (_new_lock != *_lockp) {                     \
  292                 if (*_lockp != NULL)                    \
  293                         rw_wunlock(*_lockp);            \
  294                 *_lockp = _new_lock;                    \
  295                 rw_wlock(*_lockp);                      \
  296         }                                               \
  297 } while (0)
  298 /* Same switch, keyed by the physical address of vm_page m. */
  299 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)        \
  300                         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
  301 /* Write-unlock *lockp if it is set, then reset it to NULL. */
  302 #define RELEASE_PV_LIST_LOCK(lockp)             do {    \
  303         struct rwlock **_lockp = (lockp);               \
  304                                                         \
  305         if (*_lockp != NULL) {                          \
  306                 rw_wunlock(*_lockp);                    \
  307                 *_lockp = NULL;                         \
  308         }                                               \
  309 } while (0)
  310 /* The pv_list_locks[] entry for the physical address of vm_page m. */
  311 #define VM_PAGE_TO_PV_LIST_LOCK(m)      \
  312                         PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
  313 
  314 struct pmap kernel_pmap_store;
  315 
  316 vm_offset_t virtual_avail;      /* VA of first avail page (after kernel bss) */
  317 vm_offset_t virtual_end;        /* VA of last avail page (end of kernel AS) */
  318 
  319 int nkpt;
  320 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
  321     "Number of kernel page table pages allocated on bootup");
  322 /* ndmpdp and dmaplimit are set by create_pagetables(). */
  323 static int ndmpdp;
  324 vm_paddr_t dmaplimit;
  325 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
  326 pt_entry_t pg_nx;
  327 
  328 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
  329 
  330 static int pat_works = 1;
  331 SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
  332     "Is page attribute table fully functional?");
  333 
  334 static int pg_ps_enabled = 1;
  335 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
  336     "Are large page mappings enabled?");
  337 
  338 #define PAT_INDEX_SIZE  8
  339 static int pat_index[PAT_INDEX_SIZE];   /* cache mode to PAT index conversion */
  340 
  341 static u_int64_t        KPTphys;        /* phys addr of kernel level 1 */
  342 static u_int64_t        KPDphys;        /* phys addr of kernel level 2 */
  343 u_int64_t               KPDPphys;       /* phys addr of kernel level 3 */
  344 u_int64_t               KPML4phys;      /* phys addr of kernel level 4 */
  345 
  346 static u_int64_t        DMPDphys;       /* phys addr of direct mapped level 2 */
  347 static u_int64_t        DMPDPphys;      /* phys addr of direct mapped level 3 */
  348 static int              ndmpdpphys;     /* number of DMPDPphys pages */
  349 
  350 static struct rwlock_padalign pvh_global_lock;
  351 
  352 /*
  353  * Data for the pv entry allocation mechanism
  354  */
  355 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
  356 static struct mtx pv_chunks_mutex;
  357 static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
  358 static struct md_page *pv_table;
  359 
  360 /*
  361  * All those kernel PT submaps that BSD is so fond of
  362  */
  363 pt_entry_t *CMAP1 = 0;
  364 caddr_t CADDR1 = 0;
  365 
  366 static int pmap_flags = PMAP_PDE_SUPERPAGE;     /* flags for x86 pmaps */
  367 /* PCID (TLB context ID) state, exported read-only under vm.pmap. */
  368 static struct unrhdr pcid_unr;
  369 static struct mtx pcid_mtx;
  370 int pmap_pcid_enabled = 0;
  371 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN, &pmap_pcid_enabled,
  372     0, "Is TLB Context ID enabled ?");
  373 int invpcid_works = 0;
  374 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
  375     "Is the invpcid instruction available ?");
  376 
  377 static int
  378 pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
  379 {
  380         uint64_t total = 0;
  381         int cpu;
  382 
  383         /* Sum pc_pm_save_cnt over every CPU visited by CPU_FOREACH(). */
  384         CPU_FOREACH(cpu)
  385                 total += cpuid_to_pcpu[cpu]->pc_pm_save_cnt;
  386 
  387         return (sysctl_handle_64(oidp, &total, 0, req));
  388 }
  389 SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
  390     CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
  391     "Count of saved TLB context on switch");
  392 
  393 /* Lock and addresses for pmap_copy_pages() over non-DMAP pages */
  394 static struct mtx cpage_lock;
  395 static vm_offset_t cpage_a;
  396 static vm_offset_t cpage_b;
  397 
  398 /*
  399  * Crashdump maps.
  400  */
  401 static caddr_t crashdumpmap;
  402 /* Forward declarations for this file's static functions. */
  403 static void     free_pv_chunk(struct pv_chunk *pc);
  404 static void     free_pv_entry(pmap_t pmap, pv_entry_t pv);
  405 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
  406 static int      popcnt_pc_map_elem(uint64_t elem);
  407 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
  408 static void     reserve_pv_entries(pmap_t pmap, int needed,
  409                     struct rwlock **lockp);
  410 static void     pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
  411                     struct rwlock **lockp);
  412 static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
  413                     struct rwlock **lockp);
  414 static void     pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
  415                     struct rwlock **lockp);
  416 static void     pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
  417 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
  418                     vm_offset_t va);
  419 
  420 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
  421 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
  422 static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
  423     vm_offset_t va, struct rwlock **lockp);
  424 static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
  425     vm_offset_t va);
  426 static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
  427     vm_prot_t prot, struct rwlock **lockp);
  428 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
  429     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
  430 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
  431 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
  432 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
  433 static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
  434 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
  435 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
  436     struct rwlock **lockp);
  437 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
  438     vm_prot_t prot);
  439 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
  440 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
  441     struct spglist *free, struct rwlock **lockp);
  442 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
  443     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
  444 static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
  445 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
  446     struct spglist *free);
  447 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
  448     vm_page_t m, struct rwlock **lockp);
  449 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
  450     pd_entry_t newpde);
  451 static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
  452 
  453 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
  454                 struct rwlock **lockp);
  455 static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
  456                 struct rwlock **lockp);
  457 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
  458                 struct rwlock **lockp);
  459 
  460 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
  461     struct spglist *free);
  462 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
  463 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
  464 
  465 /*
  466  * Round a kernel virtual address up to the next 2MB (NBPDR)
  467  * boundary; an already aligned address is returned unchanged.
  468  * This is used to help improve performance by using a large
  469  * (2MB) page for much of the kernel (.text, .data, .bss).
  470  */
  471 static vm_offset_t
  472 pmap_kmem_choose(vm_offset_t addr)
  473 {
  474         const vm_offset_t pdrmask = NBPDR - 1;
  475 
  476         /* Add the mask, then clear the low bits. */
  477         return ((addr + pdrmask) & ~pdrmask);
  478 }
  479 
  480 /********************/
  481 /* Inline functions */
  482 /********************/
  483 
  484 /* Return the non-clipped PD index of a VA, i.e. va >> PDRSHIFT */
  485 static __inline vm_pindex_t
  486 pmap_pde_pindex(vm_offset_t va)
  487 {
  488         return ((vm_pindex_t)(va >> PDRSHIFT));
  489 }
  490 
  491 
  492 /* Clipped (per-table) indexes for a given VA; this one is the PT index */
  493 static __inline vm_pindex_t
  494 pmap_pte_index(vm_offset_t va)
  495 {
  496         /* Keep the low NPTEPGSHIFT bits of va >> PAGE_SHIFT. */
  497         return ((va >> PAGE_SHIFT) % (1ul << NPTEPGSHIFT));
  498 }
  499 
  500 static __inline vm_pindex_t
  501 pmap_pde_index(vm_offset_t va)
  502 {
  503         /* Keep the low NPDEPGSHIFT bits of va >> PDRSHIFT. */
  504         return ((va >> PDRSHIFT) % (1ul << NPDEPGSHIFT));
  505 }
  506 
  507 static __inline vm_pindex_t
  508 pmap_pdpe_index(vm_offset_t va)
  509 {
  510         /* Keep the low NPDPEPGSHIFT bits of va >> PDPSHIFT. */
  511         return ((va >> PDPSHIFT) % (1ul << NPDPEPGSHIFT));
  512 }
  513 
  514 static __inline vm_pindex_t
  515 pmap_pml4e_index(vm_offset_t va)
  516 {
  517         /* Keep the low NPML4EPGSHIFT bits of va >> PML4SHIFT. */
  518         return ((va >> PML4SHIFT) % (1ul << NPML4EPGSHIFT));
  519 }
  520 
  521 /* Return a pointer to the PML4 slot that corresponds to a VA */
  522 static __inline pml4_entry_t *
  523 pmap_pml4e(pmap_t pmap, vm_offset_t va)
  524 {
  525         /* Offset into the pmap's pm_pml4 array by the VA's PML4 index. */
  526         return (pmap->pm_pml4 + pmap_pml4e_index(va));
  527 }
  528 
  529 /* Return the PDP slot for a VA, given the PML4 entry that covers it */
  530 static __inline pdp_entry_t *
  531 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
  532 {
  533         vm_paddr_t pdp_pa;
  534 
  535         pdp_pa = *pml4e & PG_FRAME;     /* frame referenced by the entry */
  536         return ((pdp_entry_t *)PHYS_TO_DMAP(pdp_pa) + pmap_pdpe_index(va));
  537 }
  538 
  539 /* Return the PDP slot for a VA, or NULL if its PML4 entry is invalid */
  540 static __inline pdp_entry_t *
  541 pmap_pdpe(pmap_t pmap, vm_offset_t va)
  542 {
  543         pt_entry_t valid = pmap_valid_bit(pmap);
  544         pml4_entry_t *pml4e = pmap_pml4e(pmap, va);
  545 
  546         if ((*pml4e & valid) != 0)
  547                 return (pmap_pml4e_to_pdpe(pml4e, va));
  548 
  549         /* The PML4 entry is not valid, so there is no PDP slot. */
  550         return (NULL);
  551 }
  552 
  553 /* Return the PD slot for a VA, given the PDP entry that covers it */
  554 static __inline pd_entry_t *
  555 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
  556 {
  557         vm_paddr_t pd_pa;
  558 
  559         pd_pa = *pdpe & PG_FRAME;       /* frame referenced by the entry */
  560         return ((pd_entry_t *)PHYS_TO_DMAP(pd_pa) + pmap_pde_index(va));
  561 }
  562 
  563 /* Return the PD slot for a VA, or NULL if an upper level is invalid */
  564 static __inline pd_entry_t *
  565 pmap_pde(pmap_t pmap, vm_offset_t va)
  566 {
  567         pt_entry_t valid = pmap_valid_bit(pmap);
  568         pdp_entry_t *pdpe = pmap_pdpe(pmap, va);
  569 
  570         /* Descend only through a non-NULL, valid PDP entry. */
  571         if (pdpe != NULL && (*pdpe & valid) != 0)
  572                 return (pmap_pdpe_to_pde(pdpe, va));
  573 
  574         return (NULL);
  575 }
  576 
  577 /* Return the PT slot for a VA, given the PD entry that covers it */
  578 static __inline pt_entry_t *
  579 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
  580 {
  581         vm_paddr_t pt_pa;
  582 
  583         pt_pa = *pde & PG_FRAME;        /* frame referenced by the entry */
  584         return ((pt_entry_t *)PHYS_TO_DMAP(pt_pa) + pmap_pte_index(va));
  585 }
  586 
  587 /* Return the PT slot for a VA; for a PG_PS mapping, the PDE itself */
  588 static __inline pt_entry_t *
  589 pmap_pte(pmap_t pmap, vm_offset_t va)
  590 {
  591         pt_entry_t valid = pmap_valid_bit(pmap);
  592         pd_entry_t *pde = pmap_pde(pmap, va);
  593 
  594         if (pde == NULL)
  595                 return (NULL);
  596         if ((*pde & valid) == 0)
  597                 return (NULL);
  598         /* compat with i386 pmap_pte(): a PG_PS entry is returned as-is */
  599         return ((*pde & PG_PS) != 0 ? (pt_entry_t *)pde :
  600             pmap_pde_to_pte(pde, va));
  601 }
  602 
  603 static __inline void
  604 pmap_resident_count_inc(pmap_t pmap, int count)
  605 {
  606         /* Asserts that the pmap lock is owned before updating the count. */
  607         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  608         pmap->pm_stats.resident_count = pmap->pm_stats.resident_count + count;
  609 }
  610 
  611 static __inline void
  612 pmap_resident_count_dec(pmap_t pmap, int count)
  613 {
  614         /* Asserts the pmap lock is owned and the count will not underflow. */
  615         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  616         KASSERT(!(pmap->pm_stats.resident_count < count),
  617             ("pmap %p resident count underflow %ld %d", pmap,
  618             pmap->pm_stats.resident_count, count));
  619         pmap->pm_stats.resident_count -= count;
  620 }
  621 
  622 PMAP_INLINE pt_entry_t *
  623 vtopte(vm_offset_t va)
  624 {
  625         const int idxbits = NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
  626             NPML4EPGSHIFT;      /* index bits of all four table levels */
  627 
  628         KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
  629         return (&PTmap[(va >> PAGE_SHIFT) & ((1ul << idxbits) - 1)]);
  630 }
  631 
  632 static __inline pd_entry_t *
  633 vtopde(vm_offset_t va)
  634 {
  635         const int idxbits = NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT;
  636 
  637         /* Addresses below VM_MAXUSER_ADDRESS are rejected by the assert. */
  638         KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
  639         return (&PDmap[(va >> PDRSHIFT) & ((1ul << idxbits) - 1)]);
  640 }
  641 
  642 static u_int64_t
  643 allocpages(vm_paddr_t *firstaddr, int n)
  644 {
  645         u_int64_t base = *firstaddr;
  646 
  647         /* Zero the n pages at the cursor, then advance the cursor past them. */
  648         bzero((void *)base, n * PAGE_SIZE);
  649         *firstaddr += n * PAGE_SIZE;
  650         return (base);
  651 }
  652 /* Compile-time check that NDMPML4E is a power of 2. */
  653 CTASSERT(powerof2(NDMPML4E));
  654 
  655 /* number of kernel PDP slots needed for ptpgs page table pages */
  656 #define NKPDPE(ptpgs)           howmany((ptpgs), NPDEPG)
  657 
  658 static void
  659 nkpt_init(vm_paddr_t addr)
  660 {
  661 #ifdef NKPT
  662         /* A compile-time NKPT overrides the computed estimate. */
  663         nkpt = NKPT;
  664 #else
  665         int ptpages;
  666 
  667         /* Page table pages covering [0, addr), plus NKPDPE() of them more. */
  668         ptpages = howmany(addr, 1 << PDRSHIFT);
  669         ptpages += NKPDPE(ptpages);
  670 
  671         /*
  672          * Pad the bootstrap minimum with some slop.  Kernel modules
  673          * must be linked in the negative 2GB of the address space,
  674          * so KVA allocated for them comes from this region.  Were it
  675          * to run out, pmap_growkernel() would have to allocate page
  676          * table pages to map the entire 512GB of KVA space, an
  677          * unnecessary tax on physical memory.
  678          */
  679         ptpages += 8;           /* 16MB additional slop for kernel modules */
  680 
  681         nkpt = ptpages;
  682 #endif
  683 }
  684 /* Build the boot-time page tables from pages taken at *firstaddr. */
  685 static void
  686 create_pagetables(vm_paddr_t *firstaddr)
  687 {
  688         int i, j, ndm1g, nkpdpe;
  689         pt_entry_t *pt_p;
  690         pd_entry_t *pd_p;
  691         pdp_entry_t *pdp_p;
  692         pml4_entry_t *p4_p;
  693 
  694         /* Allocate page table pages for the direct map */
  695         ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
  696         if (ndmpdp < 4)         /* Minimum 4GB of dirmap */
  697                 ndmpdp = 4;
  698         ndmpdpphys = howmany(ndmpdp, NPDPEPG);
  699         if (ndmpdpphys > NDMPML4E) {
  700                 /*
  701                  * Each NDMPML4E allows 512 GB, so limit to that,
  702                  * and then readjust ndmpdp and ndmpdpphys.
  703                  */
  704                 printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
  705                 Maxmem = atop(NDMPML4E * NBPML4);
  706                 ndmpdpphys = NDMPML4E;
  707                 ndmpdp = NDMPML4E * NPDEPG;
  708         }
  709         DMPDPphys = allocpages(firstaddr, ndmpdpphys);
  710         ndm1g = 0;
  711         if ((amd_feature & AMDID_PAGE1GB) != 0)
  712                 ndm1g = ptoa(Maxmem) >> PDPSHIFT;
  713         if (ndm1g < ndmpdp)
  714                 DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
  715         dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
  716 
  717         /* Allocate pages */
  718         KPML4phys = allocpages(firstaddr, 1);
  719         KPDPphys = allocpages(firstaddr, NKPML4E);
  720 
  721         /*
  722          * Allocate the initial number of kernel page table pages required to
  723          * bootstrap.  We defer this until after all memory-size dependent
  724          * allocations are done (e.g. direct map), so that we don't have to
  725          * build in too much slop in our estimate.
  726          *
  727          * Note that when NKPML4E > 1, we have an empty page underneath
  728          * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
  729          * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
  730          */
  731         nkpt_init(*firstaddr);
  732         nkpdpe = NKPDPE(nkpt);
  733 
  734         KPTphys = allocpages(firstaddr, nkpt);
  735         KPDphys = allocpages(firstaddr, nkpdpe);
  736 
  737         /* Fill in the underlying page table pages */
  738         /* Nominally read-only (but really R/W) from zero to physfree */
  739         /* XXX not fully used, underneath 2M pages */
  740         pt_p = (pt_entry_t *)KPTphys;
  741         for (i = 0; ptoa(i) < *firstaddr; i++)
  742                 pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;
  743 
  744         /* Now map the page tables at their location within PTmap */
  745         pd_p = (pd_entry_t *)KPDphys;
  746         for (i = 0; i < nkpt; i++)
  747                 pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
  748 
  749         /* Map from zero to end of allocations under 2M pages */
  750         /* (replaces some KPTphys entries above; i is widened before shifting) */
  751         for (i = 0; ((vm_paddr_t)i << PDRSHIFT) < *firstaddr; i++)
  752                 pd_p[i] = ((vm_paddr_t)i << PDRSHIFT) | X86_PG_RW |
  753                     X86_PG_V | PG_PS | X86_PG_G;
  754 
  755         /* And connect up the PD to the PDP (leaving room for L4 pages) */
  756         pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
  757         for (i = 0; i < nkpdpe; i++)
  758                 pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
  759                     PG_U;
  760 
  761         /*
  762          * Now, set up the direct map region using 2MB and/or 1GB pages.  If
  763          * the end of physical memory is not aligned to a 1GB page boundary,
  764          * then the residual physical memory is mapped with 2MB pages.  Later,
  765          * if pmap_mapdev{_attr}() uses the direct map for non-write-back
  766          * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
  767          * that are partially used.
  768          */
  769         pd_p = (pd_entry_t *)DMPDphys;
  770         for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
  771                 pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
  772                 /* Preset PG_M and PG_A because demotion expects it. */
  773                 pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
  774                     X86_PG_M | X86_PG_A;
  775         }
  776         pdp_p = (pdp_entry_t *)DMPDPphys;
  777         for (i = 0; i < ndm1g; i++) {
  778                 pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
  779                 /* Preset PG_M and PG_A because demotion expects it. */
  780                 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
  781                     X86_PG_M | X86_PG_A;
  782         }
  783         for (j = 0; i < ndmpdp; i++, j++) {
  784                 pdp_p[i] = DMPDphys + ptoa(j);
  785                 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U;
  786         }
  787 
  788         /* And recursively map PML4 to itself in order to get PTmap */
  789         p4_p = (pml4_entry_t *)KPML4phys;
  790         p4_p[PML4PML4I] = KPML4phys;
  791         p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U;
  792 
  793         /* Connect the Direct Map slot(s) up to the PML4. */
  794         for (i = 0; i < ndmpdpphys; i++) {
  795                 p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
  796                 p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U;
  797         }
  798 
  799         /* Connect the KVA slots up to the PML4 */
  800         for (i = 0; i < NKPML4E; i++) {
  801                 p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
  802                 p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U;
  803         }
  804 }
  805 
  806 /*
  807  *      Bootstrap the system enough to run with virtual memory.
  808  *
  809  *      On amd64 this is called after mapping has already been enabled
  810  *      and just syncs the pmap module with what has already been done.
  811  *      [We can't call it easily with mapping off since the kernel is not
  812  *      mapped with PA == VA, hence we would have to relocate every address
  813  *      from the linked base (virtual) address "KERNBASE" to the actual
  814  *      (physical) address starting relative to 0]
       *
       *      firstaddr is passed through to create_pagetables(), which
       *      allocates the initial page table pages from it.  The value it
       *      points at afterwards is used to place virtual_avail at
       *      KERNBASE + *firstaddr.
       *
       *      Besides building the page tables this routine loads %cr3 and
       *      %cr4, initializes kernel_pmap and the global pv list lock,
       *      reserves the crashdump mapping window, programs the PAT MSR
       *      and decides whether PCIDs are used.
  815  */
  816 void
  817 pmap_bootstrap(vm_paddr_t *firstaddr)
  818 {
  819         vm_offset_t va;
  820         pt_entry_t *pte;
  821 
  822         /*
  823          * Create an initial set of page tables to run the kernel in.
  824          */
  825         create_pagetables(firstaddr);
  826 
  827         virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
  828         virtual_avail = pmap_kmem_choose(virtual_avail);
  829 
  830         virtual_end = VM_MAX_KERNEL_ADDRESS;
  831 
  832 
  833         /* XXX do %cr0 as well */
              /*
               * CR4_PGE and CR4_PSE are set before %cr3 is pointed at the new
               * PML4.  CR4_SMEP is set afterwards, and only when the CPU
               * reports CPUID_STDEXT_SMEP.
               */
  834         load_cr4(rcr4() | CR4_PGE | CR4_PSE);
  835         load_cr3(KPML4phys);
  836         if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
  837                 load_cr4(rcr4() | CR4_SMEP);
  838 
  839         /*
  840          * Initialize the kernel pmap (which is statically allocated).
               * Its PML4 is reached through the direct map and both CPU sets
               * are filled, i.e. every CPU is recorded.
  841          */
  842         PMAP_LOCK_INIT(kernel_pmap);
  843         kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
  844         kernel_pmap->pm_cr3 = KPML4phys;
  845         CPU_FILL(&kernel_pmap->pm_active);      /* don't allow deactivation */
  846         CPU_FILL(&kernel_pmap->pm_save);        /* always superset of pm_active */
  847         TAILQ_INIT(&kernel_pmap->pm_pvchunk);
  848         kernel_pmap->pm_flags = pmap_flags;
  849 
  850         /*
  851          * Initialize the global pv list lock.
  852          */
  853         rw_init(&pvh_global_lock, "pmap pv global");
  854 
  855         /*
  856          * Reserve some special page table entries/VA space for temporary
  857          * mapping of pages.
               *
               * SYSMAP(c, p, v, n) hands out n pages: v receives the current
               * va cast to type c, p receives the current pte pointer, and
               * both cursors (va, pte) are then advanced by n pages/entries.
  858          */
  859 #define SYSMAP(c, p, v, n)      \
  860         v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
  861 
  862         va = virtual_avail;
  863         pte = vtopte(va);
  864 
  865         /*
  866          * Crashdump maps.  The first page is reused as CMAP1 for the
  867          * memory test.
  868          */
  869         SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
  870         CADDR1 = crashdumpmap;
  871 
              /* The reserved window is no longer available for allocation. */
  872         virtual_avail = va;
  873 
  874         /* Initialize the PAT MSR. */
  875         pmap_init_pat();
  876 
  877         /*
               * Initialize TLB Context Id.  PCIDs are used only when the
               * vm.pmap.pcid_enabled tunable leaves pmap_pcid_enabled non-zero
               * and the CPU reports CPUID2_PCID.  The pcid_unr allocator is
               * set up for the range 1 .. (1 << 12) - 1; the kernel pmap is
               * given PCID 0.
               */
  878         TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
  879         if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
  880                 load_cr4(rcr4() | CR4_PCIDE);
  881                 mtx_init(&pcid_mtx, "pcid", NULL, MTX_DEF);
  882                 init_unrhdr(&pcid_unr, 1, (1 << 12) - 1, &pcid_mtx);
  883                 /* Check for INVPCID support */
  884                 invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID)
  885                     != 0;
  886                 kernel_pmap->pm_pcid = 0;
  887 #ifndef SMP
                      /*
                       * NOTE(review): kernels built without SMP clear the flag
                       * again although CR4_PCIDE was just set above -- confirm
                       * that this is intended.
                       */
  888                 pmap_pcid_enabled = 0;
  889 #endif
  890         } else
  891                 pmap_pcid_enabled = 0;
  892 }
  893 
  894 /*
  895  * Setup the PAT MSR.
       *
       * Builds a local table that maps each caching mode to a PAT index,
       * composes the matching MSR value, writes it to MSR_PAT and publishes
       * the table in pat_index[].  The MSR is written with global pages
       * turned off and the caches disabled; caches and TLBs are flushed
       * before and after the write, and %cr0/%cr4 are restored at the end.
       * Panics if the CPU does not report CPUID_PAT.
  896  */
  897 void
  898 pmap_init_pat(void)
  899 {
  900         int pat_table[PAT_INDEX_SIZE];
  901         uint64_t pat_msr;
  902         u_long cr0, cr4;
  903         int i;
  904 
  905         /* Panic if this CPU doesn't implement PAT. */
  906         if ((cpu_feature & CPUID_PAT) == 0)
  907                 panic("no PAT??");
  908 
  909         /*
               * Set default PAT index table.  Every mode starts out as -1
               * (no index assigned).  Modes that have no slot of their own
               * in the default layout fall back to index 3, which is
               * programmed as PAT_UNCACHEABLE below.
               */
  910         for (i = 0; i < PAT_INDEX_SIZE; i++)
  911                 pat_table[i] = -1;
  912         pat_table[PAT_WRITE_BACK] = 0;
  913         pat_table[PAT_WRITE_THROUGH] = 1;
  914         pat_table[PAT_UNCACHEABLE] = 3;
  915         pat_table[PAT_WRITE_COMBINING] = 3;
  916         pat_table[PAT_WRITE_PROTECTED] = 3;
  917         pat_table[PAT_UNCACHED] = 3;
  918 
  919         /* Initialize default PAT entries (slots 4-7 repeat slots 0-3). */
  920         pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
  921             PAT_VALUE(1, PAT_WRITE_THROUGH) |
  922             PAT_VALUE(2, PAT_UNCACHED) |
  923             PAT_VALUE(3, PAT_UNCACHEABLE) |
  924             PAT_VALUE(4, PAT_WRITE_BACK) |
  925             PAT_VALUE(5, PAT_WRITE_THROUGH) |
  926             PAT_VALUE(6, PAT_UNCACHED) |
  927             PAT_VALUE(7, PAT_UNCACHEABLE);
  928 
  929         if (pat_works) {
  930                 /*
  931                  * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
  932                  * Program 5 and 6 as WP and WC.
  933                  * Leave 4 and 7 as WB and UC.
  934                  */
  935                 pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
  936                 pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
  937                     PAT_VALUE(6, PAT_WRITE_COMBINING);
  938                 pat_table[PAT_UNCACHED] = 2;
  939                 pat_table[PAT_WRITE_PROTECTED] = 5;
  940                 pat_table[PAT_WRITE_COMBINING] = 6;
  941         } else {
  942                 /*
  943                  * Just replace PAT Index 2 with WC instead of UC-.
                       * PAT_UNCACHED and PAT_WRITE_PROTECTED keep the
                       * fallback index 3 assigned above.
  944                  */
  945                 pat_msr &= ~PAT_MASK(2);
  946                 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
  947                 pat_table[PAT_WRITE_COMBINING] = 2;
  948         }
  949 
  950         /* Disable PGE. */
  951         cr4 = rcr4();
  952         load_cr4(cr4 & ~CR4_PGE);
  953 
  954         /* Disable caches (CD = 1, NW = 0). */
  955         cr0 = rcr0();
  956         load_cr0((cr0 & ~CR0_NW) | CR0_CD);
  957 
  958         /* Flush caches and TLBs. */
  959         wbinvd();
  960         invltlb();
  961 
  962         /* Update PAT and index table. */
  963         wrmsr(MSR_PAT, pat_msr);
  964         for (i = 0; i < PAT_INDEX_SIZE; i++)
  965                 pat_index[i] = pat_table[i];
  966 
  967         /* Flush caches and TLBs again. */
  968         wbinvd();
  969         invltlb();
  970 
  971         /* Restore caches and PGE (the values saved in cr0 and cr4). */
  972         load_cr0(cr0);
  973         load_cr4(cr4);
  974 }
  975 
  976 /*
  977  *      Put the machine-dependent part of a vm_page into its initial
       *      state: write-back caching mode and an empty pv list.
  978  */
  979 void
  980 pmap_page_init(vm_page_t m)
  981 {
  982 
  983         m->md.pat_mode = PAT_WRITE_BACK;
  984         TAILQ_INIT(&m->md.pv_list);
  985 }
  986 
  987 /*
  988  *      Initialize the pmap module.
  989  *      Called by vm_init, to initialize any structures that the pmap
  990  *      system needs to map virtual memory.
       *
       *      In order: fixes up the vm_page entries of the kernel page table
       *      pages, decides on the Erratum 383 workaround, enables superpage
       *      sizes if configured, initializes the pv chunk and pv list
       *      locks, allocates the superpage pv head table, and sets up the
       *      cpage lock and its two pages of KVA.
  991  */
  992 void
  993 pmap_init(void)
  994 {
  995         vm_page_t mpte;
  996         vm_size_t s;
  997         int i, pv_npg;
  998 
  999         /*
 1000          * Initialize the vm page array entries for the kernel pmap's
 1001          * page table pages.
               * Each of the nkpt pages starting at KPTphys gets a pindex
               * counted from pmap_pde_pindex(KERNBASE) and its own physical
               * address.
 1002          */ 
 1003         for (i = 0; i < nkpt; i++) {
 1004                 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
 1005                 KASSERT(mpte >= vm_page_array &&
 1006                     mpte < &vm_page_array[vm_page_array_size],
 1007                     ("pmap_init: page table page is out of range"));
 1008                 mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
 1009                 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
 1010         }
 1011 
 1012         /*
 1013          * If the kernel is running on a virtual machine, then it must assume
 1014          * that MCA is enabled by the hypervisor.  Moreover, the kernel must
 1015          * be prepared for the hypervisor changing the vendor and family that
 1016          * are reported by CPUID.  Consequently, the workaround for AMD Family
 1017          * 10h Erratum 383 is enabled if the processor's feature set does not
 1018          * include at least one feature that is only supported by older Intel
 1019          * or newer AMD processors.
 1020          */
 1021         if (vm_guest == VM_GUEST_VM && (cpu_feature & CPUID_SS) == 0 &&
 1022             (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
 1023             CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
 1024             AMDID2_FMA4)) == 0)
 1025                 workaround_erratum383 = 1;
 1026 
 1027         /*
 1028          * Are large page mappings enabled?
               * The vm.pmap.pg_ps_enabled tunable may override the default;
               * when enabled, NBPDR is advertised as the second page size.
 1029          */
 1030         TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
 1031         if (pg_ps_enabled) {
 1032                 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
 1033                     ("pmap_init: can't assign to pagesizes[1]"));
 1034                 pagesizes[1] = NBPDR;
 1035         }
 1036 
 1037         /*
 1038          * Initialize the pv chunk list mutex.
 1039          */
 1040         mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
 1041 
 1042         /*
 1043          * Initialize the pool of pv list locks.
 1044          */
 1045         for (i = 0; i < NPV_LIST_LOCKS; i++)
 1046                 rw_init(&pv_list_locks[i], "pmap pv list");
 1047 
 1048         /*
 1049          * Calculate the size of the pv head table for superpages.
               * The empty-bodied loop steps through phys_avail[] two entries
               * at a time until the odd-indexed entry is zero.  The odd entry
               * of the pair before that, rounded up to a 2MB boundary and
               * divided by NBPDR, is the number of table entries.
               * NOTE(review): assumes phys_avail[] holds at least one
               * non-empty pair; otherwise (i - 2) + 1 indexes before the
               * start of the array.
 1050          */
 1051         for (i = 0; phys_avail[i + 1]; i += 2);
 1052         pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR;
 1053 
 1054         /*
 1055          * Allocate memory for the pv head table for superpages.
               * The size is rounded up to whole pages, the memory is zeroed
               * (M_ZERO) and every list head is then initialized.
 1056          */
 1057         s = (vm_size_t)(pv_npg * sizeof(struct md_page));
 1058         s = round_page(s);
 1059         pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
 1060             M_WAITOK | M_ZERO);
 1061         for (i = 0; i < pv_npg; i++)
 1062                 TAILQ_INIT(&pv_table[i].pv_list);
 1063 
              /*
               * One page of KVA each for cpage_a and cpage_b, guarded by
               * cpage_lock.  Their users are outside this part of the file.
               */
 1064         mtx_init(&cpage_lock, "cpage", NULL, MTX_DEF);
 1065         cpage_a = kva_alloc(PAGE_SIZE);
 1066         cpage_b = kva_alloc(PAGE_SIZE);
 1067 }
 1068 
      /*
       * Read-only (CTLFLAG_RD) counters for 2MB and 1GB page mappings,
       * registered below the _vm_pmap sysctl node as the sub-nodes "pde" and
       * "pdpe".  The code that updates the counters is not in this part of
       * the file.
       */
 1069 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
 1070     "2MB page mapping counters");
 1071 
 1072 static u_long pmap_pde_demotions;
 1073 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
 1074     &pmap_pde_demotions, 0, "2MB page demotions");
 1075 
 1076 static u_long pmap_pde_mappings;
 1077 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
 1078     &pmap_pde_mappings, 0, "2MB page mappings");
 1079 
 1080 static u_long pmap_pde_p_failures;
 1081 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
 1082     &pmap_pde_p_failures, 0, "2MB page promotion failures");
 1083 
 1084 static u_long pmap_pde_promotions;
 1085 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
 1086     &pmap_pde_promotions, 0, "2MB page promotions");
 1087 
      /* Counters for 1GB pages; only demotions are counted here. */
 1088 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
 1089     "1GB page mapping counters");
 1090 
 1091 static u_long pmap_pdpe_demotions;
 1092 SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
 1093     &pmap_pdpe_demotions, 0, "1GB page demotions");
 1094 
 1095 /***************************************************
 1096  * Low level helper routines.....
 1097  ***************************************************/
 1098 
      /*
       * Exchange the PTE-format and the PDE-format PAT bit of "entry" and
       * return the result.
       *
       * For a PT_X86 pmap the PAT bit sits at a different position in a PTE
       * (X86_PG_PTE_PAT) than in a PDE (X86_PG_PDE_PAT).  If exactly one of
       * the two bits is set it is moved to the other position; if neither
       * is set the entry is returned unchanged.  Having both set is invalid
       * and trips the KASSERT.  For a PT_EPT pmap the entry is returned
       * unchanged.  Any other pmap type panics.
       */
 1099 static pt_entry_t
 1100 pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
 1101 {
 1102         int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
 1103 
 1104         switch (pmap->pm_type) {
 1105         case PT_X86:
 1106                 /* Verify that both PAT bits are not set at the same time */
 1107                 KASSERT((entry & x86_pat_bits) != x86_pat_bits,
 1108                     ("Invalid PAT bits in entry %#lx", entry));
 1109 
 1110                 /*
                       * Swap the PAT bits if one of them is set: flipping
                       * both clears the set one and sets the other.
                       */
 1111                 if ((entry & x86_pat_bits) != 0)
 1112                         entry ^= x86_pat_bits;
 1113                 break;
 1114         case PT_EPT:
 1115                 /*
 1116                  * Nothing to do - the memory attributes are represented
 1117                  * the same way for regular pages and superpages.
 1118                  */
 1119                 break;
 1120         default:
                      /* Report the failure under this function's real name. */
 1121                 panic("pmap_swap_pat: bad pm_type %d", pmap->pm_type);
 1122         }
 1123 
 1124         return (entry);
 1125 }
 1126 
 1127 /*
 1128  * Work out which caching-related bits a PTE or PDE of "pmap" must carry
 1129  * to select caching mode "mode".  "is_pde" picks the PDE position of the
       * PAT bit instead of the PTE position.  Panics on a mode that has no PAT
       * index assigned and on a pmap type other than PT_X86 or PT_EPT.
 1130  */
 1131 static int
 1132 pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
 1133 {
 1134         int bits, idx;
 1135 
 1136         if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
 1137                 panic("Unknown caching mode %d\n", mode);
 1138 
              /* For EPT the mode itself is encoded, not a PAT index. */
              if (pmap->pm_type == PT_EPT)
                      return (EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode));
              if (pmap->pm_type != PT_X86)
                      panic("unsupported pmap type %d", pmap->pm_type);

              /*
               * PT_X86: look up the PAT index for the mode and spread its
               * three bits over the PWT, PCD and PAT bits of the entry.  The
               * PAT bit is different for PTE's and PDE's.
               */
              idx = pat_index[mode];
              bits = 0;
              if ((idx & 0x1) != 0)
                      bits |= PG_NC_PWT;
              if ((idx & 0x2) != 0)
                      bits |= PG_NC_PCD;
              if ((idx & 0x4) != 0)
                      bits |= is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;

              return (bits);
 1166 }
 1167 
      /*
       * Return the mask covering the caching-related bits of an entry in
       * "pmap".  For PT_X86 the PDE or the PTE variant is chosen by "is_pde";
       * for PT_EPT the mask is the ignore-PAT bit plus the full memory type
       * field.  Any other pmap type panics.
       */
 1168 static int
 1169 pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
 1170 {

              switch (pmap->pm_type) {
              case PT_EPT:
                      return (EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7));
              case PT_X86:
                      if (is_pde)
                              return (X86_PG_PDE_CACHE);
                      return (X86_PG_PTE_CACHE);
              default:
                      panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
              }
 1185 }
 1186 
      /*
       * Tell whether superpage mappings may be used in "pmap": the global
       * pg_ps_enabled switch must be on and the pmap must have
       * PMAP_PDE_SUPERPAGE set in pm_flags.
       */
 1187 static __inline boolean_t
 1188 pmap_ps_enabled(pmap_t pmap)
 1189 {
 1190 
              if (!pg_ps_enabled)
                      return (0);
              return ((pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
 1192 }
 1193 
      /*
       * Store "newpde" into the page directory entry "pde" of "pmap".
       *
       * For a PT_X86 pmap this is just the pde_store().  For a PT_EPT pmap
       * the pmap's pm_eptgen counter is incremented before the store.  Any
       * other pmap type panics before anything is written.
       */
 1194 static void
 1195 pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
 1196 {
 1197 
 1198         switch (pmap->pm_type) {
 1199         case PT_X86:
 1200                 break;
 1201         case PT_EPT:
 1202                 /*
 1203                  * XXX
 1204                  * This is a little bogus since the generation number is
 1205                  * supposed to be bumped up when a region of the address
 1206                  * space is invalidated in the page tables.
 1207                  *
 1208                  * In this case the old PDE entry is valid but yet we want
 1209                  * to make sure that any mappings using the old entry are
 1210                  * invalidated in the TLB.
 1211                  *
 1212                  * The reason this works as expected is because we rendezvous
 1213                  * "all" host cpus and force any vcpu context to exit as a
 1214                  * side-effect.
 1215                  */
 1216                 atomic_add_acq_long(&pmap->pm_eptgen, 1);
 1217                 break;
 1218         default:
 1219                 panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
 1220         }
 1221         pde_store(pde, newpde);
 1222 }
 1223 
 1224 /*
 1225  * After changing the page size for the specified virtual address in the page
 1226  * table, flush the corresponding entries from the processor's TLB.  Only the
 1227  * calling processor's TLB is affected.
 1228  *
 1229  * The calling thread must be pinned to a processor.
       *
       * "newpde" is the entry that was installed; its PG_PS bit tells a
       * demotion (clear) from a promotion (set), and for a promotion its
       * global bit selects how much of the TLB is flushed.  Nothing is done
       * for a PT_EPT pmap.
 1230  */
 1231 static void
 1232 pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
 1233 {
              /* Local variable holding this pmap's global bit, set below. */
 1234         pt_entry_t PG_G;
 1235 
 1236         if (pmap->pm_type == PT_EPT)
 1237                 return;
 1238 
 1239         KASSERT(pmap->pm_type == PT_X86,
 1240             ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
 1241 
 1242         PG_G = pmap_global_bit(pmap);
 1243 
 1244         if ((newpde & PG_PS) == 0)
 1245                 /* Demotion: flush a specific 2MB page mapping. */
 1246                 invlpg(va);
 1247         else if ((newpde & PG_G) == 0)
 1248                 /*
 1249                  * Promotion: flush every 4KB page mapping from the TLB
 1250                  * because there are too many to flush individually.
 1251                  */
 1252                 invltlb();
 1253         else {
 1254                 /*
 1255                  * Promotion: flush every 4KB page mapping from the TLB,
 1256                  * including any global (PG_G) mappings.
 1257                  */
 1258                 invltlb_globpcid();
 1259         }
 1260 }
 1261 #ifdef SMP
 1262 
      /*
       * Invalidate the mapping of "va" that the calling processor's TLB
       * holds for "pmap", for use when PCIDs are enabled.
       *
       * With INVPCID support a single invpcid(INVPCID_ADDR) for the pmap's
       * PCID does the job.  Otherwise %cr3 is temporarily switched to the
       * pmap's pm_cr3 inside a critical section, invlpg is issued, and the
       * previous %cr3 is loaded back.  Both loads pass CR3_PCID_SAVE.
       * NOTE(review): presumably CR3_PCID_SAVE keeps the load itself from
       * flushing the PCID's TLB entries -- confirm against its definition.
       */
 1263 static void
 1264 pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va)
 1265 {
 1266         struct invpcid_descr d;
 1267         uint64_t cr3;
 1268 
 1269         if (invpcid_works) {
 1270                 d.pcid = pmap->pm_pcid;
 1271                 d.pad = 0;
 1272                 d.addr = va;
 1273                 invpcid(&d, INVPCID_ADDR);
 1274                 return;
 1275         }
 1276 
              /* Remember the current %cr3 so it can be restored below. */
 1277         cr3 = rcr3();
 1278         critical_enter();
 1279         load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
 1280         invlpg(va);
 1281         load_cr3(cr3 | CR3_PCID_SAVE);
 1282         critical_exit();
 1283 }
 1284 
 1285 /*
 1286  * For SMP, these functions have to use the IPI mechanism for coherence.
 1287  *
 1288  * N.B.: Before calling any of the following TLB invalidation functions,
 1289  * the calling processor must ensure that all stores updating a non-
 1290  * kernel page table are globally performed.  Otherwise, another
 1291  * processor could cache an old, pre-update entry without being
 1292  * invalidated.  This can happen one of two ways: (1) The pmap becomes
 1293  * active on another processor after its pm_active field is checked by
 1294  * one of the following functions but before a store updating the page
 1295  * table is globally performed. (2) The pmap becomes active on another
 1296  * processor before its pm_active field is checked but due to
 1297  * speculative loads one of the following functions still reads the
 1298  * pmap as inactive on the other processor.
 1299  * 
 1300  * The kernel page table is exempt because its pm_active field is
 1301  * immutable.  The kernel page table is always active on every
 1302  * processor.
 1303  */
 1304 
 1305 /*
 1306  * Interrupt the cpus that are executing in the guest context.
 1307  * This will force the vcpu to exit and the cached EPT mappings
 1308  * will be invalidated by the host before the next vmresume.
       *
       * The pmap's pm_eptgen counter is incremented first; then the IPI
       * selected by the pmap's pm_flags is sent to the CPUs in pm_active.
       * The thread stays pinned for the duration.
 1309  */
 1310 static __inline void
 1311 pmap_invalidate_ept(pmap_t pmap)
 1312 {
 1313         int ipinum;
 1314 
 1315         sched_pin();
              /* The calling CPU is not expected to be in pm_active. */
 1316         KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
 1317             ("pmap_invalidate_ept: absurd pm_active"));
 1318 
 1319         /*
 1320          * The TLB mappings associated with a vcpu context are not
 1321          * flushed each time a different vcpu is chosen to execute.
 1322          *
 1323          * This is in contrast with a process's vtop mappings that
 1324          * are flushed from the TLB on each context switch.
 1325          *
 1326          * Therefore we need to do more than just a TLB shootdown on
 1327          * the active cpus in 'pmap->pm_active'. To do this we keep
 1328          * track of the number of invalidations performed on this pmap.
 1329          *
 1330          * Each vcpu keeps a cache of this counter and compares it
 1331          * just before a vmresume. If the counter is out-of-date an
 1332          * invept will be done to flush stale mappings from the TLB.
 1333          */
 1334         atomic_add_acq_long(&pmap->pm_eptgen, 1);
 1335 
 1336         /*
 1337          * Force the vcpu to exit and trap back into the hypervisor.
               * The IPI number is taken from the PMAP_NESTED_IPIMASK bits of
               * pm_flags.
 1338          */
 1339         ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
 1340         ipi_selected(pmap->pm_active, ipinum);
 1341         sched_unpin();
 1342 }
 1343 
      /*
       * Invalidate the TLB entry for "va" in "pmap", on this processor and on
       * the other processors that need it.
       *
       * PT_EPT pmaps are handed to pmap_invalidate_ept().  For PT_X86 pmaps
       * the thread is pinned, the local TLB is dealt with first, and the
       * other CPUs are then notified through smp_invlpg() or
       * smp_masked_invlpg().
       */
 1344 void
 1345 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 1346 {
 1347         cpuset_t other_cpus;
 1348         u_int cpuid;
 1349 
 1350         if (pmap->pm_type == PT_EPT) {
 1351                 pmap_invalidate_ept(pmap);
 1352                 return;
 1353         }
 1354 
 1355         KASSERT(pmap->pm_type == PT_X86,
 1356             ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
 1357 
 1358         sched_pin();
              /*
               * First case: the kernel pmap, or a pmap whose pm_active
               * compares equal to all_cpus; every CPU is notified.
               * NOTE(review): this reading assumes CPU_CMP() yields zero for
               * equal sets -- confirm against its definition.
               */
 1359         if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
 1360                 if (!pmap_pcid_enabled) {
 1361                         invlpg(va);
 1362                 } else {
                              /*
                               * PCID 0 is the kernel pmap's; -1 presumably
                               * means that no PCID is assigned (TODO
                               * confirm).  For those two, flush everything,
                               * global entries included.
                               */
 1363                         if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
 1364                                 if (pmap == PCPU_GET(curpmap))
 1365                                         invlpg(va);
 1366                                 else
 1367                                         pmap_invalidate_page_pcid(pmap, va);
 1368                         } else {
 1369                                 invltlb_globpcid();
 1370                         }
 1371                 }
 1372                 smp_invlpg(pmap, va);
 1373         } else {
                      /*
                       * Second case: notify only selected CPUs.  Start from
                       * all CPUs but this one and narrow the set down below.
                       */
 1374                 cpuid = PCPU_GET(cpuid);
 1375                 other_cpus = all_cpus;
 1376                 CPU_CLR(cpuid, &other_cpus);
 1377                 if (CPU_ISSET(cpuid, &pmap->pm_active))
 1378                         invlpg(va);
 1379                 else if (pmap_pcid_enabled) {
 1380                         if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
 1381                                 pmap_invalidate_page_pcid(pmap, va);
 1382                         else
 1383                                 invltlb_globpcid();
 1384                 }
                      /*
                       * With PCIDs the targets are the CPUs in pm_save,
                       * otherwise the CPUs in pm_active.
                       */
 1385                 if (pmap_pcid_enabled)
 1386                         CPU_AND(&other_cpus, &pmap->pm_save);
 1387                 else
 1388                         CPU_AND(&other_cpus, &pmap->pm_active);
 1389                 if (!CPU_EMPTY(&other_cpus))
 1390                         smp_masked_invlpg(other_cpus, pmap, va);
 1391         }
 1392         sched_unpin();
 1393 }
 1394 
      /*
       * Invalidate, on the calling processor, the TLB entries that "pmap"
       * has for the addresses from "sva" up to but not including "eva", one
       * PAGE_SIZE step at a time, for use when PCIDs are enabled.
       *
       * With INVPCID support each page gets an invpcid(INVPCID_ADDR) for the
       * pmap's PCID.  Otherwise %cr3 is temporarily switched to the pmap's
       * pm_cr3 inside a critical section, invlpg is issued per page, and
       * the previous %cr3 is loaded back.
       */
 1395 static void
 1396 pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 1397 {
 1398         struct invpcid_descr d;
 1399         uint64_t cr3;
 1400         vm_offset_t addr;
 1401 
 1402         if (invpcid_works) {
                      /* The descriptor is reused; only the address changes. */
 1403                 d.pcid = pmap->pm_pcid;
 1404                 d.pad = 0;
 1405                 for (addr = sva; addr < eva; addr += PAGE_SIZE) {
 1406                         d.addr = addr;
 1407                         invpcid(&d, INVPCID_ADDR);
 1408                 }
 1409                 return;
 1410         }
 1411 
              /* Remember the current %cr3 so it can be restored below. */
 1412         cr3 = rcr3();
 1413         critical_enter();
 1414         load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
 1415         for (addr = sva; addr < eva; addr += PAGE_SIZE)
 1416                 invlpg(addr);
 1417         load_cr3(cr3 | CR3_PCID_SAVE);
 1418         critical_exit();
 1419 }
 1420 
      /*
       * Invalidate the TLB entries of "pmap" for the addresses from "sva" up
       * to but not including "eva", on this processor and on the other
       * processors that need it.
       *
       * The structure mirrors pmap_invalidate_page(): PT_EPT pmaps go to
       * pmap_invalidate_ept(); for PT_X86 pmaps the local TLB is handled
       * page by page (or flushed entirely when the pmap has no PCID of its
       * own) and the other CPUs are notified through smp_invlpg_range() or
       * smp_masked_invlpg_range().
       */
 1421 void
 1422 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 1423 {
 1424         cpuset_t other_cpus;
 1425         vm_offset_t addr;
 1426         u_int cpuid;
 1427 
 1428         if (pmap->pm_type == PT_EPT) {
 1429                 pmap_invalidate_ept(pmap);
 1430                 return;
 1431         }
 1432 
 1433         KASSERT(pmap->pm_type == PT_X86,
 1434             ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
 1435 
 1436         sched_pin();
              /*
               * First case: the kernel pmap, or a pmap whose pm_active
               * compares equal to all_cpus; every CPU is notified.
               * NOTE(review): this reading assumes CPU_CMP() yields zero for
               * equal sets -- confirm against its definition.
               */
 1437         if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
 1438                 if (!pmap_pcid_enabled) {
 1439                         for (addr = sva; addr < eva; addr += PAGE_SIZE)
 1440                                 invlpg(addr);
 1441                 } else {
                              /*
                               * PCID 0 is the kernel pmap's; -1 presumably
                               * means that no PCID is assigned (TODO
                               * confirm).  For those two, flush everything,
                               * global entries included.
                               */
 1442                         if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
 1443                                 if (pmap == PCPU_GET(curpmap)) {
 1444                                         for (addr = sva; addr < eva;
 1445                                             addr += PAGE_SIZE)
 1446                                                 invlpg(addr);
 1447                                 } else {
 1448                                         pmap_invalidate_range_pcid(pmap,
 1449                                             sva, eva);
 1450                                 }
 1451                         } else {
 1452                                 invltlb_globpcid();
 1453                         }
 1454                 }
 1455                 smp_invlpg_range(pmap, sva, eva);
 1456         } else {
                      /*
                       * Second case: notify only selected CPUs.  Start from
                       * all CPUs but this one and narrow the set down below.
                       */
 1457                 cpuid = PCPU_GET(cpuid);
 1458                 other_cpus = all_cpus;
 1459                 CPU_CLR(cpuid, &other_cpus);
 1460                 if (CPU_ISSET(cpuid, &pmap->pm_active)) {
 1461                         for (addr = sva; addr < eva; addr += PAGE_SIZE)
 1462                                 invlpg(addr);
 1463                 } else if (pmap_pcid_enabled) {
 1464                         if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
 1465                                 pmap_invalidate_range_pcid(pmap, sva, eva);
 1466                         else
 1467                                 invltlb_globpcid();
 1468                 }
                      /*
                       * With PCIDs the targets are the CPUs in pm_save,
                       * otherwise the CPUs in pm_active.
                       */
 1469                 if (pmap_pcid_enabled)
 1470                         CPU_AND(&other_cpus, &pmap->pm_save);
 1471                 else
 1472                         CPU_AND(&other_cpus, &pmap->pm_active);
 1473                 if (!CPU_EMPTY(&other_cpus))
 1474                         smp_masked_invlpg_range(other_cpus, pmap, sva, eva);
 1475         }
 1476         sched_unpin();
 1477 }
 1478 
      /*
       * Invalidate all TLB entries belonging to "pmap", on this processor
       * and on the other processors that need it.
       *
       * PT_EPT pmaps are handed to pmap_invalidate_ept().  For PT_X86 pmaps
       * the thread is pinned and one of two paths is taken: a full flush
       * that notifies every CPU with smp_invltlb(), or a flush restricted to
       * the pmap's PCID (or to the current context) that notifies only the
       * CPUs recorded in pm_save / pm_active.  In both paths this CPU is
       * removed from pm_save when it is not in pm_active.
       */
 1479 void
 1480 pmap_invalidate_all(pmap_t pmap)
 1481 {
 1482         cpuset_t other_cpus;
 1483         struct invpcid_descr d;
 1484         uint64_t cr3;
 1485         u_int cpuid;
 1486 
 1487         if (pmap->pm_type == PT_EPT) {
 1488                 pmap_invalidate_ept(pmap);
 1489                 return;
 1490         }
 1491 
 1492         KASSERT(pmap->pm_type == PT_X86,
 1493             ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
 1494 
 1495         sched_pin();
 1496         cpuid = PCPU_GET(cpuid);
              /*
               * Full flush: the kernel pmap, or a pmap whose pm_save (with
               * PCIDs enabled) or pm_active compares equal to all_cpus.
               * NOTE(review): this reading assumes CPU_CMP() yields zero for
               * equal sets -- confirm against its definition.
               */
 1497         if (pmap == kernel_pmap ||
 1498             (pmap_pcid_enabled && !CPU_CMP(&pmap->pm_save, &all_cpus)) ||
 1499             !CPU_CMP(&pmap->pm_active, &all_cpus)) {
 1500                 if (invpcid_works) {
                              /* Zeroed descriptor for INVPCID_CTXGLOB. */
 1501                         bzero(&d, sizeof(d));
 1502                         invpcid(&d, INVPCID_CTXGLOB);
 1503                 } else {
 1504                         invltlb_globpcid();
 1505                 }
 1506                 if (!CPU_ISSET(cpuid, &pmap->pm_active))
 1507                         CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
 1508                 smp_invltlb(pmap);
 1509         } else {
 1510                 other_cpus = all_cpus;
 1511                 CPU_CLR(cpuid, &other_cpus);
 1512 
 1513                 /*
 1514                  * This logic is duplicated in the Xinvltlb shootdown
 1515                  * IPI handler.
 1516                  */
 1517                 if (pmap_pcid_enabled) {
                              /*
                               * PCID 0 is the kernel pmap's; -1 presumably
                               * means that no PCID is assigned (TODO
                               * confirm).
                               */
 1518                         if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
 1519                                 if (invpcid_works) {
 1520                                         d.pcid = pmap->pm_pcid;
 1521                                         d.pad = 0;
 1522                                         d.addr = 0;
 1523                                         invpcid(&d, INVPCID_CTX);
 1524                                 } else {
 1525                                         cr3 = rcr3();
 1526                                         critical_enter();
 1527 
 1528                                         /*
 1529                                          * Bit 63 is clear, pcid TLB
 1530                                          * entries are invalidated.
                                               * The previous %cr3 is then
                                               * loaded back with
                                               * CR3_PCID_SAVE.
 1531                                          */
 1532                                         load_cr3(pmap->pm_cr3);
 1533                                         load_cr3(cr3 | CR3_PCID_SAVE);
 1534                                         critical_exit();
 1535                                 }
 1536                         } else {
 1537                                 invltlb_globpcid();
 1538                         }
 1539                 } else if (CPU_ISSET(cpuid, &pmap->pm_active))
 1540                         invltlb();
 1541                 if (!CPU_ISSET(cpuid, &pmap->pm_active))
 1542                         CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
                      /*
                       * With PCIDs the targets are the CPUs in pm_save,
                       * otherwise the CPUs in pm_active.
                       */
 1543                 if (pmap_pcid_enabled)
 1544                         CPU_AND(&other_cpus, &pmap->pm_save);
 1545                 else
 1546                         CPU_AND(&other_cpus, &pmap->pm_active);
 1547                 if (!CPU_EMPTY(&other_cpus))
 1548                         smp_masked_invltlb(other_cpus, pmap);
 1549         }
 1550         sched_unpin();
 1551 }
 1552 
      /*
       * Write back and invalidate the caches: wbinvd on the calling
       * processor, followed by smp_cache_flush() for the others.  The thread
       * is pinned while both steps run.
       * NOTE(review): that smp_cache_flush() covers the remaining CPUs is
       * inferred from its name -- confirm against its definition.
       */
 1553 void
 1554 pmap_invalidate_cache(void)
 1555 {
 1556 
 1557         sched_pin();
 1558         wbinvd();
 1559         smp_cache_flush();
 1560         sched_unpin();
 1561 }
 1562 
 1563 struct pde_action {
 1564         cpuset_t invalidate;    /* processors that invalidate their TLB */
 1565         pmap_t pmap;
 1566         vm_offset_t va;
 1567         pd_entry_t *pde;
 1568         pd_entry_t newpde;
 1569         u_int store;            /* processor that updates the PDE */
 1570 };
 1571 
 1572 static void
 1573 pmap_update_pde_action(void *arg)
 1574 {
 1575         struct pde_action *act = arg;
 1576 
 1577         if (act->store == PCPU_GET(cpuid))
 1578                 pmap_update_pde_store(act->pmap, act->pde, act->newpde);
 1579 }
 1580 
 1581 static void
 1582 pmap_update_pde_teardown(void *arg)
 1583 {
 1584         struct pde_action *act = arg;
 1585 
 1586         if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
 1587                 pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
 1588 }
 1589 
 1590 /*
 1591  * Change the page size for the specified virtual address in a way that
 1592  * prevents any possibility of the TLB ever having two entries that map the
 1593  * same virtual address using different page sizes.  This is the recommended
 1594  * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
 1595  * machine check exception for a TLB state that is improperly diagnosed as a
 1596  * hardware error.
 1597  */
 1598 static void
 1599 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 1600 {
 1601         struct pde_action act;
 1602         cpuset_t active, other_cpus;
 1603         u_int cpuid;
 1604 
 1605         sched_pin();
 1606         cpuid = PCPU_GET(cpuid);
 1607         other_cpus = all_cpus;
 1608         CPU_CLR(cpuid, &other_cpus);
 1609         if (pmap == kernel_pmap || pmap->pm_type == PT_EPT)
 1610                 active = all_cpus;
 1611         else {
 1612                 active = pmap->pm_active;
 1613                 CPU_AND_ATOMIC(&pmap->pm_save, &active);
 1614         }
 1615         if (CPU_OVERLAP(&active, &other_cpus)) { 
 1616                 act.store = cpuid;
 1617                 act.invalidate = active;
 1618                 act.va = va;
 1619                 act.pmap = pmap;
 1620                 act.pde = pde;
 1621                 act.newpde = newpde;
 1622                 CPU_SET(cpuid, &active);
 1623                 smp_rendezvous_cpus(active,
 1624                     smp_no_rendevous_barrier, pmap_update_pde_action,
 1625                     pmap_update_pde_teardown, &act);
 1626         } else {
 1627                 pmap_update_pde_store(pmap, pde, newpde);
 1628                 if (CPU_ISSET(cpuid, &active))
 1629                         pmap_update_pde_invalidate(pmap, va, newpde);
 1630         }
 1631         sched_unpin();
 1632 }
 1633 #else /* !SMP */
 1634 /*
 1635  * Normal, non-SMP, invalidation functions.
 1636  * We inline these within pmap.c for speed.
 1637  */
 1638 PMAP_INLINE void
 1639 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 1640 {
 1641 
 1642         switch (pmap->pm_type) {
 1643         case PT_X86:
 1644                 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
 1645                         invlpg(va);
 1646                 break;
 1647         case PT_EPT:
 1648                 pmap->pm_eptgen++;
 1649                 break;
 1650         default:
 1651                 panic("pmap_invalidate_page: unknown type: %d", pmap->pm_type);
 1652         }
 1653 }
 1654 
 1655 PMAP_INLINE void
 1656 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 1657 {
 1658         vm_offset_t addr;
 1659 
 1660         switch (pmap->pm_type) {
 1661         case PT_X86:
 1662                 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
 1663                         for (addr = sva; addr < eva; addr += PAGE_SIZE)
 1664                                 invlpg(addr);
 1665                 break;
 1666         case PT_EPT:
 1667                 pmap->pm_eptgen++;
 1668                 break;
 1669         default:
 1670                 panic("pmap_invalidate_range: unknown type: %d", pmap->pm_type);
 1671         }
 1672 }
 1673 
 1674 PMAP_INLINE void
 1675 pmap_invalidate_all(pmap_t pmap)
 1676 {
 1677 
 1678         switch (pmap->pm_type) {
 1679         case PT_X86:
 1680                 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
 1681                         invltlb();
 1682                 break;
 1683         case PT_EPT:
 1684                 pmap->pm_eptgen++;
 1685                 break;
 1686         default:
 1687                 panic("pmap_invalidate_all: unknown type %d", pmap->pm_type);
 1688         }
 1689 }
 1690 
 1691 PMAP_INLINE void
 1692 pmap_invalidate_cache(void)
 1693 {
 1694 
 1695         wbinvd();
 1696 }
 1697 
 1698 static void
 1699 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 1700 {
 1701 
 1702         pmap_update_pde_store(pmap, pde, newpde);
 1703         if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
 1704                 pmap_update_pde_invalidate(pmap, va, newpde);
 1705         else
 1706                 CPU_ZERO(&pmap->pm_save);
 1707 }
 1708 #endif /* !SMP */
 1709 
 1710 #define PMAP_CLFLUSH_THRESHOLD   (2 * 1024 * 1024)
 1711 
 1712 void
 1713 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
 1714 {
 1715 
 1716         KASSERT((sva & PAGE_MASK) == 0,
 1717             ("pmap_invalidate_cache_range: sva not page-aligned"));
 1718         KASSERT((eva & PAGE_MASK) == 0,
 1719             ("pmap_invalidate_cache_range: eva not page-aligned"));
 1720 
 1721         if (cpu_feature & CPUID_SS)
 1722                 ; /* If "Self Snoop" is supported, do nothing. */
 1723         else if ((cpu_feature & CPUID_CLFSH) != 0 &&
 1724             eva - sva < PMAP_CLFLUSH_THRESHOLD) {
 1725 
 1726                 /*
 1727                  * XXX: Some CPUs fault, hang, or trash the local APIC
 1728                  * registers if we use CLFLUSH on the local APIC
 1729                  * range.  The local APIC is always uncached, so we
 1730                  * don't need to flush for that range anyway.
 1731                  */
 1732                 if (pmap_kextract(sva) == lapic_paddr)
 1733                         return;
 1734 
 1735                 /*
 1736                  * Otherwise, do per-cache line flush.  Use the mfence
 1737                  * instruction to insure that previous stores are
 1738                  * included in the write-back.  The processor
 1739                  * propagates flush to other processors in the cache
 1740                  * coherence domain.
 1741                  */
 1742                 mfence();
 1743                 for (; sva < eva; sva += cpu_clflush_line_size)
 1744                         clflush(sva);
 1745                 mfence();
 1746         } else {
 1747 
 1748                 /*
 1749                  * No targeted cache flush methods are supported by CPU,
 1750                  * or the supplied range is bigger than 2MB.
 1751                  * Globally invalidate cache.
 1752                  */
 1753                 pmap_invalidate_cache();
 1754         }
 1755 }
 1756 
 1757 /*
 1758  * Remove the specified set of pages from the data and instruction caches.
 1759  *
 1760  * In contrast to pmap_invalidate_cache_range(), this function does not
 1761  * rely on the CPU's self-snoop feature, because it is intended for use
 1762  * when moving pages into a different cache domain.
 1763  */
 1764 void
 1765 pmap_invalidate_cache_pages(vm_page_t *pages, int count)
 1766 {
 1767         vm_offset_t daddr, eva;
 1768         int i;
 1769 
 1770         if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
 1771             (cpu_feature & CPUID_CLFSH) == 0)
 1772                 pmap_invalidate_cache();
 1773         else {
 1774                 mfence();
 1775                 for (i = 0; i < count; i++) {
 1776                         daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
 1777                         eva = daddr + PAGE_SIZE;
 1778                         for (; daddr < eva; daddr += cpu_clflush_line_size)
 1779                                 clflush(daddr);
 1780                 }
 1781                 mfence();
 1782         }
 1783 }
 1784 
 1785 /*
 1786  *      Routine:        pmap_extract
 1787  *      Function:
 1788  *              Extract the physical page address associated
 1789  *              with the given map/virtual_address pair.
 1790  */
 1791 vm_paddr_t 
 1792 pmap_extract(pmap_t pmap, vm_offset_t va)
 1793 {
 1794         pdp_entry_t *pdpe;
 1795         pd_entry_t *pde;
 1796         pt_entry_t *pte, PG_V;
 1797         vm_paddr_t pa;
 1798 
 1799         pa = 0;
 1800         PG_V = pmap_valid_bit(pmap);
 1801         PMAP_LOCK(pmap);
 1802         pdpe = pmap_pdpe(pmap, va);
 1803         if (pdpe != NULL && (*pdpe & PG_V) != 0) {
 1804                 if ((*pdpe & PG_PS) != 0)
 1805                         pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
 1806                 else {
 1807                         pde = pmap_pdpe_to_pde(pdpe, va);
 1808                         if ((*pde & PG_V) != 0) {
 1809                                 if ((*pde & PG_PS) != 0) {
 1810                                         pa = (*pde & PG_PS_FRAME) |
 1811                                             (va & PDRMASK);
 1812                                 } else {
 1813                                         pte = pmap_pde_to_pte(pde, va);
 1814                                         pa = (*pte & PG_FRAME) |
 1815                                             (va & PAGE_MASK);
 1816                                 }
 1817                         }
 1818                 }
 1819         }
 1820         PMAP_UNLOCK(pmap);
 1821         return (pa);
 1822 }
 1823 
 1824 /*
 1825  *      Routine:        pmap_extract_and_hold
 1826  *      Function:
 1827  *              Atomically extract and hold the physical page
 1828  *              with the given pmap and virtual address pair
 1829  *              if that mapping permits the given protection.
 1830  */
 1831 vm_page_t
 1832 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 1833 {
 1834         pd_entry_t pde, *pdep;
 1835         pt_entry_t pte, PG_RW, PG_V;
 1836         vm_paddr_t pa;
 1837         vm_page_t m;
 1838 
 1839         pa = 0;
 1840         m = NULL;
 1841         PG_RW = pmap_rw_bit(pmap);
 1842         PG_V = pmap_valid_bit(pmap);
 1843         PMAP_LOCK(pmap);
 1844 retry:
 1845         pdep = pmap_pde(pmap, va);
 1846         if (pdep != NULL && (pde = *pdep)) {
 1847                 if (pde & PG_PS) {
 1848                         if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
 1849                                 if (vm_page_pa_tryrelock(pmap, (pde &
 1850                                     PG_PS_FRAME) | (va & PDRMASK), &pa))
 1851                                         goto retry;
 1852                                 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
 1853                                     (va & PDRMASK));
 1854                                 vm_page_hold(m);
 1855                         }
 1856                 } else {
 1857                         pte = *pmap_pde_to_pte(pdep, va);
 1858                         if ((pte & PG_V) &&
 1859                             ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
 1860                                 if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
 1861                                     &pa))
 1862                                         goto retry;
 1863                                 m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
 1864                                 vm_page_hold(m);
 1865                         }
 1866                 }
 1867         }
 1868         PA_UNLOCK_COND(pa);
 1869         PMAP_UNLOCK(pmap);
 1870         return (m);
 1871 }
 1872 
 1873 vm_paddr_t
 1874 pmap_kextract(vm_offset_t va)
 1875 {
 1876         pd_entry_t pde;
 1877         vm_paddr_t pa;
 1878 
 1879         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
 1880                 pa = DMAP_TO_PHYS(va);
 1881         } else {
 1882                 pde = *vtopde(va);
 1883                 if (pde & PG_PS) {
 1884                         pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
 1885                 } else {
 1886                         /*
 1887                          * Beware of a concurrent promotion that changes the
 1888                          * PDE at this point!  For example, vtopte() must not
 1889                          * be used to access the PTE because it would use the
 1890                          * new PDE.  It is, however, safe to use the old PDE
 1891                          * because the page table page is preserved by the
 1892                          * promotion.
 1893                          */
 1894                         pa = *pmap_pde_to_pte(&pde, va);
 1895                         pa = (pa & PG_FRAME) | (va & PAGE_MASK);
 1896                 }
 1897         }
 1898         return (pa);
 1899 }
 1900 
 1901 /***************************************************
 1902  * Low level mapping routines.....
 1903  ***************************************************/
 1904 
 1905 /*
 1906  * Add a wired page to the kva.
 1907  * Note: not SMP coherent.
 1908  */
 1909 PMAP_INLINE void 
 1910 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
 1911 {
 1912         pt_entry_t *pte;
 1913 
 1914         pte = vtopte(va);
 1915         pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G);
 1916 }
 1917 
 1918 static __inline void
 1919 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
 1920 {
 1921         pt_entry_t *pte;
 1922         int cache_bits;
 1923 
 1924         pte = vtopte(va);
 1925         cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
 1926         pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits);
 1927 }
 1928 
 1929 /*
 1930  * Remove a page from the kernel pagetables.
 1931  * Note: not SMP coherent.
 1932  */
 1933 PMAP_INLINE void
 1934 pmap_kremove(vm_offset_t va)
 1935 {
 1936         pt_entry_t *pte;
 1937 
 1938         pte = vtopte(va);
 1939         pte_clear(pte);
 1940 }
 1941 
 1942 /*
 1943  *      Used to map a range of physical addresses into kernel
 1944  *      virtual address space.
 1945  *
 1946  *      The value passed in '*virt' is a suggested virtual address for
 1947  *      the mapping. Architectures which can support a direct-mapped
 1948  *      physical to virtual region can return the appropriate address
 1949  *      within that region, leaving '*virt' unchanged. Other
 1950  *      architectures should map the pages starting at '*virt' and
 1951  *      update '*virt' with the first usable address after the mapped
 1952  *      region.
 1953  */
 1954 vm_offset_t
 1955 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
 1956 {
 1957         return PHYS_TO_DMAP(start);
 1958 }
 1959 
 1960 
 1961 /*
 1962  * Add a list of wired pages to the kva
 1963  * this routine is only used for temporary
 1964  * kernel mappings that do not need to have
 1965  * page modification or references recorded.
 1966  * Note that old mappings are simply written
 1967  * over.  The page *must* be wired.
 1968  * Note: SMP coherent.  Uses a ranged shootdown IPI.
 1969  */
 1970 void
 1971 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
 1972 {
 1973         pt_entry_t *endpte, oldpte, pa, *pte;
 1974         vm_page_t m;
 1975         int cache_bits;
 1976 
 1977         oldpte = 0;
 1978         pte = vtopte(sva);
 1979         endpte = pte + count;
 1980         while (pte < endpte) {
 1981                 m = *ma++;
 1982                 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
 1983                 pa = VM_PAGE_TO_PHYS(m) | cache_bits;
 1984                 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
 1985                         oldpte |= *pte;
 1986                         pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V);
 1987                 }
 1988                 pte++;
 1989         }
 1990         if (__predict_false((oldpte & X86_PG_V) != 0))
 1991                 pmap_invalidate_range(kernel_pmap, sva, sva + count *
 1992                     PAGE_SIZE);
 1993 }
 1994 
 1995 /*
 1996  * This routine tears out page mappings from the
 1997  * kernel -- it is meant only for temporary mappings.
 1998  * Note: SMP coherent.  Uses a ranged shootdown IPI.
 1999  */
 2000 void
 2001 pmap_qremove(vm_offset_t sva, int count)
 2002 {
 2003         vm_offset_t va;
 2004 
 2005         va = sva;
 2006         while (count-- > 0) {
 2007                 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
 2008                 pmap_kremove(va);
 2009                 va += PAGE_SIZE;
 2010         }
 2011         pmap_invalidate_range(kernel_pmap, sva, va);
 2012 }
 2013 
 2014 /***************************************************
 2015  * Page table page management routines.....
 2016  ***************************************************/
 2017 static __inline void
 2018 pmap_free_zero_pages(struct spglist *free)
 2019 {
 2020         vm_page_t m;
 2021 
 2022         while ((m = SLIST_FIRST(free)) != NULL) {
 2023                 SLIST_REMOVE_HEAD(free, plinks.s.ss);
 2024                 /* Preserve the page's PG_ZERO setting. */
 2025                 vm_page_free_toq(m);
 2026         }
 2027 }
 2028 
 2029 /*
 2030  * Schedule the specified unused page table page to be freed.  Specifically,
 2031  * add the page to the specified list of pages that will be released to the
 2032  * physical memory manager after the TLB has been updated.
 2033  */
 2034 static __inline void
 2035 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
 2036     boolean_t set_PG_ZERO)
 2037 {
 2038 
 2039         if (set_PG_ZERO)
 2040                 m->flags |= PG_ZERO;
 2041         else
 2042                 m->flags &= ~PG_ZERO;
 2043         SLIST_INSERT_HEAD(free, m, plinks.s.ss);
 2044 }
 2045         
 2046 /*
 2047  * Inserts the specified page table page into the specified pmap's collection
 2048  * of idle page table pages.  Each of a pmap's page table pages is responsible
 2049  * for mapping a distinct range of virtual addresses.  The pmap's collection is
 2050  * ordered by this virtual address range.
 2051  */
 2052 static __inline int
 2053 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
 2054 {
 2055 
 2056         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2057         return (vm_radix_insert(&pmap->pm_root, mpte));
 2058 }
 2059 
 2060 /*
 2061  * Looks for a page table page mapping the specified virtual address in the
 2062  * specified pmap's collection of idle page table pages.  Returns NULL if there
 2063  * is no page table page corresponding to the specified virtual address.
 2064  */
 2065 static __inline vm_page_t
 2066 pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
 2067 {
 2068 
 2069         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2070         return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va)));
 2071 }
 2072 
 2073 /*
 2074  * Removes the specified page table page from the specified pmap's collection
 2075  * of idle page table pages.  The specified page table page must be a member of
 2076  * the pmap's collection.
 2077  */
 2078 static __inline void
 2079 pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
 2080 {
 2081 
 2082         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2083         vm_radix_remove(&pmap->pm_root, mpte->pindex);
 2084 }
 2085 
 2086 /*
 2087  * Decrements a page table page's wire count, which is used to record the
 2088  * number of valid page table entries within the page.  If the wire count
 2089  * drops to zero, then the page table page is unmapped.  Returns TRUE if the
 2090  * page table page was unmapped and FALSE otherwise.
 2091  */
 2092 static inline boolean_t
 2093 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 2094 {
 2095 
 2096         --m->wire_count;
 2097         if (m->wire_count == 0) {
 2098                 _pmap_unwire_ptp(pmap, va, m, free);
 2099                 return (TRUE);
 2100         } else
 2101                 return (FALSE);
 2102 }
 2103 
 2104 static void
 2105 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 2106 {
 2107 
 2108         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2109         /*
 2110          * unmap the page table page
 2111          */
 2112         if (m->pindex >= (NUPDE + NUPDPE)) {
 2113                 /* PDP page */
 2114                 pml4_entry_t *pml4;
 2115                 pml4 = pmap_pml4e(pmap, va);
 2116                 *pml4 = 0;
 2117         } else if (m->pindex >= NUPDE) {
 2118                 /* PD page */
 2119                 pdp_entry_t *pdp;
 2120                 pdp = pmap_pdpe(pmap, va);
 2121                 *pdp = 0;
 2122         } else {
 2123                 /* PTE page */
 2124                 pd_entry_t *pd;
 2125                 pd = pmap_pde(pmap, va);
 2126                 *pd = 0;
 2127         }
 2128         pmap_resident_count_dec(pmap, 1);
 2129         if (m->pindex < NUPDE) {
 2130                 /* We just released a PT, unhold the matching PD */
 2131                 vm_page_t pdpg;
 2132 
 2133                 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
 2134                 pmap_unwire_ptp(pmap, va, pdpg, free);
 2135         }
 2136         if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
 2137                 /* We just released a PD, unhold the matching PDP */
 2138                 vm_page_t pdppg;
 2139 
 2140                 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
 2141                 pmap_unwire_ptp(pmap, va, pdppg, free);
 2142         }
 2143 
 2144         /*
 2145          * This is a release store so that the ordinary store unmapping
 2146          * the page table page is globally performed before TLB shoot-
 2147          * down is begun.
 2148          */
 2149         atomic_subtract_rel_int(&cnt.v_wire_count, 1);
 2150 
 2151         /* 
 2152          * Put page on a list so that it is released after
 2153          * *ALL* TLB shootdown is done
 2154          */
 2155         pmap_add_delayed_free_list(m, free, TRUE);
 2156 }
 2157 
 2158 /*
 2159  * After removing a page table entry, this routine is used to
 2160  * conditionally free the page, and manage the hold/wire counts.
 2161  */
 2162 static int
 2163 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
 2164     struct spglist *free)
 2165 {
 2166         vm_page_t mpte;
 2167 
 2168         if (va >= VM_MAXUSER_ADDRESS)
 2169                 return (0);
 2170         KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
 2171         mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
 2172         return (pmap_unwire_ptp(pmap, va, mpte, free));
 2173 }
 2174 
 2175 void
 2176 pmap_pinit0(pmap_t pmap)
 2177 {
 2178 
 2179         PMAP_LOCK_INIT(pmap);
 2180         pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
 2181         pmap->pm_cr3 = KPML4phys;
 2182         pmap->pm_root.rt_root = 0;
 2183         CPU_ZERO(&pmap->pm_active);
 2184         CPU_ZERO(&pmap->pm_save);
 2185         PCPU_SET(curpmap, pmap);
 2186         TAILQ_INIT(&pmap->pm_pvchunk);
 2187         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 2188         pmap->pm_pcid = pmap_pcid_enabled ? 0 : -1;
 2189         pmap->pm_flags = pmap_flags;
 2190 }
 2191 
 2192 /*
 2193  * Initialize a preallocated and zeroed pmap structure,
 2194  * such as one in a vmspace structure.
 2195  */
 2196 int
 2197 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
 2198 {
 2199         vm_page_t pml4pg;
 2200         vm_paddr_t pml4phys;
 2201         int i;
 2202 
 2203         /*
 2204          * allocate the page directory page
 2205          */
 2206         while ((pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 2207             VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
 2208                 VM_WAIT;
 2209 
 2210         pml4phys = VM_PAGE_TO_PHYS(pml4pg);
 2211         pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
 2212         pmap->pm_pcid = -1;
 2213         pmap->pm_cr3 = ~0;      /* initialize to an invalid value */
 2214 
 2215         if ((pml4pg->flags & PG_ZERO) == 0)
 2216                 pagezero(pmap->pm_pml4);
 2217 
 2218         /*
 2219          * Do not install the host kernel mappings in the nested page
 2220          * tables. These mappings are meaningless in the guest physical
 2221          * address space.
 2222          */
 2223         if ((pmap->pm_type = pm_type) == PT_X86) {
 2224                 pmap->pm_cr3 = pml4phys;
 2225 
 2226                 /* Wire in kernel global address entries. */
 2227                 for (i = 0; i < NKPML4E; i++) {
 2228                         pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) |
 2229                             X86_PG_RW | X86_PG_V | PG_U;
 2230                 }
 2231                 for (i = 0; i < ndmpdpphys; i++) {
 2232                         pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) |
 2233                             X86_PG_RW | X86_PG_V | PG_U;
 2234                 }
 2235 
 2236                 /* install self-referential address mapping entry(s) */
 2237                 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) |
 2238                     X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
 2239 
 2240                 if (pmap_pcid_enabled) {
 2241                         pmap->pm_pcid = alloc_unr(&pcid_unr);
 2242                         if (pmap->pm_pcid != -1)
 2243                                 pmap->pm_cr3 |= pmap->pm_pcid;
 2244                 }
 2245         }
 2246 
 2247         pmap->pm_root.rt_root = 0;
 2248         CPU_ZERO(&pmap->pm_active);
 2249         TAILQ_INIT(&pmap->pm_pvchunk);
 2250         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 2251         pmap->pm_flags = flags;
 2252         pmap->pm_eptgen = 0;
 2253         CPU_ZERO(&pmap->pm_save);
 2254 
 2255         return (1);
 2256 }
 2257 
 2258 int
 2259 pmap_pinit(pmap_t pmap)
 2260 {
 2261 
 2262         return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
 2263 }
 2264 
 2265 /*
 2266  * This routine is called if the desired page table page does not exist.
 2267  *
 2268  * If page table page allocation fails, this routine may sleep before
 2269  * returning NULL.  It sleeps only if a lock pointer was given.
 2270  *
 2271  * Note: If a page allocation fails at page table level two or three,
 2272  * one or two pages may be held during the wait, only to be released
 2273  * afterwards.  This conservative approach is easily argued to avoid
 2274  * race conditions.
 2275  */
 2276 static vm_page_t
 2277 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
 2278 {
 2279         vm_page_t m, pdppg, pdpg;
 2280         pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 2281 
 2282         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2283 
 2284         PG_A = pmap_accessed_bit(pmap);
 2285         PG_M = pmap_modified_bit(pmap);
 2286         PG_V = pmap_valid_bit(pmap);
 2287         PG_RW = pmap_rw_bit(pmap);
 2288 
 2289         /*
 2290          * Allocate a page table page.
 2291          */
 2292         if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
 2293             VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
 2294                 if (lockp != NULL) {
 2295                         RELEASE_PV_LIST_LOCK(lockp);
 2296                         PMAP_UNLOCK(pmap);
 2297                         rw_runlock(&pvh_global_lock);
 2298                         VM_WAIT;
 2299                         rw_rlock(&pvh_global_lock);
 2300                         PMAP_LOCK(pmap);
 2301                 }
 2302 
 2303                 /*
 2304                  * Indicate the need to retry.  While waiting, the page table
 2305                  * page may have been allocated.
 2306                  */
 2307                 return (NULL);
 2308         }
 2309         if ((m->flags & PG_ZERO) == 0)
 2310                 pmap_zero_page(m);
 2311 
 2312         /*
 2313          * Map the pagetable page into the process address space, if
 2314          * it isn't already there.
 2315          */
 2316 
 2317         if (ptepindex >= (NUPDE + NUPDPE)) {
 2318                 pml4_entry_t *pml4;
 2319                 vm_pindex_t pml4index;
 2320 
 2321                 /* Wire up a new PDPE page */
 2322                 pml4index = ptepindex - (NUPDE + NUPDPE);
 2323                 pml4 = &pmap->pm_pml4[pml4index];
 2324                 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 2325 
 2326         } else if (ptepindex >= NUPDE) {
 2327                 vm_pindex_t pml4index;
 2328                 vm_pindex_t pdpindex;
 2329                 pml4_entry_t *pml4;
 2330                 pdp_entry_t *pdp;
 2331 
 2332                 /* Wire up a new PDE page */
 2333                 pdpindex = ptepindex - NUPDE;
 2334                 pml4index = pdpindex >> NPML4EPGSHIFT;
 2335 
 2336                 pml4 = &pmap->pm_pml4[pml4index];
 2337                 if ((*pml4 & PG_V) == 0) {
 2338                         /* Have to allocate a new pdp, recurse */
 2339                         if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
 2340                             lockp) == NULL) {
 2341                                 --m->wire_count;
 2342                                 atomic_subtract_int(&cnt.v_wire_count, 1);
 2343                                 vm_page_free_zero(m);
 2344                                 return (NULL);
 2345                         }
 2346                 } else {
 2347                         /* Add reference to pdp page */
 2348                         pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
 2349                         pdppg->wire_count++;
 2350                 }
 2351                 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 2352 
 2353                 /* Now find the pdp page */
 2354                 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 2355                 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 2356 
 2357         } else {
 2358                 vm_pindex_t pml4index;
 2359                 vm_pindex_t pdpindex;
 2360                 pml4_entry_t *pml4;
 2361                 pdp_entry_t *pdp;
 2362                 pd_entry_t *pd;
 2363 
 2364                 /* Wire up a new PTE page */
 2365                 pdpindex = ptepindex >> NPDPEPGSHIFT;
 2366                 pml4index = pdpindex >> NPML4EPGSHIFT;
 2367 
 2368                 /* First, find the pdp and check that its valid. */
 2369                 pml4 = &pmap->pm_pml4[pml4index];
 2370                 if ((*pml4 & PG_V) == 0) {
 2371                         /* Have to allocate a new pd, recurse */
 2372                         if (_pmap_allocpte(pmap, NUPDE + pdpindex,
 2373                             lockp) == NULL) {
 2374                                 --m->wire_count;
 2375                                 atomic_subtract_int(&cnt.v_wire_count, 1);
 2376                                 vm_page_free_zero(m);
 2377                                 return (NULL);
 2378                         }
 2379                         pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 2380                         pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 2381                 } else {
 2382                         pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 2383                         pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 2384                         if ((*pdp & PG_V) == 0) {
 2385                                 /* Have to allocate a new pd, recurse */
 2386                                 if (_pmap_allocpte(pmap, NUPDE + pdpindex,
 2387                                     lockp) == NULL) {
 2388                                         --m->wire_count;
 2389                                         atomic_subtract_int(&cnt.v_wire_count,
 2390                                             1);
 2391                                         vm_page_free_zero(m);
 2392                                         return (NULL);
 2393                                 }
 2394                         } else {
 2395                                 /* Add reference to the pd page */
 2396                                 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
 2397                                 pdpg->wire_count++;
 2398                         }
 2399                 }
 2400                 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
 2401 
 2402                 /* Now we know where the page directory page is */
 2403                 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
 2404                 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 2405         }
 2406 
 2407         pmap_resident_count_inc(pmap, 1);
 2408 
 2409         return (m);
 2410 }
 2411 
 2412 static vm_page_t
 2413 pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 2414 {
 2415         vm_pindex_t pdpindex, ptepindex;
 2416         pdp_entry_t *pdpe, PG_V;
 2417         vm_page_t pdpg;
 2418 
 2419         PG_V = pmap_valid_bit(pmap);
 2420 
 2421 retry:
 2422         pdpe = pmap_pdpe(pmap, va);
 2423         if (pdpe != NULL && (*pdpe & PG_V) != 0) {
 2424                 /* Add a reference to the pd page. */
 2425                 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
 2426                 pdpg->wire_count++;
 2427         } else {
 2428                 /* Allocate a pd page. */
 2429                 ptepindex = pmap_pde_pindex(va);
 2430                 pdpindex = ptepindex >> NPDPEPGSHIFT;
 2431                 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
 2432                 if (pdpg == NULL && lockp != NULL)
 2433                         goto retry;
 2434         }
 2435         return (pdpg);
 2436 }
 2437 
 /*
  * Return the page table page that maps "va" in "pmap", with its wire count
  * raised by one.  A valid 2MB mapping covering "va" is demoted first.  If
  * the page directory entry is valid afterwards, the existing page table
  * page is reused; otherwise a new one is requested from _pmap_allocpte().
  * When that request fails the whole lookup is retried if "lockp" is
  * non-NULL, and NULL is returned if "lockp" is NULL.
  */
 2438 static vm_page_t
 2439 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 2440 {
 2441         vm_pindex_t ptepindex;
 2442         pd_entry_t *pd, PG_V;
 2443         vm_page_t m;
 2444 
 2445         PG_V = pmap_valid_bit(pmap);
 2446 
 2447         /*
 2448          * Calculate pagetable page index
 2449          */
 2450         ptepindex = pmap_pde_pindex(va);
 2451 retry:
 2452         /*
 2453          * Get the page directory entry
 2454          */
 2455         pd = pmap_pde(pmap, va);
 2456 
 2457         /*
 2458          * This supports switching from a 2MB page to a
 2459          * normal 4K page.
 2460          */
 2461         if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
 2462                 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
 2463                         /*
 2464                          * Invalidation of the 2MB page mapping may have caused
 2465                          * the deallocation of the underlying PD page.
 2466                          */
 2467                         pd = NULL;
 2468                 }
 2469         }
 2470 
 2471         /*
 2472          * If the page table page is mapped, we just increment its
 2473          * wire count.
 2474          */
 2475         if (pd != NULL && (*pd & PG_V) != 0) {
 2476                 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
 2477                 m->wire_count++;
 2478         } else {
 2479                 /*
 2480                  * Here if the pte page isn't mapped, or if it has been
 2481                  * deallocated.
 2482                  */
 2483                 m = _pmap_allocpte(pmap, ptepindex, lockp);
                      /* A failed allocation is only retried when lockp was given. */
 2484                 if (m == NULL && lockp != NULL)
 2485                         goto retry;
 2486         }
 2487         return (m);
 2488 }
 2489 
 2490 
 2491 /***************************************************
 2492  * Pmap allocation/deallocation routines.
 2493  ***************************************************/
 2494 
 2495 /*
 2496  * Release any resources held by the given physical map.
 2497  * Called when a pmap initialized by pmap_pinit is being released.
 2498  * Should only be called if the map contains no valid mappings.
 2499  */
 2500 void
 2501 pmap_release(pmap_t pmap)
 2502 {
 2503         vm_page_t m;
 2504         int i;
 2505 
 2506         KASSERT(pmap->pm_stats.resident_count == 0,
 2507             ("pmap_release: pmap resident count %ld != 0",
 2508             pmap->pm_stats.resident_count));
 2509         KASSERT(vm_radix_is_empty(&pmap->pm_root),
 2510             ("pmap_release: pmap has reserved page table page(s)"));
 2511 
 2512         if (pmap_pcid_enabled) {
 2513                 /*
 2514                  * Invalidate any left TLB entries, to allow the reuse
 2515                  * of the pcid.
 2516                  */
 2517                 pmap_invalidate_all(pmap);
 2518         }
 2519 
 2520         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
 2521 
 2522         for (i = 0; i < NKPML4E; i++)   /* KVA */
 2523                 pmap->pm_pml4[KPML4BASE + i] = 0;
 2524         for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
 2525                 pmap->pm_pml4[DMPML4I + i] = 0;
 2526         pmap->pm_pml4[PML4PML4I] = 0;   /* Recursive Mapping */
 2527 
 2528         m->wire_count--;
 2529         atomic_subtract_int(&cnt.v_wire_count, 1);
 2530         vm_page_free_zero(m);
 2531         if (pmap->pm_pcid != -1)
 2532                 free_unr(&pcid_unr, pmap->pm_pcid);
 2533 }
 2534 
 2535 static int
 2536 kvm_size(SYSCTL_HANDLER_ARGS)
 2537 {
 2538         unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
 2539 
 2540         return sysctl_handle_long(oidp, &ksize, 0, req);
 2541 }
 2542 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 
 2543     0, 0, kvm_size, "LU", "Size of KVM");
 2544 
 2545 static int
 2546 kvm_free(SYSCTL_HANDLER_ARGS)
 2547 {
 2548         unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
 2549 
 2550         return sysctl_handle_long(oidp, &kfree, 0, req);
 2551 }
 2552 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 
 2553     0, 0, kvm_free, "LU", "Amount of KVM free");
 2554 
 2555 /*
 2556  * grow the number of kernel page table entries, if needed
       *
       * Extends the kernel page tables until "kernel_vm_end" reaches "addr"
       * (rounded up to an NBPDR boundary and clamped to
       * kernel_map->max_offset).  Missing page directory pages and page table
       * pages are allocated, zeroed and installed one at a time.  The kernel
       * map's system mutex must be held by the caller.  Panics if a page
       * cannot be allocated.
 2557  */
 2558 void
 2559 pmap_growkernel(vm_offset_t addr)
 2560 {
 2561         vm_paddr_t paddr;
 2562         vm_page_t nkpg;
 2563         pd_entry_t *pde, newpdir;
 2564         pdp_entry_t *pdpe;
 2565 
 2566         mtx_assert(&kernel_map->system_mtx, MA_OWNED);
 2567 
 2568         /*
 2569          * Return if "addr" is within the range of kernel page table pages
 2570          * that were preallocated during pmap bootstrap.  Moreover, leave
 2571          * "kernel_vm_end" and the kernel page table as they were.
 2572          *
 2573          * The correctness of this action is based on the following
 2574          * argument: vm_map_findspace() allocates contiguous ranges of the
 2575          * kernel virtual address space.  It calls this function if a range
 2576          * ends after "kernel_vm_end".  If the kernel is mapped between
 2577          * "kernel_vm_end" and "addr", then the range cannot begin at
 2578          * "kernel_vm_end".  In fact, its beginning address cannot be less
 2579          * than the kernel.  Thus, there is no immediate need to allocate
 2580          * any new kernel page table pages between "kernel_vm_end" and
 2581          * "KERNBASE".
 2582          */
 2583         if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
 2584                 return;
 2585 
 2586         addr = roundup2(addr, NBPDR);
              /*
               * Clamp to the end of the kernel map.  NOTE(review): comparing
               * "addr - 1" presumably also catches roundup2() wrapping to
               * zero, assuming vm_offset_t is unsigned -- confirm.
               */
 2587         if (addr - 1 >= kernel_map->max_offset)
 2588                 addr = kernel_map->max_offset;
 2589         while (kernel_vm_end < addr) {
 2590                 pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
 2591                 if ((*pdpe & X86_PG_V) == 0) {
 2592                         /* We need a new PDP entry */
 2593                         nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
 2594                             VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
 2595                             VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 2596                         if (nkpg == NULL)
 2597                                 panic("pmap_growkernel: no memory to grow kernel");
                              /* VM_ALLOC_ZERO is only a hint; zero the page if needed. */
 2598                         if ((nkpg->flags & PG_ZERO) == 0)
 2599                                 pmap_zero_page(nkpg);
 2600                         paddr = VM_PAGE_TO_PHYS(nkpg);
 2601                         *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
 2602                             X86_PG_A | X86_PG_M);
 2603                         continue; /* try again */
 2604                 }
 2605                 pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
 2606                 if ((*pde & X86_PG_V) != 0) {
                              /* Already mapped: step to the next NBPDR boundary. */
 2607                         kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
 2608                         if (kernel_vm_end - 1 >= kernel_map->max_offset) {
 2609                                 kernel_vm_end = kernel_map->max_offset;
 2610                                 break;                       
 2611                         }
 2612                         continue;
 2613                 }
 2614 
                      /* Allocate and install a new page table page for this PDE. */
 2615                 nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
 2616                     VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
 2617                     VM_ALLOC_ZERO);
 2618                 if (nkpg == NULL)
 2619                         panic("pmap_growkernel: no memory to grow kernel");
 2620                 if ((nkpg->flags & PG_ZERO) == 0)
 2621                         pmap_zero_page(nkpg);
 2622                 paddr = VM_PAGE_TO_PHYS(nkpg);
 2623                 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
 2624                 pde_store(pde, newpdir);
 2625 
 2626                 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
 2627                 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
 2628                         kernel_vm_end = kernel_map->max_offset;
 2629                         break;                       
 2630                 }
 2631         }
 2632 }
 2633 
 2634 
 2635 /***************************************************
 2636  * page management routines.
 2637  ***************************************************/
 2638 
 /*
  * Compile-time checks on the pv chunk layout: a chunk occupies exactly one
  * page, its free bitmap has 3 words, and it holds 168 pv entries.
  */
 2639 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
 2640 CTASSERT(_NPCM == 3);
 2641 CTASSERT(_NPCPV == 168);
 2642 
 2643 static __inline struct pv_chunk *
 2644 pv_to_chunk(pv_entry_t pv)
 2645 {
 2646 
 2647         return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
 2648 }
 2649 
 /* The pmap that owns the chunk containing the given pv entry. */
 2650 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
 2651 
      /*
       * Values of the three pc_map[] words when every entry of a chunk is
       * free (a set bit marks a free entry).  The last word only uses its
       * low 40 bits: 64 + 64 + 40 = 168 entries.
       */
 2652 #define PC_FREE0        0xfffffffffffffffful
 2653 #define PC_FREE1        0xfffffffffffffffful
 2654 #define PC_FREE2        0x000000fffffffffful
 2655 
      /* The same masks, indexable by bitmap word. */
 2656 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
 2657 
 /*
  * Optional pv entry statistics, compiled in only when PV_STATS is defined.
  * The counters are exported read-only as sysctls under the _vm_pmap node
  * and are updated through PV_STAT() in the pv entry routines.
  */
 2658 #ifdef PV_STATS
      /* Chunk-level counters. */
 2659 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
 2660 
 2661 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
 2662         "Current number of pv entry chunks");
 2663 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
 2664         "Current number of pv entry chunks allocated");
 2665 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
 2666         "Current number of pv entry chunks frees");
 2667 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
 2668         "Number of times tried to get a chunk page but failed.");
 2669 
      /* Entry-level counters. */
 2670 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
 2671 static int pv_entry_spare;
 2672 
 2673 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
 2674         "Current number of pv entry frees");
 2675 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
 2676         "Current number of pv entry allocs");
 2677 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
 2678         "Current number of pv entries");
 2679 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
 2680         "Current number of spare pv entries");
 2681 #endif
 2682 
 2683 /*
 2684  * We are in a serious low memory condition.  Resort to
 2685  * drastic measures to free some pages so we can allocate
 2686  * another pv entry chunk.
 2687  *
 2688  * Returns NULL if PV entries were reclaimed from the specified pmap.
       * Otherwise the return value is a page that the caller may use for a
       * new pv chunk: either the page backing a chunk that became entirely
       * free, or a page table page that was released while unmapping (its
       * wire count is set back to 1).  NULL is also returned when nothing
       * could be reclaimed at all.
       *
       * The caller must hold pvh_global_lock and the lock of "locked_pmap",
       * and "lockp" must not be NULL.  The PV list lock referenced by
       * "lockp" may be released or changed.
 2689  *
 2690  * We do not, however, unmap 2mpages because subsequent accesses will
 2691  * allocate per-page pv entries until repromotion occurs, thereby
 2692  * exacerbating the shortage of free pv entries.
 2693  */
 2694 static vm_page_t
 2695 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
 2696 {
 2697         struct pch new_tail;
 2698         struct pv_chunk *pc;
 2699         struct md_page *pvh;
 2700         pd_entry_t *pde;
 2701         pmap_t pmap;
 2702         pt_entry_t *pte, tpte;
 2703         pt_entry_t PG_G, PG_A, PG_M, PG_RW;
 2704         pv_entry_t pv;
 2705         vm_offset_t va;
 2706         vm_page_t m, m_pc;
 2707         struct spglist free;
 2708         uint64_t inuse;
 2709         int bit, field, freed;
 2710 
 2711         rw_assert(&pvh_global_lock, RA_LOCKED);
 2712         PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
 2713         KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
 2714         pmap = NULL;
 2715         m_pc = NULL;
 2716         PG_G = PG_A = PG_M = PG_RW = 0;
 2717         SLIST_INIT(&free);
              /*
               * Chunks that have been examined but not freed are parked on
               * "new_tail" and appended back to pv_chunks after the loop.
               */
 2718         TAILQ_INIT(&new_tail);
 2719         mtx_lock(&pv_chunks_mutex);
              /*
               * Walk pv_chunks from the head until the list is exhausted or
               * a page table page has been released onto "free".
               */
 2720         while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) {
 2721                 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 2722                 mtx_unlock(&pv_chunks_mutex);
 2723                 if (pmap != pc->pc_pmap) {
                              /* Switching pmaps: flush and unlock the previous one. */
 2724                         if (pmap != NULL) {
 2725                                 pmap_invalidate_all(pmap);
 2726                                 if (pmap != locked_pmap)
 2727                                         PMAP_UNLOCK(pmap);
 2728                         }
 2729                         pmap = pc->pc_pmap;
 2730                         /* Avoid deadlock and lock recursion. */
                              /*
                               * A pmap at a higher address than locked_pmap is
                               * locked unconditionally after dropping the PV
                               * list lock; any other pmap is only try-locked,
                               * and its chunk is skipped if that fails.
                               */
 2731                         if (pmap > locked_pmap) {
 2732                                 RELEASE_PV_LIST_LOCK(lockp);
 2733                                 PMAP_LOCK(pmap);
 2734                         } else if (pmap != locked_pmap &&
 2735                             !PMAP_TRYLOCK(pmap)) {
 2736                                 pmap = NULL;
 2737                                 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
 2738                                 mtx_lock(&pv_chunks_mutex);
 2739                                 continue;
 2740                         }
                              /* Fetch the PTE bit masks that apply to this pmap. */
 2741                         PG_G = pmap_global_bit(pmap);
 2742                         PG_A = pmap_accessed_bit(pmap);
 2743                         PG_M = pmap_modified_bit(pmap);
 2744                         PG_RW = pmap_rw_bit(pmap);
 2745                 }
 2746 
 2747                 /*
 2748                  * Destroy every non-wired, 4 KB page mapping in the chunk.
 2749                  */
 2750                 freed = 0;
 2751                 for (field = 0; field < _NPCM; field++) {
                              /*
                               * A clear bit in pc_map[] marks an allocated pv
                               * entry; visit each one, lowest bit first.
                               */
 2752                         for (inuse = ~pc->pc_map[field] & pc_freemask[field];
 2753                             inuse != 0; inuse &= ~(1UL << bit)) {
 2754                                 bit = bsfq(inuse);
 2755                                 pv = &pc->pc_pventry[field * 64 + bit];
 2756                                 va = pv->pv_va;
 2757                                 pde = pmap_pde(pmap, va);
                                      /* Leave 2MB mappings alone (see above). */
 2758                                 if ((*pde & PG_PS) != 0)
 2759                                         continue;
 2760                                 pte = pmap_pde_to_pte(pde, va);
                                      /* Leave wired mappings alone. */
 2761                                 if ((*pte & PG_W) != 0)
 2762                                         continue;
 2763                                 tpte = pte_load_clear(pte);
 2764                                 if ((tpte & PG_G) != 0)
 2765                                         pmap_invalidate_page(pmap, va);
                                      /* Carry the dirty/referenced state over to the page. */
 2766                                 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
 2767                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 2768                                         vm_page_dirty(m);
 2769                                 if ((tpte & PG_A) != 0)
 2770                                         vm_page_aflag_set(m, PGA_REFERENCED);
 2771                                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 2772                                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 2773                                 m->md.pv_gen++;
                                      /*
                                       * With no 4KB and no 2MB pv entries left,
                                       * the page is no longer writeable anywhere.
                                       */
 2774                                 if (TAILQ_EMPTY(&m->md.pv_list) &&
 2775                                     (m->flags & PG_FICTITIOUS) == 0) {
 2776                                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 2777                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
 2778                                                 vm_page_aflag_clear(m,
 2779                                                     PGA_WRITEABLE);
 2780                                         }
 2781                                 }
                                      /* Mark the pv entry free in the chunk's bitmap. */
 2782                                 pc->pc_map[field] |= 1UL << bit;
 2783                                 pmap_unuse_pt(pmap, va, *pde, &free);
 2784                                 freed++;
 2785                         }
 2786                 }
 2787                 if (freed == 0) {
 2788                         TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
 2789                         mtx_lock(&pv_chunks_mutex);
 2790                         continue;
 2791                 }
 2792                 /* Every freed mapping is for a 4 KB page. */
 2793                 pmap_resident_count_dec(pmap, freed);
 2794                 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
 2795                 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
 2796                 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 2797                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 2798                 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
 2799                     pc->pc_map[2] == PC_FREE2) {
 2800                         PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
 2801                         PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
 2802                         PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
 2803                         /* Entire chunk is free; return it. */
 2804                         m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 2805                         dump_drop_page(m_pc->phys_addr);
 2806                         mtx_lock(&pv_chunks_mutex);
 2807                         break;
 2808                 }
                      /*
                       * The chunk now has spare entries: move it to the head of
                       * its pmap's chunk list and requeue it on the LRU list.
                       */
 2809                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 2810                 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
 2811                 mtx_lock(&pv_chunks_mutex);
 2812                 /* One freed pv entry in locked_pmap is sufficient. */
 2813                 if (pmap == locked_pmap)
 2814                         break;
 2815         }
 2816         TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
 2817         mtx_unlock(&pv_chunks_mutex);
 2818         if (pmap != NULL) {
 2819                 pmap_invalidate_all(pmap);
 2820                 if (pmap != locked_pmap)
 2821                         PMAP_UNLOCK(pmap);
 2822         }
 2823         if (m_pc == NULL && !SLIST_EMPTY(&free)) {
 2824                 m_pc = SLIST_FIRST(&free);
 2825                 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
 2826                 /* Recycle a freed page table page. */
 2827                 m_pc->wire_count = 1;
 2828                 atomic_add_int(&cnt.v_wire_count, 1);
 2829         }
              /* Release any remaining page table pages collected on "free". */
 2830         pmap_free_zero_pages(&free);
 2831         return (m_pc);
 2832 }
 2833 
 2834 /*
 2835  * free the pv_entry back to the free list
 2836  */
 2837 static void
 2838 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 2839 {
 2840         struct pv_chunk *pc;
 2841         int idx, field, bit;
 2842 
 2843         rw_assert(&pvh_global_lock, RA_LOCKED);
 2844         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2845         PV_STAT(atomic_add_long(&pv_entry_frees, 1));
 2846         PV_STAT(atomic_add_int(&pv_entry_spare, 1));
 2847         PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
 2848         pc = pv_to_chunk(pv);
 2849         idx = pv - &pc->pc_pventry[0];
 2850         field = idx / 64;
 2851         bit = idx % 64;
 2852         pc->pc_map[field] |= 1ul << bit;
 2853         if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
 2854             pc->pc_map[2] != PC_FREE2) {
 2855                 /* 98% of the time, pc is already at the head of the list. */
 2856                 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
 2857                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 2858                         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 2859                 }
 2860                 return;
 2861         }
 2862         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 2863         free_pv_chunk(pc);
 2864 }
 2865 
 2866 static void
 2867 free_pv_chunk(struct pv_chunk *pc)
 2868 {
 2869         vm_page_t m;
 2870 
 2871         mtx_lock(&pv_chunks_mutex);
 2872         TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 2873         mtx_unlock(&pv_chunks_mutex);
 2874         PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
 2875         PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
 2876         PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
 2877         /* entire chunk is free, return it */
 2878         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 2879         dump_drop_page(m->phys_addr);
 2880         vm_page_unwire(m, 0);
 2881         vm_page_free(m);
 2882 }
 2883 
 2884 /*
 2885  * Returns a new PV entry, allocating a new PV chunk from the system when
 2886  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
 2887  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
 2888  * returned.
 2889  *
 2890  * The given PV list lock may be released.
       *
       * The caller must hold pvh_global_lock and the pmap lock.
 2891  */
 2892 static pv_entry_t
 2893 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
 2894 {
 2895         int bit, field;
 2896         pv_entry_t pv;
 2897         struct pv_chunk *pc;
 2898         vm_page_t m;
 2899 
 2900         rw_assert(&pvh_global_lock, RA_LOCKED);
 2901         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2902         PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
 2903 retry:
              /*
               * Only the first chunk on the pmap's list is inspected; chunks
               * with spare entries are kept at the head and full ones are
               * moved to the tail.
               */
 2904         pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 2905         if (pc != NULL) {
                      /* Find the first bitmap word with a set (free) bit. */
 2906                 for (field = 0; field < _NPCM; field++) {
 2907                         if (pc->pc_map[field]) {
 2908                                 bit = bsfq(pc->pc_map[field]);
 2909                                 break;
 2910                         }
 2911                 }
                      /* "bit" is only valid when the search above succeeded. */
 2912                 if (field < _NPCM) {
 2913                         pv = &pc->pc_pventry[field * 64 + bit];
 2914                         pc->pc_map[field] &= ~(1ul << bit);
 2915                         /* If this was the last item, move it to tail */
 2916                         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
 2917                             pc->pc_map[2] == 0) {
 2918                                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 2919                                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
 2920                                     pc_list);
 2921                         }
 2922                         PV_STAT(atomic_add_long(&pv_entry_count, 1));
 2923                         PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
 2924                         return (pv);
 2925                 }
 2926         }
 2927         /* No free items, allocate another chunk */
 2928         m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 2929             VM_ALLOC_WIRED);
 2930         if (m == NULL) {
                      /* Without a PV list lock pointer we may not reclaim. */
 2931                 if (lockp == NULL) {
 2932                         PV_STAT(pc_chunk_tryfail++);
 2933                         return (NULL);
 2934                 }
 2935                 m = reclaim_pv_chunk(pmap, lockp);
                      /*
                       * No page came back; reclaim_pv_chunk() may have freed
                       * entries in this pmap instead, so look again.
                       */
 2936                 if (m == NULL)
 2937                         goto retry;
 2938         }
 2939         PV_STAT(atomic_add_int(&pc_chunk_count, 1));
 2940         PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
 2941         dump_add_page(m->phys_addr);
              /* The new chunk is accessed through the direct map. */
 2942         pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 2943         pc->pc_pmap = pmap;
 2944         pc->pc_map[0] = PC_FREE0 & ~1ul;        /* preallocated bit 0 */
 2945         pc->pc_map[1] = PC_FREE1;
 2946         pc->pc_map[2] = PC_FREE2;
 2947         mtx_lock(&pv_chunks_mutex);
 2948         TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 2949         mtx_unlock(&pv_chunks_mutex);
              /* Entry 0 is the one whose bit was cleared above. */
 2950         pv = &pc->pc_pventry[0];
 2951         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 2952         PV_STAT(atomic_add_long(&pv_entry_count, 1));
 2953         PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
 2954         return (pv);
 2955 }
 2956 
 /*
  * Count the one bits in a single PV chunk map element.
  *
  * Each iteration clears the lowest set bit, so the loop runs once per one
  * bit.  That suits this caller because a map element typically contains
  * more zero bits than one bits.
  */
 static int
 popcnt_pc_map_elem(uint64_t elem)
 {
         int ones = 0;

         while (elem != 0) {
                 elem &= elem - 1;       /* drop the lowest set bit */
                 ones++;
         }
         return (ones);
 }
 2974 
 2975 /*
 2976  * Ensure that the number of spare PV entries in the specified pmap meets or
 2977  * exceeds the given count, "needed".
 2978  *
 2979  * The given PV list lock may be released.
 2980  */
 2981 static void
 2982 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
 2983 {
 2984         struct pch new_tail;
 2985         struct pv_chunk *pc;
 2986         int avail, free;
 2987         vm_page_t m;
 2988 
 2989         rw_assert(&pvh_global_lock, RA_LOCKED);
 2990         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2991         KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
 2992 
 2993         /*
 2994          * Newly allocated PV chunks must be stored in a private list until
 2995          * the required number of PV chunks have been allocated.  Otherwise,
 2996          * reclaim_pv_chunk() could recycle one of these chunks.  In
 2997          * contrast, these chunks must be added to the pmap upon allocation.
 2998          */
 2999         TAILQ_INIT(&new_tail);
 3000 retry:
 3001         avail = 0;
 3002         TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
 3003                 if ((cpu_feature2 & CPUID2_POPCNT) == 0) {
 3004                         free = popcnt_pc_map_elem(pc->pc_map[0]);
 3005                         free += popcnt_pc_map_elem(pc->pc_map[1]);
 3006                         free += popcnt_pc_map_elem(pc->pc_map[2]);
 3007                 } else {
 3008                         free = popcntq(pc->pc_map[0]);
 3009                         free += popcntq(pc->pc_map[1]);
 3010                         free += popcntq(pc->pc_map[2]);
 3011                 }
 3012                 if (free == 0)
 3013                         break;
 3014                 avail += free;
 3015                 if (avail >= needed)
 3016                         break;
 3017         }
 3018         for (; avail < needed; avail += _NPCPV) {
 3019                 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 3020                     VM_ALLOC_WIRED);
 3021                 if (m == NULL) {
 3022                         m = reclaim_pv_chunk(pmap, lockp);
 3023                         if (m == NULL)
 3024                                 goto retry;
 3025                 }
 3026                 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
 3027                 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
 3028                 dump_add_page(m->phys_addr);
 3029                 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 3030                 pc->pc_pmap = pmap;
 3031                 pc->pc_map[0] = PC_FREE0;
 3032                 pc->pc_map[1] = PC_FREE1;
 3033                 pc->pc_map[2] = PC_FREE2;
 3034                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 3035                 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
 3036                 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
 3037         }
 3038         if (!TAILQ_EMPTY(&new_tail)) {
 3039                 mtx_lock(&pv_chunks_mutex);
 3040                 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
 3041                 mtx_unlock(&pv_chunks_mutex);
 3042         }
 3043 }
 3044 
 3045 /*
 3046  * First find and then remove the pv entry for the specified pmap and virtual
 3047  * address from the specified pv list.  Returns the pv entry if found and NULL
 3048  * otherwise.  This operation can be performed on pv lists for either 4KB or
 3049  * 2MB page mappings.
 3050  */
 3051 static __inline pv_entry_t
 3052 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 3053 {
 3054         pv_entry_t pv;
 3055 
 3056         rw_assert(&pvh_global_lock, RA_LOCKED);
 3057         TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 3058                 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
 3059                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 3060                         pvh->pv_gen++;
 3061                         break;
 3062                 }
 3063         }
 3064         return (pv);
 3065 }
 3066 
 3067 /*
 3068  * After demotion from a 2MB page mapping to 512 4KB page mappings,
 3069  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
 3070  * entries for each of the 4KB page mappings.
       *
       * "pa" must be 2MB aligned.  The caller must hold pvh_global_lock and
       * the pmap lock.  The pmap must already own enough spare pv entries for
       * the remaining NPTEPG - 1 mappings (presumably set aside with
       * reserve_pv_entries() -- confirm against the callers); nothing is
       * allocated here.
 3071  */
 3072 static void
 3073 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 3074     struct rwlock **lockp)
 3075 {
 3076         struct md_page *pvh;
 3077         struct pv_chunk *pc;
 3078         pv_entry_t pv;
 3079         vm_offset_t va_last;
 3080         vm_page_t m;
 3081         int bit, field;
 3082 
 3083         rw_assert(&pvh_global_lock, RA_LOCKED);
 3084         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3085         KASSERT((pa & PDRMASK) == 0,
 3086             ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
 3087         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 3088 
 3089         /*
 3090          * Transfer the 2mpage's pv entry for this mapping to the first
 3091          * page's pv list.  Once this transfer begins, the pv list lock
 3092          * must not be released until the last pv entry is reinstantiated.
 3093          */
 3094         pvh = pa_to_pvh(pa);
 3095         va = trunc_2mpage(va);
 3096         pv = pmap_pvh_remove(pvh, pmap, va);
 3097         KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
 3098         m = PHYS_TO_VM_PAGE(pa);
 3099         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 3100         m->md.pv_gen++;
 3101         /* Instantiate the remaining NPTEPG - 1 pv entries. */
 3102         PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
              /* Address of the last 4KB page within the 2MB range. */
 3103         va_last = va + NBPDR - PAGE_SIZE;
 3104         for (;;) {
                      /* Spare entries are taken from the chunk at the head. */
 3105                 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 3106                 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
 3107                     pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
 3108                 for (field = 0; field < _NPCM; field++) {
 3109                         while (pc->pc_map[field]) {
                                      /* Claim the lowest free entry of this word. */
 3110                                 bit = bsfq(pc->pc_map[field]);
 3111                                 pc->pc_map[field] &= ~(1ul << bit);
 3112                                 pv = &pc->pc_pventry[field * 64 + bit];
                                      /* Advance to the next 4KB page and its vm_page. */
 3113                                 va += PAGE_SIZE;
 3114                                 pv->pv_va = va;
 3115                                 m++;
 3116                                 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 3117                             ("pmap_pv_demote_pde: page %p is not managed", m));
 3118                                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 3119                                 m->md.pv_gen++;
 3120                                 if (va == va_last)
 3121                                         goto out;
 3122                         }
 3123                 }
                      /* This chunk is exhausted; rotate it to the tail. */
 3124                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 3125                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 3126         }
 3127 out:
              /* If the last chunk used is now full, move it to the tail as well. */
 3128         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
 3129                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 3130                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 3131         }
 3132         PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
 3133         PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
 3134 }
 3135 
 3136 /*
 3137  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
 3138  * replace the many pv entries for the 4KB page mappings by a single pv entry
 3139  * for the 2MB page mapping.
 3140  */
 3141 static void
 3142 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 3143     struct rwlock **lockp)
 3144 {
 3145         struct md_page *pvh;
 3146         pv_entry_t pv;
 3147         vm_offset_t va_last;
 3148         vm_page_t m;
 3149 
 3150         rw_assert(&pvh_global_lock, RA_LOCKED);
 3151         KASSERT((pa & PDRMASK) == 0,
 3152             ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
 3153         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 3154 
 3155         /*
 3156          * Transfer the first page's pv entry for this mapping to the 2mpage's
 3157          * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
 3158          * a transfer avoids the possibility that get_pv_entry() calls
 3159          * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
 3160          * mappings that is being promoted.
 3161          */
 3162         m = PHYS_TO_VM_PAGE(pa);
 3163         va = trunc_2mpage(va);
 3164         pv = pmap_pvh_remove(&m->md, pmap, va);
 3165         KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
 3166         pvh = pa_to_pvh(pa);
 3167         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 3168         pvh->pv_gen++;
 3169         /* Free the remaining NPTEPG - 1 pv entries. */
 3170         va_last = va + NBPDR - PAGE_SIZE;
 3171         do {
 3172                 m++;
 3173                 va += PAGE_SIZE;
 3174                 pmap_pvh_free(&m->md, pmap, va);
 3175         } while (va < va_last);
 3176 }
 3177 
 3178 /*
 3179  * First find and then destroy the pv entry for the specified pmap and virtual
 3180  * address.  This operation can be performed on pv lists for either 4KB or 2MB
 3181  * page mappings.
 3182  */
 3183 static void
 3184 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 3185 {
 3186         pv_entry_t pv;
 3187 
 3188         pv = pmap_pvh_remove(pvh, pmap, va);
 3189         KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
 3190         free_pv_entry(pmap, pv);
 3191 }
 3192 
 3193 /*
 3194  * Conditionally create the PV entry for a 4KB page mapping if the required
 3195  * memory can be allocated without resorting to reclamation.
 3196  */
 3197 static boolean_t
 3198 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
 3199     struct rwlock **lockp)
 3200 {
 3201         pv_entry_t pv;
 3202 
 3203         rw_assert(&pvh_global_lock, RA_LOCKED);
 3204         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3205         /* Pass NULL instead of the lock pointer to disable reclamation. */
 3206         if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
 3207                 pv->pv_va = va;
 3208                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 3209                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 3210                 m->md.pv_gen++;
 3211                 return (TRUE);
 3212         } else
 3213                 return (FALSE);
 3214 }
 3215 
 3216 /*
 3217  * Conditionally create the PV entry for a 2MB page mapping if the required
 3218  * memory can be allocated without resorting to reclamation.
 3219  */
 3220 static boolean_t
 3221 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 3222     struct rwlock **lockp)
 3223 {
 3224         struct md_page *pvh;
 3225         pv_entry_t pv;
 3226 
 3227         rw_assert(&pvh_global_lock, RA_LOCKED);
 3228         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3229         /* Pass NULL instead of the lock pointer to disable reclamation. */
 3230         if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
 3231                 pv->pv_va = va;
 3232                 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 3233                 pvh = pa_to_pvh(pa);
 3234                 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 3235                 pvh->pv_gen++;
 3236                 return (TRUE);
 3237         } else
 3238                 return (FALSE);
 3239 }
 3240 
 3241 /*
 3242  * Fills a page table page with mappings to consecutive physical pages.
 3243  */
 3244 static void
 3245 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
 3246 {
 3247         pt_entry_t *pte;
 3248 
 3249         for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
 3250                 *pte = newpte;
 3251                 newpte += PAGE_SIZE;
 3252         }
 3253 }
 3254 
 3255 /*
 3256  * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
 3257  * mapping is invalidated.
 3258  */
 3259 static boolean_t
 3260 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 3261 {
 3262         struct rwlock *lock;
 3263         boolean_t rv;
 3264 
 3265         lock = NULL;
 3266         rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
 3267         if (lock != NULL)
 3268                 rw_wunlock(lock);
 3269         return (rv);
 3270 }
 3271 
 3272 static boolean_t
 3273 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
 3274     struct rwlock **lockp)
 3275 {
 3276         pd_entry_t newpde, oldpde;
 3277         pt_entry_t *firstpte, newpte;
 3278         pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V;
 3279         vm_paddr_t mptepa;
 3280         vm_page_t mpte;
 3281         struct spglist free;
 3282         int PG_PTE_CACHE;
 3283 
 3284         PG_G = pmap_global_bit(pmap);
 3285         PG_A = pmap_accessed_bit(pmap);
 3286         PG_M = pmap_modified_bit(pmap);
 3287         PG_RW = pmap_rw_bit(pmap);
 3288         PG_V = pmap_valid_bit(pmap);
 3289         PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
 3290 
 3291         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3292         oldpde = *pde;
 3293         KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
 3294             ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
 3295         if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) !=
 3296             NULL)
 3297                 pmap_remove_pt_page(pmap, mpte);
 3298         else {
 3299                 KASSERT((oldpde & PG_W) == 0,
 3300                     ("pmap_demote_pde: page table page for a wired mapping"
 3301                     " is missing"));
 3302 
 3303                 /*
 3304                  * Invalidate the 2MB page mapping and return "failure" if the
 3305                  * mapping was never accessed or the allocation of the new
 3306                  * page table page fails.  If the 2MB page mapping belongs to
 3307                  * the direct map region of the kernel's address space, then
 3308                  * the page allocation request specifies the highest possible
 3309                  * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
 3310                  * normal.  Page table pages are preallocated for every other
 3311                  * part of the kernel address space, so the direct map region
 3312                  * is the only part of the kernel address space that must be
 3313                  * handled here.
 3314                  */
 3315                 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
 3316                     pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
 3317                     DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
 3318                     VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 3319                         SLIST_INIT(&free);
 3320                         pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free,
 3321                             lockp);
 3322                         pmap_invalidate_page(pmap, trunc_2mpage(va));
 3323                         pmap_free_zero_pages(&free);
 3324                         CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
 3325                             " in pmap %p", va, pmap);
 3326                         return (FALSE);
 3327                 }
 3328                 if (va < VM_MAXUSER_ADDRESS)
 3329                         pmap_resident_count_inc(pmap, 1);
 3330         }
 3331         mptepa = VM_PAGE_TO_PHYS(mpte);
 3332         firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
 3333         newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
 3334         KASSERT((oldpde & PG_A) != 0,
 3335             ("pmap_demote_pde: oldpde is missing PG_A"));
 3336         KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
 3337             ("pmap_demote_pde: oldpde is missing PG_M"));
 3338         newpte = oldpde & ~PG_PS;
 3339         newpte = pmap_swap_pat(pmap, newpte);
 3340 
 3341         /*
 3342          * If the page table page is new, initialize it.
 3343          */
 3344         if (mpte->wire_count == 1) {
 3345                 mpte->wire_count = NPTEPG;
 3346                 pmap_fill_ptp(firstpte, newpte);
 3347         }
 3348         KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
 3349             ("pmap_demote_pde: firstpte and newpte map different physical"
 3350             " addresses"));
 3351 
 3352         /*
 3353          * If the mapping has changed attributes, update the page table
 3354          * entries.
 3355          */
 3356         if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
 3357                 pmap_fill_ptp(firstpte, newpte);
 3358 
 3359         /*
 3360          * The spare PV entries must be reserved prior to demoting the
 3361          * mapping, that is, prior to changing the PDE.  Otherwise, the state
 3362          * of the PDE and the PV lists will be inconsistent, which can result
 3363          * in reclaim_pv_chunk() attempting to remove a PV entry from the
 3364          * wrong PV list and pmap_pv_demote_pde() failing to find the expected
 3365          * PV entry for the 2MB page mapping that is being demoted.
 3366          */
 3367         if ((oldpde & PG_MANAGED) != 0)
 3368                 reserve_pv_entries(pmap, NPTEPG - 1, lockp);
 3369 
 3370         /*
 3371          * Demote the mapping.  This pmap is locked.  The old PDE has
 3372          * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
 3373          * set.  Thus, there is no danger of a race with another
 3374          * processor changing the setting of PG_A and/or PG_M between
 3375          * the read above and the store below. 
 3376          */
 3377         if (workaround_erratum383)
 3378                 pmap_update_pde(pmap, va, pde, newpde);
 3379         else
 3380                 pde_store(pde, newpde);
 3381 
 3382         /*
 3383          * Invalidate a stale recursive mapping of the page table page.
 3384          */
 3385         if (va >= VM_MAXUSER_ADDRESS)
 3386                 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
 3387 
 3388         /*
 3389          * Demote the PV entry.
 3390          */
 3391         if ((oldpde & PG_MANAGED) != 0)
 3392                 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
 3393 
 3394         atomic_add_long(&pmap_pde_demotions, 1);
 3395         CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
 3396             " in pmap %p", va, pmap);
 3397         return (TRUE);
 3398 }
 3399 
 3400 /*
 3401  * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
 3402  */
 3403 static void
 3404 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 3405 {
 3406         pd_entry_t newpde;
 3407         vm_paddr_t mptepa;
 3408         vm_page_t mpte;
 3409 
 3410         KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
 3411         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3412         mpte = pmap_lookup_pt_page(pmap, va);
 3413         if (mpte == NULL)
 3414                 panic("pmap_remove_kernel_pde: Missing pt page.");
 3415 
 3416         pmap_remove_pt_page(pmap, mpte);
 3417         mptepa = VM_PAGE_TO_PHYS(mpte);
 3418         newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
 3419 
 3420         /*
 3421          * Initialize the page table page.
 3422          */
 3423         pagezero((void *)PHYS_TO_DMAP(mptepa));
 3424 
 3425         /*
 3426          * Demote the mapping.
 3427          */
 3428         if (workaround_erratum383)
 3429                 pmap_update_pde(pmap, va, pde, newpde);
 3430         else
 3431                 pde_store(pde, newpde);
 3432 
 3433         /*
 3434          * Invalidate a stale recursive mapping of the page table page.
 3435          */
 3436         pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
 3437 }
 3438 
 3439 /*
 3440  * pmap_remove_pde: do the things to unmap a superpage in a process
 3441  */
 3442 static int
 3443 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
 3444     struct spglist *free, struct rwlock **lockp)
 3445 {
 3446         struct md_page *pvh;
 3447         pd_entry_t oldpde;
 3448         vm_offset_t eva, va;
 3449         vm_page_t m, mpte;
 3450         pt_entry_t PG_G, PG_A, PG_M, PG_RW;
 3451 
 3452         PG_G = pmap_global_bit(pmap);
 3453         PG_A = pmap_accessed_bit(pmap);
 3454         PG_M = pmap_modified_bit(pmap);
 3455         PG_RW = pmap_rw_bit(pmap);
 3456 
 3457         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3458         KASSERT((sva & PDRMASK) == 0,
 3459             ("pmap_remove_pde: sva is not 2mpage aligned"));
 3460         oldpde = pte_load_clear(pdq);
 3461         if (oldpde & PG_W)
 3462                 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
 3463 
 3464         /*
 3465          * Machines that don't support invlpg, also don't support
 3466          * PG_G.
 3467          */
 3468         if (oldpde & PG_G)
 3469                 pmap_invalidate_page(kernel_pmap, sva);
 3470         pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
 3471         if (oldpde & PG_MANAGED) {
 3472                 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
 3473                 pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
 3474                 pmap_pvh_free(pvh, pmap, sva);
 3475                 eva = sva + NBPDR;
 3476                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 3477                     va < eva; va += PAGE_SIZE, m++) {
 3478                         if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
 3479                                 vm_page_dirty(m);
 3480                         if (oldpde & PG_A)
 3481                                 vm_page_aflag_set(m, PGA_REFERENCED);
 3482                         if (TAILQ_EMPTY(&m->md.pv_list) &&
 3483                             TAILQ_EMPTY(&pvh->pv_list))
 3484                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
 3485                 }
 3486         }
 3487         if (pmap == kernel_pmap) {
 3488                 pmap_remove_kernel_pde(pmap, pdq, sva);
 3489         } else {
 3490                 mpte = pmap_lookup_pt_page(pmap, sva);
 3491                 if (mpte != NULL) {
 3492                         pmap_remove_pt_page(pmap, mpte);
 3493                         pmap_resident_count_dec(pmap, 1);
 3494                         KASSERT(mpte->wire_count == NPTEPG,
 3495                             ("pmap_remove_pde: pte page wire count error"));
 3496                         mpte->wire_count = 0;
 3497                         pmap_add_delayed_free_list(mpte, free, FALSE);
 3498                         atomic_subtract_int(&cnt.v_wire_count, 1);
 3499                 }
 3500         }
 3501         return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
 3502 }
 3503 
 3504 /*
 3505  * pmap_remove_pte: do the things to unmap a page in a process
 3506  */
 3507 static int
 3508 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 
 3509     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
 3510 {
 3511         struct md_page *pvh;
 3512         pt_entry_t oldpte, PG_A, PG_M, PG_RW;
 3513         vm_page_t m;
 3514 
 3515         PG_A = pmap_accessed_bit(pmap);
 3516         PG_M = pmap_modified_bit(pmap);
 3517         PG_RW = pmap_rw_bit(pmap);
 3518 
 3519         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3520         oldpte = pte_load_clear(ptq);
 3521         if (oldpte & PG_W)
 3522                 pmap->pm_stats.wired_count -= 1;
 3523         pmap_resident_count_dec(pmap, 1);
 3524         if (oldpte & PG_MANAGED) {
 3525                 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
 3526                 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 3527                         vm_page_dirty(m);
 3528                 if (oldpte & PG_A)
 3529                         vm_page_aflag_set(m, PGA_REFERENCED);
 3530                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 3531                 pmap_pvh_free(&m->md, pmap, va);
 3532                 if (TAILQ_EMPTY(&m->md.pv_list) &&
 3533                     (m->flags & PG_FICTITIOUS) == 0) {
 3534                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 3535                         if (TAILQ_EMPTY(&pvh->pv_list))
 3536                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
 3537                 }
 3538         }
 3539         return (pmap_unuse_pt(pmap, va, ptepde, free));
 3540 }
 3541 
 3542 /*
 3543  * Remove a single page from a process address space
 3544  */
 3545 static void
 3546 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
 3547     struct spglist *free)
 3548 {
 3549         struct rwlock *lock;
 3550         pt_entry_t *pte, PG_V;
 3551 
 3552         PG_V = pmap_valid_bit(pmap);
 3553         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3554         if ((*pde & PG_V) == 0)
 3555                 return;
 3556         pte = pmap_pde_to_pte(pde, va);
 3557         if ((*pte & PG_V) == 0)
 3558                 return;
 3559         lock = NULL;
 3560         pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
 3561         if (lock != NULL)
 3562                 rw_wunlock(lock);
 3563         pmap_invalidate_page(pmap, va);
 3564 }
 3565 
 3566 /*
 3567  *      Remove the given range of addresses from the specified map.
 3568  *
 3569  *      It is assumed that the start and end are properly
 3570  *      rounded to the page size.
 3571  */
 3572 void
 3573 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 3574 {
 3575         struct rwlock *lock;
 3576         vm_offset_t va, va_next;
 3577         pml4_entry_t *pml4e;
 3578         pdp_entry_t *pdpe;
 3579         pd_entry_t ptpaddr, *pde;
 3580         pt_entry_t *pte, PG_G, PG_V;
 3581         struct spglist free;
 3582         int anyvalid;
 3583 
 3584         PG_G = pmap_global_bit(pmap);
 3585         PG_V = pmap_valid_bit(pmap);
 3586 
 3587         /*
 3588          * Perform an unsynchronized read.  This is, however, safe.
 3589          */
 3590         if (pmap->pm_stats.resident_count == 0)
 3591                 return;
 3592 
 3593         anyvalid = 0;
 3594         SLIST_INIT(&free);
 3595 
 3596         rw_rlock(&pvh_global_lock);
 3597         PMAP_LOCK(pmap);
 3598 
 3599         /*
 3600          * special handling of removing one page.  a very
 3601          * common operation and easy to short circuit some
 3602          * code.
 3603          */
 3604         if (sva + PAGE_SIZE == eva) {
 3605                 pde = pmap_pde(pmap, sva);
 3606                 if (pde && (*pde & PG_PS) == 0) {
 3607                         pmap_remove_page(pmap, sva, pde, &free);
 3608                         goto out;
 3609                 }
 3610         }
 3611 
 3612         lock = NULL;
 3613         for (; sva < eva; sva = va_next) {
 3614 
 3615                 if (pmap->pm_stats.resident_count == 0)
 3616                         break;
 3617 
 3618                 pml4e = pmap_pml4e(pmap, sva);
 3619                 if ((*pml4e & PG_V) == 0) {
 3620                         va_next = (sva + NBPML4) & ~PML4MASK;
 3621                         if (va_next < sva)
 3622                                 va_next = eva;
 3623                         continue;
 3624                 }
 3625 
 3626                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 3627                 if ((*pdpe & PG_V) == 0) {
 3628                         va_next = (sva + NBPDP) & ~PDPMASK;
 3629                         if (va_next < sva)
 3630                                 va_next = eva;
 3631                         continue;
 3632                 }
 3633 
 3634                 /*
 3635                  * Calculate index for next page table.
 3636                  */
 3637                 va_next = (sva + NBPDR) & ~PDRMASK;
 3638                 if (va_next < sva)
 3639                         va_next = eva;
 3640 
 3641                 pde = pmap_pdpe_to_pde(pdpe, sva);
 3642                 ptpaddr = *pde;
 3643 
 3644                 /*
 3645                  * Weed out invalid mappings.
 3646                  */
 3647                 if (ptpaddr == 0)
 3648                         continue;
 3649 
 3650                 /*
 3651                  * Check for large page.
 3652                  */
 3653                 if ((ptpaddr & PG_PS) != 0) {
 3654                         /*
 3655                          * Are we removing the entire large page?  If not,
 3656                          * demote the mapping and fall through.
 3657                          */
 3658                         if (sva + NBPDR == va_next && eva >= va_next) {
 3659                                 /*
 3660                                  * The TLB entry for a PG_G mapping is
 3661                                  * invalidated by pmap_remove_pde().
 3662                                  */
 3663                                 if ((ptpaddr & PG_G) == 0)
 3664                                         anyvalid = 1;
 3665                                 pmap_remove_pde(pmap, pde, sva, &free, &lock);
 3666                                 continue;
 3667                         } else if (!pmap_demote_pde_locked(pmap, pde, sva,
 3668                             &lock)) {
 3669                                 /* The large page mapping was destroyed. */
 3670                                 continue;
 3671                         } else
 3672                                 ptpaddr = *pde;
 3673                 }
 3674 
 3675                 /*
 3676                  * Limit our scan to either the end of the va represented
 3677                  * by the current page table page, or to the end of the
 3678                  * range being removed.
 3679                  */
 3680                 if (va_next > eva)
 3681                         va_next = eva;
 3682 
 3683                 va = va_next;
 3684                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 3685                     sva += PAGE_SIZE) {
 3686                         if (*pte == 0) {
 3687                                 if (va != va_next) {
 3688                                         pmap_invalidate_range(pmap, va, sva);
 3689                                         va = va_next;
 3690                                 }
 3691                                 continue;
 3692                         }
 3693                         if ((*pte & PG_G) == 0)
 3694                                 anyvalid = 1;
 3695                         else if (va == va_next)
 3696                                 va = sva;
 3697                         if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free,
 3698                             &lock)) {
 3699                                 sva += PAGE_SIZE;
 3700                                 break;
 3701                         }
 3702                 }
 3703                 if (va != va_next)
 3704                         pmap_invalidate_range(pmap, va, sva);
 3705         }
 3706         if (lock != NULL)
 3707                 rw_wunlock(lock);
 3708 out:
 3709         if (anyvalid)
 3710                 pmap_invalidate_all(pmap);
 3711         rw_runlock(&pvh_global_lock);   
 3712         PMAP_UNLOCK(pmap);
 3713         pmap_free_zero_pages(&free);
 3714 }
 3715 
 3716 /*
 3717  *      Routine:        pmap_remove_all
 3718  *      Function:
 3719  *              Removes this physical page from
 3720  *              all physical maps in which it resides.
 3721  *              Reflects back modify bits to the pager.
 3722  *
 3723  *      Notes:
 3724  *              Original versions of this routine were very
 3725  *              inefficient because they iteratively called
 3726  *              pmap_remove (slow...)
 3727  */
 3728 
 3729 void
 3730 pmap_remove_all(vm_page_t m)
 3731 {
 3732         struct md_page *pvh;
 3733         pv_entry_t pv;
 3734         pmap_t pmap;
 3735         pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
 3736         pd_entry_t *pde;
 3737         vm_offset_t va;
 3738         struct spglist free;
 3739 
 3740         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 3741             ("pmap_remove_all: page %p is not managed", m));
 3742         SLIST_INIT(&free);
 3743         rw_wlock(&pvh_global_lock);
 3744         if ((m->flags & PG_FICTITIOUS) != 0)
 3745                 goto small_mappings;
 3746         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 3747         while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
 3748                 pmap = PV_PMAP(pv);
 3749                 PMAP_LOCK(pmap);
 3750                 va = pv->pv_va;
 3751                 pde = pmap_pde(pmap, va);
 3752                 (void)pmap_demote_pde(pmap, pde, va);
 3753                 PMAP_UNLOCK(pmap);
 3754         }
 3755 small_mappings:
 3756         while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 3757                 pmap = PV_PMAP(pv);
 3758                 PMAP_LOCK(pmap);
 3759                 PG_A = pmap_accessed_bit(pmap);
 3760                 PG_M = pmap_modified_bit(pmap);
 3761                 PG_RW = pmap_rw_bit(pmap);
 3762                 pmap_resident_count_dec(pmap, 1);
 3763                 pde = pmap_pde(pmap, pv->pv_va);
 3764                 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
 3765                     " a 2mpage in page %p's pv list", m));
 3766                 pte = pmap_pde_to_pte(pde, pv->pv_va);
 3767                 tpte = pte_load_clear(pte);
 3768                 if (tpte & PG_W)
 3769                         pmap->pm_stats.wired_count--;
 3770                 if (tpte & PG_A)
 3771                         vm_page_aflag_set(m, PGA_REFERENCED);
 3772 
 3773                 /*
 3774                  * Update the vm_page_t clean and reference bits.
 3775                  */
 3776                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 3777                         vm_page_dirty(m);
 3778                 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
 3779                 pmap_invalidate_page(pmap, pv->pv_va);
 3780                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 3781                 m->md.pv_gen++;
 3782                 free_pv_entry(pmap, pv);
 3783                 PMAP_UNLOCK(pmap);
 3784         }
 3785         vm_page_aflag_clear(m, PGA_WRITEABLE);
 3786         rw_wunlock(&pvh_global_lock);
 3787         pmap_free_zero_pages(&free);
 3788 }
 3789 
 3790 /*
 3791  * pmap_protect_pde: do the things to protect a 2mpage in a process
 3792  */
 3793 static boolean_t
 3794 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
 3795 {
 3796         pd_entry_t newpde, oldpde;
 3797         vm_offset_t eva, va;
 3798         vm_page_t m;
 3799         boolean_t anychanged;
 3800         pt_entry_t PG_G, PG_M, PG_RW;
 3801 
 3802         PG_G = pmap_global_bit(pmap);
 3803         PG_M = pmap_modified_bit(pmap);
 3804         PG_RW = pmap_rw_bit(pmap);
 3805 
 3806         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3807         KASSERT((sva & PDRMASK) == 0,
 3808             ("pmap_protect_pde: sva is not 2mpage aligned"));
 3809         anychanged = FALSE;
 3810 retry:
 3811         oldpde = newpde = *pde;
 3812         if (oldpde & PG_MANAGED) {
 3813                 eva = sva + NBPDR;
 3814                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 3815                     va < eva; va += PAGE_SIZE, m++)
 3816                         if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
 3817                                 vm_page_dirty(m);
 3818         }
 3819         if ((prot & VM_PROT_WRITE) == 0)
 3820                 newpde &= ~(PG_RW | PG_M);
 3821         if ((prot & VM_PROT_EXECUTE) == 0)
 3822                 newpde |= pg_nx;
 3823         if (newpde != oldpde) {
 3824                 if (!atomic_cmpset_long(pde, oldpde, newpde))
 3825                         goto retry;
 3826                 if (oldpde & PG_G)
 3827                         pmap_invalidate_page(pmap, sva);
 3828                 else
 3829                         anychanged = TRUE;
 3830         }
 3831         return (anychanged);
 3832 }
 3833 
 3834 /*
 3835  *      Set the physical protection on the
 3836  *      specified range of this map as requested.
       *
       *      This routine only ever reduces access to the range
       *      [sva, eva):
       *        - prot == VM_PROT_NONE is handed to pmap_remove().
       *        - prot containing both VM_PROT_WRITE and VM_PROT_EXECUTE
       *          returns without touching any mapping.
       *        - otherwise PG_RW and PG_M are cleared when write access
       *          is not granted, and pg_nx is set when execute access is
       *          not granted.
       *
       *      A 2MB mapping that lies entirely inside the range is handled
       *      by pmap_protect_pde(); one that is only partially covered is
       *      first demoted with pmap_demote_pde().  TLB invalidation for
       *      mappings without PG_G is deferred to a single
       *      pmap_invalidate_all(); PG_G mappings are invalidated one
       *      page at a time as they are changed.
 3837  */
 3838 void
 3839 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 3840 {
 3841         vm_offset_t va_next;
 3842         pml4_entry_t *pml4e;
 3843         pdp_entry_t *pdpe;
 3844         pd_entry_t ptpaddr, *pde;
 3845         pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
 3846         boolean_t anychanged, pv_lists_locked;
 3847 
 3848         KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
              /* Revoking every kind of access is done by removing the range. */
 3849         if (prot == VM_PROT_NONE) {
 3850                 pmap_remove(pmap, sva, eva);
 3851                 return;
 3852         }
 3853 
              /*
               * The loop below only clears PG_RW/PG_M when write access is
               * withheld and only sets pg_nx when execute access is withheld.
               * If both are granted there is nothing for it to change.
               */
 3854         if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
 3855             (VM_PROT_WRITE|VM_PROT_EXECUTE))
 3856                 return;
 3857 
 3858         PG_G = pmap_global_bit(pmap);
 3859         PG_M = pmap_modified_bit(pmap);
 3860         PG_V = pmap_valid_bit(pmap);
 3861         PG_RW = pmap_rw_bit(pmap);
 3862         pv_lists_locked = FALSE;
      /*
       * Re-entry point used after the pmap lock was dropped in order to block
       * on pvh_global_lock.  sva has not been advanced past the 2MB mapping
       * that triggered the restart, so the walk continues from there.
       */
 3863 resume:
 3864         anychanged = FALSE;
 3865 
 3866         PMAP_LOCK(pmap);
 3867         for (; sva < eva; sva = va_next) {
 3868 
 3869                 pml4e = pmap_pml4e(pmap, sva);
 3870                 if ((*pml4e & PG_V) == 0) {
 3871                         va_next = (sva + NBPML4) & ~PML4MASK;
                              /* The addition wrapped around; stop at eva. */
 3872                         if (va_next < sva)
 3873                                 va_next = eva;
 3874                         continue;
 3875                 }
 3876 
 3877                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 3878                 if ((*pdpe & PG_V) == 0) {
 3879                         va_next = (sva + NBPDP) & ~PDPMASK;
 3880                         if (va_next < sva)
 3881                                 va_next = eva;
 3882                         continue;
 3883                 }
 3884 
 3885                 va_next = (sva + NBPDR) & ~PDRMASK;
 3886                 if (va_next < sva)
 3887                         va_next = eva;
 3888 
 3889                 pde = pmap_pdpe_to_pde(pdpe, sva);
 3890                 ptpaddr = *pde;
 3891 
 3892                 /*
 3893                  * Weed out invalid mappings.
 3894                  */
 3895                 if (ptpaddr == 0)
 3896                         continue;
 3897 
 3898                 /*
 3899                  * Check for large page.
 3900                  */
 3901                 if ((ptpaddr & PG_PS) != 0) {
 3902                         /*
 3903                          * Are we protecting the entire large page?  If not,
 3904                          * demote the mapping and fall through.
 3905                          */
 3906                         if (sva + NBPDR == va_next && eva >= va_next) {
 3907                                 /*
 3908                                  * The TLB entry for a PG_G mapping is
 3909                                  * invalidated by pmap_protect_pde().
 3910                                  */
 3911                                 if (pmap_protect_pde(pmap, pde, sva, prot))
 3912                                         anychanged = TRUE;
 3913                                 continue;
 3914                         } else {
                                      /*
                                       * pvh_global_lock is read-acquired before
                                       * the first demotion.  It is tried without
                                       * blocking first; if that fails, any
                                       * pending invalidation is flushed, the
                                       * pmap lock is dropped, the lock is taken
                                       * the blocking way, and the walk restarts
                                       * at "resume".
                                       */
 3915                                 if (!pv_lists_locked) {
 3916                                         pv_lists_locked = TRUE;
 3917                                         if (!rw_try_rlock(&pvh_global_lock)) {
 3918                                                 if (anychanged)
 3919                                                         pmap_invalidate_all(
 3920                                                             pmap);
 3921                                                 PMAP_UNLOCK(pmap);
 3922                                                 rw_rlock(&pvh_global_lock);
 3923                                                 goto resume;
 3924                                         }
 3925                                 }
 3926                                 if (!pmap_demote_pde(pmap, pde, sva)) {
 3927                                         /*
 3928                                          * The large page mapping was
 3929                                          * destroyed.
 3930                                          */
 3931                                         continue;
 3932                                 }
 3933                         }
 3934                 }
 3935 
                      /* Do not walk PTEs beyond the end of the requested range. */
 3936                 if (va_next > eva)
 3937                         va_next = eva;
 3938 
 3939                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 3940                     sva += PAGE_SIZE) {
 3941                         pt_entry_t obits, pbits;
 3942                         vm_page_t m;
 3943 
 3944 retry:
 3945                         obits = pbits = *pte;
 3946                         if ((pbits & PG_V) == 0)
 3947                                 continue;
 3948 
 3949                         if ((prot & VM_PROT_WRITE) == 0) {
                                      /*
                                       * A managed page that was modified through
                                       * this writable mapping is marked dirty in
                                       * its vm_page before PG_M is discarded.
                                       */
 3950                                 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
 3951                                     (PG_MANAGED | PG_M | PG_RW)) {
 3952                                         m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
 3953                                         vm_page_dirty(m);
 3954                                 }
 3955                                 pbits &= ~(PG_RW | PG_M);
 3956                         }
 3957                         if ((prot & VM_PROT_EXECUTE) == 0)
 3958                                 pbits |= pg_nx;
 3959 
 3960                         if (pbits != obits) {
                                      /*
                                       * If the PTE no longer holds the value that
                                       * was read above, re-read it and recompute
                                       * the new bits.
                                       */
 3961                                 if (!atomic_cmpset_long(pte, obits, pbits))
 3962                                         goto retry;
 3963                                 if (obits & PG_G)
 3964                                         pmap_invalidate_page(pmap, sva);
 3965                                 else
 3966                                         anychanged = TRUE;
 3967                         }
 3968                 }
 3969         }
              /* One flush covers every non-PG_G mapping changed above. */
 3970         if (anychanged)
 3971                 pmap_invalidate_all(pmap);
 3972         if (pv_lists_locked)
 3973                 rw_runlock(&pvh_global_lock);
 3974         PMAP_UNLOCK(pmap);
 3975 }
 3976 
 3977 /*
 3978  * Tries to promote the 512, contiguous 4KB page mappings that are within a
 3979  * single page table page (PTP) to a single 2MB page mapping.  For promotion
 3980  * to occur, two conditions must be met: (1) the 4KB page mappings must map
 3981  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
 3982  * identical characteristics. 
       *
       * "pde" is the page directory entry that currently points at the PTP and
       * "va" is an address within the 2MB region it maps.  "lockp" is passed
       * through to pmap_pv_promote_pde() when the mapping is managed.  The pmap
       * lock must be held (asserted below).
       *
       * Every abort path bumps pmap_pde_p_failures and returns with the PDE
       * unchanged.  As a side effect, PG_RW is cleared from any examined PTE
       * that has PG_RW set but PG_M clear, even if promotion is later aborted.
 3983  */
 3984 static void
 3985 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
 3986     struct rwlock **lockp)
 3987 {
 3988         pd_entry_t newpde;
 3989         pt_entry_t *firstpte, oldpte, pa, *pte;
 3990         pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V;
 3991         vm_offset_t oldpteva;
 3992         vm_page_t mpte;
 3993         int PG_PTE_CACHE;
 3994 
 3995         PG_A = pmap_accessed_bit(pmap);
 3996         PG_G = pmap_global_bit(pmap);
 3997         PG_M = pmap_modified_bit(pmap);
 3998         PG_V = pmap_valid_bit(pmap);
 3999         PG_RW = pmap_rw_bit(pmap);
 4000         PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
 4001 
 4002         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4003 
 4004         /*
 4005          * Examine the first PTE in the specified PTP.  Abort if this PTE is
 4006          * either invalid, unused, or does not map the first 4KB physical page
 4007          * within a 2MB page. 
 4008          */
 4009         firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
 4010 setpde:
 4011         newpde = *firstpte;
 4012         if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
 4013                 atomic_add_long(&pmap_pde_p_failures, 1);
 4014                 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 4015                     " in pmap %p", va, pmap);
 4016                 return;
 4017         }
 4018         if ((newpde & (PG_M | PG_RW)) == PG_RW) {
 4019                 /*
 4020                  * When PG_M is already clear, PG_RW can be cleared without
 4021                  * a TLB invalidation.
 4022                  */
 4023                 if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
 4024                         goto setpde;
 4025                 newpde &= ~PG_RW;
 4026         }
 4027 
 4028         /*
 4029          * Examine each of the other PTEs in the specified PTP.  Abort if this
 4030          * PTE maps an unexpected 4KB physical page or does not have identical
 4031          * characteristics to the first PTE.
               *
               * "pa" holds the value expected in the PG_FRAME, PG_A and PG_V bits
               * of the PTE under examination.  It starts at the value for the last
               * PTE in the PTP and is lowered by one page per iteration as the
               * loop walks backward toward (but not including) the first PTE.
 4032          */
 4033         pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
 4034         for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
 4035 setpte:
 4036                 oldpte = *pte;
 4037                 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
 4038                         atomic_add_long(&pmap_pde_p_failures, 1);
 4039                         CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 4040                             " in pmap %p", va, pmap);
 4041                         return;
 4042                 }
 4043                 if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
 4044                         /*
 4045                          * When PG_M is already clear, PG_RW can be cleared
 4046                          * without a TLB invalidation.
 4047                          */
 4048                         if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
 4049                                 goto setpte;
 4050                         oldpte &= ~PG_RW;
                              /* oldpteva is computed only for the trace record. */
 4051                         oldpteva = (oldpte & PG_FRAME & PDRMASK) |
 4052                             (va & ~PDRMASK);
 4053                         CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
 4054                             " in pmap %p", oldpteva, pmap);
 4055                 }
                      /* The attribute bits must match those of the first PTE. */
 4056                 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
 4057                         atomic_add_long(&pmap_pde_p_failures, 1);
 4058                         CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 4059                             " in pmap %p", va, pmap);
 4060                         return;
 4061                 }
 4062                 pa -= PAGE_SIZE;
 4063         }
 4064 
 4065         /*
 4066          * Save the page table page in its current state until the PDE
 4067          * mapping the superpage is demoted by pmap_demote_pde() or
 4068          * destroyed by pmap_remove_pde(). 
               *
               * A nonzero return from pmap_insert_pt_page() is treated as failure
               * and aborts the promotion.
 4069          */
 4070         mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 4071         KASSERT(mpte >= vm_page_array &&
 4072             mpte < &vm_page_array[vm_page_array_size],
 4073             ("pmap_promote_pde: page table page is out of range"));
 4074         KASSERT(mpte->pindex == pmap_pde_pindex(va),
 4075             ("pmap_promote_pde: page table page's pindex is wrong"));
 4076         if (pmap_insert_pt_page(pmap, mpte)) {
 4077                 atomic_add_long(&pmap_pde_p_failures, 1);
 4078                 CTR2(KTR_PMAP,
 4079                     "pmap_promote_pde: failure for va %#lx in pmap %p", va,
 4080                     pmap);
 4081                 return;
 4082         }
 4083 
 4084         /*
 4085          * Promote the pv entries.
 4086          */
 4087         if ((newpde & PG_MANAGED) != 0)
 4088                 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
 4089 
 4090         /*
 4091          * Propagate the PAT index to its proper position.
 4092          */
 4093         newpde = pmap_swap_pat(pmap, newpde);
 4094 
 4095         /*
 4096          * Map the superpage.
               *
               * When workaround_erratum383 is set the PDE is written through
               * pmap_update_pde() instead of a plain store.
 4097          */
 4098         if (workaround_erratum383)
 4099                 pmap_update_pde(pmap, va, pde, PG_PS | newpde);
 4100         else
 4101                 pde_store(pde, PG_PS | newpde);
 4102 
 4103         atomic_add_long(&pmap_pde_promotions, 1);
 4104         CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
 4105             " in pmap %p", va, pmap);
 4106 }
 4107 
 4108 /*
 4109  *      Insert the given physical page (p) at
 4110  *      the specified virtual address (v) in the
 4111  *      target physical map with the protection requested.
 4112  *
 4113  *      If specified, the page will be wired down, meaning
 4114  *      that the related pte can not be reclaimed.
 4115  *
 4116  *      NB:  This is the only routine which MAY NOT lazy-evaluate
 4117  *      or lose information.  That is, this routine must actually
 4118  *      insert this page into the given map NOW.
       *
       *      "prot" is the protection given to the mapping.  "flags" may
       *      carry VM_PROT_WRITE (the new PTE starts with the modified bit
       *      set), PMAP_ENTER_WIRED (the new PTE gets PG_W) and
       *      PMAP_ENTER_NOSLEEP (return an error instead of retrying when
       *      a page table page cannot be allocated).  "psind" is unused.
       *
       *      Returns KERN_SUCCESS, or KERN_RESOURCE_SHORTAGE when
       *      PMAP_ENTER_NOSLEEP was given and the page table page
       *      allocation failed.  Panics if a kernel address has no valid
       *      page directory entry.
 4119  */
 4120 int
 4121 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
 4122     u_int flags, int8_t psind __unused)
 4123 {
 4124         struct rwlock *lock;
 4125         pd_entry_t *pde;
 4126         pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
 4127         pt_entry_t newpte, origpte;
 4128         pv_entry_t pv;
 4129         vm_paddr_t opa, pa;
 4130         vm_page_t mpte, om;
 4131         boolean_t nosleep;
 4132 
 4133         PG_A = pmap_accessed_bit(pmap);
 4134         PG_G = pmap_global_bit(pmap);
 4135         PG_M = pmap_modified_bit(pmap);
 4136         PG_V = pmap_valid_bit(pmap);
 4137         PG_RW = pmap_rw_bit(pmap);
 4138 
 4139         va = trunc_page(va);
 4140         KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
 4141         KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
 4142             ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
 4143             va));
 4144         KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
 4145             va >= kmi.clean_eva,
 4146             ("pmap_enter: managed mapping within the clean submap"));
 4147         if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
 4148                 VM_OBJECT_ASSERT_LOCKED(m->object);
              /*
               * Build the new PTE up front, before any lock is taken.  Note that
               * PG_M comes from VM_PROT_WRITE in "flags" while PG_RW comes from
               * VM_PROT_WRITE in "prot".
               */
 4149         pa = VM_PAGE_TO_PHYS(m);
 4150         newpte = (pt_entry_t)(pa | PG_A | PG_V);
 4151         if ((flags & VM_PROT_WRITE) != 0)
 4152                 newpte |= PG_M;
 4153         if ((prot & VM_PROT_WRITE) != 0)
 4154                 newpte |= PG_RW;
 4155         KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
 4156             ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
 4157         if ((prot & VM_PROT_EXECUTE) == 0)
 4158                 newpte |= pg_nx;
 4159         if ((flags & PMAP_ENTER_WIRED) != 0)
 4160                 newpte |= PG_W;
 4161         if (va < VM_MAXUSER_ADDRESS)
 4162                 newpte |= PG_U;
 4163         if (pmap == kernel_pmap)
 4164                 newpte |= PG_G;
 4165         newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0);
 4166 
 4167         /*
 4168          * Set modified bit gratuitously for writeable mappings if
 4169          * the page is unmanaged. We do not want to take a fault
 4170          * to do the dirty bit accounting for these mappings.
 4171          */
 4172         if ((m->oflags & VPO_UNMANAGED) != 0) {
 4173                 if ((newpte & PG_RW) != 0)
 4174                         newpte |= PG_M;
 4175         }
 4176 
 4177         mpte = NULL;
 4178 
 4179         lock = NULL;
 4180         rw_rlock(&pvh_global_lock);
 4181         PMAP_LOCK(pmap);
 4182 
 4183         /*
 4184          * In the case that a page table page is not
 4185          * resident, we are creating it here.
               *
               * An existing 2MB mapping at va is demoted first; if the demotion
               * fails, control falls to the allocation / panic branches below.
 4186          */
 4187 retry:
 4188         pde = pmap_pde(pmap, va);
 4189         if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
 4190             pmap_demote_pde_locked(pmap, pde, va, &lock))) {
 4191                 pte = pmap_pde_to_pte(pde, va);
                      /*
                       * For a user address, take a reference on the page table
                       * page on behalf of the mapping about to be created.  It
                       * is dropped again below if va turns out to be mapped
                       * already.  mpte is non-NULL here only after a successful
                       * _pmap_allocpte() on a previous pass.
                       */
 4192                 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
 4193                         mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 4194                         mpte->wire_count++;
 4195                 }
 4196         } else if (va < VM_MAXUSER_ADDRESS) {
 4197                 /*
 4198                  * Here if the pte page isn't mapped, or if it has been
 4199                  * deallocated.
 4200                  */
 4201                 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
 4202                 mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va),
 4203                     nosleep ? NULL : &lock);
 4204                 if (mpte == NULL && nosleep) {
 4205                         if (lock != NULL)
 4206                                 rw_wunlock(lock);
 4207                         rw_runlock(&pvh_global_lock);
 4208                         PMAP_UNLOCK(pmap);
 4209                         return (KERN_RESOURCE_SHORTAGE);
 4210                 }
                      /* Look the PDE up again, whether or not allocation worked. */
 4211                 goto retry;
 4212         } else
 4213                 panic("pmap_enter: invalid page directory va=%#lx", va);
 4214 
 4215         origpte = *pte;
 4216 
 4217         /*
 4218          * Is the specified virtual address already mapped?
 4219          */
 4220         if ((origpte & PG_V) != 0) {
 4221                 /*
 4222                  * Wiring change, just update stats. We don't worry about
 4223                  * wiring PT pages as they remain resident as long as there
 4224                  * are valid mappings in them. Hence, if a user page is wired,
 4225                  * the PT page will be also.
 4226                  */
 4227                 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
 4228                         pmap->pm_stats.wired_count++;
 4229                 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
 4230                         pmap->pm_stats.wired_count--;
 4231 
 4232                 /*
 4233                  * Remove the extra PT page reference.
 4234                  */
 4235                 if (mpte != NULL) {
 4236                         mpte->wire_count--;
 4237                         KASSERT(mpte->wire_count > 0,
 4238                             ("pmap_enter: missing reference to page table page,"
 4239                              " va: 0x%lx", va));
 4240                 }
 4241 
 4242                 /*
 4243                  * Has the physical page changed?
 4244                  */
 4245                 opa = origpte & PG_FRAME;
 4246                 if (opa == pa) {
 4247                         /*
 4248                          * No, might be a protection or wiring change.
 4249                          */
 4250                         if ((origpte & PG_MANAGED) != 0) {
 4251                                 newpte |= PG_MANAGED;
 4252                                 if ((newpte & PG_RW) != 0)
 4253                                         vm_page_aflag_set(m, PGA_WRITEABLE);
 4254                         }
                              /*
                               * If old and new PTE differ in nothing but PG_M
                               * and PG_A, the PTE is left as it is.
                               */
 4255                         if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
 4256                                 goto unchanged;
 4257                         goto validate;
 4258                 }
 4259         } else {
 4260                 /*
 4261                  * Increment the counters.
 4262                  */
 4263                 if ((newpte & PG_W) != 0)
 4264                         pmap->pm_stats.wired_count++;
 4265                 pmap_resident_count_inc(pmap, 1);
 4266         }
 4267 
 4268         /*
 4269          * Enter on the PV list if part of our managed memory.
               *
               * This point is reached for a new mapping and for a replacement of
               * a different physical page; the same-page case jumped past it.
 4270          */
 4271         if ((m->oflags & VPO_UNMANAGED) == 0) {
 4272                 newpte |= PG_MANAGED;
 4273                 pv = get_pv_entry(pmap, &lock);
 4274                 pv->pv_va = va;
 4275                 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
 4276                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 4277                 m->md.pv_gen++;
 4278                 if ((newpte & PG_RW) != 0)
 4279                         vm_page_aflag_set(m, PGA_WRITEABLE);
 4280         }
 4281 
 4282         /*
 4283          * Update the PTE.
 4284          */
 4285         if ((origpte & PG_V) != 0) {
 4286 validate:
                      /*
                       * Install the new PTE; origpte is refreshed with the value
                       * returned by pte_load_store() and examined below as the
                       * previous contents.
                       */
 4287                 origpte = pte_load_store(pte, newpte);
 4288                 opa = origpte & PG_FRAME;
 4289                 if (opa != pa) {
                              /*
                               * A different page was mapped here.  If it was
                               * managed, transfer its modified/referenced state
                               * to the vm_page and release its pv entry.
                               */
 4290                         if ((origpte & PG_MANAGED) != 0) {
 4291                                 om = PHYS_TO_VM_PAGE(opa);
 4292                                 if ((origpte & (PG_M | PG_RW)) == (PG_M |
 4293                                     PG_RW))
 4294                                         vm_page_dirty(om);
 4295                                 if ((origpte & PG_A) != 0)
 4296                                         vm_page_aflag_set(om, PGA_REFERENCED);
 4297                                 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
 4298                                 pmap_pvh_free(&om->md, pmap, va);
 4299                                 if ((om->aflags & PGA_WRITEABLE) != 0 &&
 4300                                     TAILQ_EMPTY(&om->md.pv_list) &&
 4301                                     ((om->flags & PG_FICTITIOUS) != 0 ||
 4302                                     TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
 4303                                         vm_page_aflag_clear(om, PGA_WRITEABLE);
 4304                         }
 4305                 } else if ((newpte & PG_M) == 0 && (origpte & (PG_M |
 4306                     PG_RW)) == (PG_M | PG_RW)) {
 4307                         if ((origpte & PG_MANAGED) != 0)
 4308                                 vm_page_dirty(m);
 4309 
 4310                         /*
 4311                          * Although the PTE may still have PG_RW set, TLB
 4312                          * invalidation may nonetheless be required because
 4313                          * the PTE no longer has PG_M set.
 4314                          */
 4315                 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
 4316                         /*
 4317                          * This PTE change does not require TLB invalidation.
 4318                          */
 4319                         goto unchanged;
 4320                 }
                      /*
                       * The flush is skipped when the old PTE did not have PG_A
                       * set.  NOTE(review): presumably because such an entry
                       * cannot have been cached in a TLB -- confirm.
                       */
 4321                 if ((origpte & PG_A) != 0)
 4322                         pmap_invalidate_page(pmap, va);
 4323         } else
 4324                 pte_store(pte, newpte);
 4325 
 4326 unchanged:
 4327 
 4328         /*
 4329          * If both the page table page and the reservation are fully
 4330          * populated, then attempt promotion.
 4331          */
 4332         if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
 4333             pmap_ps_enabled(pmap) &&
 4334             (m->flags & PG_FICTITIOUS) == 0 &&
 4335             vm_reserv_level_iffullpop(m) == 0)
 4336                 pmap_promote_pde(pmap, pde, va, &lock);
 4337 
 4338         if (lock != NULL)
 4339                 rw_wunlock(lock);
 4340         rw_runlock(&pvh_global_lock);
 4341         PMAP_UNLOCK(pmap);
 4342         return (KERN_SUCCESS);
 4343 }
 4344 
 4345 /*
 4346  * Tries to create a 2MB page mapping.  Returns TRUE if successful and FALSE
 4347  * otherwise.  Fails if (1) a page table page cannot be allocated without
 4348  * blocking, (2) a mapping already exists at the specified virtual address, or
 4349  * (3) a pv entry cannot be allocated without reclaiming another pv entry. 
 4350  */
 4351 static boolean_t
 4352 pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
 4353     struct rwlock **lockp)
 4354 {
 4355         pd_entry_t *pde, newpde;
 4356         pt_entry_t PG_V;
 4357         vm_page_t mpde;
 4358         struct spglist free;
 4359 
 4360         PG_V = pmap_valid_bit(pmap);
 4361         rw_assert(&pvh_global_lock, RA_LOCKED);
 4362         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4363 
 4364         if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) {
 4365                 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 4366                     " in pmap %p", va, pmap);
 4367                 return (FALSE);
 4368         }
 4369         pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde));
 4370         pde = &pde[pmap_pde_index(va)];
 4371         if ((*pde & PG_V) != 0) {
 4372                 KASSERT(mpde->wire_count > 1,
 4373                     ("pmap_enter_pde: mpde's wire count is too low"));
 4374                 mpde->wire_count--;
 4375                 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 4376                     " in pmap %p", va, pmap);
 4377                 return (FALSE);
 4378         }
 4379         newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
 4380             PG_PS | PG_V;
 4381         if ((m->oflags & VPO_UNMANAGED) == 0) {
 4382                 newpde |= PG_MANAGED;
 4383 
 4384                 /*
 4385                  * Abort this mapping if its PV entry could not be created.
 4386                  */
 4387                 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m),
 4388                     lockp)) {
 4389                         SLIST_INIT(&free);
 4390                         if (pmap_unwire_ptp(pmap, va, mpde, &free)) {
 4391                                 pmap_invalidate_page(pmap, va);
 4392                                 pmap_free_zero_pages(&free);
 4393                         }
 4394                         CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 4395                             " in pmap %p", va, pmap);
 4396                         return (FALSE);
 4397                 }
 4398         }
 4399         if ((prot & VM_PROT_EXECUTE) == 0)
 4400                 newpde |= pg_nx;
 4401         if (va < VM_MAXUSER_ADDRESS)
 4402                 newpde |= PG_U;
 4403 
 4404         /*
 4405          * Increment counters.
 4406          */
 4407         pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
 4408 
 4409         /*
 4410          * Map the superpage.
 4411          */
 4412         pde_store(pde, newpde);
 4413 
 4414         atomic_add_long(&pmap_pde_mappings, 1);
 4415         CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
 4416             " in pmap %p", va, pmap);
 4417         return (TRUE);
 4418 }
 4419 
 4420 /*
 4421  * Maps a sequence of resident pages belonging to the same object.
 4422  * The sequence begins with the given page m_start.  This page is
 4423  * mapped at the given virtual address start.  Each subsequent page is
 4424  * mapped at a virtual address that is offset from start by the same
 4425  * amount as the page is offset from m_start within the object.  The
 4426  * last page in the sequence is the page with the largest offset from
 4427  * m_start that can be mapped at a virtual address less than the given
 4428  * virtual address end.  Not every virtual page between start and end
 4429  * is mapped; only those for which a resident page exists with the
 4430  * corresponding offset from m_start are mapped.
 4431  */
 4432 void
 4433 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
 4434     vm_page_t m_start, vm_prot_t prot)
 4435 {
 4436         struct rwlock *lock;
 4437         vm_offset_t va;
 4438         vm_page_t m, mpte;
 4439         vm_pindex_t diff, psize;
 4440 
 4441         VM_OBJECT_ASSERT_LOCKED(m_start->object);
 4442 
 4443         psize = atop(end - start);
 4444         mpte = NULL;
 4445         m = m_start;
 4446         lock = NULL;
 4447         rw_rlock(&pvh_global_lock);
 4448         PMAP_LOCK(pmap);
 4449         while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
 4450                 va = start + ptoa(diff);
 4451                 if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
 4452                     m->psind == 1 && pmap_ps_enabled(pmap) &&
 4453                     pmap_enter_pde(pmap, va, m, prot, &lock))
 4454                         m = &m[NBPDR / PAGE_SIZE - 1];
 4455                 else
 4456                         mpte = pmap_enter_quick_locked(pmap, va, m, prot,
 4457                             mpte, &lock);
 4458                 m = TAILQ_NEXT(m, listq);
 4459         }
 4460         if (lock != NULL)
 4461                 rw_wunlock(lock);
 4462         rw_runlock(&pvh_global_lock);
 4463         PMAP_UNLOCK(pmap);
 4464 }
 4465 
 4466 /*
 4467  * this code makes some *MAJOR* assumptions:
 4468  * 1. Current pmap & pmap exists.
 4469  * 2. Not wired.
 4470  * 3. Read access.
 4471  * 4. No page table pages.
 4472  * but is *MUCH* faster than pmap_enter...
 4473  */
 4474 
 4475 void
 4476 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 4477 {
 4478         struct rwlock *lock;
 4479 
 4480         lock = NULL;
 4481         rw_rlock(&pvh_global_lock);
 4482         PMAP_LOCK(pmap);
 4483         (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
 4484         if (lock != NULL)
 4485                 rw_wunlock(lock);
 4486         rw_runlock(&pvh_global_lock);
 4487         PMAP_UNLOCK(pmap);
 4488 }
 4489 
 4490 static vm_page_t
 4491 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
 4492     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
 4493 {
 4494         struct spglist free;
 4495         pt_entry_t *pte, PG_V;
 4496         vm_paddr_t pa;
 4497 
 4498         KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
 4499             (m->oflags & VPO_UNMANAGED) != 0,
 4500             ("pmap_enter_quick_locked: managed mapping within the clean submap"));
 4501         PG_V = pmap_valid_bit(pmap);
 4502         rw_assert(&pvh_global_lock, RA_LOCKED);
 4503         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4504 
 4505         /*
 4506          * In the case that a page table page is not
 4507          * resident, we are creating it here.
 4508          */
 4509         if (va < VM_MAXUSER_ADDRESS) {
 4510                 vm_pindex_t ptepindex;
 4511                 pd_entry_t *ptepa;
 4512 
 4513                 /*
 4514                  * Calculate pagetable page index
 4515                  */
 4516                 ptepindex = pmap_pde_pindex(va);
 4517                 if (mpte && (mpte->pindex == ptepindex)) {
 4518                         mpte->wire_count++;
 4519                 } else {
 4520                         /*
 4521                          * Get the page directory entry
 4522                          */
 4523                         ptepa = pmap_pde(pmap, va);
 4524 
 4525                         /*
 4526                          * If the page table page is mapped, we just increment
 4527                          * the hold count, and activate it.  Otherwise, we
 4528                          * attempt to allocate a page table page.  If this
 4529                          * attempt fails, we don't retry.  Instead, we give up.
 4530                          */
 4531                         if (ptepa && (*ptepa & PG_V) != 0) {
 4532                                 if (*ptepa & PG_PS)
 4533                                         return (NULL);
 4534                                 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
 4535                                 mpte->wire_count++;
 4536                         } else {
 4537                                 /*
 4538                                  * Pass NULL instead of the PV list lock
 4539                                  * pointer, because we don't intend to sleep.
 4540                                  */
 4541                                 mpte = _pmap_allocpte(pmap, ptepindex, NULL);
 4542                                 if (mpte == NULL)
 4543                                         return (mpte);
 4544                         }
 4545                 }
 4546                 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
 4547                 pte = &pte[pmap_pte_index(va)];
 4548         } else {
 4549                 mpte = NULL;
 4550                 pte = vtopte(va);
 4551         }
 4552         if (*pte) {
 4553                 if (mpte != NULL) {
 4554                         mpte->wire_count--;
 4555                         mpte = NULL;
 4556                 }
 4557                 return (mpte);
 4558         }
 4559 
 4560         /*
 4561          * Enter on the PV list if part of our managed memory.
 4562          */
 4563         if ((m->oflags & VPO_UNMANAGED) == 0 &&
 4564             !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
 4565                 if (mpte != NULL) {
 4566                         SLIST_INIT(&free);
 4567                         if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
 4568                                 pmap_invalidate_page(pmap, va);
 4569                                 pmap_free_zero_pages(&free);
 4570                         }
 4571                         mpte = NULL;
 4572                 }
 4573                 return (mpte);
 4574         }
 4575 
 4576         /*
 4577          * Increment counters
 4578          */
 4579         pmap_resident_count_inc(pmap, 1);
 4580 
 4581         pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0);
 4582         if ((prot & VM_PROT_EXECUTE) == 0)
 4583                 pa |= pg_nx;
 4584 
 4585         /*
 4586          * Now validate mapping with RO protection
 4587          */
 4588         if ((m->oflags & VPO_UNMANAGED) != 0)
 4589                 pte_store(pte, pa | PG_V | PG_U);
 4590         else
 4591                 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
 4592         return (mpte);
 4593 }
 4594 
 4595 /*
 4596  * Enter a transient kernel mapping of physical address "pa" at page slot
 4597  * "i" of the crash dump window.  Meant to be used by panic dumps only.
 4598  */
 4599 void *
 4600 pmap_kenter_temporary(vm_paddr_t pa, int i)
 4601 {
 4602         vm_offset_t slot_va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
 4603 
 4604         pmap_kenter(slot_va, pa);
 4605         invlpg(slot_va);
 4606         /* The base of the window is returned, not the slot just mapped. */
 4607         return ((void *)crashdumpmap);
 4608 }
 4609 
 4610 /*
 4611  * This code maps large physical mmap regions into the
 4612  * processor address space.  Note that some shortcuts
 4613  * are taken, but the code works.
       *
       * The object must be write locked and is asserted to be of type
       * OBJT_DEVICE or OBJT_SG.  Mappings are created only when "addr" and
       * "size" are both multiples of NBPDR, superpages are enabled for
       * "pmap", vm_object_populate() succeeds for the range, and the pages
       * beginning at "pindex" start on a 2MB physical boundary, are
       * physically contiguous, and share a single PAT mode.  If any of these
       * tests fails, the function returns without mapping anything.
 4614  */
 4615 void
 4616 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
 4617     vm_pindex_t pindex, vm_size_t size)
 4618 {
 4619         pd_entry_t *pde;
 4620         pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 4621         vm_paddr_t pa, ptepa;
 4622         vm_page_t p, pdpg;
 4623         int pat_mode;
 4624 
              /* Per-pmap PTE bit values, as reported by the pmap_*_bit() helpers. */
 4625         PG_A = pmap_accessed_bit(pmap);
 4626         PG_M = pmap_modified_bit(pmap);
 4627         PG_V = pmap_valid_bit(pmap);
 4628         PG_RW = pmap_rw_bit(pmap);
 4629 
 4630         VM_OBJECT_ASSERT_WLOCKED(object);
 4631         KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
 4632             ("pmap_object_init_pt: non-device object"));
 4633         if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
 4634                 if (!pmap_ps_enabled(pmap))
 4635                         return;
 4636                 if (!vm_object_populate(object, pindex, pindex + atop(size)))
 4637                         return;
 4638                 p = vm_page_lookup(object, pindex);
 4639                 KASSERT(p->valid == VM_PAGE_BITS_ALL,
 4640                     ("pmap_object_init_pt: invalid page %p", p));
                      /* The first page's PAT mode is the reference for the rest. */
 4641                 pat_mode = p->md.pat_mode;
 4642 
 4643                 /*
 4644                  * Abort the mapping if the first page is not physically
 4645                  * aligned to a 2MB page boundary.
 4646                  */
 4647                 ptepa = VM_PAGE_TO_PHYS(p);
 4648                 if (ptepa & (NBPDR - 1))
 4649                         return;
 4650 
 4651                 /*
 4652                  * Skip the first page.  Abort the mapping if the rest of
 4653                  * the pages are not physically contiguous or have differing
 4654                  * memory attributes.
 4655                  */
 4656                 p = TAILQ_NEXT(p, listq);
 4657                 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
 4658                     pa += PAGE_SIZE) {
 4659                         KASSERT(p->valid == VM_PAGE_BITS_ALL,
 4660                             ("pmap_object_init_pt: invalid page %p", p));
 4661                         if (pa != VM_PAGE_TO_PHYS(p) ||
 4662                             pat_mode != p->md.pat_mode)
 4663                                 return;
 4664                         p = TAILQ_NEXT(p, listq);
 4665                 }
 4666 
 4667                 /*
 4668                  * Map using 2MB pages.  Since "ptepa" is 2M aligned and
 4669                  * "size" is a multiple of 2M, adding the PAT setting to "pa"
 4670                  * will not affect the termination of this loop.
 4671                  */ 
 4672                 PMAP_LOCK(pmap);
 4673                 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
 4674                     pa < ptepa + size; pa += NBPDR) {
 4675                         pdpg = pmap_allocpde(pmap, addr, NULL);
 4676                         if (pdpg == NULL) {
 4677                                 /*
 4678                                  * The creation of mappings below is only an
 4679                                  * optimization.  If a page directory page
 4680                                  * cannot be allocated without blocking,
 4681                                  * continue on to the next mapping rather than
 4682                                  * blocking.
 4683                                  */
 4684                                 addr += NBPDR;
 4685                                 continue;
 4686                         }
 4687                         pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
 4688                         pde = &pde[pmap_pde_index(addr)];
 4689                         if ((*pde & PG_V) == 0) {
                                      /*
                                       * Install a user-accessible, writable 2MB
                                       * mapping with the accessed and modified
                                       * bits already set, and account for the
                                       * NBPDR / PAGE_SIZE pages it covers.
                                       */
 4690                                 pde_store(pde, pa | PG_PS | PG_M | PG_A |
 4691                                     PG_U | PG_RW | PG_V);
 4692                                 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
 4693                                 atomic_add_long(&pmap_pde_mappings, 1);
 4694                         } else {
 4695                                 /* Continue on if the PDE is already valid. */
                                      /*
                                       * NOTE(review): the decrement presumably
                                       * gives back the reference obtained from
                                       * pmap_allocpde() above -- confirm.
                                       */
 4696                                 pdpg->wire_count--;
 4697                                 KASSERT(pdpg->wire_count > 0,
 4698                                     ("pmap_object_init_pt: missing reference "
 4699                                     "to page directory page, va: 0x%lx", addr));
 4700                         }
 4701                         addr += NBPDR;
 4702                 }
 4703                 PMAP_UNLOCK(pmap);
 4704         }
 4705 }
 4706 
 4707 /*
 4708  *      Clear the wired attribute from the mappings for the specified range of
 4709  *      addresses in the given pmap.  Every valid mapping within that range
 4710  *      must have the wired attribute set.  In contrast, invalid mappings
 4711  *      cannot have the wired attribute set, so they are ignored.
 4712  *
 4713  *      The wired attribute of the page table entry is not a hardware feature,
 4714  *      so there is no need to invalidate any TLB entries.
       *
       *      The range is [sva, eva).  A valid mapping in the range that lacks
       *      PG_W causes a panic, as does a failed demotion of a 2MB mapping
       *      that is only partly covered by the range.
 4715  */
 4716 void
 4717 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 4718 {
 4719         vm_offset_t va_next;
 4720         pml4_entry_t *pml4e;
 4721         pdp_entry_t *pdpe;
 4722         pd_entry_t *pde;
 4723         pt_entry_t *pte, PG_V;
 4724         boolean_t pv_lists_locked;
 4725 
 4726         PG_V = pmap_valid_bit(pmap);
 4727         pv_lists_locked = FALSE;
 4728 resume:
              /*
               * Reached initially and again after blocking on pvh_global_lock
               * below.  "sva" is advanced only by the loops, so the scan picks
               * up at the address that triggered the retry.
               */
 4729         PMAP_LOCK(pmap);
 4730         for (; sva < eva; sva = va_next) {
 4731                 pml4e = pmap_pml4e(pmap, sva);
 4732                 if ((*pml4e & PG_V) == 0) {
 4733                         va_next = (sva + NBPML4) & ~PML4MASK;
                              /* A wrapped address terminates the scan. */
 4734                         if (va_next < sva)
 4735                                 va_next = eva;
 4736                         continue;
 4737                 }
 4738                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 4739                 if ((*pdpe & PG_V) == 0) {
 4740                         va_next = (sva + NBPDP) & ~PDPMASK;
 4741                         if (va_next < sva)
 4742                                 va_next = eva;
 4743                         continue;
 4744                 }
 4745                 va_next = (sva + NBPDR) & ~PDRMASK;
 4746                 if (va_next < sva)
 4747                         va_next = eva;
 4748                 pde = pmap_pdpe_to_pde(pdpe, sva);
 4749                 if ((*pde & PG_V) == 0)
 4750                         continue;
 4751                 if ((*pde & PG_PS) != 0) {
 4752                         if ((*pde & PG_W) == 0)
 4753                                 panic("pmap_unwire: pde %#jx is missing PG_W",
 4754                                     (uintmax_t)*pde);
 4755 
 4756                         /*
 4757                          * Are we unwiring the entire large page?  If not,
 4758                          * demote the mapping and fall through.
 4759                          */
 4760                         if (sva + NBPDR == va_next && eva >= va_next) {
 4761                                 atomic_clear_long(pde, PG_W);
 4762                                 pmap->pm_stats.wired_count -= NBPDR /
 4763                                     PAGE_SIZE;
 4764                                 continue;
 4765                         } else {
                                      /*
                                       * pvh_global_lock is acquired only once a
                                       * demotion is actually needed.  The flag
                                       * is set before the try-lock so that the
                                       * pass restarted by "goto resume" does not
                                       * try to take the lock a second time.
                                       */
 4766                                 if (!pv_lists_locked) {
 4767                                         pv_lists_locked = TRUE;
 4768                                         if (!rw_try_rlock(&pvh_global_lock)) {
 4769                                                 PMAP_UNLOCK(pmap);
 4770                                                 rw_rlock(&pvh_global_lock);
 4771                                                 /* Repeat sva. */
 4772                                                 goto resume;
 4773                                         }
 4774                                 }
 4775                                 if (!pmap_demote_pde(pmap, pde, sva))
 4776                                         panic("pmap_unwire: demotion failed");
 4777                         }
 4778                 }
                      /* Do not run past the end of the requested range. */
 4779                 if (va_next > eva)
 4780                         va_next = eva;
 4781                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 4782                     sva += PAGE_SIZE) {
 4783                         if ((*pte & PG_V) == 0)
 4784                                 continue;
 4785                         if ((*pte & PG_W) == 0)
 4786                                 panic("pmap_unwire: pte %#jx is missing PG_W",
 4787                                     (uintmax_t)*pte);
 4788 
 4789                         /*
 4790                          * PG_W must be cleared atomically.  Although the pmap
 4791                          * lock synchronizes access to PG_W, another processor
 4792                          * could be setting PG_M and/or PG_A concurrently.
 4793                          */
 4794                         atomic_clear_long(pte, PG_W);
 4795                         pmap->pm_stats.wired_count--;
 4796                 }
 4797         }
              /* Release pvh_global_lock only if a demotion made us take it. */
 4798         if (pv_lists_locked)
 4799                 rw_runlock(&pvh_global_lock);
 4800         PMAP_UNLOCK(pmap);
 4801 }
 4802 
 4803 /*
 4804  *      Copy the range specified by src_addr/len
 4805  *      from the source map to the range dst_addr/len
 4806  *      in the destination map.
 4807  *
 4808  *      This routine is only advisory and need not do anything.
       *
       *      Nothing is copied unless dst_addr equals src_addr, both pmaps
       *      have the same pm_type, and dst_pmap does not emulate A/D bits.
       *      Only managed 4KB mappings, and 2MB mappings that lie entirely
       *      within the range, are candidates; a destination entry is written
       *      only if it is currently zero.  The copy is abandoned as soon as
       *      a page table page or PV entry cannot be obtained.
 4809  */
 4810 
 4811 void
 4812 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
 4813     vm_offset_t src_addr)
 4814 {
 4815         struct rwlock *lock;
 4816         struct spglist free;
 4817         vm_offset_t addr;
 4818         vm_offset_t end_addr = src_addr + len;
 4819         vm_offset_t va_next;
 4820         pt_entry_t PG_A, PG_M, PG_V;
 4821 
 4822         if (dst_addr != src_addr)
 4823                 return;
 4824 
 4825         if (dst_pmap->pm_type != src_pmap->pm_type)
 4826                 return;
 4827 
 4828         /*
 4829          * EPT page table entries that require emulation of A/D bits are
 4830          * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
 4831          * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
 4832          * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
 4833          * implementations flag an EPT misconfiguration for exec-only
 4834          * mappings we skip this function entirely for emulated pmaps.
 4835          */
 4836         if (pmap_emulate_ad_bits(dst_pmap))
 4837                 return;
 4838 
              /*
               * "lock" is handed to the PV insertion helpers below; whatever
               * they leave in it is write-unlocked at "out".  The two pmaps
               * are always locked in ascending address order.
               */
 4839         lock = NULL;
 4840         rw_rlock(&pvh_global_lock);
 4841         if (dst_pmap < src_pmap) {
 4842                 PMAP_LOCK(dst_pmap);
 4843                 PMAP_LOCK(src_pmap);
 4844         } else {
 4845                 PMAP_LOCK(src_pmap);
 4846                 PMAP_LOCK(dst_pmap);
 4847         }
 4848 
 4849         PG_A = pmap_accessed_bit(dst_pmap);
 4850         PG_M = pmap_modified_bit(dst_pmap);
 4851         PG_V = pmap_valid_bit(dst_pmap);
 4852 
 4853         for (addr = src_addr; addr < end_addr; addr = va_next) {
 4854                 pt_entry_t *src_pte, *dst_pte;
 4855                 vm_page_t dstmpde, dstmpte, srcmpte;
 4856                 pml4_entry_t *pml4e;
 4857                 pdp_entry_t *pdpe;
 4858                 pd_entry_t srcptepaddr, *pde;
 4859 
 4860                 KASSERT(addr < UPT_MIN_ADDRESS,
 4861                     ("pmap_copy: invalid to pmap_copy page tables"));
 4862 
                      /*
                       * Walk the source page table.  An invalid upper-level
                       * entry skips to the next boundary at that level; a
                       * wrapped address ends the scan.
                       */
 4863                 pml4e = pmap_pml4e(src_pmap, addr);
 4864                 if ((*pml4e & PG_V) == 0) {
 4865                         va_next = (addr + NBPML4) & ~PML4MASK;
 4866                         if (va_next < addr)
 4867                                 va_next = end_addr;
 4868                         continue;
 4869                 }
 4870 
 4871                 pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
 4872                 if ((*pdpe & PG_V) == 0) {
 4873                         va_next = (addr + NBPDP) & ~PDPMASK;
 4874                         if (va_next < addr)
 4875                                 va_next = end_addr;
 4876                         continue;
 4877                 }
 4878 
 4879                 va_next = (addr + NBPDR) & ~PDRMASK;
 4880                 if (va_next < addr)
 4881                         va_next = end_addr;
 4882 
 4883                 pde = pmap_pdpe_to_pde(pdpe, addr);
 4884                 srcptepaddr = *pde;
 4885                 if (srcptepaddr == 0)
 4886                         continue;
 4887                         
 4888                 if (srcptepaddr & PG_PS) {
                              /*
                               * A 2MB mapping is copied only when it is aligned
                               * and lies entirely within the range.  From here
                               * on "pde" refers to the destination entry.
                               */
 4889                         if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
 4890                                 continue;
 4891                         dstmpde = pmap_allocpde(dst_pmap, addr, NULL);
 4892                         if (dstmpde == NULL)
 4893                                 break;
 4894                         pde = (pd_entry_t *)
 4895                             PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
 4896                         pde = &pde[pmap_pde_index(addr)];
 4897                         if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
 4898                             pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
 4899                             PG_PS_FRAME, &lock))) {
                                      /* The copy is never wired. */
 4900                                 *pde = srcptepaddr & ~PG_W;
 4901                                 pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE);
 4902                         } else
 4903                                 dstmpde->wire_count--;
 4904                         continue;
 4905                 }
 4906 
 4907                 srcptepaddr &= PG_FRAME;
 4908                 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
 4909                 KASSERT(srcmpte->wire_count > 0,
 4910                     ("pmap_copy: source page table page is unused"));
 4911 
 4912                 if (va_next > end_addr)
 4913                         va_next = end_addr;
 4914 
 4915                 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
 4916                 src_pte = &src_pte[pmap_pte_index(addr)];
 4917                 dstmpte = NULL;
 4918                 while (addr < va_next) {
 4919                         pt_entry_t ptetemp;
 4920                         ptetemp = *src_pte;
 4921                         /*
 4922                          * we only virtual copy managed pages
 4923                          */
 4924                         if ((ptetemp & PG_MANAGED) != 0) {
                                      /*
                                       * Reuse the destination page table page
                                       * from the previous iteration when it
                                       * still covers "addr", taking one more
                                       * reference on it; otherwise allocate.
                                       */
 4925                                 if (dstmpte != NULL &&
 4926                                     dstmpte->pindex == pmap_pde_pindex(addr))
 4927                                         dstmpte->wire_count++;
 4928                                 else if ((dstmpte = pmap_allocpte(dst_pmap,
 4929                                     addr, NULL)) == NULL)
 4930                                         goto out;
 4931                                 dst_pte = (pt_entry_t *)
 4932                                     PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
 4933                                 dst_pte = &dst_pte[pmap_pte_index(addr)];
 4934                                 if (*dst_pte == 0 &&
 4935                                     pmap_try_insert_pv_entry(dst_pmap, addr,
 4936                                     PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
 4937                                     &lock)) {
 4938                                         /*
 4939                                          * Clear the wired, modified, and
 4940                                          * accessed (referenced) bits
 4941                                          * during the copy.
 4942                                          */
 4943                                         *dst_pte = ptetemp & ~(PG_W | PG_M |
 4944                                             PG_A);
 4945                                         pmap_resident_count_inc(dst_pmap, 1);
 4946                                 } else {
                                              /*
                                               * The entry is occupied or no PV
                                               * entry was available: give back
                                               * the page table page reference
                                               * and stop copying altogether.
                                               */
 4947                                         SLIST_INIT(&free);
 4948                                         if (pmap_unwire_ptp(dst_pmap, addr,
 4949                                             dstmpte, &free)) {
 4950                                                 pmap_invalidate_page(dst_pmap,
 4951                                                     addr);
 4952                                                 pmap_free_zero_pages(&free);
 4953                                         }
 4954                                         goto out;
 4955                                 }
                                      /*
                                       * Leave this page table page once the
                                       * destination holds at least as many
                                       * references as the source.
                                       */
 4956                                 if (dstmpte->wire_count >= srcmpte->wire_count)
 4957                                         break;
 4958                         }
 4959                         addr += PAGE_SIZE;
 4960                         src_pte++;
 4961                 }
 4962         }
 4963 out:
 4964         if (lock != NULL)
 4965                 rw_wunlock(lock);
 4966         rw_runlock(&pvh_global_lock);
 4967         PMAP_UNLOCK(src_pmap);
 4968         PMAP_UNLOCK(dst_pmap);
 4969 }
 4970 
 4971 /*
 4972  *      pmap_zero_page clears the whole of page "m": the page is
 4973  *      addressed through PHYS_TO_DMAP() and handed to pagezero().
 4974  */
 4975 void
 4976 pmap_zero_page(vm_page_t m)
 4977 {
 4978         vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
 4979 
 4980         pagezero((void *)PHYS_TO_DMAP(pa));
 4981 }
 4982 
 4983 /*
 4984  *      pmap_zero_page_area clears "size" bytes starting "off" bytes into
 4985  *      page "m", addressing the page through PHYS_TO_DMAP().
 4986  *
 4987  *      off and size may not cover an area beyond a single hardware page.
 4988  */
 4989 void
 4990 pmap_zero_page_area(vm_page_t m, int off, int size)
 4991 {
 4992         char *base = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 4993 
 4994         if (off != 0 || size != PAGE_SIZE)
 4995                 bzero(base + off, size);
 4996         else
 4997                 pagezero(base);         /* the request spans the whole page */
 4998 }
 4999 
 5000 /*
 5001  *      pmap_zero_page_idle clears the whole of page "m": the page is
 5002  *      addressed through PHYS_TO_DMAP() and handed to pagezero().  This
 5003  *      is intended to be called from the vm_pagezero process only and
 5004  *      outside of Giant.
 5005  */
 5006 void
 5007 pmap_zero_page_idle(vm_page_t m)
 5008 {
 5009         vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
 5010 
 5011         pagezero((void *)PHYS_TO_DMAP(pa));
 5012 }
 5013 
 5014 /*
 5015  *      pmap_copy_page copies the contents of page "msrc" into page
 5016  *      "mdst".  Both pages are addressed through PHYS_TO_DMAP(), and
 5017  *      the copy itself is a single pagecopy() call from source to
 5018  *      destination.
 5019  */
 5020 void
 5021 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
 5022 {
 5023         void *from = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
 5024         void *to = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
 5025 
 5026         pagecopy(from, to);
 5027 }
 5028 
 5029 int unmapped_buf_allowed = 1;     /* NOTE(review): presumably permits unmapped bufs; confirm */
 5030 
      /*
       *      pmap_copy_pages copies "xfersize" bytes from the pages in "ma" to
       *      the pages in "mb".  "a_offset" and "b_offset" are byte offsets
       *      measured from the start of ma[0] and mb[0] respectively and need
       *      not be page aligned.  Each iteration copies the largest piece that
       *      stays within one source page and one destination page.
       *
       *      A page that lies within the direct map is accessed through
       *      PHYS_TO_DMAP().  Any other page is temporarily entered at cpage_a
       *      (source) or cpage_b (destination) with cpage_lock held and the
       *      thread pinned for the duration of that piece of the copy.
       */
 5031 void
 5032 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
 5033     vm_offset_t b_offset, int xfersize)
 5034 {
 5035         void *a_cp, *b_cp;
 5036         vm_page_t m_a, m_b;
 5037         vm_paddr_t p_a, p_b;
 5038         pt_entry_t *pte;
 5039         vm_offset_t a_pg_offset, b_pg_offset;
 5040         int cnt;
 5041         boolean_t pinned;
 5042 
 5043         /*
 5044          * NB:  The sequence of updating a page table followed by accesses
 5045          * to the corresponding pages used in the !DMAP case is subject to
 5046          * the situation described in the "AMD64 Architecture Programmer's
 5047          * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
 5048          * Coherency Considerations".  Therefore, issuing the INVLPG right
 5049          * after modifying the PTE bits is crucial.
 5050          */
 5051         pinned = FALSE;
 5052         while (xfersize > 0) {
 5053                 a_pg_offset = a_offset & PAGE_MASK;
 5054                 m_a = ma[a_offset >> PAGE_SHIFT];
 5055                 p_a = m_a->phys_addr;
 5056                 b_pg_offset = b_offset & PAGE_MASK;
 5057                 m_b = mb[b_offset >> PAGE_SHIFT];
 5058                 p_b = m_b->phys_addr;
 5059                 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
 5060                 cnt = min(cnt, PAGE_SIZE - b_pg_offset);

                      /*
                       * "p_a" and "p_b" are physical addresses, so they are
                       * tested against dmaplimit, the end of the physical
                       * range that PHYS_TO_DMAP() can translate, and not
                       * against the virtual bounds of the direct map.
                       */
 5061                 if (__predict_false(p_a >= dmaplimit)) {
 5063                         mtx_lock(&cpage_lock);
 5064                         sched_pin();
 5065                         pinned = TRUE;
                              /* Read-only window onto the source page. */
 5066                         pte = vtopte(cpage_a);
 5067                         *pte = p_a | X86_PG_A | X86_PG_V |
 5068                             pmap_cache_bits(kernel_pmap, m_a->md.pat_mode, 0);
 5069                         invlpg(cpage_a);
 5070                         a_cp = (char *)cpage_a + a_pg_offset;
 5071                 } else {
 5072                         a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
 5073                 }
 5074                 if (__predict_false(p_b >= dmaplimit)) {
                              /* The source side may already hold the lock. */
 5076                         if (!pinned) {
 5077                                 mtx_lock(&cpage_lock);
 5078                                 sched_pin();
 5079                                 pinned = TRUE;
 5080                         }
                              /* Writable window onto the destination page. */
 5081                         pte = vtopte(cpage_b);
 5082                         *pte = p_b | X86_PG_A | X86_PG_M | X86_PG_RW |
 5083                             X86_PG_V | pmap_cache_bits(kernel_pmap,
 5084                             m_b->md.pat_mode, 0);
 5085                         invlpg(cpage_b);
 5086                         b_cp = (char *)cpage_b + b_pg_offset;
 5087                 } else {
 5088                         b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
 5089                 }
 5090                 bcopy(a_cp, b_cp, cnt);
 5091                 if (__predict_false(pinned)) {
 5092                         sched_unpin();
 5093                         mtx_unlock(&cpage_lock);
 5094                         pinned = FALSE;
 5095                 }
 5096                 a_offset += cnt;
 5097                 b_offset += cnt;
 5098                 xfersize -= cnt;
 5099         }
 5100 }
 5101 
 5102 /*
 5103  * Reports whether "pmap" owns one of the first 16 PV entries examined
 5104  * for the page.  The page's own PV list is scanned first, then, for a
 5105  * non-fictitious page, the list reached through pa_to_pvh().  The limit
 5106  * of 16 may be changed upwards or downwards in the future; it is only
 5107  * necessary that TRUE be returned for a small subset of pmaps.
 5108  */
 5109 boolean_t
 5110 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 5111 {
 5112         struct rwlock *pv_lock;
 5113         struct md_page *pvh_head;
 5114         pv_entry_t ent;
 5115         boolean_t found = FALSE;
 5116         int examined = 0;
 5117 
 5118         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 5119             ("pmap_page_exists_quick: page %p is not managed", m));
 5120         rw_rlock(&pvh_global_lock);
 5121         pv_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 5122         rw_rlock(pv_lock);
 5123         /* Pass one: the PV list of the page itself. */
 5124         TAILQ_FOREACH(ent, &m->md.pv_list, pv_next) {
 5125                 if (PV_PMAP(ent) == pmap) {
 5126                         found = TRUE;
 5127                         goto done;
 5128                 }
 5129                 if (++examined >= 16)
 5130                         goto done;
 5131         }
 5132         if ((m->flags & PG_FICTITIOUS) != 0)
 5133                 goto done;
 5134         /* Pass two: the PV list obtained from pa_to_pvh(). */
 5135         pvh_head = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 5136         TAILQ_FOREACH(ent, &pvh_head->pv_list, pv_next) {
 5137                 if (PV_PMAP(ent) == pmap) {
 5138                         found = TRUE;
 5139                         break;
 5140                 }
 5141                 if (++examined >= 16)
 5142                         break;
 5143         }
 5144 done:
 5145         rw_runlock(pv_lock);
 5146         rw_runlock(&pvh_global_lock);
 5147         return (found);
 5148 }
 5149 
 5150 /*
 5151  *      pmap_page_wired_mappings:
 5152  *
 5153  *      Return the number of managed mappings to the given physical page
 5154  *      that are wired.
       *
       *      An unmanaged page always yields 0.  Otherwise the page's own PV
       *      list is examined and, for a non-fictitious page, so is the PV
       *      list reached through pa_to_pvh(); an entry counts when PG_W is
       *      set in the page table entry it refers to.
 5155  */
 5156 int
 5157 pmap_page_wired_mappings(vm_page_t m)
 5158 {
 5159         struct rwlock *lock;
 5160         struct md_page *pvh;
 5161         pmap_t pmap;
 5162         pt_entry_t *pte;
 5163         pv_entry_t pv;
 5164         int count, md_gen, pvh_gen;
 5165 
 5166         if ((m->oflags & VPO_UNMANAGED) != 0)
 5167                 return (0);
 5168         rw_rlock(&pvh_global_lock);
 5169         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 5170         rw_rlock(lock);
 5171 restart:
              /* Every restart discards the partial count and rescans both lists. */
 5172         count = 0;
 5173         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 5174                 pmap = PV_PMAP(pv);
                      /*
                       * If the pmap lock cannot be taken without blocking,
                       * drop the PV list lock, block on the pmap lock, and
                       * retake the PV list lock.  A changed generation
                       * count means the list was modified in the meantime,
                       * so the scan starts over.
                       */
 5175                 if (!PMAP_TRYLOCK(pmap)) {
 5176                         md_gen = m->md.pv_gen;
 5177                         rw_runlock(lock);
 5178                         PMAP_LOCK(pmap);
 5179                         rw_rlock(lock);
 5180                         if (md_gen != m->md.pv_gen) {
 5181                                 PMAP_UNLOCK(pmap);
 5182                                 goto restart;
 5183                         }
 5184                 }
 5185                 pte = pmap_pte(pmap, pv->pv_va);
 5186                 if ((*pte & PG_W) != 0)
 5187                         count++;
 5188                 PMAP_UNLOCK(pmap);
 5189         }
 5190         if ((m->flags & PG_FICTITIOUS) == 0) {
 5191                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 5192                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 5193                         pmap = PV_PMAP(pv);
                              /*
                               * Same protocol as above, except that both
                               * generation counts are checked here.
                               */
 5194                         if (!PMAP_TRYLOCK(pmap)) {
 5195                                 md_gen = m->md.pv_gen;
 5196                                 pvh_gen = pvh->pv_gen;
 5197                                 rw_runlock(lock);
 5198                                 PMAP_LOCK(pmap);
 5199                                 rw_rlock(lock);
 5200                                 if (md_gen != m->md.pv_gen ||
 5201                                     pvh_gen != pvh->pv_gen) {
 5202                                         PMAP_UNLOCK(pmap);
 5203                                         goto restart;
 5204                                 }
 5205                         }
                              /* Entries on this list are looked up with pmap_pde(). */
 5206                         pte = pmap_pde(pmap, pv->pv_va);
 5207                         if ((*pte & PG_W) != 0)
 5208                                 count++;
 5209                         PMAP_UNLOCK(pmap);
 5210                 }
 5211         }
 5212         rw_runlock(lock);
 5213         rw_runlock(&pvh_global_lock);
 5214         return (count);
 5215 }
 5216 
 5217 /*
 5218  * Reports TRUE when a PV entry exists for the page itself or for the
 5219  * 2mpage containing it.  Unmanaged pages always report FALSE.
 5220  */
 5221 boolean_t
 5222 pmap_page_is_mapped(vm_page_t m)
 5223 {
 5224         struct rwlock *pv_lock;
 5225         boolean_t mapped;
 5226 
 5227         if ((m->oflags & VPO_UNMANAGED) != 0)
 5228                 return (FALSE);
 5229         rw_rlock(&pvh_global_lock);
 5230         pv_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 5231         rw_rlock(pv_lock);
 5232         mapped = !TAILQ_EMPTY(&m->md.pv_list);
 5233         if (!mapped && (m->flags & PG_FICTITIOUS) == 0)
 5234                 mapped = !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list);
 5235         rw_runlock(pv_lock);
 5236         rw_runlock(&pvh_global_lock);
 5237         return (mapped);
 5238 }
 5239 
 5240 /*
 5241  * Destroy all managed, non-wired mappings in the given user-space
 5242  * pmap.  This pmap cannot be active on any processor besides the
 5243  * caller.
 5244  *                                                                                
 5245  * This function cannot be applied to the kernel pmap.  Moreover, it
 5246  * is not intended for general use.  It is only to be used during
 5247  * process termination.  Consequently, it can be implemented in ways
 5248  * that make it faster than pmap_remove().  First, it can more quickly
 5249  * destroy mappings by iterating over the pmap's collection of PV
 5250  * entries, rather than searching the page table.  Second, it doesn't
 5251  * have to test and clear the page table entries atomically, because
 5252  * no processor is currently accessing the user address space.  In
 5253  * particular, a page table entry's dirty bit won't change state once
 5254  * this function starts.
 5255  */
 5256 void
 5257 pmap_remove_pages(pmap_t pmap)
 5258 {
 5259         pd_entry_t ptepde;
 5260         pt_entry_t *pte, tpte;
 5261         pt_entry_t PG_M, PG_RW, PG_V;
 5262         struct spglist free;
 5263         vm_page_t m, mpte, mt;
 5264         pv_entry_t pv;
 5265         struct md_page *pvh;
 5266         struct pv_chunk *pc, *npc;
 5267         struct rwlock *lock;
 5268         int64_t bit;
 5269         uint64_t inuse, bitmask;
 5270         int allfree, field, freed, idx;
 5271         boolean_t superpage;
 5272         vm_paddr_t pa;
 5273 
 5274         /*
 5275          * Assert that the given pmap is only active on the current
 5276          * CPU.  Unfortunately, we cannot block another CPU from
 5277          * activating the pmap while this function is executing.
 5278          */
 5279         KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
 5280 #ifdef INVARIANTS
 5281         {
 5282                 cpuset_t other_cpus;
 5283 
 5284                 other_cpus = all_cpus;
 5285                 critical_enter();
 5286                 CPU_CLR(PCPU_GET(cpuid), &other_cpus);
 5287                 CPU_AND(&other_cpus, &pmap->pm_active);
 5288                 critical_exit();
 5289                 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
 5290         }
 5291 #endif
 5292 
 5293         lock = NULL;
 5294         PG_M = pmap_modified_bit(pmap);
 5295         PG_V = pmap_valid_bit(pmap);
 5296         PG_RW = pmap_rw_bit(pmap);
 5297 
 5298         SLIST_INIT(&free);
 5299         rw_rlock(&pvh_global_lock);
 5300         PMAP_LOCK(pmap);
 5301         TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 5302                 allfree = 1;
 5303                 freed = 0;
 5304                 for (field = 0; field < _NPCM; field++) {
 5305                         inuse = ~pc->pc_map[field] & pc_freemask[field];
 5306                         while (inuse != 0) {
 5307                                 bit = bsfq(inuse);
 5308                                 bitmask = 1UL << bit;
 5309                                 idx = field * 64 + bit;
 5310                                 pv = &pc->pc_pventry[idx];
 5311                                 inuse &= ~bitmask;
 5312 
 5313                                 pte = pmap_pdpe(pmap, pv->pv_va);
 5314                                 ptepde = *pte;
 5315                                 pte = pmap_pdpe_to_pde(pte, pv->pv_va);
 5316                                 tpte = *pte;
 5317                                 if ((tpte & (PG_PS | PG_V)) == PG_V) {
 5318                                         superpage = FALSE;
 5319                                         ptepde = tpte;
 5320                                         pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
 5321                                             PG_FRAME);
 5322                                         pte = &pte[pmap_pte_index(pv->pv_va)];
 5323                                         tpte = *pte;
 5324                                 } else {
 5325                                         /*
 5326                                          * Keep track whether 'tpte' is a
 5327                                          * superpage explicitly instead of
 5328                                          * relying on PG_PS being set.
 5329                                          *
 5330                                          * This is because PG_PS is numerically
 5331                                          * identical to PG_PTE_PAT and thus a
 5332                                          * regular page could be mistaken for
 5333                                          * a superpage.
 5334                                          */
 5335                                         superpage = TRUE;
 5336                                 }
 5337 
 5338                                 if ((tpte & PG_V) == 0) {
 5339                                         panic("bad pte va %lx pte %lx",
 5340                                             pv->pv_va, tpte);
 5341                                 }
 5342 
 5343 /*
 5344  * We cannot remove wired pages from a process' mapping at this time
 5345  */
 5346                                 if (tpte & PG_W) {
 5347                                         allfree = 0;
 5348                                         continue;
 5349                                 }
 5350 
 5351                                 if (superpage)
 5352                                         pa = tpte & PG_PS_FRAME;
 5353                                 else
 5354                                         pa = tpte & PG_FRAME;
 5355 
 5356                                 m = PHYS_TO_VM_PAGE(pa);
 5357                                 KASSERT(m->phys_addr == pa,
 5358                                     ("vm_page_t %p phys_addr mismatch %016jx %016jx",
 5359                                     m, (uintmax_t)m->phys_addr,
 5360                                     (uintmax_t)tpte));
 5361 
 5362                                 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
 5363                                     m < &vm_page_array[vm_page_array_size],
 5364                                     ("pmap_remove_pages: bad tpte %#jx",
 5365                                     (uintmax_t)tpte));
 5366 
 5367                                 pte_clear(pte);
 5368 
 5369                                 /*
 5370                                  * Update the vm_page_t clean/reference bits.
 5371                                  */
 5372                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 5373                                         if (superpage) {
 5374                                                 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 5375                                                         vm_page_dirty(mt);
 5376                                         } else
 5377                                                 vm_page_dirty(m);
 5378                                 }
 5379 
 5380                                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 5381 
 5382                                 /* Mark free */
 5383                                 pc->pc_map[field] |= bitmask;
 5384                                 if (superpage) {
 5385                                         pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
 5386                                         pvh = pa_to_pvh(tpte & PG_PS_FRAME);
 5387                                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 5388                                         pvh->pv_gen++;
 5389                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
 5390                                                 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 5391                                                         if ((mt->aflags & PGA_WRITEABLE) != 0 &&
 5392                                                             TAILQ_EMPTY(&mt->md.pv_list))
 5393                                                                 vm_page_aflag_clear(mt, PGA_WRITEABLE);
 5394                                         }
 5395                                         mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
 5396                                         if (mpte != NULL) {
 5397                                                 pmap_remove_pt_page(pmap, mpte);
 5398                                                 pmap_resident_count_dec(pmap, 1);
 5399                                                 KASSERT(mpte->wire_count == NPTEPG,
 5400                                                     ("pmap_remove_pages: pte page wire count error"));
 5401                                                 mpte->wire_count = 0;
 5402                                                 pmap_add_delayed_free_list(mpte, &free, FALSE);
 5403                                                 atomic_subtract_int(&cnt.v_wire_count, 1);
 5404                                         }
 5405                                 } else {
 5406                                         pmap_resident_count_dec(pmap, 1);
 5407                                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 5408                                         m->md.pv_gen++;
 5409                                         if ((m->aflags & PGA_WRITEABLE) != 0 &&
 5410                                             TAILQ_EMPTY(&m->md.pv_list) &&
 5411                                             (m->flags & PG_FICTITIOUS) == 0) {
 5412                                                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 5413                                                 if (TAILQ_EMPTY(&pvh->pv_list))
 5414                                                         vm_page_aflag_clear(m, PGA_WRITEABLE);
 5415                                         }
 5416                                 }
 5417                                 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
 5418                                 freed++;
 5419                         }
 5420                 }
 5421                 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
 5422                 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
 5423                 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 5424                 if (allfree) {
 5425                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 5426                         free_pv_chunk(pc);
 5427                 }
 5428         }
 5429         if (lock != NULL)
 5430                 rw_wunlock(lock);
 5431         pmap_invalidate_all(pmap);
 5432         rw_runlock(&pvh_global_lock);
 5433         PMAP_UNLOCK(pmap);
 5434         pmap_free_zero_pages(&free);
 5435 }
 5436 
      /*
       * Report whether any mapping of the page "m" has the requested bits set.
       *
       * When "modified" is TRUE the mapping must have both the pmap's
       * read/write bit and its modified bit set.  When "accessed" is TRUE the
       * mapping must have both the pmap's valid bit and its accessed bit set.
       * The bits selected by both arguments are combined into one mask and an
       * entry matches only if every bit of the mask is set in it.
       *
       * The mappings on m's own pv list are examined through pmap_pte().
       * Unless the page is fictitious, the mappings on the pv list returned by
       * pa_to_pvh() are examined next through pmap_pde().
       *
       * Returns TRUE as soon as one entry matches and FALSE if none does.
       * pvh_global_lock and the page's pv list lock are both read-locked for
       * the duration of the scan and released before returning.
       */
 5437 static boolean_t
 5438 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
 5439 {
 5440         struct rwlock *lock;
 5441         pv_entry_t pv;
 5442         struct md_page *pvh;
 5443         pt_entry_t *pte, mask;
 5444         pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 5445         pmap_t pmap;
 5446         int md_gen, pvh_gen;
 5447         boolean_t rv;
 5448 
 5449         rv = FALSE;
 5450         rw_rlock(&pvh_global_lock);
 5451         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 5452         rw_rlock(lock);
 5453 restart:
 5454         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 5455                 pmap = PV_PMAP(pv);
                      /*
                       * If the pmap lock cannot be taken immediately, release
                       * the pv list lock, acquire the pmap lock, and then
                       * retake the pv list lock.  A changed generation count
                       * means the list was modified while it was unlocked, so
                       * the scan starts over.
                       */
 5456                 if (!PMAP_TRYLOCK(pmap)) {
 5457                         md_gen = m->md.pv_gen;
 5458                         rw_runlock(lock);
 5459                         PMAP_LOCK(pmap);
 5460                         rw_rlock(lock);
 5461                         if (md_gen != m->md.pv_gen) {
 5462                                 PMAP_UNLOCK(pmap);
 5463                                 goto restart;
 5464                         }
 5465                 }
 5466                 pte = pmap_pte(pmap, pv->pv_va);
                      /*
                       * The bit values are looked up per pmap, so the mask is
                       * rebuilt for every mapping.
                       */
 5467                 mask = 0;
 5468                 if (modified) {
 5469                         PG_M = pmap_modified_bit(pmap);
 5470                         PG_RW = pmap_rw_bit(pmap);
 5471                         mask |= PG_RW | PG_M;
 5472                 }
 5473                 if (accessed) {
 5474                         PG_A = pmap_accessed_bit(pmap);
 5475                         PG_V = pmap_valid_bit(pmap);
 5476                         mask |= PG_V | PG_A;
 5477                 }
 5478                 rv = (*pte & mask) == mask;
 5479                 PMAP_UNLOCK(pmap);
 5480                 if (rv)
 5481                         goto out;
 5482         }
 5483         if ((m->flags & PG_FICTITIOUS) == 0) {
 5484                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 5485                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 5486                         pmap = PV_PMAP(pv);
                              /*
                               * Same lock dance as above, but both generation
                               * counts are compared.  A change in either one
                               * restarts the whole scan from m's own pv list.
                               */
 5487                         if (!PMAP_TRYLOCK(pmap)) {
 5488                                 md_gen = m->md.pv_gen;
 5489                                 pvh_gen = pvh->pv_gen;
 5490                                 rw_runlock(lock);
 5491                                 PMAP_LOCK(pmap);
 5492                                 rw_rlock(lock);
 5493                                 if (md_gen != m->md.pv_gen ||
 5494                                     pvh_gen != pvh->pv_gen) {
 5495                                         PMAP_UNLOCK(pmap);
 5496                                         goto restart;
 5497                                 }
 5498                         }
                              /* The page directory entry itself is tested. */
 5499                         pte = pmap_pde(pmap, pv->pv_va);
 5500                         mask = 0;
 5501                         if (modified) {
 5502                                 PG_M = pmap_modified_bit(pmap);
 5503                                 PG_RW = pmap_rw_bit(pmap);
 5504                                 mask |= PG_RW | PG_M;
 5505                         }
 5506                         if (accessed) {
 5507                                 PG_A = pmap_accessed_bit(pmap);
 5508                                 PG_V = pmap_valid_bit(pmap);
 5509                                 mask |= PG_V | PG_A;
 5510                         }
 5511                         rv = (*pte & mask) == mask;
 5512                         PMAP_UNLOCK(pmap);
 5513                         if (rv)
 5514                                 goto out;
 5515                 }
 5516         }
 5517 out:
 5518         rw_runlock(lock);
 5519         rw_runlock(&pvh_global_lock);
 5520         return (rv);
 5521 }
 5522 
 5523 /*
 5524  *      pmap_is_modified:
 5525  *
 5526  *      Return whether or not the specified physical page was modified
 5527  *      in any physical maps.
       *
       *      The page must be managed and its object must be write-locked;
       *      both conditions are asserted.  TRUE is returned only when
       *      pmap_page_test_mappings() finds a mapping of the page with both
       *      the read/write and modified bits set.
 5528  */
 5529 boolean_t
 5530 pmap_is_modified(vm_page_t m)
 5531 {
 5532 
 5533         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 5534             ("pmap_is_modified: page %p is not managed", m));
 5535 
 5536         /*
 5537          * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
 5538          * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
 5539          * is clear, no PTEs can have PG_M set.
 5540          */
 5541         VM_OBJECT_ASSERT_WLOCKED(m->object);
 5542         if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
 5543                 return (FALSE);
              /* The shortcut does not apply, so inspect the mappings. */
 5544         return (pmap_page_test_mappings(m, FALSE, TRUE));
 5545 }
 5546 
 5547 /*
 5548  *      pmap_is_prefaultable:
 5549  *
 5550  *      Return whether or not the specified virtual address is eligible
 5551  *      for prefault.
       *
       *      TRUE is returned only when the page directory entry for "addr"
       *      exists, is valid, and does not have PG_PS set, and the page
       *      table entry it leads to is not valid.  In every other case the
       *      result is FALSE.  The pmap lock is held while the entries are
       *      read.
 5552  */
 5553 boolean_t
 5554 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 5555 {
 5556         pd_entry_t *pde;
 5557         pt_entry_t *pte, PG_V;
 5558         boolean_t rv;
 5559 
 5560         PG_V = pmap_valid_bit(pmap);
 5561         rv = FALSE;
 5562         PMAP_LOCK(pmap);
 5563         pde = pmap_pde(pmap, addr);
              /* Only a valid entry without PG_PS is followed to a PTE. */
 5564         if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
 5565                 pte = pmap_pde_to_pte(pde, addr);
 5566                 rv = (*pte & PG_V) == 0;
 5567         }
 5568         PMAP_UNLOCK(pmap);
 5569         return (rv);
 5570 }
 5571 
 5572 /*
 5573  *      pmap_is_referenced:
 5574  *
 5575  *      Return whether or not the specified physical page was referenced
 5576  *      in any physical maps.
       *
       *      The page must be managed (asserted).  TRUE is returned only when
       *      pmap_page_test_mappings() finds a mapping of the page with both
       *      the valid and accessed bits set.
 5577  */
 5578 boolean_t
 5579 pmap_is_referenced(vm_page_t m)
 5580 {
 5581 
 5582         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 5583             ("pmap_is_referenced: page %p is not managed", m));
 5584         return (pmap_page_test_mappings(m, TRUE, FALSE));
 5585 }
 5586 
 5587 /*
 5588  * Clear the write and modified bits in each of the given page's mappings.
       *
       * The page must be managed and its object must be write-locked; both
       * conditions are asserted.  The work is done in two passes under the
       * page's pv list lock:
       *
       *  1. Unless the page is fictitious, every mapping on the pa_to_pvh()
       *     list whose page directory entry is writable is passed to
       *     pmap_demote_pde_locked().  The result of the demotion is ignored.
       *  2. Every mapping on m's own pv list that has the read/write bit set
       *     has its read/write and modified bits cleared atomically.  If the
       *     modified bit was set the page is marked dirty, and the mapping is
       *     invalidated with pmap_invalidate_page().
       *
       * Finally PGA_WRITEABLE is cleared on the page.
 5589  */
 5590 void
 5591 pmap_remove_write(vm_page_t m)
 5592 {
 5593         struct md_page *pvh;
 5594         pmap_t pmap;
 5595         struct rwlock *lock;
 5596         pv_entry_t next_pv, pv;
 5597         pd_entry_t *pde;
 5598         pt_entry_t oldpte, *pte, PG_M, PG_RW;
 5599         vm_offset_t va;
 5600         int pvh_gen, md_gen;
 5601 
 5602         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 5603             ("pmap_remove_write: page %p is not managed", m));
 5604 
 5605         /*
 5606          * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
 5607          * set by another thread while the object is locked.  Thus,
 5608          * if PGA_WRITEABLE is clear, no page table entries need updating.
 5609          */
 5610         VM_OBJECT_ASSERT_WLOCKED(m->object);
 5611         if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
 5612                 return;
 5613         rw_rlock(&pvh_global_lock);
 5614         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 5615         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
              /*
               * Every jump back to this label has already dropped the pv list
               * lock, so it is taken afresh and both passes are repeated.
               */
 5616 retry_pv_loop:
 5617         rw_wlock(lock);
 5618         if ((m->flags & PG_FICTITIOUS) != 0)
 5619                 goto small_mappings;
 5620         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 5621                 pmap = PV_PMAP(pv);
                      /*
                       * If the pmap lock cannot be taken immediately, drop the
                       * pv list lock, acquire the pmap lock, and retake the pv
                       * list lock.  A changed generation count means the list
                       * was modified in between, so start again.
                       */
 5622                 if (!PMAP_TRYLOCK(pmap)) {
 5623                         pvh_gen = pvh->pv_gen;
 5624                         rw_wunlock(lock);
 5625                         PMAP_LOCK(pmap);
 5626                         rw_wlock(lock);
 5627                         if (pvh_gen != pvh->pv_gen) {
 5628                                 PMAP_UNLOCK(pmap);
 5629                                 rw_wunlock(lock);
 5630                                 goto retry_pv_loop;
 5631                         }
 5632                 }
 5633                 PG_RW = pmap_rw_bit(pmap);
 5634                 va = pv->pv_va;
 5635                 pde = pmap_pde(pmap, va);
                      /*
                       * Only writable entries are demoted.  NOTE(review):
                       * presumably the demotion leaves 4KB mappings that the
                       * second pass then write-protects -- confirm against
                       * pmap_demote_pde_locked().
                       */
 5636                 if ((*pde & PG_RW) != 0)
 5637                         (void)pmap_demote_pde_locked(pmap, pde, va, &lock);
                      /*
                       * The demotion was handed &lock; check that it still
                       * refers to this page's pv list lock.
                       */
 5638                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 5639                     ("inconsistent pv lock %p %p for page %p",
 5640                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 5641                 PMAP_UNLOCK(pmap);
 5642         }
 5643 small_mappings:
 5644         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 5645                 pmap = PV_PMAP(pv);
                      /*
                       * Same lock dance as in the first pass, except that both
                       * generation counts are compared.
                       */
 5646                 if (!PMAP_TRYLOCK(pmap)) {
 5647                         pvh_gen = pvh->pv_gen;
 5648                         md_gen = m->md.pv_gen;
 5649                         rw_wunlock(lock);
 5650                         PMAP_LOCK(pmap);
 5651                         rw_wlock(lock);
 5652                         if (pvh_gen != pvh->pv_gen ||
 5653                             md_gen != m->md.pv_gen) {
 5654                                 PMAP_UNLOCK(pmap);
 5655                                 rw_wunlock(lock);
 5656                                 goto retry_pv_loop;
 5657                         }
 5658                 }
 5659                 PG_M = pmap_modified_bit(pmap);
 5660                 PG_RW = pmap_rw_bit(pmap);
 5661                 pde = pmap_pde(pmap, pv->pv_va);
 5662                 KASSERT((*pde & PG_PS) == 0,
 5663                     ("pmap_remove_write: found a 2mpage in page %p's pv list",
 5664                     m));
 5665                 pte = pmap_pde_to_pte(pde, pv->pv_va);
                      /*
                       * Clear the read/write and modified bits with a
                       * compare-and-set, rereading the entry and trying again
                       * whenever it changed after it was sampled.
                       */
 5666 retry:
 5667                 oldpte = *pte;
 5668                 if (oldpte & PG_RW) {
 5669                         if (!atomic_cmpset_long(pte, oldpte, oldpte &
 5670                             ~(PG_RW | PG_M)))
 5671                                 goto retry;
                              /* Record the modification before it is lost. */
 5672                         if ((oldpte & PG_M) != 0)
 5673                                 vm_page_dirty(m);
 5674                         pmap_invalidate_page(pmap, pv->pv_va);
 5675                 }
 5676                 PMAP_UNLOCK(pmap);
 5677         }
 5678         rw_wunlock(lock);
 5679         vm_page_aflag_clear(m, PGA_WRITEABLE);
 5680         rw_runlock(&pvh_global_lock);
 5681 }
 5682 
      /*
       * Decide whether the referenced bit of the entry "pte" may be cleared.
       *
       * Returns TRUE unconditionally when the pmap does not emulate the
       * accessed/dirty bits.  Otherwise the pmap is asserted to be of type
       * PT_EPT and the result is:
       *
       *  - FALSE if EPT_PG_WRITE is set in "pte";
       *  - TRUE if EPT_PG_EXECUTE is clear in "pte", or if the pmap has
       *    PMAP_SUPPORTS_EXEC_ONLY set in pm_flags;
       *  - FALSE otherwise.
       */
 5683 static __inline boolean_t
 5684 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
 5685 {
 5686 
 5687         if (!pmap_emulate_ad_bits(pmap))
 5688                 return (TRUE);
 5689 
 5690         KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
 5691 
 5692         /*
 5693          * RWX = 010 or 110 will cause an unconditional EPT misconfiguration,
 5694          * so the referenced (aka EPT_PG_READ) bit must not be cleared
 5695          * if the EPT_PG_WRITE bit is set.
 5696          */
 5697         if ((pte & EPT_PG_WRITE) != 0)
 5698                 return (FALSE);
 5699 
 5700         /*
 5701          * RWX = 100 is allowed only if PMAP_SUPPORTS_EXEC_ONLY is set.
 5702          */
 5703         if ((pte & EPT_PG_EXECUTE) == 0 ||
 5704             ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
 5705                 return (TRUE);
 5706         else
 5707                 return (FALSE);
 5708 }
 5709 
      /*
       * Limit on the number of referenced mappings, cleared and not cleared
       * combined, that pmap_ts_referenced() counts before it stops scanning.
       */
 5710 #define PMAP_TS_REFERENCED_MAX  5
 5711 
 5712 /*
 5713  *      pmap_ts_referenced:
 5714  *
 5715  *      Return a count of reference bits for a page, clearing those bits.
 5716  *      It is not necessary for every reference bit to be cleared, but it
 5717  *      is necessary that 0 only be returned when there are truly no
 5718  *      reference bits set.
 5719  *
 5720  *      XXX: The exact number of bits to check and clear is a matter that
 5721  *      should be tested and standardized at some point in the future for
 5722  *      optimal aging of shared pages.
 5723  */
 5724 int
 5725 pmap_ts_referenced(vm_page_t m)
 5726 {
 5727         struct md_page *pvh;
 5728         pv_entry_t pv, pvf;
 5729         pmap_t pmap;
 5730         struct rwlock *lock;
 5731         pd_entry_t oldpde, *pde;
 5732         pt_entry_t *pte, PG_A;
 5733         vm_offset_t va;
 5734         vm_paddr_t pa;
 5735         int cleared, md_gen, not_cleared, pvh_gen;
 5736         struct spglist free;
 5737         boolean_t demoted;
 5738 
 5739         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 5740             ("pmap_ts_referenced: page %p is not managed", m));
 5741         SLIST_INIT(&free);
 5742         cleared = 0;
 5743         pa = VM_PAGE_TO_PHYS(m);
 5744         lock = PHYS_TO_PV_LIST_LOCK(pa);
 5745         pvh = pa_to_pvh(pa);
 5746         rw_rlock(&pvh_global_lock);
 5747         rw_wlock(lock);
 5748 retry:
 5749         not_cleared = 0;
 5750         if ((m->flags & PG_FICTITIOUS) != 0 ||
 5751             (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
 5752                 goto small_mappings;
 5753         pv = pvf;
 5754         do {
 5755                 if (pvf == NULL)
 5756                         pvf = pv;
 5757                 pmap = PV_PMAP(pv);
 5758                 if (!PMAP_TRYLOCK(pmap)) {
 5759                         pvh_gen = pvh->pv_gen;
 5760                         rw_wunlock(lock);
 5761                         PMAP_LOCK(pmap);
 5762                         rw_wlock(lock);
 5763                         if (pvh_gen != pvh->pv_gen) {
 5764                                 PMAP_UNLOCK(pmap);
 5765                                 goto retry;
 5766                         }
 5767                 }
 5768                 PG_A = pmap_accessed_bit(pmap);
 5769                 va = pv->pv_va;
 5770                 pde = pmap_pde(pmap, pv->pv_va);
 5771                 oldpde = *pde;
 5772                 if ((*pde & PG_A) != 0) {
 5773                         /*
 5774                          * Since this reference bit is shared by 512 4KB
 5775                          * pages, it should not be cleared every time it is
 5776                          * tested.  Apply a simple "hash" function on the
 5777                          * physical page number, the virtual superpage number,
 5778                          * and the pmap address to select one 4KB page out of
 5779                          * the 512 on which testing the reference bit will
 5780                          * result in clearing that reference bit.  This
 5781                          * function is designed to avoid the selection of the
 5782                          * same 4KB page for every 2MB page mapping.
 5783                          *
 5784                          * On demotion, a mapping that hasn't been referenced
 5785                          * is simply destroyed.  To avoid the possibility of a
 5786                          * subsequent page fault on a demoted wired mapping,
 5787                          * always leave its reference bit set.  Moreover,
 5788                          * since the superpage is wired, the current state of
 5789                          * its reference bit won't affect page replacement.
 5790                          */
 5791                         if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
 5792                             (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
 5793                             (*pde & PG_W) == 0) {
 5794                                 if (safe_to_clear_referenced(pmap, oldpde)) {
 5795                                         atomic_clear_long(pde, PG_A);
 5796                                         pmap_invalidate_page(pmap, pv->pv_va);
 5797                                         demoted = FALSE;
 5798                                 } else if (pmap_demote_pde_locked(pmap, pde,
 5799                                     pv->pv_va, &lock)) {
 5800                                         /*
 5801                                          * Remove the mapping to a single page
 5802                                          * so that a subsequent access may
 5803                                          * repromote.  Since the underlying
 5804                                          * page table page is fully populated,
 5805                                          * this removal never frees a page
 5806                                          * table page.
 5807                                          */
 5808                                         demoted = TRUE;
 5809                                         va += VM_PAGE_TO_PHYS(m) - (oldpde &
 5810                                             PG_PS_FRAME);
 5811                                         pte = pmap_pde_to_pte(pde, va);
 5812                                         pmap_remove_pte(pmap, pte, va, *pde,
 5813                                             NULL, &lock);
 5814                                         pmap_invalidate_page(pmap, va);
 5815                                 } else
 5816                                         demoted = TRUE;
 5817 
 5818                                 if (demoted) {
 5819                                         /*
 5820                                          * The superpage mapping was removed
 5821                                          * entirely and therefore 'pv' is no
 5822                                          * longer valid.
 5823                                          */
 5824                                         if (pvf == pv)
 5825                                                 pvf = NULL;
 5826                                         pv = NULL;
 5827                                 }
 5828                                 cleared++;
 5829                                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 5830                                     ("inconsistent pv lock %p %p for page %p",
 5831                                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 5832                         } else
 5833                                 not_cleared++;
 5834                 }
 5835                 PMAP_UNLOCK(pmap);
 5836                 /* Rotate the PV list if it has more than one entry. */
 5837                 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
 5838                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 5839                         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 5840                         pvh->pv_gen++;
 5841                 }
 5842                 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
 5843                         goto out;
 5844         } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
 5845 small_mappings:
 5846         if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
 5847                 goto out;
 5848         pv = pvf;
 5849         do {
 5850                 if (pvf == NULL)
 5851                         pvf = pv;
 5852                 pmap = PV_PMAP(pv);
 5853                 if (!PMAP_TRYLOCK(pmap)) {
 5854                         pvh_gen = pvh->pv_gen;
 5855                         md_gen = m->md.pv_gen;
 5856                         rw_wunlock(lock);
 5857                         PMAP_LOCK(pmap);
 5858                         rw_wlock(lock);
 5859                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 5860                                 PMAP_UNLOCK(pmap);
 5861                                 goto retry;
 5862                         }
 5863                 }
 5864                 PG_A = pmap_accessed_bit(pmap);
 5865                 pde = pmap_pde(pmap, pv->pv_va);
 5866                 KASSERT((*pde & PG_PS) == 0,
 5867                     ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
 5868                     m));
 5869                 pte = pmap_pde_to_pte(pde, pv->pv_va);
 5870                 if ((*pte & PG_A) != 0) {
 5871                         if (safe_to_clear_referenced(pmap, *pte)) {
 5872                                 atomic_clear_long(pte, PG_A);
 5873                                 pmap_invalidate_page(pmap, pv->pv_va);
 5874                                 cleared++;
 5875                         } else if ((*pte & PG_W) == 0) {
 5876                                 /*
 5877                                  * Wired pages cannot be paged out so
 5878                                  * doing accessed bit emulation for
 5879                                  * them is wasted effort. We do the
 5880                                  * hard work for unwired pages only.
 5881                                  */
 5882                                 pmap_remove_pte(pmap, pte, pv->pv_va,
 5883                                     *pde, &free, &lock);
 5884                                 pmap_invalidate_page(pmap, pv->pv_va);
 5885                                 cleared++;
 5886                                 if (pvf == pv)
 5887                                         pvf = NULL;
 5888                                 pv = NULL;
 5889                                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 5890                                     ("inconsistent pv lock %p %p for page %p",
 5891                                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 5892                         } else
 5893                                 not_cleared++;
 5894                 }
 5895                 PMAP_UNLOCK(pmap);
 5896                 /* Rotate the PV list if it has more than one entry. */
 5897                 if (pv != NULL && TAILQ_NEXT(pv, pv_next) !=