The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/amd64/amd64/pmap.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * SPDX-License-Identifier: BSD-4-Clause
    3  *
    4  * Copyright (c) 1991 Regents of the University of California.
    5  * All rights reserved.
    6  * Copyright (c) 1994 John S. Dyson
    7  * All rights reserved.
    8  * Copyright (c) 1994 David Greenman
    9  * All rights reserved.
   10  * Copyright (c) 2003 Peter Wemm
   11  * All rights reserved.
   12  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
   13  * All rights reserved.
   14  *
   15  * This code is derived from software contributed to Berkeley by
   16  * the Systems Programming Group of the University of Utah Computer
   17  * Science Department and William Jolitz of UUNET Technologies Inc.
   18  *
   19  * Redistribution and use in source and binary forms, with or without
   20  * modification, are permitted provided that the following conditions
   21  * are met:
   22  * 1. Redistributions of source code must retain the above copyright
   23  *    notice, this list of conditions and the following disclaimer.
   24  * 2. Redistributions in binary form must reproduce the above copyright
   25  *    notice, this list of conditions and the following disclaimer in the
   26  *    documentation and/or other materials provided with the distribution.
   27  * 3. All advertising materials mentioning features or use of this software
   28  *    must display the following acknowledgement:
   29  *      This product includes software developed by the University of
   30  *      California, Berkeley and its contributors.
   31  * 4. Neither the name of the University nor the names of its contributors
   32  *    may be used to endorse or promote products derived from this software
   33  *    without specific prior written permission.
   34  *
   35  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   36  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   38  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   39  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   40  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   41  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   42  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   43  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   44  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   45  * SUCH DAMAGE.
   46  *
   47  *      from:   @(#)pmap.c      7.7 (Berkeley)  5/12/91
   48  */
   49 /*-
   50  * Copyright (c) 2003 Networks Associates Technology, Inc.
   51  * Copyright (c) 2014-2019 The FreeBSD Foundation
   52  * All rights reserved.
   53  *
   54  * This software was developed for the FreeBSD Project by Jake Burkholder,
   55  * Safeport Network Services, and Network Associates Laboratories, the
   56  * Security Research Division of Network Associates, Inc. under
   57  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
   58  * CHATS research program.
   59  *
   60  * Portions of this software were developed by
   61  * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
   62  * the FreeBSD Foundation.
   63  *
   64  * Redistribution and use in source and binary forms, with or without
   65  * modification, are permitted provided that the following conditions
   66  * are met:
   67  * 1. Redistributions of source code must retain the above copyright
   68  *    notice, this list of conditions and the following disclaimer.
   69  * 2. Redistributions in binary form must reproduce the above copyright
   70  *    notice, this list of conditions and the following disclaimer in the
   71  *    documentation and/or other materials provided with the distribution.
   72  *
   73  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   74  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   75  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   76  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   77  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   78  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   79  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   80  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   81  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   82  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   83  * SUCH DAMAGE.
   84  */
   85 
   86 #define AMD64_NPT_AWARE
   87 
   88 #include <sys/cdefs.h>
   89 __FBSDID("$FreeBSD$");
   90 
   91 /*
   92  *      Manages physical address maps.
   93  *
   94  *      Since the information managed by this module is
   95  *      also stored by the logical address mapping module,
   96  *      this module may throw away valid virtual-to-physical
   97  *      mappings at almost any time.  However, invalidations
   98  *      of virtual-to-physical mappings must be done as
   99  *      requested.
  100  *
  101  *      In order to cope with hardware architectures which
  102  *      make virtual-to-physical map invalidates expensive,
  103  *      this module may delay invalidate or reduced protection
  104  *      operations until such time as they are actually
  105  *      necessary.  This module is given full information as
  106  *      to which processors are currently using which maps,
  107  *      and to when physical maps must be made correct.
  108  */
  109 
  110 #include "opt_ddb.h"
  111 #include "opt_pmap.h"
  112 #include "opt_vm.h"
  113 
  114 #include <sys/param.h>
  115 #include <sys/bitstring.h>
  116 #include <sys/bus.h>
  117 #include <sys/systm.h>
  118 #include <sys/kernel.h>
  119 #include <sys/ktr.h>
  120 #include <sys/lock.h>
  121 #include <sys/malloc.h>
  122 #include <sys/mman.h>
  123 #include <sys/mutex.h>
  124 #include <sys/proc.h>
  125 #include <sys/rangeset.h>
  126 #include <sys/rwlock.h>
  127 #include <sys/sbuf.h>
  128 #include <sys/sx.h>
  129 #include <sys/turnstile.h>
  130 #include <sys/vmem.h>
  131 #include <sys/vmmeter.h>
  132 #include <sys/sched.h>
  133 #include <sys/sysctl.h>
  134 #include <sys/smp.h>
  135 #ifdef DDB
  136 #include <sys/kdb.h>
  137 #include <ddb/ddb.h>
  138 #endif
  139 
  140 #include <vm/vm.h>
  141 #include <vm/vm_param.h>
  142 #include <vm/vm_kern.h>
  143 #include <vm/vm_page.h>
  144 #include <vm/vm_map.h>
  145 #include <vm/vm_object.h>
  146 #include <vm/vm_extern.h>
  147 #include <vm/vm_pageout.h>
  148 #include <vm/vm_pager.h>
  149 #include <vm/vm_phys.h>
  150 #include <vm/vm_radix.h>
  151 #include <vm/vm_reserv.h>
  152 #include <vm/uma.h>
  153 
  154 #include <machine/intr_machdep.h>
  155 #include <x86/apicvar.h>
  156 #include <x86/ifunc.h>
  157 #include <machine/cpu.h>
  158 #include <machine/cputypes.h>
  159 #include <machine/intr_machdep.h>
  160 #include <machine/md_var.h>
  161 #include <machine/pcb.h>
  162 #include <machine/specialreg.h>
  163 #ifdef SMP
  164 #include <machine/smp.h>
  165 #endif
  166 #include <machine/sysarch.h>
  167 #include <machine/tss.h>
  168 
  169 static __inline boolean_t
  170 pmap_type_guest(pmap_t pmap)
  171 {
  172 
  173         return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
  174 }
  175 
  176 static __inline boolean_t
  177 pmap_emulate_ad_bits(pmap_t pmap)
  178 {
  179 
  180         return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
  181 }
  182 
  183 static __inline pt_entry_t
  184 pmap_valid_bit(pmap_t pmap)
  185 {
  186         pt_entry_t mask;
  187 
  188         switch (pmap->pm_type) {
  189         case PT_X86:
  190         case PT_RVI:
  191                 mask = X86_PG_V;
  192                 break;
  193         case PT_EPT:
  194                 if (pmap_emulate_ad_bits(pmap))
  195                         mask = EPT_PG_EMUL_V;
  196                 else
  197                         mask = EPT_PG_READ;
  198                 break;
  199         default:
  200                 panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
  201         }
  202 
  203         return (mask);
  204 }
  205 
  206 static __inline pt_entry_t
  207 pmap_rw_bit(pmap_t pmap)
  208 {
  209         pt_entry_t mask;
  210 
  211         switch (pmap->pm_type) {
  212         case PT_X86:
  213         case PT_RVI:
  214                 mask = X86_PG_RW;
  215                 break;
  216         case PT_EPT:
  217                 if (pmap_emulate_ad_bits(pmap))
  218                         mask = EPT_PG_EMUL_RW;
  219                 else
  220                         mask = EPT_PG_WRITE;
  221                 break;
  222         default:
  223                 panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
  224         }
  225 
  226         return (mask);
  227 }
  228 
  229 static pt_entry_t pg_g;
  230 
  231 static __inline pt_entry_t
  232 pmap_global_bit(pmap_t pmap)
  233 {
  234         pt_entry_t mask;
  235 
  236         switch (pmap->pm_type) {
  237         case PT_X86:
  238                 mask = pg_g;
  239                 break;
  240         case PT_RVI:
  241         case PT_EPT:
  242                 mask = 0;
  243                 break;
  244         default:
  245                 panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
  246         }
  247 
  248         return (mask);
  249 }
  250 
  251 static __inline pt_entry_t
  252 pmap_accessed_bit(pmap_t pmap)
  253 {
  254         pt_entry_t mask;
  255 
  256         switch (pmap->pm_type) {
  257         case PT_X86:
  258         case PT_RVI:
  259                 mask = X86_PG_A;
  260                 break;
  261         case PT_EPT:
  262                 if (pmap_emulate_ad_bits(pmap))
  263                         mask = EPT_PG_READ;
  264                 else
  265                         mask = EPT_PG_A;
  266                 break;
  267         default:
  268                 panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
  269         }
  270 
  271         return (mask);
  272 }
  273 
  274 static __inline pt_entry_t
  275 pmap_modified_bit(pmap_t pmap)
  276 {
  277         pt_entry_t mask;
  278 
  279         switch (pmap->pm_type) {
  280         case PT_X86:
  281         case PT_RVI:
  282                 mask = X86_PG_M;
  283                 break;
  284         case PT_EPT:
  285                 if (pmap_emulate_ad_bits(pmap))
  286                         mask = EPT_PG_WRITE;
  287                 else
  288                         mask = EPT_PG_M;
  289                 break;
  290         default:
  291                 panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
  292         }
  293 
  294         return (mask);
  295 }
  296 
  297 static __inline pt_entry_t
  298 pmap_pku_mask_bit(pmap_t pmap)
  299 {
  300 
  301         return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0);
  302 }
  303 
  304 #if !defined(DIAGNOSTIC)
  305 #ifdef __GNUC_GNU_INLINE__
  306 #define PMAP_INLINE     __attribute__((__gnu_inline__)) inline
  307 #else
  308 #define PMAP_INLINE     extern inline
  309 #endif
  310 #else
  311 #define PMAP_INLINE
  312 #endif
  313 
  314 #ifdef PV_STATS
  315 #define PV_STAT(x)      do { x ; } while (0)
  316 #else
  317 #define PV_STAT(x)      do { } while (0)
  318 #endif
  319 
  320 #define pa_index(pa)    ((pa) >> PDRSHIFT)
  321 #define pa_to_pvh(pa)   (&pv_table[pa_index(pa)])
  322 
  323 #define NPV_LIST_LOCKS  MAXCPU
  324 
  325 #define PHYS_TO_PV_LIST_LOCK(pa)        \
  326                         (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
  327 
  328 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)  do {    \
  329         struct rwlock **_lockp = (lockp);               \
  330         struct rwlock *_new_lock;                       \
  331                                                         \
  332         _new_lock = PHYS_TO_PV_LIST_LOCK(pa);           \
  333         if (_new_lock != *_lockp) {                     \
  334                 if (*_lockp != NULL)                    \
  335                         rw_wunlock(*_lockp);            \
  336                 *_lockp = _new_lock;                    \
  337                 rw_wlock(*_lockp);                      \
  338         }                                               \
  339 } while (0)
  340 
  341 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)        \
  342                         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
  343 
  344 #define RELEASE_PV_LIST_LOCK(lockp)             do {    \
  345         struct rwlock **_lockp = (lockp);               \
  346                                                         \
  347         if (*_lockp != NULL) {                          \
  348                 rw_wunlock(*_lockp);                    \
  349                 *_lockp = NULL;                         \
  350         }                                               \
  351 } while (0)
  352 
  353 #define VM_PAGE_TO_PV_LIST_LOCK(m)      \
  354                         PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
  355 
  356 struct pmap kernel_pmap_store;
  357 
  358 vm_offset_t virtual_avail;      /* VA of first avail page (after kernel bss) */
  359 vm_offset_t virtual_end;        /* VA of last avail page (end of kernel AS) */
  360 
  361 int nkpt;
  362 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
  363     "Number of kernel page table pages allocated on bootup");
  364 
  365 static int ndmpdp;
  366 vm_paddr_t dmaplimit;
  367 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
  368 pt_entry_t pg_nx;
  369 
  370 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
  371 
  372 /* Unused, kept for ABI stability on the stable branch. */
  373 static int pat_works = 1;
  374 SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
  375     "Is page attribute table fully functional?");
  376 
  377 static int pg_ps_enabled = 1;
  378 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
  379     &pg_ps_enabled, 0, "Are large page mappings enabled?");
  380 
  381 #define PAT_INDEX_SIZE  8
  382 static int pat_index[PAT_INDEX_SIZE];   /* cache mode to PAT index conversion */
  383 
  384 static u_int64_t        KPTphys;        /* phys addr of kernel level 1 */
  385 static u_int64_t        KPDphys;        /* phys addr of kernel level 2 */
  386 static u_int64_t        KPDPphys;       /* phys addr of kernel level 3 */
  387 u_int64_t               KPML4phys;      /* phys addr of kernel level 4 */
  388 
  389 static u_int64_t        DMPDphys;       /* phys addr of direct mapped level 2 */
  390 static u_int64_t        DMPDPphys;      /* phys addr of direct mapped level 3 */
  391 static int              ndmpdpphys;     /* number of DMPDPphys pages */
  392 
  393 static vm_paddr_t       KERNend;        /* phys addr of end of bootstrap data */
  394 
  395 /*
  396  * pmap_mapdev support pre initialization (i.e. console)
  397  */
  398 #define PMAP_PREINIT_MAPPING_COUNT      8
  399 static struct pmap_preinit_mapping {
  400         vm_paddr_t      pa;
  401         vm_offset_t     va;
  402         vm_size_t       sz;
  403         int             mode;
  404 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
  405 static int pmap_initialized;
  406 
  407 /*
  408  * Data for the pv entry allocation mechanism.
  409  * Updates to pv_invl_gen are protected by the pv_list_locks[]
  410  * elements, but reads are not.
  411  */
  412 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
  413 static struct mtx __exclusive_cache_line pv_chunks_mutex;
  414 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
  415 static u_long pv_invl_gen[NPV_LIST_LOCKS];
  416 static struct md_page *pv_table;
  417 static struct md_page pv_dummy;
  418 
  419 /*
  420  * All those kernel PT submaps that BSD is so fond of
  421  */
  422 pt_entry_t *CMAP1 = NULL;
  423 caddr_t CADDR1 = 0;
  424 static vm_offset_t qframe = 0;
  425 static struct mtx qframe_mtx;
  426 
  427 static int pmap_flags = PMAP_PDE_SUPERPAGE;     /* flags for x86 pmaps */
  428 
  429 static vmem_t *large_vmem;
  430 static u_int lm_ents;
  431 #define PMAP_ADDRESS_IN_LARGEMAP(va)    ((va) >= LARGEMAP_MIN_ADDRESS && \
  432         (va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents)
  433 
  434 int pmap_pcid_enabled = 1;
  435 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
  436     &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?");
  437 int invpcid_works = 0;
  438 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
  439     "Is the invpcid instruction available ?");
  440 
  441 int __read_frequently pti = 0;
  442 SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
  443     &pti, 0,
  444     "Page Table Isolation enabled");
  445 static vm_object_t pti_obj;
  446 static pml4_entry_t *pti_pml4;
  447 static vm_pindex_t pti_pg_idx;
  448 static bool pti_finalized;
  449 
  450 struct pmap_pkru_range {
  451         struct rs_el    pkru_rs_el;
  452         u_int           pkru_keyidx;
  453         int             pkru_flags;
  454 };
  455 
  456 static uma_zone_t pmap_pkru_ranges_zone;
  457 static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
  458 static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va);
  459 static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
  460 static void *pkru_dup_range(void *ctx, void *data);
  461 static void pkru_free_range(void *ctx, void *node);
  462 static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap);
  463 static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
  464 static void pmap_pkru_deassign_all(pmap_t pmap);
  465 
  466 static int
  467 pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
  468 {
  469         int i;
  470         uint64_t res;
  471 
  472         res = 0;
  473         CPU_FOREACH(i) {
  474                 res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
  475         }
  476         return (sysctl_handle_64(oidp, &res, 0, req));
  477 }
  478 SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RD |
  479     CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
  480     "Count of saved TLB context on switch");
  481 
  482 static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
  483     LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
  484 static struct mtx invl_gen_mtx;
  485 /* Fake lock object to satisfy turnstiles interface. */
  486 static struct lock_object invl_gen_ts = {
  487         .lo_name = "invlts",
  488 };
  489 static struct pmap_invl_gen pmap_invl_gen_head = {
  490         .gen = 1,
  491         .next = NULL,
  492 };
  493 static u_long pmap_invl_gen = 1;
  494 static int pmap_invl_waiters;
  495 static struct callout pmap_invl_callout;
  496 static bool pmap_invl_callout_inited;
  497 
  498 #define PMAP_ASSERT_NOT_IN_DI() \
  499     KASSERT(pmap_not_in_di(), ("DI already started"))
  500 
  501 static bool
  502 pmap_di_locked(void)
  503 {
  504         int tun;
  505 
  506         if ((cpu_feature2 & CPUID2_CX16) == 0)
  507                 return (true);
  508         tun = 0;
  509         TUNABLE_INT_FETCH("vm.pmap.di_locked", &tun);
  510         return (tun != 0);
  511 }
  512 
  513 static int
  514 sysctl_pmap_di_locked(SYSCTL_HANDLER_ARGS)
  515 {
  516         int locked;
  517 
  518         locked = pmap_di_locked();
  519         return (sysctl_handle_int(oidp, &locked, 0, req));
  520 }
  521 SYSCTL_PROC(_vm_pmap, OID_AUTO, di_locked, CTLTYPE_INT | CTLFLAG_RDTUN |
  522     CTLFLAG_MPSAFE, 0, 0, sysctl_pmap_di_locked, "",
  523     "Locked delayed invalidation");
  524 
  525 static bool pmap_not_in_di_l(void);
  526 static bool pmap_not_in_di_u(void);
  527 DEFINE_IFUNC(, bool, pmap_not_in_di, (void), static)
  528 {
  529 
  530         return (pmap_di_locked() ? pmap_not_in_di_l : pmap_not_in_di_u);
  531 }
  532 
  533 static bool
  534 pmap_not_in_di_l(void)
  535 {
  536         struct pmap_invl_gen *invl_gen;
  537 
  538         invl_gen = &curthread->td_md.md_invl_gen;
  539         return (invl_gen->gen == 0);
  540 }
  541 
  542 static void
  543 pmap_thread_init_invl_gen_l(struct thread *td)
  544 {
  545         struct pmap_invl_gen *invl_gen;
  546 
  547         invl_gen = &td->td_md.md_invl_gen;
  548         invl_gen->gen = 0;
  549 }
  550 
  551 static void
  552 pmap_delayed_invl_wait_block(u_long *m_gen, u_long *invl_gen)
  553 {
  554         struct turnstile *ts;
  555 
  556         ts = turnstile_trywait(&invl_gen_ts);
  557         if (*m_gen > atomic_load_long(invl_gen))
  558                 turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
  559         else
  560                 turnstile_cancel(ts);
  561 }
  562 
  563 static void
  564 pmap_delayed_invl_finish_unblock(u_long new_gen)
  565 {
  566         struct turnstile *ts;
  567 
  568         turnstile_chain_lock(&invl_gen_ts);
  569         ts = turnstile_lookup(&invl_gen_ts);
  570         if (new_gen != 0)
  571                 pmap_invl_gen = new_gen;
  572         if (ts != NULL) {
  573                 turnstile_broadcast(ts, TS_SHARED_QUEUE);
  574                 turnstile_unpend(ts);
  575         }
  576         turnstile_chain_unlock(&invl_gen_ts);
  577 }
  578 
  579 /*
  580  * Start a new Delayed Invalidation (DI) block of code, executed by
  581  * the current thread.  Within a DI block, the current thread may
  582  * destroy both the page table and PV list entries for a mapping and
  583  * then release the corresponding PV list lock before ensuring that
  584  * the mapping is flushed from the TLBs of any processors with the
  585  * pmap active.
  586  */
  587 static void
  588 pmap_delayed_invl_start_l(void)
  589 {
  590         struct pmap_invl_gen *invl_gen;
  591         u_long currgen;
  592 
  593         invl_gen = &curthread->td_md.md_invl_gen;
  594         PMAP_ASSERT_NOT_IN_DI();
  595         mtx_lock(&invl_gen_mtx);
  596         if (LIST_EMPTY(&pmap_invl_gen_tracker))
  597                 currgen = pmap_invl_gen;
  598         else
  599                 currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
  600         invl_gen->gen = currgen + 1;
  601         LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
  602         mtx_unlock(&invl_gen_mtx);
  603 }
  604 
  605 /*
  606  * Finish the DI block, previously started by the current thread.  All
  607  * required TLB flushes for the pages marked by
  608  * pmap_delayed_invl_page() must be finished before this function is
  609  * called.
  610  *
  611  * This function works by bumping the global DI generation number to
  612  * the generation number of the current thread's DI, unless there is a
  613  * pending DI that started earlier.  In the latter case, bumping the
  614  * global DI generation number would incorrectly signal that the
  615  * earlier DI had finished.  Instead, this function bumps the earlier
  616  * DI's generation number to match the generation number of the
  617  * current thread's DI.
  618  */
  619 static void
  620 pmap_delayed_invl_finish_l(void)
  621 {
  622         struct pmap_invl_gen *invl_gen, *next;
  623 
  624         invl_gen = &curthread->td_md.md_invl_gen;
  625         KASSERT(invl_gen->gen != 0, ("missed invl_start"));
  626         mtx_lock(&invl_gen_mtx);
  627         next = LIST_NEXT(invl_gen, link);
  628         if (next == NULL)
  629                 pmap_delayed_invl_finish_unblock(invl_gen->gen);
  630         else
  631                 next->gen = invl_gen->gen;
  632         LIST_REMOVE(invl_gen, link);
  633         mtx_unlock(&invl_gen_mtx);
  634         invl_gen->gen = 0;
  635 }
  636 
  637 static bool
  638 pmap_not_in_di_u(void)
  639 {
  640         struct pmap_invl_gen *invl_gen;
  641 
  642         invl_gen = &curthread->td_md.md_invl_gen;
  643         return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0);
  644 }
  645 
  646 static void
  647 pmap_thread_init_invl_gen_u(struct thread *td)
  648 {
  649         struct pmap_invl_gen *invl_gen;
  650 
  651         invl_gen = &td->td_md.md_invl_gen;
  652         invl_gen->gen = 0;
  653         invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID;
  654 }
  655 
  656 static bool
  657 pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out)
  658 {
  659         uint64_t new_high, new_low, old_high, old_low;
  660         char res;
  661 
  662         old_low = new_low = 0;
  663         old_high = new_high = (uintptr_t)0;
  664 
  665         __asm volatile("lock;cmpxchg16b\t%1;sete\t%0"
  666             : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
  667             : "b"(new_low), "c" (new_high)
  668             : "memory", "cc");
  669         if (res == 0) {
  670                 if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0)
  671                         return (false);
  672                 out->gen = old_low;
  673                 out->next = (void *)old_high;
  674         } else {
  675                 out->gen = new_low;
  676                 out->next = (void *)new_high;
  677         }
  678         return (true);
  679 }
  680 
  681 static bool
  682 pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val,
  683     struct pmap_invl_gen *new_val)
  684 {
  685         uint64_t new_high, new_low, old_high, old_low;
  686         char res;
  687 
  688         new_low = new_val->gen;
  689         new_high = (uintptr_t)new_val->next;
  690         old_low = old_val->gen;
  691         old_high = (uintptr_t)old_val->next;
  692 
  693         __asm volatile("lock;cmpxchg16b\t%1;sete\t%0"
  694             : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
  695             : "b"(new_low), "c" (new_high)
  696             : "memory", "cc");
  697         return (res);
  698 }
  699 
  700 #ifdef PV_STATS
  701 static long invl_start_restart;
  702 SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_start_restart, CTLFLAG_RD,
  703     &invl_start_restart, 0,
  704     "");
  705 static long invl_finish_restart;
  706 SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD,
  707     &invl_finish_restart, 0,
  708     "");
  709 static int invl_max_qlen;
  710 SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD,
  711     &invl_max_qlen, 0,
  712     "");
  713 #endif
  714 
  715 static struct lock_delay_config __read_frequently di_delay;
  716 LOCK_DELAY_SYSINIT_DEFAULT(di_delay);
  717 
  718 static void
  719 pmap_delayed_invl_start_u(void)
  720 {
  721         struct pmap_invl_gen *invl_gen, *p, prev, new_prev;
  722         struct thread *td;
  723         struct lock_delay_arg lda;
  724         uintptr_t prevl;
  725         u_char pri;
  726 #ifdef PV_STATS
  727         int i, ii;
  728 #endif
  729 
  730         td = curthread;
  731         invl_gen = &td->td_md.md_invl_gen;
  732         PMAP_ASSERT_NOT_IN_DI();
  733         lock_delay_arg_init(&lda, &di_delay);
  734         invl_gen->saved_pri = 0;
  735         pri = td->td_base_pri;
  736         if (pri > PVM) {
  737                 thread_lock(td);
  738                 pri = td->td_base_pri;
  739                 if (pri > PVM) {
  740                         invl_gen->saved_pri = pri;
  741                         sched_prio(td, PVM);
  742                 }
  743                 thread_unlock(td);
  744         }
  745 again:
  746         PV_STAT(i = 0);
  747         for (p = &pmap_invl_gen_head;; p = prev.next) {
  748                 PV_STAT(i++);
  749                 prevl = (uintptr_t)atomic_load_ptr(&p->next);
  750                 if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
  751                         PV_STAT(atomic_add_long(&invl_start_restart, 1));
  752                         lock_delay(&lda);
  753                         goto again;
  754                 }
  755                 if (prevl == 0)
  756                         break;
  757                 prev.next = (void *)prevl;
  758         }
  759 #ifdef PV_STATS
  760         if ((ii = invl_max_qlen) < i)
  761                 atomic_cmpset_int(&invl_max_qlen, ii, i);
  762 #endif
  763 
  764         if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) {
  765                 PV_STAT(atomic_add_long(&invl_start_restart, 1));
  766                 lock_delay(&lda);
  767                 goto again;
  768         }
  769 
  770         new_prev.gen = prev.gen;
  771         new_prev.next = invl_gen;
  772         invl_gen->gen = prev.gen + 1;
  773 
  774         /* Formal fence between store to invl->gen and updating *p. */
  775         atomic_thread_fence_rel();
  776 
  777         /*
  778          * After inserting an invl_gen element with invalid bit set,
  779          * this thread blocks any other thread trying to enter the
  780          * delayed invalidation block.  Do not allow to remove us from
  781          * the CPU, because it causes starvation for other threads.
  782          */
  783         critical_enter();
  784 
  785         /*
  786          * ABA for *p is not possible there, since p->gen can only
  787          * increase.  So if the *p thread finished its di, then
  788          * started a new one and got inserted into the list at the
  789          * same place, its gen will appear greater than the previously
  790          * read gen.
  791          */
  792         if (!pmap_di_store_invl(p, &prev, &new_prev)) {
  793                 critical_exit();
  794                 PV_STAT(atomic_add_long(&invl_start_restart, 1));
  795                 lock_delay(&lda);
  796                 goto again;
  797         }
  798 
  799         /*
  800          * There we clear PMAP_INVL_GEN_NEXT_INVALID in
  801          * invl_gen->next, allowing other threads to iterate past us.
  802          * pmap_di_store_invl() provides fence between the generation
  803          * write and the update of next.
  804          */
  805         invl_gen->next = NULL;
  806         critical_exit();
  807 }
  808 
  809 static bool
  810 pmap_delayed_invl_finish_u_crit(struct pmap_invl_gen *invl_gen,
  811     struct pmap_invl_gen *p)
  812 {
  813         struct pmap_invl_gen prev, new_prev;
  814         u_long mygen;
  815 
  816         /*
  817          * Load invl_gen->gen after setting invl_gen->next
  818          * PMAP_INVL_GEN_NEXT_INVALID.  This prevents larger
  819          * generations to propagate to our invl_gen->gen.  Lock prefix
  820          * in atomic_set_ptr() worked as seq_cst fence.
  821          */
  822         mygen = atomic_load_long(&invl_gen->gen);
  823 
  824         if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen)
  825                 return (false);
  826 
  827         KASSERT(prev.gen < mygen,
  828             ("invalid di gen sequence %lu %lu", prev.gen, mygen));
  829         new_prev.gen = mygen;
  830         new_prev.next = (void *)((uintptr_t)invl_gen->next &
  831             ~PMAP_INVL_GEN_NEXT_INVALID);
  832 
  833         /* Formal fence between load of prev and storing update to it. */
  834         atomic_thread_fence_rel();
  835 
  836         return (pmap_di_store_invl(p, &prev, &new_prev));
  837 }
  838 
  839 static void
  840 pmap_delayed_invl_finish_u(void)
  841 {
  842         struct pmap_invl_gen *invl_gen, *p;
  843         struct thread *td;
  844         struct lock_delay_arg lda;
  845         uintptr_t prevl;
  846 
  847         td = curthread;
  848         invl_gen = &td->td_md.md_invl_gen;
  849         KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0"));
  850         KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0,
  851             ("missed invl_start: INVALID"));
  852         lock_delay_arg_init(&lda, &di_delay);
  853 
  854 again:
  855         for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) {
  856                 prevl = (uintptr_t)atomic_load_ptr(&p->next);
  857                 if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
  858                         PV_STAT(atomic_add_long(&invl_finish_restart, 1));
  859                         lock_delay(&lda);
  860                         goto again;
  861                 }
  862                 if ((void *)prevl == invl_gen)
  863                         break;
  864         }
  865 
  866         /*
  867          * It is legitimate to not find ourself on the list if a
  868          * thread before us finished its DI and started it again.
  869          */
  870         if (__predict_false(p == NULL)) {
  871                 PV_STAT(atomic_add_long(&invl_finish_restart, 1));
  872                 lock_delay(&lda);
  873                 goto again;
  874         }
  875 
  876         critical_enter();
  877         atomic_set_ptr((uintptr_t *)&invl_gen->next,
  878             PMAP_INVL_GEN_NEXT_INVALID);
  879         if (!pmap_delayed_invl_finish_u_crit(invl_gen, p)) {
  880                 atomic_clear_ptr((uintptr_t *)&invl_gen->next,
  881                     PMAP_INVL_GEN_NEXT_INVALID);
  882                 critical_exit();
  883                 PV_STAT(atomic_add_long(&invl_finish_restart, 1));
  884                 lock_delay(&lda);
  885                 goto again;
  886         }
  887         critical_exit();
  888         if (atomic_load_int(&pmap_invl_waiters) > 0)
  889                 pmap_delayed_invl_finish_unblock(0);
  890         if (invl_gen->saved_pri != 0) {
  891                 thread_lock(td);
  892                 sched_prio(td, invl_gen->saved_pri);
  893                 thread_unlock(td);
  894         }
  895 }
  896 
  897 #ifdef DDB
  898 DB_SHOW_COMMAND(di_queue, pmap_di_queue)
  899 {
  900         struct pmap_invl_gen *p, *pn;
  901         struct thread *td;
  902         uintptr_t nextl;
  903         bool first;
  904 
  905         for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn,
  906             first = false) {
  907                 nextl = (uintptr_t)atomic_load_ptr(&p->next);
  908                 pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID);
  909                 td = first ? NULL : __containerof(p, struct thread,
  910                     td_md.md_invl_gen);
  911                 db_printf("gen %lu inv %d td %p tid %d\n", p->gen,
  912                     (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td,
  913                     td != NULL ? td->td_tid : -1);
  914         }
  915 }
  916 #endif
  917 
  918 #ifdef PV_STATS
  919 static long invl_wait;
  920 SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0,
  921     "Number of times DI invalidation blocked pmap_remove_all/write");
  922 static long invl_wait_slow;
  923 SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD, &invl_wait_slow, 0,
  924     "Number of slow invalidation waits for lockless DI");
  925 #endif
  926 
  927 static u_long *
  928 pmap_delayed_invl_genp(vm_page_t m)
  929 {
  930 
  931         return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]);
  932 }
  933 
  934 static void
  935 pmap_delayed_invl_callout_func(void *arg __unused)
  936 {
  937 
  938         if (atomic_load_int(&pmap_invl_waiters) == 0)
  939                 return;
  940         pmap_delayed_invl_finish_unblock(0);
  941 }
  942 
  943 static void
  944 pmap_delayed_invl_callout_init(void *arg __unused)
  945 {
  946 
  947         if (pmap_di_locked())
  948                 return;
  949         callout_init(&pmap_invl_callout, 1);
  950         pmap_invl_callout_inited = true;
  951 }
  952 SYSINIT(pmap_di_callout, SI_SUB_CPU + 1, SI_ORDER_ANY,
  953     pmap_delayed_invl_callout_init, NULL);
  954 
  955 /*
  956  * Ensure that all currently executing DI blocks, that need to flush
  957  * TLB for the given page m, actually flushed the TLB at the time the
  958  * function returned.  If the page m has an empty PV list and we call
  959  * pmap_delayed_invl_wait(), upon its return we know that no CPU has a
  960  * valid mapping for the page m in either its page table or TLB.
  961  *
  962  * This function works by blocking until the global DI generation
  963  * number catches up with the generation number associated with the
  964  * given page m and its PV list.  Since this function's callers
  965  * typically own an object lock and sometimes own a page lock, it
  966  * cannot sleep.  Instead, it blocks on a turnstile to relinquish the
  967  * processor.
  968  */
  969 static void
  970 pmap_delayed_invl_wait_l(vm_page_t m)
  971 {
  972         u_long *m_gen;
  973 #ifdef PV_STATS
  974         bool accounted = false;
  975 #endif
  976 
  977         m_gen = pmap_delayed_invl_genp(m);
  978         while (*m_gen > pmap_invl_gen) {
  979 #ifdef PV_STATS
  980                 if (!accounted) {
  981                         atomic_add_long(&invl_wait, 1);
  982                         accounted = true;
  983                 }
  984 #endif
  985                 pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen);
  986         }
  987 }
  988 
  989 static void
  990 pmap_delayed_invl_wait_u(vm_page_t m)
  991 {
  992         u_long *m_gen;
  993         struct lock_delay_arg lda;
  994         bool fast;
  995 
  996         fast = true;
  997         m_gen = pmap_delayed_invl_genp(m);
  998         lock_delay_arg_init(&lda, &di_delay);
  999         while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) {
 1000                 if (fast || !pmap_invl_callout_inited) {
 1001                         PV_STAT(atomic_add_long(&invl_wait, 1));
 1002                         lock_delay(&lda);
 1003                         fast = false;
 1004                 } else {
 1005                         /*
 1006                          * The page's invalidation generation number
 1007                          * is still below the current thread's number.
 1008                          * Prepare to block so that we do not waste
 1009                          * CPU cycles or worse, suffer livelock.
 1010                          *
 1011                          * Since it is impossible to block without
 1012                          * racing with pmap_delayed_invl_finish_u(),
 1013                          * prepare for the race by incrementing
 1014                          * pmap_invl_waiters and arming a 1-tick
 1015                          * callout which will unblock us if we lose
 1016                          * the race.
 1017                          */
 1018                         atomic_add_int(&pmap_invl_waiters, 1);
 1019 
 1020                         /*
 1021                          * Re-check the current thread's invalidation
 1022                          * generation after incrementing
 1023                          * pmap_invl_waiters, so that there is no race
 1024                          * with pmap_delayed_invl_finish_u() setting
 1025                          * the page generation and checking
 1026                          * pmap_invl_waiters.  The only race allowed
 1027                          * is for a missed unblock, which is handled
 1028                          * by the callout.
 1029                          */
 1030                         if (*m_gen >
 1031                             atomic_load_long(&pmap_invl_gen_head.gen)) {
 1032                                 callout_reset(&pmap_invl_callout, 1,
 1033                                     pmap_delayed_invl_callout_func, NULL);
 1034                                 PV_STAT(atomic_add_long(&invl_wait_slow, 1));
 1035                                 pmap_delayed_invl_wait_block(m_gen,
 1036                                     &pmap_invl_gen_head.gen);
 1037                         }
 1038                         atomic_add_int(&pmap_invl_waiters, -1);
 1039                 }
 1040         }
 1041 }
 1042 
 1043 DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *), static)
 1044 {
 1045 
 1046         return (pmap_di_locked() ? pmap_thread_init_invl_gen_l :
 1047             pmap_thread_init_invl_gen_u);
 1048 }
 1049 
 1050 DEFINE_IFUNC(static, void, pmap_delayed_invl_start, (void), static)
 1051 {
 1052 
 1053         return (pmap_di_locked() ? pmap_delayed_invl_start_l :
 1054             pmap_delayed_invl_start_u);
 1055 }
 1056 
 1057 DEFINE_IFUNC(static, void, pmap_delayed_invl_finish, (void), static)
 1058 {
 1059 
 1060         return (pmap_di_locked() ? pmap_delayed_invl_finish_l :
 1061             pmap_delayed_invl_finish_u);
 1062 }
 1063 
 1064 DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t), static)
 1065 {
 1066 
 1067         return (pmap_di_locked() ? pmap_delayed_invl_wait_l :
 1068             pmap_delayed_invl_wait_u);
 1069 }
 1070 
 1071 /*
 1072  * Mark the page m's PV list as participating in the current thread's
 1073  * DI block.  Any threads concurrently using m's PV list to remove or
 1074  * restrict all mappings to m will wait for the current thread's DI
 1075  * block to complete before proceeding.
 1076  *
 1077  * The function works by setting the DI generation number for m's PV
 1078  * list to at least the DI generation number of the current thread.
 1079  * This forces a caller of pmap_delayed_invl_wait() to block until
 1080  * current thread calls pmap_delayed_invl_finish().
 1081  */
 1082 static void
 1083 pmap_delayed_invl_page(vm_page_t m)
 1084 {
 1085         u_long gen, *m_gen;
 1086 
 1087         rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED);
 1088         gen = curthread->td_md.md_invl_gen.gen;
 1089         if (gen == 0)
 1090                 return;
 1091         m_gen = pmap_delayed_invl_genp(m);
 1092         if (*m_gen < gen)
 1093                 *m_gen = gen;
 1094 }
 1095 
 1096 /*
 1097  * Crashdump maps.
 1098  */
 1099 static caddr_t crashdumpmap;
 1100 
 1101 /*
 1102  * Internal flags for pmap_enter()'s helper functions.
 1103  */
 1104 #define PMAP_ENTER_NORECLAIM    0x1000000       /* Don't reclaim PV entries. */
 1105 #define PMAP_ENTER_NOREPLACE    0x2000000       /* Don't replace mappings. */
 1106 
 1107 /*
 1108  * Internal flags for pmap_mapdev_internal() and
 1109  * pmap_change_props_locked().
 1110  */
 1111 #define MAPDEV_FLUSHCACHE       0x00000001      /* Flush cache after mapping. */
 1112 #define MAPDEV_SETATTR          0x00000002      /* Modify existing attrs. */
 1113 #define MAPDEV_ASSERTVALID      0x00000004      /* Assert mapping validity. */
 1114 
 1115 TAILQ_HEAD(pv_chunklist, pv_chunk);
 1116 
 1117 static void     free_pv_chunk(struct pv_chunk *pc);
 1118 static void     free_pv_chunk_batch(struct pv_chunklist *batch);
 1119 static void     free_pv_entry(pmap_t pmap, pv_entry_t pv);
 1120 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
 1121 static int      popcnt_pc_map_pq(uint64_t *map);
 1122 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
 1123 static void     reserve_pv_entries(pmap_t pmap, int needed,
 1124                     struct rwlock **lockp);
 1125 static void     pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 1126                     struct rwlock **lockp);
 1127 static bool     pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde,
 1128                     u_int flags, struct rwlock **lockp);
 1129 #if VM_NRESERVLEVEL > 0
 1130 static void     pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 1131                     struct rwlock **lockp);
 1132 #endif
 1133 static void     pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
 1134 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
 1135                     vm_offset_t va);
 1136 
 1137 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
 1138     vm_prot_t prot, int mode, int flags);
 1139 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
 1140 static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
 1141     vm_offset_t va, struct rwlock **lockp);
 1142 static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
 1143     vm_offset_t va);
 1144 static bool     pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
 1145                     vm_prot_t prot, struct rwlock **lockp);
 1146 static int      pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde,
 1147                     u_int flags, vm_page_t m, struct rwlock **lockp);
 1148 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
 1149     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
 1150 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
 1151 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted);
 1152 static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva,
 1153     vm_offset_t eva);
 1154 static void pmap_invalidate_cache_range_all(vm_offset_t sva,
 1155     vm_offset_t eva);
 1156 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
 1157                     pd_entry_t pde);
 1158 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
 1159 static vm_page_t pmap_large_map_getptp_unlocked(void);
 1160 static vm_paddr_t pmap_large_map_kextract(vm_offset_t va);
 1161 #if VM_NRESERVLEVEL > 0
 1162 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
 1163     struct rwlock **lockp);
 1164 #endif
 1165 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
 1166     vm_prot_t prot);
 1167 static void pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask);
 1168 static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva,
 1169     bool exec);
 1170 static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va);
 1171 static pd_entry_t *pmap_pti_pde(vm_offset_t va);
 1172 static void pmap_pti_wire_pte(void *pte);
 1173 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
 1174     struct spglist *free, struct rwlock **lockp);
 1175 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
 1176     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
 1177 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
 1178 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
 1179     struct spglist *free);
 1180 static bool     pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
 1181                     pd_entry_t *pde, struct spglist *free,
 1182                     struct rwlock **lockp);
 1183 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
 1184     vm_page_t m, struct rwlock **lockp);
 1185 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
 1186     pd_entry_t newpde);
 1187 static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
 1188 
 1189 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
 1190                 struct rwlock **lockp);
 1191 static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
 1192                 struct rwlock **lockp);
 1193 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
 1194                 struct rwlock **lockp);
 1195 
 1196 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
 1197     struct spglist *free);
 1198 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
 1199 
 1200 /********************/
 1201 /* Inline functions */
 1202 /********************/
 1203 
 1204 /* Return a non-clipped PD index for a given VA */
 1205 static __inline vm_pindex_t
 1206 pmap_pde_pindex(vm_offset_t va)
 1207 {
 1208         return (va >> PDRSHIFT);
 1209 }
 1210 
 1211 
 1212 /* Return a pointer to the PML4 slot that corresponds to a VA */
 1213 static __inline pml4_entry_t *
 1214 pmap_pml4e(pmap_t pmap, vm_offset_t va)
 1215 {
 1216 
 1217         return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
 1218 }
 1219 
 1220 /* Return a pointer to the PDP slot that corresponds to a VA */
 1221 static __inline pdp_entry_t *
 1222 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
 1223 {
 1224         pdp_entry_t *pdpe;
 1225 
 1226         pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
 1227         return (&pdpe[pmap_pdpe_index(va)]);
 1228 }
 1229 
 1230 /* Return a pointer to the PDP slot that corresponds to a VA */
 1231 static __inline pdp_entry_t *
 1232 pmap_pdpe(pmap_t pmap, vm_offset_t va)
 1233 {
 1234         pml4_entry_t *pml4e;
 1235         pt_entry_t PG_V;
 1236 
 1237         PG_V = pmap_valid_bit(pmap);
 1238         pml4e = pmap_pml4e(pmap, va);
 1239         if ((*pml4e & PG_V) == 0)
 1240                 return (NULL);
 1241         return (pmap_pml4e_to_pdpe(pml4e, va));
 1242 }
 1243 
 1244 /* Return a pointer to the PD slot that corresponds to a VA */
 1245 static __inline pd_entry_t *
 1246 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
 1247 {
 1248         pd_entry_t *pde;
 1249 
 1250         KASSERT((*pdpe & PG_PS) == 0,
 1251             ("%s: pdpe %#lx is a leaf", __func__, *pdpe));
 1252         pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
 1253         return (&pde[pmap_pde_index(va)]);
 1254 }
 1255 
 1256 /* Return a pointer to the PD slot that corresponds to a VA */
 1257 static __inline pd_entry_t *
 1258 pmap_pde(pmap_t pmap, vm_offset_t va)
 1259 {
 1260         pdp_entry_t *pdpe;
 1261         pt_entry_t PG_V;
 1262 
 1263         PG_V = pmap_valid_bit(pmap);
 1264         pdpe = pmap_pdpe(pmap, va);
 1265         if (pdpe == NULL || (*pdpe & PG_V) == 0)
 1266                 return (NULL);
 1267         return (pmap_pdpe_to_pde(pdpe, va));
 1268 }
 1269 
 1270 /* Return a pointer to the PT slot that corresponds to a VA */
 1271 static __inline pt_entry_t *
 1272 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
 1273 {
 1274         pt_entry_t *pte;
 1275 
 1276         KASSERT((*pde & PG_PS) == 0,
 1277             ("%s: pde %#lx is a leaf", __func__, *pde));
 1278         pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
 1279         return (&pte[pmap_pte_index(va)]);
 1280 }
 1281 
 1282 /* Return a pointer to the PT slot that corresponds to a VA */
 1283 static __inline pt_entry_t *
 1284 pmap_pte(pmap_t pmap, vm_offset_t va)
 1285 {
 1286         pd_entry_t *pde;
 1287         pt_entry_t PG_V;
 1288 
 1289         PG_V = pmap_valid_bit(pmap);
 1290         pde = pmap_pde(pmap, va);
 1291         if (pde == NULL || (*pde & PG_V) == 0)
 1292                 return (NULL);
 1293         if ((*pde & PG_PS) != 0)        /* compat with i386 pmap_pte() */
 1294                 return ((pt_entry_t *)pde);
 1295         return (pmap_pde_to_pte(pde, va));
 1296 }
 1297 
 1298 static __inline void
 1299 pmap_resident_count_inc(pmap_t pmap, int count)
 1300 {
 1301 
 1302         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 1303         pmap->pm_stats.resident_count += count;
 1304 }
 1305 
 1306 static __inline void
 1307 pmap_resident_count_dec(pmap_t pmap, int count)
 1308 {
 1309 
 1310         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 1311         KASSERT(pmap->pm_stats.resident_count >= count,
 1312             ("pmap %p resident count underflow %ld %d", pmap,
 1313             pmap->pm_stats.resident_count, count));
 1314         pmap->pm_stats.resident_count -= count;
 1315 }
 1316 
 1317 PMAP_INLINE pt_entry_t *
 1318 vtopte(vm_offset_t va)
 1319 {
 1320         u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
 1321 
 1322         KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
 1323 
 1324         return (PTmap + ((va >> PAGE_SHIFT) & mask));
 1325 }
 1326 
 1327 static __inline pd_entry_t *
 1328 vtopde(vm_offset_t va)
 1329 {
 1330         u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
 1331 
 1332         KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
 1333 
 1334         return (PDmap + ((va >> PDRSHIFT) & mask));
 1335 }
 1336 
 1337 static u_int64_t
 1338 allocpages(vm_paddr_t *firstaddr, int n)
 1339 {
 1340         u_int64_t ret;
 1341 
 1342         ret = *firstaddr;
 1343         bzero((void *)ret, n * PAGE_SIZE);
 1344         *firstaddr += n * PAGE_SIZE;
 1345         return (ret);
 1346 }
 1347 
 1348 CTASSERT(powerof2(NDMPML4E));
 1349 
 1350 /* number of kernel PDP slots */
 1351 #define NKPDPE(ptpgs)           howmany(ptpgs, NPDEPG)
 1352 
 1353 static void
 1354 nkpt_init(vm_paddr_t addr)
 1355 {
 1356         int pt_pages;
 1357         
 1358 #ifdef NKPT
 1359         pt_pages = NKPT;
 1360 #else
 1361         pt_pages = howmany(addr, 1 << PDRSHIFT);
 1362         pt_pages += NKPDPE(pt_pages);
 1363 
 1364         /*
 1365          * Add some slop beyond the bare minimum required for bootstrapping
 1366          * the kernel.
 1367          *
 1368          * This is quite important when allocating KVA for kernel modules.
 1369          * The modules are required to be linked in the negative 2GB of
 1370          * the address space.  If we run out of KVA in this region then
 1371          * pmap_growkernel() will need to allocate page table pages to map
 1372          * the entire 512GB of KVA space which is an unnecessary tax on
 1373          * physical memory.
 1374          *
 1375          * Secondly, device memory mapped as part of setting up the low-
 1376          * level console(s) is taken from KVA, starting at virtual_avail.
 1377          * This is because cninit() is called after pmap_bootstrap() but
 1378          * before vm_init() and pmap_init(). 20MB for a frame buffer is
 1379          * not uncommon.
 1380          */
 1381         pt_pages += 32;         /* 64MB additional slop. */
 1382 #endif
 1383         nkpt = pt_pages;
 1384 }
 1385 
 1386 /*
 1387  * Returns the proper write/execute permission for a physical page that is
 1388  * part of the initial boot allocations.
 1389  *
 1390  * If the page has kernel text, it is marked as read-only. If the page has
 1391  * kernel read-only data, it is marked as read-only/not-executable. If the
 1392  * page has only read-write data, it is marked as read-write/not-executable.
 1393  * If the page is below/above the kernel range, it is marked as read-write.
 1394  *
 1395  * This function operates on 2M pages, since we map the kernel space that
 1396  * way.
 1397  *
 1398  * Note that this doesn't currently provide any protection for modules.
 1399  */
 1400 static inline pt_entry_t
 1401 bootaddr_rwx(vm_paddr_t pa)
 1402 {
 1403 
 1404         /*
 1405          * Everything in the same 2M page as the start of the kernel
 1406          * should be static. On the other hand, things in the same 2M
 1407          * page as the end of the kernel could be read-write/executable,
 1408          * as the kernel image is not guaranteed to end on a 2M boundary.
 1409          */
 1410         if (pa < trunc_2mpage(btext - KERNBASE) ||
 1411            pa >= trunc_2mpage(_end - KERNBASE))
 1412                 return (X86_PG_RW);
 1413         /*
 1414          * The linker should ensure that the read-only and read-write
 1415          * portions don't share the same 2M page, so this shouldn't
 1416          * impact read-only data. However, in any case, any page with
 1417          * read-write data needs to be read-write.
 1418          */
 1419         if (pa >= trunc_2mpage(brwsection - KERNBASE))
 1420                 return (X86_PG_RW | pg_nx);
 1421         /*
 1422          * Mark any 2M page containing kernel text as read-only. Mark
 1423          * other pages with read-only data as read-only and not executable.
 1424          * (It is likely a small portion of the read-only data section will
 1425          * be marked as read-only, but executable. This should be acceptable
 1426          * since the read-only protection will keep the data from changing.)
 1427          * Note that fixups to the .text section will still work until we
 1428          * set CR0.WP.
 1429          */
 1430         if (pa < round_2mpage(etext - KERNBASE))
 1431                 return (0);
 1432         return (pg_nx);
 1433 }
 1434 
 1435 static void
 1436 create_pagetables(vm_paddr_t *firstaddr)
 1437 {
 1438         int i, j, ndm1g, nkpdpe, nkdmpde;
 1439         pd_entry_t *pd_p;
 1440         pdp_entry_t *pdp_p;
 1441         pml4_entry_t *p4_p;
 1442         uint64_t DMPDkernphys;
 1443 
 1444         /* Allocate page table pages for the direct map */
 1445         ndmpdp = howmany(ptoa(Maxmem), NBPDP);
 1446         if (ndmpdp < 4)         /* Minimum 4GB of dirmap */
 1447                 ndmpdp = 4;
 1448         ndmpdpphys = howmany(ndmpdp, NPDPEPG);
 1449         if (ndmpdpphys > NDMPML4E) {
 1450                 /*
 1451                  * Each NDMPML4E allows 512 GB, so limit to that,
 1452                  * and then readjust ndmpdp and ndmpdpphys.
 1453                  */
 1454                 printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
 1455                 Maxmem = atop(NDMPML4E * NBPML4);
 1456                 ndmpdpphys = NDMPML4E;
 1457                 ndmpdp = NDMPML4E * NPDEPG;
 1458         }
 1459         DMPDPphys = allocpages(firstaddr, ndmpdpphys);
 1460         ndm1g = 0;
 1461         if ((amd_feature & AMDID_PAGE1GB) != 0) {
 1462                 /*
 1463                  * Calculate the number of 1G pages that will fully fit in
 1464                  * Maxmem.
 1465                  */
 1466                 ndm1g = ptoa(Maxmem) >> PDPSHIFT;
 1467 
 1468                 /*
 1469                  * Allocate 2M pages for the kernel. These will be used in
 1470                  * place of the first one or more 1G pages from ndm1g.
 1471                  */
 1472                 nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP);
 1473                 DMPDkernphys = allocpages(firstaddr, nkdmpde);
 1474         }
 1475         if (ndm1g < ndmpdp)
 1476                 DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
 1477         dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
 1478 
 1479         /* Allocate pages */
 1480         KPML4phys = allocpages(firstaddr, 1);
 1481         KPDPphys = allocpages(firstaddr, NKPML4E);
 1482 
 1483         /*
 1484          * Allocate the initial number of kernel page table pages required to
 1485          * bootstrap.  We defer this until after all memory-size dependent
 1486          * allocations are done (e.g. direct map), so that we don't have to
 1487          * build in too much slop in our estimate.
 1488          *
 1489          * Note that when NKPML4E > 1, we have an empty page underneath
 1490          * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
 1491          * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
 1492          */
 1493         nkpt_init(*firstaddr);
 1494         nkpdpe = NKPDPE(nkpt);
 1495 
 1496         KPTphys = allocpages(firstaddr, nkpt);
 1497         KPDphys = allocpages(firstaddr, nkpdpe);
 1498 
 1499         /*
 1500          * Connect the zero-filled PT pages to their PD entries.  This
 1501          * implicitly maps the PT pages at their correct locations within
 1502          * the PTmap.
 1503          */
 1504         pd_p = (pd_entry_t *)KPDphys;
 1505         for (i = 0; i < nkpt; i++)
 1506                 pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
 1507 
 1508         /*
 1509          * Map from physical address zero to the end of loader preallocated
 1510          * memory using 2MB pages.  This replaces some of the PD entries
 1511          * created above.
 1512          */
 1513         for (i = 0; (i << PDRSHIFT) < KERNend; i++)
 1514                 /* Preset PG_M and PG_A because demotion expects it. */
 1515                 pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
 1516                     X86_PG_M | X86_PG_A | bootaddr_rwx(i << PDRSHIFT);
 1517 
 1518         /*
 1519          * Because we map the physical blocks in 2M pages, adjust firstaddr
 1520          * to record the physical blocks we've actually mapped into kernel
 1521          * virtual address space.
 1522          */
 1523         if (*firstaddr < round_2mpage(KERNend))
 1524                 *firstaddr = round_2mpage(KERNend);
 1525 
 1526         /* And connect up the PD to the PDP (leaving room for L4 pages) */
 1527         pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
 1528         for (i = 0; i < nkpdpe; i++)
 1529                 pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
 1530 
 1531         /*
 1532          * Now, set up the direct map region using 2MB and/or 1GB pages.  If
 1533          * the end of physical memory is not aligned to a 1GB page boundary,
 1534          * then the residual physical memory is mapped with 2MB pages.  Later,
 1535          * if pmap_mapdev{_attr}() uses the direct map for non-write-back
 1536          * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
 1537          * that are partially used. 
 1538          */
 1539         pd_p = (pd_entry_t *)DMPDphys;
 1540         for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
 1541                 pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
 1542                 /* Preset PG_M and PG_A because demotion expects it. */
 1543                 pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
 1544                     X86_PG_M | X86_PG_A | pg_nx;
 1545         }
 1546         pdp_p = (pdp_entry_t *)DMPDPphys;
 1547         for (i = 0; i < ndm1g; i++) {
 1548                 pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
 1549                 /* Preset PG_M and PG_A because demotion expects it. */
 1550                 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
 1551                     X86_PG_M | X86_PG_A | pg_nx;
 1552         }
 1553         for (j = 0; i < ndmpdp; i++, j++) {
 1554                 pdp_p[i] = DMPDphys + ptoa(j);
 1555                 pdp_p[i] |= X86_PG_RW | X86_PG_V | pg_nx;
 1556         }
 1557 
 1558         /*
 1559          * Instead of using a 1G page for the memory containing the kernel,
 1560          * use 2M pages with read-only and no-execute permissions.  (If using 1G
 1561          * pages, this will partially overwrite the PDPEs above.)
 1562          */
 1563         if (ndm1g) {
 1564                 pd_p = (pd_entry_t *)DMPDkernphys;
 1565                 for (i = 0; i < (NPDEPG * nkdmpde); i++)
 1566                         pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
 1567                             X86_PG_M | X86_PG_A | pg_nx |
 1568                             bootaddr_rwx(i << PDRSHIFT);
 1569                 for (i = 0; i < nkdmpde; i++)
 1570                         pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW |
 1571                             X86_PG_V | pg_nx;
 1572         }
 1573 
 1574         /* And recursively map PML4 to itself in order to get PTmap */
 1575         p4_p = (pml4_entry_t *)KPML4phys;
 1576         p4_p[PML4PML4I] = KPML4phys;
 1577         p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx;
 1578 
 1579         /* Connect the Direct Map slot(s) up to the PML4. */
 1580         for (i = 0; i < ndmpdpphys; i++) {
 1581                 p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
 1582                 p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx;
 1583         }
 1584 
 1585         /* Connect the KVA slots up to the PML4 */
 1586         for (i = 0; i < NKPML4E; i++) {
 1587                 p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
 1588                 p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V;
 1589         }
 1590 }
 1591 
 1592 /*
 1593  *      Bootstrap the system enough to run with virtual memory.
 1594  *
 1595  *      On amd64 this is called after mapping has already been enabled
 1596  *      and just syncs the pmap module with what has already been done.
 1597  *      [We can't call it easily with mapping off since the kernel is not
 1598  *      mapped with PA == VA, hence we would have to relocate every address
 1599  *      from the linked base (virtual) address "KERNBASE" to the actual
 1600  *      (physical) address starting relative to 0]
 1601  */
 1602 void
 1603 pmap_bootstrap(vm_paddr_t *firstaddr)
 1604 {
 1605         vm_offset_t va;
 1606         pt_entry_t *pte, *pcpu_pte;
 1607         uint64_t cr4, pcpu_phys;
 1608         u_long res;
 1609         int i;
 1610 
 1611         KERNend = *firstaddr;
 1612         res = atop(KERNend - (vm_paddr_t)kernphys);
 1613 
 1614         if (!pti)
 1615                 pg_g = X86_PG_G;
 1616 
 1617         /*
 1618          * Create an initial set of page tables to run the kernel in.
 1619          */
 1620         create_pagetables(firstaddr);
 1621 
 1622         pcpu_phys = allocpages(firstaddr, MAXCPU);
 1623 
 1624         /*
 1625          * Add a physical memory segment (vm_phys_seg) corresponding to the
 1626          * preallocated kernel page table pages so that vm_page structures
 1627          * representing these pages will be created.  The vm_page structures
 1628          * are required for promotion of the corresponding kernel virtual
 1629          * addresses to superpage mappings.
 1630          */
 1631         vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
 1632 
 1633         /*
 1634          * Account for the virtual addresses mapped by create_pagetables().
 1635          */
 1636         virtual_avail = (vm_offset_t)KERNBASE + round_2mpage(KERNend);
 1637         virtual_end = VM_MAX_KERNEL_ADDRESS;
 1638 
 1639         /*
 1640          * Enable PG_G global pages, then switch to the kernel page
 1641          * table from the bootstrap page table.  After the switch, it
 1642          * is possible to enable SMEP and SMAP since PG_U bits are
 1643          * correct now.
 1644          */
 1645         cr4 = rcr4();
 1646         cr4 |= CR4_PGE;
 1647         load_cr4(cr4);
 1648         load_cr3(KPML4phys);
 1649         if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
 1650                 cr4 |= CR4_SMEP;
 1651         if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
 1652                 cr4 |= CR4_SMAP;
 1653         load_cr4(cr4);
 1654 
 1655         /*
 1656          * Initialize the kernel pmap (which is statically allocated).
 1657          * Count bootstrap data as being resident in case any of this data is
 1658          * later unmapped (using pmap_remove()) and freed.
 1659          */
 1660         PMAP_LOCK_INIT(kernel_pmap);
 1661         kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
 1662         kernel_pmap->pm_cr3 = KPML4phys;
 1663         kernel_pmap->pm_ucr3 = PMAP_NO_CR3;
 1664         CPU_FILL(&kernel_pmap->pm_active);      /* don't allow deactivation */
 1665         TAILQ_INIT(&kernel_pmap->pm_pvchunk);
 1666         kernel_pmap->pm_stats.resident_count = res;
 1667         kernel_pmap->pm_flags = pmap_flags;
 1668 
 1669         /*
 1670          * Initialize the TLB invalidations generation number lock.
 1671          */
 1672         mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF);
 1673 
 1674         /*
 1675          * Reserve some special page table entries/VA space for temporary
 1676          * mapping of pages.
 1677          */
 1678 #define SYSMAP(c, p, v, n)      \
 1679         v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
 1680 
 1681         va = virtual_avail;
 1682         pte = vtopte(va);
 1683 
 1684         /*
 1685          * Crashdump maps.  The first page is reused as CMAP1 for the
 1686          * memory test.
 1687          */
 1688         SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
 1689         CADDR1 = crashdumpmap;
 1690 
 1691         SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU);
 1692         virtual_avail = va;
 1693 
 1694         for (i = 0; i < MAXCPU; i++) {
 1695                 pcpu_pte[i] = (pcpu_phys + ptoa(i)) | X86_PG_V | X86_PG_RW |
 1696                     pg_g | pg_nx | X86_PG_M | X86_PG_A;
 1697         }
 1698         STAILQ_INIT(&cpuhead);
 1699         wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
 1700         pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu));
 1701         amd64_bsp_pcpu_init1(&__pcpu[0]);
 1702         amd64_bsp_ist_init(&__pcpu[0]);
 1703         __pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic;
 1704         __pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id;
 1705 
 1706         /*
 1707          * Initialize the PAT MSR.
 1708          * pmap_init_pat() clears and sets CR4_PGE, which, as a
 1709          * side-effect, invalidates stale PG_G TLB entries that might
 1710          * have been created in our pre-boot environment.
 1711          */
 1712         pmap_init_pat();
 1713 
 1714         /* Initialize TLB Context Id. */
 1715         if (pmap_pcid_enabled) {
 1716                 for (i = 0; i < MAXCPU; i++) {
 1717                         kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN;
 1718                         kernel_pmap->pm_pcids[i].pm_gen = 1;
 1719                 }
 1720 
 1721                 /*
 1722                  * PMAP_PCID_KERN + 1 is used for initialization of
 1723                  * proc0 pmap.  The pmap' pcid state might be used by
 1724                  * EFIRT entry before first context switch, so it
 1725                  * needs to be valid.
 1726                  */
 1727                 PCPU_SET(pcid_next, PMAP_PCID_KERN + 2);
 1728                 PCPU_SET(pcid_gen, 1);
 1729 
 1730                 /*
 1731                  * pcpu area for APs is zeroed during AP startup.
 1732                  * pc_pcid_next and pc_pcid_gen are initialized by AP
 1733                  * during pcpu setup.
 1734                  */
 1735                 load_cr4(rcr4() | CR4_PCIDE);
 1736         }
 1737 }
 1738 
 1739 /*
 1740  * Setup the PAT MSR.
 1741  */
 1742 void
 1743 pmap_init_pat(void)
 1744 {
 1745         uint64_t pat_msr;
 1746         u_long cr0, cr4;
 1747         int i;
 1748 
 1749         /* Bail if this CPU doesn't implement PAT. */
 1750         if ((cpu_feature & CPUID_PAT) == 0)
 1751                 panic("no PAT??");
 1752 
 1753         /* Set default PAT index table. */
 1754         for (i = 0; i < PAT_INDEX_SIZE; i++)
 1755                 pat_index[i] = -1;
 1756         pat_index[PAT_WRITE_BACK] = 0;
 1757         pat_index[PAT_WRITE_THROUGH] = 1;
 1758         pat_index[PAT_UNCACHEABLE] = 3;
 1759         pat_index[PAT_WRITE_COMBINING] = 6;
 1760         pat_index[PAT_WRITE_PROTECTED] = 5;
 1761         pat_index[PAT_UNCACHED] = 2;
 1762 
 1763         /*
 1764          * Initialize default PAT entries.
 1765          * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
 1766          * Program 5 and 6 as WP and WC.
 1767          *
 1768          * Leave 4 and 7 as WB and UC.  Note that a recursive page table
 1769          * mapping for a 2M page uses a PAT value with the bit 3 set due
 1770          * to its overload with PG_PS.
 1771          */
 1772         pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
 1773             PAT_VALUE(1, PAT_WRITE_THROUGH) |
 1774             PAT_VALUE(2, PAT_UNCACHED) |
 1775             PAT_VALUE(3, PAT_UNCACHEABLE) |
 1776             PAT_VALUE(4, PAT_WRITE_BACK) |
 1777             PAT_VALUE(5, PAT_WRITE_PROTECTED) |
 1778             PAT_VALUE(6, PAT_WRITE_COMBINING) |
 1779             PAT_VALUE(7, PAT_UNCACHEABLE);
 1780 
 1781         /* Disable PGE. */
 1782         cr4 = rcr4();
 1783         load_cr4(cr4 & ~CR4_PGE);
 1784 
 1785         /* Disable caches (CD = 1, NW = 0). */
 1786         cr0 = rcr0();
 1787         load_cr0((cr0 & ~CR0_NW) | CR0_CD);
 1788 
 1789         /* Flushes caches and TLBs. */
 1790         wbinvd();
 1791         invltlb();
 1792 
 1793         /* Update PAT and index table. */
 1794         wrmsr(MSR_PAT, pat_msr);
 1795 
 1796         /* Flush caches and TLBs again. */
 1797         wbinvd();
 1798         invltlb();
 1799 
 1800         /* Restore caches and PGE. */
 1801         load_cr0(cr0);
 1802         load_cr4(cr4);
 1803 }
 1804 
 1805 /*
 1806  *      Initialize a vm_page's machine-dependent fields.
 1807  */
 1808 void
 1809 pmap_page_init(vm_page_t m)
 1810 {
 1811 
 1812         TAILQ_INIT(&m->md.pv_list);
 1813         m->md.pat_mode = PAT_WRITE_BACK;
 1814 }
 1815 
 1816 static int pmap_allow_2m_x_ept;
 1817 SYSCTL_INT(_vm_pmap, OID_AUTO, allow_2m_x_ept, CTLFLAG_RWTUN | CTLFLAG_NOFETCH,
 1818     &pmap_allow_2m_x_ept, 0,
 1819     "Allow executable superpage mappings in EPT");
 1820 
 1821 void
 1822 pmap_allow_2m_x_ept_recalculate(void)
 1823 {
 1824         /*
 1825          * SKL002, SKL012S.  Since the EPT format is only used by
 1826          * Intel CPUs, the vendor check is merely a formality.
 1827          */
 1828         if (!(cpu_vendor_id != CPU_VENDOR_INTEL ||
 1829             (cpu_ia32_arch_caps & IA32_ARCH_CAP_IF_PSCHANGE_MC_NO) != 0 ||
 1830             (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 1831             (CPUID_TO_MODEL(cpu_id) == 0x26 ||  /* Atoms */
 1832             CPUID_TO_MODEL(cpu_id) == 0x27 ||
 1833             CPUID_TO_MODEL(cpu_id) == 0x35 ||
 1834             CPUID_TO_MODEL(cpu_id) == 0x36 ||
 1835             CPUID_TO_MODEL(cpu_id) == 0x37 ||
 1836             CPUID_TO_MODEL(cpu_id) == 0x86 ||
 1837             CPUID_TO_MODEL(cpu_id) == 0x1c ||
 1838             CPUID_TO_MODEL(cpu_id) == 0x4a ||
 1839             CPUID_TO_MODEL(cpu_id) == 0x4c ||
 1840             CPUID_TO_MODEL(cpu_id) == 0x4d ||
 1841             CPUID_TO_MODEL(cpu_id) == 0x5a ||
 1842             CPUID_TO_MODEL(cpu_id) == 0x5c ||
 1843             CPUID_TO_MODEL(cpu_id) == 0x5d ||
 1844             CPUID_TO_MODEL(cpu_id) == 0x5f ||
 1845             CPUID_TO_MODEL(cpu_id) == 0x6e ||
 1846             CPUID_TO_MODEL(cpu_id) == 0x7a ||
 1847             CPUID_TO_MODEL(cpu_id) == 0x57 ||   /* Knights */
 1848             CPUID_TO_MODEL(cpu_id) == 0x85))))
 1849                 pmap_allow_2m_x_ept = 1;
 1850         TUNABLE_INT_FETCH("hw.allow_2m_x_ept", &pmap_allow_2m_x_ept);
 1851 }
 1852 
 1853 static bool
 1854 pmap_allow_2m_x_page(pmap_t pmap, bool executable)
 1855 {
 1856 
 1857         return (pmap->pm_type != PT_EPT || !executable ||
 1858             !pmap_allow_2m_x_ept);
 1859 }
 1860 
 1861 /*
 1862  *      Initialize the pmap module.
 1863  *      Called by vm_init, to initialize any structures that the pmap
 1864  *      system needs to map virtual memory.
 1865  */
 1866 void
 1867 pmap_init(void)
 1868 {
 1869         struct pmap_preinit_mapping *ppim;
 1870         vm_page_t m, mpte;
 1871         vm_size_t s;
 1872         int error, i, pv_npg, ret, skz63;
 1873 
 1874         /* L1TF, reserve page @0 unconditionally */
 1875         vm_page_blacklist_add(0, bootverbose);
 1876 
 1877         /* Detect bare-metal Skylake Server and Skylake-X. */
 1878         if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL &&
 1879             CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) {
 1880                 /*
 1881                  * Skylake-X errata SKZ63. Processor May Hang When
 1882                  * Executing Code In an HLE Transaction Region between
 1883                  * 40000000H and 403FFFFFH.
 1884                  *
 1885                  * Mark the pages in the range as preallocated.  It
 1886                  * seems to be impossible to distinguish between
 1887                  * Skylake Server and Skylake X.
 1888                  */
 1889                 skz63 = 1;
 1890                 TUNABLE_INT_FETCH("hw.skz63_enable", &skz63);
 1891                 if (skz63 != 0) {
 1892                         if (bootverbose)
 1893                                 printf("SKZ63: skipping 4M RAM starting "
 1894                                     "at physical 1G\n");
 1895                         for (i = 0; i < atop(0x400000); i++) {
 1896                                 ret = vm_page_blacklist_add(0x40000000 +
 1897                                     ptoa(i), FALSE);
 1898                                 if (!ret && bootverbose)
 1899                                         printf("page at %#lx already used\n",
 1900                                             0x40000000 + ptoa(i));
 1901                         }
 1902                 }
 1903         }
 1904 
 1905         /* IFU */
 1906         pmap_allow_2m_x_ept_recalculate();
 1907 
 1908         /*
 1909          * Initialize the vm page array entries for the kernel pmap's
 1910          * page table pages.
 1911          */ 
 1912         PMAP_LOCK(kernel_pmap);
 1913         for (i = 0; i < nkpt; i++) {
 1914                 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
 1915                 KASSERT(mpte >= vm_page_array &&
 1916                     mpte < &vm_page_array[vm_page_array_size],
 1917                     ("pmap_init: page table page is out of range"));
 1918                 mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
 1919                 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
 1920                 mpte->wire_count = 1;
 1921 
 1922                 /*
 1923                  * Collect the page table pages that were replaced by a 2MB
 1924                  * page in create_pagetables().  They are zero filled.
 1925                  */
 1926                 if (i << PDRSHIFT < KERNend &&
 1927                     pmap_insert_pt_page(kernel_pmap, mpte, false))
 1928                         panic("pmap_init: pmap_insert_pt_page failed");
 1929         }
 1930         PMAP_UNLOCK(kernel_pmap);
 1931         vm_wire_add(nkpt);
 1932 
 1933         /*
 1934          * If the kernel is running on a virtual machine, then it must assume
 1935          * that MCA is enabled by the hypervisor.  Moreover, the kernel must
 1936          * be prepared for the hypervisor changing the vendor and family that
 1937          * are reported by CPUID.  Consequently, the workaround for AMD Family
 1938          * 10h Erratum 383 is enabled if the processor's feature set does not
 1939          * include at least one feature that is only supported by older Intel
 1940          * or newer AMD processors.
 1941          */
 1942         if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
 1943             (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
 1944             CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
 1945             AMDID2_FMA4)) == 0)
 1946                 workaround_erratum383 = 1;
 1947 
 1948         /*
 1949          * Are large page mappings enabled?
 1950          */
 1951         TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
 1952         if (pg_ps_enabled) {
 1953                 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
 1954                     ("pmap_init: can't assign to pagesizes[1]"));
 1955                 pagesizes[1] = NBPDR;
 1956         }
 1957 
 1958         /*
 1959          * Initialize the pv chunk list mutex.
 1960          */
 1961         mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
 1962 
 1963         /*
 1964          * Initialize the pool of pv list locks.
 1965          */
 1966         for (i = 0; i < NPV_LIST_LOCKS; i++)
 1967                 rw_init(&pv_list_locks[i], "pmap pv list");
 1968 
 1969         /*
 1970          * Calculate the size of the pv head table for superpages.
 1971          */
 1972         pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
 1973 
 1974         /*
 1975          * Allocate memory for the pv head table for superpages.
 1976          */
 1977         s = (vm_size_t)(pv_npg * sizeof(struct md_page));
 1978         s = round_page(s);
 1979         pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
 1980         for (i = 0; i < pv_npg; i++)
 1981                 TAILQ_INIT(&pv_table[i].pv_list);
 1982         TAILQ_INIT(&pv_dummy.pv_list);
 1983 
 1984         pmap_initialized = 1;
 1985         for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 1986                 ppim = pmap_preinit_mapping + i;
 1987                 if (ppim->va == 0)
 1988                         continue;
 1989                 /* Make the direct map consistent */
 1990                 if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) {
 1991                         (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa),
 1992                             ppim->sz, ppim->mode);
 1993                 }
 1994                 if (!bootverbose)
 1995                         continue;
 1996                 printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i,
 1997                     ppim->pa, ppim->va, ppim->sz, ppim->mode);
 1998         }
 1999 
 2000         mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
 2001         error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
 2002             (vmem_addr_t *)&qframe);
 2003         if (error != 0)
 2004                 panic("qframe allocation failed");
 2005 
 2006         lm_ents = 8;
 2007         TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents);
 2008         if (lm_ents > LMEPML4I - LMSPML4I + 1)
 2009                 lm_ents = LMEPML4I - LMSPML4I + 1;
 2010         if (bootverbose)
 2011                 printf("pmap: large map %u PML4 slots (%lu Gb)\n",
 2012                     lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024));
 2013         if (lm_ents != 0) {
 2014                 large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS,
 2015                     (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK);
 2016                 if (large_vmem == NULL) {
 2017                         printf("pmap: cannot create large map\n");
 2018                         lm_ents = 0;
 2019                 }
 2020                 for (i = 0; i < lm_ents; i++) {
 2021                         m = pmap_large_map_getptp_unlocked();
 2022                         kernel_pmap->pm_pml4[LMSPML4I + i] = X86_PG_V |
 2023                             X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx |
 2024                             VM_PAGE_TO_PHYS(m);
 2025                 }
 2026         }
 2027 }
 2028 
 2029 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
 2030     "2MB page mapping counters");
 2031 
 2032 static u_long pmap_pde_demotions;
 2033 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
 2034     &pmap_pde_demotions, 0, "2MB page demotions");
 2035 
 2036 static u_long pmap_pde_mappings;
 2037 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
 2038     &pmap_pde_mappings, 0, "2MB page mappings");
 2039 
 2040 static u_long pmap_pde_p_failures;
 2041 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
 2042     &pmap_pde_p_failures, 0, "2MB page promotion failures");
 2043 
 2044 static u_long pmap_pde_promotions;
 2045 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
 2046     &pmap_pde_promotions, 0, "2MB page promotions");
 2047 
 2048 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
 2049     "1GB page mapping counters");
 2050 
 2051 static u_long pmap_pdpe_demotions;
 2052 SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
 2053     &pmap_pdpe_demotions, 0, "1GB page demotions");
 2054 
 2055 /***************************************************
 2056  * Low level helper routines.....
 2057  ***************************************************/
 2058 
 2059 static pt_entry_t
 2060 pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
 2061 {
 2062         int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
 2063 
 2064         switch (pmap->pm_type) {
 2065         case PT_X86:
 2066         case PT_RVI:
 2067                 /* Verify that both PAT bits are not set at the same time */
 2068                 KASSERT((entry & x86_pat_bits) != x86_pat_bits,
 2069                     ("Invalid PAT bits in entry %#lx", entry));
 2070 
 2071                 /* Swap the PAT bits if one of them is set */
 2072                 if ((entry & x86_pat_bits) != 0)
 2073                         entry ^= x86_pat_bits;
 2074                 break;
 2075         case PT_EPT:
 2076                 /*
 2077                  * Nothing to do - the memory attributes are represented
 2078                  * the same way for regular pages and superpages.
 2079                  */
 2080                 break;
 2081         default:
 2082                 panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type);
 2083         }
 2084 
 2085         return (entry);
 2086 }
 2087 
 2088 boolean_t
 2089 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
 2090 {
 2091 
 2092         return (mode >= 0 && mode < PAT_INDEX_SIZE &&
 2093             pat_index[(int)mode] >= 0);
 2094 }
 2095 
 2096 /*
 2097  * Determine the appropriate bits to set in a PTE or PDE for a specified
 2098  * caching mode.
 2099  */
 2100 int
 2101 pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
 2102 {
 2103         int cache_bits, pat_flag, pat_idx;
 2104 
 2105         if (!pmap_is_valid_memattr(pmap, mode))
 2106                 panic("Unknown caching mode %d\n", mode);
 2107 
 2108         switch (pmap->pm_type) {
 2109         case PT_X86:
 2110         case PT_RVI:
 2111                 /* The PAT bit is different for PTE's and PDE's. */
 2112                 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
 2113 
 2114                 /* Map the caching mode to a PAT index. */
 2115                 pat_idx = pat_index[mode];
 2116 
 2117                 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
 2118                 cache_bits = 0;
 2119                 if (pat_idx & 0x4)
 2120                         cache_bits |= pat_flag;
 2121                 if (pat_idx & 0x2)
 2122                         cache_bits |= PG_NC_PCD;
 2123                 if (pat_idx & 0x1)
 2124                         cache_bits |= PG_NC_PWT;
 2125                 break;
 2126 
 2127         case PT_EPT:
 2128                 cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
 2129                 break;
 2130 
 2131         default:
 2132                 panic("unsupported pmap type %d", pmap->pm_type);
 2133         }
 2134 
 2135         return (cache_bits);
 2136 }
 2137 
 2138 static int
 2139 pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
 2140 {
 2141         int mask;
 2142 
 2143         switch (pmap->pm_type) {
 2144         case PT_X86:
 2145         case PT_RVI:
 2146                 mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
 2147                 break;
 2148         case PT_EPT:
 2149                 mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
 2150                 break;
 2151         default:
 2152                 panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
 2153         }
 2154 
 2155         return (mask);
 2156 }
 2157 
 2158 static int
 2159 pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde)
 2160 {
 2161         int pat_flag, pat_idx;
 2162 
 2163         pat_idx = 0;
 2164         switch (pmap->pm_type) {
 2165         case PT_X86:
 2166         case PT_RVI:
 2167                 /* The PAT bit is different for PTE's and PDE's. */
 2168                 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
 2169 
 2170                 if ((pte & pat_flag) != 0)
 2171                         pat_idx |= 0x4;
 2172                 if ((pte & PG_NC_PCD) != 0)
 2173                         pat_idx |= 0x2;
 2174                 if ((pte & PG_NC_PWT) != 0)
 2175                         pat_idx |= 0x1;
 2176                 break;
 2177         case PT_EPT:
 2178                 if ((pte & EPT_PG_IGNORE_PAT) != 0)
 2179                         panic("EPT PTE %#lx has no PAT memory type", pte);
 2180                 pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3;
 2181                 break;
 2182         }
 2183 
 2184         /* See pmap_init_pat(). */
 2185         if (pat_idx == 4)
 2186                 pat_idx = 0;
 2187         if (pat_idx == 7)
 2188                 pat_idx = 3;
 2189 
 2190         return (pat_idx);
 2191 }
 2192 
 2193 bool
 2194 pmap_ps_enabled(pmap_t pmap)
 2195 {
 2196 
 2197         return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
 2198 }
 2199 
 2200 static void
 2201 pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
 2202 {
 2203 
 2204         switch (pmap->pm_type) {
 2205         case PT_X86:
 2206                 break;
 2207         case PT_RVI:
 2208         case PT_EPT:
 2209                 /*
 2210                  * XXX
 2211                  * This is a little bogus since the generation number is
 2212                  * supposed to be bumped up when a region of the address
 2213                  * space is invalidated in the page tables.
 2214                  *
 2215                  * In this case the old PDE entry is valid but yet we want
 2216                  * to make sure that any mappings using the old entry are
 2217                  * invalidated in the TLB.
 2218                  *
 2219                  * The reason this works as expected is because we rendezvous
 2220                  * "all" host cpus and force any vcpu context to exit as a
 2221                  * side-effect.
 2222                  */
 2223                 atomic_add_acq_long(&pmap->pm_eptgen, 1);
 2224                 break;
 2225         default:
 2226                 panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
 2227         }
 2228         pde_store(pde, newpde);
 2229 }
 2230 
 2231 /*
 2232  * After changing the page size for the specified virtual address in the page
 2233  * table, flush the corresponding entries from the processor's TLB.  Only the
 2234  * calling processor's TLB is affected.
 2235  *
 2236  * The calling thread must be pinned to a processor.
 2237  */
 2238 static void
 2239 pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
 2240 {
 2241         pt_entry_t PG_G;
 2242 
 2243         if (pmap_type_guest(pmap))
 2244                 return;
 2245 
 2246         KASSERT(pmap->pm_type == PT_X86,
 2247             ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
 2248 
 2249         PG_G = pmap_global_bit(pmap);
 2250 
 2251         if ((newpde & PG_PS) == 0)
 2252                 /* Demotion: flush a specific 2MB page mapping. */
 2253                 invlpg(va);
 2254         else if ((newpde & PG_G) == 0)
 2255                 /*
 2256                  * Promotion: flush every 4KB page mapping from the TLB
 2257                  * because there are too many to flush individually.
 2258                  */
 2259                 invltlb();
 2260         else {
 2261                 /*
 2262                  * Promotion: flush every 4KB page mapping from the TLB,
 2263                  * including any global (PG_G) mappings.
 2264                  */
 2265                 invltlb_glob();
 2266         }
 2267 }
 2268 #ifdef SMP
 2269 
 2270 /*
 2271  * For SMP, these functions have to use the IPI mechanism for coherence.
 2272  *
 2273  * N.B.: Before calling any of the following TLB invalidation functions,
 2274  * the calling processor must ensure that all stores updating a non-
 2275  * kernel page table are globally performed.  Otherwise, another
 2276  * processor could cache an old, pre-update entry without being
 2277  * invalidated.  This can happen one of two ways: (1) The pmap becomes
 2278  * active on another processor after its pm_active field is checked by
 2279  * one of the following functions but before a store updating the page
 2280  * table is globally performed. (2) The pmap becomes active on another
 2281  * processor before its pm_active field is checked but due to
 2282  * speculative loads one of the following functions stills reads the
 2283  * pmap as inactive on the other processor.
 2284  * 
 2285  * The kernel page table is exempt because its pm_active field is
 2286  * immutable.  The kernel page table is always active on every
 2287  * processor.
 2288  */
 2289 
 2290 /*
 2291  * Interrupt the cpus that are executing in the guest context.
 2292  * This will force the vcpu to exit and the cached EPT mappings
 2293  * will be invalidated by the host before the next vmresume.
 2294  */
 2295 static __inline void
 2296 pmap_invalidate_ept(pmap_t pmap)
 2297 {
 2298         int ipinum;
 2299 
 2300         sched_pin();
 2301         KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
 2302             ("pmap_invalidate_ept: absurd pm_active"));
 2303 
 2304         /*
 2305          * The TLB mappings associated with a vcpu context are not
 2306          * flushed each time a different vcpu is chosen to execute.
 2307          *
 2308          * This is in contrast with a process's vtop mappings that
 2309          * are flushed from the TLB on each context switch.
 2310          *
 2311          * Therefore we need to do more than just a TLB shootdown on
 2312          * the active cpus in 'pmap->pm_active'. To do this we keep
 2313          * track of the number of invalidations performed on this pmap.
 2314          *
 2315          * Each vcpu keeps a cache of this counter and compares it
 2316          * just before a vmresume. If the counter is out-of-date an
 2317          * invept will be done to flush stale mappings from the TLB.
 2318          */
 2319         atomic_add_acq_long(&pmap->pm_eptgen, 1);
 2320 
 2321         /*
 2322          * Force the vcpu to exit and trap back into the hypervisor.
 2323          */
 2324         ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
 2325         ipi_selected(pmap->pm_active, ipinum);
 2326         sched_unpin();
 2327 }
 2328 
 2329 static cpuset_t
 2330 pmap_invalidate_cpu_mask(pmap_t pmap)
 2331 {
 2332         return (pmap == kernel_pmap ? all_cpus : pmap->pm_active);
 2333 }
 2334 
 2335 static inline void
 2336 pmap_invalidate_preipi_pcid(pmap_t pmap)
 2337 {
 2338         u_int cpuid, i;
 2339 
 2340         sched_pin();
 2341 
 2342         cpuid = PCPU_GET(cpuid);
 2343         if (pmap != PCPU_GET(curpmap))
 2344                 cpuid = 0xffffffff;     /* An impossible value */
 2345 
 2346         CPU_FOREACH(i) {
 2347                 if (cpuid != i)
 2348                         pmap->pm_pcids[i].pm_gen = 0;
 2349         }
 2350 
 2351         /*
 2352          * The fence is between stores to pm_gen and the read of the
 2353          * pm_active mask.  We need to ensure that it is impossible
 2354          * for us to miss the bit update in pm_active and
 2355          * simultaneously observe a non-zero pm_gen in
 2356          * pmap_activate_sw(), otherwise TLB update is missed.
 2357          * Without the fence, IA32 allows such an outcome.  Note that
 2358          * pm_active is updated by a locked operation, which provides
 2359          * the reciprocal fence.
 2360          */
 2361         atomic_thread_fence_seq_cst();
 2362 }
 2363 
 2364 static void
 2365 pmap_invalidate_preipi_nopcid(pmap_t pmap __unused)
 2366 {
 2367         sched_pin();
 2368 }
 2369 
 2370 DEFINE_IFUNC(static, void, pmap_invalidate_preipi, (pmap_t), static)
 2371 {
 2372         return (pmap_pcid_enabled ? pmap_invalidate_preipi_pcid :
 2373             pmap_invalidate_preipi_nopcid);
 2374 }
 2375 
 2376 static inline void
 2377 pmap_invalidate_page_pcid_cb(pmap_t pmap, vm_offset_t va,
 2378     const bool invpcid_works1)
 2379 {
 2380         struct invpcid_descr d;
 2381         uint64_t kcr3, ucr3;
 2382         uint32_t pcid;
 2383         u_int cpuid;
 2384 
 2385         /*
 2386          * Because pm_pcid is recalculated on a context switch, we
 2387          * must ensure there is no preemption, not just pinning.
 2388          * Otherwise, we might use a stale value below.
 2389          */
 2390         CRITICAL_ASSERT(curthread);
 2391 
 2392         /*
 2393          * No need to do anything with user page tables invalidation
 2394          * if there is no user page table.
 2395          */
 2396         if (pmap->pm_ucr3 == PMAP_NO_CR3)
 2397                 return;
 2398 
 2399         cpuid = PCPU_GET(cpuid);
 2400 
 2401         pcid = pmap->pm_pcids[cpuid].pm_pcid;
 2402         if (invpcid_works1) {
 2403                 d.pcid = pcid | PMAP_PCID_USER_PT;
 2404                 d.pad = 0;
 2405                 d.addr = va;
 2406                 invpcid(&d, INVPCID_ADDR);
 2407         } else {
 2408                 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
 2409                 ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 2410                 pmap_pti_pcid_invlpg(ucr3, kcr3, va);
 2411         }
 2412 }
 2413 
 2414 static void
 2415 pmap_invalidate_page_pcid_invpcid_cb(pmap_t pmap, vm_offset_t va)
 2416 {
 2417         pmap_invalidate_page_pcid_cb(pmap, va, true);
 2418 }
 2419 
 2420 static void
 2421 pmap_invalidate_page_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t va)
 2422 {
 2423         pmap_invalidate_page_pcid_cb(pmap, va, false);
 2424 }
 2425 
 2426 static void
 2427 pmap_invalidate_page_nopcid_cb(pmap_t pmap __unused, vm_offset_t va __unused)
 2428 {
 2429 }
 2430 
 2431 DEFINE_IFUNC(static, void, pmap_invalidate_page_cb, (pmap_t, vm_offset_t),
 2432     static)
 2433 {
 2434         if (pmap_pcid_enabled)
 2435                 return (invpcid_works ? pmap_invalidate_page_pcid_invpcid_cb :
 2436                     pmap_invalidate_page_pcid_noinvpcid_cb);
 2437         return (pmap_invalidate_page_nopcid_cb);
 2438 }
 2439 
 2440 static void
 2441 pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va,
 2442     vm_offset_t addr2 __unused)
 2443 {
 2444         if (pmap == kernel_pmap) {
 2445                 invlpg(va);
 2446         } else if (pmap == PCPU_GET(curpmap)) {
 2447                 invlpg(va);
 2448                 pmap_invalidate_page_cb(pmap, va);
 2449         }
 2450 }
 2451 
 2452 void
 2453 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 2454 {
 2455         if (pmap_type_guest(pmap)) {
 2456                 pmap_invalidate_ept(pmap);
 2457                 return;
 2458         }
 2459 
 2460         KASSERT(pmap->pm_type == PT_X86,
 2461             ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
 2462 
 2463         pmap_invalidate_preipi(pmap);
 2464         smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap,
 2465             pmap_invalidate_page_curcpu_cb);
 2466 }
 2467 
 2468 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
 2469 #define PMAP_INVLPG_THRESHOLD   (4 * 1024 * PAGE_SIZE)
 2470 
 2471 static void
 2472 pmap_invalidate_range_pcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
 2473     const bool invpcid_works1)
 2474 {
 2475         struct invpcid_descr d;
 2476         uint64_t kcr3, ucr3;
 2477         uint32_t pcid;
 2478         u_int cpuid;
 2479 
 2480         CRITICAL_ASSERT(curthread);
 2481 
 2482         if (pmap != PCPU_GET(curpmap) ||
 2483             pmap->pm_ucr3 == PMAP_NO_CR3)
 2484                 return;
 2485 
 2486         cpuid = PCPU_GET(cpuid);
 2487 
 2488         pcid = pmap->pm_pcids[cpuid].pm_pcid;
 2489         if (invpcid_works1) {
 2490                 d.pcid = pcid | PMAP_PCID_USER_PT;
 2491                 d.pad = 0;
 2492                 for (d.addr = sva; d.addr < eva; d.addr += PAGE_SIZE)
 2493                         invpcid(&d, INVPCID_ADDR);
 2494         } else {
 2495                 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
 2496                 ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 2497                 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
 2498         }
 2499 }
 2500 
 2501 static void
 2502 pmap_invalidate_range_pcid_invpcid_cb(pmap_t pmap, vm_offset_t sva,
 2503     vm_offset_t eva)
 2504 {
 2505         pmap_invalidate_range_pcid_cb(pmap, sva, eva, true);
 2506 }
 2507 
 2508 static void
 2509 pmap_invalidate_range_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t sva,
 2510     vm_offset_t eva)
 2511 {
 2512         pmap_invalidate_range_pcid_cb(pmap, sva, eva, false);
 2513 }
 2514 
 2515 static void
 2516 pmap_invalidate_range_nopcid_cb(pmap_t pmap __unused, vm_offset_t sva __unused,
 2517     vm_offset_t eva __unused)
 2518 {
 2519 }
 2520 
 2521 DEFINE_IFUNC(static, void, pmap_invalidate_range_cb, (pmap_t, vm_offset_t,
 2522     vm_offset_t), static)
 2523 {
 2524         if (pmap_pcid_enabled)
 2525                 return (invpcid_works ? pmap_invalidate_range_pcid_invpcid_cb :
 2526                     pmap_invalidate_range_pcid_noinvpcid_cb);
 2527         return (pmap_invalidate_range_nopcid_cb);
 2528 }
 2529 
 2530 static void
 2531 pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 2532 {
 2533         vm_offset_t addr;
 2534 
 2535         if (pmap == kernel_pmap) {
 2536                 for (addr = sva; addr < eva; addr += PAGE_SIZE)
 2537                         invlpg(addr);
 2538         } else if (pmap == PCPU_GET(curpmap)) {
 2539                 for (addr = sva; addr < eva; addr += PAGE_SIZE)
 2540                         invlpg(addr);
 2541                 pmap_invalidate_range_cb(pmap, sva, eva);
 2542         }
 2543 }
 2544 
 2545 void
 2546 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 2547 {
 2548         if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
 2549                 pmap_invalidate_all(pmap);
 2550                 return;
 2551         }
 2552 
 2553         if (pmap_type_guest(pmap)) {
 2554                 pmap_invalidate_ept(pmap);
 2555                 return;
 2556         }
 2557 
 2558         KASSERT(pmap->pm_type == PT_X86,
 2559             ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
 2560 
 2561         pmap_invalidate_preipi(pmap);
 2562         smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap,
 2563             pmap_invalidate_range_curcpu_cb);
 2564 }
 2565 
 2566 static inline void
 2567 pmap_invalidate_all_pcid_cb(pmap_t pmap, bool invpcid_works1)
 2568 {
 2569         struct invpcid_descr d;
 2570         uint64_t kcr3, ucr3;
 2571         uint32_t pcid;
 2572         u_int cpuid;
 2573 
 2574         if (pmap == kernel_pmap) {
 2575                 if (invpcid_works1) {
 2576                         bzero(&d, sizeof(d));
 2577                         invpcid(&d, INVPCID_CTXGLOB);
 2578                 } else {
 2579                         invltlb_glob();
 2580                 }
 2581         } else if (pmap == PCPU_GET(curpmap)) {
 2582                 CRITICAL_ASSERT(curthread);
 2583                 cpuid = PCPU_GET(cpuid);
 2584 
 2585                 pcid = pmap->pm_pcids[cpuid].pm_pcid;
 2586                 if (invpcid_works1) {
 2587                         d.pcid = pcid;
 2588                         d.pad = 0;
 2589                         d.addr = 0;
 2590                         invpcid(&d, INVPCID_CTX);
 2591                         if (pmap->pm_ucr3 != PMAP_NO_CR3) {
 2592                                 d.pcid |= PMAP_PCID_USER_PT;
 2593                                 invpcid(&d, INVPCID_CTX);
 2594                         }
 2595                 } else {
 2596                         kcr3 = pmap->pm_cr3 | pcid;
 2597                         ucr3 = pmap->pm_ucr3;
 2598                         if (ucr3 != PMAP_NO_CR3) {
 2599                                 ucr3 |= pcid | PMAP_PCID_USER_PT;
 2600                                 pmap_pti_pcid_invalidate(ucr3, kcr3);
 2601                         } else {
 2602                                 load_cr3(kcr3);
 2603                         }
 2604                 }
 2605         }
 2606 }
 2607 
 2608 static void
 2609 pmap_invalidate_all_pcid_invpcid_cb(pmap_t pmap)
 2610 {
 2611         pmap_invalidate_all_pcid_cb(pmap, true);
 2612 }
 2613 
 2614 static void
 2615 pmap_invalidate_all_pcid_noinvpcid_cb(pmap_t pmap)
 2616 {
 2617         pmap_invalidate_all_pcid_cb(pmap, false);
 2618 }
 2619 
 2620 static void
 2621 pmap_invalidate_all_nopcid_cb(pmap_t pmap)
 2622 {
 2623         if (pmap == kernel_pmap)
 2624                 invltlb_glob();
 2625         else if (pmap == PCPU_GET(curpmap))
 2626                 invltlb();
 2627 }
 2628 
 2629 DEFINE_IFUNC(static, void, pmap_invalidate_all_cb, (pmap_t), static)
 2630 {
 2631         if (pmap_pcid_enabled)
 2632                 return (invpcid_works ? pmap_invalidate_all_pcid_invpcid_cb :
 2633                     pmap_invalidate_all_pcid_noinvpcid_cb);
 2634         return (pmap_invalidate_all_nopcid_cb);
 2635 }
 2636 
 2637 static void
 2638 pmap_invalidate_all_curcpu_cb(pmap_t pmap, vm_offset_t addr1 __unused,
 2639     vm_offset_t addr2 __unused)
 2640 {
 2641         pmap_invalidate_all_cb(pmap);
 2642 }
 2643 
 2644 void
 2645 pmap_invalidate_all(pmap_t pmap)
 2646 {
 2647         if (pmap_type_guest(pmap)) {
 2648                 pmap_invalidate_ept(pmap);
 2649                 return;
 2650         }
 2651 
 2652         KASSERT(pmap->pm_type == PT_X86,
 2653             ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
 2654 
 2655         pmap_invalidate_preipi(pmap);
 2656         smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap,
 2657             pmap_invalidate_all_curcpu_cb);
 2658 }
 2659 
 2660 static void
 2661 pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, vm_offset_t va __unused,
 2662     vm_offset_t addr2 __unused)
 2663 {
 2664         wbinvd();
 2665 }
 2666 
 2667 void
 2668 pmap_invalidate_cache(void)
 2669 {
 2670         sched_pin();
 2671         smp_cache_flush(pmap_invalidate_cache_curcpu_cb);
 2672 }
 2673 
 2674 struct pde_action {
 2675         cpuset_t invalidate;    /* processors that invalidate their TLB */
 2676         pmap_t pmap;
 2677         vm_offset_t va;
 2678         pd_entry_t *pde;
 2679         pd_entry_t newpde;
 2680         u_int store;            /* processor that updates the PDE */
 2681 };
 2682 
 2683 static void
 2684 pmap_update_pde_action(void *arg)
 2685 {
 2686         struct pde_action *act = arg;
 2687 
 2688         if (act->store == PCPU_GET(cpuid))
 2689                 pmap_update_pde_store(act->pmap, act->pde, act->newpde);
 2690 }
 2691 
 2692 static void
 2693 pmap_update_pde_teardown(void *arg)
 2694 {
 2695         struct pde_action *act = arg;
 2696 
 2697         if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
 2698                 pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
 2699 }
 2700 
 2701 /*
 2702  * Change the page size for the specified virtual address in a way that
 2703  * prevents any possibility of the TLB ever having two entries that map the
 2704  * same virtual address using different page sizes.  This is the recommended
 2705  * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
 2706  * machine check exception for a TLB state that is improperly diagnosed as a
 2707  * hardware error.
 2708  */
 2709 static void
 2710 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 2711 {
 2712         struct pde_action act;
 2713         cpuset_t active, other_cpus;
 2714         u_int cpuid;
 2715 
 2716         sched_pin();
 2717         cpuid = PCPU_GET(cpuid);
 2718         other_cpus = all_cpus;
 2719         CPU_CLR(cpuid, &other_cpus);
 2720         if (pmap == kernel_pmap || pmap_type_guest(pmap)) 
 2721                 active = all_cpus;
 2722         else {
 2723                 active = pmap->pm_active;
 2724         }
 2725         if (CPU_OVERLAP(&active, &other_cpus)) { 
 2726                 act.store = cpuid;
 2727                 act.invalidate = active;
 2728                 act.va = va;
 2729                 act.pmap = pmap;
 2730                 act.pde = pde;
 2731                 act.newpde = newpde;
 2732                 CPU_SET(cpuid, &active);
 2733                 smp_rendezvous_cpus(active,
 2734                     smp_no_rendezvous_barrier, pmap_update_pde_action,
 2735                     pmap_update_pde_teardown, &act);
 2736         } else {
 2737                 pmap_update_pde_store(pmap, pde, newpde);
 2738                 if (CPU_ISSET(cpuid, &active))
 2739                         pmap_update_pde_invalidate(pmap, va, newpde);
 2740         }
 2741         sched_unpin();
 2742 }
 2743 #else /* !SMP */
 2744 /*
 2745  * Normal, non-SMP, invalidation functions.
 2746  */
 2747 void
 2748 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 2749 {
 2750         struct invpcid_descr d;
 2751         uint64_t kcr3, ucr3;
 2752         uint32_t pcid;
 2753 
 2754         if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
 2755                 pmap->pm_eptgen++;
 2756                 return;
 2757         }
 2758         KASSERT(pmap->pm_type == PT_X86,
 2759             ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
 2760 
 2761         if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
 2762                 invlpg(va);
 2763                 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
 2764                     pmap->pm_ucr3 != PMAP_NO_CR3) {
 2765                         critical_enter();
 2766                         pcid = pmap->pm_pcids[0].pm_pcid;
 2767                         if (invpcid_works) {
 2768                                 d.pcid = pcid | PMAP_PCID_USER_PT;
 2769                                 d.pad = 0;
 2770                                 d.addr = va;
 2771                                 invpcid(&d, INVPCID_ADDR);
 2772                         } else {
 2773                                 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
 2774                                 ucr3 = pmap->pm_ucr3 | pcid |
 2775                                     PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 2776                                 pmap_pti_pcid_invlpg(ucr3, kcr3, va);
 2777                         }
 2778                         critical_exit();
 2779                 }
 2780         } else if (pmap_pcid_enabled)
 2781                 pmap->pm_pcids[0].pm_gen = 0;
 2782 }
 2783 
 2784 void
 2785 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 2786 {
 2787         struct invpcid_descr d;
 2788         vm_offset_t addr;
 2789         uint64_t kcr3, ucr3;
 2790 
 2791         if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
 2792                 pmap->pm_eptgen++;
 2793                 return;
 2794         }
 2795         KASSERT(pmap->pm_type == PT_X86,
 2796             ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
 2797 
 2798         if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
 2799                 for (addr = sva; addr < eva; addr += PAGE_SIZE)
 2800                         invlpg(addr);
 2801                 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
 2802                     pmap->pm_ucr3 != PMAP_NO_CR3) {
 2803                         critical_enter();
 2804                         if (invpcid_works) {
 2805                                 d.pcid = pmap->pm_pcids[0].pm_pcid |
 2806                                     PMAP_PCID_USER_PT;
 2807                                 d.pad = 0;
 2808                                 d.addr = sva;
 2809                                 for (; d.addr < eva; d.addr += PAGE_SIZE)
 2810                                         invpcid(&d, INVPCID_ADDR);
 2811                         } else {
 2812                                 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].
 2813                                     pm_pcid | CR3_PCID_SAVE;
 2814                                 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0].
 2815                                     pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 2816                                 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
 2817                         }
 2818                         critical_exit();
 2819                 }
 2820         } else if (pmap_pcid_enabled) {
 2821                 pmap->pm_pcids[0].pm_gen = 0;
 2822         }
 2823 }
 2824 
 2825 void
 2826 pmap_invalidate_all(pmap_t pmap)
 2827 {
 2828         struct invpcid_descr d;
 2829         uint64_t kcr3, ucr3;
 2830 
 2831         if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
 2832                 pmap->pm_eptgen++;
 2833                 return;
 2834         }
 2835         KASSERT(pmap->pm_type == PT_X86,
 2836             ("pmap_invalidate_all: unknown type %d", pmap->pm_type));
 2837 
 2838         if (pmap == kernel_pmap) {
 2839                 if (pmap_pcid_enabled && invpcid_works) {
 2840                         bzero(&d, sizeof(d));
 2841                         invpcid(&d, INVPCID_CTXGLOB);
 2842                 } else {
 2843                         invltlb_glob();
 2844                 }
 2845         } else if (pmap == PCPU_GET(curpmap)) {
 2846                 if (pmap_pcid_enabled) {
 2847                         critical_enter();
 2848                         if (invpcid_works) {
 2849                                 d.pcid = pmap->pm_pcids[0].pm_pcid;
 2850                                 d.pad = 0;
 2851                                 d.addr = 0;
 2852                                 invpcid(&d, INVPCID_CTX);
 2853                                 if (pmap->pm_ucr3 != PMAP_NO_CR3) {
 2854                                         d.pcid |= PMAP_PCID_USER_PT;
 2855                                         invpcid(&d, INVPCID_CTX);
 2856                                 }
 2857                         } else {
 2858                                 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid;
 2859                                 if (pmap->pm_ucr3 != PMAP_NO_CR3) {
 2860                                         ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[
 2861                                             0].pm_pcid | PMAP_PCID_USER_PT;
 2862                                         pmap_pti_pcid_invalidate(ucr3, kcr3);
 2863                                 } else
 2864                                         load_cr3(kcr3);
 2865                         }
 2866                         critical_exit();
 2867                 } else {
 2868                         invltlb();
 2869                 }
 2870         } else if (pmap_pcid_enabled) {
 2871                 pmap->pm_pcids[0].pm_gen = 0;
 2872         }
 2873 }
 2874 
 2875 PMAP_INLINE void
 2876 pmap_invalidate_cache(void)
 2877 {
 2878 
 2879         wbinvd();
 2880 }
 2881 
 2882 static void
 2883 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 2884 {
 2885 
 2886         pmap_update_pde_store(pmap, pde, newpde);
 2887         if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
 2888                 pmap_update_pde_invalidate(pmap, va, newpde);
 2889         else
 2890                 pmap->pm_pcids[0].pm_gen = 0;
 2891 }
 2892 #endif /* !SMP */
 2893 
 2894 static void
 2895 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
 2896 {
 2897 
 2898         /*
 2899          * When the PDE has PG_PROMOTED set, the 2MB page mapping was created
 2900          * by a promotion that did not invalidate the 512 4KB page mappings
 2901          * that might exist in the TLB.  Consequently, at this point, the TLB
 2902          * may hold both 4KB and 2MB page mappings for the address range [va,
 2903          * va + NBPDR).  Therefore, the entire range must be invalidated here.
 2904          * In contrast, when PG_PROMOTED is clear, the TLB will not hold any
 2905          * 4KB page mappings for the address range [va, va + NBPDR), and so a
 2906          * single INVLPG suffices to invalidate the 2MB page mapping from the
 2907          * TLB.
 2908          */
 2909         if ((pde & PG_PROMOTED) != 0)
 2910                 pmap_invalidate_range(pmap, va, va + NBPDR - 1);
 2911         else
 2912                 pmap_invalidate_page(pmap, va);
 2913 }
 2914 
 2915 DEFINE_IFUNC(, void, pmap_invalidate_cache_range,
 2916     (vm_offset_t sva, vm_offset_t eva), static)
 2917 {
 2918 
 2919         if ((cpu_feature & CPUID_SS) != 0)
 2920                 return (pmap_invalidate_cache_range_selfsnoop);
 2921         if ((cpu_feature & CPUID_CLFSH) != 0)
 2922                 return (pmap_force_invalidate_cache_range);
 2923         return (pmap_invalidate_cache_range_all);
 2924 }
 2925 
 2926 #define PMAP_CLFLUSH_THRESHOLD   (2 * 1024 * 1024)
 2927 
 2928 static void
 2929 pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva)
 2930 {
 2931 
 2932         KASSERT((sva & PAGE_MASK) == 0,
 2933             ("pmap_invalidate_cache_range: sva not page-aligned"));
 2934         KASSERT((eva & PAGE_MASK) == 0,
 2935             ("pmap_invalidate_cache_range: eva not page-aligned"));
 2936 }
 2937 
 2938 static void
 2939 pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva)
 2940 {
 2941 
 2942         pmap_invalidate_cache_range_check_align(sva, eva);
 2943 }
 2944 
 2945 void
 2946 pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
 2947 {
 2948 
 2949         sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);
 2950 
 2951         /*
 2952          * XXX: Some CPUs fault, hang, or trash the local APIC
 2953          * registers if we use CLFLUSH on the local APIC range.  The
 2954          * local APIC is always uncached, so we don't need to flush
 2955          * for that range anyway.
 2956          */
 2957         if (pmap_kextract(sva) == lapic_paddr)
 2958                 return;
 2959 
 2960         if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) {
 2961                 /*
 2962                  * Do per-cache line flush.  Use a locked
 2963                  * instruction to insure that previous stores are
 2964                  * included in the write-back.  The processor
 2965                  * propagates flush to other processors in the cache
 2966                  * coherence domain.
 2967                  */
 2968                 atomic_thread_fence_seq_cst();
 2969                 for (; sva < eva; sva += cpu_clflush_line_size)
 2970                         clflushopt(sva);
 2971                 atomic_thread_fence_seq_cst();
 2972         } else {
 2973                 /*
 2974                  * Writes are ordered by CLFLUSH on Intel CPUs.
 2975                  */
 2976                 if (cpu_vendor_id != CPU_VENDOR_INTEL)
 2977                         mfence();
 2978                 for (; sva < eva; sva += cpu_clflush_line_size)
 2979                         clflush(sva);
 2980                 if (cpu_vendor_id != CPU_VENDOR_INTEL)
 2981                         mfence();
 2982         }
 2983 }
 2984 
 2985 static void
 2986 pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva)
 2987 {
 2988 
 2989         pmap_invalidate_cache_range_check_align(sva, eva);
 2990         pmap_invalidate_cache();
 2991 }
 2992 
 2993 /*
 2994  * Remove the specified set of pages from the data and instruction caches.
 2995  *
 2996  * In contrast to pmap_invalidate_cache_range(), this function does not
 2997  * rely on the CPU's self-snoop feature, because it is intended for use
 2998  * when moving pages into a different cache domain.
 2999  */
 3000 void
 3001 pmap_invalidate_cache_pages(vm_page_t *pages, int count)
 3002 {
 3003         vm_offset_t daddr, eva;
 3004         int i;
 3005         bool useclflushopt;
 3006 
 3007         useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
 3008         if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
 3009             ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt))
 3010                 pmap_invalidate_cache();
 3011         else {
 3012                 if (useclflushopt)
 3013                         atomic_thread_fence_seq_cst();
 3014                 else if (cpu_vendor_id != CPU_VENDOR_INTEL)
 3015                         mfence();
 3016                 for (i = 0; i < count; i++) {
 3017                         daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
 3018                         eva = daddr + PAGE_SIZE;
 3019                         for (; daddr < eva; daddr += cpu_clflush_line_size) {
 3020                                 if (useclflushopt)
 3021                                         clflushopt(daddr);
 3022                                 else
 3023                                         clflush(daddr);
 3024                         }
 3025                 }
 3026                 if (useclflushopt)
 3027                         atomic_thread_fence_seq_cst();
 3028                 else if (cpu_vendor_id != CPU_VENDOR_INTEL)
 3029                         mfence();
 3030         }
 3031 }
 3032 
 3033 void
 3034 pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva)
 3035 {
 3036 
 3037         pmap_invalidate_cache_range_check_align(sva, eva);
 3038 
 3039         if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) {
 3040                 pmap_force_invalidate_cache_range(sva, eva);
 3041                 return;
 3042         }
 3043 
 3044         /* See comment in pmap_force_invalidate_cache_range(). */
 3045         if (pmap_kextract(sva) == lapic_paddr)
 3046                 return;
 3047 
 3048         atomic_thread_fence_seq_cst();
 3049         for (; sva < eva; sva += cpu_clflush_line_size)
 3050                 clwb(sva);
 3051         atomic_thread_fence_seq_cst();
 3052 }
 3053 
 3054 void
 3055 pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr)
 3056 {
 3057         pt_entry_t *pte;
 3058         vm_offset_t vaddr;
 3059         int error, pte_bits;
 3060 
 3061         KASSERT((spa & PAGE_MASK) == 0,
 3062             ("pmap_flush_cache_phys_range: spa not page-aligned"));
 3063         KASSERT((epa & PAGE_MASK) == 0,
 3064             ("pmap_flush_cache_phys_range: epa not page-aligned"));
 3065 
 3066         if (spa < dmaplimit) {
 3067                 pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN(
 3068                     dmaplimit, epa)));
 3069                 if (dmaplimit >= epa)
 3070                         return;
 3071                 spa = dmaplimit;
 3072         }
 3073 
 3074         pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW |
 3075             X86_PG_V;
 3076         error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
 3077             &vaddr);
 3078         KASSERT(error == 0, ("vmem_alloc failed: %d", error));
 3079         pte = vtopte(vaddr);
 3080         for (; spa < epa; spa += PAGE_SIZE) {
 3081                 sched_pin();
 3082                 pte_store(pte, spa | pte_bits);
 3083                 invlpg(vaddr);
 3084                 /* XXXKIB atomic inside flush_cache_range are excessive */
 3085                 pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE);
 3086                 sched_unpin();
 3087         }
 3088         vmem_free(kernel_arena, vaddr, PAGE_SIZE);
 3089 }
 3090 
 3091 /*
 3092  *      Routine:        pmap_extract
 3093  *      Function:
 3094  *              Extract the physical page address associated
 3095  *              with the given map/virtual_address pair.
 3096  */
 3097 vm_paddr_t 
 3098 pmap_extract(pmap_t pmap, vm_offset_t va)
 3099 {
 3100         pdp_entry_t *pdpe;
 3101         pd_entry_t *pde;
 3102         pt_entry_t *pte, PG_V;
 3103         vm_paddr_t pa;
 3104 
 3105         pa = 0;
 3106         PG_V = pmap_valid_bit(pmap);
 3107         PMAP_LOCK(pmap);
 3108         pdpe = pmap_pdpe(pmap, va);
 3109         if (pdpe != NULL && (*pdpe & PG_V) != 0) {
 3110                 if ((*pdpe & PG_PS) != 0)
 3111                         pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
 3112                 else {
 3113                         pde = pmap_pdpe_to_pde(pdpe, va);
 3114                         if ((*pde & PG_V) != 0) {
 3115                                 if ((*pde & PG_PS) != 0) {
 3116                                         pa = (*pde & PG_PS_FRAME) |
 3117                                             (va & PDRMASK);
 3118                                 } else {
 3119                                         pte = pmap_pde_to_pte(pde, va);
 3120                                         pa = (*pte & PG_FRAME) |
 3121                                             (va & PAGE_MASK);
 3122                                 }
 3123                         }
 3124                 }
 3125         }
 3126         PMAP_UNLOCK(pmap);
 3127         return (pa);
 3128 }
 3129 
 3130 /*
 3131  *      Routine:        pmap_extract_and_hold
 3132  *      Function:
 3133  *              Atomically extract and hold the physical page
 3134  *              with the given pmap and virtual address pair
 3135  *              if that mapping permits the given protection.
 3136  */
 3137 vm_page_t
 3138 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 3139 {
 3140         pd_entry_t pde, *pdep;
 3141         pt_entry_t pte, PG_RW, PG_V;
 3142         vm_paddr_t pa;
 3143         vm_page_t m;
 3144 
 3145         pa = 0;
 3146         m = NULL;
 3147         PG_RW = pmap_rw_bit(pmap);
 3148         PG_V = pmap_valid_bit(pmap);
 3149         PMAP_LOCK(pmap);
 3150 retry:
 3151         pdep = pmap_pde(pmap, va);
 3152         if (pdep != NULL && (pde = *pdep)) {
 3153                 if (pde & PG_PS) {
 3154                         if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
 3155                                 if (vm_page_pa_tryrelock(pmap, (pde &
 3156                                     PG_PS_FRAME) | (va & PDRMASK), &pa))
 3157                                         goto retry;
 3158                                 m = PHYS_TO_VM_PAGE(pa);
 3159                         }
 3160                 } else {
 3161                         pte = *pmap_pde_to_pte(pdep, va);
 3162                         if ((pte & PG_V) &&
 3163                             ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
 3164                                 if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
 3165                                     &pa))
 3166                                         goto retry;
 3167                                 m = PHYS_TO_VM_PAGE(pa);
 3168                         }
 3169                 }
 3170                 if (m != NULL)
 3171                         vm_page_hold(m);
 3172         }
 3173         PA_UNLOCK_COND(pa);
 3174         PMAP_UNLOCK(pmap);
 3175         return (m);
 3176 }
 3177 
 3178 vm_paddr_t
 3179 pmap_kextract(vm_offset_t va)
 3180 {
 3181         pd_entry_t pde;
 3182         vm_paddr_t pa;
 3183 
 3184         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
 3185                 pa = DMAP_TO_PHYS(va);
 3186         } else if (PMAP_ADDRESS_IN_LARGEMAP(va)) {
 3187                 pa = pmap_large_map_kextract(va);
 3188         } else {
 3189                 pde = *vtopde(va);
 3190                 if (pde & PG_PS) {
 3191                         pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
 3192                 } else {
 3193                         /*
 3194                          * Beware of a concurrent promotion that changes the
 3195                          * PDE at this point!  For example, vtopte() must not
 3196                          * be used to access the PTE because it would use the
 3197                          * new PDE.  It is, however, safe to use the old PDE
 3198                          * because the page table page is preserved by the
 3199                          * promotion.
 3200                          */
 3201                         pa = *pmap_pde_to_pte(&pde, va);
 3202                         pa = (pa & PG_FRAME) | (va & PAGE_MASK);
 3203                 }
 3204         }
 3205         return (pa);
 3206 }
 3207 
 3208 /***************************************************
 3209  * Low level mapping routines.....
 3210  ***************************************************/
 3211 
 3212 /*
 3213  * Add a wired page to the kva.
 3214  * Note: not SMP coherent.
 3215  */
 3216 PMAP_INLINE void 
 3217 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
 3218 {
 3219         pt_entry_t *pte;
 3220 
 3221         pte = vtopte(va);
 3222         pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | pg_nx);
 3223 }
 3224 
 3225 static __inline void
 3226 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
 3227 {
 3228         pt_entry_t *pte;
 3229         int cache_bits;
 3230 
 3231         pte = vtopte(va);
 3232         cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
 3233         pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | pg_nx | cache_bits);
 3234 }
 3235 
 3236 /*
 3237  * Remove a page from the kernel pagetables.
 3238  * Note: not SMP coherent.
 3239  */
 3240 PMAP_INLINE void
 3241 pmap_kremove(vm_offset_t va)
 3242 {
 3243         pt_entry_t *pte;
 3244 
 3245         pte = vtopte(va);
 3246         pte_clear(pte);
 3247 }
 3248 
 3249 /*
 3250  *      Used to map a range of physical addresses into kernel
 3251  *      virtual address space.
 3252  *
 3253  *      The value passed in '*virt' is a suggested virtual address for
 3254  *      the mapping. Architectures which can support a direct-mapped
 3255  *      physical to virtual region can return the appropriate address
 3256  *      within that region, leaving '*virt' unchanged. Other
 3257  *      architectures should map the pages starting at '*virt' and
 3258  *      update '*virt' with the first usable address after the mapped
 3259  *      region.
 3260  */
 3261 vm_offset_t
 3262 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
 3263 {
 3264         return PHYS_TO_DMAP(start);
 3265 }
 3266 
 3267 
 3268 /*
 3269  * Add a list of wired pages to the kva
 3270  * this routine is only used for temporary
 3271  * kernel mappings that do not need to have
 3272  * page modification or references recorded.
 3273  * Note that old mappings are simply written
 3274  * over.  The page *must* be wired.
 3275  * Note: SMP coherent.  Uses a ranged shootdown IPI.
 3276  */
 3277 void
 3278 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
 3279 {
 3280         pt_entry_t *endpte, oldpte, pa, *pte;
 3281         vm_page_t m;
 3282         int cache_bits;
 3283 
 3284         oldpte = 0;
 3285         pte = vtopte(sva);
 3286         endpte = pte + count;
 3287         while (pte < endpte) {
 3288                 m = *ma++;
 3289                 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
 3290                 pa = VM_PAGE_TO_PHYS(m) | cache_bits;
 3291                 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
 3292                         oldpte |= *pte;
 3293                         pte_store(pte, pa | pg_g | pg_nx | X86_PG_RW | X86_PG_V);
 3294                 }
 3295                 pte++;
 3296         }
 3297         if (__predict_false((oldpte & X86_PG_V) != 0))
 3298                 pmap_invalidate_range(kernel_pmap, sva, sva + count *
 3299                     PAGE_SIZE);
 3300 }
 3301 
 3302 /*
 3303  * This routine tears out page mappings from the
 3304  * kernel -- it is meant only for temporary mappings.
 3305  * Note: SMP coherent.  Uses a ranged shootdown IPI.
 3306  */
 3307 void
 3308 pmap_qremove(vm_offset_t sva, int count)
 3309 {
 3310         vm_offset_t va;
 3311 
 3312         va = sva;
 3313         while (count-- > 0) {
 3314                 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
 3315                 pmap_kremove(va);
 3316                 va += PAGE_SIZE;
 3317         }
 3318         pmap_invalidate_range(kernel_pmap, sva, va);
 3319 }
 3320 
 3321 /***************************************************
 3322  * Page table page management routines.....
 3323  ***************************************************/
 3324 /*
 3325  * Schedule the specified unused page table page to be freed.  Specifically,
 3326  * add the page to the specified list of pages that will be released to the
 3327  * physical memory manager after the TLB has been updated.
 3328  */
 3329 static __inline void
 3330 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
 3331     boolean_t set_PG_ZERO)
 3332 {
 3333 
 3334         if (set_PG_ZERO)
 3335                 m->flags |= PG_ZERO;
 3336         else
 3337                 m->flags &= ~PG_ZERO;
 3338         SLIST_INSERT_HEAD(free, m, plinks.s.ss);
 3339 }
 3340         
 3341 /*
 3342  * Inserts the specified page table page into the specified pmap's collection
 3343  * of idle page table pages.  Each of a pmap's page table pages is responsible
 3344  * for mapping a distinct range of virtual addresses.  The pmap's collection is
 3345  * ordered by this virtual address range.
 3346  *
 3347  * If "promoted" is false, then the page table page "mpte" must be zero filled.
 3348  */
 3349 static __inline int
 3350 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted)
 3351 {
 3352 
 3353         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3354         mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0;
 3355         return (vm_radix_insert(&pmap->pm_root, mpte));
 3356 }
 3357 
 3358 /*
 3359  * Removes the page table page mapping the specified virtual address from the
 3360  * specified pmap's collection of idle page table pages, and returns it.
 3361  * Otherwise, returns NULL if there is no page table page corresponding to the
 3362  * specified virtual address.
 3363  */
 3364 static __inline vm_page_t
 3365 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
 3366 {
 3367 
 3368         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3369         return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va)));
 3370 }
 3371 
 3372 /*
 3373  * Decrements a page table page's wire count, which is used to record the
 3374  * number of valid page table entries within the page.  If the wire count
 3375  * drops to zero, then the page table page is unmapped.  Returns TRUE if the
 3376  * page table page was unmapped and FALSE otherwise.
 3377  */
 3378 static inline boolean_t
 3379 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 3380 {
 3381 
 3382         --m->wire_count;
 3383         if (m->wire_count == 0) {
 3384                 _pmap_unwire_ptp(pmap, va, m, free);
 3385                 return (TRUE);
 3386         } else
 3387                 return (FALSE);
 3388 }
 3389 
 3390 static void
 3391 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 3392 {
 3393 
 3394         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3395         /*
 3396          * unmap the page table page
 3397          */
 3398         if (m->pindex >= (NUPDE + NUPDPE)) {
 3399                 /* PDP page */
 3400                 pml4_entry_t *pml4;
 3401                 pml4 = pmap_pml4e(pmap, va);
 3402                 *pml4 = 0;
 3403                 if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) {
 3404                         pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)];
 3405                         *pml4 = 0;
 3406                 }
 3407         } else if (m->pindex >= NUPDE) {
 3408                 /* PD page */
 3409                 pdp_entry_t *pdp;
 3410                 pdp = pmap_pdpe(pmap, va);
 3411                 *pdp = 0;
 3412         } else {
 3413                 /* PTE page */
 3414                 pd_entry_t *pd;
 3415                 pd = pmap_pde(pmap, va);
 3416                 *pd = 0;
 3417         }
 3418         pmap_resident_count_dec(pmap, 1);
 3419         if (m->pindex < NUPDE) {
 3420                 /* We just released a PT, unhold the matching PD */
 3421                 vm_page_t pdpg;
 3422 
 3423                 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
 3424                 pmap_unwire_ptp(pmap, va, pdpg, free);
 3425         }
 3426         if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
 3427                 /* We just released a PD, unhold the matching PDP */
 3428                 vm_page_t pdppg;
 3429 
 3430                 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
 3431                 pmap_unwire_ptp(pmap, va, pdppg, free);
 3432         }
 3433 
 3434         /* 
 3435          * Put page on a list so that it is released after
 3436          * *ALL* TLB shootdown is done
 3437          */
 3438         pmap_add_delayed_free_list(m, free, TRUE);
 3439 }
 3440 
 3441 /*
 3442  * After removing a page table entry, this routine is used to
 3443  * conditionally free the page, and manage the hold/wire counts.
 3444  */
 3445 static int
 3446 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
 3447     struct spglist *free)
 3448 {
 3449         vm_page_t mpte;
 3450 
 3451         if (va >= VM_MAXUSER_ADDRESS)
 3452                 return (0);
 3453         KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
 3454         mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
 3455         return (pmap_unwire_ptp(pmap, va, mpte, free));
 3456 }
 3457 
 3458 void
 3459 pmap_pinit0(pmap_t pmap)
 3460 {
 3461         struct proc *p;
 3462         struct thread *td;
 3463         int i;
 3464 
 3465         PMAP_LOCK_INIT(pmap);
 3466         pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
 3467         pmap->pm_pml4u = NULL;
 3468         pmap->pm_cr3 = KPML4phys;
 3469         /* hack to keep pmap_pti_pcid_invalidate() alive */
 3470         pmap->pm_ucr3 = PMAP_NO_CR3;
 3471         pmap->pm_root.rt_root = 0;
 3472         CPU_ZERO(&pmap->pm_active);
 3473         TAILQ_INIT(&pmap->pm_pvchunk);
 3474         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 3475         pmap->pm_flags = pmap_flags;
 3476         CPU_FOREACH(i) {
 3477                 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1;
 3478                 pmap->pm_pcids[i].pm_gen = 1;
 3479         }
 3480         pmap_activate_boot(pmap);
 3481         td = curthread;
 3482         if (pti) {
 3483                 p = td->td_proc;
 3484                 PROC_LOCK(p);
 3485                 p->p_amd64_md_flags |= P_MD_KPTI;
 3486                 PROC_UNLOCK(p);
 3487         }
 3488         pmap_thread_init_invl_gen(td);
 3489 
 3490         if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
 3491                 pmap_pkru_ranges_zone = uma_zcreate("pkru ranges",
 3492                     sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL,
 3493                     UMA_ALIGN_PTR, 0);
 3494         }
 3495 }
 3496 
 3497 void
 3498 pmap_pinit_pml4(vm_page_t pml4pg)
 3499 {
 3500         pml4_entry_t *pm_pml4;
 3501         int i;
 3502 
 3503         pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
 3504 
 3505         /* Wire in kernel global address entries. */
 3506         for (i = 0; i < NKPML4E; i++) {
 3507                 pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW |
 3508                     X86_PG_V;
 3509         }
 3510         for (i = 0; i < ndmpdpphys; i++) {
 3511                 pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW |
 3512                     X86_PG_V;
 3513         }
 3514 
 3515         /* install self-referential address mapping entry(s) */
 3516         pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW |
 3517             X86_PG_A | X86_PG_M;
 3518 
 3519         /* install large map entries if configured */
 3520         for (i = 0; i < lm_ents; i++)
 3521                 pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pml4[LMSPML4I + i];
 3522 }
 3523 
 3524 static void
 3525 pmap_pinit_pml4_pti(vm_page_t pml4pg)
 3526 {
 3527         pml4_entry_t *pm_pml4;
 3528         int i;
 3529 
 3530         pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
 3531         for (i = 0; i < NPML4EPG; i++)
 3532                 pm_pml4[i] = pti_pml4[i];
 3533 }
 3534 
 3535 /*
 3536  * Initialize a preallocated and zeroed pmap structure,
 3537  * such as one in a vmspace structure.
 3538  */
 3539 int
 3540 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
 3541 {
 3542         vm_page_t pml4pg, pml4pgu;
 3543         vm_paddr_t pml4phys;
 3544         int i;
 3545 
 3546         /*
 3547          * allocate the page directory page
 3548          */
 3549         pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 3550             VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK);
 3551 
 3552         pml4phys = VM_PAGE_TO_PHYS(pml4pg);
 3553         pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
 3554         CPU_FOREACH(i) {
 3555                 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
 3556                 pmap->pm_pcids[i].pm_gen = 0;
 3557         }
 3558         pmap->pm_cr3 = PMAP_NO_CR3;     /* initialize to an invalid value */
 3559         pmap->pm_ucr3 = PMAP_NO_CR3;
 3560         pmap->pm_pml4u = NULL;
 3561 
 3562         pmap->pm_type = pm_type;
 3563         if ((pml4pg->flags & PG_ZERO) == 0)
 3564                 pagezero(pmap->pm_pml4);
 3565 
 3566         /*
 3567          * Do not install the host kernel mappings in the nested page
 3568          * tables. These mappings are meaningless in the guest physical
 3569          * address space.
 3570          * Install minimal kernel mappings in PTI case.
 3571          */
 3572         if (pm_type == PT_X86) {
 3573                 pmap->pm_cr3 = pml4phys;
 3574                 pmap_pinit_pml4(pml4pg);
 3575                 if ((curproc->p_amd64_md_flags & P_MD_KPTI) != 0) {
 3576                         pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 3577                             VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
 3578                         pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(
 3579                             VM_PAGE_TO_PHYS(pml4pgu));
 3580                         pmap_pinit_pml4_pti(pml4pgu);
 3581                         pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu);
 3582                 }
 3583                 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
 3584                         rangeset_init(&pmap->pm_pkru, pkru_dup_range,
 3585                             pkru_free_range, pmap, M_NOWAIT);
 3586                 }
 3587         }
 3588 
 3589         pmap->pm_root.rt_root = 0;
 3590         CPU_ZERO(&pmap->pm_active);
 3591         TAILQ_INIT(&pmap->pm_pvchunk);
 3592         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 3593         pmap->pm_flags = flags;
 3594         pmap->pm_eptgen = 0;
 3595 
 3596         return (1);
 3597 }
 3598 
 3599 int
 3600 pmap_pinit(pmap_t pmap)
 3601 {
 3602 
 3603         return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
 3604 }
 3605 
 3606 /*
 3607  * This routine is called if the desired page table page does not exist.
 3608  *
 3609  * If page table page allocation fails, this routine may sleep before
 3610  * returning NULL.  It sleeps only if a lock pointer was given.
 3611  *
 3612  * Note: If a page allocation fails at page table level two or three,
 3613  * one or two pages may be held during the wait, only to be released
 3614  * afterwards.  This conservative approach is easily argued to avoid
 3615  * race conditions.
 3616  *
 3617  * The ptepindexes, i.e. page indices, of the page table pages encountered
 3618  * while translating virtual address va are defined as follows:
 3619  * - for the page table page (last level),
 3620  *      ptepindex = pmap_pde_pindex(va) = va >> PDRSHIFT,
 3621  *   in other words, it is just the index of the PDE that maps the page
 3622  *   table page.
 3623  * - for the page directory page,
 3624  *      ptepindex = NUPDE (number of userland PD entries) +
 3625  *          (pmap_pde_index(va) >> NPDEPGSHIFT)
 3626  *   i.e. index of PDPE is put after the last index of PDE,
 3627  * - for the page directory pointer page,
 3628  *      ptepindex = NUPDE + NUPDPE + (pmap_pde_index(va) >> (NPDEPGSHIFT +
 3629  *          NPML4EPGSHIFT),
 3630  *   i.e. index of pml4e is put after the last index of PDPE.
 3631  *
 3632  * Define an order on the paging entries, where all entries of the
 3633  * same height are put together, then heights are put from deepest to
 3634  * root.  Then ptexpindex is the sequential number of the
 3635  * corresponding paging entry in this order.
 3636  *
 3637  * The root page at PML4 does not participate in this indexing scheme, since
 3638  * it is statically allocated by pmap_pinit() and not by _pmap_allocpte().
 3639  */
 3640 static vm_page_t
 3641 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
 3642 {
 3643         vm_page_t m, pdppg, pdpg;
 3644         pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 3645 
 3646         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3647 
 3648         PG_A = pmap_accessed_bit(pmap);
 3649         PG_M = pmap_modified_bit(pmap);
 3650         PG_V = pmap_valid_bit(pmap);
 3651         PG_RW = pmap_rw_bit(pmap);
 3652 
 3653         /*
 3654          * Allocate a page table page.
 3655          */
 3656         if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
 3657             VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
 3658                 if (lockp != NULL) {
 3659                         RELEASE_PV_LIST_LOCK(lockp);
 3660                         PMAP_UNLOCK(pmap);
 3661                         PMAP_ASSERT_NOT_IN_DI();
 3662                         vm_wait(NULL);
 3663                         PMAP_LOCK(pmap);
 3664                 }
 3665 
 3666                 /*
 3667                  * Indicate the need to retry.  While waiting, the page table
 3668                  * page may have been allocated.
 3669                  */
 3670                 return (NULL);
 3671         }
 3672         if ((m->flags & PG_ZERO) == 0)
 3673                 pmap_zero_page(m);
 3674 
 3675         /*
 3676          * Map the pagetable page into the process address space, if
 3677          * it isn't already there.
 3678          */
 3679 
 3680         if (ptepindex >= (NUPDE + NUPDPE)) {
 3681                 pml4_entry_t *pml4, *pml4u;
 3682                 vm_pindex_t pml4index;
 3683 
 3684                 /* Wire up a new PDPE page */
 3685                 pml4index = ptepindex - (NUPDE + NUPDPE);
 3686                 pml4 = &pmap->pm_pml4[pml4index];
 3687                 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 3688                 if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) {
 3689                         /*
 3690                          * PTI: Make all user-space mappings in the
 3691                          * kernel-mode page table no-execute so that
 3692                          * we detect any programming errors that leave
 3693                          * the kernel-mode page table active on return
 3694                          * to user space.
 3695                          */
 3696                         if (pmap->pm_ucr3 != PMAP_NO_CR3)
 3697                                 *pml4 |= pg_nx;
 3698 
 3699                         pml4u = &pmap->pm_pml4u[pml4index];
 3700                         *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
 3701                             PG_A | PG_M;
 3702                 }
 3703 
 3704         } else if (ptepindex >= NUPDE) {
 3705                 vm_pindex_t pml4index;
 3706                 vm_pindex_t pdpindex;
 3707                 pml4_entry_t *pml4;
 3708                 pdp_entry_t *pdp;
 3709 
 3710                 /* Wire up a new PDE page */
 3711                 pdpindex = ptepindex - NUPDE;
 3712                 pml4index = pdpindex >> NPML4EPGSHIFT;
 3713 
 3714                 pml4 = &pmap->pm_pml4[pml4index];
 3715                 if ((*pml4 & PG_V) == 0) {
 3716                         /* Have to allocate a new pdp, recurse */
 3717                         if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
 3718                             lockp) == NULL) {
 3719                                 vm_page_unwire_noq(m);
 3720                                 vm_page_free_zero(m);
 3721                                 return (NULL);
 3722                         }
 3723                 } else {
 3724                         /* Add reference to pdp page */
 3725                         pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
 3726                         pdppg->wire_count++;
 3727                 }
 3728                 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 3729 
 3730                 /* Now find the pdp page */
 3731                 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 3732                 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 3733 
 3734         } else {
 3735                 vm_pindex_t pml4index;
 3736                 vm_pindex_t pdpindex;
 3737                 pml4_entry_t *pml4;
 3738                 pdp_entry_t *pdp;
 3739                 pd_entry_t *pd;
 3740 
 3741                 /* Wire up a new PTE page */
 3742                 pdpindex = ptepindex >> NPDPEPGSHIFT;
 3743                 pml4index = pdpindex >> NPML4EPGSHIFT;
 3744 
 3745                 /* First, find the pdp and check that its valid. */
 3746                 pml4 = &pmap->pm_pml4[pml4index];
 3747                 if ((*pml4 & PG_V) == 0) {
 3748                         /* Have to allocate a new pd, recurse */
 3749                         if (_pmap_allocpte(pmap, NUPDE + pdpindex,
 3750                             lockp) == NULL) {
 3751                                 vm_page_unwire_noq(m);
 3752                                 vm_page_free_zero(m);
 3753                                 return (NULL);
 3754                         }
 3755                         pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 3756                         pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 3757                 } else {
 3758                         pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 3759                         pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 3760                         if ((*pdp & PG_V) == 0) {
 3761                                 /* Have to allocate a new pd, recurse */
 3762                                 if (_pmap_allocpte(pmap, NUPDE + pdpindex,
 3763                                     lockp) == NULL) {
 3764                                         vm_page_unwire_noq(m);
 3765                                         vm_page_free_zero(m);
 3766                                         return (NULL);
 3767                                 }
 3768                         } else {
 3769                                 /* Add reference to the pd page */
 3770                                 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
 3771                                 pdpg->wire_count++;
 3772                         }
 3773                 }
 3774                 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
 3775 
 3776                 /* Now we know where the page directory page is */
 3777                 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
 3778                 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 3779         }
 3780 
 3781         pmap_resident_count_inc(pmap, 1);
 3782 
 3783         return (m);
 3784 }
 3785 
 3786 static vm_page_t
 3787 pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 3788 {
 3789         vm_pindex_t pdpindex, ptepindex;
 3790         pdp_entry_t *pdpe, PG_V;
 3791         vm_page_t pdpg;
 3792 
 3793         PG_V = pmap_valid_bit(pmap);
 3794 
 3795 retry:
 3796         pdpe = pmap_pdpe(pmap, va);
 3797         if (pdpe != NULL && (*pdpe & PG_V) != 0) {
 3798                 /* Add a reference to the pd page. */
 3799                 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
 3800                 pdpg->wire_count++;
 3801         } else {
 3802                 /* Allocate a pd page. */
 3803                 ptepindex = pmap_pde_pindex(va);
 3804                 pdpindex = ptepindex >> NPDPEPGSHIFT;
 3805                 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
 3806                 if (pdpg == NULL && lockp != NULL)
 3807                         goto retry;
 3808         }
 3809         return (pdpg);
 3810 }
 3811 
 3812 static vm_page_t
 3813 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 3814 {
 3815         vm_pindex_t ptepindex;
 3816         pd_entry_t *pd, PG_V;
 3817         vm_page_t m;
 3818 
 3819         PG_V = pmap_valid_bit(pmap);
 3820 
 3821         /*
 3822          * Calculate pagetable page index
 3823          */
 3824         ptepindex = pmap_pde_pindex(va);
 3825 retry:
 3826         /*
 3827          * Get the page directory entry
 3828          */
 3829         pd = pmap_pde(pmap, va);
 3830 
 3831         /*
 3832          * This supports switching from a 2MB page to a
 3833          * normal 4K page.
 3834          */
 3835         if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
 3836                 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
 3837                         /*
 3838                          * Invalidation of the 2MB page mapping may have caused
 3839                          * the deallocation of the underlying PD page.
 3840                          */
 3841                         pd = NULL;
 3842                 }
 3843         }
 3844 
 3845         /*
 3846          * If the page table page is mapped, we just increment the
 3847          * hold count, and activate it.
 3848          */
 3849         if (pd != NULL && (*pd & PG_V) != 0) {
 3850                 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
 3851                 m->wire_count++;
 3852         } else {
 3853                 /*
 3854                  * Here if the pte page isn't mapped, or if it has been
 3855                  * deallocated.
 3856                  */
 3857                 m = _pmap_allocpte(pmap, ptepindex, lockp);
 3858                 if (m == NULL && lockp != NULL)
 3859                         goto retry;
 3860         }
 3861         return (m);
 3862 }
 3863 
 3864 
 3865 /***************************************************
 3866  * Pmap allocation/deallocation routines.
 3867  ***************************************************/
 3868 
 3869 /*
 3870  * Release any resources held by the given physical map.
 3871  * Called when a pmap initialized by pmap_pinit is being released.
 3872  * Should only be called if the map contains no valid mappings.
 3873  */
 3874 void
 3875 pmap_release(pmap_t pmap)
 3876 {
 3877         vm_page_t m;
 3878         int i;
 3879 
 3880         KASSERT(pmap->pm_stats.resident_count == 0,
 3881             ("pmap_release: pmap resident count %ld != 0",
 3882             pmap->pm_stats.resident_count));
 3883         KASSERT(vm_radix_is_empty(&pmap->pm_root),
 3884             ("pmap_release: pmap has reserved page table page(s)"));
 3885         KASSERT(CPU_EMPTY(&pmap->pm_active),
 3886             ("releasing active pmap %p", pmap));
 3887 
 3888         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
 3889 
 3890         for (i = 0; i < NKPML4E; i++)   /* KVA */
 3891                 pmap->pm_pml4[KPML4BASE + i] = 0;
 3892         for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
 3893                 pmap->pm_pml4[DMPML4I + i] = 0;
 3894         pmap->pm_pml4[PML4PML4I] = 0;   /* Recursive Mapping */
 3895         for (i = 0; i < lm_ents; i++)   /* Large Map */
 3896                 pmap->pm_pml4[LMSPML4I + i] = 0;
 3897 
 3898         vm_page_unwire_noq(m);
 3899         vm_page_free_zero(m);
 3900 
 3901         if (pmap->pm_pml4u != NULL) {
 3902                 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u));
 3903                 vm_page_unwire_noq(m);
 3904                 vm_page_free(m);
 3905         }
 3906         if (pmap->pm_type == PT_X86 &&
 3907             (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
 3908                 rangeset_fini(&pmap->pm_pkru);
 3909 }
 3910 
 3911 static int
 3912 kvm_size(SYSCTL_HANDLER_ARGS)
 3913 {
 3914         unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
 3915 
 3916         return sysctl_handle_long(oidp, &ksize, 0, req);
 3917 }
 3918 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 
 3919     0, 0, kvm_size, "LU", "Size of KVM");
 3920 
 3921 static int
 3922 kvm_free(SYSCTL_HANDLER_ARGS)
 3923 {
 3924         unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
 3925 
 3926         return sysctl_handle_long(oidp, &kfree, 0, req);
 3927 }
 3928 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 
 3929     0, 0, kvm_free, "LU", "Amount of KVM free");
 3930 
 3931 /*
 3932  * grow the number of kernel page table entries, if needed
 3933  */
 3934 void
 3935 pmap_growkernel(vm_offset_t addr)
 3936 {
 3937         vm_paddr_t paddr;
 3938         vm_page_t nkpg;
 3939         pd_entry_t *pde, newpdir;
 3940         pdp_entry_t *pdpe;
 3941 
 3942         mtx_assert(&kernel_map->system_mtx, MA_OWNED);
 3943 
 3944         /*
 3945          * Return if "addr" is within the range of kernel page table pages
 3946          * that were preallocated during pmap bootstrap.  Moreover, leave
 3947          * "kernel_vm_end" and the kernel page table as they were.
 3948          *
 3949          * The correctness of this action is based on the following
 3950          * argument: vm_map_insert() allocates contiguous ranges of the
 3951          * kernel virtual address space.  It calls this function if a range
 3952          * ends after "kernel_vm_end".  If the kernel is mapped between
 3953          * "kernel_vm_end" and "addr", then the range cannot begin at
 3954          * "kernel_vm_end".  In fact, its beginning address cannot be less
 3955          * than the kernel.  Thus, there is no immediate need to allocate
 3956          * any new kernel page table pages between "kernel_vm_end" and
 3957          * "KERNBASE".
 3958          */
 3959         if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
 3960                 return;
 3961 
 3962         addr = roundup2(addr, NBPDR);
 3963         if (addr - 1 >= vm_map_max(kernel_map))
 3964                 addr = vm_map_max(kernel_map);
 3965         while (kernel_vm_end < addr) {
 3966                 pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
 3967                 if ((*pdpe & X86_PG_V) == 0) {
 3968                         /* We need a new PDP entry */
 3969                         nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
 3970                             VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
 3971                             VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 3972                         if (nkpg == NULL)
 3973                                 panic("pmap_growkernel: no memory to grow kernel");
 3974                         if ((nkpg->flags & PG_ZERO) == 0)
 3975                                 pmap_zero_page(nkpg);
 3976                         paddr = VM_PAGE_TO_PHYS(nkpg);
 3977                         *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
 3978                             X86_PG_A | X86_PG_M);
 3979                         continue; /* try again */
 3980                 }
 3981                 pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
 3982                 if ((*pde & X86_PG_V) != 0) {
 3983                         kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
 3984                         if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
 3985                                 kernel_vm_end = vm_map_max(kernel_map);
 3986                                 break;                       
 3987                         }
 3988                         continue;
 3989                 }
 3990 
 3991                 nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
 3992                     VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
 3993                     VM_ALLOC_ZERO);
 3994                 if (nkpg == NULL)
 3995                         panic("pmap_growkernel: no memory to grow kernel");
 3996                 if ((nkpg->flags & PG_ZERO) == 0)
 3997                         pmap_zero_page(nkpg);
 3998                 paddr = VM_PAGE_TO_PHYS(nkpg);
 3999                 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
 4000                 pde_store(pde, newpdir);
 4001 
 4002                 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
 4003                 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
 4004                         kernel_vm_end = vm_map_max(kernel_map);
 4005                         break;                       
 4006                 }
 4007         }
 4008 }
 4009 
 4010 
 4011 /***************************************************
 4012  * page management routines.
 4013  ***************************************************/
 4014 
 4015 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
 4016 CTASSERT(_NPCM == 3);
 4017 CTASSERT(_NPCPV == 168);
 4018 
 4019 static __inline struct pv_chunk *
 4020 pv_to_chunk(pv_entry_t pv)
 4021 {
 4022 
 4023         return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
 4024 }
 4025 
 4026 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
 4027 
 4028 #define PC_FREE0        0xfffffffffffffffful
 4029 #define PC_FREE1        0xfffffffffffffffful
 4030 #define PC_FREE2        0x000000fffffffffful
 4031 
 4032 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
 4033 
 4034 #ifdef PV_STATS
 4035 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
 4036 
 4037 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
 4038         "Current number of pv entry chunks");
 4039 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
 4040         "Current number of pv entry chunks allocated");
 4041 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
 4042         "Current number of pv entry chunks frees");
 4043 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
 4044         "Number of times tried to get a chunk page but failed.");
 4045 
 4046 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
 4047 static int pv_entry_spare;
 4048 
 4049 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
 4050         "Current number of pv entry frees");
 4051 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
 4052         "Current number of pv entry allocs");
 4053 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
 4054         "Current number of pv entries");
 4055 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
 4056         "Current number of spare pv entries");
 4057 #endif
 4058 
 4059 static void
 4060 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di)
 4061 {
 4062 
 4063         if (pmap == NULL)
 4064                 return;
 4065         pmap_invalidate_all(pmap);
 4066         if (pmap != locked_pmap)
 4067                 PMAP_UNLOCK(pmap);
 4068         if (start_di)
 4069                 pmap_delayed_invl_finish();
 4070 }
 4071 
 4072 /*
 4073  * We are in a serious low memory condition.  Resort to
 4074  * drastic measures to free some pages so we can allocate
 4075  * another pv entry chunk.
 4076  *
 4077  * Returns NULL if PV entries were reclaimed from the specified pmap.
 4078  *
 4079  * We do not, however, unmap 2mpages because subsequent accesses will
 4080  * allocate per-page pv entries until repromotion occurs, thereby
 4081  * exacerbating the shortage of free pv entries.
 4082  */
 4083 static vm_page_t
 4084 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
 4085 {
 4086         struct pv_chunk *pc, *pc_marker, *pc_marker_end;
 4087         struct pv_chunk_header pc_marker_b, pc_marker_end_b;
 4088         struct md_page *pvh;
 4089         pd_entry_t *pde;
 4090         pmap_t next_pmap, pmap;
 4091         pt_entry_t *pte, tpte;
 4092         pt_entry_t PG_G, PG_A, PG_M, PG_RW;
 4093         pv_entry_t pv;
 4094         vm_offset_t va;
 4095         vm_page_t m, m_pc;
 4096         struct spglist free;
 4097         uint64_t inuse;
 4098         int bit, field, freed;
 4099         bool start_di;
 4100         static int active_reclaims = 0;
 4101 
 4102         PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
 4103         KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
 4104         pmap = NULL;
 4105         m_pc = NULL;
 4106         PG_G = PG_A = PG_M = PG_RW = 0;
 4107         SLIST_INIT(&free);
 4108         bzero(&pc_marker_b, sizeof(pc_marker_b));
 4109         bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
 4110         pc_marker = (struct pv_chunk *)&pc_marker_b;
 4111         pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
 4112 
 4113         /*
 4114          * A delayed invalidation block should already be active if
 4115          * pmap_advise() or pmap_remove() called this function by way
 4116          * of pmap_demote_pde_locked().
 4117          */
 4118         start_di = pmap_not_in_di();
 4119 
 4120         mtx_lock(&pv_chunks_mutex);
 4121         active_reclaims++;
 4122         TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
 4123         TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
 4124         while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
 4125             SLIST_EMPTY(&free)) {
 4126                 next_pmap = pc->pc_pmap;
 4127                 if (next_pmap == NULL) {
 4128                         /*
 4129                          * The next chunk is a marker.  However, it is
 4130                          * not our marker, so active_reclaims must be
 4131                          * > 1.  Consequently, the next_chunk code
 4132                          * will not rotate the pv_chunks list.
 4133                          */
 4134                         goto next_chunk;
 4135                 }
 4136                 mtx_unlock(&pv_chunks_mutex);
 4137 
 4138                 /*
 4139                  * A pv_chunk can only be removed from the pc_lru list
 4140                  * when both pc_chunks_mutex is owned and the
 4141                  * corresponding pmap is locked.
 4142                  */
 4143                 if (pmap != next_pmap) {
 4144                         reclaim_pv_chunk_leave_pmap(pmap, locked_pmap,
 4145                             start_di);
 4146                         pmap = next_pmap;
 4147                         /* Avoid deadlock and lock recursion. */
 4148                         if (pmap > locked_pmap) {
 4149                                 RELEASE_PV_LIST_LOCK(lockp);
 4150                                 PMAP_LOCK(pmap);
 4151                                 if (start_di)
 4152                                         pmap_delayed_invl_start();
 4153                                 mtx_lock(&pv_chunks_mutex);
 4154                                 continue;
 4155                         } else if (pmap != locked_pmap) {
 4156                                 if (PMAP_TRYLOCK(pmap)) {
 4157                                         if (start_di)
 4158                                                 pmap_delayed_invl_start();
 4159                                         mtx_lock(&pv_chunks_mutex);
 4160                                         continue;
 4161                                 } else {
 4162                                         pmap = NULL; /* pmap is not locked */
 4163                                         mtx_lock(&pv_chunks_mutex);
 4164                                         pc = TAILQ_NEXT(pc_marker, pc_lru);
 4165                                         if (pc == NULL ||
 4166                                             pc->pc_pmap != next_pmap)
 4167                                                 continue;
 4168                                         goto next_chunk;
 4169                                 }
 4170                         } else if (start_di)
 4171                                 pmap_delayed_invl_start();
 4172                         PG_G = pmap_global_bit(pmap);
 4173                         PG_A = pmap_accessed_bit(pmap);
 4174                         PG_M = pmap_modified_bit(pmap);
 4175                         PG_RW = pmap_rw_bit(pmap);
 4176                 }
 4177 
 4178                 /*
 4179                  * Destroy every non-wired, 4 KB page mapping in the chunk.
 4180                  */
 4181                 freed = 0;
 4182                 for (field = 0; field < _NPCM; field++) {
 4183                         for (inuse = ~pc->pc_map[field] & pc_freemask[field];
 4184                             inuse != 0; inuse &= ~(1UL << bit)) {
 4185                                 bit = bsfq(inuse);
 4186                                 pv = &pc->pc_pventry[field * 64 + bit];
 4187                                 va = pv->pv_va;
 4188                                 pde = pmap_pde(pmap, va);
 4189                                 if ((*pde & PG_PS) != 0)
 4190                                         continue;
 4191                                 pte = pmap_pde_to_pte(pde, va);
 4192                                 if ((*pte & PG_W) != 0)
 4193                                         continue;
 4194                                 tpte = pte_load_clear(pte);
 4195                                 if ((tpte & PG_G) != 0)
 4196                                         pmap_invalidate_page(pmap, va);
 4197                                 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
 4198                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 4199                                         vm_page_dirty(m);
 4200                                 if ((tpte & PG_A) != 0)
 4201                                         vm_page_aflag_set(m, PGA_REFERENCED);
 4202                                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 4203                                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 4204                                 m->md.pv_gen++;
 4205                                 if (TAILQ_EMPTY(&m->md.pv_list) &&
 4206                                     (m->flags & PG_FICTITIOUS) == 0) {
 4207                                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 4208                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
 4209                                                 vm_page_aflag_clear(m,
 4210                                                     PGA_WRITEABLE);
 4211                                         }
 4212                                 }
 4213                                 pmap_delayed_invl_page(m);
 4214                                 pc->pc_map[field] |= 1UL << bit;
 4215                                 pmap_unuse_pt(pmap, va, *pde, &free);
 4216                                 freed++;
 4217                         }
 4218                 }
 4219                 if (freed == 0) {
 4220                         mtx_lock(&pv_chunks_mutex);
 4221                         goto next_chunk;
 4222                 }
 4223                 /* Every freed mapping is for a 4 KB page. */
 4224                 pmap_resident_count_dec(pmap, freed);
 4225                 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
 4226                 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
 4227                 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 4228                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 4229                 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
 4230                     pc->pc_map[2] == PC_FREE2) {
 4231                         PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
 4232                         PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
 4233                         PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
 4234                         /* Entire chunk is free; return it. */
 4235                         m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 4236                         dump_drop_page(m_pc->phys_addr);
 4237                         mtx_lock(&pv_chunks_mutex);
 4238                         TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 4239                         break;
 4240                 }
 4241                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 4242                 mtx_lock(&pv_chunks_mutex);
 4243                 /* One freed pv entry in locked_pmap is sufficient. */
 4244                 if (pmap == locked_pmap)
 4245                         break;
 4246 next_chunk:
 4247                 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
 4248                 TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
 4249                 if (active_reclaims == 1 && pmap != NULL) {
 4250                         /*
 4251                          * Rotate the pv chunks list so that we do not
 4252                          * scan the same pv chunks that could not be
 4253                          * freed (because they contained a wired
 4254                          * and/or superpage mapping) on every
 4255                          * invocation of reclaim_pv_chunk().
 4256                          */
 4257                         while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
 4258                                 MPASS(pc->pc_pmap != NULL);
 4259                                 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 4260                                 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 4261                         }
 4262                 }
 4263         }
 4264         TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
 4265         TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
 4266         active_reclaims--;
 4267         mtx_unlock(&pv_chunks_mutex);
 4268         reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di);
 4269         if (m_pc == NULL && !SLIST_EMPTY(&free)) {
 4270                 m_pc = SLIST_FIRST(&free);
 4271                 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
 4272                 /* Recycle a freed page table page. */
 4273                 m_pc->wire_count = 1;
 4274         }
 4275         vm_page_free_pages_toq(&free, true);
 4276         return (m_pc);
 4277 }
 4278 
 4279 /*
 4280  * free the pv_entry back to the free list
 4281  */
 4282 static void
 4283 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 4284 {
 4285         struct pv_chunk *pc;
 4286         int idx, field, bit;
 4287 
 4288         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4289         PV_STAT(atomic_add_long(&pv_entry_frees, 1));
 4290         PV_STAT(atomic_add_int(&pv_entry_spare, 1));
 4291         PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
 4292         pc = pv_to_chunk(pv);
 4293         idx = pv - &pc->pc_pventry[0];
 4294         field = idx / 64;
 4295         bit = idx % 64;
 4296         pc->pc_map[field] |= 1ul << bit;
 4297         if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
 4298             pc->pc_map[2] != PC_FREE2) {
 4299                 /* 98% of the time, pc is already at the head of the list. */
 4300                 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
 4301                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 4302                         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 4303                 }
 4304                 return;
 4305         }
 4306         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 4307         free_pv_chunk(pc);
 4308 }
 4309 
 4310 static void
 4311 free_pv_chunk_dequeued(struct pv_chunk *pc)
 4312 {
 4313         vm_page_t m;
 4314 
 4315         PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
 4316         PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
 4317         PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
 4318         /* entire chunk is free, return it */
 4319         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 4320         dump_drop_page(m->phys_addr);
 4321         vm_page_unwire_noq(m);
 4322         vm_page_free(m);
 4323 }
 4324 
 4325 static void
 4326 free_pv_chunk(struct pv_chunk *pc)
 4327 {
 4328 
 4329         mtx_lock(&pv_chunks_mutex);
 4330         TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 4331         mtx_unlock(&pv_chunks_mutex);
 4332         free_pv_chunk_dequeued(pc);
 4333 }
 4334 
 4335 static void
 4336 free_pv_chunk_batch(struct pv_chunklist *batch)
 4337 {
 4338         struct pv_chunk *pc, *npc;
 4339 
 4340         if (TAILQ_EMPTY(batch))
 4341                 return;
 4342 
 4343         mtx_lock(&pv_chunks_mutex);
 4344         TAILQ_FOREACH(pc, batch, pc_list) {
 4345                 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 4346         }
 4347         mtx_unlock(&pv_chunks_mutex);
 4348 
 4349         TAILQ_FOREACH_SAFE(pc, batch, pc_list, npc) {
 4350                 free_pv_chunk_dequeued(pc);
 4351         }
 4352 }
 4353 
 4354 /*
 4355  * Returns a new PV entry, allocating a new PV chunk from the system when
 4356  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
 4357  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
 4358  * returned.
 4359  *
 4360  * The given PV list lock may be released.
 4361  */
 4362 static pv_entry_t
 4363 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
 4364 {
 4365         int bit, field;
 4366         pv_entry_t pv;
 4367         struct pv_chunk *pc;
 4368         vm_page_t m;
 4369 
 4370         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4371         PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
 4372 retry:
 4373         pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 4374         if (pc != NULL) {
 4375                 for (field = 0; field < _NPCM; field++) {
 4376                         if (pc->pc_map[field]) {
 4377                                 bit = bsfq(pc->pc_map[field]);
 4378                                 break;
 4379                         }
 4380                 }
 4381                 if (field < _NPCM) {
 4382                         pv = &pc->pc_pventry[field * 64 + bit];
 4383                         pc->pc_map[field] &= ~(1ul << bit);
 4384                         /* If this was the last item, move it to tail */
 4385                         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
 4386                             pc->pc_map[2] == 0) {
 4387                                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 4388                                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
 4389                                     pc_list);
 4390                         }
 4391                         PV_STAT(atomic_add_long(&pv_entry_count, 1));
 4392                         PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
 4393                         return (pv);
 4394                 }
 4395         }
 4396         /* No free items, allocate another chunk */
 4397         m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 4398             VM_ALLOC_WIRED);
 4399         if (m == NULL) {
 4400                 if (lockp == NULL) {
 4401                         PV_STAT(pc_chunk_tryfail++);
 4402                         return (NULL);
 4403                 }
 4404                 m = reclaim_pv_chunk(pmap, lockp);
 4405                 if (m == NULL)
 4406                         goto retry;
 4407         }
 4408         PV_STAT(atomic_add_int(&pc_chunk_count, 1));
 4409         PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
 4410         dump_add_page(m->phys_addr);
 4411         pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 4412         pc->pc_pmap = pmap;
 4413         pc->pc_map[0] = PC_FREE0 & ~1ul;        /* preallocated bit 0 */
 4414         pc->pc_map[1] = PC_FREE1;
 4415         pc->pc_map[2] = PC_FREE2;
 4416         mtx_lock(&pv_chunks_mutex);
 4417         TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 4418         mtx_unlock(&pv_chunks_mutex);
 4419         pv = &pc->pc_pventry[0];
 4420         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 4421         PV_STAT(atomic_add_long(&pv_entry_count, 1));
 4422         PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
 4423         return (pv);
 4424 }
 4425 
 4426 /*
 4427  * Returns the number of one bits within the given PV chunk map.
 4428  *
 4429  * The erratas for Intel processors state that "POPCNT Instruction May
 4430  * Take Longer to Execute Than Expected".  It is believed that the
 4431  * issue is the spurious dependency on the destination register.
 4432  * Provide a hint to the register rename logic that the destination
 4433  * value is overwritten, by clearing it, as suggested in the
 4434  * optimization manual.  It should be cheap for unaffected processors
 4435  * as well.
 4436  *
 4437  * Reference numbers for erratas are
 4438  * 4th Gen Core: HSD146
 4439  * 5th Gen Core: BDM85
 4440  * 6th Gen Core: SKL029
 4441  */
 4442 static int
 4443 popcnt_pc_map_pq(uint64_t *map)
 4444 {
 4445         u_long result, tmp;
 4446 
 4447         __asm __volatile("xorl %k0,%k0;popcntq %2,%0;"
 4448             "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;"
 4449             "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0"
 4450             : "=&r" (result), "=&r" (tmp)
 4451             : "m" (map[0]), "m" (map[1]), "m" (map[2]));
 4452         return (result);
 4453 }
 4454 
 4455 /*
 4456  * Ensure that the number of spare PV entries in the specified pmap meets or
 4457  * exceeds the given count, "needed".
 4458  *
 4459  * The given PV list lock may be released.
 4460  */
 4461 static void
 4462 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
 4463 {
 4464         struct pch new_tail;
 4465         struct pv_chunk *pc;
 4466         vm_page_t m;
 4467         int avail, free;
 4468         bool reclaimed;
 4469 
 4470         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4471         KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
 4472 
 4473         /*
 4474          * Newly allocated PV chunks must be stored in a private list until
 4475          * the required number of PV chunks have been allocated.  Otherwise,
 4476          * reclaim_pv_chunk() could recycle one of these chunks.  In
 4477          * contrast, these chunks must be added to the pmap upon allocation.
 4478          */
 4479         TAILQ_INIT(&new_tail);
 4480 retry:
 4481         avail = 0;
 4482         TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
 4483 #ifndef __POPCNT__
 4484                 if ((cpu_feature2 & CPUID2_POPCNT) == 0)
 4485                         bit_count((bitstr_t *)pc->pc_map, 0,
 4486                             sizeof(pc->pc_map) * NBBY, &free);
 4487                 else
 4488 #endif
 4489                 free = popcnt_pc_map_pq(pc->pc_map);
 4490                 if (free == 0)
 4491                         break;
 4492                 avail += free;
 4493                 if (avail >= needed)
 4494                         break;
 4495         }
 4496         for (reclaimed = false; avail < needed; avail += _NPCPV) {
 4497                 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 4498                     VM_ALLOC_WIRED);
 4499                 if (m == NULL) {
 4500                         m = reclaim_pv_chunk(pmap, lockp);
 4501                         if (m == NULL)
 4502                                 goto retry;
 4503                         reclaimed = true;
 4504                 }
 4505                 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
 4506                 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
 4507                 dump_add_page(m->phys_addr);
 4508                 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 4509                 pc->pc_pmap = pmap;
 4510                 pc->pc_map[0] = PC_FREE0;
 4511                 pc->pc_map[1] = PC_FREE1;
 4512                 pc->pc_map[2] = PC_FREE2;
 4513                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 4514                 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
 4515                 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
 4516 
 4517                 /*
 4518                  * The reclaim might have freed a chunk from the current pmap.
 4519                  * If that chunk contained available entries, we need to
 4520                  * re-count the number of available entries.
 4521                  */
 4522                 if (reclaimed)
 4523                         goto retry;
 4524         }
 4525         if (!TAILQ_EMPTY(&new_tail)) {
 4526                 mtx_lock(&pv_chunks_mutex);
 4527                 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
 4528                 mtx_unlock(&pv_chunks_mutex);
 4529         }
 4530 }
 4531 
 4532 /*
 4533  * First find and then remove the pv entry for the specified pmap and virtual
 4534  * address from the specified pv list.  Returns the pv entry if found and NULL
 4535  * otherwise.  This operation can be performed on pv lists for either 4KB or
 4536  * 2MB page mappings.
 4537  */
 4538 static __inline pv_entry_t
 4539 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 4540 {
 4541         pv_entry_t pv;
 4542 
 4543         TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 4544                 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
 4545                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 4546                         pvh->pv_gen++;
 4547                         break;
 4548                 }
 4549         }
 4550         return (pv);
 4551 }
 4552 
 4553 /*
 4554  * After demotion from a 2MB page mapping to 512 4KB page mappings,
 4555  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
 4556  * entries for each of the 4KB page mappings.
 4557  */
 4558 static void
 4559 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 4560     struct rwlock **lockp)
 4561 {
 4562         struct md_page *pvh;
 4563         struct pv_chunk *pc;
 4564         pv_entry_t pv;
 4565         vm_offset_t va_last;
 4566         vm_page_t m;
 4567         int bit, field;
 4568 
 4569         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4570         KASSERT((pa & PDRMASK) == 0,
 4571             ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
 4572         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 4573 
 4574         /*
 4575          * Transfer the 2mpage's pv entry for this mapping to the first
 4576          * page's pv list.  Once this transfer begins, the pv list lock
 4577          * must not be released until the last pv entry is reinstantiated.
 4578          */
 4579         pvh = pa_to_pvh(pa);
 4580         va = trunc_2mpage(va);
 4581         pv = pmap_pvh_remove(pvh, pmap, va);
 4582         KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
 4583         m = PHYS_TO_VM_PAGE(pa);
 4584         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 4585         m->md.pv_gen++;
 4586         /* Instantiate the remaining NPTEPG - 1 pv entries. */
 4587         PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
 4588         va_last = va + NBPDR - PAGE_SIZE;
 4589         for (;;) {
 4590                 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 4591                 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
 4592                     pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
 4593                 for (field = 0; field < _NPCM; field++) {
 4594                         while (pc->pc_map[field]) {
 4595                                 bit = bsfq(pc->pc_map[field]);
 4596                                 pc->pc_map[field] &= ~(1ul << bit);
 4597                                 pv = &pc->pc_pventry[field * 64 + bit];
 4598                                 va += PAGE_SIZE;
 4599                                 pv->pv_va = va;
 4600                                 m++;
 4601                                 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 4602                             ("pmap_pv_demote_pde: page %p is not managed", m));
 4603                                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 4604                                 m->md.pv_gen++;
 4605                                 if (va == va_last)
 4606                                         goto out;
 4607                         }
 4608                 }
 4609                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 4610                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 4611         }
 4612 out:
 4613         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
 4614                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 4615                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 4616         }
 4617         PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
 4618         PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
 4619 }
 4620 
 4621 #if VM_NRESERVLEVEL > 0
 4622 /*
 4623  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
 4624  * replace the many pv entries for the 4KB page mappings by a single pv entry
 4625  * for the 2MB page mapping.
 4626  */
 4627 static void
 4628 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 4629     struct rwlock **lockp)
 4630 {
 4631         struct md_page *pvh;
 4632         pv_entry_t pv;
 4633         vm_offset_t va_last;
 4634         vm_page_t m;
 4635 
 4636         KASSERT((pa & PDRMASK) == 0,
 4637             ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
 4638         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 4639 
 4640         /*
 4641          * Transfer the first page's pv entry for this mapping to the 2mpage's
 4642          * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
 4643          * a transfer avoids the possibility that get_pv_entry() calls
 4644          * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
 4645          * mappings that is being promoted.
 4646          */
 4647         m = PHYS_TO_VM_PAGE(pa);
 4648         va = trunc_2mpage(va);
 4649         pv = pmap_pvh_remove(&m->md, pmap, va);
 4650         KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
 4651         pvh = pa_to_pvh(pa);
 4652         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 4653         pvh->pv_gen++;
 4654         /* Free the remaining NPTEPG - 1 pv entries. */
 4655         va_last = va + NBPDR - PAGE_SIZE;
 4656         do {
 4657                 m++;
 4658                 va += PAGE_SIZE;
 4659                 pmap_pvh_free(&m->md, pmap, va);
 4660         } while (va < va_last);
 4661 }
 4662 #endif /* VM_NRESERVLEVEL > 0 */
 4663 
 4664 /*
 4665  * First find and then destroy the pv entry for the specified pmap and virtual
 4666  * address.  This operation can be performed on pv lists for either 4KB or 2MB
 4667  * page mappings.
 4668  */
 4669 static void
 4670 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 4671 {
 4672         pv_entry_t pv;
 4673 
 4674         pv = pmap_pvh_remove(pvh, pmap, va);
 4675         KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
 4676         free_pv_entry(pmap, pv);
 4677 }
 4678 
 4679 /*
 4680  * Conditionally create the PV entry for a 4KB page mapping if the required
 4681  * memory can be allocated without resorting to reclamation.
 4682  */
 4683 static boolean_t
 4684 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
 4685     struct rwlock **lockp)
 4686 {
 4687         pv_entry_t pv;
 4688 
 4689         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4690         /* Pass NULL instead of the lock pointer to disable reclamation. */
 4691         if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
 4692                 pv->pv_va = va;
 4693                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 4694                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 4695                 m->md.pv_gen++;
 4696                 return (TRUE);
 4697         } else
 4698                 return (FALSE);
 4699 }
 4700 
 4701 /*
 4702  * Create the PV entry for a 2MB page mapping.  Always returns true unless the
 4703  * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
 4704  * false if the PV entry cannot be allocated without resorting to reclamation.
 4705  */
 4706 static bool
 4707 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags,
 4708     struct rwlock **lockp)
 4709 {
 4710         struct md_page *pvh;
 4711         pv_entry_t pv;
 4712         vm_paddr_t pa;
 4713 
 4714         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4715         /* Pass NULL instead of the lock pointer to disable reclamation. */
 4716         if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
 4717             NULL : lockp)) == NULL)
 4718                 return (false);
 4719         pv->pv_va = va;
 4720         pa = pde & PG_PS_FRAME;
 4721         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 4722         pvh = pa_to_pvh(pa);
 4723         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 4724         pvh->pv_gen++;
 4725         return (true);
 4726 }
 4727 
 4728 /*
 4729  * Fills a page table page with mappings to consecutive physical pages.
 4730  */
 4731 static void
 4732 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
 4733 {
 4734         pt_entry_t *pte;
 4735 
 4736         for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
 4737                 *pte = newpte;
 4738                 newpte += PAGE_SIZE;
 4739         }
 4740 }
 4741 
 4742 /*
 4743  * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
 4744  * mapping is invalidated.
 4745  */
 4746 static boolean_t
 4747 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 4748 {
 4749         struct rwlock *lock;
 4750         boolean_t rv;
 4751 
 4752         lock = NULL;
 4753         rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
 4754         if (lock != NULL)
 4755                 rw_wunlock(lock);
 4756         return (rv);
 4757 }
 4758 
 4759 static void
 4760 pmap_demote_pde_check(pt_entry_t *firstpte __unused, pt_entry_t newpte __unused)
 4761 {
 4762 #ifdef INVARIANTS
 4763 #ifdef DIAGNOSTIC
 4764         pt_entry_t *xpte, *ypte;
 4765 
 4766         for (xpte = firstpte; xpte < firstpte + NPTEPG;
 4767             xpte++, newpte += PAGE_SIZE) {
 4768                 if ((*xpte & PG_FRAME) != (newpte & PG_FRAME)) {
 4769                         printf("pmap_demote_pde: xpte %zd and newpte map "
 4770                             "different pages: found %#lx, expected %#lx\n",
 4771                             xpte - firstpte, *xpte, newpte);
 4772                         printf("page table dump\n");
 4773                         for (ypte = firstpte; ypte < firstpte + NPTEPG; ypte++)
 4774                                 printf("%zd %#lx\n", ypte - firstpte, *ypte);
 4775                         panic("firstpte");
 4776                 }
 4777         }
 4778 #else
 4779         KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
 4780             ("pmap_demote_pde: firstpte and newpte map different physical"
 4781             " addresses"));
 4782 #endif
 4783 #endif
 4784 }
 4785 
 4786 static void
 4787 pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
 4788     pd_entry_t oldpde, struct rwlock **lockp)
 4789 {
 4790         struct spglist free;
 4791         vm_offset_t sva;
 4792 
 4793         SLIST_INIT(&free);
 4794         sva = trunc_2mpage(va);
 4795         pmap_remove_pde(pmap, pde, sva, &free, lockp);
 4796         if ((oldpde & pmap_global_bit(pmap)) == 0)
 4797                 pmap_invalidate_pde_page(pmap, sva, oldpde);
 4798         vm_page_free_pages_toq(&free, true);
 4799         CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx in pmap %p",
 4800             va, pmap);
 4801 }
 4802 
 4803 static boolean_t
 4804 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
 4805     struct rwlock **lockp)
 4806 {
 4807         pd_entry_t newpde, oldpde;
 4808         pt_entry_t *firstpte, newpte;
 4809         pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V;
 4810         vm_paddr_t mptepa;
 4811         vm_page_t mpte;
 4812         int PG_PTE_CACHE;
 4813         bool in_kernel;
 4814 
 4815         PG_A = pmap_accessed_bit(pmap);
 4816         PG_G = pmap_global_bit(pmap);
 4817         PG_M = pmap_modified_bit(pmap);
 4818         PG_RW = pmap_rw_bit(pmap);
 4819         PG_V = pmap_valid_bit(pmap);
 4820         PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
 4821         PG_PKU_MASK = pmap_pku_mask_bit(pmap);
 4822 
 4823         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4824         in_kernel = va >= VM_MAXUSER_ADDRESS;
 4825         oldpde = *pde;
 4826         KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
 4827             ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
 4828 
 4829         /*
 4830          * Invalidate the 2MB page mapping and return "failure" if the
 4831          * mapping was never accessed.
 4832          */
 4833         if ((oldpde & PG_A) == 0) {
 4834                 KASSERT((oldpde & PG_W) == 0,
 4835                     ("pmap_demote_pde: a wired mapping is missing PG_A"));
 4836                 pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp);
 4837                 return (FALSE);
 4838         }
 4839 
 4840         mpte = pmap_remove_pt_page(pmap, va);
 4841         if (mpte == NULL) {
 4842                 KASSERT((oldpde & PG_W) == 0,
 4843                     ("pmap_demote_pde: page table page for a wired mapping"
 4844                     " is missing"));
 4845 
 4846                 /*
 4847                  * If the page table page is missing and the mapping
 4848                  * is for a kernel address, the mapping must belong to
 4849                  * the direct map.  Page table pages are preallocated
 4850                  * for every other part of the kernel address space,
 4851                  * so the direct map region is the only part of the
 4852                  * kernel address space that must be handled here.
 4853                  */
 4854                 KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS &&
 4855                     va < DMAP_MAX_ADDRESS),
 4856                     ("pmap_demote_pde: No saved mpte for va %#lx", va));
 4857 
 4858                 /*
 4859                  * If the 2MB page mapping belongs to the direct map
 4860                  * region of the kernel's address space, then the page
 4861                  * allocation request specifies the highest possible
 4862                  * priority (VM_ALLOC_INTERRUPT).  Otherwise, the
 4863                  * priority is normal.
 4864                  */
 4865                 mpte = vm_page_alloc(NULL, pmap_pde_pindex(va),
 4866                     (in_kernel ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
 4867                     VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
 4868 
 4869                 /*
 4870                  * If the allocation of the new page table page fails,
 4871                  * invalidate the 2MB page mapping and return "failure".
 4872                  */
 4873                 if (mpte == NULL) {
 4874                         pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp);
 4875                         return (FALSE);
 4876                 }
 4877 
 4878                 if (!in_kernel) {
 4879                         mpte->wire_count = NPTEPG;
 4880                         pmap_resident_count_inc(pmap, 1);
 4881                 }
 4882         }
 4883         mptepa = VM_PAGE_TO_PHYS(mpte);
 4884         firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
 4885         newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
 4886         KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
 4887             ("pmap_demote_pde: oldpde is missing PG_M"));
 4888         newpte = oldpde & ~PG_PS;
 4889         newpte = pmap_swap_pat(pmap, newpte);
 4890 
 4891         /*
 4892          * If the page table page is not leftover from an earlier promotion,
 4893          * initialize it.
 4894          */
 4895         if (mpte->valid == 0)
 4896                 pmap_fill_ptp(firstpte, newpte);
 4897 
 4898         pmap_demote_pde_check(firstpte, newpte);
 4899 
 4900         /*
 4901          * If the mapping has changed attributes, update the page table
 4902          * entries.
 4903          */
 4904         if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
 4905                 pmap_fill_ptp(firstpte, newpte);
 4906 
 4907         /*
 4908          * The spare PV entries must be reserved prior to demoting the
 4909          * mapping, that is, prior to changing the PDE.  Otherwise, the state
 4910          * of the PDE and the PV lists will be inconsistent, which can result
 4911          * in reclaim_pv_chunk() attempting to remove a PV entry from the
 4912          * wrong PV list and pmap_pv_demote_pde() failing to find the expected
 4913          * PV entry for the 2MB page mapping that is being demoted.
 4914          */
 4915         if ((oldpde & PG_MANAGED) != 0)
 4916                 reserve_pv_entries(pmap, NPTEPG - 1, lockp);
 4917 
 4918         /*
 4919          * Demote the mapping.  This pmap is locked.  The old PDE has
 4920          * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
 4921          * set.  Thus, there is no danger of a race with another
 4922          * processor changing the setting of PG_A and/or PG_M between
 4923          * the read above and the store below. 
 4924          */
 4925         if (workaround_erratum383)
 4926                 pmap_update_pde(pmap, va, pde, newpde);
 4927         else
 4928                 pde_store(pde, newpde);
 4929 
 4930         /*
 4931          * Invalidate a stale recursive mapping of the page table page.
 4932          */
 4933         if (in_kernel)
 4934                 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
 4935 
 4936         /*
 4937          * Demote the PV entry.
 4938          */
 4939         if ((oldpde & PG_MANAGED) != 0)
 4940                 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
 4941 
 4942         atomic_add_long(&pmap_pde_demotions, 1);
 4943         CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p",
 4944             va, pmap);
 4945         return (TRUE);
 4946 }
 4947 
 4948 /*
 4949  * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
 4950  */
 4951 static void
 4952 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 4953 {
 4954         pd_entry_t newpde;
 4955         vm_paddr_t mptepa;
 4956         vm_page_t mpte;
 4957 
 4958         KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
 4959         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4960         mpte = pmap_remove_pt_page(pmap, va);
 4961         if (mpte == NULL)
 4962                 panic("pmap_remove_kernel_pde: Missing pt page.");
 4963 
 4964         mptepa = VM_PAGE_TO_PHYS(mpte);
 4965         newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
 4966 
 4967         /*
 4968          * If this page table page was unmapped by a promotion, then it
 4969          * contains valid mappings.  Zero it to invalidate those mappings.
 4970          */
 4971         if (mpte->valid != 0)
 4972                 pagezero((void *)PHYS_TO_DMAP(mptepa));
 4973 
 4974         /*
 4975          * Demote the mapping.
 4976          */
 4977         if (workaround_erratum383)
 4978                 pmap_update_pde(pmap, va, pde, newpde);
 4979         else
 4980                 pde_store(pde, newpde);
 4981 
 4982         /*
 4983          * Invalidate a stale recursive mapping of the page table page.
 4984          */
 4985         pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
 4986 }
 4987 
 4988 /*
 4989  * pmap_remove_pde: do the things to unmap a superpage in a process
 4990  */
 4991 static int
 4992 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
 4993     struct spglist *free, struct rwlock **lockp)
 4994 {
 4995         struct md_page *pvh;
 4996         pd_entry_t oldpde;
 4997         vm_offset_t eva, va;
 4998         vm_page_t m, mpte;
 4999         pt_entry_t PG_G, PG_A, PG_M, PG_RW;
 5000 
 5001         PG_G = pmap_global_bit(pmap);
 5002         PG_A = pmap_accessed_bit(pmap);
 5003         PG_M = pmap_modified_bit(pmap);
 5004         PG_RW = pmap_rw_bit(pmap);
 5005 
 5006         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 5007         KASSERT((sva & PDRMASK) == 0,
 5008             ("pmap_remove_pde: sva is not 2mpage aligned"));
 5009         oldpde = pte_load_clear(pdq);
 5010         if (oldpde & PG_W)
 5011                 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
 5012         if ((oldpde & PG_G) != 0)
 5013                 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
 5014         pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
 5015         if (oldpde & PG_MANAGED) {
 5016                 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
 5017                 pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
 5018                 pmap_pvh_free(pvh, pmap, sva);
 5019                 eva = sva + NBPDR;
 5020                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 5021                     va < eva; va += PAGE_SIZE, m++) {
 5022                         if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
 5023                                 vm_page_dirty(m);
 5024                         if (oldpde & PG_A)
 5025                                 vm_page_aflag_set(m, PGA_REFERENCED);
 5026                         if (TAILQ_EMPTY(&m->md.pv_list) &&
 5027                             TAILQ_EMPTY(&pvh->pv_list))
 5028                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
 5029                         pmap_delayed_invl_page(m);
 5030                 }
 5031         }
 5032         if (pmap == kernel_pmap) {
 5033                 pmap_remove_kernel_pde(pmap, pdq, sva);
 5034         } else {
 5035                 mpte = pmap_remove_pt_page(pmap, sva);
 5036                 if (mpte != NULL) {
 5037                         KASSERT(mpte->valid == VM_PAGE_BITS_ALL,
 5038                             ("pmap_remove_pde: pte page not promoted"));
 5039                         pmap_resident_count_dec(pmap, 1);
 5040                         KASSERT(mpte->wire_count == NPTEPG,
 5041                             ("pmap_remove_pde: pte page wire count error"));
 5042                         mpte->wire_count = 0;
 5043                         pmap_add_delayed_free_list(mpte, free, FALSE);
 5044                 }
 5045         }
 5046         return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
 5047 }
 5048 
 5049 /*
 5050  * pmap_remove_pte: do the things to unmap a page in a process
 5051  */
 5052 static int
 5053 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 
 5054     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
 5055 {
 5056         struct md_page *pvh;
 5057         pt_entry_t oldpte, PG_A, PG_M, PG_RW;
 5058         vm_page_t m;
 5059 
 5060         PG_A = pmap_accessed_bit(pmap);
 5061         PG_M = pmap_modified_bit(pmap);
 5062         PG_RW = pmap_rw_bit(pmap);
 5063 
 5064         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 5065         oldpte = pte_load_clear(ptq);
 5066         if (oldpte & PG_W)
 5067                 pmap->pm_stats.wired_count -= 1;
 5068         pmap_resident_count_dec(pmap, 1);
 5069         if (oldpte & PG_MANAGED) {
 5070                 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
 5071                 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 5072                         vm_page_dirty(m);
 5073                 if (oldpte & PG_A)
 5074                         vm_page_aflag_set(m, PGA_REFERENCED);
 5075                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 5076                 pmap_pvh_free(&m->md, pmap, va);
 5077                 if (TAILQ_EMPTY(&m->md.pv_list) &&
 5078                     (m->flags & PG_FICTITIOUS) == 0) {
 5079                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 5080                         if (TAILQ_EMPTY(&pvh->pv_list))
 5081                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
 5082                 }
 5083                 pmap_delayed_invl_page(m);
 5084         }
 5085         return (pmap_unuse_pt(pmap, va, ptepde, free));
 5086 }
 5087 
 5088 /*
 5089  * Remove a single page from a process address space
 5090  */
 5091 static void
 5092 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
 5093     struct spglist *free)
 5094 {
 5095         struct rwlock *lock;
 5096         pt_entry_t *pte, PG_V;
 5097 
 5098         PG_V = pmap_valid_bit(pmap);
 5099         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 5100         if ((*pde & PG_V) == 0)
 5101                 return;
 5102         pte = pmap_pde_to_pte(pde, va);
 5103         if ((*pte & PG_V) == 0)
 5104                 return;
 5105         lock = NULL;
 5106         pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
 5107         if (lock != NULL)
 5108                 rw_wunlock(lock);
 5109         pmap_invalidate_page(pmap, va);
 5110 }
 5111 
 5112 /*
 5113  * Removes the specified range of addresses from the page table page.
 5114  */
 5115 static bool
 5116 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
 5117     pd_entry_t *pde, struct spglist *free, struct rwlock **lockp)
 5118 {
 5119         pt_entry_t PG_G, *pte;
 5120         vm_offset_t va;
 5121         bool anyvalid;
 5122 
 5123         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 5124         PG_G = pmap_global_bit(pmap);
 5125         anyvalid = false;
 5126         va = eva;
 5127         for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++,
 5128             sva += PAGE_SIZE) {
 5129                 if (*pte == 0) {
 5130                         if (va != eva) {
 5131                                 pmap_invalidate_range(pmap, va, sva);
 5132                                 va = eva;
 5133                         }
 5134                         continue;
 5135                 }
 5136                 if ((*pte & PG_G) == 0)
 5137                         anyvalid = true;
 5138                 else if (va == eva)
 5139                         va = sva;
 5140                 if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) {
 5141                         sva += PAGE_SIZE;
 5142                         break;
 5143                 }
 5144         }
 5145         if (va != eva)
 5146                 pmap_invalidate_range(pmap, va, sva);
 5147         return (anyvalid);
 5148 }
 5149 
 5150 /*
 5151  *      Remove the given range of addresses from the specified map.
 5152  *
 5153  *      It is assumed that the start and end are properly
 5154  *      rounded to the page size.
 5155  */
 5156 void
 5157 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 5158 {
 5159         struct rwlock *lock;
 5160         vm_offset_t va_next;
 5161         pml4_entry_t *pml4e;
 5162         pdp_entry_t *pdpe;
 5163         pd_entry_t ptpaddr, *pde;
 5164         pt_entry_t PG_G, PG_V;
 5165         struct spglist free;
 5166         int anyvalid;
 5167 
 5168         PG_G = pmap_global_bit(pmap);
 5169         PG_V = pmap_valid_bit(pmap);
 5170 
 5171         /*
 5172          * Perform an unsynchronized read.  This is, however, safe.
 5173          */
 5174         if (pmap->pm_stats.resident_count == 0)
 5175                 return;
 5176 
 5177         anyvalid = 0;
 5178         SLIST_INIT(&free);
 5179 
 5180         pmap_delayed_invl_start();
 5181         PMAP_LOCK(pmap);
 5182         pmap_pkru_on_remove(pmap, sva, eva);
 5183 
 5184         /*
 5185          * special handling of removing one page.  a very
 5186          * common operation and easy to short circuit some
 5187          * code.
 5188          */
 5189         if (sva + PAGE_SIZE == eva) {
 5190                 pde = pmap_pde(pmap, sva);
 5191                 if (pde && (*pde & PG_PS) == 0) {
 5192                         pmap_remove_page(pmap, sva, pde, &free);
 5193                         goto out;
 5194                 }
 5195         }
 5196 
 5197         lock = NULL;
 5198         for (; sva < eva; sva = va_next) {
 5199 
 5200                 if (pmap->pm_stats.resident_count == 0)
 5201                         break;
 5202 
 5203                 pml4e = pmap_pml4e(pmap, sva);
 5204                 if ((*pml4e & PG_V) == 0) {
 5205                         va_next = (sva + NBPML4) & ~PML4MASK;
 5206                         if (va_next < sva)
 5207                                 va_next = eva;
 5208                         continue;
 5209                 }
 5210 
 5211                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 5212                 if ((*pdpe & PG_V) == 0) {
 5213                         va_next = (sva + NBPDP) & ~PDPMASK;
 5214                         if (va_next < sva)
 5215                                 va_next = eva;
 5216                         continue;
 5217                 }
 5218 
 5219                 /*
 5220                  * Calculate index for next page table.
 5221                  */
 5222                 va_next = (sva + NBPDR) & ~PDRMASK;
 5223                 if (va_next < sva)
 5224                         va_next = eva;
 5225 
 5226                 pde = pmap_pdpe_to_pde(pdpe, sva);
 5227                 ptpaddr = *pde;
 5228 
 5229                 /*
 5230                  * Weed out invalid mappings.
 5231                  */
 5232                 if (ptpaddr == 0)
 5233                         continue;
 5234 
 5235                 /*
 5236                  * Check for large page.
 5237                  */
 5238                 if ((ptpaddr & PG_PS) != 0) {
 5239                         /*
 5240                          * Are we removing the entire large page?  If not,
 5241                          * demote the mapping and fall through.
 5242                          */
 5243                         if (sva + NBPDR == va_next && eva >= va_next) {
 5244                                 /*
 5245                                  * The TLB entry for a PG_G mapping is
 5246                                  * invalidated by pmap_remove_pde().
 5247                                  */
 5248                                 if ((ptpaddr & PG_G) == 0)
 5249                                         anyvalid = 1;
 5250                                 pmap_remove_pde(pmap, pde, sva, &free, &lock);
 5251                                 continue;
 5252                         } else if (!pmap_demote_pde_locked(pmap, pde, sva,
 5253                             &lock)) {
 5254                                 /* The large page mapping was destroyed. */
 5255                                 continue;
 5256                         } else
 5257                                 ptpaddr = *pde;
 5258                 }
 5259 
 5260                 /*
 5261                  * Limit our scan to either the end of the va represented
 5262                  * by the current page table page, or to the end of the
 5263                  * range being removed.
 5264                  */
 5265                 if (va_next > eva)
 5266                         va_next = eva;
 5267 
 5268                 if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock))
 5269                         anyvalid = 1;
 5270         }
 5271         if (lock != NULL)
 5272                 rw_wunlock(lock);
 5273 out:
 5274         if (anyvalid)
 5275                 pmap_invalidate_all(pmap);
 5276         PMAP_UNLOCK(pmap);
 5277         pmap_delayed_invl_finish();
 5278         vm_page_free_pages_toq(&free, true);
 5279 }
 5280 
 5281 /*
 5282  *      Routine:        pmap_remove_all
 5283  *      Function:
 5284  *              Removes this physical page from
 5285  *              all physical maps in which it resides.
 5286  *              Reflects back modify bits to the pager.
 5287  *
 5288  *      Notes:
 5289  *              Original versions of this routine were very
 5290  *              inefficient because they iteratively called
 5291  *              pmap_remove (slow...)
 5292  */
 5293 
 5294 void
 5295 pmap_remove_all(vm_page_t m)
 5296 {
 5297         struct md_page *pvh;
 5298         pv_entry_t pv;
 5299         pmap_t pmap;
 5300         struct rwlock *lock;
 5301         pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
 5302         pd_entry_t *pde;
 5303         vm_offset_t va;
 5304         struct spglist free;
 5305         int pvh_gen, md_gen;
 5306 
 5307         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 5308             ("pmap_remove_all: page %p is not managed", m));
 5309         SLIST_INIT(&free);
 5310         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 5311         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 5312             pa_to_pvh(VM_PAGE_TO_PHYS(m));
 5313 retry:
 5314         rw_wlock(lock);
 5315         while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
 5316                 pmap = PV_PMAP(pv);
 5317                 if (!PMAP_TRYLOCK(pmap)) {
 5318                         pvh_gen = pvh->pv_gen;
 5319                         rw_wunlock(lock);
 5320                         PMAP_LOCK(pmap);
 5321                         rw_wlock(lock);
 5322                         if (pvh_gen != pvh->pv_gen) {
 5323                                 rw_wunlock(lock);
 5324                                 PMAP_UNLOCK(pmap);
 5325                                 goto retry;
 5326                         }
 5327                 }
 5328                 va = pv->pv_va;
 5329                 pde = pmap_pde(pmap, va);
 5330                 (void)pmap_demote_pde_locked(pmap, pde, va, &lock);
 5331                 PMAP_UNLOCK(pmap);
 5332         }
 5333         while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 5334                 pmap = PV_PMAP(pv);
 5335                 if (!PMAP_TRYLOCK(pmap)) {
 5336                         pvh_gen = pvh->pv_gen;
 5337                         md_gen = m->md.pv_gen;
 5338                         rw_wunlock(lock);
 5339                         PMAP_LOCK(pmap);
 5340                         rw_wlock(lock);
 5341                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 5342                                 rw_wunlock(lock);
 5343                                 PMAP_UNLOCK(pmap);
 5344                                 goto retry;
 5345                         }
 5346                 }
 5347                 PG_A = pmap_accessed_bit(pmap);
 5348                 PG_M = pmap_modified_bit(pmap);
 5349                 PG_RW = pmap_rw_bit(pmap);
 5350                 pmap_resident_count_dec(pmap, 1);
 5351                 pde = pmap_pde(pmap, pv->pv_va);
 5352                 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
 5353                     " a 2mpage in page %p's pv list", m));
 5354                 pte = pmap_pde_to_pte(pde, pv->pv_va);
 5355                 tpte = pte_load_clear(pte);
 5356                 if (tpte & PG_W)
 5357                         pmap->pm_stats.wired_count--;
 5358                 if (tpte & PG_A)
 5359                         vm_page_aflag_set(m, PGA_REFERENCED);
 5360 
 5361                 /*
 5362                  * Update the vm_page_t clean and reference bits.
 5363                  */
 5364                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 5365                         vm_page_dirty(m);
 5366                 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
 5367                 pmap_invalidate_page(pmap, pv->pv_va);
 5368                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 5369                 m->md.pv_gen++;
 5370                 free_pv_entry(pmap, pv);
 5371                 PMAP_UNLOCK(pmap);
 5372         }
 5373         vm_page_aflag_clear(m, PGA_WRITEABLE);
 5374         rw_wunlock(lock);
 5375         pmap_delayed_invl_wait(m);
 5376         vm_page_free_pages_toq(&free, true);
 5377 }
 5378 
 5379 /*
 5380  * pmap_protect_pde: do the things to protect a 2mpage in a process
 5381  */
 5382 static boolean_t
 5383 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
 5384 {
 5385         pd_entry_t newpde, oldpde;
 5386         vm_page_t m, mt;
 5387         boolean_t anychanged;
 5388         pt_entry_t PG_G, PG_M, PG_RW;
 5389 
 5390         PG_G = pmap_global_bit(pmap);
 5391         PG_M = pmap_modified_bit(pmap);
 5392         PG_RW = pmap_rw_bit(pmap);
 5393 
 5394         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 5395         KASSERT((sva & PDRMASK) == 0,
 5396             ("pmap_protect_pde: sva is not 2mpage aligned"));
 5397         anychanged = FALSE;
 5398 retry:
 5399         oldpde = newpde = *pde;
 5400         if ((prot & VM_PROT_WRITE) == 0) {
 5401                 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
 5402                     (PG_MANAGED | PG_M | PG_RW)) {
 5403                         m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 5404                         for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 5405                                 vm_page_dirty(mt);
 5406                 }
 5407                 newpde &= ~(PG_RW | PG_M);
 5408         }
 5409         if ((prot & VM_PROT_EXECUTE) == 0)
 5410                 newpde |= pg_nx;
 5411         if (newpde != oldpde) {
 5412                 /*
 5413                  * As an optimization to future operations on this PDE, clear
 5414                  * PG_PROMOTED.  The impending invalidation will remove any
 5415                  * lingering 4KB page mappings from the TLB.
 5416                  */
 5417                 if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED))
 5418                         goto retry;
 5419                 if ((oldpde & PG_G) != 0)
 5420                         pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
 5421                 else
 5422                         anychanged = TRUE;
 5423         }
 5424         return (anychanged);
 5425 }
 5426 
 5427 /*
 5428  *      Set the physical protection on the
 5429  *      specified range of this map as requested.
 5430  */
 5431 void
 5432 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 5433 {
 5434         vm_offset_t va_next;
 5435         pml4_entry_t *pml4e;
 5436         pdp_entry_t *pdpe;
 5437         pd_entry_t ptpaddr, *pde;
 5438         pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
 5439         boolean_t anychanged;
 5440 
 5441         KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
 5442         if (prot == VM_PROT_NONE) {
 5443                 pmap_remove(pmap, sva, eva);
 5444                 return;
 5445         }
 5446 
 5447         if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
 5448             (VM_PROT_WRITE|VM_PROT_EXECUTE))
 5449                 return;
 5450 
 5451         PG_G = pmap_global_bit(pmap);
 5452         PG_M = pmap_modified_bit(pmap);
 5453         PG_V = pmap_valid_bit(pmap);
 5454         PG_RW = pmap_rw_bit(pmap);
 5455         anychanged = FALSE;
 5456 
 5457         /*
 5458          * Although this function delays and batches the invalidation
 5459          * of stale TLB entries, it does not need to call
 5460          * pmap_delayed_invl_start() and
 5461          * pmap_delayed_invl_finish(), because it does not
 5462          * ordinarily destroy mappings.  Stale TLB entries from
 5463          * protection-only changes need only be invalidated before the
 5464          * pmap lock is released, because protection-only changes do
 5465          * not destroy PV entries.  Even operations that iterate over
 5466          * a physical page's PV list of mappings, like
 5467          * pmap_remove_write(), acquire the pmap lock for each
 5468          * mapping.  Consequently, for protection-only changes, the
 5469          * pmap lock suffices to synchronize both page table and TLB
 5470          * updates.
 5471          *
 5472          * This function only destroys a mapping if pmap_demote_pde()
 5473          * fails.  In that case, stale TLB entries are immediately
 5474          * invalidated.
 5475          */
 5476         
 5477         PMAP_LOCK(pmap);
 5478         for (; sva < eva; sva = va_next) {
 5479 
 5480                 pml4e = pmap_pml4e(pmap, sva);
 5481                 if ((*pml4e & PG_V) == 0) {
 5482                         va_next = (sva + NBPML4) & ~PML4MASK;
 5483                         if (va_next < sva)
 5484                                 va_next = eva;
 5485                         continue;
 5486                 }
 5487 
 5488                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 5489                 if ((*pdpe & PG_V) == 0) {
 5490                         va_next = (sva + NBPDP) & ~PDPMASK;
 5491                         if (va_next < sva)
 5492                                 va_next = eva;
 5493                         continue;
 5494                 }
 5495 
 5496                 va_next = (sva + NBPDR) & ~PDRMASK;
 5497                 if (va_next < sva)
 5498                         va_next = eva;
 5499 
 5500                 pde = pmap_pdpe_to_pde(pdpe, sva);
 5501                 ptpaddr = *pde;
 5502 
 5503                 /*
 5504                  * Weed out invalid mappings.
 5505                  */
 5506                 if (ptpaddr == 0)
 5507                         continue;
 5508 
 5509                 /*
 5510                  * Check for large page.
 5511                  */
 5512                 if ((ptpaddr & PG_PS) != 0) {
 5513                         /*
 5514                          * Are we protecting the entire large page?  If not,
 5515                          * demote the mapping and fall through.
 5516                          */
 5517                         if (sva + NBPDR == va_next && eva >= va_next) {
 5518                                 /*
 5519                                  * The TLB entry for a PG_G mapping is
 5520                                  * invalidated by pmap_protect_pde().
 5521                                  */
 5522                                 if (pmap_protect_pde(pmap, pde, sva, prot))
 5523                                         anychanged = TRUE;
 5524                                 continue;
 5525                         } else if (!pmap_demote_pde(pmap, pde, sva)) {
 5526                                 /*
 5527                                  * The large page mapping was destroyed.
 5528                                  */
 5529                                 continue;
 5530                         }
 5531                 }
 5532 
 5533                 if (va_next > eva)
 5534                         va_next = eva;
 5535 
 5536                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 5537                     sva += PAGE_SIZE) {
 5538                         pt_entry_t obits, pbits;
 5539                         vm_page_t m;
 5540 
 5541 retry:
 5542                         obits = pbits = *pte;
 5543                         if ((pbits & PG_V) == 0)
 5544                                 continue;
 5545 
 5546                         if ((prot & VM_PROT_WRITE) == 0) {
 5547                                 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
 5548                                     (PG_MANAGED | PG_M | PG_RW)) {
 5549                                         m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
 5550                                         vm_page_dirty(m);
 5551                                 }
 5552                                 pbits &= ~(PG_RW | PG_M);
 5553                         }
 5554                         if ((prot & VM_PROT_EXECUTE) == 0)
 5555                                 pbits |= pg_nx;
 5556 
 5557                         if (pbits != obits) {
 5558                                 if (!atomic_cmpset_long(pte, obits, pbits))
 5559                                         goto retry;
 5560                                 if (obits & PG_G)
 5561                                         pmap_invalidate_page(pmap, sva);
 5562                                 else
 5563                                         anychanged = TRUE;
 5564                         }
 5565                 }
 5566         }
 5567         if (anychanged)
 5568                 pmap_invalidate_all(pmap);
 5569         PMAP_UNLOCK(pmap);
 5570 }
 5571 
 5572 #if VM_NRESERVLEVEL > 0
 5573 static bool
 5574 pmap_pde_ept_executable(pmap_t pmap, pd_entry_t pde)
 5575 {
 5576 
 5577         if (pmap->pm_type != PT_EPT)
 5578                 return (false);
 5579         return ((pde & EPT_PG_EXECUTE) != 0);
 5580 }
 5581 
 5582 /*
 5583  * Tries to promote the 512, contiguous 4KB page mappings that are within a
 5584  * single page table page (PTP) to a single 2MB page mapping.  For promotion
 5585  * to occur, two conditions must be met: (1) the 4KB page mappings must map
 5586  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
 5587  * identical characteristics. 
 5588  */
 5589 static void
 5590 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
 5591     struct rwlock **lockp)
 5592 {
 5593         pd_entry_t newpde;
 5594         pt_entry_t *firstpte, oldpte, pa, *pte;
 5595         pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK;
 5596         vm_page_t mpte;
 5597         int PG_PTE_CACHE;
 5598 
 5599         PG_A = pmap_accessed_bit(pmap);
 5600         PG_G = pmap_global_bit(pmap);
 5601         PG_M = pmap_modified_bit(pmap);
 5602         PG_V = pmap_valid_bit(pmap);
 5603         PG_RW = pmap_rw_bit(pmap);
 5604         PG_PKU_MASK = pmap_pku_mask_bit(pmap);
 5605         PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
 5606 
 5607         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 5608 
 5609         /*
 5610          * Examine the first PTE in the specified PTP.  Abort if this PTE is
 5611          * either invalid, unused, or does not map the first 4KB physical page
 5612          * within a 2MB page. 
 5613          */
 5614         firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
 5615 setpde:
 5616         newpde = *firstpte;
 5617         if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V) ||
 5618             !pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap,
 5619             newpde))) {
 5620                 atomic_add_long(&pmap_pde_p_failures, 1);
 5621                 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 5622                     " in pmap %p", va, pmap);
 5623                 return;
 5624         }
 5625         if ((newpde & (PG_M | PG_RW)) == PG_RW) {
 5626                 /*
 5627                  * When PG_M is already clear, PG_RW can be cleared without
 5628                  * a TLB invalidation.
 5629                  */
 5630                 if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
 5631                         goto setpde;
 5632                 newpde &= ~PG_RW;
 5633         }
 5634 
 5635         /*
 5636          * Examine each of the other PTEs in the specified PTP.  Abort if this
 5637          * PTE maps an unexpected 4KB physical page or does not have identical
 5638          * characteristics to the first PTE.
 5639          */
 5640         pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
 5641         for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
 5642 setpte:
 5643                 oldpte = *pte;
 5644                 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
 5645                         atomic_add_long(&pmap_pde_p_failures, 1);
 5646                         CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 5647                             " in pmap %p", va, pmap);
 5648                         return;
 5649                 }
 5650                 if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
 5651                         /*
 5652                          * When PG_M is already clear, PG_RW can be cleared
 5653                          * without a TLB invalidation.
 5654                          */
 5655                         if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
 5656                                 goto setpte;
 5657                         oldpte &= ~PG_RW;
 5658                         CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
 5659                             " in pmap %p", (oldpte & PG_FRAME & PDRMASK) |
 5660                             (va & ~PDRMASK), pmap);
 5661                 }
 5662                 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
 5663                         atomic_add_long(&pmap_pde_p_failures, 1);
 5664                         CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 5665                             " in pmap %p", va, pmap);
 5666                         return;
 5667                 }
 5668                 pa -= PAGE_SIZE;
 5669         }
 5670 
 5671         /*
 5672          * Save the page table page in its current state until the PDE
 5673          * mapping the superpage is demoted by pmap_demote_pde() or
 5674          * destroyed by pmap_remove_pde(). 
 5675          */
 5676         mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 5677         KASSERT(mpte >= vm_page_array &&
 5678             mpte < &vm_page_array[vm_page_array_size],
 5679             ("pmap_promote_pde: page table page is out of range"));
 5680         KASSERT(mpte->pindex == pmap_pde_pindex(va),
 5681             ("pmap_promote_pde: page table page's pindex is wrong"));
 5682         if (pmap_insert_pt_page(pmap, mpte, true)) {
 5683                 atomic_add_long(&pmap_pde_p_failures, 1);
 5684                 CTR2(KTR_PMAP,
 5685                     "pmap_promote_pde: failure for va %#lx in pmap %p", va,
 5686                     pmap);
 5687                 return;
 5688         }
 5689 
 5690         /*
 5691          * Promote the pv entries.
 5692          */
 5693         if ((newpde & PG_MANAGED) != 0)
 5694                 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
 5695 
 5696         /*
 5697          * Propagate the PAT index to its proper position.
 5698          */
 5699         newpde = pmap_swap_pat(pmap, newpde);
 5700 
 5701         /*
 5702          * Map the superpage.
 5703          */
 5704         if (workaround_erratum383)
 5705                 pmap_update_pde(pmap, va, pde, PG_PS | newpde);
 5706         else
 5707                 pde_store(pde, PG_PROMOTED | PG_PS | newpde);
 5708 
 5709         atomic_add_long(&pmap_pde_promotions, 1);
 5710         CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
 5711             " in pmap %p", va, pmap);
 5712 }
 5713 #endif /* VM_NRESERVLEVEL > 0 */
 5714 
 5715 /*
 5716  *      Insert the given physical page (p) at
 5717  *      the specified virtual address (v) in the
 5718  *      target physical map with the protection requested.
 5719  *
 5720  *      If specified, the page will be wired down, meaning
 5721  *      that the related pte can not be reclaimed.
 5722  *
 5723  *      NB:  This is the only routine which MAY NOT lazy-evaluate
 5724  *      or lose information.  That is, this routine must actually
 5725  *      insert this page into the given map NOW.
 5726  *
 5727  *      When destroying both a page table and PV entry, this function
 5728  *      performs the TLB invalidation before releasing the PV list
 5729  *      lock, so we do not need pmap_delayed_invl_page() calls here.
 5730  */
 5731 int
 5732 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
 5733     u_int flags, int8_t psind)
 5734 {
 5735         struct rwlock *lock;
 5736         pd_entry_t *pde;
 5737         pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
 5738         pt_entry_t newpte, origpte;
 5739         pv_entry_t pv;
 5740         vm_paddr_t opa, pa;
 5741         vm_page_t mpte, om;
 5742         int rv;
 5743         boolean_t nosleep;
 5744 
 5745         PG_A = pmap_accessed_bit(pmap);
 5746         PG_G = pmap_global_bit(pmap);
 5747         PG_M = pmap_modified_bit(pmap);
 5748         PG_V = pmap_valid_bit(pmap);
 5749         PG_RW = pmap_rw_bit(pmap);
 5750 
 5751         va = trunc_page(va);
 5752         KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
 5753         KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
 5754             ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
 5755             va));
 5756         KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
 5757             va >= kmi.clean_eva,
 5758             ("pmap_enter: managed mapping within the clean submap"));
 5759         if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
 5760                 VM_OBJECT_ASSERT_LOCKED(m->object);
 5761         KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
 5762             ("pmap_enter: flags %u has reserved bits set", flags));
 5763         pa = VM_PAGE_TO_PHYS(m);
 5764         newpte = (pt_entry_t)(pa | PG_A | PG_V);
 5765         if ((flags & VM_PROT_WRITE) != 0)
 5766                 newpte |= PG_M;
 5767         if ((prot & VM_PROT_WRITE) != 0)
 5768                 newpte |= PG_RW;
 5769         KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
 5770             ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
 5771         if ((prot & VM_PROT_EXECUTE) == 0)
 5772                 newpte |= pg_nx;
 5773         if ((flags & PMAP_ENTER_WIRED) != 0)
 5774                 newpte |= PG_W;
 5775         if (va < VM_MAXUSER_ADDRESS)
 5776                 newpte |= PG_U;
 5777         if (pmap == kernel_pmap)
 5778                 newpte |= PG_G;
 5779         newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0);
 5780 
 5781         /*
 5782          * Set modified bit gratuitously for writeable mappings if
 5783          * the page is unmanaged. We do not want to take a fault
 5784          * to do the dirty bit accounting for these mappings.
 5785          */
 5786         if ((m->oflags & VPO_UNMANAGED) != 0) {
 5787                 if ((newpte & PG_RW) != 0)
 5788                         newpte |= PG_M;
 5789         } else
 5790                 newpte |= PG_MANAGED;
 5791 
 5792         lock = NULL;
 5793         PMAP_LOCK(pmap);
 5794         if (psind == 1) {
 5795                 /* Assert the required virtual and physical alignment. */ 
 5796                 KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned"));
 5797                 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
 5798                 rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock);
 5799                 goto out;
 5800         }
 5801         mpte = NULL;
 5802 
 5803         /*
 5804          * In the case that a page table page is not
 5805          * resident, we are creating it here.
 5806          */
 5807 retry:
 5808         pde = pmap_pde(pmap, va);
 5809         if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
 5810             pmap_demote_pde_locked(pmap, pde, va, &lock))) {
 5811                 pte = pmap_pde_to_pte(pde, va);
 5812                 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
 5813                         mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 5814                         mpte->wire_count++;
 5815                 }
 5816         } else if (va < VM_MAXUSER_ADDRESS) {
 5817                 /*
 5818                  * Here if the pte page isn't mapped, or if it has been
 5819                  * deallocated.
 5820                  */
 5821                 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
 5822                 mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va),
 5823                     nosleep ? NULL : &lock);
 5824                 if (mpte == NULL && nosleep) {
 5825                         rv = KERN_RESOURCE_SHORTAGE;
 5826                         goto out;
 5827                 }
 5828                 goto retry;
 5829         } else
 5830                 panic("pmap_enter: invalid page directory va=%#lx", va);
 5831 
 5832         origpte = *pte;
 5833         pv = NULL;
 5834         if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86)
 5835                 newpte |= pmap_pkru_get(pmap, va);
 5836 
 5837         /*
 5838          * Is the specified virtual address already mapped?
 5839          */
 5840         if ((origpte & PG_V) != 0) {
 5841                 /*
 5842                  * Wiring change, just update stats. We don't worry about
 5843                  * wiring PT pages as they remain resident as long as there
 5844                  * are valid mappings in them. Hence, if a user page is wired,
 5845                  * the PT page will be also.
 5846                  */
 5847                 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
 5848                         pmap->pm_stats.wired_count++;
 5849                 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
 5850                         pmap->pm_stats.wired_count--;
 5851 
 5852                 /*
 5853                  * Remove the extra PT page reference.
 5854                  */
 5855                 if (mpte != NULL) {
 5856                         mpte->wire_count--;
 5857                         KASSERT(mpte->wire_count > 0,
 5858                             ("pmap_enter: missing reference to page table page,"
 5859                              " va: 0x%lx", va));
 5860                 }
 5861 
 5862                 /*
 5863                  * Has the physical page changed?
 5864                  */
 5865                 opa = origpte & PG_FRAME;
 5866                 if (opa == pa) {
 5867                         /*
 5868                          * No, might be a protection or wiring change.
 5869                          */
 5870                         if ((origpte & PG_MANAGED) != 0 &&
 5871                             (newpte & PG_RW) != 0)
 5872                                 vm_page_aflag_set(m, PGA_WRITEABLE);
 5873                         if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
 5874                                 goto unchanged;
 5875                         goto validate;
 5876                 }
 5877 
 5878                 /*
 5879                  * The physical page has changed.  Temporarily invalidate
 5880                  * the mapping.  This ensures that all threads sharing the
 5881                  * pmap keep a consistent view of the mapping, which is
 5882                  * necessary for the correct handling of COW faults.  It
 5883                  * also permits reuse of the old mapping's PV entry,
 5884                  * avoiding an allocation.
 5885                  *
 5886                  * For consistency, handle unmanaged mappings the same way.
 5887                  */
 5888                 origpte = pte_load_clear(pte);
 5889                 KASSERT((origpte & PG_FRAME) == opa,
 5890                     ("pmap_enter: unexpected pa update for %#lx", va));
 5891                 if ((origpte & PG_MANAGED) != 0) {
 5892                         om = PHYS_TO_VM_PAGE(opa);
 5893 
 5894                         /*
 5895                          * The pmap lock is sufficient to synchronize with
 5896                          * concurrent calls to pmap_page_test_mappings() and
 5897                          * pmap_ts_referenced().
 5898                          */
 5899                         if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 5900                                 vm_page_dirty(om);
 5901                         if ((origpte & PG_A) != 0)
 5902                                 vm_page_aflag_set(om, PGA_REFERENCED);
 5903                         CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
 5904                         pv = pmap_pvh_remove(&om->md, pmap, va);
 5905                         KASSERT(pv != NULL,
 5906                             ("pmap_enter: no PV entry for %#lx", va));
 5907                         if ((newpte & PG_MANAGED) == 0)
 5908                                 free_pv_entry(pmap, pv);
 5909                         if ((om->aflags & PGA_WRITEABLE) != 0 &&
 5910                             TAILQ_EMPTY(&om->md.pv_list) &&
 5911                             ((om->flags & PG_FICTITIOUS) != 0 ||
 5912                             TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
 5913                                 vm_page_aflag_clear(om, PGA_WRITEABLE);
 5914                 }
 5915                 if ((origpte & PG_A) != 0)
 5916                         pmap_invalidate_page(pmap, va);
 5917                 origpte = 0;
 5918         } else {
 5919                 /*
 5920                  * Increment the counters.
 5921                  */
 5922                 if ((newpte & PG_W) != 0)
 5923                         pmap->pm_stats.wired_count++;
 5924                 pmap_resident_count_inc(pmap, 1);
 5925         }
 5926 
 5927         /*
 5928          * Enter on the PV list if part of our managed memory.
 5929          */
 5930         if ((newpte & PG_MANAGED) != 0) {
 5931                 if (pv == NULL) {
 5932                         pv = get_pv_entry(pmap, &lock);
 5933                         pv->pv_va = va;
 5934                 }
 5935                 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
 5936                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 5937                 m->md.pv_gen++;
 5938                 if ((newpte & PG_RW) != 0)
 5939                         vm_page_aflag_set(m, PGA_WRITEABLE);
 5940         }
 5941 
 5942         /*
 5943          * Update the PTE.
 5944          */
 5945         if ((origpte & PG_V) != 0) {
 5946 validate:
 5947                 origpte = pte_load_store(pte, newpte);
 5948                 KASSERT((origpte & PG_FRAME) == pa,
 5949                     ("pmap_enter: unexpected pa update for %#lx", va));
 5950                 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) ==
 5951                     (PG_M | PG_RW)) {
 5952                         if ((origpte & PG_MANAGED) != 0)
 5953                                 vm_page_dirty(m);
 5954 
 5955                         /*
 5956                          * Although the PTE may still have PG_RW set, TLB
 5957                          * invalidation may nonetheless be required because
 5958                          * the PTE no longer has PG_M set.
 5959                          */
 5960                 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
 5961                         /*
 5962                          * This PTE change does not require TLB invalidation.
 5963                          */
 5964                         goto unchanged;
 5965                 }
 5966                 if ((origpte & PG_A) != 0)
 5967                         pmap_invalidate_page(pmap, va);
 5968         } else
 5969                 pte_store(pte, newpte);
 5970 
 5971 unchanged:
 5972 
 5973 #if VM_NRESERVLEVEL > 0
 5974         /*
 5975          * If both the page table page and the reservation are fully
 5976          * populated, then attempt promotion.
 5977          */
 5978         if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
 5979             pmap_ps_enabled(pmap) &&
 5980             (m->flags & PG_FICTITIOUS) == 0 &&
 5981             vm_reserv_level_iffullpop(m) == 0)
 5982                 pmap_promote_pde(pmap, pde, va, &lock);
 5983 #endif
 5984 
 5985         rv = KERN_SUCCESS;
 5986 out:
 5987         if (lock != NULL)
 5988                 rw_wunlock(lock);
 5989         PMAP_UNLOCK(pmap);
 5990         return (rv);
 5991 }
 5992 
 5993 /*
 5994  * Tries to create a read- and/or execute-only 2MB page mapping.  Returns true
 5995  * if successful.  Returns false if (1) a page table page cannot be allocated
 5996  * without sleeping, (2) a mapping already exists at the specified virtual
 5997  * address, or (3) a PV entry cannot be allocated without reclaiming another
 5998  * PV entry.
 5999  */
 6000 static bool
 6001 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
 6002     struct rwlock **lockp)
 6003 {
 6004         pd_entry_t newpde;
 6005         pt_entry_t PG_V;
 6006 
 6007         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 6008         PG_V = pmap_valid_bit(pmap);
 6009         newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
 6010             PG_PS | PG_V;
 6011         if ((m->oflags & VPO_UNMANAGED) == 0)
 6012                 newpde |= PG_MANAGED;
 6013         if ((prot & VM_PROT_EXECUTE) == 0)
 6014                 newpde |= pg_nx;
 6015         if (va < VM_MAXUSER_ADDRESS)
 6016                 newpde |= PG_U;
 6017         return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP |
 6018             PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) ==
 6019             KERN_SUCCESS);
 6020 }
 6021 
 6022 /*
 6023  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
 6024  * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
 6025  * otherwise.  Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
 6026  * a mapping already exists at the specified virtual address.  Returns
 6027  * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
 6028  * page allocation failed.  Returns KERN_RESOURCE_SHORTAGE if
 6029  * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
 6030  *
 6031  * The parameter "m" is only used when creating a managed, writeable mapping.
 6032  */
 6033 static int
 6034 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
 6035     vm_page_t m, struct rwlock **lockp)
 6036 {
 6037         struct spglist free;
 6038         pd_entry_t oldpde, *pde;
 6039         pt_entry_t PG_G, PG_RW, PG_V;
 6040         vm_page_t mt, pdpg;
 6041 
 6042         PG_G = pmap_global_bit(pmap);
 6043         PG_RW = pmap_rw_bit(pmap);
 6044         KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW,
 6045             ("pmap_enter_pde: newpde is missing PG_M"));
 6046         PG_V = pmap_valid_bit(pmap);
 6047         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 6048 
 6049         if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap,
 6050             newpde))) {
 6051                 CTR2(KTR_PMAP, "pmap_enter_pde: 2m x blocked for va %#lx"
 6052                     " in pmap %p", va, pmap);
 6053                 return (KERN_FAILURE);
 6054         }
 6055         if ((pdpg = pmap_allocpde(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
 6056             NULL : lockp)) == NULL) {
 6057                 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 6058                     " in pmap %p", va, pmap);
 6059                 return (KERN_RESOURCE_SHORTAGE);
 6060         }
 6061 
 6062         /*
 6063          * If pkru is not same for the whole pde range, return failure
 6064          * and let vm_fault() cope.  Check after pde allocation, since
 6065          * it could sleep.
 6066          */
 6067         if (!pmap_pkru_same(pmap, va, va + NBPDR)) {
 6068                 SLIST_INIT(&free);
 6069                 if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
 6070                         pmap_invalidate_page(pmap, va);
 6071                         vm_page_free_pages_toq(&free, true);
 6072                 }
 6073                 return (KERN_FAILURE);
 6074         }
 6075         if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) {
 6076                 newpde &= ~X86_PG_PKU_MASK;
 6077                 newpde |= pmap_pkru_get(pmap, va);
 6078         }
 6079 
 6080         pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
 6081         pde = &pde[pmap_pde_index(va)];
 6082         oldpde = *pde;
 6083         if ((oldpde & PG_V) != 0) {
 6084                 KASSERT(pdpg->wire_count > 1,
 6085                     ("pmap_enter_pde: pdpg's wire count is too low"));
 6086                 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
 6087                         pdpg->wire_count--;
 6088                         CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 6089                             " in pmap %p", va, pmap);
 6090                         return (KERN_FAILURE);
 6091                 }
 6092                 /* Break the existing mapping(s). */
 6093                 SLIST_INIT(&free);
 6094                 if ((oldpde & PG_PS) != 0) {
 6095                         /*
 6096                          * The reference to the PD page that was acquired by
 6097                          * pmap_allocpde() ensures that it won't be freed.
 6098                          * However, if the PDE resulted from a promotion, then
 6099                          * a reserved PT page could be freed.
 6100                          */
 6101                         (void)pmap_remove_pde(pmap, pde, va, &free, lockp);
 6102                         if ((oldpde & PG_G) == 0)
 6103                                 pmap_invalidate_pde_page(pmap, va, oldpde);
 6104                 } else {
 6105                         pmap_delayed_invl_start();
 6106                         if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free,
 6107                             lockp))
 6108                                pmap_invalidate_all(pmap);
 6109                         pmap_delayed_invl_finish();
 6110                 }
 6111                 vm_page_free_pages_toq(&free, true);
 6112                 if (va >= VM_MAXUSER_ADDRESS) {
 6113                         /*
 6114                          * Both pmap_remove_pde() and pmap_remove_ptes() will
 6115                          * leave the kernel page table page zero filled.
 6116                          */
 6117                         mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 6118                         if (pmap_insert_pt_page(pmap, mt, false))
 6119                                 panic("pmap_enter_pde: trie insert failed");
 6120                 } else
 6121                         KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p",
 6122                             pde));
 6123         }
 6124         if ((newpde & PG_MANAGED) != 0) {
 6125                 /*
 6126                  * Abort this mapping if its PV entry could not be created.
 6127                  */
 6128                 if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) {
 6129                         SLIST_INIT(&free);
 6130                         if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
 6131                                 /*
 6132                                  * Although "va" is not mapped, paging-
 6133                                  * structure caches could nonetheless have
 6134                                  * entries that refer to the freed page table
 6135                                  * pages.  Invalidate those entries.
 6136                                  */
 6137                                 pmap_invalidate_page(pmap, va);
 6138                                 vm_page_free_pages_toq(&free, true);
 6139                         }
 6140                         CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 6141                             " in pmap %p", va, pmap);
 6142                         return (KERN_RESOURCE_SHORTAGE);
 6143                 }
 6144                 if ((newpde & PG_RW) != 0) {
 6145                         for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 6146                                 vm_page_aflag_set(mt, PGA_WRITEABLE);
 6147                 }
 6148         }
 6149 
 6150         /*
 6151          * Increment counters.
 6152          */
 6153         if ((newpde & PG_W) != 0)
 6154                 pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE;
 6155         pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
 6156 
 6157         /*
 6158          * Map the superpage.  (This is not a promoted mapping; there will not
 6159          * be any lingering 4KB page mappings in the TLB.)
 6160          */
 6161         pde_store(pde, newpde);
 6162 
 6163         atomic_add_long(&pmap_pde_mappings, 1);
 6164         CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
 6165             " in pmap %p", va, pmap);
 6166         return (KERN_SUCCESS);
 6167 }
 6168 
 6169 /*
 6170  * Maps a sequence of resident pages belonging to the same object.
 6171  * The sequence begins with the given page m_start.  This page is
 6172  * mapped at the given virtual address start.  Each subsequent page is
 6173  * mapped at a virtual address that is offset from start by the same
 6174  * amount as the page is offset from m_start within the object.  The
 6175  * last page in the sequence is the page with the largest offset from
 6176  * m_start that can be mapped at a virtual address less than the given
 6177  * virtual address end.  Not every virtual page between start and end
 6178  * is mapped; only those for which a resident page exists with the
 6179  * corresponding offset from m_start are mapped.
 6180  */
 6181 void
 6182 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
 6183     vm_page_t m_start, vm_prot_t prot)
 6184 {
 6185         struct rwlock *lock;
 6186         vm_offset_t va;
 6187         vm_page_t m, mpte;
 6188         vm_pindex_t diff, psize;
 6189 
 6190         VM_OBJECT_ASSERT_LOCKED(m_start->object);
 6191 
 6192         psize = atop(end - start);
 6193         mpte = NULL;
 6194         m = m_start;
 6195         lock = NULL;
 6196         PMAP_LOCK(pmap);
 6197         while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
 6198                 va = start + ptoa(diff);
 6199                 if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
 6200                     m->psind == 1 && pmap_ps_enabled(pmap) &&
 6201                     pmap_allow_2m_x_page(pmap, (prot & VM_PROT_EXECUTE) != 0) &&
 6202                     pmap_enter_2mpage(pmap, va, m, prot, &lock))
 6203                         m = &m[NBPDR / PAGE_SIZE - 1];
 6204                 else
 6205                         mpte = pmap_enter_quick_locked(pmap, va, m, prot,
 6206                             mpte, &lock);
 6207                 m = TAILQ_NEXT(m, listq);
 6208         }
 6209         if (lock != NULL)
 6210                 rw_wunlock(lock);
 6211         PMAP_UNLOCK(pmap);
 6212 }
 6213 
 6214 /*
 6215  * this code makes some *MAJOR* assumptions:
 6216  * 1. Current pmap & pmap exists.
 6217  * 2. Not wired.
 6218  * 3. Read access.
 6219  * 4. No page table pages.
 6220  * but is *MUCH* faster than pmap_enter...
 6221  */
 6222 
 6223 void
 6224 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 6225 {
 6226         struct rwlock *lock;
 6227 
 6228         lock = NULL;
 6229         PMAP_LOCK(pmap);
 6230         (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
 6231         if (lock != NULL)
 6232                 rw_wunlock(lock);
 6233         PMAP_UNLOCK(pmap);
 6234 }
 6235 
 6236 static vm_page_t
 6237 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
 6238     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
 6239 {
 6240         struct spglist free;
 6241         pt_entry_t newpte, *pte, PG_V;
 6242 
 6243         KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
 6244             (m->oflags & VPO_UNMANAGED) != 0,
 6245             ("pmap_enter_quick_locked: managed mapping within the clean submap"));
 6246         PG_V = pmap_valid_bit(pmap);
 6247         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 6248 
 6249         /*
 6250          * In the case that a page table page is not
 6251          * resident, we are creating it here.
 6252          */
 6253         if (va < VM_MAXUSER_ADDRESS) {
 6254                 vm_pindex_t ptepindex;
 6255                 pd_entry_t *ptepa;
 6256 
 6257                 /*
 6258                  * Calculate pagetable page index
 6259                  */
 6260                 ptepindex = pmap_pde_pindex(va);
 6261                 if (mpte && (mpte->pindex == ptepindex)) {
 6262                         mpte->wire_count++;
 6263                 } else {
 6264                         /*
 6265                          * Get the page directory entry
 6266                          */
 6267                         ptepa = pmap_pde(pmap, va);
 6268 
 6269                         /*
 6270                          * If the page table page is mapped, we just increment
 6271                          * the hold count, and activate it.  Otherwise, we
 6272                          * attempt to allocate a page table page.  If this
 6273                          * attempt fails, we don't retry.  Instead, we give up.
 6274                          */
 6275                         if (ptepa && (*ptepa & PG_V) != 0) {
 6276                                 if (*ptepa & PG_PS)
 6277                                         return (NULL);
 6278                                 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
 6279                                 mpte->wire_count++;
 6280                         } else {
 6281                                 /*
 6282                                  * Pass NULL instead of the PV list lock
 6283                                  * pointer, because we don't intend to sleep.
 6284                                  */
 6285                                 mpte = _pmap_allocpte(pmap, ptepindex, NULL);
 6286                                 if (mpte == NULL)
 6287                                         return (mpte);
 6288                         }
 6289                 }
 6290                 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
 6291                 pte = &pte[pmap_pte_index(va)];
 6292         } else {
 6293                 mpte = NULL;
 6294                 pte = vtopte(va);
 6295         }
 6296         if (*pte) {
 6297                 if (mpte != NULL) {
 6298                         mpte->wire_count--;
 6299                         mpte = NULL;
 6300                 }
 6301                 return (mpte);
 6302         }
 6303 
 6304         /*
 6305          * Enter on the PV list if part of our managed memory.
 6306          */
 6307         if ((m->oflags & VPO_UNMANAGED) == 0 &&
 6308             !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
 6309                 if (mpte != NULL) {
 6310                         SLIST_INIT(&free);
 6311                         if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
 6312                                 /*
 6313                                  * Although "va" is not mapped, paging-
 6314                                  * structure caches could nonetheless have
 6315                                  * entries that refer to the freed page table
 6316                                  * pages.  Invalidate those entries.
 6317                                  */
 6318                                 pmap_invalidate_page(pmap, va);
 6319                                 vm_page_free_pages_toq(&free, true);
 6320                         }
 6321                         mpte = NULL;
 6322                 }
 6323                 return (mpte);
 6324         }
 6325 
 6326         /*
 6327          * Increment counters
 6328          */
 6329         pmap_resident_count_inc(pmap, 1);
 6330 
 6331         newpte = VM_PAGE_TO_PHYS(m) | PG_V |
 6332             pmap_cache_bits(pmap, m->md.pat_mode, 0);
 6333         if ((m->oflags & VPO_UNMANAGED) == 0)
 6334                 newpte |= PG_MANAGED;
 6335         if ((prot & VM_PROT_EXECUTE) == 0)
 6336                 newpte |= pg_nx;
 6337         if (va < VM_MAXUSER_ADDRESS)
 6338                 newpte |= PG_U | pmap_pkru_get(pmap, va);
 6339         pte_store(pte, newpte);
 6340         return (mpte);
 6341 }
 6342 
 6343 /*
 6344  * Make a temporary mapping for a physical address.  This is only intended
 6345  * to be used for panic dumps.
 6346  */
 6347 void *
 6348 pmap_kenter_temporary(vm_paddr_t pa, int i)
 6349 {
 6350         vm_offset_t va;
 6351 
 6352         va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
 6353         pmap_kenter(va, pa);
 6354         invlpg(va);
 6355         return ((void *)crashdumpmap);
 6356 }
 6357 
 6358 /*
 6359  * This code maps large physical mmap regions into the
 6360  * processor address space.  Note that some shortcuts
 6361  * are taken, but the code works.
 6362  */
 6363 void
 6364 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
 6365     vm_pindex_t pindex, vm_size_t size)
 6366 {
 6367         pd_entry_t *pde;
 6368         pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 6369         vm_paddr_t pa, ptepa;
 6370         vm_page_t p, pdpg;
 6371         int pat_mode;
 6372 
 6373         PG_A = pmap_accessed_bit(pmap);
 6374         PG_M = pmap_modified_bit(pmap);
 6375         PG_V = pmap_valid_bit(pmap);
 6376         PG_RW = pmap_rw_bit(pmap);
 6377 
 6378         VM_OBJECT_ASSERT_WLOCKED(object);
 6379         KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
 6380             ("pmap_object_init_pt: non-device object"));
 6381         if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
 6382                 if (!pmap_ps_enabled(pmap))
 6383                         return;
 6384                 if (!vm_object_populate(object, pindex, pindex + atop(size)))
 6385                         return;
 6386                 p = vm_page_lookup(object, pindex);
 6387                 KASSERT(p->valid == VM_PAGE_BITS_ALL,
 6388                     ("pmap_object_init_pt: invalid page %p", p));
 6389                 pat_mode = p->md.pat_mode;
 6390 
 6391                 /*
 6392                  * Abort the mapping if the first page is not physically
 6393                  * aligned to a 2MB page boundary.
 6394                  */
 6395                 ptepa = VM_PAGE_TO_PHYS(p);
 6396                 if (ptepa & (NBPDR - 1))
 6397                         return;
 6398 
 6399                 /*
 6400                  * Skip the first page.  Abort the mapping if the rest of
 6401                  * the pages are not physically contiguous or have differing
 6402                  * memory attributes.
 6403                  */
 6404                 p = TAILQ_NEXT(p, listq);
 6405                 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
 6406                     pa += PAGE_SIZE) {
 6407                         KASSERT(p->valid == VM_PAGE_BITS_ALL,
 6408                             ("pmap_object_init_pt: invalid page %p", p));
 6409                         if (pa != VM_PAGE_TO_PHYS(p) ||
 6410                             pat_mode != p->md.pat_mode)
 6411                                 return;
 6412                         p = TAILQ_NEXT(p, listq);
 6413                 }
 6414 
 6415                 /*
 6416                  * Map using 2MB pages.  Since "ptepa" is 2M aligned and
 6417                  * "size" is a multiple of 2M, adding the PAT setting to "pa"
 6418                  * will not affect the termination of this loop.
 6419                  */ 
 6420                 PMAP_LOCK(pmap);
 6421                 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
 6422                     pa < ptepa + size; pa += NBPDR) {
 6423                         pdpg = pmap_allocpde(pmap, addr, NULL);
 6424                         if (pdpg == NULL) {
 6425                                 /*
 6426                                  * The creation of mappings below is only an
 6427                                  * optimization.  If a page directory page
 6428                                  * cannot be allocated without blocking,
 6429                                  * continue on to the next mapping rather than
 6430                                  * blocking.
 6431                                  */
 6432                                 addr += NBPDR;
 6433                                 continue;
 6434                         }
 6435                         pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
 6436                         pde = &pde[pmap_pde_index(addr)];
 6437                         if ((*pde & PG_V) == 0) {
 6438                                 pde_store(pde, pa | PG_PS | PG_M | PG_A |
 6439                                     PG_U | PG_RW | PG_V);
 6440                                 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
 6441                                 atomic_add_long(&pmap_pde_mappings, 1);
 6442                         } else {
 6443                                 /* Continue on if the PDE is already valid. */
 6444                                 pdpg->wire_count--;
 6445                                 KASSERT(pdpg->wire_count > 0,
 6446                                     ("pmap_object_init_pt: missing reference "
 6447                                     "to page directory page, va: 0x%lx", addr));
 6448                         }
 6449                         addr += NBPDR;
 6450                 }
 6451                 PMAP_UNLOCK(pmap);
 6452         }
 6453 }
 6454 
 6455 /*
 6456  *      Clear the wired attribute from the mappings for the specified range of
 6457  *      addresses in the given pmap.  Every valid mapping within that range
 6458  *      must have the wired attribute set.  In contrast, invalid mappings
 6459  *      cannot have the wired attribute set, so they are ignored.
 6460  *
 6461  *      The wired attribute of the page table entry is not a hardware
 6462  *      feature, so there is no need to invalidate any TLB entries.
 6463  *      Since pmap_demote_pde() for the wired entry must never fail,
 6464  *      pmap_delayed_invl_start()/finish() calls around the
 6465  *      function are not needed.
 6466  */
 6467 void
 6468 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 6469 {
 6470         vm_offset_t va_next;
 6471         pml4_entry_t *pml4e;
 6472         pdp_entry_t *pdpe;
 6473         pd_entry_t *pde;
 6474         pt_entry_t *pte, PG_V;
 6475 
 6476         PG_V = pmap_valid_bit(pmap);
 6477         PMAP_LOCK(pmap);
 6478         for (; sva < eva; sva = va_next) {
 6479                 pml4e = pmap_pml4e(pmap, sva);
 6480                 if ((*pml4e & PG_V) == 0) {
 6481                         va_next = (sva + NBPML4) & ~PML4MASK;
 6482                         if (va_next < sva)
 6483                                 va_next = eva;
 6484                         continue;
 6485                 }
 6486                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 6487                 if ((*pdpe & PG_V) == 0) {
 6488                         va_next = (sva + NBPDP) & ~PDPMASK;
 6489                         if (va_next < sva)
 6490                                 va_next = eva;
 6491                         continue;
 6492                 }
 6493                 va_next = (sva + NBPDR) & ~PDRMASK;
 6494                 if (va_next < sva)
 6495                         va_next = eva;
 6496                 pde = pmap_pdpe_to_pde(pdpe, sva);
 6497                 if ((*pde & PG_V) == 0)
 6498                         continue;
 6499                 if ((*pde & PG_PS) != 0) {
 6500                         if ((*pde & PG_W) == 0)
 6501                                 panic("pmap_unwire: pde %#jx is missing PG_W",
 6502                                     (uintmax_t)*pde);
 6503 
 6504                         /*
 6505                          * Are we unwiring the entire large page?  If not,
 6506                          * demote the mapping and fall through.
 6507                          */
 6508                         if (sva + NBPDR == va_next && eva >= va_next) {
 6509                                 atomic_clear_long(pde, PG_W);
 6510                                 pmap->pm_stats.wired_count -= NBPDR /
 6511                                     PAGE_SIZE;
 6512                                 continue;
 6513                         } else if (!pmap_demote_pde(pmap, pde, sva))
 6514                                 panic("pmap_unwire: demotion failed");
 6515                 }
 6516                 if (va_next > eva)
 6517                         va_next = eva;
 6518                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 6519                     sva += PAGE_SIZE) {
 6520                         if ((*pte & PG_V) == 0)
 6521                                 continue;
 6522                         if ((*pte & PG_W) == 0)
 6523                                 panic("pmap_unwire: pte %#jx is missing PG_W",
 6524                                     (uintmax_t)*pte);
 6525 
 6526                         /*
 6527                          * PG_W must be cleared atomically.  Although the pmap
 6528                          * lock synchronizes access to PG_W, another processor
 6529                          * could be setting PG_M and/or PG_A concurrently.
 6530                          */
 6531                         atomic_clear_long(pte, PG_W);
 6532                         pmap->pm_stats.wired_count--;
 6533                 }
 6534         }
 6535         PMAP_UNLOCK(pmap);
 6536 }
 6537 
 6538 /*
 6539  *      Copy the range specified by src_addr/len
 6540  *      from the source map to the range dst_addr/len
 6541  *      in the destination map.
 6542  *
 6543  *      This routine is only advisory and need not do anything.
 6544  */
 6545 void
 6546 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
 6547     vm_offset_t src_addr)
 6548 {
 6549         struct rwlock *lock;
 6550         struct spglist free;
 6551         pml4_entry_t *pml4e;
 6552         pdp_entry_t *pdpe;
 6553         pd_entry_t *pde, srcptepaddr;
 6554         pt_entry_t *dst_pte, PG_A, PG_M, PG_V, ptetemp, *src_pte;
 6555         vm_offset_t addr, end_addr, va_next;
 6556         vm_page_t dst_pdpg, dstmpte, srcmpte;
 6557 
 6558         if (dst_addr != src_addr)
 6559                 return;
 6560 
 6561         if (dst_pmap->pm_type != src_pmap->pm_type)
 6562                 return;
 6563 
 6564         /*
 6565          * EPT page table entries that require emulation of A/D bits are
 6566          * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
 6567          * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
 6568          * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
 6569          * implementations flag an EPT misconfiguration for exec-only
 6570          * mappings we skip this function entirely for emulated pmaps.
 6571          */
 6572         if (pmap_emulate_ad_bits(dst_pmap))
 6573                 return;
 6574 
 6575         end_addr = src_addr + len;
 6576         lock = NULL;
 6577         if (dst_pmap < src_pmap) {
 6578                 PMAP_LOCK(dst_pmap);
 6579                 PMAP_LOCK(src_pmap);
 6580         } else {
 6581                 PMAP_LOCK(src_pmap);
 6582                 PMAP_LOCK(dst_pmap);
 6583         }
 6584 
 6585         PG_A = pmap_accessed_bit(dst_pmap);
 6586         PG_M = pmap_modified_bit(dst_pmap);
 6587         PG_V = pmap_valid_bit(dst_pmap);
 6588 
 6589         for (addr = src_addr; addr < end_addr; addr = va_next) {
 6590                 KASSERT(addr < UPT_MIN_ADDRESS,
 6591                     ("pmap_copy: invalid to pmap_copy page tables"));
 6592 
 6593                 pml4e = pmap_pml4e(src_pmap, addr);
 6594                 if ((*pml4e & PG_V) == 0) {
 6595                         va_next = (addr + NBPML4) & ~PML4MASK;
 6596                         if (va_next < addr)
 6597                                 va_next = end_addr;
 6598                         continue;
 6599                 }
 6600 
 6601                 pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
 6602                 if ((*pdpe & PG_V) == 0) {
 6603                         va_next = (addr + NBPDP) & ~PDPMASK;
 6604                         if (va_next < addr)
 6605                                 va_next = end_addr;
 6606                         continue;
 6607                 }
 6608 
 6609                 va_next = (addr + NBPDR) & ~PDRMASK;
 6610                 if (va_next < addr)
 6611                         va_next = end_addr;
 6612 
 6613                 pde = pmap_pdpe_to_pde(pdpe, addr);
 6614                 srcptepaddr = *pde;
 6615                 if (srcptepaddr == 0)
 6616                         continue;
 6617                         
 6618                 if (srcptepaddr & PG_PS) {
 6619                         if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
 6620                                 continue;
 6621                         dst_pdpg = pmap_allocpde(dst_pmap, addr, NULL);
 6622                         if (dst_pdpg == NULL)
 6623                                 break;
 6624                         pde = (pd_entry_t *)
 6625                             PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg));
 6626                         pde = &pde[pmap_pde_index(addr)];
 6627                         if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
 6628                             pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr,
 6629                             PMAP_ENTER_NORECLAIM, &lock))) {
 6630                                 *pde = srcptepaddr & ~PG_W;
 6631                                 pmap_resident_count_inc(dst_pmap, NBPDR /
 6632                                     PAGE_SIZE);
 6633                                 atomic_add_long(&pmap_pde_mappings, 1);
 6634                         } else
 6635                                 dst_pdpg->wire_count--;
 6636                         continue;
 6637                 }
 6638 
 6639                 srcptepaddr &= PG_FRAME;
 6640                 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
 6641                 KASSERT(srcmpte->wire_count > 0,
 6642                     ("pmap_copy: source page table page is unused"));
 6643 
 6644                 if (va_next > end_addr)
 6645                         va_next = end_addr;
 6646 
 6647                 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
 6648                 src_pte = &src_pte[pmap_pte_index(addr)];
 6649                 dstmpte = NULL;
 6650                 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
 6651                         ptetemp = *src_pte;
 6652 
 6653                         /*
 6654                          * We only virtual copy managed pages.
 6655                          */
 6656                         if ((ptetemp & PG_MANAGED) == 0)
 6657                                 continue;
 6658 
 6659                         if (dstmpte != NULL) {
 6660                                 KASSERT(dstmpte->pindex ==
 6661                                     pmap_pde_pindex(addr),
 6662                                     ("dstmpte pindex/addr mismatch"));
 6663                                 dstmpte->wire_count++;
 6664                         } else if ((dstmpte = pmap_allocpte(dst_pmap, addr,
 6665                             NULL)) == NULL)
 6666                                 goto out;
 6667                         dst_pte = (pt_entry_t *)
 6668                             PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
 6669                         dst_pte = &dst_pte[pmap_pte_index(addr)];
 6670                         if (*dst_pte == 0 &&
 6671                             pmap_try_insert_pv_entry(dst_pmap, addr,
 6672                             PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) {
 6673                                 /*
 6674                                  * Clear the wired, modified, and accessed
 6675                                  * (referenced) bits during the copy.
 6676                                  */
 6677                                 *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A);
 6678                                 pmap_resident_count_inc(dst_pmap, 1);
 6679                         } else {
 6680                                 SLIST_INIT(&free);
 6681                                 if (pmap_unwire_ptp(dst_pmap, addr, dstmpte,
 6682                                     &free)) {
 6683                                         /*
 6684                                          * Although "addr" is not mapped,
 6685                                          * paging-structure caches could
 6686                                          * nonetheless have entries that refer
 6687                                          * to the freed page table pages.
 6688                                          * Invalidate those entries.
 6689                                          */
 6690                                         pmap_invalidate_page(dst_pmap, addr);
 6691                                         vm_page_free_pages_toq(&free, true);
 6692                                 }
 6693                                 goto out;
 6694                         }
 6695                         /* Have we copied all of the valid mappings? */ 
 6696                         if (dstmpte->wire_count >= srcmpte->wire_count)
 6697                                 break;
 6698                 }
 6699         }
 6700 out:
 6701         if (lock != NULL)
 6702                 rw_wunlock(lock);
 6703         PMAP_UNLOCK(src_pmap);
 6704         PMAP_UNLOCK(dst_pmap);
 6705 }
 6706 
 6707 int
 6708 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
 6709 {
 6710         int error;
 6711 
 6712         if (dst_pmap->pm_type != src_pmap->pm_type ||
 6713             dst_pmap->pm_type != PT_X86 ||
 6714             (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
 6715                 return (0);
 6716         for (;;) {
 6717                 if (dst_pmap < src_pmap) {
 6718                         PMAP_LOCK(dst_pmap);
 6719                         PMAP_LOCK(src_pmap);
 6720                 } else {
 6721                         PMAP_LOCK(src_pmap);
 6722                         PMAP_LOCK(dst_pmap);
 6723                 }
 6724                 error = pmap_pkru_copy(dst_pmap, src_pmap);
 6725                 /* Clean up partial copy on failure due to no memory. */
 6726                 if (error == ENOMEM)
 6727                         pmap_pkru_deassign_all(dst_pmap);
 6728                 PMAP_UNLOCK(src_pmap);
 6729                 PMAP_UNLOCK(dst_pmap);
 6730                 if (error != ENOMEM)
 6731                         break;
 6732                 vm_wait(NULL);
 6733         }
 6734         return (error);
 6735 }
 6736 
 6737 /*
 6738  * Zero the specified hardware page.
 6739  */
 6740 void
 6741 pmap_zero_page(vm_page_t m)
 6742 {
 6743         vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 6744 
 6745         pagezero((void *)va);
 6746 }
 6747 
 6748 /*
 6749  * Zero an area within a single hardware page.  off and size must not
 6750  * cover an area beyond a single hardware page.
 6751  */
 6752 void
 6753 pmap_zero_page_area(vm_page_t m, int off, int size)
 6754 {
 6755         vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 6756 
 6757         if (off == 0 && size == PAGE_SIZE)
 6758                 pagezero((void *)va);
 6759         else
 6760                 bzero((char *)va + off, size);
 6761 }
 6762 
 6763 /*
 6764  * Copy 1 specified hardware page to another.
 6765  */
 6766 void
 6767 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
 6768 {
 6769         vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
 6770         vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
 6771 
 6772         pagecopy((void *)src, (void *)dst);
 6773 }
 6774 
 6775 int unmapped_buf_allowed = 1;
 6776 
 6777 void
 6778 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
 6779     vm_offset_t b_offset, int xfersize)
 6780 {
 6781         void *a_cp, *b_cp;
 6782         vm_page_t pages[2];
 6783         vm_offset_t vaddr[2], a_pg_offset, b_pg_offset;
 6784         int cnt;
 6785         boolean_t mapped;
 6786 
 6787         while (xfersize > 0) {
 6788                 a_pg_offset = a_offset & PAGE_MASK;
 6789                 pages[0] = ma[a_offset >> PAGE_SHIFT];
 6790                 b_pg_offset = b_offset & PAGE_MASK;
 6791                 pages[1] = mb[b_offset >> PAGE_SHIFT];
 6792                 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
 6793                 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
 6794                 mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE);
 6795                 a_cp = (char *)vaddr[0] + a_pg_offset;
 6796                 b_cp = (char *)vaddr[1] + b_pg_offset;
 6797                 bcopy(a_cp, b_cp, cnt);
 6798                 if (__predict_false(mapped))
 6799                         pmap_unmap_io_transient(pages, vaddr, 2, FALSE);
 6800                 a_offset += cnt;
 6801                 b_offset += cnt;
 6802                 xfersize -= cnt;
 6803         }
 6804 }
 6805 
 6806 /*
 6807  * Returns true if the pmap's pv is one of the first
 6808  * 16 pvs linked to from this page.  This count may
 6809  * be changed upwards or downwards in the future; it
 6810  * is only necessary that true be returned for a small
 6811  * subset of pmaps for proper page aging.
 6812  */
 6813 boolean_t
 6814 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 6815 {
 6816         struct md_page *pvh;
 6817         struct rwlock *lock;
 6818         pv_entry_t pv;
 6819         int loops = 0;
 6820         boolean_t rv;
 6821 
 6822         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 6823             ("pmap_page_exists_quick: page %p is not managed", m));
 6824         rv = FALSE;
 6825         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 6826         rw_rlock(lock);
 6827         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 6828                 if (PV_PMAP(pv) == pmap) {
 6829                         rv = TRUE;
 6830                         break;
 6831                 }
 6832                 loops++;
 6833                 if (loops >= 16)
 6834                         break;
 6835         }
 6836         if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
 6837                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 6838                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 6839                         if (PV_PMAP(pv) == pmap) {
 6840                                 rv = TRUE;
 6841                                 break;
 6842                         }
 6843                         loops++;
 6844                         if (loops >= 16)
 6845                                 break;
 6846                 }
 6847         }
 6848         rw_runlock(lock);
 6849         return (rv);
 6850 }
 6851 
 6852 /*
 6853  *      pmap_page_wired_mappings:
 6854  *
 6855  *      Return the number of managed mappings to the given physical page
 6856  *      that are wired.
 6857  */
 6858 int
 6859 pmap_page_wired_mappings(vm_page_t m)
 6860 {
 6861         struct rwlock *lock;
 6862         struct md_page *pvh;
 6863         pmap_t pmap;
 6864         pt_entry_t *pte;
 6865         pv_entry_t pv;
 6866         int count, md_gen, pvh_gen;
 6867 
 6868         if ((m->oflags & VPO_UNMANAGED) != 0)
 6869                 return (0);
 6870         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 6871         rw_rlock(lock);
 6872 restart:
 6873         count = 0;
 6874         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 6875                 pmap = PV_PMAP(pv);
 6876                 if (!PMAP_TRYLOCK(pmap)) {
 6877                         md_gen = m->md.pv_gen;
 6878                         rw_runlock(lock);
 6879                         PMAP_LOCK(pmap);
 6880                         rw_rlock(lock);
 6881                         if (md_gen != m->md.pv_gen) {
 6882                                 PMAP_UNLOCK(pmap);
 6883                                 goto restart;
 6884                         }
 6885                 }
 6886                 pte = pmap_pte(pmap, pv->pv_va);
 6887                 if ((*pte & PG_W) != 0)
 6888                         count++;
 6889                 PMAP_UNLOCK(pmap);
 6890         }
 6891         if ((m->flags & PG_FICTITIOUS) == 0) {
 6892                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 6893                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 6894                         pmap = PV_PMAP(pv);
 6895                         if (!PMAP_TRYLOCK(pmap)) {
 6896                                 md_gen = m->md.pv_gen;
 6897                                 pvh_gen = pvh->pv_gen;
 6898                                 rw_runlock(lock);
 6899                                 PMAP_LOCK(pmap);
 6900                                 rw_rlock(lock);
 6901                                 if (md_gen != m->md.pv_gen ||
 6902                                     pvh_gen != pvh->pv_gen) {
 6903                                         PMAP_UNLOCK(pmap);
 6904                                         goto restart;
 6905                                 }
 6906                         }
 6907                         pte = pmap_pde(pmap, pv->pv_va);
 6908                         if ((*pte & PG_W) != 0)
 6909                                 count++;
 6910                         PMAP_UNLOCK(pmap);
 6911                 }
 6912         }
 6913         rw_runlock(lock);
 6914         return (count);
 6915 }
 6916 
 6917 /*
 6918  * Returns TRUE if the given page is mapped individually or as part of
 6919  * a 2mpage.  Otherwise, returns FALSE.
 6920  */
 6921 boolean_t
 6922 pmap_page_is_mapped(vm_page_t m)
 6923 {
 6924         struct rwlock *lock;
 6925         boolean_t rv;
 6926 
 6927         if ((m->oflags & VPO_UNMANAGED) != 0)
 6928                 return (FALSE);
 6929         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 6930         rw_rlock(lock);
 6931         rv = !TAILQ_EMPTY(&m->md.pv_list) ||
 6932             ((m->flags & PG_FICTITIOUS) == 0 &&
 6933             !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
 6934         rw_runlock(lock);
 6935         return (rv);
 6936 }
 6937 
 6938 /*
 6939  * Destroy all managed, non-wired mappings in the given user-space
 6940  * pmap.  This pmap cannot be active on any processor besides the
 6941  * caller.
 6942  *
 6943  * This function cannot be applied to the kernel pmap.  Moreover, it
 6944  * is not intended for general use.  It is only to be used during
 6945  * process termination.  Consequently, it can be implemented in ways
 6946  * that make it faster than pmap_remove().  First, it can more quickly
 6947  * destroy mappings by iterating over the pmap's collection of PV
 6948  * entries, rather than searching the page table.  Second, it doesn't
 6949  * have to test and clear the page table entries atomically, because
 6950  * no processor is currently accessing the user address space.  In
 6951  * particular, a page table entry's dirty bit won't change state once
 6952  * this function starts.
 6953  *
 6954  * Although this function destroys all of the pmap's managed,
 6955  * non-wired mappings, it can delay and batch the invalidation of TLB
 6956  * entries without calling pmap_delayed_invl_start() and
 6957  * pmap_delayed_invl_finish().  Because the pmap is not active on
 6958  * any other processor, none of these TLB entries will ever be used
 6959  * before their eventual invalidation.  Consequently, there is no need
 6960  * for either pmap_remove_all() or pmap_remove_write() to wait for
 6961  * that eventual TLB invalidation.
 6962  */
 6963 void
 6964 pmap_remove_pages(pmap_t pmap)
 6965 {
 6966         pd_entry_t ptepde;
 6967         pt_entry_t *pte, tpte;
 6968         pt_entry_t PG_M, PG_RW, PG_V;
 6969         struct spglist free;
 6970         struct pv_chunklist free_chunks;
 6971         vm_page_t m, mpte, mt;
 6972         pv_entry_t pv;
 6973         struct md_page *pvh;
 6974         struct pv_chunk *pc, *npc;
 6975         struct rwlock *lock;
 6976         int64_t bit;
 6977         uint64_t inuse, bitmask;
 6978         int allfree, field, idx;
 6979 #ifdef PV_STATS
 6980         int freed;
 6981 #endif
 6982         boolean_t superpage;
 6983         vm_paddr_t pa;
 6984 
 6985         /*
 6986          * Assert that the given pmap is only active on the current
 6987          * CPU.  Unfortunately, we cannot block another CPU from
 6988          * activating the pmap while this function is executing.
 6989          */
 6990         KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
 6991 #ifdef INVARIANTS
 6992         {
 6993                 cpuset_t other_cpus;
 6994 
 6995                 other_cpus = all_cpus;
 6996                 critical_enter();
 6997                 CPU_CLR(PCPU_GET(cpuid), &other_cpus);
 6998                 CPU_AND(&other_cpus, &pmap->pm_active);
 6999                 critical_exit();
 7000                 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
 7001         }
 7002 #endif
 7003 
 7004         lock = NULL;
 7005         PG_M = pmap_modified_bit(pmap);
 7006         PG_V = pmap_valid_bit(pmap);
 7007         PG_RW = pmap_rw_bit(pmap);
 7008 
 7009         TAILQ_INIT(&free_chunks);
 7010         SLIST_INIT(&free);
 7011         PMAP_LOCK(pmap);
 7012         TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 7013                 allfree = 1;
 7014 #ifdef PV_STATS
 7015                 freed = 0;
 7016 #endif
 7017                 for (field = 0; field < _NPCM; field++) {
 7018                         inuse = ~pc->pc_map[field] & pc_freemask[field];
 7019                         while (inuse != 0) {
 7020                                 bit = bsfq(inuse);
 7021                                 bitmask = 1UL << bit;
 7022                                 idx = field * 64 + bit;
 7023                                 pv = &pc->pc_pventry[idx];
 7024                                 inuse &= ~bitmask;
 7025 
 7026                                 pte = pmap_pdpe(pmap, pv->pv_va);
 7027                                 ptepde = *pte;
 7028                                 pte = pmap_pdpe_to_pde(pte, pv->pv_va);
 7029                                 tpte = *pte;
 7030                                 if ((tpte & (PG_PS | PG_V)) == PG_V) {
 7031                                         superpage = FALSE;
 7032                                         ptepde = tpte;
 7033                                         pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
 7034                                             PG_FRAME);
 7035                                         pte = &pte[pmap_pte_index(pv->pv_va)];
 7036                                         tpte = *pte;
 7037                                 } else {
 7038                                         /*
 7039                                          * Keep track whether 'tpte' is a
 7040                                          * superpage explicitly instead of
 7041                                          * relying on PG_PS being set.
 7042                                          *
 7043                                          * This is because PG_PS is numerically
 7044                                          * identical to PG_PTE_PAT and thus a
 7045                                          * regular page could be mistaken for
 7046                                          * a superpage.
 7047                                          */
 7048                                         superpage = TRUE;
 7049                                 }
 7050 
 7051                                 if ((tpte & PG_V) == 0) {
 7052                                         panic("bad pte va %lx pte %lx",
 7053                                             pv->pv_va, tpte);
 7054                                 }
 7055 
 7056 /*
 7057  * We cannot remove wired pages from a process' mapping at this time
 7058  */
 7059                                 if (tpte & PG_W) {
 7060                                         allfree = 0;
 7061                                         continue;
 7062                                 }
 7063 
 7064                                 if (superpage)
 7065                                         pa = tpte & PG_PS_FRAME;
 7066                                 else
 7067                                         pa = tpte & PG_FRAME;
 7068 
 7069                                 m = PHYS_TO_VM_PAGE(pa);
 7070                                 KASSERT(m->phys_addr == pa,
 7071                                     ("vm_page_t %p phys_addr mismatch %016jx %016jx",
 7072                                     m, (uintmax_t)m->phys_addr,
 7073                                     (uintmax_t)tpte));
 7074 
 7075                                 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
 7076                                     m < &vm_page_array[vm_page_array_size],
 7077                                     ("pmap_remove_pages: bad tpte %#jx",
 7078                                     (uintmax_t)tpte));
 7079 
 7080                                 pte_clear(pte);
 7081 
 7082                                 /*
 7083                                  * Update the vm_page_t clean/reference bits.
 7084                                  */
 7085                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 7086                                         if (superpage) {
 7087                                                 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 7088                                                         vm_page_dirty(mt);
 7089                                         } else
 7090                                                 vm_page_dirty(m);
 7091                                 }
 7092 
 7093                                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 7094 
 7095                                 /* Mark free */
 7096                                 pc->pc_map[field] |= bitmask;
 7097                                 if (superpage) {
 7098                                         pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
 7099                                         pvh = pa_to_pvh(tpte & PG_PS_FRAME);
 7100                                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 7101                                         pvh->pv_gen++;
 7102                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
 7103                                                 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 7104                                                         if ((mt->aflags & PGA_WRITEABLE) != 0 &&
 7105                                                             TAILQ_EMPTY(&mt->md.pv_list))
 7106                                                                 vm_page_aflag_clear(mt, PGA_WRITEABLE);
 7107                                         }
 7108                                         mpte = pmap_remove_pt_page(pmap, pv->pv_va);
 7109                                         if (mpte != NULL) {
 7110                                                 KASSERT(mpte->valid == VM_PAGE_BITS_ALL,
 7111                                                     ("pmap_remove_pages: pte page not promoted"));
 7112                                                 pmap_resident_count_dec(pmap, 1);
 7113                                                 KASSERT(mpte->wire_count == NPTEPG,
 7114                                                     ("pmap_remove_pages: pte page wire count error"));
 7115                                                 mpte->wire_count = 0;
 7116                                                 pmap_add_delayed_free_list(mpte, &free, FALSE);
 7117                                         }
 7118                                 } else {
 7119                                         pmap_resident_count_dec(pmap, 1);
 7120                                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 7121                                         m->md.pv_gen++;
 7122                                         if ((m->aflags & PGA_WRITEABLE) != 0 &&
 7123                                             TAILQ_EMPTY(&m->md.pv_list) &&
 7124                                             (m->flags & PG_FICTITIOUS) == 0) {
 7125                                                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 7126                                                 if (TAILQ_EMPTY(&pvh->pv_list))
 7127                                                         vm_page_aflag_clear(m, PGA_WRITEABLE);
 7128                                         }
 7129                                 }
 7130                                 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
 7131 #ifdef PV_STATS
 7132                                 freed++;
 7133 #endif
 7134                         }
 7135                 }
 7136                 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
 7137                 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
 7138                 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 7139                 if (allfree) {
 7140                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 7141                         TAILQ_INSERT_TAIL(&free_chunks, pc, pc_list);
 7142                 }
 7143         }
 7144         if (lock != NULL)
 7145                 rw_wunlock(lock);
 7146         pmap_invalidate_all(pmap);
 7147         pmap_pkru_deassign_all(pmap);
 7148         free_pv_chunk_batch(&free_chunks);
 7149         PMAP_UNLOCK(pmap);
 7150         vm_page_free_pages_toq(&free, true);
 7151 }
 7152 
 7153 static boolean_t
 7154 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
 7155 {
 7156         struct rwlock *lock;
 7157         pv_entry_t pv;
 7158         struct md_page *pvh;
 7159         pt_entry_t *pte, mask;
 7160         pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 7161         pmap_t pmap;
 7162         int md_gen, pvh_gen;
 7163         boolean_t rv;
 7164 
 7165         rv = FALSE;
 7166         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 7167         rw_rlock(lock);
 7168 restart:
 7169         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 7170                 pmap = PV_PMAP(pv);
 7171                 if (!PMAP_TRYLOCK(pmap)) {
 7172                         md_gen = m->md.pv_gen;
 7173                         rw_runlock(lock);
 7174                         PMAP_LOCK(pmap);
 7175                         rw_rlock(lock);
 7176                         if (md_gen != m->md.pv_gen) {
 7177                                 PMAP_UNLOCK(pmap);
 7178                                 goto restart;
 7179                         }
 7180                 }
 7181                 pte = pmap_pte(pmap, pv->pv_va);
 7182                 mask = 0;
 7183                 if (modified) {
 7184                         PG_M = pmap_modified_bit(pmap);
 7185                         PG_RW = pmap_rw_bit(pmap);
 7186                         mask |= PG_RW | PG_M;
 7187                 }
 7188                 if (accessed) {
 7189                         PG_A = pmap_accessed_bit(pmap);
 7190                         PG_V = pmap_valid_bit(pmap);
 7191                         mask |= PG_V | PG_A;
 7192                 }
 7193                 rv = (*pte & mask) == mask;
 7194                 PMAP_UNLOCK(pmap);
 7195                 if (rv)
 7196                         goto out;
 7197         }
 7198         if ((m->flags & PG_FICTITIOUS) == 0) {
 7199                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 7200                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 7201                         pmap = PV_PMAP(pv);
 7202                         if (!PMAP_TRYLOCK(pmap)) {
 7203                                 md_gen = m->md.pv_gen;
 7204                                 pvh_gen = pvh->pv_gen;
 7205                                 rw_runlock(lock);
 7206                                 PMAP_LOCK(pmap);
 7207                                 rw_rlock(lock);
 7208                                 if (md_gen != m->md.pv_gen ||
 7209                                     pvh_gen != pvh->pv_gen) {
 7210                                         PMAP_UNLOCK(pmap);
 7211                                         goto restart;
 7212                                 }
 7213                         }
 7214                         pte = pmap_pde(pmap, pv->pv_va);
 7215                         mask = 0;
 7216                         if (modified) {
 7217                                 PG_M = pmap_modified_bit(pmap);
 7218                                 PG_RW = pmap_rw_bit(pmap);
 7219                                 mask |= PG_RW | PG_M;
 7220                         }
 7221                         if (accessed) {
 7222                                 PG_A = pmap_accessed_bit(pmap);
 7223                                 PG_V = pmap_valid_bit(pmap);
 7224                                 mask |= PG_V | PG_A;
 7225                         }
 7226                         rv = (*pte & mask) == mask;
 7227                         PMAP_UNLOCK(pmap);
 7228                         if (rv)
 7229                                 goto out;
 7230                 }
 7231         }
 7232 out:
 7233         rw_runlock(lock);
 7234         return (rv);
 7235 }
 7236 
 7237 /*
 7238  *      pmap_is_modified:
 7239  *
 7240  *      Return whether or not the specified physical page was modified
 7241  *      in any physical maps.
 7242  */
 7243 boolean_t
 7244 pmap_is_modified(vm_page_t m)
 7245 {
 7246 
 7247         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 7248             ("pmap_is_modified: page %p is not managed", m));
 7249 
 7250         /*
 7251          * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
 7252          * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
 7253          * is clear, no PTEs can have PG_M set.
 7254          */
 7255         VM_OBJECT_ASSERT_WLOCKED(m->object);
 7256         if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
 7257                 return (FALSE);
 7258         return (pmap_page_test_mappings(m, FALSE, TRUE));
 7259 }
 7260 
 7261 /*
 7262  *      pmap_is_prefaultable:
 7263  *
 7264  *      Return whether or not the specified virtual address is eligible
 7265  *      for prefault.
 7266  */
 7267 boolean_t
 7268 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 7269 {
 7270         pd_entry_t *pde;
 7271         pt_entry_t *pte, PG_V;
 7272         boolean_t rv;
 7273 
 7274         PG_V = pmap_valid_bit(pmap);
 7275         rv = FALSE;
 7276         PMAP_LOCK(pmap);
 7277         pde = pmap_pde(pmap, addr);
 7278         if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
 7279                 pte = pmap_pde_to_pte(pde, addr);
 7280                 rv = (*pte & PG_V) == 0;
 7281         }
 7282         PMAP_UNLOCK(pmap);
 7283         return (rv);
 7284 }
 7285 
 7286 /*
 7287  *      pmap_is_referenced:
 7288  *
 7289  *      Return whether or not the specified physical page was referenced
 7290  *      in any physical maps.
 7291  */
 7292 boolean_t
 7293 pmap_is_referenced(vm_page_t m)
 7294 {
 7295 
 7296         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 7297             ("pmap_is_referenced: page %p is not managed", m));
 7298         return (pmap_page_test_mappings(m, TRUE, FALSE));
 7299 }
 7300 
 7301 /*
 7302  * Clear the write and modified bits in each of the given page's mappings.
 7303  */
 7304 void
 7305 pmap_remove_write(vm_page_t m)
 7306 {
 7307         struct md_page *pvh;
 7308         pmap_t pmap;
 7309         struct rwlock *lock;
 7310         pv_entry_t next_pv, pv;
 7311         pd_entry_t *pde;
 7312         pt_entry_t oldpte, *pte, PG_M, PG_RW;
 7313         vm_offset_t va;
 7314         int pvh_gen, md_gen;
 7315 
 7316         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 7317             ("pmap_remove_write: page %p is not managed", m));
 7318 
 7319         /*
 7320          * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
 7321          * set by another thread while the object is locked.  Thus,
 7322          * if PGA_WRITEABLE is clear, no page table entries need updating.
 7323          */
 7324         VM_OBJECT_ASSERT_WLOCKED(m->object);
 7325         if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
 7326                 return;
 7327         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 7328         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 7329             pa_to_pvh(VM_PAGE_TO_PHYS(m));
 7330 retry_pv_loop:
 7331         rw_wlock(lock);
 7332         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 7333                 pmap = PV_PMAP(pv);
 7334                 if (!PMAP_TRYLOCK(pmap)) {
 7335                         pvh_gen = pvh->pv_gen;
 7336                         rw_wunlock(lock);
 7337                         PMAP_LOCK(pmap);
 7338                         rw_wlock(lock);
 7339                         if (pvh_gen != pvh->pv_gen) {
 7340                                 PMAP_UNLOCK(pmap);
 7341                                 rw_wunlock(lock);
 7342                                 goto retry_pv_loop;
 7343                         }
 7344                 }
 7345                 PG_RW = pmap_rw_bit(pmap);
 7346                 va = pv->pv_va;
 7347                 pde = pmap_pde(pmap, va);
 7348                 if ((*pde & PG_RW) != 0)
 7349                         (void)pmap_demote_pde_locked(pmap, pde, va, &lock);
 7350                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 7351                     ("inconsistent pv lock %p %p for page %p",
 7352                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 7353                 PMAP_UNLOCK(pmap);
 7354         }
 7355         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 7356                 pmap = PV_PMAP(pv);
 7357                 if (!PMAP_TRYLOCK(pmap)) {
 7358                         pvh_gen = pvh->pv_gen;
 7359                         md_gen = m->md.pv_gen;
 7360                         rw_wunlock(lock);
 7361                         PMAP_LOCK(pmap);
 7362                         rw_wlock(lock);
 7363                         if (pvh_gen != pvh->pv_gen ||
 7364                             md_gen != m->md.pv_gen) {
 7365                                 PMAP_UNLOCK(pmap);
 7366                                 rw_wunlock(lock);
 7367                                 goto retry_pv_loop;
 7368                         }
 7369                 }
 7370                 PG_M = pmap_modified_bit(pmap);
 7371                 PG_RW = pmap_rw_bit(pmap);
 7372                 pde = pmap_pde(pmap, pv->pv_va);
 7373                 KASSERT((*pde & PG_PS) == 0,
 7374                     ("pmap_remove_write: found a 2mpage in page %p's pv list",
 7375                     m));
 7376                 pte = pmap_pde_to_pte(pde, pv->pv_va);
 7377 retry:
 7378                 oldpte = *pte;
 7379                 if (oldpte & PG_RW) {
 7380                         if (!atomic_cmpset_long(pte, oldpte, oldpte &
 7381                             ~(PG_RW | PG_M)))
 7382                                 goto retry;
 7383                         if ((oldpte & PG_M) != 0)
 7384                                 vm_page_dirty(m);
 7385                         pmap_invalidate_page(pmap, pv->pv_va);
 7386                 }
 7387                 PMAP_UNLOCK(pmap);
 7388         }
 7389         rw_wunlock(lock);
 7390         vm_page_aflag_clear(m, PGA_WRITEABLE);
 7391         pmap_delayed_invl_wait(m);
 7392 }
 7393 
 7394 static __inline boolean_t
 7395 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
 7396 {
 7397 
 7398         if (!pmap_emulate_ad_bits(pmap))
 7399                 return (TRUE);
 7400 
 7401         KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
 7402 
 7403         /*
 7404          * XWR = 010 or 110 will cause an unconditional EPT misconfiguration
 7405          * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared
 7406          * if the EPT_PG_WRITE bit is set.
 7407          */
 7408         if ((pte & EPT_PG_WRITE) != 0)
 7409                 return (FALSE);
 7410 
 7411         /*
 7412          * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set.
 7413          */
 7414         if ((pte & EPT_PG_EXECUTE) == 0 ||
 7415             ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
 7416                 return (TRUE);
 7417         else
 7418                 return (FALSE);
 7419 }
 7420 
 7421 /*
 7422  *      pmap_ts_referenced:
 7423  *
 7424  *      Return a count of reference bits for a page, clearing those bits.
 7425  *      It is not necessary for every reference bit to be cleared, but it
 7426  *      is necessary that 0 only be returned when there are truly no
 7427  *      reference bits set.
 7428  *
 7429  *      As an optimization, update the page's dirty field if a modified bit is
 7430  *      found while counting reference bits.  This opportunistic update can be
 7431  *      performed at low cost and can eliminate the need for some future calls
 7432  *      to pmap_is_modified().  However, since this function stops after
 7433  *      finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
 7434  *      dirty pages.  Those dirty pages will only be detected by a future call
 7435  *      to pmap_is_modified().
 7436  *
 7437  *      A DI block is not needed within this function, because
 7438  *      invalidations are performed before the PV list lock is
 7439  *      released.
 7440  */
 7441 int
 7442 pmap_ts_referenced(vm_page_t m)
 7443 {
 7444         struct md_page *pvh;
 7445         pv_entry_t pv, pvf;
 7446         pmap_t pmap;
 7447         struct rwlock *lock;
 7448         pd_entry_t oldpde, *pde;
 7449         pt_entry_t *pte, PG_A, PG_M, PG_RW;
 7450         vm_offset_t va;
 7451         vm_paddr_t pa;
 7452         int cleared, md_gen, not_cleared, pvh_gen;
 7453         struct spglist free;
 7454         boolean_t demoted;
 7455 
 7456         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 7457             ("pmap_ts_referenced: page %p is not managed", m));
 7458         SLIST_INIT(&free);
 7459         cleared = 0;
 7460         pa = VM_PAGE_TO_PHYS(m);
 7461         lock = PHYS_TO_PV_LIST_LOCK(pa);
 7462         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
 7463         rw_wlock(lock);
 7464 retry:
 7465         not_cleared = 0;
 7466         if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
 7467                 goto small_mappings;
 7468         pv = pvf;
 7469         do {
 7470                 if (pvf == NULL)
 7471                         pvf = pv;
 7472                 pmap = PV_PMAP(pv);
 7473                 if (!PMAP_TRYLOCK(pmap)) {
 7474                         pvh_gen = pvh->pv_gen;
 7475                         rw_wunlock(lock);
 7476                         PMAP_LOCK(pmap);
 7477                         rw_wlock(lock);
 7478                         if (pvh_gen != pvh->pv_gen) {
 7479                                 PMAP_UNLOCK(pmap);
 7480                                 goto retry;
 7481                         }
 7482                 }
 7483                 PG_A = pmap_accessed_bit(pmap);
 7484                 PG_M = pmap_modified_bit(pmap);
 7485                 PG_RW = pmap_rw_bit(pmap);
 7486                 va = pv->pv_va;
 7487                 pde = pmap_pde(pmap, pv->pv_va);
 7488                 oldpde = *pde;
 7489                 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 7490                         /*
 7491                          * Although "oldpde" is mapping a 2MB page, because
 7492                          * this function is called at a 4KB page granularity,
 7493                          * we only update the 4KB page under test.
 7494                          */
 7495                         vm_page_dirty(m);
 7496                 }
 7497                 if ((oldpde & PG_A) != 0) {
 7498                         /*
 7499                          * Since this reference bit is shared by 512 4KB
 7500                          * pages, it should not be cleared every time it is
 7501                          * tested.  Apply a simple "hash" function on the
 7502                          * physical page number, the virtual superpage number,
 7503                          * and the pmap address to select one 4KB page out of
 7504                          * the 512 on which testing the reference bit will
 7505                          * result in clearing that reference bit.  This
 7506                          * function is designed to avoid the selection of the
 7507                          * same 4KB page for every 2MB page mapping.
 7508                          *
 7509                          * On demotion, a mapping that hasn't been referenced
 7510                          * is simply destroyed.  To avoid the possibility of a
 7511                          * subsequent page fault on a demoted wired mapping,
 7512                          * always leave its reference bit set.  Moreover,
 7513                          * since the superpage is wired, the current state of
 7514                          * its reference bit won't affect page replacement.
 7515                          */
 7516                         if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
 7517                             (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
 7518                             (oldpde & PG_W) == 0) {
 7519                                 if (safe_to_clear_referenced(pmap, oldpde)) {
 7520                                         atomic_clear_long(pde, PG_A);
 7521                                         pmap_invalidate_page(pmap, pv->pv_va);
 7522                                         demoted = FALSE;
 7523                                 } else if (pmap_demote_pde_locked(pmap, pde,
 7524                                     pv->pv_va, &lock)) {
 7525                                         /*
 7526                                          * Remove the mapping to a single page
 7527                                          * so that a subsequent access may
 7528                                          * repromote.  Since the underlying
 7529                                          * page table page is fully populated,
 7530                                          * this removal never frees a page
 7531                                          * table page.
 7532                                          */
 7533                                         demoted = TRUE;
 7534                                         va += VM_PAGE_TO_PHYS(m) - (oldpde &
 7535                                             PG_PS_FRAME);
 7536                                         pte = pmap_pde_to_pte(pde, va);
 7537                                         pmap_remove_pte(pmap, pte, va, *pde,
 7538                                             NULL, &lock);
 7539                                         pmap_invalidate_page(pmap, va);
 7540                                 } else
 7541                                         demoted = TRUE;
 7542 
 7543                                 if (demoted) {
 7544                                         /*
 7545                                          * The superpage mapping was removed
 7546                                          * entirely and therefore 'pv' is no
 7547                                          * longer valid.
 7548                                          */
 7549                                         if (pvf == pv)
 7550                                                 pvf = NULL;
 7551                                         pv = NULL;
 7552                                 }
 7553                                 cleared++;
 7554                                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 7555                                     ("inconsistent pv lock %p %p for page %p",
 7556                                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 7557                         } else
 7558                                 not_cleared++;
 7559                 }
 7560                 PMAP_UNLOCK(pmap);
 7561                 /* Rotate the PV list if it has more than one entry. */
 7562                 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
 7563                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 7564                         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 7565                         pvh->pv_gen++;
 7566                 }
 7567                 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
 7568                         goto out;
 7569         } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
 7570 small_mappings:
 7571         if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
 7572                 goto out;
 7573         pv = pvf;
 7574         do {
 7575                 if (pvf == NULL)
 7576                         pvf = pv;
 7577                 pmap = PV_PMAP(pv);
 7578                 if (!PMAP_TRYLOCK(pmap)) {
 7579                         pvh_gen = pvh->pv_gen;
 7580                         md_gen = m->md.pv_gen;
 7581                         rw_wunlock(lock);
 7582                         PMAP_LOCK(pmap);
 7583                         rw_wlock(lock);
 7584                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 7585                                 PMAP_UNLOCK(pmap);
 7586                                 goto retry;
 7587                         }
 7588                 }
 7589                 PG_A = pmap_accessed_bit(pmap);
 7590                 PG_M = pmap_modified_bit(pmap);
 7591                 PG_RW = pmap_rw_bit(pmap);
 7592                 pde = pmap_pde(pmap, pv->pv_va);
 7593                 KASSERT((*pde & PG_PS) == 0,
 7594                     ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
 7595                     m));
 7596                 pte = pmap_pde_to_pte(pde, pv->pv_va);
 7597                 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 7598                         vm_page_dirty(m);
 7599                 if ((*pte & PG_A) != 0) {
 7600                         if (safe_to_clear_referenced(pmap, *pte)) {
 7601                                 atomic_clear_long(pte, PG_A);
 7602                                 pmap_invalidate_page(pmap, pv->pv_va);
 7603                                 cleared++;
 7604                         } else if ((*pte & PG_W) == 0) {
 7605                                 /*
 7606                                  * Wired pages cannot be paged out so
 7607                                  * doing accessed bit emulation for
 7608                                  * them is wasted effort. We do the
 7609                                  * hard work for unwired pages only.
 7610                                  */
 7611                                 pmap_remove_pte(pmap, pte, pv->pv_va,
 7612                                     *pde, &free, &lock);
 7613                                 pmap_invalidate_page(pmap, pv->pv_va);
 7614                                 cleared++;
 7615                                 if (pvf == pv)
 7616                                         pvf = NULL;
 7617                                 pv = NULL;
 7618                                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 7619                                     ("inconsistent pv lock %p %p for page %p",
 7620                                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 7621                         } else
 7622                                 not_cleared++;
 7623                 }
 7624                 PMAP_UNLOCK(pmap);
 7625                 /* Rotate the PV list if it has more than one entry. */
 7626                 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
 7627                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 7628                         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 7629                         m->md.pv_gen++;
 7630                 }
 7631         } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
 7632             not_cleared < PMAP_TS_REFERENCED_MAX);
 7633 out:
 7634         rw_wunlock(lock);
 7635         vm_page_free_pages_toq(&free, true);
 7636         return (cleared + not_cleared);
 7637 }
 7638 
 7639 /*
 7640  *      Apply the given advice to the specified range of addresses within the
 7641  *      given pmap.  Depending on the advice, clear the referenced and/or
 7642  *      modified flags in each mapping and set the mapped page's dirty field.
 7643  */
 7644 void
 7645 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
 7646 {
 7647         struct rwlock *lock;
 7648         pml4_entry_t *pml4e;
 7649         pdp_entry_t *pdpe;
 7650         pd_entry_t oldpde, *pde;
 7651         pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
 7652         vm_offset_t va, va_next;
 7653         vm_page_t m;
 7654         bool anychanged;
 7655 
 7656         if (advice != MADV_DONTNEED && advice != MADV_FREE)
 7657                 return;
 7658 
 7659         /*
 7660          * A/D bit emulation requires an alternate code path when clearing
 7661          * the modified and accessed bits below. Since this function is
 7662          * advisory in nature we skip it entirely for pmaps that require
 7663          * A/D bit emulation.
 7664          */
 7665         if (pmap_emulate_ad_bits(pmap))
 7666                 return;
 7667 
 7668         PG_A = pmap_accessed_bit(pmap);
 7669         PG_G = pmap_global_bit(pmap);
 7670         PG_M = pmap_modified_bit(pmap);
 7671         PG_V = pmap_valid_bit(pmap);
 7672         PG_RW = pmap_rw_bit(pmap);
 7673         anychanged = false;
 7674         pmap_delayed_invl_start();
 7675         PMAP_LOCK(pmap);
 7676         for (; sva < eva; sva = va_next) {
 7677                 pml4e = pmap_pml4e(pmap, sva);
 7678                 if ((*pml4e & PG_V) == 0) {
 7679                         va_next = (sva + NBPML4) & ~PML4MASK;
 7680                         if (va_next < sva)
 7681                                 va_next = eva;
 7682                         continue;
 7683                 }
 7684                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 7685                 if ((*pdpe & PG_V) == 0) {
 7686                         va_next = (sva + NBPDP) & ~PDPMASK;
 7687                         if (va_next < sva)
 7688                                 va_next = eva;
 7689                         continue;
 7690                 }
 7691                 va_next = (sva + NBPDR) & ~PDRMASK;
 7692                 if (va_next < sva)
 7693                         va_next = eva;
 7694                 pde = pmap_pdpe_to_pde(pdpe, sva);
 7695                 oldpde = *pde;
 7696                 if ((oldpde & PG_V) == 0)
 7697                         continue;
 7698                 else if ((oldpde & PG_PS) != 0) {
 7699                         if ((oldpde & PG_MANAGED) == 0)
 7700                                 continue;
 7701                         lock = NULL;
 7702                         if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
 7703                                 if (lock != NULL)
 7704                                         rw_wunlock(lock);
 7705 
 7706                                 /*
 7707                                  * The large page mapping was destroyed.
 7708                                  */
 7709                                 continue;
 7710                         }
 7711 
 7712                         /*
 7713                          * Unless the page mappings are wired, remove the
 7714                          * mapping to a single page so that a subsequent
 7715                          * access may repromote.  Choosing the last page
 7716                          * within the address range [sva, min(va_next, eva))
 7717                          * generally results in more repromotions.  Since the
 7718                          * underlying page table page is fully populated, this
 7719                          * removal never frees a page table page.
 7720                          */
 7721                         if ((oldpde & PG_W) == 0) {
 7722                                 va = eva;
 7723                                 if (va > va_next)
 7724                                         va = va_next;
 7725                                 va -= PAGE_SIZE;
 7726                                 KASSERT(va >= sva,
 7727                                     ("pmap_advise: no address gap"));
 7728                                 pte = pmap_pde_to_pte(pde, va);
 7729                                 KASSERT((*pte & PG_V) != 0,
 7730                                     ("pmap_advise: invalid PTE"));
 7731                                 pmap_remove_pte(pmap, pte, va, *pde, NULL,
 7732                                     &lock);
 7733                                 anychanged = true;
 7734                         }
 7735                         if (lock != NULL)
 7736                                 rw_wunlock(lock);
 7737                 }
 7738                 if (va_next > eva)
 7739                         va_next = eva;
 7740                 va = va_next;
 7741                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 7742                     sva += PAGE_SIZE) {
 7743                         if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
 7744                                 goto maybe_invlrng;
 7745                         else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 7746                                 if (advice == MADV_DONTNEED) {
 7747                                         /*
 7748                                          * Future calls to pmap_is_modified()
 7749                                          * can be avoided by making the page
 7750                                          * dirty now.
 7751                                          */
 7752                                         m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
 7753                                         vm_page_dirty(m);
 7754                                 }
 7755                                 atomic_clear_long(pte, PG_M | PG_A);
 7756                         } else if ((*pte & PG_A) != 0)
 7757                                 atomic_clear_long(pte, PG_A);
 7758                         else
 7759                                 goto maybe_invlrng;
 7760 
 7761                         if ((*pte & PG_G) != 0) {
 7762                                 if (va == va_next)
 7763                                         va = sva;
 7764                         } else
 7765                                 anychanged = true;
 7766                         continue;
 7767 maybe_invlrng:
 7768                         if (va != va_next) {
 7769                                 pmap_invalidate_range(pmap, va, sva);
 7770                                 va = va_next;
 7771                         }
 7772                 }
 7773                 if (va != va_next)
 7774                         pmap_invalidate_range(pmap, va, sva);
 7775         }
 7776         if (anychanged)
 7777                 pmap_invalidate_all(pmap);
 7778         PMAP_UNLOCK(pmap);
 7779         pmap_delayed_invl_finish();
 7780 }
 7781 
 7782 /*
 7783  *      Clear the modify bits on the specified physical page.
 7784  */
 7785 void
 7786 pmap_clear_modify(vm_page_t m)
 7787 {
 7788         struct md_page *pvh;
 7789         pmap_t pmap;
 7790         pv_entry_t next_pv, pv;
 7791         pd_entry_t oldpde, *pde;
 7792         pt_entry_t *pte, PG_M, PG_RW;
 7793         struct rwlock *lock;
 7794         vm_offset_t va;
 7795         int md_gen, pvh_gen;
 7796 
 7797         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 7798             ("pmap_clear_modify: page %p is not managed", m));
 7799         VM_OBJECT_ASSERT_WLOCKED(m->object);
 7800         KASSERT(!vm_page_xbusied(m),
 7801             ("pmap_clear_modify: page %p is exclusive busied", m));
 7802 
 7803         /*
 7804          * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
 7805          * If the object containing the page is locked and the page is not
 7806          * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
 7807          */
 7808         if ((m->aflags & PGA_WRITEABLE) == 0)
 7809                 return;
 7810         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 7811             pa_to_pvh(VM_PAGE_TO_PHYS(m));
 7812         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 7813         rw_wlock(lock);
 7814 restart:
 7815         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 7816                 pmap = PV_PMAP(pv);
 7817                 if (!PMAP_TRYLOCK(pmap)) {
 7818                         pvh_gen = pvh->pv_gen;
 7819                         rw_wunlock(lock);
 7820                         PMAP_LOCK(pmap);
 7821                         rw_wlock(lock);
 7822                         if (pvh_gen != pvh->pv_gen) {
 7823                                 PMAP_UNLOCK(pmap);
 7824                                 goto restart;
 7825                         }
 7826                 }
 7827                 PG_M = pmap_modified_bit(pmap);
 7828                 PG_RW = pmap_rw_bit(pmap);
 7829                 va = pv->pv_va;
 7830                 pde = pmap_pde(pmap, va);
 7831                 oldpde = *pde;
 7832                 /* If oldpde has PG_RW set, then it also has PG_M set. */
 7833                 if ((oldpde & PG_RW) != 0 &&
 7834                     pmap_demote_pde_locked(pmap, pde, va, &lock) &&
 7835                     (oldpde & PG_W) == 0) {
 7836                         /*
 7837                          * Write protect the mapping to a single page so that
 7838                          * a subsequent write access may repromote.
 7839                          */
 7840                         va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME);
 7841                         pte = pmap_pde_to_pte(pde, va);
 7842                         atomic_clear_long(pte, PG_M | PG_RW);
 7843                         vm_page_dirty(m);
 7844                         pmap_invalidate_page(pmap, va);
 7845                 }
 7846                 PMAP_UNLOCK(pmap);
 7847         }
 7848         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 7849                 pmap = PV_PMAP(pv);
 7850                 if (!PMAP_TRYLOCK(pmap)) {
 7851                         md_gen = m->md.pv_gen;
 7852                         pvh_gen = pvh->pv_gen;
 7853                         rw_wunlock(lock);
 7854                         PMAP_LOCK(pmap);
 7855                         rw_wlock(lock);
 7856                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 7857                                 PMAP_UNLOCK(pmap);
 7858                                 goto restart;
 7859                         }
 7860                 }
 7861                 PG_M = pmap_modified_bit(pmap);
 7862                 PG_RW = pmap_rw_bit(pmap);
 7863                 pde = pmap_pde(pmap, pv->pv_va);
 7864                 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
 7865                     " a 2mpage in page %p's pv list", m));
 7866                 pte = pmap_pde_to_pte(pde, pv->pv_va);
 7867                 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 7868                         atomic_clear_long(pte, PG_M);
 7869                         pmap_invalidate_page(pmap, pv->pv_va);
 7870                 }
 7871                 PMAP_UNLOCK(pmap);
 7872         }
 7873         rw_wunlock(lock);
 7874 }
 7875 
 7876 /*
 7877  * Miscellaneous support routines follow
 7878  */
 7879 
 7880 /* Adjust the properties for a leaf page table entry. */
 7881 static __inline void
 7882 pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask)
 7883 {
 7884         u_long opte, npte;
 7885 
 7886         opte = *(u_long *)pte;
 7887         do {
 7888                 npte = opte & ~mask;
 7889                 npte |= bits;
 7890         } while (npte != opte && !atomic_fcmpset_long((u_long *)pte, &opte,
 7891             npte));
 7892 }
 7893 
 7894 /*
 7895  * Map a set of physical memory pages into the kernel virtual
 7896  * address space. Return a pointer to where it is mapped. This
 7897  * routine is intended to be used for mapping device memory,
 7898  * NOT real memory.
 7899  */
 7900 static void *
 7901 pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags)
 7902 {
 7903         struct pmap_preinit_mapping *ppim;
 7904         vm_offset_t va, offset;
 7905         vm_size_t tmpsize;
 7906         int i;
 7907 
 7908         offset = pa & PAGE_MASK;
 7909         size = round_page(offset + size);
 7910         pa = trunc_page(pa);
 7911 
 7912         if (!pmap_initialized) {
 7913                 va = 0;
 7914                 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 7915                         ppim = pmap_preinit_mapping + i;
 7916                         if (ppim->va == 0) {
 7917                                 ppim->pa = pa;
 7918                                 ppim->sz = size;
 7919                                 ppim->mode = mode;
 7920                                 ppim->va = virtual_avail;
 7921                                 virtual_avail += size;
 7922                                 va = ppim->va;
 7923                                 break;
 7924                         }
 7925                 }
 7926                 if (va == 0)
 7927                         panic("%s: too many preinit mappings", __func__);
 7928         } else {
 7929                 /*
 7930                  * If we have a preinit mapping, re-use it.
 7931                  */
 7932                 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 7933                         ppim = pmap_preinit_mapping + i;
 7934                         if (ppim->pa == pa && ppim->sz == size &&
 7935                             (ppim->mode == mode ||
 7936                             (flags & MAPDEV_SETATTR) == 0))
 7937                                 return ((void *)(ppim->va + offset));
 7938                 }
 7939                 /*
 7940                  * If the specified range of physical addresses fits within
 7941                  * the direct map window, use the direct map.
 7942                  */
 7943                 if (pa < dmaplimit && pa + size <= dmaplimit) {
 7944                         va = PHYS_TO_DMAP(pa);
 7945                         if ((flags & MAPDEV_SETATTR) != 0) {
 7946                                 PMAP_LOCK(kernel_pmap);
 7947                                 i = pmap_change_props_locked(va, size,
 7948                                     PROT_NONE, mode, flags);
 7949                                 PMAP_UNLOCK(kernel_pmap);
 7950                         } else
 7951                                 i = 0;
 7952                         if (!i)
 7953                                 return ((void *)(va + offset));
 7954                 }
 7955                 va = kva_alloc(size);
 7956                 if (va == 0)
 7957                         panic("%s: Couldn't allocate KVA", __func__);
 7958         }
 7959         for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
 7960                 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
 7961         pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
 7962         if ((flags & MAPDEV_FLUSHCACHE) != 0)
 7963                 pmap_invalidate_cache_range(va, va + tmpsize);
 7964         return ((void *)(va + offset));
 7965 }
 7966 
 7967 void *
 7968 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
 7969 {
 7970 
 7971         return (pmap_mapdev_internal(pa, size, mode, MAPDEV_FLUSHCACHE |
 7972             MAPDEV_SETATTR));
 7973 }
 7974 
 7975 void *
 7976 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
 7977 {
 7978 
 7979         return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
 7980 }
 7981 
 7982 void *
 7983 pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size)
 7984 {
 7985 
 7986         return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE,
 7987             MAPDEV_SETATTR));
 7988 }
 7989 
 7990 void *
 7991 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
 7992 {
 7993 
 7994         return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK,
 7995             MAPDEV_FLUSHCACHE));
 7996 }
 7997 
 7998 void
 7999 pmap_unmapdev(vm_offset_t va, vm_size_t size)
 8000 {
 8001         struct pmap_preinit_mapping *ppim;
 8002         vm_offset_t offset;
 8003         int i;
 8004 
 8005         /* If we gave a direct map region in pmap_mapdev, do nothing */
 8006         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
 8007                 return;
 8008         offset = va & PAGE_MASK;
 8009         size = round_page(offset + size);
 8010         va = trunc_page(va);
 8011         for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 8012                 ppim = pmap_preinit_mapping + i;
 8013                 if (ppim->va == va && ppim->sz == size) {
 8014                         if (pmap_initialized)
 8015                                 return;
 8016                         ppim->pa = 0;
 8017                         ppim->va = 0;
 8018                         ppim->sz = 0;
 8019                         ppim->mode = 0;
 8020                         if (va + size == virtual_avail)
 8021                                 virtual_avail = va;
 8022                         return;
 8023                 }
 8024         }
 8025         if (pmap_initialized) {
 8026                 pmap_qremove(va, atop(size));
 8027                 kva_free(va, size);
 8028         }
 8029 }
 8030 
 8031 /*
 8032  * Tries to demote a 1GB page mapping.
 8033  */
 8034 static boolean_t
 8035 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
 8036 {
 8037         pdp_entry_t newpdpe, oldpdpe;
 8038         pd_entry_t *firstpde, newpde, *pde;
 8039         pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 8040         vm_paddr_t pdpgpa;
 8041         vm_page_t pdpg;
 8042 
 8043         PG_A = pmap_accessed_bit(pmap);
 8044         PG_M = pmap_modified_bit(pmap);
 8045         PG_V = pmap_valid_bit(pmap);
 8046         PG_RW = pmap_rw_bit(pmap);
 8047 
 8048         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 8049         oldpdpe = *pdpe;
 8050         KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
 8051             ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
 8052         if ((pdpg = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
 8053             VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 8054                 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
 8055                     " in pmap %p", va, pmap);
 8056                 return (FALSE);
 8057         }
 8058         pdpgpa = VM_PAGE_TO_PHYS(pdpg);
 8059         firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa);
 8060         newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
 8061         KASSERT((oldpdpe & PG_A) != 0,
 8062             ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
 8063         KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
 8064             ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
 8065         newpde = oldpdpe;
 8066 
 8067         /*
 8068          * Initialize the page directory page.
 8069          */
 8070         for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
 8071                 *pde = newpde;
 8072                 newpde += NBPDR;
 8073         }
 8074 
 8075         /*
 8076          * Demote the mapping.
 8077          */
 8078         *pdpe = newpdpe;
 8079 
 8080         /*
 8081          * Invalidate a stale recursive mapping of the page directory page.
 8082          */
 8083         pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
 8084 
 8085         pmap_pdpe_demotions++;
 8086         CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
 8087             " in pmap %p", va, pmap);
 8088         return (TRUE);
 8089 }
 8090 
 8091 /*
 8092  * Sets the memory attribute for the specified page.
 8093  */
 8094 void
 8095 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
 8096 {
 8097 
 8098         m->md.pat_mode = ma;
 8099 
 8100         /*
 8101          * If "m" is a normal page, update its direct mapping.  This update
 8102          * can be relied upon to perform any cache operations that are
 8103          * required for data coherence.
 8104          */
 8105         if ((m->flags & PG_FICTITIOUS) == 0 &&
 8106             pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
 8107             m->md.pat_mode))
 8108                 panic("memory attribute change on the direct map failed");
 8109 }
 8110 
 8111 /*
 8112  * Changes the specified virtual address range's memory type to that given by
 8113  * the parameter "mode".  The specified virtual address range must be
 8114  * completely contained within either the direct map or the kernel map.  If
 8115  * the virtual address range is contained within the kernel map, then the
 8116  * memory type for each of the corresponding ranges of the direct map is also
 8117  * changed.  (The corresponding ranges of the direct map are those ranges that
 8118  * map the same physical pages as the specified virtual address range.)  These
 8119  * changes to the direct map are necessary because Intel describes the
 8120  * behavior of their processors as "undefined" if two or more mappings to the
 8121  * same physical page have different memory types.
 8122  *
 8123  * Returns zero if the change completed successfully, and either EINVAL or
 8124  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
 8125  * of the virtual address range was not mapped, and ENOMEM is returned if
 8126  * there was insufficient memory available to complete the change.  In the
 8127  * latter case, the memory type may have been changed on some part of the
 8128  * virtual address range or the direct map.
 8129  */
 8130 int
 8131 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
 8132 {
 8133         int error;
 8134 
 8135         PMAP_LOCK(kernel_pmap);
 8136         error = pmap_change_props_locked(va, size, PROT_NONE, mode,
 8137             MAPDEV_FLUSHCACHE);
 8138         PMAP_UNLOCK(kernel_pmap);
 8139         return (error);
 8140 }
 8141 
 8142 /*
 8143  * Changes the specified virtual address range's protections to those
 8144  * specified by "prot".  Like pmap_change_attr(), protections for aliases
 8145  * in the direct map are updated as well.  Protections on aliasing mappings may
 8146  * be a subset of the requested protections; for example, mappings in the direct
 8147  * map are never executable.
 8148  */
 8149 int
 8150 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
 8151 {
 8152         int error;
 8153 
 8154         /* Only supported within the kernel map. */
 8155         if (va < VM_MIN_KERNEL_ADDRESS)
 8156                 return (EINVAL);
 8157 
 8158         PMAP_LOCK(kernel_pmap);
 8159         error = pmap_change_props_locked(va, size, prot, -1,
 8160             MAPDEV_ASSERTVALID);
 8161         PMAP_UNLOCK(kernel_pmap);
 8162         return (error);
 8163 }
 8164 
 8165 static int
 8166 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
 8167     int mode, int flags)
 8168 {
 8169         vm_offset_t base, offset, tmpva;
 8170         vm_paddr_t pa_start, pa_end, pa_end1;
 8171         pdp_entry_t *pdpe;
 8172         pd_entry_t *pde, pde_bits, pde_mask;
 8173         pt_entry_t *pte, pte_bits, pte_mask;
 8174         int error;
 8175         bool changed;
 8176 
 8177         PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
 8178         base = trunc_page(va);
 8179         offset = va & PAGE_MASK;
 8180         size = round_page(offset + size);
 8181 
 8182         /*
 8183          * Only supported on kernel virtual addresses, including the direct
 8184          * map but excluding the recursive map.
 8185          */
 8186         if (base < DMAP_MIN_ADDRESS)
 8187                 return (EINVAL);
 8188 
 8189         /*
 8190          * Construct our flag sets and masks.  "bits" is the subset of
 8191          * "mask" that will be set in each modified PTE.
 8192          *
 8193          * Mappings in the direct map are never allowed to be executable.
 8194          */
 8195         pde_bits = pte_bits = 0;
 8196         pde_mask = pte_mask = 0;
 8197         if (mode != -1) {
 8198                 pde_bits |= pmap_cache_bits(kernel_pmap, mode, true);
 8199                 pde_mask |= X86_PG_PDE_CACHE;
 8200                 pte_bits |= pmap_cache_bits(kernel_pmap, mode, false);
 8201                 pte_mask |= X86_PG_PTE_CACHE;
 8202         }
 8203         if (prot != VM_PROT_NONE) {
 8204                 if ((prot & VM_PROT_WRITE) != 0) {
 8205                         pde_bits |= X86_PG_RW;
 8206                         pte_bits |= X86_PG_RW;
 8207                 }
 8208                 if ((prot & VM_PROT_EXECUTE) == 0 ||
 8209                     va < VM_MIN_KERNEL_ADDRESS) {
 8210                         pde_bits |= pg_nx;
 8211                         pte_bits |= pg_nx;
 8212                 }
 8213                 pde_mask |= X86_PG_RW | pg_nx;
 8214                 pte_mask |= X86_PG_RW | pg_nx;
 8215         }
 8216 
 8217         /*
 8218          * Pages that aren't mapped aren't supported.  Also break down 2MB pages
 8219          * into 4KB pages if required.
 8220          */
 8221         for (tmpva = base; tmpva < base + size; ) {
 8222                 pdpe = pmap_pdpe(kernel_pmap, tmpva);
 8223                 if (pdpe == NULL || *pdpe == 0) {
 8224                         KASSERT((flags & MAPDEV_ASSERTVALID) == 0,
 8225                             ("%s: addr %#lx is not mapped", __func__, tmpva));
 8226                         return (EINVAL);
 8227                 }
 8228                 if (*pdpe & PG_PS) {
 8229                         /*
 8230                          * If the current 1GB page already has the required
 8231                          * properties, then we need not demote this page.  Just
 8232                          * increment tmpva to the next 1GB page frame.
 8233                          */
 8234                         if ((*pdpe & pde_mask) == pde_bits) {
 8235                                 tmpva = trunc_1gpage(tmpva) + NBPDP;
 8236                                 continue;
 8237                         }
 8238 
 8239                         /*
 8240                          * If the current offset aligns with a 1GB page frame
 8241                          * and there is at least 1GB left within the range, then
 8242                          * we need not break down this page into 2MB pages.
 8243                          */
 8244                         if ((tmpva & PDPMASK) == 0 &&
 8245                             tmpva + PDPMASK < base + size) {
 8246                                 tmpva += NBPDP;
 8247                                 continue;
 8248                         }
 8249                         if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
 8250                                 return (ENOMEM);
 8251                 }
 8252                 pde = pmap_pdpe_to_pde(pdpe, tmpva);
 8253                 if (*pde == 0) {
 8254                         KASSERT((flags & MAPDEV_ASSERTVALID) == 0,
 8255                             ("%s: addr %#lx is not mapped", __func__, tmpva));
 8256                         return (EINVAL);
 8257                 }
 8258                 if (*pde & PG_PS) {
 8259                         /*
 8260                          * If the current 2MB page already has the required
 8261                          * properties, then we need not demote this page.  Just
 8262                          * increment tmpva to the next 2MB page frame.
 8263                          */
 8264                         if ((*pde & pde_mask) == pde_bits) {
 8265                                 tmpva = trunc_2mpage(tmpva) + NBPDR;
 8266                                 continue;
 8267                         }
 8268 
 8269                         /*
 8270                          * If the current offset aligns with a 2MB page frame
 8271                          * and there is at least 2MB left within the range, then
 8272                          * we need not break down this page into 4KB pages.
 8273                          */
 8274                         if ((tmpva & PDRMASK) == 0 &&
 8275                             tmpva + PDRMASK < base + size) {
 8276                                 tmpva += NBPDR;
 8277                                 continue;
 8278                         }
 8279                         if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
 8280                                 return (ENOMEM);
 8281                 }
 8282                 pte = pmap_pde_to_pte(pde, tmpva);
 8283                 if (*pte == 0) {
 8284                         KASSERT((flags & MAPDEV_ASSERTVALID) == 0,
 8285                             ("%s: addr %#lx is not mapped", __func__, tmpva));
 8286                         return (EINVAL);
 8287                 }
 8288                 tmpva += PAGE_SIZE;
 8289         }
 8290         error = 0;
 8291 
 8292         /*
 8293          * Ok, all the pages exist, so run through them updating their
 8294          * properties if required.
 8295          */
 8296         changed = false;
 8297         pa_start = pa_end = 0;
 8298         for (tmpva = base; tmpva < base + size; ) {
 8299                 pdpe = pmap_pdpe(kernel_pmap, tmpva);
 8300                 if (*pdpe & PG_PS) {
 8301                         if ((*pdpe & pde_mask) != pde_bits) {
 8302                                 pmap_pte_props(pdpe, pde_bits, pde_mask);
 8303                                 changed = true;
 8304                         }
 8305                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
 8306                             (*pdpe & PG_PS_FRAME) < dmaplimit) {
 8307                                 if (pa_start == pa_end) {
 8308                                         /* Start physical address run. */
 8309                                         pa_start = *pdpe & PG_PS_FRAME;
 8310                                         pa_end = pa_start + NBPDP;
 8311                                 } else if (pa_end == (*pdpe & PG_PS_FRAME))
 8312                                         pa_end += NBPDP;
 8313                                 else {
 8314                                         /* Run ended, update direct map. */
 8315                                         error = pmap_change_props_locked(
 8316                                             PHYS_TO_DMAP(pa_start),
 8317                                             pa_end - pa_start, prot, mode,
 8318                                             flags);
 8319                                         if (error != 0)
 8320                                                 break;
 8321                                         /* Start physical address run. */
 8322                                         pa_start = *pdpe & PG_PS_FRAME;
 8323                                         pa_end = pa_start + NBPDP;
 8324                                 }
 8325                         }
 8326                         tmpva = trunc_1gpage(tmpva) + NBPDP;
 8327                         continue;
 8328                 }
 8329                 pde = pmap_pdpe_to_pde(pdpe, tmpva);
 8330                 if (*pde & PG_PS) {
 8331                         if ((*pde & pde_mask) != pde_bits) {
 8332                                 pmap_pte_props(pde, pde_bits, pde_mask);
 8333                                 changed = true;
 8334                         }
 8335                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
 8336                             (*pde & PG_PS_FRAME) < dmaplimit) {
 8337                                 if (pa_start == pa_end) {
 8338                                         /* Start physical address run. */
 8339                                         pa_start = *pde & PG_PS_FRAME;
 8340                                         pa_end = pa_start + NBPDR;
 8341                                 } else if (pa_end == (*pde & PG_PS_FRAME))
 8342                                         pa_end += NBPDR;
 8343                                 else {
 8344                                         /* Run ended, update direct map. */
 8345                                         error = pmap_change_props_locked(
 8346                                             PHYS_TO_DMAP(pa_start),
 8347                                             pa_end - pa_start, prot, mode,
 8348                                             flags);
 8349                                         if (error != 0)
 8350                                                 break;
 8351                                         /* Start physical address run. */
 8352                                         pa_start = *pde & PG_PS_FRAME;
 8353                                         pa_end = pa_start + NBPDR;
 8354                                 }
 8355                         }
 8356                         tmpva = trunc_2mpage(tmpva) + NBPDR;
 8357                 } else {
 8358                         pte = pmap_pde_to_pte(pde, tmpva);
 8359                         if ((*pte & pte_mask) != pte_bits) {
 8360                                 pmap_pte_props(pte, pte_bits, pte_mask);
 8361                                 changed = true;
 8362                         }
 8363                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
 8364                             (*pte & PG_FRAME) < dmaplimit) {
 8365                                 if (pa_start == pa_end) {
 8366                                         /* Start physical address run. */
 8367                                         pa_start = *pte & PG_FRAME;
 8368                                         pa_end = pa_start + PAGE_SIZE;
 8369                                 } else if (pa_end == (*pte & PG_FRAME))
 8370                                         pa_end += PAGE_SIZE;
 8371                                 else {
 8372                                         /* Run ended, update direct map. */
 8373                                         error = pmap_change_props_locked(
 8374                                             PHYS_TO_DMAP(pa_start),
 8375                                             pa_end - pa_start, prot, mode,
 8376                                             flags);
 8377                                         if (error != 0)
 8378                                                 break;
 8379                                         /* Start physical address run. */
 8380                                         pa_start = *pte & PG_FRAME;
 8381                                         pa_end = pa_start + PAGE_SIZE;
 8382                                 }
 8383                         }
 8384                         tmpva += PAGE_SIZE;
 8385                 }
 8386         }
 8387         if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
 8388                 pa_end1 = MIN(pa_end, dmaplimit);
 8389                 if (pa_start != pa_end1)
 8390                         error = pmap_change_props_locked(PHYS_TO_DMAP(pa_start),
 8391                             pa_end1 - pa_start, prot, mode, flags);
 8392         }
 8393 
 8394         /*
 8395          * Flush CPU caches if required to make sure any data isn't cached that
 8396          * shouldn't be, etc.
 8397          */
 8398         if (changed) {
 8399                 pmap_invalidate_range(kernel_pmap, base, tmpva);
 8400                 if ((flags & MAPDEV_FLUSHCACHE) != 0)
 8401                         pmap_invalidate_cache_range(base, tmpva);
 8402         }
 8403         return (error);
 8404 }
 8405 
 8406 /*
 8407  * Demotes any mapping within the direct map region that covers more than the
 8408  * specified range of physical addresses.  This range's size must be a power
 8409  * of two and its starting address must be a multiple of its size.  Since the
 8410  * demotion does not change any attributes of the mapping, a TLB invalidation
 8411  * is not mandatory.  The caller may, however, request a TLB invalidation.
 8412  */
 8413 void
 8414 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
 8415 {
 8416         pdp_entry_t *pdpe;
 8417         pd_entry_t *pde;
 8418         vm_offset_t va;
 8419         boolean_t changed;
 8420 
 8421         if (len == 0)
 8422                 return;
 8423         KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
 8424         KASSERT((base & (len - 1)) == 0,
 8425             ("pmap_demote_DMAP: base is not a multiple of len"));
 8426         if (len < NBPDP && base < dmaplimit) {
 8427                 va = PHYS_TO_DMAP(base);
 8428                 changed = FALSE;
 8429                 PMAP_LOCK(kernel_pmap);
 8430                 pdpe = pmap_pdpe(kernel_pmap, va);
 8431                 if ((*pdpe & X86_PG_V) == 0)
 8432                         panic("pmap_demote_DMAP: invalid PDPE");
 8433                 if ((*pdpe & PG_PS) != 0) {
 8434                         if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
 8435                                 panic("pmap_demote_DMAP: PDPE failed");
 8436                         changed = TRUE;
 8437                 }
 8438                 if (len < NBPDR) {
 8439                         pde = pmap_pdpe_to_pde(pdpe, va);
 8440                         if ((*pde & X86_PG_V) == 0)
 8441                                 panic("pmap_demote_DMAP: invalid PDE");
 8442                         if ((*pde & PG_PS) != 0) {
 8443                                 if (!pmap_demote_pde(kernel_pmap, pde, va))
 8444                                         panic("pmap_demote_DMAP: PDE failed");
 8445                                 changed = TRUE;
 8446                         }
 8447                 }
 8448                 if (changed && invalidate)
 8449                         pmap_invalidate_page(kernel_pmap, va);
 8450                 PMAP_UNLOCK(kernel_pmap);
 8451         }
 8452 }
 8453 
 8454 /*
 8455  * perform the pmap work for mincore
 8456  */
 8457 int
 8458 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
 8459 {
 8460         pd_entry_t *pdep;
 8461         pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
 8462         vm_paddr_t pa;
 8463         int val;
 8464 
 8465         PG_A = pmap_accessed_bit(pmap);
 8466         PG_M = pmap_modified_bit(pmap);
 8467         PG_V = pmap_valid_bit(pmap);
 8468         PG_RW = pmap_rw_bit(pmap);
 8469 
 8470         PMAP_LOCK(pmap);
 8471 retry:
 8472         pdep = pmap_pde(pmap, addr);
 8473         if (pdep != NULL && (*pdep & PG_V)) {
 8474                 if (*pdep & PG_PS) {
 8475                         pte = *pdep;
 8476                         /* Compute the physical address of the 4KB page. */
 8477                         pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
 8478                             PG_FRAME;
 8479                         val = MINCORE_SUPER;
 8480                 } else {
 8481                         pte = *pmap_pde_to_pte(pdep, addr);
 8482                         pa = pte & PG_FRAME;
 8483                         val = 0;
 8484                 }
 8485         } else {
 8486                 pte = 0;
 8487                 pa = 0;
 8488                 val = 0;
 8489         }
 8490         if ((pte & PG_V) != 0) {
 8491                 val |= MINCORE_INCORE;
 8492                 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 8493                         val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
 8494                 if ((pte & PG_A) != 0)
 8495                         val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
 8496         }
 8497         if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
 8498             (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
 8499             (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
 8500                 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
 8501                 if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
 8502                         goto retry;
 8503         } else
 8504                 PA_UNLOCK_COND(*locked_pa);
 8505         PMAP_UNLOCK(pmap);
 8506         return (val);
 8507 }
 8508 
 8509 static uint64_t
 8510 pmap_pcid_alloc(pmap_t pmap, u_int cpuid)
 8511 {
 8512         uint32_t gen, new_gen, pcid_next;
 8513 
 8514         CRITICAL_ASSERT(curthread);
 8515         gen = PCPU_GET(pcid_gen);
 8516         if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN)
 8517                 return (pti ? 0 : CR3_PCID_SAVE);
 8518         if (pmap->pm_pcids[cpuid].pm_gen == gen)
 8519                 return (CR3_PCID_SAVE);
 8520         pcid_next = PCPU_GET(pcid_next);
 8521         KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) ||
 8522             (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN),
 8523             ("cpu %d pcid_next %#x", cpuid, pcid_next));
 8524         if ((!pti && pcid_next == PMAP_PCID_OVERMAX) ||
 8525             (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) {
 8526                 new_gen = gen + 1;
 8527                 if (new_gen == 0)
 8528                         new_gen = 1;
 8529                 PCPU_SET(pcid_gen, new_gen);
 8530                 pcid_next = PMAP_PCID_KERN + 1;
 8531         } else {
 8532                 new_gen = gen;
 8533         }
 8534         pmap->pm_pcids[cpuid].pm_pcid = pcid_next;
 8535         pmap->pm_pcids[cpuid].pm_gen = new_gen;
 8536         PCPU_SET(pcid_next, pcid_next + 1);
 8537         return (0);
 8538 }
 8539 
 8540 static uint64_t
 8541 pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid)
 8542 {
 8543         uint64_t cached;
 8544 
 8545         cached = pmap_pcid_alloc(pmap, cpuid);
 8546         KASSERT(pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX,
 8547             ("pmap %p cpu %d pcid %#x", pmap, cpuid,
 8548             pmap->pm_pcids[cpuid].pm_pcid));
 8549         KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN ||
 8550             pmap == kernel_pmap,
 8551             ("non-kernel pmap pmap %p cpu %d pcid %#x",
 8552             pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid));
 8553         return (cached);
 8554 }
 8555 
 8556 static void
 8557 pmap_activate_sw_pti_post(struct thread *td, pmap_t pmap)
 8558 {
 8559 
 8560         PCPU_GET(tssp)->tss_rsp0 = pmap->pm_ucr3 != PMAP_NO_CR3 ?
 8561             PCPU_GET(pti_rsp0) : (uintptr_t)td->td_md.md_stack_base;
 8562 }
 8563 
 8564 static void inline
 8565 pmap_activate_sw_pcid_pti(pmap_t pmap, u_int cpuid, const bool invpcid_works1)
 8566 {
 8567         struct invpcid_descr d;
 8568         uint64_t cached, cr3, kcr3, ucr3;
 8569 
 8570         cached = pmap_pcid_alloc_checked(pmap, cpuid);
 8571         cr3 = rcr3();
 8572         if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
 8573                 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid);
 8574         PCPU_SET(curpmap, pmap);
 8575         kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid;
 8576         ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid |
 8577             PMAP_PCID_USER_PT;
 8578 
 8579         if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) {
 8580                 /*
 8581                  * Explicitly invalidate translations cached from the
 8582                  * user page table.  They are not automatically
 8583                  * flushed by reload of cr3 with the kernel page table
 8584                  * pointer above.
 8585                  *
 8586                  * Note that the if() condition is resolved statically
 8587                  * by using the function argument instead of
 8588                  * runtime-evaluated invpcid_works value.
 8589                  */
 8590                 if (invpcid_works1) {
 8591                         d.pcid = PMAP_PCID_USER_PT |
 8592                             pmap->pm_pcids[cpuid].pm_pcid;
 8593                         d.pad = 0;
 8594                         d.addr = 0;
 8595                         invpcid(&d, INVPCID_CTX);
 8596                 } else {
 8597                         pmap_pti_pcid_invalidate(ucr3, kcr3);
 8598                 }
 8599         }
 8600 
 8601         PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE);
 8602         PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE);
 8603         if (cached)
 8604                 PCPU_INC(pm_save_cnt);
 8605 }
 8606 
 8607 static void
 8608 pmap_activate_sw_pcid_invpcid_pti(struct thread *td, pmap_t pmap, u_int cpuid)
 8609 {
 8610 
 8611         pmap_activate_sw_pcid_pti(pmap, cpuid, true);
 8612         pmap_activate_sw_pti_post(td, pmap);
 8613 }
 8614 
 8615 static void
 8616 pmap_activate_sw_pcid_noinvpcid_pti(struct thread *td, pmap_t pmap,
 8617     u_int cpuid)
 8618 {
 8619         register_t rflags;
 8620 
 8621         /*
 8622          * If the INVPCID instruction is not available,
 8623          * invltlb_pcid_handler() is used to handle an invalidate_all
 8624          * IPI, which checks for curpmap == smp_tlb_pmap.  The below
 8625          * sequence of operations has a window where %CR3 is loaded
 8626          * with the new pmap's PML4 address, but the curpmap value has
 8627          * not yet been updated.  This causes the invltlb IPI handler,
 8628          * which is called between the updates, to execute as a NOP,
 8629          * which leaves stale TLB entries.
 8630          *
 8631          * Note that the most typical use of pmap_activate_sw(), from
 8632          * the context switch, is immune to this race, because
 8633          * interrupts are disabled (while the thread lock is owned),
 8634          * and the IPI happens after curpmap is updated.  Protect
 8635          * other callers in a similar way, by disabling interrupts
 8636          * around the %cr3 register reload and curpmap assignment.
 8637          */
 8638         rflags = intr_disable();
 8639         pmap_activate_sw_pcid_pti(pmap, cpuid, false);
 8640         intr_restore(rflags);
 8641         pmap_activate_sw_pti_post(td, pmap);
 8642 }
 8643 
 8644 static void
 8645 pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap,
 8646     u_int cpuid)
 8647 {
 8648         uint64_t cached, cr3;
 8649 
 8650         cached = pmap_pcid_alloc_checked(pmap, cpuid);
 8651         cr3 = rcr3();
 8652         if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
 8653                 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid |
 8654                     cached);
 8655         PCPU_SET(curpmap, pmap);
 8656         if (cached)
 8657                 PCPU_INC(pm_save_cnt);
 8658 }
 8659 
 8660 static void
 8661 pmap_activate_sw_pcid_noinvpcid_nopti(struct thread *td __unused, pmap_t pmap,
 8662     u_int cpuid)
 8663 {
 8664         register_t rflags;
 8665 
 8666         rflags = intr_disable();
 8667         pmap_activate_sw_pcid_nopti(td, pmap, cpuid);
 8668         intr_restore(rflags);
 8669 }
 8670 
 8671 static void
 8672 pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap,
 8673     u_int cpuid __unused)
 8674 {
 8675 
 8676         load_cr3(pmap->pm_cr3);
 8677         PCPU_SET(curpmap, pmap);
 8678 }
 8679 
 8680 static void
 8681 pmap_activate_sw_nopcid_pti(struct thread *td, pmap_t pmap,
 8682     u_int cpuid __unused)
 8683 {
 8684 
 8685         pmap_activate_sw_nopcid_nopti(td, pmap, cpuid);
 8686         PCPU_SET(kcr3, pmap->pm_cr3);
 8687         PCPU_SET(ucr3, pmap->pm_ucr3);
 8688         pmap_activate_sw_pti_post(td, pmap);
 8689 }
 8690 
 8691 DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (struct thread *, pmap_t,
 8692     u_int), static)
 8693 {
 8694 
 8695         if (pmap_pcid_enabled && pti && invpcid_works)
 8696                 return (pmap_activate_sw_pcid_invpcid_pti);
 8697         else if (pmap_pcid_enabled && pti && !invpcid_works)
 8698                 return (pmap_activate_sw_pcid_noinvpcid_pti);
 8699         else if (pmap_pcid_enabled && !pti && invpcid_works)
 8700                 return (pmap_activate_sw_pcid_nopti);
 8701         else if (pmap_pcid_enabled && !pti && !invpcid_works)
 8702                 return (pmap_activate_sw_pcid_noinvpcid_nopti);
 8703         else if (!pmap_pcid_enabled && pti)
 8704                 return (pmap_activate_sw_nopcid_pti);
 8705         else /* if (!pmap_pcid_enabled && !pti) */
 8706                 return (pmap_activate_sw_nopcid_nopti);
 8707 }
 8708 
 8709 void
 8710 pmap_activate_sw(struct thread *td)
 8711 {
 8712         pmap_t oldpmap, pmap;
 8713         u_int cpuid;
 8714 
 8715         oldpmap = PCPU_GET(curpmap);
 8716         pmap = vmspace_pmap(td->td_proc->p_vmspace);
 8717         if (oldpmap == pmap) {
 8718                 if (cpu_vendor_id != CPU_VENDOR_INTEL)
 8719                         mfence();
 8720                 return;
 8721         }
 8722         cpuid = PCPU_GET(cpuid);
 8723 #ifdef SMP
 8724         CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
 8725 #else
 8726         CPU_SET(cpuid, &pmap->pm_active);
 8727 #endif
 8728         pmap_activate_sw_mode(td, pmap, cpuid);
 8729 #ifdef SMP
 8730         CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
 8731 #else
 8732         CPU_CLR(cpuid, &oldpmap->pm_active);
 8733 #endif
 8734 }
 8735 
 8736 void
 8737 pmap_activate(struct thread *td)
 8738 {
 8739 
 8740         critical_enter();
 8741         pmap_activate_sw(td);
 8742         critical_exit();
 8743 }
 8744 
 8745 void
 8746 pmap_activate_boot(pmap_t pmap)
 8747 {
 8748         uint64_t kcr3;
 8749         u_int cpuid;
 8750 
 8751         /*
 8752          * kernel_pmap must be never deactivated, and we ensure that
 8753          * by never activating it at all.
 8754          */
 8755         MPASS(pmap != kernel_pmap);
 8756 
 8757         cpuid = PCPU_GET(cpuid);
 8758 #ifdef SMP
 8759         CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
 8760 #else
 8761         CPU_SET(cpuid, &pmap->pm_active);
 8762 #endif
 8763         PCPU_SET(curpmap, pmap);
 8764         if (pti) {
 8765                 kcr3 = pmap->pm_cr3;
 8766                 if (pmap_pcid_enabled)
 8767                         kcr3 |= pmap->pm_pcids[cpuid].pm_pcid | CR3_PCID_SAVE;
 8768         } else {
 8769                 kcr3 = PMAP_NO_CR3;
 8770         }
 8771         PCPU_SET(kcr3, kcr3);
 8772         PCPU_SET(ucr3, PMAP_NO_CR3);
 8773 }
 8774 
 8775 void
 8776 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
 8777 {
 8778 }
 8779 
 8780 /*
 8781  *      Increase the starting virtual address of the given mapping if a
 8782  *      different alignment might result in more superpage mappings.
 8783  */
 8784 void
 8785 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
 8786     vm_offset_t *addr, vm_size_t size)
 8787 {
 8788         vm_offset_t superpage_offset;
 8789 
 8790         if (size < NBPDR)
 8791                 return;
 8792         if (object != NULL && (object->flags & OBJ_COLORED) != 0)
 8793                 offset += ptoa(object->pg_color);
 8794         superpage_offset = offset & PDRMASK;
 8795         if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
 8796             (*addr & PDRMASK) == superpage_offset)
 8797                 return;
 8798         if ((*addr & PDRMASK) < superpage_offset)
 8799                 *addr = (*addr & ~PDRMASK) + superpage_offset;
 8800         else
 8801                 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
 8802 }
 8803 
 8804 #ifdef INVARIANTS
 8805 static unsigned long num_dirty_emulations;
 8806 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
 8807              &num_dirty_emulations, 0, NULL);
 8808 
 8809 static unsigned long num_accessed_emulations;
 8810 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
 8811              &num_accessed_emulations, 0, NULL);
 8812 
 8813 static unsigned long num_superpage_accessed_emulations;
 8814 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW,
 8815              &num_superpage_accessed_emulations, 0, NULL);
 8816 
 8817 static unsigned long ad_emulation_superpage_promotions;
 8818 SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW,
 8819              &ad_emulation_superpage_promotions, 0, NULL);
 8820 #endif  /* INVARIANTS */
 8821 
 8822 int
 8823 pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
 8824 {
 8825         int rv;
 8826         struct rwlock *lock;
 8827 #if VM_NRESERVLEVEL > 0
 8828         vm_page_t m, mpte;
 8829 #endif
 8830         pd_entry_t *pde;
 8831         pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
 8832 
 8833         KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
 8834             ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));
 8835 
 8836         if (!pmap_emulate_ad_bits(pmap))
 8837                 return (-1);
 8838 
 8839         PG_A = pmap_accessed_bit(pmap);
 8840         PG_M = pmap_modified_bit(pmap);
 8841         PG_V = pmap_valid_bit(pmap);
 8842         PG_RW = pmap_rw_bit(pmap);
 8843 
 8844         rv = -1;
 8845         lock = NULL;
 8846         PMAP_LOCK(pmap);
 8847 
 8848         pde = pmap_pde(pmap, va);
 8849         if (pde == NULL || (*pde & PG_V) == 0)
 8850                 goto done;
 8851 
 8852         if ((*pde & PG_PS) != 0) {
 8853                 if (ftype == VM_PROT_READ) {
 8854 #ifdef INVARIANTS
 8855                         atomic_add_long(&num_superpage_accessed_emulations, 1);
 8856 #endif
 8857                         *pde |= PG_A;
 8858                         rv = 0;
 8859                 }
 8860                 goto done;
 8861         }
 8862 
 8863         pte = pmap_pde_to_pte(pde, va);
 8864         if ((*pte & PG_V) == 0)
 8865                 goto done;
 8866 
 8867         if (ftype == VM_PROT_WRITE) {
 8868                 if ((*pte & PG_RW) == 0)
 8869                         goto done;
 8870                 /*
 8871                  * Set the modified and accessed bits simultaneously.
 8872                  *
 8873                  * Intel EPT PTEs that do software emulation of A/D bits map
 8874                  * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively.
 8875                  * An EPT misconfiguration is triggered if the PTE is writable
 8876                  * but not readable (WR=10). This is avoided by setting PG_A
 8877                  * and PG_M simultaneously.
 8878                  */
 8879                 *pte |= PG_M | PG_A;
 8880         } else {
 8881                 *pte |= PG_A;
 8882         }
 8883 
 8884 #if VM_NRESERVLEVEL > 0
 8885         /* try to promote the mapping */
 8886         if (va < VM_MAXUSER_ADDRESS)
 8887                 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 8888         else
 8889                 mpte = NULL;
 8890 
 8891         m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
 8892 
 8893         if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
 8894             pmap_ps_enabled(pmap) &&
 8895             (m->flags & PG_FICTITIOUS) == 0 &&
 8896             vm_reserv_level_iffullpop(m) == 0) {
 8897                 pmap_promote_pde(pmap, pde, va, &lock);
 8898 #ifdef INVARIANTS
 8899                 atomic_add_long(&ad_emulation_superpage_promotions, 1);
 8900 #endif
 8901         }
 8902 #endif
 8903 
 8904 #ifdef INVARIANTS
 8905         if (ftype == VM_PROT_WRITE)
 8906                 atomic_add_long(&num_dirty_emulations, 1);
 8907         else
 8908                 atomic_add_long(&num_accessed_emulations, 1);
 8909 #endif
 8910         rv = 0;         /* success */
 8911 done:
 8912         if (lock != NULL)
 8913                 rw_wunlock(lock);
 8914         PMAP_UNLOCK(pmap);
 8915         return (rv);
 8916 }
 8917 
 8918 void
 8919 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
 8920 {
 8921         pml4_entry_t *pml4;
 8922         pdp_entry_t *pdp;
 8923         pd_entry_t *pde;
 8924         pt_entry_t *pte, PG_V;
 8925         int idx;
 8926 
 8927         idx = 0;
 8928         PG_V = pmap_valid_bit(pmap);
 8929         PMAP_LOCK(pmap);
 8930 
 8931         pml4 = pmap_pml4e(pmap, va);
 8932         ptr[idx++] = *pml4;
 8933         if ((*pml4 & PG_V) == 0)
 8934                 goto done;
 8935 
 8936         pdp = pmap_pml4e_to_pdpe(pml4, va);
 8937         ptr[idx++] = *pdp;
 8938         if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0)
 8939                 goto done;
 8940 
 8941         pde = pmap_pdpe_to_pde(pdp, va);
 8942         ptr[idx++] = *pde;
 8943         if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
 8944                 goto done;
 8945 
 8946         pte = pmap_pde_to_pte(pde, va);
 8947         ptr[idx++] = *pte;
 8948 
 8949 done:
 8950         PMAP_UNLOCK(pmap);
 8951         *num = idx;
 8952 }
 8953 
 8954 /**
 8955  * Get the kernel virtual address of a set of physical pages. If there are
 8956  * physical addresses not covered by the DMAP perform a transient mapping
 8957  * that will be removed when calling pmap_unmap_io_transient.
 8958  *
 8959  * \param page        The pages the caller wishes to obtain the virtual
 8960  *                    address on the kernel memory map.
 8961  * \param vaddr       On return contains the kernel virtual memory address
 8962  *                    of the pages passed in the page parameter.
 8963  * \param count       Number of pages passed in.
 8964  * \param can_fault   TRUE if the thread using the mapped pages can take
 8965  *                    page faults, FALSE otherwise.
 8966  *
 8967  * \returns TRUE if the caller must call pmap_unmap_io_transient when
 8968  *          finished or FALSE otherwise.
 8969  *
 8970  */
 8971 boolean_t
 8972 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
 8973     boolean_t can_fault)
 8974 {
 8975         vm_paddr_t paddr;
 8976         boolean_t needs_mapping;
 8977         pt_entry_t *pte;
 8978         int cache_bits, error __unused, i;
 8979 
 8980         /*
 8981          * Allocate any KVA space that we need, this is done in a separate
 8982          * loop to prevent calling vmem_alloc while pinned.
 8983          */
 8984         needs_mapping = FALSE;
 8985         for (i = 0; i < count; i++) {
 8986                 paddr = VM_PAGE_TO_PHYS(page[i]);
 8987                 if (__predict_false(paddr >= dmaplimit)) {
 8988                         error = vmem_alloc(kernel_arena, PAGE_SIZE,
 8989                             M_BESTFIT | M_WAITOK, &vaddr[i]);
 8990                         KASSERT(error == 0, ("vmem_alloc failed: %d", error));
 8991                         needs_mapping = TRUE;
 8992                 } else {
 8993                         vaddr[i] = PHYS_TO_DMAP(paddr);
 8994                 }
 8995         }
 8996 
 8997         /* Exit early if everything is covered by the DMAP */
 8998         if (!needs_mapping)
 8999                 return (FALSE);
 9000 
 9001         /*
 9002          * NB:  The sequence of updating a page table followed by accesses
 9003          * to the corresponding pages used in the !DMAP case is subject to
 9004          * the situation described in the "AMD64 Architecture Programmer's
 9005          * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
 9006          * Coherency Considerations".  Therefore, issuing the INVLPG right
 9007          * after modifying the PTE bits is crucial.
 9008          */
 9009         if (!can_fault)
 9010                 sched_pin();
 9011         for (i = 0; i < count; i++) {
 9012                 paddr = VM_PAGE_TO_PHYS(page[i]);
 9013                 if (paddr >= dmaplimit) {
 9014                         if (can_fault) {
 9015                                 /*
 9016                                  * Slow path, since we can get page faults
 9017                                  * while mappings are active don't pin the
 9018                                  * thread to the CPU and instead add a global
 9019                                  * mapping visible to all CPUs.
 9020                                  */
 9021                                 pmap_qenter(vaddr[i], &page[i], 1);
 9022                         } else {
 9023                                 pte = vtopte(vaddr[i]);
 9024                                 cache_bits = pmap_cache_bits(kernel_pmap,
 9025                                     page[i]->md.pat_mode, 0);
 9026                                 pte_store(pte, paddr | X86_PG_RW | X86_PG_V |
 9027                                     cache_bits);
 9028                                 invlpg(vaddr[i]);
 9029                         }
 9030                 }
 9031         }
 9032 
 9033         return (needs_mapping);
 9034 }
 9035 
 9036 void
 9037 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
 9038     boolean_t can_fault)
 9039 {
 9040         vm_paddr_t paddr;
 9041         int i;
 9042 
 9043         if (!can_fault)
 9044                 sched_unpin();
 9045         for (i = 0; i < count; i++) {
 9046                 paddr = VM_PAGE_TO_PHYS(page[i]);
 9047                 if (paddr >= dmaplimit) {
 9048                         if (can_fault)
 9049                                 pmap_qremove(vaddr[i], 1);
 9050                         vmem_free(kernel_arena, vaddr[i], PAGE_SIZE);
 9051                 }
 9052         }
 9053 }
 9054 
 9055 vm_offset_t
 9056 pmap_quick_enter_page(vm_page_t m)
 9057 {
 9058         vm_paddr_t paddr;
 9059 
 9060         paddr = VM_PAGE_TO_PHYS(m);
 9061         if (paddr < dmaplimit)
 9062                 return (PHYS_TO_DMAP(paddr));
 9063         mtx_lock_spin(&qframe_mtx);
 9064         KASSERT(*vtopte(qframe) == 0, ("qframe busy"));
 9065         pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A |
 9066             X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0));
 9067         return (qframe);
 9068 }
 9069 
 9070 void
 9071 pmap_quick_remove_page(vm_offset_t addr)
 9072 {
 9073 
 9074         if (addr != qframe)
 9075                 return;
 9076         pte_store(vtopte(qframe), 0);
 9077         invlpg(qframe);
 9078         mtx_unlock_spin(&qframe_mtx);
 9079 }
 9080 
 9081 /*
 9082  * Pdp pages from the large map are managed differently from either
 9083  * kernel or user page table pages.  They are permanently allocated at
 9084  * initialization time, and their wire count is permanently set to
 9085  * zero.  The pml4 entries pointing to those pages are copied into
 9086  * each allocated pmap.
 9087  *
 9088  * In contrast, pd and pt pages are managed like user page table
 9089  * pages.  They are dynamically allocated, and their wire count
 9090  * represents the number of valid entries within the page.
 9091  */
 9092 static vm_page_t
 9093 pmap_large_map_getptp_unlocked(void)
 9094 {
 9095         vm_page_t m;
 9096 
 9097         m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 9098             VM_ALLOC_ZERO);
 9099         if (m != NULL && (m->flags & PG_ZERO) == 0)
 9100                 pmap_zero_page(m);
 9101         return (m);
 9102 }
 9103 
 9104 static vm_page_t
 9105 pmap_large_map_getptp(void)
 9106 {
 9107         vm_page_t m;
 9108 
 9109         PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
 9110         m = pmap_large_map_getptp_unlocked();
 9111         if (m == NULL) {
 9112                 PMAP_UNLOCK(kernel_pmap);
 9113                 vm_wait(NULL);
 9114                 PMAP_LOCK(kernel_pmap);
 9115                 /* Callers retry. */
 9116         }
 9117         return (m);
 9118 }
 9119 
 9120 static pdp_entry_t *
 9121 pmap_large_map_pdpe(vm_offset_t va)
 9122 {
 9123         vm_pindex_t pml4_idx;
 9124         vm_paddr_t mphys;
 9125 
 9126         pml4_idx = pmap_pml4e_index(va);
 9127         KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents,
 9128             ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I "
 9129             "%#jx lm_ents %d",
 9130             (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
 9131         KASSERT((kernel_pmap->pm_pml4[pml4_idx] & X86_PG_V) != 0,
 9132             ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
 9133             "LMSPML4I %#jx lm_ents %d",
 9134             (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
 9135         mphys = kernel_pmap->pm_pml4[pml4_idx] & PG_FRAME;
 9136         return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va));
 9137 }
 9138 
 9139 static pd_entry_t *
 9140 pmap_large_map_pde(vm_offset_t va)
 9141 {
 9142         pdp_entry_t *pdpe;
 9143         vm_page_t m;
 9144         vm_paddr_t mphys;
 9145 
 9146 retry:
 9147         pdpe = pmap_large_map_pdpe(va);
 9148         if (*pdpe == 0) {
 9149                 m = pmap_large_map_getptp();
 9150                 if (m == NULL)
 9151                         goto retry;
 9152                 mphys = VM_PAGE_TO_PHYS(m);
 9153                 *pdpe = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx;
 9154         } else {
 9155                 MPASS((*pdpe & X86_PG_PS) == 0);
 9156                 mphys = *pdpe & PG_FRAME;
 9157         }
 9158         return ((pd_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pde_index(va));
 9159 }
 9160 
 9161 static pt_entry_t *
 9162 pmap_large_map_pte(vm_offset_t va)
 9163 {
 9164         pd_entry_t *pde;
 9165         vm_page_t m;
 9166         vm_paddr_t mphys;
 9167 
 9168 retry:
 9169         pde = pmap_large_map_pde(va);
 9170         if (*pde == 0) {
 9171                 m = pmap_large_map_getptp();
 9172                 if (m == NULL)
 9173                         goto retry;
 9174                 mphys = VM_PAGE_TO_PHYS(m);
 9175                 *pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx;
 9176                 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->wire_count++;
 9177         } else {
 9178                 MPASS((*pde & X86_PG_PS) == 0);
 9179                 mphys = *pde & PG_FRAME;
 9180         }
 9181         return ((pt_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pte_index(va));
 9182 }
 9183 
 9184 static vm_paddr_t
 9185 pmap_large_map_kextract(vm_offset_t va)
 9186 {
 9187         pdp_entry_t *pdpe, pdp;
 9188         pd_entry_t *pde, pd;
 9189         pt_entry_t *pte, pt;
 9190 
 9191         KASSERT(PMAP_ADDRESS_IN_LARGEMAP(va),
 9192             ("not largemap range %#lx", (u_long)va));
 9193         pdpe = pmap_large_map_pdpe(va);
 9194         pdp = *pdpe;
 9195         KASSERT((pdp & X86_PG_V) != 0,
 9196             ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va,
 9197             (u_long)pdpe, pdp));
 9198         if ((pdp & X86_PG_PS) != 0) {
 9199                 KASSERT((amd_feature & AMDID_PAGE1GB) != 0,
 9200                     ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va,
 9201                     (u_long)pdpe, pdp));
 9202                 return ((pdp & PG_PS_PDP_FRAME) | (va & PDPMASK));
 9203         }
 9204         pde = pmap_pdpe_to_pde(pdpe, va);
 9205         pd = *pde;
 9206         KASSERT((pd & X86_PG_V) != 0,
 9207             ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd));
 9208         if ((pd & X86_PG_PS) != 0)
 9209                 return ((pd & PG_PS_FRAME) | (va & PDRMASK));
 9210         pte = pmap_pde_to_pte(pde, va);
 9211         pt = *pte;
 9212         KASSERT((pt & X86_PG_V) != 0,
 9213             ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, pt));
 9214         return ((pt & PG_FRAME) | (va & PAGE_MASK));
 9215 }
 9216 
 9217 static int
 9218 pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase,
 9219     vmem_addr_t *vmem_res)
 9220 {
 9221 
 9222         /*
 9223          * Large mappings are all but static.  Consequently, there
 9224          * is no point in waiting for an earlier allocation to be
 9225          * freed.
 9226          */
 9227         return (vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN,
 9228             VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res));
 9229 }
 9230 
 9231 int
 9232 pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr,
 9233     vm_memattr_t mattr)
 9234 {
 9235         pdp_entry_t *pdpe;
 9236         pd_entry_t *pde;
 9237         pt_entry_t *pte;
 9238         vm_offset_t va, inc;
 9239         vmem_addr_t vmem_res;
 9240         vm_paddr_t pa;
 9241         int error;
 9242 
 9243         if (len == 0 || spa + len < spa)
 9244                 return (EINVAL);
 9245 
 9246         /* See if DMAP can serve. */
 9247         if (spa + len <= dmaplimit) {
 9248                 va = PHYS_TO_DMAP(spa);
 9249                 *addr = (void *)va;
 9250                 return (pmap_change_attr(va, len, mattr));
 9251         }
 9252 
 9253         /*
 9254          * No, allocate KVA.  Fit the address with best possible
 9255          * alignment for superpages.  Fall back to worse align if
 9256          * failed.
 9257          */
 9258         error = ENOMEM;
 9259         if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len,
 9260             NBPDP) >= roundup2(spa, NBPDP) + NBPDP)
 9261                 error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK,
 9262                     &vmem_res);
 9263         if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa,
 9264             NBPDR) + NBPDR)
 9265                 error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK,
 9266                     &vmem_res);
 9267         if (error != 0)
 9268                 error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res);
 9269         if (error != 0)
 9270                 return (error);
 9271 
 9272         /*
 9273          * Fill pagetable.  PG_M is not pre-set, we scan modified bits
 9274          * in the pagetable to minimize flushing.  No need to
 9275          * invalidate TLB, since we only update invalid entries.
 9276          */
 9277         PMAP_LOCK(kernel_pmap);
 9278         for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc,
 9279             len -= inc) {
 9280                 if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP &&
 9281                     (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) {
 9282                         pdpe = pmap_large_map_pdpe(va);
 9283                         MPASS(*pdpe == 0);
 9284                         *pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW |
 9285                             X86_PG_V | X86_PG_A | pg_nx |
 9286                             pmap_cache_bits(kernel_pmap, mattr, TRUE);
 9287                         inc = NBPDP;
 9288                 } else if (len >= NBPDR && (pa & PDRMASK) == 0 &&
 9289                     (va & PDRMASK) == 0) {
 9290                         pde = pmap_large_map_pde(va);
 9291                         MPASS(*pde == 0);
 9292                         *pde = pa | pg_g | X86_PG_PS | X86_PG_RW |
 9293                             X86_PG_V | X86_PG_A | pg_nx |
 9294                             pmap_cache_bits(kernel_pmap, mattr, TRUE);
 9295                         PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->
 9296                             wire_count++;
 9297                         inc = NBPDR;
 9298                 } else {
 9299                         pte = pmap_large_map_pte(va);
 9300                         MPASS(*pte == 0);
 9301                         *pte = pa | pg_g | X86_PG_RW | X86_PG_V |
 9302                             X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap,
 9303                             mattr, FALSE);
 9304                         PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))->
 9305                             wire_count++;
 9306                         inc = PAGE_SIZE;
 9307                 }
 9308         }
 9309         PMAP_UNLOCK(kernel_pmap);
 9310         MPASS(len == 0);
 9311 
 9312         *addr = (void *)vmem_res;
 9313         return (0);
 9314 }
 9315 
 9316 void
 9317 pmap_large_unmap(void *svaa, vm_size_t len)
 9318 {
 9319         vm_offset_t sva, va;
 9320         vm_size_t inc;
 9321         pdp_entry_t *pdpe, pdp;
 9322         pd_entry_t *pde, pd;
 9323         pt_entry_t *pte;
 9324         vm_page_t m;
 9325         struct spglist spgf;
 9326 
 9327         sva = (vm_offset_t)svaa;
 9328         if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS &&
 9329             sva + len <= DMAP_MIN_ADDRESS + dmaplimit))
 9330                 return;
 9331 
 9332         SLIST_INIT(&spgf);
 9333         KASSERT(PMAP_ADDRESS_IN_LARGEMAP(sva) &&
 9334             PMAP_ADDRESS_IN_LARGEMAP(sva + len - 1),
 9335             ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len));
 9336         PMAP_LOCK(kernel_pmap);
 9337         for (va = sva; va < sva + len; va += inc) {
 9338                 pdpe = pmap_large_map_pdpe(va);
 9339                 pdp = *pdpe;
 9340                 KASSERT((pdp & X86_PG_V) != 0,
 9341                     ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va,
 9342                     (u_long)pdpe, pdp));
 9343                 if ((pdp & X86_PG_PS) != 0) {
 9344                         KASSERT((amd_feature & AMDID_PAGE1GB) != 0,
 9345                             ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va,
 9346                             (u_long)pdpe, pdp));
 9347                         KASSERT((va & PDPMASK) == 0,
 9348                             ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va,
 9349                             (u_long)pdpe, pdp));
 9350                         KASSERT(va + NBPDP <= sva + len,
 9351                             ("unmap covers partial 1GB page, sva %#lx va %#lx "
 9352                             "pdpe %#lx pdp %#lx len %#lx", sva, va,
 9353                             (u_long)pdpe, pdp, len));
 9354                         *pdpe = 0;
 9355                         inc = NBPDP;
 9356                         continue;
 9357                 }
 9358                 pde = pmap_pdpe_to_pde(pdpe, va);
 9359                 pd = *pde;
 9360                 KASSERT((pd & X86_PG_V) != 0,
 9361                     ("invalid pd va %#lx pde %#lx pd %#lx", va,
 9362                     (u_long)pde, pd));
 9363                 if ((pd & X86_PG_PS) != 0) {
 9364                         KASSERT((va & PDRMASK) == 0,
 9365                             ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va,
 9366                             (u_long)pde, pd));
 9367                         KASSERT(va + NBPDR <= sva + len,
 9368                             ("unmap covers partial 2MB page, sva %#lx va %#lx "
 9369                             "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde,
 9370                             pd, len));
 9371                         pde_store(pde, 0);
 9372                         inc = NBPDR;
 9373                         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde));
 9374                         m->wire_count--;
 9375                         if (m->wire_count == 0) {
 9376                                 *pdpe = 0;
 9377                                 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
 9378                         }
 9379                         continue;
 9380                 }
 9381                 pte = pmap_pde_to_pte(pde, va);
 9382                 KASSERT((*pte & X86_PG_V) != 0,
 9383                     ("invalid pte va %#lx pte %#lx pt %#lx", va,
 9384                     (u_long)pte, *pte));
 9385                 pte_clear(pte);
 9386                 inc = PAGE_SIZE;
 9387                 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte));
 9388                 m->wire_count--;
 9389                 if (m->wire_count == 0) {
 9390                         *pde = 0;
 9391                         SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
 9392                         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde));
 9393                         m->wire_count--;
 9394                         if (m->wire_count == 0) {
 9395                                 *pdpe = 0;
 9396                                 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
 9397                         }
 9398                 }
 9399         }
 9400         pmap_invalidate_range(kernel_pmap, sva, sva + len);
 9401         PMAP_UNLOCK(kernel_pmap);
 9402         vm_page_free_pages_toq(&spgf, false);
 9403         vmem_free(large_vmem, sva, len);
 9404 }
 9405 
 9406 static void
 9407 pmap_large_map_wb_fence_mfence(void)
 9408 {
 9409 
 9410         mfence();
 9411 }
 9412 
 9413 static void
 9414 pmap_large_map_wb_fence_atomic(void)
 9415 {
 9416 
 9417         atomic_thread_fence_seq_cst();
 9418 }
 9419 
 9420 static void
 9421 pmap_large_map_wb_fence_nop(void)
 9422 {
 9423 }
 9424 
 9425 DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void), static)
 9426 {
 9427 
 9428         if (cpu_vendor_id != CPU_VENDOR_INTEL)
 9429                 return (pmap_large_map_wb_fence_mfence);
 9430         else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB |
 9431             CPUID_STDEXT_CLFLUSHOPT)) == 0)
 9432                 return (pmap_large_map_wb_fence_atomic);
 9433         else
 9434                 /* clflush is strongly enough ordered */
 9435                 return (pmap_large_map_wb_fence_nop);
 9436 }
 9437 
 9438 static void
 9439 pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len)
 9440 {
 9441 
 9442         for (; len > 0; len -= cpu_clflush_line_size,
 9443             va += cpu_clflush_line_size)
 9444                 clwb(va);
 9445 }
 9446 
 9447 static void
 9448 pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len)
 9449 {
 9450 
 9451         for (; len > 0; len -= cpu_clflush_line_size,
 9452             va += cpu_clflush_line_size)
 9453                 clflushopt(va);
 9454 }
 9455 
 9456 static void
 9457 pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len)
 9458 {
 9459 
 9460         for (; len > 0; len -= cpu_clflush_line_size,
 9461             va += cpu_clflush_line_size)
 9462                 clflush(va);
 9463 }
 9464 
 9465 static void
 9466 pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused)
 9467 {
 9468 }
 9469 
 9470 DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t),
 9471     static)
 9472 {
 9473 
 9474         if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0)
 9475                 return (pmap_large_map_flush_range_clwb);
 9476         else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0)
 9477                 return (pmap_large_map_flush_range_clflushopt);
 9478         else if ((cpu_feature & CPUID_CLFSH) != 0)
 9479                 return (pmap_large_map_flush_range_clflush);
 9480         else
 9481                 return (pmap_large_map_flush_range_nop);
 9482 }
 9483 
 9484 static void
 9485 pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva)
 9486 {
 9487         volatile u_long *pe;
 9488         u_long p;
 9489         vm_offset_t va;
 9490         vm_size_t inc;
 9491         bool seen_other;
 9492 
 9493         for (va = sva; va < eva; va += inc) {
 9494                 inc = 0;
 9495                 if ((amd_feature & AMDID_PAGE1GB) != 0) {
 9496                         pe = (volatile u_long *)pmap_large_map_pdpe(va);
 9497                         p = *pe;
 9498                         if ((p & X86_PG_PS) != 0)
 9499                                 inc = NBPDP;
 9500                 }
 9501                 if (inc == 0) {
 9502                         pe = (volatile u_long *)pmap_large_map_pde(va);
 9503                         p = *pe;
 9504                         if ((p & X86_PG_PS) != 0)
 9505                                 inc = NBPDR;
 9506                 }
 9507                 if (inc == 0) {
 9508                         pe = (volatile u_long *)pmap_large_map_pte(va);
 9509                         p = *pe;
 9510                         inc = PAGE_SIZE;
 9511                 }
 9512                 seen_other = false;
 9513                 for (;;) {
 9514                         if ((p & X86_PG_AVAIL1) != 0) {
 9515                                 /*
 9516                                  * Spin-wait for the end of a parallel
 9517                                  * write-back.
 9518                                  */
 9519                                 cpu_spinwait();
 9520                                 p = *pe;
 9521 
 9522                                 /*
 9523                                  * If we saw other write-back
 9524                                  * occuring, we cannot rely on PG_M to
 9525                                  * indicate state of the cache.  The
 9526                                  * PG_M bit is cleared before the
 9527                                  * flush to avoid ignoring new writes,
 9528                                  * and writes which are relevant for
 9529                                  * us might happen after.
 9530                                  */
 9531                                 seen_other = true;
 9532                                 continue;
 9533                         }
 9534 
 9535                         if ((p & X86_PG_M) != 0 || seen_other) {
 9536                                 if (!atomic_fcmpset_long(pe, &p,
 9537                                     (p & ~X86_PG_M) | X86_PG_AVAIL1))
 9538                                         /*
 9539                                          * If we saw PG_M without
 9540                                          * PG_AVAIL1, and then on the
 9541                                          * next attempt we do not
 9542                                          * observe either PG_M or
 9543                                          * PG_AVAIL1, the other
 9544                                          * write-back started after us
 9545                                          * and finished before us.  We
 9546                                          * can rely on it doing our
 9547                                          * work.
 9548                                          */
 9549                                         continue;
 9550                                 pmap_large_map_flush_range(va, inc);
 9551                                 atomic_clear_long(pe, X86_PG_AVAIL1);
 9552                         }
 9553                         break;
 9554                 }
 9555                 maybe_yield();
 9556         }
 9557 }
 9558 
 9559 /*
 9560  * Write-back cache lines for the given address range.
 9561  *
 9562  * Must be called only on the range or sub-range returned from
 9563  * pmap_large_map().  Must not be called on the coalesced ranges.
 9564  *
 9565  * Does nothing on CPUs without CLWB, CLFLUSHOPT, or CLFLUSH
 9566  * instructions support.
 9567  */
 9568 void
 9569 pmap_large_map_wb(void *svap, vm_size_t len)
 9570 {
 9571         vm_offset_t eva, sva;
 9572 
 9573         sva = (vm_offset_t)svap;
 9574         eva = sva + len;
 9575         pmap_large_map_wb_fence();
 9576         if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) {
 9577                 pmap_large_map_flush_range(sva, len);
 9578         } else {
 9579                 KASSERT(sva >= LARGEMAP_MIN_ADDRESS &&
 9580                     eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4,
 9581                     ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len));
 9582                 pmap_large_map_wb_large(sva, eva);
 9583         }
 9584         pmap_large_map_wb_fence();
 9585 }
 9586 
 9587 static vm_page_t
 9588 pmap_pti_alloc_page(void)
 9589 {
 9590         vm_page_t m;
 9591 
 9592         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 9593         m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_NOBUSY |
 9594             VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 9595         return (m);
 9596 }
 9597 
 9598 static bool
 9599 pmap_pti_free_page(vm_page_t m)
 9600 {
 9601 
 9602         KASSERT(m->wire_count > 0, ("page %p not wired", m));
 9603         if (!vm_page_unwire_noq(m))
 9604                 return (false);
 9605         vm_page_free_zero(m);
 9606         return (true);
 9607 }
 9608 
 9609 static void
 9610 pmap_pti_init(void)
 9611 {
 9612         vm_page_t pml4_pg;
 9613         pdp_entry_t *pdpe;
 9614         vm_offset_t va;
 9615         int i;
 9616 
 9617         if (!pti)
 9618                 return;
 9619         pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL);
 9620         VM_OBJECT_WLOCK(pti_obj);
 9621         pml4_pg = pmap_pti_alloc_page();
 9622         pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg));
 9623         for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS &&
 9624             va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) {
 9625                 pdpe = pmap_pti_pdpe(va);
 9626                 pmap_pti_wire_pte(pdpe);
 9627         }
 9628         pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0],
 9629             (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false);
 9630         pmap_pti_add_kva_locked((vm_offset_t)gdt, (vm_offset_t)gdt +
 9631             sizeof(struct user_segment_descriptor) * NGDT * MAXCPU, false);
 9632         pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt +
 9633             sizeof(struct gate_descriptor) * NIDT, false);
 9634         pmap_pti_add_kva_locked((vm_offset_t)common_tss,
 9635             (vm_offset_t)common_tss + sizeof(struct amd64tss) * MAXCPU, false);
 9636         CPU_FOREACH(i) {
 9637                 /* Doublefault stack IST 1 */
 9638                 va = common_tss[i].tss_ist1;
 9639                 pmap_pti_add_kva_locked(va - DBLFAULT_STACK_SIZE, va, false);
 9640                 /* NMI stack IST 2 */
 9641                 va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu);
 9642                 pmap_pti_add_kva_locked(va - NMI_STACK_SIZE, va, false);
 9643                 /* MC# stack IST 3 */
 9644                 va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu);
 9645                 pmap_pti_add_kva_locked(va - MCE_STACK_SIZE, va, false);
 9646                 /* DB# stack IST 4 */
 9647                 va = common_tss[i].tss_ist4 + sizeof(struct nmi_pcpu);
 9648                 pmap_pti_add_kva_locked(va - DBG_STACK_SIZE, va, false);
 9649         }
 9650         pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE,
 9651             (vm_offset_t)etext, true);
 9652         pti_finalized = true;
 9653         VM_OBJECT_WUNLOCK(pti_obj);
 9654 }
 9655 SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL);
 9656 
 9657 static pdp_entry_t *
 9658 pmap_pti_pdpe(vm_offset_t va)
 9659 {
 9660         pml4_entry_t *pml4e;
 9661         pdp_entry_t *pdpe;
 9662         vm_page_t m;
 9663         vm_pindex_t pml4_idx;
 9664         vm_paddr_t mphys;
 9665 
 9666         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 9667 
 9668         pml4_idx = pmap_pml4e_index(va);
 9669         pml4e = &pti_pml4[pml4_idx];
 9670         m = NULL;
 9671         if (*pml4e == 0) {
 9672                 if (pti_finalized)
 9673                         panic("pml4 alloc after finalization\n");
 9674                 m = pmap_pti_alloc_page();
 9675                 if (*pml4e != 0) {
 9676                         pmap_pti_free_page(m);
 9677                         mphys = *pml4e & ~PAGE_MASK;
 9678                 } else {
 9679                         mphys = VM_PAGE_TO_PHYS(m);
 9680                         *pml4e = mphys | X86_PG_RW | X86_PG_V;
 9681                 }
 9682         } else {
 9683                 mphys = *pml4e & ~PAGE_MASK;
 9684         }
 9685         pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va);
 9686         return (pdpe);
 9687 }
 9688 
 9689 static void
 9690 pmap_pti_wire_pte(void *pte)
 9691 {
 9692         vm_page_t m;
 9693 
 9694         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 9695         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
 9696         m->wire_count++;
 9697 }
 9698 
 9699 static void
 9700 pmap_pti_unwire_pde(void *pde, bool only_ref)
 9701 {
 9702         vm_page_t m;
 9703 
 9704         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 9705         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde));
 9706         MPASS(m->wire_count > 0);
 9707         MPASS(only_ref || m->wire_count > 1);
 9708         pmap_pti_free_page(m);
 9709 }
 9710 
 9711 static void
 9712 pmap_pti_unwire_pte(void *pte, vm_offset_t va)
 9713 {
 9714         vm_page_t m;
 9715         pd_entry_t *pde;
 9716 
 9717         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 9718         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
 9719         MPASS(m->wire_count > 0);
 9720         if (pmap_pti_free_page(m)) {
 9721                 pde = pmap_pti_pde(va);
 9722                 MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V);
 9723                 *pde = 0;
 9724                 pmap_pti_unwire_pde(pde, false);
 9725         }
 9726 }
 9727 
 9728 static pd_entry_t *
 9729 pmap_pti_pde(vm_offset_t va)
 9730 {
 9731         pdp_entry_t *pdpe;
 9732         pd_entry_t *pde;
 9733         vm_page_t m;
 9734         vm_pindex_t pd_idx;
 9735         vm_paddr_t mphys;
 9736 
 9737         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 9738 
 9739         pdpe = pmap_pti_pdpe(va);
 9740         if (*pdpe == 0) {
 9741                 m = pmap_pti_alloc_page();
 9742                 if (*pdpe != 0) {
 9743                         pmap_pti_free_page(m);
 9744                         MPASS((*pdpe & X86_PG_PS) == 0);
 9745                         mphys = *pdpe & ~PAGE_MASK;
 9746                 } else {
 9747                         mphys =  VM_PAGE_TO_PHYS(m);
 9748                         *pdpe = mphys | X86_PG_RW | X86_PG_V;
 9749                 }
 9750         } else {
 9751                 MPASS((*pdpe & X86_PG_PS) == 0);
 9752                 mphys = *pdpe & ~PAGE_MASK;
 9753         }
 9754 
 9755         pde = (pd_entry_t *)PHYS_TO_DMAP(mphys);
 9756         pd_idx = pmap_pde_index(va);
 9757         pde += pd_idx;
 9758         return (pde);
 9759 }
 9760 
 9761 static pt_entry_t *
 9762 pmap_pti_pte(vm_offset_t va, bool *unwire_pde)
 9763 {
 9764         pd_entry_t *pde;
 9765         pt_entry_t *pte;
 9766         vm_page_t m;
 9767         vm_paddr_t mphys;
 9768 
 9769         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 9770 
 9771         pde = pmap_pti_pde(va);
 9772         if (unwire_pde != NULL) {
 9773                 *unwire_pde = true;
 9774                 pmap_pti_wire_pte(pde);
 9775         }
 9776         if (*pde == 0) {
 9777                 m = pmap_pti_alloc_page();
 9778                 if (*pde != 0) {
 9779                         pmap_pti_free_page(m);
 9780                         MPASS((*pde & X86_PG_PS) == 0);
 9781                         mphys = *pde & ~(PAGE_MASK | pg_nx);
 9782                 } else {
 9783                         mphys = VM_PAGE_TO_PHYS(m);
 9784                         *pde = mphys | X86_PG_RW | X86_PG_V;
 9785                         if (unwire_pde != NULL)
 9786                                 *unwire_pde = false;
 9787                 }
 9788         } else {
 9789                 MPASS((*pde & X86_PG_PS) == 0);
 9790                 mphys = *pde & ~(PAGE_MASK | pg_nx);
 9791         }
 9792 
 9793         pte = (pt_entry_t *)PHYS_TO_DMAP(mphys);
 9794         pte += pmap_pte_index(va);
 9795 
 9796         return (pte);
 9797 }
 9798 
 9799 static void
 9800 pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec)
 9801 {
 9802         vm_paddr_t pa;
 9803         pd_entry_t *pde;
 9804         pt_entry_t *pte, ptev;
 9805         bool unwire_pde;
 9806 
 9807         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 9808 
 9809         sva = trunc_page(sva);
 9810         MPASS(sva > VM_MAXUSER_ADDRESS);
 9811         eva = round_page(eva);
 9812         MPASS(sva < eva);
 9813         for (; sva < eva; sva += PAGE_SIZE) {
 9814                 pte = pmap_pti_pte(sva, &unwire_pde);
 9815                 pa = pmap_kextract(sva);
 9816                 ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G |
 9817                     (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap,
 9818                     VM_MEMATTR_DEFAULT, FALSE);
 9819                 if (*pte == 0) {
 9820                         pte_store(pte, ptev);
 9821                         pmap_pti_wire_pte(pte);
 9822                 } else {
 9823                         KASSERT(!pti_finalized,
 9824                             ("pti overlap after fin %#lx %#lx %#lx",
 9825                             sva, *pte, ptev));
 9826                         KASSERT(*pte == ptev,
 9827                             ("pti non-identical pte after fin %#lx %#lx %#lx",
 9828                             sva, *pte, ptev));
 9829                 }
 9830                 if (unwire_pde) {
 9831                         pde = pmap_pti_pde(sva);
 9832                         pmap_pti_unwire_pde(pde, true);
 9833                 }
 9834         }
 9835 }
 9836 
 9837 void
 9838 pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec)
 9839 {
 9840 
 9841         if (!pti)
 9842                 return;
 9843         VM_OBJECT_WLOCK(pti_obj);
 9844         pmap_pti_add_kva_locked(sva, eva, exec);
 9845         VM_OBJECT_WUNLOCK(pti_obj);
 9846 }
 9847 
 9848 void
 9849 pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva)
 9850 {
 9851         pt_entry_t *pte;
 9852         vm_offset_t va;
 9853 
 9854         if (!pti)
 9855                 return;
 9856         sva = rounddown2(sva, PAGE_SIZE);
 9857         MPASS(sva > VM_MAXUSER_ADDRESS);
 9858         eva = roundup2(eva, PAGE_SIZE);
 9859         MPASS(sva < eva);
 9860         VM_OBJECT_WLOCK(pti_obj);
 9861         for (va = sva; va < eva; va += PAGE_SIZE) {
 9862                 pte = pmap_pti_pte(va, NULL);
 9863                 KASSERT((*pte & X86_PG_V) != 0,
 9864                     ("invalid pte va %#lx pte %#lx pt %#lx", va,
 9865                     (u_long)pte, *pte));
 9866                 pte_clear(pte);
 9867                 pmap_pti_unwire_pte(pte, va);
 9868         }
 9869         pmap_invalidate_range(kernel_pmap, sva, eva);
 9870         VM_OBJECT_WUNLOCK(pti_obj);
 9871 }
 9872 
 9873 static void *
 9874 pkru_dup_range(void *ctx __unused, void *data)
 9875 {
 9876         struct pmap_pkru_range *node, *new_node;
 9877 
 9878         new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
 9879         if (new_node == NULL)
 9880                 return (NULL);
 9881         node = data;
 9882         memcpy(new_node, node, sizeof(*node));
 9883         return (new_node);
 9884 }
 9885 
 9886 static void
 9887 pkru_free_range(void *ctx __unused, void *node)
 9888 {
 9889 
 9890         uma_zfree(pmap_pkru_ranges_zone, node);
 9891 }
 9892 
 9893 static int
 9894 pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
 9895     int flags)
 9896 {
 9897         struct pmap_pkru_range *ppr;
 9898         int error;
 9899 
 9900         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 9901         MPASS(pmap->pm_type == PT_X86);
 9902         MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
 9903         if ((flags & AMD64_PKRU_EXCL) != 0 &&
 9904             !rangeset_check_empty(&pmap->pm_pkru, sva, eva))
 9905                 return (EBUSY);
 9906         ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
 9907         if (ppr == NULL)
 9908                 return (ENOMEM);
 9909         ppr->pkru_keyidx = keyidx;
 9910         ppr->pkru_flags = flags & AMD64_PKRU_PERSIST;
 9911         error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr);
 9912         if (error != 0)
 9913                 uma_zfree(pmap_pkru_ranges_zone, ppr);
 9914         return (error);
 9915 }
 9916 
 9917 static int
 9918 pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 9919 {
 9920 
 9921         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 9922         MPASS(pmap->pm_type == PT_X86);
 9923         MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
 9924         return (rangeset_remove(&pmap->pm_pkru, sva, eva));
 9925 }
 9926 
 9927 static void
 9928 pmap_pkru_deassign_all(pmap_t pmap)
 9929 {
 9930 
 9931         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 9932         if (pmap->pm_type == PT_X86 &&
 9933             (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
 9934                 rangeset_remove_all(&pmap->pm_pkru);
 9935 }
 9936 
 9937 static bool
 9938 pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 9939 {
 9940         struct pmap_pkru_range *ppr, *prev_ppr;
 9941         vm_offset_t va;
 9942 
 9943         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 9944         if (pmap->pm_type != PT_X86 ||
 9945             (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
 9946             sva >= VM_MAXUSER_ADDRESS)
 9947                 return (true);
 9948         MPASS(eva <= VM_MAXUSER_ADDRESS);
 9949         for (va = sva, prev_ppr = NULL; va < eva;) {
 9950                 ppr = rangeset_lookup(&pmap->pm_pkru, va);
 9951                 if ((ppr == NULL) ^ (prev_ppr == NULL))
 9952                         return (false);
 9953                 if (ppr == NULL) {
 9954                         va += PAGE_SIZE;
 9955                         continue;
 9956                 }
 9957                 if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx)
 9958                         return (false);
 9959                 va = ppr->pkru_rs_el.re_end;
 9960         }
 9961         return (true);
 9962 }
 9963 
 9964 static pt_entry_t
 9965 pmap_pkru_get(pmap_t pmap, vm_offset_t va)
 9966 {
 9967         struct pmap_pkru_range *ppr;
 9968 
 9969         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 9970         if (pmap->pm_type != PT_X86 ||
 9971             (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
 9972             va >= VM_MAXUSER_ADDRESS)
 9973                 return (0);
 9974         ppr = rangeset_lookup(&pmap->pm_pkru, va);
 9975         if (ppr != NULL)
 9976                 return (X86_PG_PKU(ppr->pkru_keyidx));
 9977         return (0);
 9978 }
 9979 
 9980 static bool
 9981 pred_pkru_on_remove(void *ctx __unused, void *r)
 9982 {
 9983         struct pmap_pkru_range *ppr;
 9984 
 9985         ppr = r;
 9986         return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0);
 9987 }
 9988 
 9989 static void
 9990 pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 9991 {
 9992 
 9993         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 9994         if (pmap->pm_type == PT_X86 &&
 9995             (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
 9996                 rangeset_remove_pred(&pmap->pm_pkru, sva, eva,
 9997                     pred_pkru_on_remove);
 9998         }
 9999 }
10000 
10001 static int
10002 pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap)
10003 {
10004 
10005         PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
10006         PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
10007         MPASS(dst_pmap->pm_type == PT_X86);
10008         MPASS(src_pmap->pm_type == PT_X86);
10009         MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
10010         if (src_pmap->pm_pkru.rs_data_ctx == NULL)
10011                 return (0);
10012         return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru));
10013 }
10014 
10015 static void
10016 pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
10017     u_int keyidx)
10018 {
10019         pml4_entry_t *pml4e;
10020         pdp_entry_t *pdpe;
10021         pd_entry_t newpde, ptpaddr, *pde;
10022         pt_entry_t newpte, *ptep, pte;
10023         vm_offset_t va, va_next;
10024         bool changed;
10025 
10026         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
10027         MPASS(pmap->pm_type == PT_X86);
10028         MPASS(keyidx <= PMAP_MAX_PKRU_IDX);
10029 
10030         for (changed = false, va = sva; va < eva; va = va_next) {
10031                 pml4e = pmap_pml4e(pmap, va);
10032                 if ((*pml4e & X86_PG_V) == 0) {
10033                         va_next = (va + NBPML4) & ~PML4MASK;
10034                         if (va_next < va)
10035                                 va_next = eva;
10036                         continue;
10037                 }
10038 
10039                 pdpe = pmap_pml4e_to_pdpe(pml4e, va);
10040                 if ((*pdpe & X86_PG_V) == 0) {
10041                         va_next = (va + NBPDP) & ~PDPMASK;
10042                         if (va_next < va)
10043                                 va_next = eva;
10044                         continue;
10045                 }
10046 
10047                 va_next = (va + NBPDR) & ~PDRMASK;
10048                 if (va_next < va)
10049                         va_next = eva;
10050 
10051                 pde = pmap_pdpe_to_pde(pdpe, va);
10052                 ptpaddr = *pde;
10053                 if (ptpaddr == 0)
10054                         continue;
10055 
10056                 MPASS((ptpaddr & X86_PG_V) != 0);
10057                 if ((ptpaddr & PG_PS) != 0) {
10058                         if (va + NBPDR == va_next && eva >= va_next) {
10059                                 newpde = (ptpaddr & ~X86_PG_PKU_MASK) |
10060                                     X86_PG_PKU(keyidx);
10061                                 if (newpde != ptpaddr) {
10062                                         *pde = newpde;
10063                                         changed = true;
10064                                 }
10065                                 continue;
10066                         } else if (!pmap_demote_pde(pmap, pde, va)) {
10067                                 continue;
10068                         }
10069                 }
10070 
10071                 if (va_next > eva)
10072                         va_next = eva;
10073 
10074                 for (ptep = pmap_pde_to_pte(pde, va); va != va_next;
10075                     ptep++, va += PAGE_SIZE) {
10076                         pte = *ptep;
10077                         if ((pte & X86_PG_V) == 0)
10078                                 continue;
10079                         newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx);
10080                         if (newpte != pte) {
10081                                 *ptep = newpte;
10082                                 changed = true;
10083                         }
10084                 }
10085         }
10086         if (changed)
10087                 pmap_invalidate_range(pmap, sva, eva);
10088 }
10089 
10090 static int
10091 pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
10092     u_int keyidx, int flags)
10093 {
10094 
10095         if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX ||
10096             (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0)
10097                 return (EINVAL);
10098         if (eva <= sva || eva > VM_MAXUSER_ADDRESS)
10099                 return (EFAULT);
10100         if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
10101                 return (ENOTSUP);
10102         return (0);
10103 }
10104 
10105 int
10106 pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
10107     int flags)
10108 {
10109         int error;
10110 
10111         sva = trunc_page(sva);
10112         eva = round_page(eva);
10113         error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags);
10114         if (error != 0)
10115                 return (error);
10116         for (;;) {
10117                 PMAP_LOCK(pmap);
10118                 error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags);
10119                 if (error == 0)
10120                         pmap_pkru_update_range(pmap, sva, eva, keyidx);
10121                 PMAP_UNLOCK(pmap);
10122                 if (error != ENOMEM)
10123                         break;
10124                 vm_wait(NULL);
10125         }
10126         return (error);
10127 }
10128 
10129 int
10130 pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
10131 {
10132         int error;
10133 
10134         sva = trunc_page(sva);
10135         eva = round_page(eva);
10136         error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0);
10137         if (error != 0)
10138                 return (error);
10139         for (;;) {
10140                 PMAP_LOCK(pmap);
10141                 error = pmap_pkru_deassign(pmap, sva, eva);
10142                 if (error == 0)
10143                         pmap_pkru_update_range(pmap, sva, eva, 0);
10144                 PMAP_UNLOCK(pmap);
10145                 if (error != ENOMEM)
10146                         break;
10147                 vm_wait(NULL);
10148         }
10149         return (error);
10150 }
10151 
10152 /*
10153  * Track a range of the kernel's virtual address space that is contiguous
10154  * in various mapping attributes.
10155  */
10156 struct pmap_kernel_map_range {
10157         vm_offset_t sva;
10158         pt_entry_t attrs;
10159         int ptes;
10160         int pdes;
10161         int pdpes;
10162 };
10163 
10164 static void
10165 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
10166     vm_offset_t eva)
10167 {
10168         const char *mode;
10169         int i, pat_idx;
10170 
10171         if (eva <= range->sva)
10172                 return;
10173 
10174         pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true);
10175         for (i = 0; i < PAT_INDEX_SIZE; i++)
10176                 if (pat_index[i] == pat_idx)
10177                         break;
10178 
10179         switch (i) {
10180         case PAT_WRITE_BACK:
10181                 mode = "WB";
10182                 break;
10183         case PAT_WRITE_THROUGH:
10184                 mode = "WT";
10185                 break;
10186         case PAT_UNCACHEABLE:
10187                 mode = "UC";
10188                 break;
10189         case PAT_UNCACHED:
10190                 mode = "U-";
10191                 break;
10192         case PAT_WRITE_PROTECTED:
10193                 mode = "WP";
10194                 break;
10195         case PAT_WRITE_COMBINING:
10196                 mode = "WC";
10197                 break;
10198         default:
10199                 printf("%s: unknown PAT mode %#x for range 0x%016lx-0x%016lx\n",
10200                     __func__, pat_idx, range->sva, eva);
10201                 mode = "??";
10202                 break;
10203         }
10204 
10205         sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n",
10206             range->sva, eva,
10207             (range->attrs & X86_PG_RW) != 0 ? 'w' : '-',
10208             (range->attrs & pg_nx) != 0 ? '-' : 'x',
10209             (range->attrs & X86_PG_U) != 0 ? 'u' : 's',
10210             (range->attrs & X86_PG_G) != 0 ? 'g' : '-',
10211             mode, range->pdpes, range->pdes, range->ptes);
10212 
10213         /* Reset to sentinel value. */
10214         range->sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1);
10215 }
10216 
10217 /*
10218  * Determine whether the attributes specified by a page table entry match those
10219  * being tracked by the current range.  This is not quite as simple as a direct
10220  * flag comparison since some PAT modes have multiple representations.
10221  */
10222 static bool
10223 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
10224 {
10225         pt_entry_t diff, mask;
10226 
10227         mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx;
10228         diff = (range->attrs ^ attrs) & mask;
10229         if (diff == 0)
10230                 return (true);
10231         if ((diff & ~X86_PG_PDE_PAT) == 0 &&
10232             pmap_pat_index(kernel_pmap, range->attrs, true) ==
10233             pmap_pat_index(kernel_pmap, attrs, true))
10234                 return (true);
10235         return (false);
10236 }
10237 
10238 static void
10239 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
10240     pt_entry_t attrs)
10241 {
10242 
10243         memset(range, 0, sizeof(*range));
10244         range->sva = va;
10245         range->attrs = attrs;
10246 }
10247 
10248 /*
10249  * Given a leaf PTE, derive the mapping's attributes.  If they do not match
10250  * those of the current run, dump the address range and its attributes, and
10251  * begin a new run.
10252  */
10253 static void
10254 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
10255     vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde,
10256     pt_entry_t pte)
10257 {
10258         pt_entry_t attrs;
10259 
10260         attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);
10261 
10262         attrs |= pdpe & pg_nx;
10263         attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U));
10264         if ((pdpe & PG_PS) != 0) {
10265                 attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE);
10266         } else if (pde != 0) {
10267                 attrs |= pde & pg_nx;
10268                 attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U));
10269         }
10270         if ((pde & PG_PS) != 0) {
10271                 attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE);
10272         } else if (pte != 0) {
10273                 attrs |= pte & pg_nx;
10274                 attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U));
10275                 attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE);
10276 
10277                 /* Canonicalize by always using the PDE PAT bit. */
10278                 if ((attrs & X86_PG_PTE_PAT) != 0)
10279                         attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT;
10280         }
10281 
10282         if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
10283                 sysctl_kmaps_dump(sb, range, va);
10284                 sysctl_kmaps_reinit(range, va, attrs);
10285         }
10286 }
10287 
10288 static int
10289 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
10290 {
10291         struct pmap_kernel_map_range range;
10292         struct sbuf sbuf, *sb;
10293         pml4_entry_t pml4e;
10294         pdp_entry_t *pdp, pdpe;
10295         pd_entry_t *pd, pde;
10296         pt_entry_t *pt, pte;
10297         vm_offset_t sva;
10298         vm_paddr_t pa;
10299         int error, i, j, k, l;
10300 
10301         error = sysctl_wire_old_buffer(req, 0);
10302         if (error != 0)
10303                 return (error);
10304         sb = &sbuf;
10305         sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
10306 
10307         /* Sentinel value. */
10308         range.sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1);
10309 
10310         /*
10311          * Iterate over the kernel page tables without holding the kernel pmap
10312          * lock.  Outside of the large map, kernel page table pages are never
10313          * freed, so at worst we will observe inconsistencies in the output.
10314          * Within the large map, ensure that PDP and PD page addresses are
10315          * valid before descending.
10316          */
10317         for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) {
10318                 switch (i) {
10319                 case PML4PML4I:
10320                         sbuf_printf(sb, "\nRecursive map:\n");
10321                         break;
10322                 case DMPML4I:
10323                         sbuf_printf(sb, "\nDirect map:\n");
10324                         break;
10325                 case KPML4BASE:
10326                         sbuf_printf(sb, "\nKernel map:\n");
10327                         break;
10328                 case LMSPML4I:
10329                         sbuf_printf(sb, "\nLarge map:\n");
10330                         break;
10331                 }
10332 
10333                 /* Convert to canonical form. */
10334                 if (sva == 1ul << 47)
10335                         sva |= -1ul << 48;
10336 
10337 restart:
10338                 pml4e = kernel_pmap->pm_pml4[i];
10339                 if ((pml4e & X86_PG_V) == 0) {
10340                         sva = rounddown2(sva, NBPML4);
10341                         sysctl_kmaps_dump(sb, &range, sva);
10342                         sva += NBPML4;
10343                         continue;
10344                 }
10345                 pa = pml4e & PG_FRAME;
10346                 pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa);
10347 
10348                 for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) {
10349                         pdpe = pdp[j];
10350                         if ((pdpe & X86_PG_V) == 0) {
10351                                 sva = rounddown2(sva, NBPDP);
10352                                 sysctl_kmaps_dump(sb, &range, sva);
10353                                 sva += NBPDP;
10354                                 continue;
10355                         }
10356                         pa = pdpe & PG_FRAME;
10357                         if ((pdpe & PG_PS) != 0) {
10358                                 sva = rounddown2(sva, NBPDP);
10359                                 sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe,
10360                                     0, 0);
10361                                 range.pdpes++;
10362                                 sva += NBPDP;
10363                                 continue;
10364                         }
10365                         if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
10366                             vm_phys_paddr_to_vm_page(pa) == NULL) {
10367                                 /*
10368                                  * Page table pages for the large map may be
10369                                  * freed.  Validate the next-level address
10370                                  * before descending.
10371                                  */
10372                                 goto restart;
10373                         }
10374                         pd = (pd_entry_t *)PHYS_TO_DMAP(pa);
10375 
10376                         for (k = pmap_pde_index(sva); k < NPDEPG; k++) {
10377                                 pde = pd[k];
10378                                 if ((pde & X86_PG_V) == 0) {
10379                                         sva = rounddown2(sva, NBPDR);
10380                                         sysctl_kmaps_dump(sb, &range, sva);
10381                                         sva += NBPDR;
10382                                         continue;
10383                                 }
10384                                 pa = pde & PG_FRAME;
10385                                 if ((pde & PG_PS) != 0) {
10386                                         sva = rounddown2(sva, NBPDR);
10387                                         sysctl_kmaps_check(sb, &range, sva,
10388                                             pml4e, pdpe, pde, 0);
10389                                         range.pdes++;
10390                                         sva += NBPDR;
10391                                         continue;
10392                                 }
10393                                 if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
10394                                     vm_phys_paddr_to_vm_page(pa) == NULL) {
10395                                         /*
10396                                          * Page table pages for the large map
10397                                          * may be freed.  Validate the
10398                                          * next-level address before descending.
10399                                          */
10400                                         goto restart;
10401                                 }
10402                                 pt = (pt_entry_t *)PHYS_TO_DMAP(pa);
10403 
10404                                 for (l = pmap_pte_index(sva); l < NPTEPG; l++,
10405                                     sva += PAGE_SIZE) {
10406                                         pte = pt[l];
10407                                         if ((pte & X86_PG_V) == 0) {
10408                                                 sysctl_kmaps_dump(sb, &range,
10409                                                     sva);
10410                                                 continue;
10411                                         }
10412                                         sysctl_kmaps_check(sb, &range, sva,
10413                                             pml4e, pdpe, pde, pte);
10414                                         range.ptes++;
10415                                 }
10416                         }
10417                 }
10418         }
10419 
10420         error = sbuf_finish(sb);
10421         sbuf_delete(sb);
10422         return (error);
10423 }
10424 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
10425     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
10426     NULL, 0, sysctl_kmaps, "A",
10427     "Dump kernel address layout");
10428 
10429 #ifdef DDB
10430 DB_SHOW_COMMAND(pte, pmap_print_pte)
10431 {
10432         pmap_t pmap;
10433         pml4_entry_t *pml4;
10434         pdp_entry_t *pdp;
10435         pd_entry_t *pde;
10436         pt_entry_t *pte, PG_V;
10437         vm_offset_t va;
10438 
10439         if (!have_addr) {
10440                 db_printf("show pte addr\n");
10441                 return;
10442         }
10443         va = (vm_offset_t)addr;
10444 
10445         if (kdb_thread != NULL)
10446                 pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
10447         else
10448                 pmap = PCPU_GET(curpmap);
10449 
10450         PG_V = pmap_valid_bit(pmap);
10451         pml4 = pmap_pml4e(pmap, va);
10452         db_printf("VA 0x%016lx pml4e 0x%016lx", va, *pml4);
10453         if ((*pml4 & PG_V) == 0) {
10454                 db_printf("\n");
10455                 return;
10456         }
10457         pdp = pmap_pml4e_to_pdpe(pml4, va);
10458         db_printf(" pdpe 0x%016lx", *pdp);
10459         if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
10460                 db_printf("\n");
10461                 return;
10462         }
10463         pde = pmap_pdpe_to_pde(pdp, va);
10464         db_printf(" pde 0x%016lx", *pde);
10465         if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
10466                 db_printf("\n");
10467                 return;
10468         }
10469         pte = pmap_pde_to_pte(pde, va);
10470         db_printf(" pte 0x%016lx\n", *pte);
10471 }
10472 
10473 DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
10474 {
10475         vm_paddr_t a;
10476 
10477         if (have_addr) {
10478                 a = (vm_paddr_t)addr;
10479                 db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
10480         } else {
10481                 db_printf("show phys2dmap addr\n");
10482         }
10483 }
10484 #endif

Cache object: 9f6b25a88312e59b5b8ea2b14a370149


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.