FreeBSD/Linux Kernel Cross Reference
sys/amd64/amd64/pmap.c


    1 /*-
    2  * Copyright (c) 1991 Regents of the University of California.
    3  * All rights reserved.
    4  * Copyright (c) 1994 John S. Dyson
    5  * All rights reserved.
    6  * Copyright (c) 1994 David Greenman
    7  * All rights reserved.
    8  * Copyright (c) 2003 Peter Wemm
    9  * All rights reserved.
   10  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
   11  * All rights reserved.
   12  * Copyright (c) 2014-2018 The FreeBSD Foundation
   13  * All rights reserved.
   14  *
   15  * This code is derived from software contributed to Berkeley by
   16  * the Systems Programming Group of the University of Utah Computer
   17  * Science Department and William Jolitz of UUNET Technologies Inc.
   18  *
   19  * Portions of this software were developed by
   20  * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
   21  * the FreeBSD Foundation.
   22  *
   23  * Redistribution and use in source and binary forms, with or without
   24  * modification, are permitted provided that the following conditions
   25  * are met:
   26  * 1. Redistributions of source code must retain the above copyright
   27  *    notice, this list of conditions and the following disclaimer.
   28  * 2. Redistributions in binary form must reproduce the above copyright
   29  *    notice, this list of conditions and the following disclaimer in the
   30  *    documentation and/or other materials provided with the distribution.
   31  * 3. All advertising materials mentioning features or use of this software
   32  *    must display the following acknowledgement:
   33  *      This product includes software developed by the University of
   34  *      California, Berkeley and its contributors.
   35  * 4. Neither the name of the University nor the names of its contributors
   36  *    may be used to endorse or promote products derived from this software
   37  *    without specific prior written permission.
   38  *
   39  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   40  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   41  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   42  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   43  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   44  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   45  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   46  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   47  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   48  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   49  * SUCH DAMAGE.
   50  *
   51  *      from:   @(#)pmap.c      7.7 (Berkeley)  5/12/91
   52  */
   53 /*-
   54  * Copyright (c) 2003 Networks Associates Technology, Inc.
   55  * All rights reserved.
   56  *
   57  * This software was developed for the FreeBSD Project by Jake Burkholder,
   58  * Safeport Network Services, and Network Associates Laboratories, the
   59  * Security Research Division of Network Associates, Inc. under
   60  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
   61  * CHATS research program.
   62  *
   63  * Redistribution and use in source and binary forms, with or without
   64  * modification, are permitted provided that the following conditions
   65  * are met:
   66  * 1. Redistributions of source code must retain the above copyright
   67  *    notice, this list of conditions and the following disclaimer.
   68  * 2. Redistributions in binary form must reproduce the above copyright
   69  *    notice, this list of conditions and the following disclaimer in the
   70  *    documentation and/or other materials provided with the distribution.
   71  *
   72  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   73  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   74  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   75  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   76  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   77  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   78  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   79  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   80  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   81  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   82  * SUCH DAMAGE.
   83  */
   84 
   85 #define AMD64_NPT_AWARE
   86 
   87 #include <sys/cdefs.h>
   88 __FBSDID("$FreeBSD: releng/11.1/sys/amd64/amd64/pmap.c 337828 2018-08-15 02:30:11Z delphij $");
   89 
   90 /*
   91  *      Manages physical address maps.
   92  *
   93  *      Since the information managed by this module is
   94  *      also stored by the logical address mapping module,
   95  *      this module may throw away valid virtual-to-physical
   96  *      mappings at almost any time.  However, invalidations
   97  *      of virtual-to-physical mappings must be done as
   98  *      requested.
   99  *
   100  *      In order to cope with hardware architectures which
   101  *      make virtual-to-physical map invalidations expensive,
   102  *      this module may delay invalidation or reduced-protection
   103  *      operations until such time as they are actually
   104  *      necessary.  This module is given full information as
   105  *      to which processors are currently using which maps
   106  *      and as to when physical maps must be made correct.
  107  */
  108 
  109 #include "opt_pmap.h"
  110 #include "opt_vm.h"
  111 
  112 #include <sys/param.h>
  113 #include <sys/bitstring.h>
  114 #include <sys/bus.h>
  115 #include <sys/systm.h>
  116 #include <sys/kernel.h>
  117 #include <sys/ktr.h>
  118 #include <sys/lock.h>
  119 #include <sys/malloc.h>
  120 #include <sys/mman.h>
  121 #include <sys/mutex.h>
  122 #include <sys/proc.h>
  123 #include <sys/rwlock.h>
  124 #include <sys/sx.h>
  125 #include <sys/turnstile.h>
  126 #include <sys/vmem.h>
  127 #include <sys/vmmeter.h>
  128 #include <sys/sched.h>
  129 #include <sys/sysctl.h>
  130 #include <sys/smp.h>
  131 
  132 #include <vm/vm.h>
  133 #include <vm/vm_param.h>
  134 #include <vm/vm_kern.h>
  135 #include <vm/vm_page.h>
  136 #include <vm/vm_map.h>
  137 #include <vm/vm_object.h>
  138 #include <vm/vm_extern.h>
  139 #include <vm/vm_pageout.h>
  140 #include <vm/vm_pager.h>
  141 #include <vm/vm_phys.h>
  142 #include <vm/vm_radix.h>
  143 #include <vm/vm_reserv.h>
  144 #include <vm/uma.h>
  145 
  146 #include <machine/intr_machdep.h>
  147 #include <x86/apicvar.h>
  148 #include <machine/cpu.h>
  149 #include <machine/cputypes.h>
  150 #include <machine/md_var.h>
  151 #include <machine/pcb.h>
  152 #include <machine/specialreg.h>
  153 #ifdef SMP
  154 #include <machine/smp.h>
  155 #endif
  156 #include <machine/tss.h>
  157 
  158 static __inline boolean_t
  159 pmap_type_guest(pmap_t pmap)
  160 {
  161 
  162         return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
  163 }
  164 
  165 static __inline boolean_t
  166 pmap_emulate_ad_bits(pmap_t pmap)
  167 {
  168 
  169         return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
  170 }
  171 
  172 static __inline pt_entry_t
  173 pmap_valid_bit(pmap_t pmap)
  174 {
  175         pt_entry_t mask;
  176 
  177         switch (pmap->pm_type) {
  178         case PT_X86:
  179         case PT_RVI:
  180                 mask = X86_PG_V;
  181                 break;
  182         case PT_EPT:
  183                 if (pmap_emulate_ad_bits(pmap))
  184                         mask = EPT_PG_EMUL_V;
  185                 else
  186                         mask = EPT_PG_READ;
  187                 break;
  188         default:
  189                 panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
  190         }
  191 
  192         return (mask);
  193 }
  194 
  195 static __inline pt_entry_t
  196 pmap_rw_bit(pmap_t pmap)
  197 {
  198         pt_entry_t mask;
  199 
  200         switch (pmap->pm_type) {
  201         case PT_X86:
  202         case PT_RVI:
  203                 mask = X86_PG_RW;
  204                 break;
  205         case PT_EPT:
  206                 if (pmap_emulate_ad_bits(pmap))
  207                         mask = EPT_PG_EMUL_RW;
  208                 else
  209                         mask = EPT_PG_WRITE;
  210                 break;
  211         default:
  212                 panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
  213         }
  214 
  215         return (mask);
  216 }
  217 
  218 static pt_entry_t pg_g;
  219 
  220 static __inline pt_entry_t
  221 pmap_global_bit(pmap_t pmap)
  222 {
  223         pt_entry_t mask;
  224 
  225         switch (pmap->pm_type) {
  226         case PT_X86:
  227                 mask = pg_g;
  228                 break;
  229         case PT_RVI:
  230         case PT_EPT:
  231                 mask = 0;
  232                 break;
  233         default:
  234                 panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
  235         }
  236 
  237         return (mask);
  238 }
  239 
  240 static __inline pt_entry_t
  241 pmap_accessed_bit(pmap_t pmap)
  242 {
  243         pt_entry_t mask;
  244 
  245         switch (pmap->pm_type) {
  246         case PT_X86:
  247         case PT_RVI:
  248                 mask = X86_PG_A;
  249                 break;
  250         case PT_EPT:
  251                 if (pmap_emulate_ad_bits(pmap))
  252                         mask = EPT_PG_READ;
  253                 else
  254                         mask = EPT_PG_A;
  255                 break;
  256         default:
  257                 panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
  258         }
  259 
  260         return (mask);
  261 }
  262 
  263 static __inline pt_entry_t
  264 pmap_modified_bit(pmap_t pmap)
  265 {
  266         pt_entry_t mask;
  267 
  268         switch (pmap->pm_type) {
  269         case PT_X86:
  270         case PT_RVI:
  271                 mask = X86_PG_M;
  272                 break;
  273         case PT_EPT:
  274                 if (pmap_emulate_ad_bits(pmap))
  275                         mask = EPT_PG_WRITE;
  276                 else
  277                         mask = EPT_PG_M;
  278                 break;
  279         default:
  280                 panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
  281         }
  282 
  283         return (mask);
  284 }
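
/*
 * Illustrative sketch, not part of the original file: the accessors
 * above exist so that generic pmap code can test page-table entries
 * without hard-coding the x86 bit layout, which differs for EPT pmaps,
 * especially when accessed/dirty bits are emulated.  A hypothetical
 * helper that checks whether an entry is a valid, writable mapping for
 * any pmap type would look like this:
 *
 *	static __inline boolean_t
 *	example_pte_is_writable(pmap_t pmap, pt_entry_t pte)
 *	{
 *		pt_entry_t PG_V, PG_RW;
 *
 *		PG_V = pmap_valid_bit(pmap);
 *		PG_RW = pmap_rw_bit(pmap);
 *		return ((pte & (PG_V | PG_RW)) == (PG_V | PG_RW));
 *	}
 */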
  285 
  286 extern  struct pcpu __pcpu[];
  287 
  288 #if !defined(DIAGNOSTIC)
  289 #ifdef __GNUC_GNU_INLINE__
  290 #define PMAP_INLINE     __attribute__((__gnu_inline__)) inline
  291 #else
  292 #define PMAP_INLINE     extern inline
  293 #endif
  294 #else
  295 #define PMAP_INLINE
  296 #endif
  297 
  298 #ifdef PV_STATS
  299 #define PV_STAT(x)      do { x ; } while (0)
  300 #else
  301 #define PV_STAT(x)      do { } while (0)
  302 #endif
  303 
  304 #define pa_index(pa)    ((pa) >> PDRSHIFT)
  305 #define pa_to_pvh(pa)   (&pv_table[pa_index(pa)])
  306 
  307 #define NPV_LIST_LOCKS  MAXCPU
  308 
  309 #define PHYS_TO_PV_LIST_LOCK(pa)        \
  310                         (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
  311 
  312 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)  do {    \
  313         struct rwlock **_lockp = (lockp);               \
  314         struct rwlock *_new_lock;                       \
  315                                                         \
  316         _new_lock = PHYS_TO_PV_LIST_LOCK(pa);           \
  317         if (_new_lock != *_lockp) {                     \
  318                 if (*_lockp != NULL)                    \
  319                         rw_wunlock(*_lockp);            \
  320                 *_lockp = _new_lock;                    \
  321                 rw_wlock(*_lockp);                      \
  322         }                                               \
  323 } while (0)
  324 
  325 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)        \
  326                         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
  327 
  328 #define RELEASE_PV_LIST_LOCK(lockp)             do {    \
  329         struct rwlock **_lockp = (lockp);               \
  330                                                         \
  331         if (*_lockp != NULL) {                          \
  332                 rw_wunlock(*_lockp);                    \
  333                 *_lockp = NULL;                         \
  334         }                                               \
  335 } while (0)
  336 
  337 #define VM_PAGE_TO_PV_LIST_LOCK(m)      \
  338                         PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
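
/*
 * Illustrative sketch, not part of the original file: the macros above
 * let code that visits pages hashing to different PV list buckets hold
 * exactly one bucket lock at a time, re-locking only when the bucket
 * changes.  Each iteration below drops the previously held lock only
 * if the new page hashes to a different bucket, and the page's PV list
 * may be examined while its bucket lock is held:
 *
 *	static void
 *	example_walk_pv_lists(vm_page_t ma[], int count)
 *	{
 *		struct rwlock *lock;
 *		int i;
 *
 *		lock = NULL;
 *		for (i = 0; i < count; i++)
 *			CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, ma[i]);
 *		RELEASE_PV_LIST_LOCK(&lock);
 *	}
 */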
  339 
  340 struct pmap kernel_pmap_store;
  341 
  342 vm_offset_t virtual_avail;      /* VA of first avail page (after kernel bss) */
  343 vm_offset_t virtual_end;        /* VA of last avail page (end of kernel AS) */
  344 
  345 int nkpt;
  346 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
  347     "Number of kernel page table pages allocated on bootup");
  348 
  349 static int ndmpdp;
  350 vm_paddr_t dmaplimit;
  351 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
  352 pt_entry_t pg_nx;
  353 
  354 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
  355 
  356 static int pat_works = 1;
  357 SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
  358     "Is page attribute table fully functional?");
  359 
  360 static int pg_ps_enabled = 1;
  361 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
  362     &pg_ps_enabled, 0, "Are large page mappings enabled?");
  363 
  364 #define PAT_INDEX_SIZE  8
  365 static int pat_index[PAT_INDEX_SIZE];   /* cache mode to PAT index conversion */
  366 
  367 static u_int64_t        KPTphys;        /* phys addr of kernel level 1 */
  368 static u_int64_t        KPDphys;        /* phys addr of kernel level 2 */
  369 u_int64_t               KPDPphys;       /* phys addr of kernel level 3 */
  370 u_int64_t               KPML4phys;      /* phys addr of kernel level 4 */
  371 
  372 static u_int64_t        DMPDphys;       /* phys addr of direct mapped level 2 */
  373 static u_int64_t        DMPDPphys;      /* phys addr of direct mapped level 3 */
  374 static int              ndmpdpphys;     /* number of DMPDPphys pages */
  375 
  376 /*
   377  * pmap_mapdev() support before pmap_init() is run (e.g., early console)
  378  */
  379 #define PMAP_PREINIT_MAPPING_COUNT      8
  380 static struct pmap_preinit_mapping {
  381         vm_paddr_t      pa;
  382         vm_offset_t     va;
  383         vm_size_t       sz;
  384         int             mode;
  385 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
  386 static int pmap_initialized;
  387 
  388 /*
  389  * Data for the pv entry allocation mechanism.
  390  * Updates to pv_invl_gen are protected by the pv_list_locks[]
  391  * elements, but reads are not.
  392  */
  393 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
  394 static struct mtx pv_chunks_mutex;
  395 static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
  396 static u_long pv_invl_gen[NPV_LIST_LOCKS];
  397 static struct md_page *pv_table;
  398 static struct md_page pv_dummy;
  399 
  400 /*
  401  * All those kernel PT submaps that BSD is so fond of
  402  */
  403 pt_entry_t *CMAP1 = NULL;
  404 caddr_t CADDR1 = 0;
  405 static vm_offset_t qframe = 0;
  406 static struct mtx qframe_mtx;
  407 
  408 static int pmap_flags = PMAP_PDE_SUPERPAGE;     /* flags for x86 pmaps */
  409 
  410 int pmap_pcid_enabled = 1;
  411 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
  412     &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?");
  413 int invpcid_works = 0;
  414 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
  415     "Is the invpcid instruction available ?");
  416 
  417 int pti = 0;
  418 SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
  419     &pti, 0,
  420     "Page Table Isolation enabled");
  421 static vm_object_t pti_obj;
  422 static pml4_entry_t *pti_pml4;
  423 static vm_pindex_t pti_pg_idx;
  424 static bool pti_finalized;
  425 
  426 static int
  427 pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
  428 {
  429         int i;
  430         uint64_t res;
  431 
  432         res = 0;
  433         CPU_FOREACH(i) {
  434                 res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
  435         }
  436         return (sysctl_handle_64(oidp, &res, 0, req));
  437 }
  438 SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
  439     CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
  440     "Count of saved TLB context on switch");
  441 
  442 static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
  443     LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
  444 static struct mtx invl_gen_mtx;
  445 static u_long pmap_invl_gen = 0;
  446 /* Fake lock object to satisfy turnstiles interface. */
  447 static struct lock_object invl_gen_ts = {
  448         .lo_name = "invlts",
  449 };
  450 
  451 #define PMAP_ASSERT_NOT_IN_DI() \
  452     KASSERT(curthread->td_md.md_invl_gen.gen == 0, ("DI already started"))
  453 
  454 /*
  455  * Start a new Delayed Invalidation (DI) block of code, executed by
  456  * the current thread.  Within a DI block, the current thread may
  457  * destroy both the page table and PV list entries for a mapping and
  458  * then release the corresponding PV list lock before ensuring that
  459  * the mapping is flushed from the TLBs of any processors with the
  460  * pmap active.
  461  */
  462 static void
  463 pmap_delayed_invl_started(void)
  464 {
  465         struct pmap_invl_gen *invl_gen;
  466         u_long currgen;
  467 
  468         invl_gen = &curthread->td_md.md_invl_gen;
  469         PMAP_ASSERT_NOT_IN_DI();
  470         mtx_lock(&invl_gen_mtx);
  471         if (LIST_EMPTY(&pmap_invl_gen_tracker))
  472                 currgen = pmap_invl_gen;
  473         else
  474                 currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
  475         invl_gen->gen = currgen + 1;
  476         LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
  477         mtx_unlock(&invl_gen_mtx);
  478 }
  479 
  480 /*
  481  * Finish the DI block, previously started by the current thread.  All
  482  * required TLB flushes for the pages marked by
  483  * pmap_delayed_invl_page() must be finished before this function is
  484  * called.
  485  *
  486  * This function works by bumping the global DI generation number to
  487  * the generation number of the current thread's DI, unless there is a
  488  * pending DI that started earlier.  In the latter case, bumping the
  489  * global DI generation number would incorrectly signal that the
  490  * earlier DI had finished.  Instead, this function bumps the earlier
  491  * DI's generation number to match the generation number of the
  492  * current thread's DI.
  493  */
  494 static void
  495 pmap_delayed_invl_finished(void)
  496 {
  497         struct pmap_invl_gen *invl_gen, *next;
  498         struct turnstile *ts;
  499 
  500         invl_gen = &curthread->td_md.md_invl_gen;
  501         KASSERT(invl_gen->gen != 0, ("missed invl_started"));
  502         mtx_lock(&invl_gen_mtx);
  503         next = LIST_NEXT(invl_gen, link);
  504         if (next == NULL) {
  505                 turnstile_chain_lock(&invl_gen_ts);
  506                 ts = turnstile_lookup(&invl_gen_ts);
  507                 pmap_invl_gen = invl_gen->gen;
  508                 if (ts != NULL) {
  509                         turnstile_broadcast(ts, TS_SHARED_QUEUE);
  510                         turnstile_unpend(ts, TS_SHARED_LOCK);
  511                 }
  512                 turnstile_chain_unlock(&invl_gen_ts);
  513         } else {
  514                 next->gen = invl_gen->gen;
  515         }
  516         LIST_REMOVE(invl_gen, link);
  517         mtx_unlock(&invl_gen_mtx);
  518         invl_gen->gen = 0;
  519 }
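
/*
 * Illustrative sketch, not part of the original file: a DI block
 * brackets mapping teardown.  With example_clear_ptes() standing in
 * (hypothetically) for code that clears PTEs and calls
 * pmap_delayed_invl_page() on every page whose PV entry it removes,
 * the protocol looks like:
 *
 *	static void
 *	example_remove_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 *	{
 *		pmap_delayed_invl_started();
 *		PMAP_LOCK(pmap);
 *		example_clear_ptes(pmap, sva, eva);
 *		PMAP_UNLOCK(pmap);
 *		pmap_invalidate_range(pmap, sva, eva);
 *		pmap_delayed_invl_finished();
 *	}
 *
 * A reader such as pmap_remove_all() later calls
 * pmap_delayed_invl_wait() on a page with an empty PV list to make
 * sure no such block is still flushing stale TLB entries for it.
 */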
  520 
  521 #ifdef PV_STATS
  522 static long invl_wait;
  523 SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0,
  524     "Number of times DI invalidation blocked pmap_remove_all/write");
  525 #endif
  526 
  527 static u_long *
  528 pmap_delayed_invl_genp(vm_page_t m)
  529 {
  530 
  531         return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]);
  532 }
  533 
  534 /*
   535  * Ensure that all currently executing DI blocks that need to flush
   536  * the TLB for the given page m have actually flushed the TLB by the
   537  * time this function returns.  If the page m has an empty PV list
   538  * and we call pmap_delayed_invl_wait(), then upon its return we know
   539  * that no CPU has a valid mapping for page m in its page table or TLB.
  540  *
  541  * This function works by blocking until the global DI generation
  542  * number catches up with the generation number associated with the
  543  * given page m and its PV list.  Since this function's callers
  544  * typically own an object lock and sometimes own a page lock, it
  545  * cannot sleep.  Instead, it blocks on a turnstile to relinquish the
  546  * processor.
  547  */
  548 static void
  549 pmap_delayed_invl_wait(vm_page_t m)
  550 {
  551         struct thread *td;
  552         struct turnstile *ts;
  553         u_long *m_gen;
  554 #ifdef PV_STATS
  555         bool accounted = false;
  556 #endif
  557 
  558         td = curthread;
  559         m_gen = pmap_delayed_invl_genp(m);
  560         while (*m_gen > pmap_invl_gen) {
  561 #ifdef PV_STATS
  562                 if (!accounted) {
  563                         atomic_add_long(&invl_wait, 1);
  564                         accounted = true;
  565                 }
  566 #endif
  567                 ts = turnstile_trywait(&invl_gen_ts);
  568                 if (*m_gen > pmap_invl_gen)
  569                         turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
  570                 else
  571                         turnstile_cancel(ts);
  572         }
  573 }
  574 
  575 /*
  576  * Mark the page m's PV list as participating in the current thread's
  577  * DI block.  Any threads concurrently using m's PV list to remove or
  578  * restrict all mappings to m will wait for the current thread's DI
  579  * block to complete before proceeding.
  580  *
  581  * The function works by setting the DI generation number for m's PV
  582  * list to at least the DI generation number of the current thread.
  583  * This forces a caller of pmap_delayed_invl_wait() to block until
   584  * the current thread calls pmap_delayed_invl_finished().
  585  */
  586 static void
  587 pmap_delayed_invl_page(vm_page_t m)
  588 {
  589         u_long gen, *m_gen;
  590 
  591         rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED);
  592         gen = curthread->td_md.md_invl_gen.gen;
  593         if (gen == 0)
  594                 return;
  595         m_gen = pmap_delayed_invl_genp(m);
  596         if (*m_gen < gen)
  597                 *m_gen = gen;
  598 }
  599 
  600 /*
  601  * Crashdump maps.
  602  */
  603 static caddr_t crashdumpmap;
  604 
  605 static void     free_pv_chunk(struct pv_chunk *pc);
  606 static void     free_pv_entry(pmap_t pmap, pv_entry_t pv);
  607 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
  608 static int      popcnt_pc_map_pq(uint64_t *map);
  609 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
  610 static void     reserve_pv_entries(pmap_t pmap, int needed,
  611                     struct rwlock **lockp);
  612 static void     pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
  613                     struct rwlock **lockp);
  614 static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
  615                     struct rwlock **lockp);
  616 static void     pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
  617                     struct rwlock **lockp);
  618 static void     pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
  619 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
  620                     vm_offset_t va);
  621 
  622 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
  623 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
  624 static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
  625     vm_offset_t va, struct rwlock **lockp);
  626 static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
  627     vm_offset_t va);
  628 static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
  629     vm_prot_t prot, struct rwlock **lockp);
  630 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
  631     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
  632 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
  633 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
  634 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
  635                     pd_entry_t pde);
  636 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
  637 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
  638 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
  639     struct rwlock **lockp);
  640 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
  641     vm_prot_t prot);
  642 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
  643 static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva,
  644     bool exec);
  645 static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va);
  646 static pd_entry_t *pmap_pti_pde(vm_offset_t va);
  647 static void pmap_pti_wire_pte(void *pte);
  648 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
  649     struct spglist *free, struct rwlock **lockp);
  650 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
  651     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
  652 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
  653 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
  654     struct spglist *free);
  655 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
  656     vm_page_t m, struct rwlock **lockp);
  657 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
  658     pd_entry_t newpde);
  659 static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
  660 
  661 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
  662                 struct rwlock **lockp);
  663 static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
  664                 struct rwlock **lockp);
  665 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
  666                 struct rwlock **lockp);
  667 
  668 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
  669     struct spglist *free);
  670 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
  671 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
  672 
  673 /*
  674  * Move the kernel virtual free pointer to the next
   675  * 2MB boundary.  This improves performance by allowing
   676  * a large (2MB) page to be used for much of the kernel
   677  * (.text, .data, .bss).
  678  */
  679 static vm_offset_t
  680 pmap_kmem_choose(vm_offset_t addr)
  681 {
  682         vm_offset_t newaddr = addr;
  683 
  684         newaddr = roundup2(addr, NBPDR);
  685         return (newaddr);
  686 }
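
/*
 * Illustrative example, not part of the original file: roundup2()
 * rounds up to the next multiple of a power of two, so with
 * NBPDR == 2MB,
 *
 *	pmap_kmem_choose(KERNBASE + 0x123456) == KERNBASE + 0x200000
 *
 * which leaves virtual_avail 2MB-aligned and therefore eligible for a
 * superpage mapping.
 */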
  687 
  688 /********************/
  689 /* Inline functions */
  690 /********************/
  691 
  692 /* Return a non-clipped PD index for a given VA */
  693 static __inline vm_pindex_t
  694 pmap_pde_pindex(vm_offset_t va)
  695 {
  696         return (va >> PDRSHIFT);
  697 }
  698 
  699 
  700 /* Return a pointer to the PML4 slot that corresponds to a VA */
  701 static __inline pml4_entry_t *
  702 pmap_pml4e(pmap_t pmap, vm_offset_t va)
  703 {
  704 
  705         return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
  706 }
  707 
  708 /* Return a pointer to the PDP slot that corresponds to a VA */
  709 static __inline pdp_entry_t *
  710 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
  711 {
  712         pdp_entry_t *pdpe;
  713 
  714         pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
  715         return (&pdpe[pmap_pdpe_index(va)]);
  716 }
  717 
  718 /* Return a pointer to the PDP slot that corresponds to a VA */
  719 static __inline pdp_entry_t *
  720 pmap_pdpe(pmap_t pmap, vm_offset_t va)
  721 {
  722         pml4_entry_t *pml4e;
  723         pt_entry_t PG_V;
  724 
  725         PG_V = pmap_valid_bit(pmap);
  726         pml4e = pmap_pml4e(pmap, va);
  727         if ((*pml4e & PG_V) == 0)
  728                 return (NULL);
  729         return (pmap_pml4e_to_pdpe(pml4e, va));
  730 }
  731 
  732 /* Return a pointer to the PD slot that corresponds to a VA */
  733 static __inline pd_entry_t *
  734 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
  735 {
  736         pd_entry_t *pde;
  737 
  738         pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
  739         return (&pde[pmap_pde_index(va)]);
  740 }
  741 
  742 /* Return a pointer to the PD slot that corresponds to a VA */
  743 static __inline pd_entry_t *
  744 pmap_pde(pmap_t pmap, vm_offset_t va)
  745 {
  746         pdp_entry_t *pdpe;
  747         pt_entry_t PG_V;
  748 
  749         PG_V = pmap_valid_bit(pmap);
  750         pdpe = pmap_pdpe(pmap, va);
  751         if (pdpe == NULL || (*pdpe & PG_V) == 0)
  752                 return (NULL);
  753         return (pmap_pdpe_to_pde(pdpe, va));
  754 }
  755 
  756 /* Return a pointer to the PT slot that corresponds to a VA */
  757 static __inline pt_entry_t *
  758 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
  759 {
  760         pt_entry_t *pte;
  761 
  762         pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
  763         return (&pte[pmap_pte_index(va)]);
  764 }
  765 
  766 /* Return a pointer to the PT slot that corresponds to a VA */
  767 static __inline pt_entry_t *
  768 pmap_pte(pmap_t pmap, vm_offset_t va)
  769 {
  770         pd_entry_t *pde;
  771         pt_entry_t PG_V;
  772 
  773         PG_V = pmap_valid_bit(pmap);
  774         pde = pmap_pde(pmap, va);
  775         if (pde == NULL || (*pde & PG_V) == 0)
  776                 return (NULL);
  777         if ((*pde & PG_PS) != 0)        /* compat with i386 pmap_pte() */
  778                 return ((pt_entry_t *)pde);
  779         return (pmap_pde_to_pte(pde, va));
  780 }
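
/*
 * Illustrative note, not part of the original file: a canonical 48-bit
 * virtual address is decoded nine bits per level, top down, so the
 * index helpers used above reduce to
 *
 *	pmap_pml4e_index(va) == (va >> 39) & 0x1ff	(bits 47..39)
 *	pmap_pdpe_index(va)  == (va >> 30) & 0x1ff	(bits 38..30)
 *	pmap_pde_index(va)   == (va >> 21) & 0x1ff	(bits 29..21)
 *	pmap_pte_index(va)   == (va >> 12) & 0x1ff	(bits 20..12)
 *
 * and pmap_pte() is simply the chain pmap_pml4e() -> pmap_pml4e_to_pdpe()
 * -> pmap_pdpe_to_pde() -> pmap_pde_to_pte(), stopping early at any
 * non-present level or at a 2MB (PG_PS) leaf.
 */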
  781 
  782 static __inline void
  783 pmap_resident_count_inc(pmap_t pmap, int count)
  784 {
  785 
  786         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  787         pmap->pm_stats.resident_count += count;
  788 }
  789 
  790 static __inline void
  791 pmap_resident_count_dec(pmap_t pmap, int count)
  792 {
  793 
  794         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  795         KASSERT(pmap->pm_stats.resident_count >= count,
  796             ("pmap %p resident count underflow %ld %d", pmap,
  797             pmap->pm_stats.resident_count, count));
  798         pmap->pm_stats.resident_count -= count;
  799 }
  800 
  801 PMAP_INLINE pt_entry_t *
  802 vtopte(vm_offset_t va)
  803 {
  804         u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
  805 
  806         KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
  807 
  808         return (PTmap + ((va >> PAGE_SHIFT) & mask));
  809 }
  810 
  811 static __inline pd_entry_t *
  812 vtopde(vm_offset_t va)
  813 {
  814         u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
  815 
  816         KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
  817 
  818         return (PDmap + ((va >> PDRSHIFT) & mask));
  819 }
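
/*
 * Illustrative note, not part of the original file: vtopte() and
 * vtopde() rely on the recursive mapping that create_pagetables()
 * installs below, where the PML4 page is entered as its own
 * PML4PML4I'th slot.  A hardware walk of an address inside that slot's
 * 512GB window reuses the PML4 as a lower-level table, so every kernel
 * PTE and PDE appears at a fixed virtual location:
 *
 *	vtopte(va) == &PTmap[(va >> PAGE_SHIFT) & ((1ul << 36) - 1)]
 *	vtopde(va) == &PDmap[(va >> PDRSHIFT) & ((1ul << 27) - 1)]
 *
 * The masks keep four and three levels' worth of 9-bit indices,
 * respectively.  The KASSERTs exist because these shortcuts are only
 * meant for kernel addresses; user addresses must be resolved with
 * pmap_pte()/pmap_pde() on the owning pmap.
 */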
  820 
  821 static u_int64_t
  822 allocpages(vm_paddr_t *firstaddr, int n)
  823 {
  824         u_int64_t ret;
  825 
  826         ret = *firstaddr;
  827         bzero((void *)ret, n * PAGE_SIZE);
  828         *firstaddr += n * PAGE_SIZE;
  829         return (ret);
  830 }
  831 
  832 CTASSERT(powerof2(NDMPML4E));
  833 
  834 /* number of kernel PDP slots */
  835 #define NKPDPE(ptpgs)           howmany(ptpgs, NPDEPG)
  836 
  837 static void
  838 nkpt_init(vm_paddr_t addr)
  839 {
  840         int pt_pages;
  841         
  842 #ifdef NKPT
  843         pt_pages = NKPT;
  844 #else
  845         pt_pages = howmany(addr, 1 << PDRSHIFT);
  846         pt_pages += NKPDPE(pt_pages);
  847 
  848         /*
  849          * Add some slop beyond the bare minimum required for bootstrapping
  850          * the kernel.
  851          *
  852          * This is quite important when allocating KVA for kernel modules.
  853          * The modules are required to be linked in the negative 2GB of
  854          * the address space.  If we run out of KVA in this region then
  855          * pmap_growkernel() will need to allocate page table pages to map
  856          * the entire 512GB of KVA space which is an unnecessary tax on
  857          * physical memory.
  858          *
  859          * Secondly, device memory mapped as part of setting up the low-
  860          * level console(s) is taken from KVA, starting at virtual_avail.
  861          * This is because cninit() is called after pmap_bootstrap() but
  862          * before vm_init() and pmap_init(). 20MB for a frame buffer is
  863          * not uncommon.
  864          */
  865         pt_pages += 32;         /* 64MB additional slop. */
  866 #endif
  867         nkpt = pt_pages;
  868 }
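
/*
 * Worked example, not part of the original file, assuming NKPT is not
 * defined: if the bootstrap allocations end at addr = 128MB, then
 * howmany(128MB, 2MB) = 64 page-table pages cover that much KVA,
 * NKPDPE(64) = 1 adds headroom for the directory level, and the fixed
 * 32-page slop brings nkpt to 97, i.e. enough page-table pages to map
 * about 194MB of initial kernel virtual address space.
 */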
  869 
  870 static void
  871 create_pagetables(vm_paddr_t *firstaddr)
  872 {
  873         int i, j, ndm1g, nkpdpe;
  874         pt_entry_t *pt_p;
  875         pd_entry_t *pd_p;
  876         pdp_entry_t *pdp_p;
  877         pml4_entry_t *p4_p;
  878 
  879         /* Allocate page table pages for the direct map */
  880         ndmpdp = howmany(ptoa(Maxmem), NBPDP);
   881         if (ndmpdp < 4)         /* Minimum 4GB of direct map */
  882                 ndmpdp = 4;
  883         ndmpdpphys = howmany(ndmpdp, NPDPEPG);
  884         if (ndmpdpphys > NDMPML4E) {
  885                 /*
  886                  * Each NDMPML4E allows 512 GB, so limit to that,
  887                  * and then readjust ndmpdp and ndmpdpphys.
  888                  */
  889                 printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
  890                 Maxmem = atop(NDMPML4E * NBPML4);
  891                 ndmpdpphys = NDMPML4E;
  892                 ndmpdp = NDMPML4E * NPDEPG;
  893         }
  894         DMPDPphys = allocpages(firstaddr, ndmpdpphys);
  895         ndm1g = 0;
  896         if ((amd_feature & AMDID_PAGE1GB) != 0)
  897                 ndm1g = ptoa(Maxmem) >> PDPSHIFT;
  898         if (ndm1g < ndmpdp)
  899                 DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
  900         dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
  901 
   902         /* Allocate the kernel PML4 and PDP pages */
  903         KPML4phys = allocpages(firstaddr, 1);
  904         KPDPphys = allocpages(firstaddr, NKPML4E);
  905 
  906         /*
  907          * Allocate the initial number of kernel page table pages required to
  908          * bootstrap.  We defer this until after all memory-size dependent
  909          * allocations are done (e.g. direct map), so that we don't have to
  910          * build in too much slop in our estimate.
  911          *
  912          * Note that when NKPML4E > 1, we have an empty page underneath
  913          * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
  914          * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
  915          */
  916         nkpt_init(*firstaddr);
  917         nkpdpe = NKPDPE(nkpt);
  918 
  919         KPTphys = allocpages(firstaddr, nkpt);
  920         KPDphys = allocpages(firstaddr, nkpdpe);
  921 
  922         /* Fill in the underlying page table pages */
  923         /* Nominally read-only (but really R/W) from zero to physfree */
  924         /* XXX not fully used, underneath 2M pages */
  925         pt_p = (pt_entry_t *)KPTphys;
  926         for (i = 0; ptoa(i) < *firstaddr; i++)
  927                 pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | pg_g;
  928 
  929         /* Now map the page tables at their location within PTmap */
  930         pd_p = (pd_entry_t *)KPDphys;
  931         for (i = 0; i < nkpt; i++)
  932                 pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
  933 
  934         /* Map from zero to end of allocations under 2M pages */
  935         /* This replaces some of the KPTphys entries above */
  936         for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
  937                 pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
  938                     pg_g;
  939 
  940         /* And connect up the PD to the PDP (leaving room for L4 pages) */
  941         pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
  942         for (i = 0; i < nkpdpe; i++)
  943                 pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
  944                     PG_U;
  945 
  946         /*
  947          * Now, set up the direct map region using 2MB and/or 1GB pages.  If
  948          * the end of physical memory is not aligned to a 1GB page boundary,
  949          * then the residual physical memory is mapped with 2MB pages.  Later,
  950          * if pmap_mapdev{_attr}() uses the direct map for non-write-back
  951          * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
  952          * that are partially used. 
  953          */
  954         pd_p = (pd_entry_t *)DMPDphys;
  955         for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
  956                 pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
  957                 /* Preset PG_M and PG_A because demotion expects it. */
  958                 pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
  959                     X86_PG_M | X86_PG_A;
  960         }
  961         pdp_p = (pdp_entry_t *)DMPDPphys;
  962         for (i = 0; i < ndm1g; i++) {
  963                 pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
  964                 /* Preset PG_M and PG_A because demotion expects it. */
  965                 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
  966                     X86_PG_M | X86_PG_A;
  967         }
  968         for (j = 0; i < ndmpdp; i++, j++) {
  969                 pdp_p[i] = DMPDphys + ptoa(j);
  970                 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U;
  971         }
  972 
  973         /* And recursively map PML4 to itself in order to get PTmap */
  974         p4_p = (pml4_entry_t *)KPML4phys;
  975         p4_p[PML4PML4I] = KPML4phys;
  976         p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U;
  977 
  978         /* Connect the Direct Map slot(s) up to the PML4. */
  979         for (i = 0; i < ndmpdpphys; i++) {
  980                 p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
  981                 p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U;
  982         }
  983 
  984         /* Connect the KVA slots up to the PML4 */
  985         for (i = 0; i < NKPML4E; i++) {
  986                 p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
  987                 p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U;
  988         }
  989 }
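
/*
 * Worked example, not part of the original file: on a hypothetical
 * machine with 16GB of RAM and 1GB-page support (AMDID_PAGE1GB), the
 * direct map sizing above works out to
 *
 *	ndmpdp     = howmany(16GB, NBPDP) = 16	(1GB mappings)
 *	ndmpdpphys = howmany(16, NPDPEPG) = 1	(one PDP page, one PML4 slot)
 *	ndm1g      = 16				(all of it in 1GB pages)
 *
 * so no DMPD pages are needed and dmaplimit becomes 16GB.  Without
 * 1GB-page support, ndm1g is 0 and the same machine instead allocates
 * 16 DMPD pages, each filled with 512 2MB mappings.
 */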
  990 
  991 /*
  992  *      Bootstrap the system enough to run with virtual memory.
  993  *
  994  *      On amd64 this is called after mapping has already been enabled
  995  *      and just syncs the pmap module with what has already been done.
  996  *      [We can't call it easily with mapping off since the kernel is not
  997  *      mapped with PA == VA, hence we would have to relocate every address
  998  *      from the linked base (virtual) address "KERNBASE" to the actual
  999  *      (physical) address starting relative to 0]
 1000  */
 1001 void
 1002 pmap_bootstrap(vm_paddr_t *firstaddr)
 1003 {
 1004         vm_offset_t va;
 1005         pt_entry_t *pte;
 1006         int i;
 1007 
 1008         if (!pti)
 1009                 pg_g = X86_PG_G;
 1010 
 1011         /*
 1012          * Create an initial set of page tables to run the kernel in.
 1013          */
 1014         create_pagetables(firstaddr);
 1015 
 1016         /*
 1017          * Add a physical memory segment (vm_phys_seg) corresponding to the
 1018          * preallocated kernel page table pages so that vm_page structures
 1019          * representing these pages will be created.  The vm_page structures
 1020          * are required for promotion of the corresponding kernel virtual
 1021          * addresses to superpage mappings.
 1022          */
 1023         vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
 1024 
 1025         virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
 1026         virtual_avail = pmap_kmem_choose(virtual_avail);
 1027 
 1028         virtual_end = VM_MAX_KERNEL_ADDRESS;
 1029 
 1030 
 1031         /* XXX do %cr0 as well */
 1032         load_cr4(rcr4() | CR4_PGE);
 1033         load_cr3(KPML4phys);
 1034         if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
 1035                 load_cr4(rcr4() | CR4_SMEP);
 1036 
 1037         /*
 1038          * Initialize the kernel pmap (which is statically allocated).
 1039          */
 1040         PMAP_LOCK_INIT(kernel_pmap);
 1041         kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
 1042         kernel_pmap->pm_cr3 = KPML4phys;
 1043         kernel_pmap->pm_ucr3 = PMAP_NO_CR3;
 1044         CPU_FILL(&kernel_pmap->pm_active);      /* don't allow deactivation */
 1045         TAILQ_INIT(&kernel_pmap->pm_pvchunk);
 1046         kernel_pmap->pm_flags = pmap_flags;
 1047 
 1048         /*
 1049          * Initialize the TLB invalidations generation number lock.
 1050          */
 1051         mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF);
 1052 
 1053         /*
 1054          * Reserve some special page table entries/VA space for temporary
 1055          * mapping of pages.
 1056          */
 1057 #define SYSMAP(c, p, v, n)      \
 1058         v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
 1059 
 1060         va = virtual_avail;
 1061         pte = vtopte(va);
 1062 
 1063         /*
 1064          * Crashdump maps.  The first page is reused as CMAP1 for the
 1065          * memory test.
 1066          */
 1067         SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
 1068         CADDR1 = crashdumpmap;
 1069 
 1070         virtual_avail = va;
 1071 
 1072         /*
 1073          * Initialize the PAT MSR.
 1074          * pmap_init_pat() clears and sets CR4_PGE, which, as a
 1075          * side-effect, invalidates stale PG_G TLB entries that might
 1076          * have been created in our pre-boot environment.
 1077          */
 1078         pmap_init_pat();
 1079 
 1080         /* Initialize TLB Context Id. */
 1081         TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
 1082         if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
 1083                 /* Check for INVPCID support */
 1084                 invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID)
 1085                     != 0;
 1086                 for (i = 0; i < MAXCPU; i++) {
 1087                         kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN;
 1088                         kernel_pmap->pm_pcids[i].pm_gen = 1;
 1089                 }
 1090                 __pcpu[0].pc_pcid_next = PMAP_PCID_KERN + 1;
 1091                 __pcpu[0].pc_pcid_gen = 1;
 1092                 /*
 1093                  * pcpu area for APs is zeroed during AP startup.
 1094                  * pc_pcid_next and pc_pcid_gen are initialized by AP
 1095                  * during pcpu setup.
 1096                  */
 1097                 load_cr4(rcr4() | CR4_PCIDE);
 1098         } else {
 1099                 pmap_pcid_enabled = 0;
 1100         }
 1101 }
 1102 
 1103 /*
 1104  * Setup the PAT MSR.
 1105  */
 1106 void
 1107 pmap_init_pat(void)
 1108 {
 1109         int pat_table[PAT_INDEX_SIZE];
 1110         uint64_t pat_msr;
 1111         u_long cr0, cr4;
 1112         int i;
 1113 
 1114         /* Bail if this CPU doesn't implement PAT. */
 1115         if ((cpu_feature & CPUID_PAT) == 0)
 1116                 panic("no PAT??");
 1117 
 1118         /* Set default PAT index table. */
 1119         for (i = 0; i < PAT_INDEX_SIZE; i++)
 1120                 pat_table[i] = -1;
 1121         pat_table[PAT_WRITE_BACK] = 0;
 1122         pat_table[PAT_WRITE_THROUGH] = 1;
 1123         pat_table[PAT_UNCACHEABLE] = 3;
 1124         pat_table[PAT_WRITE_COMBINING] = 3;
 1125         pat_table[PAT_WRITE_PROTECTED] = 3;
 1126         pat_table[PAT_UNCACHED] = 3;
 1127 
 1128         /* Initialize default PAT entries. */
 1129         pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
 1130             PAT_VALUE(1, PAT_WRITE_THROUGH) |
 1131             PAT_VALUE(2, PAT_UNCACHED) |
 1132             PAT_VALUE(3, PAT_UNCACHEABLE) |
 1133             PAT_VALUE(4, PAT_WRITE_BACK) |
 1134             PAT_VALUE(5, PAT_WRITE_THROUGH) |
 1135             PAT_VALUE(6, PAT_UNCACHED) |
 1136             PAT_VALUE(7, PAT_UNCACHEABLE);
 1137 
 1138         if (pat_works) {
 1139                 /*
 1140                  * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
 1141                  * Program 5 and 6 as WP and WC.
 1142                  * Leave 4 and 7 as WB and UC.
 1143                  */
 1144                 pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
 1145                 pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
 1146                     PAT_VALUE(6, PAT_WRITE_COMBINING);
 1147                 pat_table[PAT_UNCACHED] = 2;
 1148                 pat_table[PAT_WRITE_PROTECTED] = 5;
 1149                 pat_table[PAT_WRITE_COMBINING] = 6;
 1150         } else {
 1151                 /*
 1152                  * Just replace PAT Index 2 with WC instead of UC-.
 1153                  */
 1154                 pat_msr &= ~PAT_MASK(2);
 1155                 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
 1156                 pat_table[PAT_WRITE_COMBINING] = 2;
 1157         }
 1158 
 1159         /* Disable PGE. */
 1160         cr4 = rcr4();
 1161         load_cr4(cr4 & ~CR4_PGE);
 1162 
 1163         /* Disable caches (CD = 1, NW = 0). */
 1164         cr0 = rcr0();
 1165         load_cr0((cr0 & ~CR0_NW) | CR0_CD);
 1166 
 1167         /* Flushes caches and TLBs. */
 1168         wbinvd();
 1169         invltlb();
 1170 
 1171         /* Update PAT and index table. */
 1172         wrmsr(MSR_PAT, pat_msr);
 1173         for (i = 0; i < PAT_INDEX_SIZE; i++)
 1174                 pat_index[i] = pat_table[i];
 1175 
 1176         /* Flush caches and TLBs again. */
 1177         wbinvd();
 1178         invltlb();
 1179 
 1180         /* Restore caches and PGE. */
 1181         load_cr0(cr0);
 1182         load_cr4(cr4);
 1183 }
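
/*
 * Resulting layout, not part of the original file, for the common
 * pat_works case: after pmap_init_pat() the PAT MSR holds
 *
 *	index:	0    1    2    3    4    5    6    7
 *	type:	WB   WT   UC-  UC   WB   WP   WC   UC
 *
 * and pat_index[] records, for each VM caching mode, which of these
 * slots to select, e.g. pat_index[PAT_WRITE_COMBINING] == 6.
 */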
 1184 
 1185 /*
 1186  *      Initialize a vm_page's machine-dependent fields.
 1187  */
 1188 void
 1189 pmap_page_init(vm_page_t m)
 1190 {
 1191 
 1192         TAILQ_INIT(&m->md.pv_list);
 1193         m->md.pat_mode = PAT_WRITE_BACK;
 1194 }
 1195 
 1196 /*
 1197  *      Initialize the pmap module.
 1198  *      Called by vm_init, to initialize any structures that the pmap
 1199  *      system needs to map virtual memory.
 1200  */
 1201 void
 1202 pmap_init(void)
 1203 {
 1204         struct pmap_preinit_mapping *ppim;
 1205         vm_page_t mpte;
 1206         vm_size_t s;
 1207         int error, i, pv_npg;
 1208 
 1209         /* L1TF, reserve page @0 unconditionally */
 1210         vm_page_blacklist_add(0, bootverbose);
 1211 
 1212         /*
 1213          * Initialize the vm page array entries for the kernel pmap's
 1214          * page table pages.
 1215          */ 
 1216         for (i = 0; i < nkpt; i++) {
 1217                 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
 1218                 KASSERT(mpte >= vm_page_array &&
 1219                     mpte < &vm_page_array[vm_page_array_size],
 1220                     ("pmap_init: page table page is out of range"));
 1221                 mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
 1222                 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
 1223         }
 1224 
 1225         /*
 1226          * If the kernel is running on a virtual machine, then it must assume
 1227          * that MCA is enabled by the hypervisor.  Moreover, the kernel must
 1228          * be prepared for the hypervisor changing the vendor and family that
 1229          * are reported by CPUID.  Consequently, the workaround for AMD Family
 1230          * 10h Erratum 383 is enabled if the processor's feature set does not
 1231          * include at least one feature that is only supported by older Intel
 1232          * or newer AMD processors.
 1233          */
 1234         if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
 1235             (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
 1236             CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
 1237             AMDID2_FMA4)) == 0)
 1238                 workaround_erratum383 = 1;
 1239 
 1240         /*
 1241          * Are large page mappings enabled?
 1242          */
 1243         TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
 1244         if (pg_ps_enabled) {
 1245                 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
 1246                     ("pmap_init: can't assign to pagesizes[1]"));
 1247                 pagesizes[1] = NBPDR;
 1248         }
 1249 
 1250         /*
 1251          * Initialize the pv chunk list mutex.
 1252          */
 1253         mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
 1254 
 1255         /*
 1256          * Initialize the pool of pv list locks.
 1257          */
 1258         for (i = 0; i < NPV_LIST_LOCKS; i++)
 1259                 rw_init(&pv_list_locks[i], "pmap pv list");
 1260 
 1261         /*
 1262          * Calculate the size of the pv head table for superpages.
 1263          */
 1264         pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
 1265 
 1266         /*
 1267          * Allocate memory for the pv head table for superpages.
 1268          */
 1269         s = (vm_size_t)(pv_npg * sizeof(struct md_page));
 1270         s = round_page(s);
 1271         pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
 1272             M_WAITOK | M_ZERO);
 1273         for (i = 0; i < pv_npg; i++)
 1274                 TAILQ_INIT(&pv_table[i].pv_list);
 1275         TAILQ_INIT(&pv_dummy.pv_list);
 1276 
 1277         pmap_initialized = 1;
 1278         for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 1279                 ppim = pmap_preinit_mapping + i;
 1280                 if (ppim->va == 0)
 1281                         continue;
 1282                 /* Make the direct map consistent */
 1283                 if (ppim->pa < dmaplimit && ppim->pa + ppim->sz < dmaplimit) {
 1284                         (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa),
 1285                             ppim->sz, ppim->mode);
 1286                 }
 1287                 if (!bootverbose)
 1288                         continue;
 1289                 printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i,
 1290                     ppim->pa, ppim->va, ppim->sz, ppim->mode);
 1291         }
 1292 
 1293         mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
 1294         error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
 1295             (vmem_addr_t *)&qframe);
 1296         if (error != 0)
 1297                 panic("qframe allocation failed");
 1298 }
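
/*
 * Worked example, not part of the original file: the pv head table
 * above holds one struct md_page per potential 2MB superpage frame.
 * If the highest physical segment ends at 16GB, then
 * pv_npg = howmany(16GB, NBPDR) = 8192 and, assuming the 24-byte
 * amd64 struct md_page, s = round_page(8192 * 24) = 192KB of wired
 * kernel memory is allocated for it from kernel_arena.
 */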
 1299 
 1300 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
 1301     "2MB page mapping counters");
 1302 
 1303 static u_long pmap_pde_demotions;
 1304 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
 1305     &pmap_pde_demotions, 0, "2MB page demotions");
 1306 
 1307 static u_long pmap_pde_mappings;
 1308 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
 1309     &pmap_pde_mappings, 0, "2MB page mappings");
 1310 
 1311 static u_long pmap_pde_p_failures;
 1312 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
 1313     &pmap_pde_p_failures, 0, "2MB page promotion failures");
 1314 
 1315 static u_long pmap_pde_promotions;
 1316 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
 1317     &pmap_pde_promotions, 0, "2MB page promotions");
 1318 
 1319 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
 1320     "1GB page mapping counters");
 1321 
 1322 static u_long pmap_pdpe_demotions;
 1323 SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
 1324     &pmap_pdpe_demotions, 0, "1GB page demotions");
 1325 
 1326 /***************************************************
 1327  * Low level helper routines.....
 1328  ***************************************************/
 1329 
 1330 static pt_entry_t
 1331 pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
 1332 {
 1333         int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
 1334 
 1335         switch (pmap->pm_type) {
 1336         case PT_X86:
 1337         case PT_RVI:
 1338                 /* Verify that both PAT bits are not set at the same time */
 1339                 KASSERT((entry & x86_pat_bits) != x86_pat_bits,
 1340                     ("Invalid PAT bits in entry %#lx", entry));
 1341 
 1342                 /* Swap the PAT bits if one of them is set */
 1343                 if ((entry & x86_pat_bits) != 0)
 1344                         entry ^= x86_pat_bits;
 1345                 break;
 1346         case PT_EPT:
 1347                 /*
 1348                  * Nothing to do - the memory attributes are represented
 1349                  * the same way for regular pages and superpages.
 1350                  */
 1351                 break;
 1352         default:
 1353                 panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type);
 1354         }
 1355 
 1356         return (entry);
 1357 }
 1358 
 1359 /*
 1360  * Determine the appropriate bits to set in a PTE or PDE for a specified
 1361  * caching mode.
 1362  */
 1363 int
 1364 pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
 1365 {
 1366         int cache_bits, pat_flag, pat_idx;
 1367 
 1368         if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
 1369                 panic("Unknown caching mode %d\n", mode);
 1370 
 1371         switch (pmap->pm_type) {
 1372         case PT_X86:
 1373         case PT_RVI:
 1374                 /* The PAT bit is different for PTE's and PDE's. */
 1375                 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
 1376 
 1377                 /* Map the caching mode to a PAT index. */
 1378                 pat_idx = pat_index[mode];
 1379 
 1380                 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
 1381                 cache_bits = 0;
 1382                 if (pat_idx & 0x4)
 1383                         cache_bits |= pat_flag;
 1384                 if (pat_idx & 0x2)
 1385                         cache_bits |= PG_NC_PCD;
 1386                 if (pat_idx & 0x1)
 1387                         cache_bits |= PG_NC_PWT;
 1388                 break;
 1389 
 1390         case PT_EPT:
 1391                 cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
 1392                 break;
 1393 
 1394         default:
 1395                 panic("unsupported pmap type %d", pmap->pm_type);
 1396         }
 1397 
 1398         return (cache_bits);
 1399 }
 1400 
 1401 static int
 1402 pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
 1403 {
 1404         int mask;
 1405 
 1406         switch (pmap->pm_type) {
 1407         case PT_X86:
 1408         case PT_RVI:
 1409                 mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
 1410                 break;
 1411         case PT_EPT:
 1412                 mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
 1413                 break;
 1414         default:
 1415                 panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
 1416         }
 1417 
 1418         return (mask);
 1419 }
 1420 
 1421 static __inline boolean_t
 1422 pmap_ps_enabled(pmap_t pmap)
 1423 {
 1424 
 1425         return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
 1426 }
 1427 
 1428 static void
 1429 pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
 1430 {
 1431 
 1432         switch (pmap->pm_type) {
 1433         case PT_X86:
 1434                 break;
 1435         case PT_RVI:
 1436         case PT_EPT:
 1437                 /*
 1438                  * XXX
 1439                  * This is a little bogus since the generation number is
 1440                  * supposed to be bumped up when a region of the address
 1441                  * space is invalidated in the page tables.
 1442                  *
 1443                  * In this case the old PDE entry is valid but yet we want
 1444                  * to make sure that any mappings using the old entry are
 1445                  * invalidated in the TLB.
 1446                  *
  1447                  * The reason this works as expected is that we rendezvous
 1448                  * "all" host cpus and force any vcpu context to exit as a
 1449                  * side-effect.
 1450                  */
 1451                 atomic_add_acq_long(&pmap->pm_eptgen, 1);
 1452                 break;
 1453         default:
 1454                 panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
 1455         }
 1456         pde_store(pde, newpde);
 1457 }
 1458 
 1459 /*
 1460  * After changing the page size for the specified virtual address in the page
 1461  * table, flush the corresponding entries from the processor's TLB.  Only the
 1462  * calling processor's TLB is affected.
 1463  *
 1464  * The calling thread must be pinned to a processor.
 1465  */
 1466 static void
 1467 pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
 1468 {
 1469         pt_entry_t PG_G;
 1470 
 1471         if (pmap_type_guest(pmap))
 1472                 return;
 1473 
 1474         KASSERT(pmap->pm_type == PT_X86,
 1475             ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
 1476 
 1477         PG_G = pmap_global_bit(pmap);
 1478 
 1479         if ((newpde & PG_PS) == 0)
 1480                 /* Demotion: flush a specific 2MB page mapping. */
 1481                 invlpg(va);
 1482         else if ((newpde & PG_G) == 0)
 1483                 /*
 1484                  * Promotion: flush every 4KB page mapping from the TLB
 1485                  * because there are too many to flush individually.
 1486                  */
 1487                 invltlb();
 1488         else {
 1489                 /*
 1490                  * Promotion: flush every 4KB page mapping from the TLB,
 1491                  * including any global (PG_G) mappings.
 1492                  */
 1493                 invltlb_glob();
 1494         }
 1495 }
 1496 #ifdef SMP
 1497 
 1498 /*
 1499  * For SMP, these functions have to use the IPI mechanism for coherence.
 1500  *
 1501  * N.B.: Before calling any of the following TLB invalidation functions,
 1502  * the calling processor must ensure that all stores updating a non-
 1503  * kernel page table are globally performed.  Otherwise, another
 1504  * processor could cache an old, pre-update entry without being
 1505  * invalidated.  This can happen one of two ways: (1) The pmap becomes
 1506  * active on another processor after its pm_active field is checked by
 1507  * one of the following functions but before a store updating the page
 1508  * table is globally performed. (2) The pmap becomes active on another
 1509  * processor before its pm_active field is checked but due to
  1510  * speculative loads one of the following functions still reads the
 1511  * pmap as inactive on the other processor.
 1512  * 
 1513  * The kernel page table is exempt because its pm_active field is
 1514  * immutable.  The kernel page table is always active on every
 1515  * processor.
 1516  */
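       /*
        * Illustrative ordering only (a sketch, not a caller that exists in
        * this file): a page table update and its shootdown are expected to
        * look like
        *
        *      pte_store(pte, newpte);          (globally performed first)
        *      pmap_invalidate_page(pmap, va);  (then the IPI shootdown)
        *
        * If the order were reversed, another processor could re-cache the
        * stale entry as described above.
        */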
 1517 
 1518 /*
 1519  * Interrupt the cpus that are executing in the guest context.
 1520  * This will force the vcpu to exit and the cached EPT mappings
 1521  * will be invalidated by the host before the next vmresume.
 1522  */
 1523 static __inline void
 1524 pmap_invalidate_ept(pmap_t pmap)
 1525 {
 1526         int ipinum;
 1527 
 1528         sched_pin();
 1529         KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
 1530             ("pmap_invalidate_ept: absurd pm_active"));
 1531 
 1532         /*
 1533          * The TLB mappings associated with a vcpu context are not
 1534          * flushed each time a different vcpu is chosen to execute.
 1535          *
 1536          * This is in contrast with a process's vtop mappings that
 1537          * are flushed from the TLB on each context switch.
 1538          *
 1539          * Therefore we need to do more than just a TLB shootdown on
 1540          * the active cpus in 'pmap->pm_active'. To do this we keep
 1541          * track of the number of invalidations performed on this pmap.
 1542          *
 1543          * Each vcpu keeps a cache of this counter and compares it
 1544          * just before a vmresume. If the counter is out-of-date an
 1545          * invept will be done to flush stale mappings from the TLB.
 1546          */
 1547         atomic_add_acq_long(&pmap->pm_eptgen, 1);
 1548 
 1549         /*
 1550          * Force the vcpu to exit and trap back into the hypervisor.
 1551          */
 1552         ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
 1553         ipi_selected(pmap->pm_active, ipinum);
 1554         sched_unpin();
 1555 }
 1556 
 1557 void
 1558 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 1559 {
 1560         cpuset_t *mask;
 1561         struct invpcid_descr d;
 1562         uint64_t kcr3, ucr3;
 1563         uint32_t pcid;
 1564         u_int cpuid, i;
 1565 
 1566         if (pmap_type_guest(pmap)) {
 1567                 pmap_invalidate_ept(pmap);
 1568                 return;
 1569         }
 1570 
 1571         KASSERT(pmap->pm_type == PT_X86,
 1572             ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
 1573 
 1574         sched_pin();
 1575         if (pmap == kernel_pmap) {
 1576                 invlpg(va);
 1577                 mask = &all_cpus;
 1578         } else {
 1579                 cpuid = PCPU_GET(cpuid);
 1580                 if (pmap == PCPU_GET(curpmap)) {
 1581                         invlpg(va);
 1582                         if (pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) {
 1583                                 /*
 1584                                  * Disable context switching. pm_pcid
 1585                                  * is recalculated on switch, which
  1586                                  * might make us use the wrong pcid below.
 1587                                  */
 1588                                 critical_enter();
 1589                                 pcid = pmap->pm_pcids[cpuid].pm_pcid;
 1590 
 1591                                 if (invpcid_works) {
 1592                                         d.pcid = pcid | PMAP_PCID_USER_PT;
 1593                                         d.pad = 0;
 1594                                         d.addr = va;
 1595                                         invpcid(&d, INVPCID_ADDR);
 1596                                 } else {
 1597                                         kcr3 = pmap->pm_cr3 | pcid |
 1598                                             CR3_PCID_SAVE;
 1599                                         ucr3 = pmap->pm_ucr3 | pcid |
 1600                                             PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 1601                                         pmap_pti_pcid_invlpg(ucr3, kcr3, va);
 1602                                 }
 1603                                 critical_exit();
 1604                         }
 1605                 } else if (pmap_pcid_enabled)
 1606                         pmap->pm_pcids[cpuid].pm_gen = 0;
 1607                 if (pmap_pcid_enabled) {
 1608                         CPU_FOREACH(i) {
 1609                                 if (cpuid != i)
 1610                                         pmap->pm_pcids[i].pm_gen = 0;
 1611                         }
 1612                 }
 1613                 mask = &pmap->pm_active;
 1614         }
 1615         smp_masked_invlpg(*mask, va, pmap);
 1616         sched_unpin();
 1617 }
 1618 
 1619 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
 1620 #define PMAP_INVLPG_THRESHOLD   (4 * 1024 * PAGE_SIZE)
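       /*
        * With 4KB pages the threshold above is 4096 pages (16MB).  Flushing
        * that many entries one at a time would far exceed what the
        * second-level TLB (on the order of 1.5K entries on Broadwell) can
        * hold, so pmap_invalidate_range() below falls back to
        * pmap_invalidate_all() for ranges of this size or larger.
        */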
 1621 
 1622 void
 1623 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 1624 {
 1625         cpuset_t *mask;
 1626         struct invpcid_descr d;
 1627         vm_offset_t addr;
 1628         uint64_t kcr3, ucr3;
 1629         uint32_t pcid;
 1630         u_int cpuid, i;
 1631 
 1632         if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
 1633                 pmap_invalidate_all(pmap);
 1634                 return;
 1635         }
 1636 
 1637         if (pmap_type_guest(pmap)) {
 1638                 pmap_invalidate_ept(pmap);
 1639                 return;
 1640         }
 1641 
 1642         KASSERT(pmap->pm_type == PT_X86,
 1643             ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
 1644 
 1645         sched_pin();
 1646         cpuid = PCPU_GET(cpuid);
 1647         if (pmap == kernel_pmap) {
 1648                 for (addr = sva; addr < eva; addr += PAGE_SIZE)
 1649                         invlpg(addr);
 1650                 mask = &all_cpus;
 1651         } else {
 1652                 if (pmap == PCPU_GET(curpmap)) {
 1653                         for (addr = sva; addr < eva; addr += PAGE_SIZE)
 1654                                 invlpg(addr);
 1655                         if (pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) {
 1656                                 critical_enter();
 1657                                 pcid = pmap->pm_pcids[cpuid].pm_pcid;
 1658                                 if (invpcid_works) {
 1659                                         d.pcid = pcid | PMAP_PCID_USER_PT;
 1660                                         d.pad = 0;
 1661                                         d.addr = sva;
 1662                                         for (; d.addr < eva; d.addr +=
 1663                                             PAGE_SIZE)
 1664                                                 invpcid(&d, INVPCID_ADDR);
 1665                                 } else {
 1666                                         kcr3 = pmap->pm_cr3 | pcid |
 1667                                             CR3_PCID_SAVE;
 1668                                         ucr3 = pmap->pm_ucr3 | pcid |
 1669                                             PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 1670                                         pmap_pti_pcid_invlrng(ucr3, kcr3, sva,
 1671                                             eva);
 1672                                 }
 1673                                 critical_exit();
 1674                         }
 1675                 } else if (pmap_pcid_enabled) {
 1676                         pmap->pm_pcids[cpuid].pm_gen = 0;
 1677                 }
 1678                 if (pmap_pcid_enabled) {
 1679                         CPU_FOREACH(i) {
 1680                                 if (cpuid != i)
 1681                                         pmap->pm_pcids[i].pm_gen = 0;
 1682                         }
 1683                 }
 1684                 mask = &pmap->pm_active;
 1685         }
 1686         smp_masked_invlpg_range(*mask, sva, eva, pmap);
 1687         sched_unpin();
 1688 }
 1689 
 1690 void
 1691 pmap_invalidate_all(pmap_t pmap)
 1692 {
 1693         cpuset_t *mask;
 1694         struct invpcid_descr d;
 1695         uint64_t kcr3, ucr3;
 1696         uint32_t pcid;
 1697         u_int cpuid, i;
 1698 
 1699         if (pmap_type_guest(pmap)) {
 1700                 pmap_invalidate_ept(pmap);
 1701                 return;
 1702         }
 1703 
 1704         KASSERT(pmap->pm_type == PT_X86,
 1705             ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
 1706 
 1707         sched_pin();
 1708         if (pmap == kernel_pmap) {
 1709                 if (pmap_pcid_enabled && invpcid_works) {
 1710                         bzero(&d, sizeof(d));
 1711                         invpcid(&d, INVPCID_CTXGLOB);
 1712                 } else {
 1713                         invltlb_glob();
 1714                 }
 1715                 mask = &all_cpus;
 1716         } else {
 1717                 cpuid = PCPU_GET(cpuid);
 1718                 if (pmap == PCPU_GET(curpmap)) {
 1719                         if (pmap_pcid_enabled) {
 1720                                 critical_enter();
 1721                                 pcid = pmap->pm_pcids[cpuid].pm_pcid;
 1722                                 if (invpcid_works) {
 1723                                         d.pcid = pcid;
 1724                                         d.pad = 0;
 1725                                         d.addr = 0;
 1726                                         invpcid(&d, INVPCID_CTX);
 1727                                         if (pmap->pm_ucr3 != PMAP_NO_CR3) {
 1728                                                 d.pcid |= PMAP_PCID_USER_PT;
 1729                                                 invpcid(&d, INVPCID_CTX);
 1730                                         }
 1731                                 } else {
 1732                                         kcr3 = pmap->pm_cr3 | pcid;
 1733                                         ucr3 = pmap->pm_ucr3;
 1734                                         if (ucr3 != PMAP_NO_CR3) {
 1735                                                 ucr3 |= pcid | PMAP_PCID_USER_PT;
 1736                                                 pmap_pti_pcid_invalidate(ucr3,
 1737                                                     kcr3);
 1738                                         } else {
 1739                                                 load_cr3(kcr3);
 1740                                         }
 1741                                 }
 1742                                 critical_exit();
 1743                         } else {
 1744                                 invltlb();
 1745                         }
 1746                 } else if (pmap_pcid_enabled) {
 1747                         pmap->pm_pcids[cpuid].pm_gen = 0;
 1748                 }
 1749                 if (pmap_pcid_enabled) {
 1750                         CPU_FOREACH(i) {
 1751                                 if (cpuid != i)
 1752                                         pmap->pm_pcids[i].pm_gen = 0;
 1753                         }
 1754                 }
 1755                 mask = &pmap->pm_active;
 1756         }
 1757         smp_masked_invltlb(*mask, pmap);
 1758         sched_unpin();
 1759 }
 1760 
 1761 void
 1762 pmap_invalidate_cache(void)
 1763 {
 1764 
 1765         sched_pin();
 1766         wbinvd();
 1767         smp_cache_flush();
 1768         sched_unpin();
 1769 }
 1770 
 1771 struct pde_action {
 1772         cpuset_t invalidate;    /* processors that invalidate their TLB */
 1773         pmap_t pmap;
 1774         vm_offset_t va;
 1775         pd_entry_t *pde;
 1776         pd_entry_t newpde;
 1777         u_int store;            /* processor that updates the PDE */
 1778 };
 1779 
 1780 static void
 1781 pmap_update_pde_action(void *arg)
 1782 {
 1783         struct pde_action *act = arg;
 1784 
 1785         if (act->store == PCPU_GET(cpuid))
 1786                 pmap_update_pde_store(act->pmap, act->pde, act->newpde);
 1787 }
 1788 
 1789 static void
 1790 pmap_update_pde_teardown(void *arg)
 1791 {
 1792         struct pde_action *act = arg;
 1793 
 1794         if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
 1795                 pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
 1796 }
 1797 
 1798 /*
 1799  * Change the page size for the specified virtual address in a way that
 1800  * prevents any possibility of the TLB ever having two entries that map the
 1801  * same virtual address using different page sizes.  This is the recommended
 1802  * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
 1803  * machine check exception for a TLB state that is improperly diagnosed as a
 1804  * hardware error.
 1805  */
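       /*
        * Concretely (a restatement of the code below, not an extra
        * requirement): the rendezvous performs the PDE store on exactly one
        * processor (act.store) while every processor in act.invalidate
        * flushes its own TLB in the teardown step, which is how the
        * Erratum 383 hazard described above is avoided.
        */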
 1806 static void
 1807 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 1808 {
 1809         struct pde_action act;
 1810         cpuset_t active, other_cpus;
 1811         u_int cpuid;
 1812 
 1813         sched_pin();
 1814         cpuid = PCPU_GET(cpuid);
 1815         other_cpus = all_cpus;
 1816         CPU_CLR(cpuid, &other_cpus);
 1817         if (pmap == kernel_pmap || pmap_type_guest(pmap)) 
 1818                 active = all_cpus;
 1819         else {
 1820                 active = pmap->pm_active;
 1821         }
 1822         if (CPU_OVERLAP(&active, &other_cpus)) { 
 1823                 act.store = cpuid;
 1824                 act.invalidate = active;
 1825                 act.va = va;
 1826                 act.pmap = pmap;
 1827                 act.pde = pde;
 1828                 act.newpde = newpde;
 1829                 CPU_SET(cpuid, &active);
 1830                 smp_rendezvous_cpus(active,
 1831                     smp_no_rendevous_barrier, pmap_update_pde_action,
 1832                     pmap_update_pde_teardown, &act);
 1833         } else {
 1834                 pmap_update_pde_store(pmap, pde, newpde);
 1835                 if (CPU_ISSET(cpuid, &active))
 1836                         pmap_update_pde_invalidate(pmap, va, newpde);
 1837         }
 1838         sched_unpin();
 1839 }
 1840 #else /* !SMP */
 1841 /*
 1842  * Normal, non-SMP, invalidation functions.
 1843  */
 1844 void
 1845 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 1846 {
 1847         struct invpcid_descr d;
 1848         uint64_t kcr3, ucr3;
 1849         uint32_t pcid;
 1850 
 1851         if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
 1852                 pmap->pm_eptgen++;
 1853                 return;
 1854         }
 1855         KASSERT(pmap->pm_type == PT_X86,
  1856             ("pmap_invalidate_page: unknown type %d", pmap->pm_type));
 1857 
 1858         if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
 1859                 invlpg(va);
 1860                 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
 1861                     pmap->pm_ucr3 != PMAP_NO_CR3) {
 1862                         critical_enter();
 1863                         pcid = pmap->pm_pcids[0].pm_pcid;
 1864                         if (invpcid_works) {
 1865                                 d.pcid = pcid | PMAP_PCID_USER_PT;
 1866                                 d.pad = 0;
 1867                                 d.addr = va;
 1868                                 invpcid(&d, INVPCID_ADDR);
 1869                         } else {
 1870                                 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
 1871                                 ucr3 = pmap->pm_ucr3 | pcid |
 1872                                     PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 1873                                 pmap_pti_pcid_invlpg(ucr3, kcr3, va);
 1874                         }
 1875                         critical_exit();
 1876                 }
 1877         } else if (pmap_pcid_enabled)
 1878                 pmap->pm_pcids[0].pm_gen = 0;
 1879 }
 1880 
 1881 void
 1882 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 1883 {
 1884         struct invpcid_descr d;
 1885         vm_offset_t addr;
 1886         uint64_t kcr3, ucr3;
 1887 
 1888         if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
 1889                 pmap->pm_eptgen++;
 1890                 return;
 1891         }
 1892         KASSERT(pmap->pm_type == PT_X86,
 1893             ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
 1894 
 1895         if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
 1896                 for (addr = sva; addr < eva; addr += PAGE_SIZE)
 1897                         invlpg(addr);
 1898                 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
 1899                     pmap->pm_ucr3 != PMAP_NO_CR3) {
 1900                         critical_enter();
 1901                         if (invpcid_works) {
 1902                                 d.pcid = pmap->pm_pcids[0].pm_pcid |
 1903                                     PMAP_PCID_USER_PT;
 1904                                 d.pad = 0;
 1905                                 d.addr = sva;
 1906                                 for (; d.addr < eva; d.addr += PAGE_SIZE)
 1907                                         invpcid(&d, INVPCID_ADDR);
 1908                         } else {
 1909                                 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].
 1910                                     pm_pcid | CR3_PCID_SAVE;
 1911                                 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0].
 1912                                     pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 1913                                 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
 1914                         }
 1915                         critical_exit();
 1916                 }
 1917         } else if (pmap_pcid_enabled) {
 1918                 pmap->pm_pcids[0].pm_gen = 0;
 1919         }
 1920 }
 1921 
 1922 void
 1923 pmap_invalidate_all(pmap_t pmap)
 1924 {
 1925         struct invpcid_descr d;
 1926         uint64_t kcr3, ucr3;
 1927 
 1928         if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
 1929                 pmap->pm_eptgen++;
 1930                 return;
 1931         }
 1932         KASSERT(pmap->pm_type == PT_X86,
 1933             ("pmap_invalidate_all: unknown type %d", pmap->pm_type));
 1934 
 1935         if (pmap == kernel_pmap) {
 1936                 if (pmap_pcid_enabled && invpcid_works) {
 1937                         bzero(&d, sizeof(d));
 1938                         invpcid(&d, INVPCID_CTXGLOB);
 1939                 } else {
 1940                         invltlb_glob();
 1941                 }
 1942         } else if (pmap == PCPU_GET(curpmap)) {
 1943                 if (pmap_pcid_enabled) {
 1944                         critical_enter();
 1945                         if (invpcid_works) {
 1946                                 d.pcid = pmap->pm_pcids[0].pm_pcid;
 1947                                 d.pad = 0;
 1948                                 d.addr = 0;
 1949                                 invpcid(&d, INVPCID_CTX);
 1950                                 if (pmap->pm_ucr3 != PMAP_NO_CR3) {
 1951                                         d.pcid |= PMAP_PCID_USER_PT;
 1952                                         invpcid(&d, INVPCID_CTX);
 1953                                 }
 1954                         } else {
 1955                                 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid;
 1956                                 if (pmap->pm_ucr3 != PMAP_NO_CR3) {
 1957                                         ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[
 1958                                             0].pm_pcid | PMAP_PCID_USER_PT;
 1959                                         pmap_pti_pcid_invalidate(ucr3, kcr3);
 1960                                 } else
 1961                                         load_cr3(kcr3);
 1962                         }
 1963                         critical_exit();
 1964                 } else {
 1965                         invltlb();
 1966                 }
 1967         } else if (pmap_pcid_enabled) {
 1968                 pmap->pm_pcids[0].pm_gen = 0;
 1969         }
 1970 }
 1971 
 1972 PMAP_INLINE void
 1973 pmap_invalidate_cache(void)
 1974 {
 1975 
 1976         wbinvd();
 1977 }
 1978 
 1979 static void
 1980 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 1981 {
 1982 
 1983         pmap_update_pde_store(pmap, pde, newpde);
 1984         if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
 1985                 pmap_update_pde_invalidate(pmap, va, newpde);
 1986         else
 1987                 pmap->pm_pcids[0].pm_gen = 0;
 1988 }
 1989 #endif /* !SMP */
 1990 
 1991 static void
 1992 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
 1993 {
 1994 
 1995         /*
 1996          * When the PDE has PG_PROMOTED set, the 2MB page mapping was created
 1997          * by a promotion that did not invalidate the 512 4KB page mappings
 1998          * that might exist in the TLB.  Consequently, at this point, the TLB
 1999          * may hold both 4KB and 2MB page mappings for the address range [va,
 2000          * va + NBPDR).  Therefore, the entire range must be invalidated here.
 2001          * In contrast, when PG_PROMOTED is clear, the TLB will not hold any
 2002          * 4KB page mappings for the address range [va, va + NBPDR), and so a
 2003          * single INVLPG suffices to invalidate the 2MB page mapping from the
 2004          * TLB.
 2005          */
 2006         if ((pde & PG_PROMOTED) != 0)
 2007                 pmap_invalidate_range(pmap, va, va + NBPDR - 1);
 2008         else
 2009                 pmap_invalidate_page(pmap, va);
 2010 }
 2011 
 2012 #define PMAP_CLFLUSH_THRESHOLD   (2 * 1024 * 1024)
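       /*
        * Per-line CLFLUSH/CLFLUSHOPT loops are only used below for ranges
        * smaller than this threshold; at 2MB and above the code falls back
        * to a full cache invalidation.
        */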
 2013 
 2014 void
 2015 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force)
 2016 {
 2017 
 2018         if (force) {
  2019                 sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);
 2020         } else {
 2021                 KASSERT((sva & PAGE_MASK) == 0,
 2022                     ("pmap_invalidate_cache_range: sva not page-aligned"));
 2023                 KASSERT((eva & PAGE_MASK) == 0,
 2024                     ("pmap_invalidate_cache_range: eva not page-aligned"));
 2025         }
 2026 
 2027         if ((cpu_feature & CPUID_SS) != 0 && !force)
 2028                 ; /* If "Self Snoop" is supported and allowed, do nothing. */
 2029         else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 &&
 2030             eva - sva < PMAP_CLFLUSH_THRESHOLD) {
 2031                 /*
 2032                  * XXX: Some CPUs fault, hang, or trash the local APIC
 2033                  * registers if we use CLFLUSH on the local APIC
 2034                  * range.  The local APIC is always uncached, so we
 2035                  * don't need to flush for that range anyway.
 2036                  */
 2037                 if (pmap_kextract(sva) == lapic_paddr)
 2038                         return;
 2039 
 2040                 /*
 2041                  * Otherwise, do per-cache line flush.  Use the sfence
  2042                  * instruction to ensure that previous stores are
 2043                  * included in the write-back.  The processor
 2044                  * propagates flush to other processors in the cache
 2045                  * coherence domain.
 2046                  */
 2047                 sfence();
 2048                 for (; sva < eva; sva += cpu_clflush_line_size)
 2049                         clflushopt(sva);
 2050                 sfence();
 2051         } else if ((cpu_feature & CPUID_CLFSH) != 0 &&
 2052             eva - sva < PMAP_CLFLUSH_THRESHOLD) {
 2053                 if (pmap_kextract(sva) == lapic_paddr)
 2054                         return;
 2055                 /*
 2056                  * Writes are ordered by CLFLUSH on Intel CPUs.
 2057                  */
 2058                 if (cpu_vendor_id != CPU_VENDOR_INTEL)
 2059                         mfence();
 2060                 for (; sva < eva; sva += cpu_clflush_line_size)
 2061                         clflush(sva);
 2062                 if (cpu_vendor_id != CPU_VENDOR_INTEL)
 2063                         mfence();
 2064         } else {
 2065 
 2066                 /*
 2067                  * No targeted cache flush methods are supported by CPU,
 2068                  * or the supplied range is bigger than 2MB.
 2069                  * Globally invalidate cache.
 2070                  */
 2071                 pmap_invalidate_cache();
 2072         }
 2073 }
 2074 
 2075 /*
 2076  * Remove the specified set of pages from the data and instruction caches.
 2077  *
 2078  * In contrast to pmap_invalidate_cache_range(), this function does not
 2079  * rely on the CPU's self-snoop feature, because it is intended for use
 2080  * when moving pages into a different cache domain.
 2081  */
 2082 void
 2083 pmap_invalidate_cache_pages(vm_page_t *pages, int count)
 2084 {
 2085         vm_offset_t daddr, eva;
 2086         int i;
 2087         bool useclflushopt;
 2088 
 2089         useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
 2090         if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
 2091             ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt))
 2092                 pmap_invalidate_cache();
 2093         else {
 2094                 if (useclflushopt)
 2095                         sfence();
 2096                 else if (cpu_vendor_id != CPU_VENDOR_INTEL)
 2097                         mfence();
 2098                 for (i = 0; i < count; i++) {
 2099                         daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
 2100                         eva = daddr + PAGE_SIZE;
 2101                         for (; daddr < eva; daddr += cpu_clflush_line_size) {
 2102                                 if (useclflushopt)
 2103                                         clflushopt(daddr);
 2104                                 else
 2105                                         clflush(daddr);
 2106                         }
 2107                 }
 2108                 if (useclflushopt)
 2109                         sfence();
 2110                 else if (cpu_vendor_id != CPU_VENDOR_INTEL)
 2111                         mfence();
 2112         }
 2113 }
 2114 
 2115 /*
 2116  *      Routine:        pmap_extract
 2117  *      Function:
 2118  *              Extract the physical page address associated
 2119  *              with the given map/virtual_address pair.
 2120  */
 2121 vm_paddr_t 
 2122 pmap_extract(pmap_t pmap, vm_offset_t va)
 2123 {
 2124         pdp_entry_t *pdpe;
 2125         pd_entry_t *pde;
 2126         pt_entry_t *pte, PG_V;
 2127         vm_paddr_t pa;
 2128 
 2129         pa = 0;
 2130         PG_V = pmap_valid_bit(pmap);
 2131         PMAP_LOCK(pmap);
 2132         pdpe = pmap_pdpe(pmap, va);
 2133         if (pdpe != NULL && (*pdpe & PG_V) != 0) {
 2134                 if ((*pdpe & PG_PS) != 0)
 2135                         pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
 2136                 else {
 2137                         pde = pmap_pdpe_to_pde(pdpe, va);
 2138                         if ((*pde & PG_V) != 0) {
 2139                                 if ((*pde & PG_PS) != 0) {
 2140                                         pa = (*pde & PG_PS_FRAME) |
 2141                                             (va & PDRMASK);
 2142                                 } else {
 2143                                         pte = pmap_pde_to_pte(pde, va);
 2144                                         pa = (*pte & PG_FRAME) |
 2145                                             (va & PAGE_MASK);
 2146                                 }
 2147                         }
 2148                 }
 2149         }
 2150         PMAP_UNLOCK(pmap);
 2151         return (pa);
 2152 }
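       /*
        * Callers conventionally treat a return value of 0 as "no valid
        * mapping"; a minimal usage sketch:
        *
        *      pa = pmap_extract(pmap, va);
        *      if (pa == 0)
        *              (no mapping exists for va in this pmap)
        */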
 2153 
 2154 /*
 2155  *      Routine:        pmap_extract_and_hold
 2156  *      Function:
 2157  *              Atomically extract and hold the physical page
 2158  *              with the given pmap and virtual address pair
 2159  *              if that mapping permits the given protection.
 2160  */
 2161 vm_page_t
 2162 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 2163 {
 2164         pd_entry_t pde, *pdep;
 2165         pt_entry_t pte, PG_RW, PG_V;
 2166         vm_paddr_t pa;
 2167         vm_page_t m;
 2168 
 2169         pa = 0;
 2170         m = NULL;
 2171         PG_RW = pmap_rw_bit(pmap);
 2172         PG_V = pmap_valid_bit(pmap);
 2173         PMAP_LOCK(pmap);
 2174 retry:
 2175         pdep = pmap_pde(pmap, va);
 2176         if (pdep != NULL && (pde = *pdep)) {
 2177                 if (pde & PG_PS) {
 2178                         if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
 2179                                 if (vm_page_pa_tryrelock(pmap, (pde &
 2180                                     PG_PS_FRAME) | (va & PDRMASK), &pa))
 2181                                         goto retry;
 2182                                 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
 2183                                     (va & PDRMASK));
 2184                                 vm_page_hold(m);
 2185                         }
 2186                 } else {
 2187                         pte = *pmap_pde_to_pte(pdep, va);
 2188                         if ((pte & PG_V) &&
 2189                             ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
 2190                                 if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
 2191                                     &pa))
 2192                                         goto retry;
 2193                                 m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
 2194                                 vm_page_hold(m);
 2195                         }
 2196                 }
 2197         }
 2198         PA_UNLOCK_COND(pa);
 2199         PMAP_UNLOCK(pmap);
 2200         return (m);
 2201 }
 2202 
 2203 vm_paddr_t
 2204 pmap_kextract(vm_offset_t va)
 2205 {
 2206         pd_entry_t pde;
 2207         vm_paddr_t pa;
 2208 
 2209         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
 2210                 pa = DMAP_TO_PHYS(va);
 2211         } else {
 2212                 pde = *vtopde(va);
 2213                 if (pde & PG_PS) {
 2214                         pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
 2215                 } else {
 2216                         /*
 2217                          * Beware of a concurrent promotion that changes the
 2218                          * PDE at this point!  For example, vtopte() must not
 2219                          * be used to access the PTE because it would use the
 2220                          * new PDE.  It is, however, safe to use the old PDE
 2221                          * because the page table page is preserved by the
 2222                          * promotion.
 2223                          */
 2224                         pa = *pmap_pde_to_pte(&pde, va);
 2225                         pa = (pa & PG_FRAME) | (va & PAGE_MASK);
 2226                 }
 2227         }
 2228         return (pa);
 2229 }
 2230 
 2231 /***************************************************
 2232  * Low level mapping routines.....
 2233  ***************************************************/
 2234 
 2235 /*
 2236  * Add a wired page to the kva.
 2237  * Note: not SMP coherent.
 2238  */
 2239 PMAP_INLINE void 
 2240 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
 2241 {
 2242         pt_entry_t *pte;
 2243 
 2244         pte = vtopte(va);
 2245         pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g);
 2246 }
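       /*
        * "Not SMP coherent" above means that no TLB invalidation is done
        * here; a caller that overwrites an existing mapping is expected to
        * follow up with pmap_invalidate_page() or pmap_invalidate_range()
        * itself (an expectation implied by the note above, not enforced by
        * this function).
        */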
 2247 
 2248 static __inline void
 2249 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
 2250 {
 2251         pt_entry_t *pte;
 2252         int cache_bits;
 2253 
 2254         pte = vtopte(va);
 2255         cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
 2256         pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | cache_bits);
 2257 }
 2258 
 2259 /*
 2260  * Remove a page from the kernel pagetables.
 2261  * Note: not SMP coherent.
 2262  */
 2263 PMAP_INLINE void
 2264 pmap_kremove(vm_offset_t va)
 2265 {
 2266         pt_entry_t *pte;
 2267 
 2268         pte = vtopte(va);
 2269         pte_clear(pte);
 2270 }
 2271 
 2272 /*
 2273  *      Used to map a range of physical addresses into kernel
 2274  *      virtual address space.
 2275  *
 2276  *      The value passed in '*virt' is a suggested virtual address for
 2277  *      the mapping. Architectures which can support a direct-mapped
 2278  *      physical to virtual region can return the appropriate address
 2279  *      within that region, leaving '*virt' unchanged. Other
 2280  *      architectures should map the pages starting at '*virt' and
 2281  *      update '*virt' with the first usable address after the mapped
 2282  *      region.
 2283  */
 2284 vm_offset_t
 2285 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
 2286 {
  2287         return (PHYS_TO_DMAP(start));
 2288 }
 2289 
 2290 
 2291 /*
  2292  * Add a list of wired pages to the kva.  This
  2293  * routine is only used for temporary
 2294  * kernel mappings that do not need to have
 2295  * page modification or references recorded.
 2296  * Note that old mappings are simply written
 2297  * over.  The page *must* be wired.
 2298  * Note: SMP coherent.  Uses a ranged shootdown IPI.
 2299  */
 2300 void
 2301 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
 2302 {
 2303         pt_entry_t *endpte, oldpte, pa, *pte;
 2304         vm_page_t m;
 2305         int cache_bits;
 2306 
 2307         oldpte = 0;
 2308         pte = vtopte(sva);
 2309         endpte = pte + count;
 2310         while (pte < endpte) {
 2311                 m = *ma++;
 2312                 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
 2313                 pa = VM_PAGE_TO_PHYS(m) | cache_bits;
 2314                 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
 2315                         oldpte |= *pte;
 2316                         pte_store(pte, pa | pg_g | X86_PG_RW | X86_PG_V);
 2317                 }
 2318                 pte++;
 2319         }
 2320         if (__predict_false((oldpte & X86_PG_V) != 0))
 2321                 pmap_invalidate_range(kernel_pmap, sva, sva + count *
 2322                     PAGE_SIZE);
 2323 }
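       /*
        * A typical temporary-mapping sequence (sketch only):
        *
        *      pmap_qenter(sva, ma, count);
        *      (access the pages through sva)
        *      pmap_qremove(sva, count);
        */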
 2324 
 2325 /*
 2326  * This routine tears out page mappings from the
 2327  * kernel -- it is meant only for temporary mappings.
 2328  * Note: SMP coherent.  Uses a ranged shootdown IPI.
 2329  */
 2330 void
 2331 pmap_qremove(vm_offset_t sva, int count)
 2332 {
 2333         vm_offset_t va;
 2334 
 2335         va = sva;
 2336         while (count-- > 0) {
 2337                 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
 2338                 pmap_kremove(va);
 2339                 va += PAGE_SIZE;
 2340         }
 2341         pmap_invalidate_range(kernel_pmap, sva, va);
 2342 }
 2343 
 2344 /***************************************************
 2345  * Page table page management routines.....
 2346  ***************************************************/
 2347 static __inline void
 2348 pmap_free_zero_pages(struct spglist *free)
 2349 {
 2350         vm_page_t m;
 2351 
 2352         while ((m = SLIST_FIRST(free)) != NULL) {
 2353                 SLIST_REMOVE_HEAD(free, plinks.s.ss);
 2354                 /* Preserve the page's PG_ZERO setting. */
 2355                 vm_page_free_toq(m);
 2356         }
 2357 }
 2358 
 2359 /*
 2360  * Schedule the specified unused page table page to be freed.  Specifically,
 2361  * add the page to the specified list of pages that will be released to the
 2362  * physical memory manager after the TLB has been updated.
 2363  */
 2364 static __inline void
 2365 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
 2366     boolean_t set_PG_ZERO)
 2367 {
 2368 
 2369         if (set_PG_ZERO)
 2370                 m->flags |= PG_ZERO;
 2371         else
 2372                 m->flags &= ~PG_ZERO;
 2373         SLIST_INSERT_HEAD(free, m, plinks.s.ss);
 2374 }
 2375         
 2376 /*
 2377  * Inserts the specified page table page into the specified pmap's collection
 2378  * of idle page table pages.  Each of a pmap's page table pages is responsible
 2379  * for mapping a distinct range of virtual addresses.  The pmap's collection is
 2380  * ordered by this virtual address range.
 2381  */
 2382 static __inline int
 2383 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
 2384 {
 2385 
 2386         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2387         return (vm_radix_insert(&pmap->pm_root, mpte));
 2388 }
 2389 
 2390 /*
 2391  * Removes the page table page mapping the specified virtual address from the
 2392  * specified pmap's collection of idle page table pages, and returns it.
 2393  * Otherwise, returns NULL if there is no page table page corresponding to the
 2394  * specified virtual address.
 2395  */
 2396 static __inline vm_page_t
 2397 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
 2398 {
 2399 
 2400         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2401         return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va)));
 2402 }
 2403 
 2404 /*
 2405  * Decrements a page table page's wire count, which is used to record the
 2406  * number of valid page table entries within the page.  If the wire count
 2407  * drops to zero, then the page table page is unmapped.  Returns TRUE if the
 2408  * page table page was unmapped and FALSE otherwise.
 2409  */
 2410 static inline boolean_t
 2411 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 2412 {
 2413 
 2414         --m->wire_count;
 2415         if (m->wire_count == 0) {
 2416                 _pmap_unwire_ptp(pmap, va, m, free);
 2417                 return (TRUE);
 2418         } else
 2419                 return (FALSE);
 2420 }
 2421 
 2422 static void
 2423 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 2424 {
 2425 
 2426         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2427         /*
 2428          * unmap the page table page
 2429          */
 2430         if (m->pindex >= (NUPDE + NUPDPE)) {
 2431                 /* PDP page */
 2432                 pml4_entry_t *pml4;
 2433                 pml4 = pmap_pml4e(pmap, va);
 2434                 *pml4 = 0;
 2435                 if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) {
 2436                         pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)];
 2437                         *pml4 = 0;
 2438                 }
 2439         } else if (m->pindex >= NUPDE) {
 2440                 /* PD page */
 2441                 pdp_entry_t *pdp;
 2442                 pdp = pmap_pdpe(pmap, va);
 2443                 *pdp = 0;
 2444         } else {
 2445                 /* PTE page */
 2446                 pd_entry_t *pd;
 2447                 pd = pmap_pde(pmap, va);
 2448                 *pd = 0;
 2449         }
 2450         pmap_resident_count_dec(pmap, 1);
 2451         if (m->pindex < NUPDE) {
 2452                 /* We just released a PT, unhold the matching PD */
 2453                 vm_page_t pdpg;
 2454 
 2455                 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
 2456                 pmap_unwire_ptp(pmap, va, pdpg, free);
 2457         }
 2458         if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
 2459                 /* We just released a PD, unhold the matching PDP */
 2460                 vm_page_t pdppg;
 2461 
 2462                 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
 2463                 pmap_unwire_ptp(pmap, va, pdppg, free);
 2464         }
 2465 
 2466         /*
 2467          * This is a release store so that the ordinary store unmapping
 2468          * the page table page is globally performed before TLB shoot-
 2469          * down is begun.
 2470          */
 2471         atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1);
 2472 
  2473         /*
  2474          * Put the page on a list so that it is released only after
  2475          * *ALL* TLB shootdowns are done.
  2476          */
 2477         pmap_add_delayed_free_list(m, free, TRUE);
 2478 }
 2479 
 2480 /*
 2481  * After removing a page table entry, this routine is used to
 2482  * conditionally free the page, and manage the hold/wire counts.
 2483  */
 2484 static int
 2485 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
 2486     struct spglist *free)
 2487 {
 2488         vm_page_t mpte;
 2489 
 2490         if (va >= VM_MAXUSER_ADDRESS)
 2491                 return (0);
 2492         KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
 2493         mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
 2494         return (pmap_unwire_ptp(pmap, va, mpte, free));
 2495 }
 2496 
 2497 void
 2498 pmap_pinit0(pmap_t pmap)
 2499 {
 2500         int i;
 2501 
 2502         PMAP_LOCK_INIT(pmap);
 2503         pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
 2504         pmap->pm_pml4u = NULL;
 2505         pmap->pm_cr3 = KPML4phys;
 2506         /* hack to keep pmap_pti_pcid_invalidate() alive */
 2507         pmap->pm_ucr3 = PMAP_NO_CR3;
 2508         pmap->pm_root.rt_root = 0;
 2509         CPU_ZERO(&pmap->pm_active);
 2510         TAILQ_INIT(&pmap->pm_pvchunk);
 2511         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 2512         pmap->pm_flags = pmap_flags;
 2513         CPU_FOREACH(i) {
 2514                 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
 2515                 pmap->pm_pcids[i].pm_gen = 0;
 2516                 if (!pti)
 2517                         __pcpu[i].pc_kcr3 = PMAP_NO_CR3;
 2518         }
 2519         PCPU_SET(curpmap, kernel_pmap);
 2520         pmap_activate(curthread);
 2521         CPU_FILL(&kernel_pmap->pm_active);
 2522 }
 2523 
 2524 void
 2525 pmap_pinit_pml4(vm_page_t pml4pg)
 2526 {
 2527         pml4_entry_t *pm_pml4;
 2528         int i;
 2529 
 2530         pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
 2531 
 2532         /* Wire in kernel global address entries. */
 2533         for (i = 0; i < NKPML4E; i++) {
 2534                 pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW |
 2535                     X86_PG_V | PG_U;
 2536         }
 2537         for (i = 0; i < ndmpdpphys; i++) {
 2538                 pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW |
 2539                     X86_PG_V | PG_U;
 2540         }
 2541 
 2542         /* install self-referential address mapping entry(s) */
 2543         pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW |
 2544             X86_PG_A | X86_PG_M;
 2545 }
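       /*
        * The self-referential entry installed above is what lets helpers
        * such as vtopte() and vtopde() compute the virtual address of a
        * page-table entry directly: the page tables map themselves into
        * the address space through that slot.
        */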
 2546 
 2547 static void
 2548 pmap_pinit_pml4_pti(vm_page_t pml4pg)
 2549 {
 2550         pml4_entry_t *pm_pml4;
 2551         int i;
 2552 
 2553         pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
 2554         for (i = 0; i < NPML4EPG; i++)
 2555                 pm_pml4[i] = pti_pml4[i];
 2556 }
 2557 
 2558 /*
 2559  * Initialize a preallocated and zeroed pmap structure,
 2560  * such as one in a vmspace structure.
 2561  */
 2562 int
 2563 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
 2564 {
 2565         vm_page_t pml4pg, pml4pgu;
 2566         vm_paddr_t pml4phys;
 2567         int i;
 2568 
 2569         /*
 2570          * allocate the page directory page
 2571          */
 2572         while ((pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 2573             VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
 2574                 VM_WAIT;
 2575 
 2576         pml4phys = VM_PAGE_TO_PHYS(pml4pg);
 2577         pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
 2578         CPU_FOREACH(i) {
 2579                 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
 2580                 pmap->pm_pcids[i].pm_gen = 0;
 2581         }
 2582         pmap->pm_cr3 = PMAP_NO_CR3;     /* initialize to an invalid value */
 2583         pmap->pm_ucr3 = PMAP_NO_CR3;
 2584         pmap->pm_pml4u = NULL;
 2585 
 2586         pmap->pm_type = pm_type;
 2587         if ((pml4pg->flags & PG_ZERO) == 0)
 2588                 pagezero(pmap->pm_pml4);
 2589 
 2590         /*
 2591          * Do not install the host kernel mappings in the nested page
 2592          * tables. These mappings are meaningless in the guest physical
 2593          * address space.
 2594          * Install minimal kernel mappings in PTI case.
  2595          * Install minimal kernel mappings in the PTI case.
 2596         if (pm_type == PT_X86) {
 2597                 pmap->pm_cr3 = pml4phys;
 2598                 pmap_pinit_pml4(pml4pg);
 2599                 if (pti) {
 2600                         while ((pml4pgu = vm_page_alloc(NULL, 0,
 2601                             VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED))
 2602                             == NULL)
 2603                                VM_WAIT;
 2604                         pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(
 2605                             VM_PAGE_TO_PHYS(pml4pgu));
 2606                         pmap_pinit_pml4_pti(pml4pgu);
 2607                         pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu);
 2608                 }
 2609         }
 2610 
 2611         pmap->pm_root.rt_root = 0;
 2612         CPU_ZERO(&pmap->pm_active);
 2613         TAILQ_INIT(&pmap->pm_pvchunk);
 2614         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 2615         pmap->pm_flags = flags;
 2616         pmap->pm_eptgen = 0;
 2617 
 2618         return (1);
 2619 }
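       /*
        * Note that with PTI enabled a PT_X86 pmap ends up with two roots:
        * pm_pml4/pm_cr3 for kernel mode and pm_pml4u/pm_ucr3 for user
        * mode, the latter initialized from pti_pml4 with only the minimal
        * kernel mappings.
        */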
 2620 
 2621 int
 2622 pmap_pinit(pmap_t pmap)
 2623 {
 2624 
 2625         return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
 2626 }
 2627 
 2628 /*
 2629  * This routine is called if the desired page table page does not exist.
 2630  *
 2631  * If page table page allocation fails, this routine may sleep before
 2632  * returning NULL.  It sleeps only if a lock pointer was given.
 2633  *
 2634  * Note: If a page allocation fails at page table level two or three,
 2635  * one or two pages may be held during the wait, only to be released
 2636  * afterwards.  This conservative approach is easily argued to avoid
 2637  * race conditions.
 2638  */
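       /*
        * Callers that pass a lock pointer are expected to retry on a NULL
        * return, since the needed page table page may have been allocated
        * while this routine slept; see, for example, the retry loop in
        * pmap_allocpde() below.
        */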
 2639 static vm_page_t
 2640 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
 2641 {
 2642         vm_page_t m, pdppg, pdpg;
 2643         pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 2644 
 2645         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2646 
 2647         PG_A = pmap_accessed_bit(pmap);
 2648         PG_M = pmap_modified_bit(pmap);
 2649         PG_V = pmap_valid_bit(pmap);
 2650         PG_RW = pmap_rw_bit(pmap);
 2651 
 2652         /*
 2653          * Allocate a page table page.
 2654          */
 2655         if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
 2656             VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
 2657                 if (lockp != NULL) {
 2658                         RELEASE_PV_LIST_LOCK(lockp);
 2659                         PMAP_UNLOCK(pmap);
 2660                         PMAP_ASSERT_NOT_IN_DI();
 2661                         VM_WAIT;
 2662                         PMAP_LOCK(pmap);
 2663                 }
 2664 
 2665                 /*
 2666                  * Indicate the need to retry.  While waiting, the page table
 2667                  * page may have been allocated.
 2668                  */
 2669                 return (NULL);
 2670         }
 2671         if ((m->flags & PG_ZERO) == 0)
 2672                 pmap_zero_page(m);
 2673 
 2674         /*
 2675          * Map the pagetable page into the process address space, if
 2676          * it isn't already there.
 2677          */
 2678 
 2679         if (ptepindex >= (NUPDE + NUPDPE)) {
 2680                 pml4_entry_t *pml4, *pml4u;
 2681                 vm_pindex_t pml4index;
 2682 
 2683                 /* Wire up a new PDPE page */
 2684                 pml4index = ptepindex - (NUPDE + NUPDPE);
 2685                 pml4 = &pmap->pm_pml4[pml4index];
 2686                 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 2687                 if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) {
 2688                         /*
 2689                          * PTI: Make all user-space mappings in the
 2690                          * kernel-mode page table no-execute so that
 2691                          * we detect any programming errors that leave
 2692                          * the kernel-mode page table active on return
 2693                          * to user space.
 2694                          */
 2695                         *pml4 |= pg_nx;
 2696 
 2697                         pml4u = &pmap->pm_pml4u[pml4index];
 2698                         *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
 2699                             PG_A | PG_M;
 2700                 }
 2701 
 2702         } else if (ptepindex >= NUPDE) {
 2703                 vm_pindex_t pml4index;
 2704                 vm_pindex_t pdpindex;
 2705                 pml4_entry_t *pml4;
 2706                 pdp_entry_t *pdp;
 2707 
 2708                 /* Wire up a new PDE page */
 2709                 pdpindex = ptepindex - NUPDE;
 2710                 pml4index = pdpindex >> NPML4EPGSHIFT;
 2711 
 2712                 pml4 = &pmap->pm_pml4[pml4index];
 2713                 if ((*pml4 & PG_V) == 0) {
 2714                         /* Have to allocate a new pdp, recurse */
 2715                         if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
 2716                             lockp) == NULL) {
 2717                                 --m->wire_count;
 2718                                 atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 2719                                 vm_page_free_zero(m);
 2720                                 return (NULL);
 2721                         }
 2722                 } else {
 2723                         /* Add reference to pdp page */
 2724                         pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
 2725                         pdppg->wire_count++;
 2726                 }
 2727                 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 2728 
 2729                 /* Now find the pdp page */
 2730                 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 2731                 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 2732 
 2733         } else {
 2734                 vm_pindex_t pml4index;
 2735                 vm_pindex_t pdpindex;
 2736                 pml4_entry_t *pml4;
 2737                 pdp_entry_t *pdp;
 2738                 pd_entry_t *pd;
 2739 
 2740                 /* Wire up a new PTE page */
 2741                 pdpindex = ptepindex >> NPDPEPGSHIFT;
 2742                 pml4index = pdpindex >> NPML4EPGSHIFT;
 2743 
  2744                 /* First, find the pdp and check that it's valid. */
 2745                 pml4 = &pmap->pm_pml4[pml4index];
 2746                 if ((*pml4 & PG_V) == 0) {
 2747                         /* Have to allocate a new pd, recurse */
 2748                         if (_pmap_allocpte(pmap, NUPDE + pdpindex,
 2749                             lockp) == NULL) {
 2750                                 --m->wire_count;
 2751                                 atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 2752                                 vm_page_free_zero(m);
 2753                                 return (NULL);
 2754                         }
 2755                         pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 2756                         pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 2757                 } else {
 2758                         pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 2759                         pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 2760                         if ((*pdp & PG_V) == 0) {
 2761                                 /* Have to allocate a new pd, recurse */
 2762                                 if (_pmap_allocpte(pmap, NUPDE + pdpindex,
 2763                                     lockp) == NULL) {
 2764                                         --m->wire_count;
 2765                                         atomic_subtract_int(&vm_cnt.v_wire_count,
 2766                                             1);
 2767                                         vm_page_free_zero(m);
 2768                                         return (NULL);
 2769                                 }
 2770                         } else {
 2771                                 /* Add reference to the pd page */
 2772                                 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
 2773                                 pdpg->wire_count++;
 2774                         }
 2775                 }
 2776                 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
 2777 
 2778                 /* Now we know where the page directory page is */
 2779                 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
 2780                 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 2781         }
 2782 
 2783         pmap_resident_count_inc(pmap, 1);
 2784 
 2785         return (m);
 2786 }
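
/*
 * Illustrative sketch, not part of pmap.c: how _pmap_allocpte() above turns a
 * page-table-page index into per-level slots.  Every x86-64 paging level holds
 * 512 (2^9) entries, so shifting right by 9 moves up one level; the 9-bit
 * shifts stand in for NPDPEPGSHIFT/NPML4EPGSHIFT, and the NUPDE/NUPDPE offsets
 * that select the PDP- and PML4-backed index ranges are omitted for brevity.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t ptepindex = 0x12345;		/* index of a page table page */
	uint64_t pdpindex = ptepindex >> 9;	/* covering PDP-level index */
	uint64_t pml4index = pdpindex >> 9;	/* covering PML4-level index */

	printf("pd slot %ju, pdp slot %ju, pml4 slot %ju\n",
	    (uintmax_t)(ptepindex & 511), (uintmax_t)(pdpindex & 511),
	    (uintmax_t)(pml4index & 511));
	return (0);
}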
 2787 
 2788 static vm_page_t
 2789 pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 2790 {
 2791         vm_pindex_t pdpindex, ptepindex;
 2792         pdp_entry_t *pdpe, PG_V;
 2793         vm_page_t pdpg;
 2794 
 2795         PG_V = pmap_valid_bit(pmap);
 2796 
 2797 retry:
 2798         pdpe = pmap_pdpe(pmap, va);
 2799         if (pdpe != NULL && (*pdpe & PG_V) != 0) {
 2800                 /* Add a reference to the pd page. */
 2801                 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
 2802                 pdpg->wire_count++;
 2803         } else {
 2804                 /* Allocate a pd page. */
 2805                 ptepindex = pmap_pde_pindex(va);
 2806                 pdpindex = ptepindex >> NPDPEPGSHIFT;
 2807                 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
 2808                 if (pdpg == NULL && lockp != NULL)
 2809                         goto retry;
 2810         }
 2811         return (pdpg);
 2812 }
 2813 
 2814 static vm_page_t
 2815 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 2816 {
 2817         vm_pindex_t ptepindex;
 2818         pd_entry_t *pd, PG_V;
 2819         vm_page_t m;
 2820 
 2821         PG_V = pmap_valid_bit(pmap);
 2822 
 2823         /*
 2824          * Calculate the page table page index.
 2825          */
 2826         ptepindex = pmap_pde_pindex(va);
 2827 retry:
 2828         /*
 2829          * Get the page directory entry
 2830          */
 2831         pd = pmap_pde(pmap, va);
 2832 
 2833         /*
 2834          * This supports switching from a 2MB page to a
 2835          * normal 4K page.
 2836          */
 2837         if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
 2838                 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
 2839                         /*
 2840                          * Invalidation of the 2MB page mapping may have caused
 2841                          * the deallocation of the underlying PD page.
 2842                          */
 2843                         pd = NULL;
 2844                 }
 2845         }
 2846 
 2847         /*
 2848          * If the page table page is mapped, we just increment the
 2849          * wire count and activate it.
 2850          */
 2851         if (pd != NULL && (*pd & PG_V) != 0) {
 2852                 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
 2853                 m->wire_count++;
 2854         } else {
 2855                 /*
 2856                  * The page table page is not mapped, or it has been
 2857                  * deallocated; allocate a new one.
 2858                  */
 2859                 m = _pmap_allocpte(pmap, ptepindex, lockp);
 2860                 if (m == NULL && lockp != NULL)
 2861                         goto retry;
 2862         }
 2863         return (m);
 2864 }
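
/*
 * Illustrative sketch, not part of pmap.c: the 4-level decomposition of a
 * canonical amd64 virtual address that the pmap_pde()/pmap_pde_pindex()
 * lookups above rely on.  Each level index is 9 bits wide and the page
 * offset is 12 bits; pmap_pde_pindex() corresponds, roughly, to va >> 21.
 * Standalone, hosted C with a made-up example address.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t va = 0x00007f1234567abcUL;	/* arbitrary user address */

	printf("pml4 %ju pdp %ju pd %ju pt %ju offset 0x%jx\n",
	    (uintmax_t)((va >> 39) & 0x1ff), (uintmax_t)((va >> 30) & 0x1ff),
	    (uintmax_t)((va >> 21) & 0x1ff), (uintmax_t)((va >> 12) & 0x1ff),
	    (uintmax_t)(va & 0xfff));
	return (0);
}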
 2865 
 2866 
 2867 /***************************************************
 2868  * Pmap allocation/deallocation routines.
 2869  ***************************************************/
 2870 
 2871 /*
 2872  * Release any resources held by the given physical map.
 2873  * Called when a pmap initialized by pmap_pinit is being released.
 2874  * Should only be called if the map contains no valid mappings.
 2875  */
 2876 void
 2877 pmap_release(pmap_t pmap)
 2878 {
 2879         vm_page_t m;
 2880         int i;
 2881 
 2882         KASSERT(pmap->pm_stats.resident_count == 0,
 2883             ("pmap_release: pmap resident count %ld != 0",
 2884             pmap->pm_stats.resident_count));
 2885         KASSERT(vm_radix_is_empty(&pmap->pm_root),
 2886             ("pmap_release: pmap has reserved page table page(s)"));
 2887         KASSERT(CPU_EMPTY(&pmap->pm_active),
 2888             ("releasing active pmap %p", pmap));
 2889 
 2890         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
 2891 
 2892         for (i = 0; i < NKPML4E; i++)   /* KVA */
 2893                 pmap->pm_pml4[KPML4BASE + i] = 0;
 2894         for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
 2895                 pmap->pm_pml4[DMPML4I + i] = 0;
 2896         pmap->pm_pml4[PML4PML4I] = 0;   /* Recursive Mapping */
 2897 
 2898         m->wire_count--;
 2899         atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 2900         vm_page_free_zero(m);
 2901 
 2902         if (pmap->pm_pml4u != NULL) {
 2903                 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u));
 2904                 m->wire_count--;
 2905                 atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 2906                 vm_page_free(m);
 2907         }
 2908 }
 2909 
 2910 static int
 2911 kvm_size(SYSCTL_HANDLER_ARGS)
 2912 {
 2913         unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
 2914 
 2915         return sysctl_handle_long(oidp, &ksize, 0, req);
 2916 }
 2917 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 
 2918     0, 0, kvm_size, "LU", "Size of KVM");
 2919 
 2920 static int
 2921 kvm_free(SYSCTL_HANDLER_ARGS)
 2922 {
 2923         unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
 2924 
 2925         return sysctl_handle_long(oidp, &kfree, 0, req);
 2926 }
 2927 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 
 2928     0, 0, kvm_free, "LU", "Amount of KVM free");
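
/*
 * Minimal userland sketch, not part of pmap.c: reading the two sysctls
 * declared above, which resolve to "vm.kvm_size" and "vm.kvm_free" and
 * report long values.  Error handling is reduced to the essentials.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	unsigned long ksize, kfree;
	size_t len;

	len = sizeof(ksize);
	if (sysctlbyname("vm.kvm_size", &ksize, &len, NULL, 0) == -1)
		return (1);
	len = sizeof(kfree);
	if (sysctlbyname("vm.kvm_free", &kfree, &len, NULL, 0) == -1)
		return (1);
	printf("KVM size %lu bytes, free %lu bytes\n", ksize, kfree);
	return (0);
}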
 2929 
 2930 /*
 2931  * grow the number of kernel page table entries, if needed
 2932  */
 2933 void
 2934 pmap_growkernel(vm_offset_t addr)
 2935 {
 2936         vm_paddr_t paddr;
 2937         vm_page_t nkpg;
 2938         pd_entry_t *pde, newpdir;
 2939         pdp_entry_t *pdpe;
 2940 
 2941         mtx_assert(&kernel_map->system_mtx, MA_OWNED);
 2942 
 2943         /*
 2944          * Return if "addr" is within the range of kernel page table pages
 2945          * that were preallocated during pmap bootstrap.  Moreover, leave
 2946          * "kernel_vm_end" and the kernel page table as they were.
 2947          *
 2948          * The correctness of this action is based on the following
 2949          * argument: vm_map_insert() allocates contiguous ranges of the
 2950          * kernel virtual address space.  It calls this function if a range
 2951          * ends after "kernel_vm_end".  If the kernel is mapped between
 2952          * "kernel_vm_end" and "addr", then the range cannot begin at
 2953          * "kernel_vm_end".  In fact, its beginning address cannot be less
 2954          * than the kernel.  Thus, there is no immediate need to allocate
 2955          * any new kernel page table pages between "kernel_vm_end" and
 2956          * "KERNBASE".
 2957          */
 2958         if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
 2959                 return;
 2960 
 2961         addr = roundup2(addr, NBPDR);
 2962         if (addr - 1 >= kernel_map->max_offset)
 2963                 addr = kernel_map->max_offset;
 2964         while (kernel_vm_end < addr) {
 2965                 pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
 2966                 if ((*pdpe & X86_PG_V) == 0) {
 2967                         /* We need a new PDP entry */
 2968                         nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
 2969                             VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
 2970                             VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 2971                         if (nkpg == NULL)
 2972                                 panic("pmap_growkernel: no memory to grow kernel");
 2973                         if ((nkpg->flags & PG_ZERO) == 0)
 2974                                 pmap_zero_page(nkpg);
 2975                         paddr = VM_PAGE_TO_PHYS(nkpg);
 2976                         *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
 2977                             X86_PG_A | X86_PG_M);
 2978                         continue; /* try again */
 2979                 }
 2980                 pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
 2981                 if ((*pde & X86_PG_V) != 0) {
 2982                         kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
 2983                         if (kernel_vm_end - 1 >= kernel_map->max_offset) {
 2984                                 kernel_vm_end = kernel_map->max_offset;
 2985                                 break;                       
 2986                         }
 2987                         continue;
 2988                 }
 2989 
 2990                 nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
 2991                     VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
 2992                     VM_ALLOC_ZERO);
 2993                 if (nkpg == NULL)
 2994                         panic("pmap_growkernel: no memory to grow kernel");
 2995                 if ((nkpg->flags & PG_ZERO) == 0)
 2996                         pmap_zero_page(nkpg);
 2997                 paddr = VM_PAGE_TO_PHYS(nkpg);
 2998                 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
 2999                 pde_store(pde, newpdir);
 3000 
 3001                 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
 3002                 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
 3003                         kernel_vm_end = kernel_map->max_offset;
 3004                         break;                       
 3005                 }
 3006         }
 3007 }
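
/*
 * Illustrative sketch, not part of pmap.c: the 2MB-granular address
 * arithmetic used by pmap_growkernel() above.  NBPDR (the span mapped by
 * one PDE) is 2MB and PDRMASK is NBPDR - 1 on amd64; SK_NBPDR/SK_PDRMASK
 * below are local stand-ins for those macros.
 */
#include <stdio.h>
#include <stdint.h>

#define	SK_NBPDR	(UINT64_C(1) << 21)	/* 2MB, one PDE's span */
#define	SK_PDRMASK	(SK_NBPDR - 1)

int
main(void)
{
	uint64_t addr = 0xffffffff80312345UL;	/* arbitrary kernel address */
	uint64_t end;

	/* roundup2(addr, NBPDR): round up to the next 2MB boundary. */
	end = (addr + SK_PDRMASK) & ~SK_PDRMASK;
	printf("rounded end: 0x%016jx\n", (uintmax_t)end);

	/* Advance one 2MB step, as the grow loop does for kernel_vm_end. */
	end = (end + SK_NBPDR) & ~SK_PDRMASK;
	printf("next step:   0x%016jx\n", (uintmax_t)end);
	return (0);
}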
 3008 
 3009 
 3010 /***************************************************
 3011  * page management routines.
 3012  ***************************************************/
 3013 
 3014 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
 3015 CTASSERT(_NPCM == 3);
 3016 CTASSERT(_NPCPV == 168);
 3017 
 3018 static __inline struct pv_chunk *
 3019 pv_to_chunk(pv_entry_t pv)
 3020 {
 3021 
 3022         return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
 3023 }
 3024 
 3025 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
 3026 
 3027 #define PC_FREE0        0xfffffffffffffffful
 3028 #define PC_FREE1        0xfffffffffffffffful
 3029 #define PC_FREE2        0x000000fffffffffful
 3030 
 3031 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
 3032 
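/*
 * Illustrative sketch, not part of pmap.c: why PC_FREE2 above has only its
 * low 40 bits set.  A chunk tracks _NPCPV == 168 pv entries in _NPCM == 3
 * 64-bit bitmaps, and 168 = 64 + 64 + 40, so the last bitmap word covers
 * just 40 slots.  Standalone C that recomputes the mask.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	int npcpv = 168, npcm = 3;
	uint64_t last = (UINT64_C(1) << (npcpv - 64 * (npcm - 1))) - 1;

	printf("PC_FREE2 = 0x%016jx\n", (uintmax_t)last);	/* 40 bits set */
	return (0);
}
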
 3033 #ifdef PV_STATS
 3034 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
 3035 
 3036 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
 3037         "Current number of pv entry chunks");
 3038 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
 3039         "Current number of pv entry chunks allocated");
 3040 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
 3041         "Current number of pv entry chunk frees");
 3042 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
 3043         "Number of times tried to get a chunk page but failed.");
 3044 
 3045 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
 3046 static int pv_entry_spare;
 3047 
 3048 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
 3049         "Current number of pv entry frees");
 3050 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
 3051         "Current number of pv entry allocs");
 3052 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
 3053         "Current number of pv entries");
 3054 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
 3055         "Current number of spare pv entries");
 3056 #endif
 3057 
 3058 /*
 3059  * We are in a serious low memory condition.  Resort to
 3060  * drastic measures to free some pages so we can allocate
 3061  * another pv entry chunk.
 3062  *
 3063  * Returns NULL if PV entries were reclaimed from the specified pmap.
 3064  *
 3065  * We do not, however, unmap 2mpages because subsequent accesses will
 3066  * allocate per-page pv entries until repromotion occurs, thereby
 3067  * exacerbating the shortage of free pv entries.
 3068  */
 3069 static vm_page_t
 3070 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
 3071 {
 3072         struct pch new_tail;
 3073         struct pv_chunk *pc;
 3074         struct md_page *pvh;
 3075         pd_entry_t *pde;
 3076         pmap_t pmap;
 3077         pt_entry_t *pte, tpte;
 3078         pt_entry_t PG_G, PG_A, PG_M, PG_RW;
 3079         pv_entry_t pv;
 3080         vm_offset_t va;
 3081         vm_page_t m, m_pc;
 3082         struct spglist free;
 3083         uint64_t inuse;
 3084         int bit, field, freed;
 3085 
 3086         PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
 3087         KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
 3088         pmap = NULL;
 3089         m_pc = NULL;
 3090         PG_G = PG_A = PG_M = PG_RW = 0;
 3091         SLIST_INIT(&free);
 3092         TAILQ_INIT(&new_tail);
 3093         pmap_delayed_invl_started();
 3094         mtx_lock(&pv_chunks_mutex);
 3095         while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) {
 3096                 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 3097                 mtx_unlock(&pv_chunks_mutex);
 3098                 if (pmap != pc->pc_pmap) {
 3099                         if (pmap != NULL) {
 3100                                 pmap_invalidate_all(pmap);
 3101                                 if (pmap != locked_pmap)
 3102                                         PMAP_UNLOCK(pmap);
 3103                         }
 3104                         pmap_delayed_invl_finished();
 3105                         pmap_delayed_invl_started();
 3106                         pmap = pc->pc_pmap;
 3107                         /* Avoid deadlock and lock recursion. */
 3108                         if (pmap > locked_pmap) {
 3109                                 RELEASE_PV_LIST_LOCK(lockp);
 3110                                 PMAP_LOCK(pmap);
 3111                         } else if (pmap != locked_pmap &&
 3112                             !PMAP_TRYLOCK(pmap)) {
 3113                                 pmap = NULL;
 3114                                 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
 3115                                 mtx_lock(&pv_chunks_mutex);
 3116                                 continue;
 3117                         }
 3118                         PG_G = pmap_global_bit(pmap);
 3119                         PG_A = pmap_accessed_bit(pmap);
 3120                         PG_M = pmap_modified_bit(pmap);
 3121                         PG_RW = pmap_rw_bit(pmap);
 3122                 }
 3123 
 3124                 /*
 3125                  * Destroy every non-wired, 4 KB page mapping in the chunk.
 3126                  */
 3127                 freed = 0;
 3128                 for (field = 0; field < _NPCM; field++) {
 3129                         for (inuse = ~pc->pc_map[field] & pc_freemask[field];
 3130                             inuse != 0; inuse &= ~(1UL << bit)) {
 3131                                 bit = bsfq(inuse);
 3132                                 pv = &pc->pc_pventry[field * 64 + bit];
 3133                                 va = pv->pv_va;
 3134                                 pde = pmap_pde(pmap, va);
 3135                                 if ((*pde & PG_PS) != 0)
 3136                                         continue;
 3137                                 pte = pmap_pde_to_pte(pde, va);
 3138                                 if ((*pte & PG_W) != 0)
 3139                                         continue;
 3140                                 tpte = pte_load_clear(pte);
 3141                                 if ((tpte & PG_G) != 0)
 3142                                         pmap_invalidate_page(pmap, va);
 3143                                 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
 3144                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 3145                                         vm_page_dirty(m);
 3146                                 if ((tpte & PG_A) != 0)
 3147                                         vm_page_aflag_set(m, PGA_REFERENCED);
 3148                                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 3149                                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 3150                                 m->md.pv_gen++;
 3151                                 if (TAILQ_EMPTY(&m->md.pv_list) &&
 3152                                     (m->flags & PG_FICTITIOUS) == 0) {
 3153                                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 3154                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
 3155                                                 vm_page_aflag_clear(m,
 3156                                                     PGA_WRITEABLE);
 3157                                         }
 3158                                 }
 3159                                 pmap_delayed_invl_page(m);
 3160                                 pc->pc_map[field] |= 1UL << bit;
 3161                                 pmap_unuse_pt(pmap, va, *pde, &free);
 3162                                 freed++;
 3163                         }
 3164                 }
 3165                 if (freed == 0) {
 3166                         TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
 3167                         mtx_lock(&pv_chunks_mutex);
 3168                         continue;
 3169                 }
 3170                 /* Every freed mapping is for a 4 KB page. */
 3171                 pmap_resident_count_dec(pmap, freed);
 3172                 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
 3173                 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
 3174                 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 3175                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 3176                 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
 3177                     pc->pc_map[2] == PC_FREE2) {
 3178                         PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
 3179                         PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
 3180                         PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
 3181                         /* Entire chunk is free; return it. */
 3182                         m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 3183                         dump_drop_page(m_pc->phys_addr);
 3184                         mtx_lock(&pv_chunks_mutex);
 3185                         break;
 3186                 }
 3187                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 3188                 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
 3189                 mtx_lock(&pv_chunks_mutex);
 3190                 /* One freed pv entry in locked_pmap is sufficient. */
 3191                 if (pmap == locked_pmap)
 3192                         break;
 3193         }
 3194         TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
 3195         mtx_unlock(&pv_chunks_mutex);
 3196         if (pmap != NULL) {
 3197                 pmap_invalidate_all(pmap);
 3198                 if (pmap != locked_pmap)
 3199                         PMAP_UNLOCK(pmap);
 3200         }
 3201         pmap_delayed_invl_finished();
 3202         if (m_pc == NULL && !SLIST_EMPTY(&free)) {
 3203                 m_pc = SLIST_FIRST(&free);
 3204                 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
 3205                 /* Recycle a freed page table page. */
 3206                 m_pc->wire_count = 1;
 3207                 atomic_add_int(&vm_cnt.v_wire_count, 1);
 3208         }
 3209         pmap_free_zero_pages(&free);
 3210         return (m_pc);
 3211 }
 3212 
 3213 /*
 3214  * free the pv_entry back to the free list
 3215  */
 3216 static void
 3217 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 3218 {
 3219         struct pv_chunk *pc;
 3220         int idx, field, bit;
 3221 
 3222         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3223         PV_STAT(atomic_add_long(&pv_entry_frees, 1));
 3224         PV_STAT(atomic_add_int(&pv_entry_spare, 1));
 3225         PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
 3226         pc = pv_to_chunk(pv);
 3227         idx = pv - &pc->pc_pventry[0];
 3228         field = idx / 64;
 3229         bit = idx % 64;
 3230         pc->pc_map[field] |= 1ul << bit;
 3231         if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
 3232             pc->pc_map[2] != PC_FREE2) {
 3233                 /* 98% of the time, pc is already at the head of the list. */
 3234                 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
 3235                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 3236                         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 3237                 }
 3238                 return;
 3239         }
 3240         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 3241         free_pv_chunk(pc);
 3242 }
 3243 
 3244 static void
 3245 free_pv_chunk(struct pv_chunk *pc)
 3246 {
 3247         vm_page_t m;
 3248 
 3249         mtx_lock(&pv_chunks_mutex);
 3250         TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 3251         mtx_unlock(&pv_chunks_mutex);
 3252         PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
 3253         PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
 3254         PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
 3255         /* Entire chunk is free; return it. */
 3256         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 3257         dump_drop_page(m->phys_addr);
 3258         vm_page_unwire(m, PQ_NONE);
 3259         vm_page_free(m);
 3260 }
 3261 
 3262 /*
 3263  * Returns a new PV entry, allocating a new PV chunk from the system when
 3264  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
 3265  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
 3266  * returned.
 3267  *
 3268  * The given PV list lock may be released.
 3269  */
 3270 static pv_entry_t
 3271 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
 3272 {
 3273         int bit, field;
 3274         pv_entry_t pv;
 3275         struct pv_chunk *pc;
 3276         vm_page_t m;
 3277 
 3278         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3279         PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
 3280 retry:
 3281         pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 3282         if (pc != NULL) {
 3283                 for (field = 0; field < _NPCM; field++) {
 3284                         if (pc->pc_map[field]) {
 3285                                 bit = bsfq(pc->pc_map[field]);
 3286                                 break;
 3287                         }
 3288                 }
 3289                 if (field < _NPCM) {
 3290                         pv = &pc->pc_pventry[field * 64 + bit];
 3291                         pc->pc_map[field] &= ~(1ul << bit);
 3292                         /* If this was the last item, move it to tail */
 3293                         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
 3294                             pc->pc_map[2] == 0) {
 3295                                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 3296                                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
 3297                                     pc_list);
 3298                         }
 3299                         PV_STAT(atomic_add_long(&pv_entry_count, 1));
 3300                         PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
 3301                         return (pv);
 3302                 }
 3303         }
 3304         /* No free items, allocate another chunk */
 3305         m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 3306             VM_ALLOC_WIRED);
 3307         if (m == NULL) {
 3308                 if (lockp == NULL) {
 3309                         PV_STAT(pc_chunk_tryfail++);
 3310                         return (NULL);
 3311                 }
 3312                 m = reclaim_pv_chunk(pmap, lockp);
 3313                 if (m == NULL)
 3314                         goto retry;
 3315         }
 3316         PV_STAT(atomic_add_int(&pc_chunk_count, 1));
 3317         PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
 3318         dump_add_page(m->phys_addr);
 3319         pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 3320         pc->pc_pmap = pmap;
 3321         pc->pc_map[0] = PC_FREE0 & ~1ul;        /* preallocated bit 0 */
 3322         pc->pc_map[1] = PC_FREE1;
 3323         pc->pc_map[2] = PC_FREE2;
 3324         mtx_lock(&pv_chunks_mutex);
 3325         TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 3326         mtx_unlock(&pv_chunks_mutex);
 3327         pv = &pc->pc_pventry[0];
 3328         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 3329         PV_STAT(atomic_add_long(&pv_entry_count, 1));
 3330         PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
 3331         return (pv);
 3332 }
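
/*
 * Illustrative sketch, not part of pmap.c: the free-slot bitmap arithmetic
 * shared by free_pv_entry() and get_pv_entry() above.  A set bit means the
 * slot is free; bsfq corresponds to a count-trailing-zeros on the 64-bit
 * word, and a slot index splits into a word ("field") and a bit within it.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t pc_map[3] = { 0, 0x100, 0 };	/* only slot 72 is free */
	int field, bit, idx;

	for (field = 0; field < 3; field++)
		if (pc_map[field] != 0)
			break;
	bit = __builtin_ctzll(pc_map[field]);	/* bsfq equivalent */
	idx = field * 64 + bit;
	pc_map[field] &= ~(UINT64_C(1) << bit);	/* allocate: clear the bit */
	printf("allocated pv slot %d\n", idx);
	pc_map[idx / 64] |= UINT64_C(1) << (idx % 64);	/* free it again */
	return (0);
}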
 3333 
 3334 /*
 3335  * Returns the number of one bits within the given PV chunk map.
 3336  *
 3337  * The errata for Intel processors state that "POPCNT Instruction May
 3338  * Take Longer to Execute Than Expected".  It is believed that the
 3339  * issue is the spurious dependency on the destination register.
 3340  * Provide a hint to the register rename logic that the destination
 3341  * value is overwritten, by clearing it, as suggested in the
 3342  * optimization manual.  It should be cheap for unaffected processors
 3343  * as well.
 3344  *
 3345  * Reference numbers for the errata are
 3346  * 4th Gen Core: HSD146
 3347  * 5th Gen Core: BDM85
 3348  * 6th Gen Core: SKL029
 3349  */
 3350 static int
 3351 popcnt_pc_map_pq(uint64_t *map)
 3352 {
 3353         u_long result, tmp;
 3354 
 3355         __asm __volatile("xorl %k0,%k0;popcntq %2,%0;"
 3356             "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;"
 3357             "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0"
 3358             : "=&r" (result), "=&r" (tmp)
 3359             : "m" (map[0]), "m" (map[1]), "m" (map[2]));
 3360         return (result);
 3361 }
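
/*
 * Illustrative sketch, not part of pmap.c: what popcnt_pc_map_pq() above
 * computes, expressed with the GCC/Clang builtin instead of inline
 * assembly.  The kernel version additionally zeroes the destination
 * register before each popcntq to break the spurious output dependency
 * described in the errata comment; here that is left to the compiler.
 */
#include <stdio.h>
#include <stdint.h>

static int
popcnt_map_portable(const uint64_t map[3])
{
	return (__builtin_popcountll(map[0]) +
	    __builtin_popcountll(map[1]) +
	    __builtin_popcountll(map[2]));
}

int
main(void)
{
	uint64_t map[3] = { UINT64_MAX, 0x1, 0xf0 };

	printf("%d bits set\n", popcnt_map_portable(map));	/* 69 */
	return (0);
}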
 3362 
 3363 /*
 3364  * Ensure that the number of spare PV entries in the specified pmap meets or
 3365  * exceeds the given count, "needed".
 3366  *
 3367  * The given PV list lock may be released.
 3368  */
 3369 static void
 3370 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
 3371 {
 3372         struct pch new_tail;
 3373         struct pv_chunk *pc;
 3374         int avail, free;
 3375         vm_page_t m;
 3376 
 3377         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3378         KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
 3379 
 3380         /*
 3381          * Newly allocated PV chunks must be stored in a private list until
 3382          * the required number of PV chunks have been allocated.  Otherwise,
 3383          * reclaim_pv_chunk() could recycle one of these chunks.  They
 3384          * must, however, be added to the pmap's chunk list upon allocation.
 3385          */
 3386         TAILQ_INIT(&new_tail);
 3387 retry:
 3388         avail = 0;
 3389         TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
 3390 #ifndef __POPCNT__
 3391                 if ((cpu_feature2 & CPUID2_POPCNT) == 0)
 3392                         bit_count((bitstr_t *)pc->pc_map, 0,
 3393                             sizeof(pc->pc_map) * NBBY, &free);
 3394                 else
 3395 #endif
 3396                 free = popcnt_pc_map_pq(pc->pc_map);
 3397                 if (free == 0)
 3398                         break;
 3399                 avail += free;
 3400                 if (avail >= needed)
 3401                         break;
 3402         }
 3403         for (; avail < needed; avail += _NPCPV) {
 3404                 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 3405                     VM_ALLOC_WIRED);
 3406                 if (m == NULL) {
 3407                         m = reclaim_pv_chunk(pmap, lockp);
 3408                         if (m == NULL)
 3409                                 goto retry;
 3410                 }
 3411                 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
 3412                 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
 3413                 dump_add_page(m->phys_addr);
 3414                 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 3415                 pc->pc_pmap = pmap;
 3416                 pc->pc_map[0] = PC_FREE0;
 3417                 pc->pc_map[1] = PC_FREE1;
 3418                 pc->pc_map[2] = PC_FREE2;
 3419                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 3420                 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
 3421                 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
 3422         }
 3423         if (!TAILQ_EMPTY(&new_tail)) {
 3424                 mtx_lock(&pv_chunks_mutex);
 3425                 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
 3426                 mtx_unlock(&pv_chunks_mutex);
 3427         }
 3428 }
 3429 
 3430 /*
 3431  * First find and then remove the pv entry for the specified pmap and virtual
 3432  * address from the specified pv list.  Returns the pv entry if found and NULL
 3433  * otherwise.  This operation can be performed on pv lists for either 4KB or
 3434  * 2MB page mappings.
 3435  */
 3436 static __inline pv_entry_t
 3437 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 3438 {
 3439         pv_entry_t pv;
 3440 
 3441         TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 3442                 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
 3443                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 3444                         pvh->pv_gen++;
 3445                         break;
 3446                 }
 3447         }
 3448         return (pv);
 3449 }
 3450 
 3451 /*
 3452  * After demotion from a 2MB page mapping to 512 4KB page mappings,
 3453  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
 3454  * entries for each of the 4KB page mappings.
 3455  */
 3456 static void
 3457 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 3458     struct rwlock **lockp)
 3459 {
 3460         struct md_page *pvh;
 3461         struct pv_chunk *pc;
 3462         pv_entry_t pv;
 3463         vm_offset_t va_last;
 3464         vm_page_t m;
 3465         int bit, field;
 3466 
 3467         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3468         KASSERT((pa & PDRMASK) == 0,
 3469             ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
 3470         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 3471 
 3472         /*
 3473          * Transfer the 2mpage's pv entry for this mapping to the first
 3474          * page's pv list.  Once this transfer begins, the pv list lock
 3475          * must not be released until the last pv entry is reinstantiated.
 3476          */
 3477         pvh = pa_to_pvh(pa);
 3478         va = trunc_2mpage(va);
 3479         pv = pmap_pvh_remove(pvh, pmap, va);
 3480         KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
 3481         m = PHYS_TO_VM_PAGE(pa);
 3482         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 3483         m->md.pv_gen++;
 3484         /* Instantiate the remaining NPTEPG - 1 pv entries. */
 3485         PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
 3486         va_last = va + NBPDR - PAGE_SIZE;
 3487         for (;;) {
 3488                 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 3489                 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
 3490                     pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
 3491                 for (field = 0; field < _NPCM; field++) {
 3492                         while (pc->pc_map[field]) {
 3493                                 bit = bsfq(pc->pc_map[field]);
 3494                                 pc->pc_map[field] &= ~(1ul << bit);
 3495                                 pv = &pc->pc_pventry[field * 64 + bit];
 3496                                 va += PAGE_SIZE;
 3497                                 pv->pv_va = va;
 3498                                 m++;
 3499                                 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 3500                             ("pmap_pv_demote_pde: page %p is not managed", m));
 3501                                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 3502                                 m->md.pv_gen++;
 3503                                 if (va == va_last)
 3504                                         goto out;
 3505                         }
 3506                 }
 3507                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 3508                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 3509         }
 3510 out:
 3511         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
 3512                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 3513                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 3514         }
 3515         PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
 3516         PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
 3517 }
 3518 
 3519 /*
 3520  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
 3521  * replace the many pv entries for the 4KB page mappings by a single pv entry
 3522  * for the 2MB page mapping.
 3523  */
 3524 static void
 3525 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 3526     struct rwlock **lockp)
 3527 {
 3528         struct md_page *pvh;
 3529         pv_entry_t pv;
 3530         vm_offset_t va_last;
 3531         vm_page_t m;
 3532 
 3533         KASSERT((pa & PDRMASK) == 0,
 3534             ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
 3535         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 3536 
 3537         /*
 3538          * Transfer the first page's pv entry for this mapping to the 2mpage's
 3539          * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
 3540          * a transfer avoids the possibility that get_pv_entry() calls
 3541          * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
 3542          * mappings that is being promoted.
 3543          */
 3544         m = PHYS_TO_VM_PAGE(pa);
 3545         va = trunc_2mpage(va);
 3546         pv = pmap_pvh_remove(&m->md, pmap, va);
 3547         KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
 3548         pvh = pa_to_pvh(pa);
 3549         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 3550         pvh->pv_gen++;
 3551         /* Free the remaining NPTEPG - 1 pv entries. */
 3552         va_last = va + NBPDR - PAGE_SIZE;
 3553         do {
 3554                 m++;
 3555                 va += PAGE_SIZE;
 3556                 pmap_pvh_free(&m->md, pmap, va);
 3557         } while (va < va_last);
 3558 }
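
/*
 * Illustrative sketch, not part of pmap.c: the address walk used by the
 * demotion and promotion helpers above.  trunc_2mpage() amounts to masking
 * off the low 21 bits, and the remaining NPTEPG - 1 == 511 4KB pages span
 * [va + 4KB, va + 2MB).  SK_* macros are local stand-ins.
 */
#include <stdio.h>
#include <stdint.h>

#define	SK_PAGE_SIZE	(UINT64_C(1) << 12)
#define	SK_NBPDR	(UINT64_C(1) << 21)

int
main(void)
{
	uint64_t va = 0x00007fffdeadb000UL, va_last, v;
	int n = 0;

	va &= ~(SK_NBPDR - 1);			/* trunc_2mpage() */
	va_last = va + SK_NBPDR - SK_PAGE_SIZE;
	for (v = va + SK_PAGE_SIZE; v <= va_last; v += SK_PAGE_SIZE)
		n++;
	printf("base 0x%016jx, %d additional 4KB pages\n", (uintmax_t)va, n);
	return (0);
}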
 3559 
 3560 /*
 3561  * First find and then destroy the pv entry for the specified pmap and virtual
 3562  * address.  This operation can be performed on pv lists for either 4KB or 2MB
 3563  * page mappings.
 3564  */
 3565 static void
 3566 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 3567 {
 3568         pv_entry_t pv;
 3569 
 3570         pv = pmap_pvh_remove(pvh, pmap, va);
 3571         KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
 3572         free_pv_entry(pmap, pv);
 3573 }
 3574 
 3575 /*
 3576  * Conditionally create the PV entry for a 4KB page mapping if the required
 3577  * memory can be allocated without resorting to reclamation.
 3578  */
 3579 static boolean_t
 3580 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
 3581     struct rwlock **lockp)
 3582 {
 3583         pv_entry_t pv;
 3584 
 3585         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3586         /* Pass NULL instead of the lock pointer to disable reclamation. */
 3587         if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
 3588                 pv->pv_va = va;
 3589                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 3590                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 3591                 m->md.pv_gen++;
 3592                 return (TRUE);
 3593         } else
 3594                 return (FALSE);
 3595 }
 3596 
 3597 /*
 3598  * Conditionally create the PV entry for a 2MB page mapping if the required
 3599  * memory can be allocated without resorting to reclamation.
 3600  */
 3601 static boolean_t
 3602 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 3603     struct rwlock **lockp)
 3604 {
 3605         struct md_page *pvh;
 3606         pv_entry_t pv;
 3607 
 3608         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3609         /* Pass NULL instead of the lock pointer to disable reclamation. */
 3610         if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
 3611                 pv->pv_va = va;
 3612                 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 3613                 pvh = pa_to_pvh(pa);
 3614                 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 3615                 pvh->pv_gen++;
 3616                 return (TRUE);
 3617         } else
 3618                 return (FALSE);
 3619 }
 3620 
 3621 /*
 3622  * Fills a page table page with mappings to consecutive physical pages.
 3623  */
 3624 static void
 3625 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
 3626 {
 3627         pt_entry_t *pte;
 3628 
 3629         for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
 3630                 *pte = newpte;
 3631                 newpte += PAGE_SIZE;
 3632         }
 3633 }
 3634 
 3635 /*
 3636  * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
 3637  * mapping is invalidated.
 3638  */
 3639 static boolean_t
 3640 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 3641 {
 3642         struct rwlock *lock;
 3643         boolean_t rv;
 3644 
 3645         lock = NULL;
 3646         rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
 3647         if (lock != NULL)
 3648                 rw_wunlock(lock);
 3649         return (rv);
 3650 }
 3651 
 3652 static boolean_t
 3653 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
 3654     struct rwlock **lockp)
 3655 {
 3656         pd_entry_t newpde, oldpde;
 3657         pt_entry_t *firstpte, newpte;
 3658         pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V;
 3659         vm_paddr_t mptepa;
 3660         vm_page_t mpte;
 3661         struct spglist free;
 3662         vm_offset_t sva;
 3663         int PG_PTE_CACHE;
 3664 
 3665         PG_G = pmap_global_bit(pmap);
 3666         PG_A = pmap_accessed_bit(pmap);
 3667         PG_M = pmap_modified_bit(pmap);
 3668         PG_RW = pmap_rw_bit(pmap);
 3669         PG_V = pmap_valid_bit(pmap);
 3670         PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
 3671 
 3672         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3673         oldpde = *pde;
 3674         KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
 3675             ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
 3676         if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
 3677             NULL) {
 3678                 KASSERT((oldpde & PG_W) == 0,
 3679                     ("pmap_demote_pde: page table page for a wired mapping"
 3680                     " is missing"));
 3681 
 3682                 /*
 3683                  * Invalidate the 2MB page mapping and return "failure" if the
 3684                  * mapping was never accessed or the allocation of the new
 3685                  * page table page fails.  If the 2MB page mapping belongs to
 3686                  * the direct map region of the kernel's address space, then
 3687                  * the page allocation request specifies the highest possible
 3688                  * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
 3689                  * normal.  Page table pages are preallocated for every other
 3690                  * part of the kernel address space, so the direct map region
 3691                  * is the only part of the kernel address space that must be
 3692                  * handled here.
 3693                  */
 3694                 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
 3695                     pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
 3696                     DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
 3697                     VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 3698                         SLIST_INIT(&free);
 3699                         sva = trunc_2mpage(va);
 3700                         pmap_remove_pde(pmap, pde, sva, &free, lockp);
 3701                         if ((oldpde & PG_G) == 0)
 3702                                 pmap_invalidate_pde_page(pmap, sva, oldpde);
 3703                         pmap_free_zero_pages(&free);
 3704                         CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
 3705                             " in pmap %p", va, pmap);
 3706                         return (FALSE);
 3707                 }
 3708                 if (va < VM_MAXUSER_ADDRESS)
 3709                         pmap_resident_count_inc(pmap, 1);
 3710         }
 3711         mptepa = VM_PAGE_TO_PHYS(mpte);
 3712         firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
 3713         newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
 3714         KASSERT((oldpde & PG_A) != 0,
 3715             ("pmap_demote_pde: oldpde is missing PG_A"));
 3716         KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
 3717             ("pmap_demote_pde: oldpde is missing PG_M"));
 3718         newpte = oldpde & ~PG_PS;
 3719         newpte = pmap_swap_pat(pmap, newpte);
 3720 
 3721         /*
 3722          * If the page table page is new, initialize it.
 3723          */
 3724         if (mpte->wire_count == 1) {
 3725                 mpte->wire_count = NPTEPG;
 3726                 pmap_fill_ptp(firstpte, newpte);
 3727         }
 3728         KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
 3729             ("pmap_demote_pde: firstpte and newpte map different physical"
 3730             " addresses"));
 3731 
 3732         /*
 3733          * If the mapping has changed attributes, update the page table
 3734          * entries.
 3735          */
 3736         if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
 3737                 pmap_fill_ptp(firstpte, newpte);
 3738 
 3739         /*
 3740          * The spare PV entries must be reserved prior to demoting the
 3741          * mapping, that is, prior to changing the PDE.  Otherwise, the state
 3742          * of the PDE and the PV lists will be inconsistent, which can result
 3743          * in reclaim_pv_chunk() attempting to remove a PV entry from the
 3744          * wrong PV list and pmap_pv_demote_pde() failing to find the expected
 3745          * PV entry for the 2MB page mapping that is being demoted.
 3746          */
 3747         if ((oldpde & PG_MANAGED) != 0)
 3748                 reserve_pv_entries(pmap, NPTEPG - 1, lockp);
 3749 
 3750         /*
 3751          * Demote the mapping.  This pmap is locked.  The old PDE has
 3752          * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
 3753          * set.  Thus, there is no danger of a race with another
 3754          * processor changing the setting of PG_A and/or PG_M between
 3755          * the read above and the store below. 
 3756          */
 3757         if (workaround_erratum383)
 3758                 pmap_update_pde(pmap, va, pde, newpde);
 3759         else
 3760                 pde_store(pde, newpde);
 3761 
 3762         /*
 3763          * Invalidate a stale recursive mapping of the page table page.
 3764          */
 3765         if (va >= VM_MAXUSER_ADDRESS)
 3766                 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
 3767 
 3768         /*
 3769          * Demote the PV entry.
 3770          */
 3771         if ((oldpde & PG_MANAGED) != 0)
 3772                 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
 3773 
 3774         atomic_add_long(&pmap_pde_demotions, 1);
 3775         CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
 3776             " in pmap %p", va, pmap);
 3777         return (TRUE);
 3778 }
 3779 
 3780 /*
 3781  * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
 3782  */
 3783 static void
 3784 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 3785 {
 3786         pd_entry_t newpde;
 3787         vm_paddr_t mptepa;
 3788         vm_page_t mpte;
 3789 
 3790         KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
 3791         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3792         mpte = pmap_remove_pt_page(pmap, va);
 3793         if (mpte == NULL)
 3794                 panic("pmap_remove_kernel_pde: Missing pt page.");
 3795 
 3796         mptepa = VM_PAGE_TO_PHYS(mpte);
 3797         newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
 3798 
 3799         /*
 3800          * Initialize the page table page.
 3801          */
 3802         pagezero((void *)PHYS_TO_DMAP(mptepa));
 3803 
 3804         /*
 3805          * Demote the mapping.
 3806          */
 3807         if (workaround_erratum383)
 3808                 pmap_update_pde(pmap, va, pde, newpde);
 3809         else
 3810                 pde_store(pde, newpde);
 3811 
 3812         /*
 3813          * Invalidate a stale recursive mapping of the page table page.
 3814          */
 3815         pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
 3816 }
 3817 
 3818 /*
 3819  * pmap_remove_pde: unmap a 2MB superpage mapping from a pmap.
 3820  */
 3821 static int
 3822 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
 3823     struct spglist *free, struct rwlock **lockp)
 3824 {
 3825         struct md_page *pvh;
 3826         pd_entry_t oldpde;
 3827         vm_offset_t eva, va;
 3828         vm_page_t m, mpte;
 3829         pt_entry_t PG_G, PG_A, PG_M, PG_RW;
 3830 
 3831         PG_G = pmap_global_bit(pmap);
 3832         PG_A = pmap_accessed_bit(pmap);
 3833         PG_M = pmap_modified_bit(pmap);
 3834         PG_RW = pmap_rw_bit(pmap);
 3835 
 3836         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3837         KASSERT((sva & PDRMASK) == 0,
 3838             ("pmap_remove_pde: sva is not 2mpage aligned"));
 3839         oldpde = pte_load_clear(pdq);
 3840         if (oldpde & PG_W)
 3841                 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
 3842         if ((oldpde & PG_G) != 0)
 3843                 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
 3844         pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
 3845         if (oldpde & PG_MANAGED) {
 3846                 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
 3847                 pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
 3848                 pmap_pvh_free(pvh, pmap, sva);
 3849                 eva = sva + NBPDR;
 3850                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 3851                     va < eva; va += PAGE_SIZE, m++) {
 3852                         if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
 3853                                 vm_page_dirty(m);
 3854                         if (oldpde & PG_A)
 3855                                 vm_page_aflag_set(m, PGA_REFERENCED);
 3856                         if (TAILQ_EMPTY(&m->md.pv_list) &&
 3857                             TAILQ_EMPTY(&pvh->pv_list))
 3858                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
 3859                         pmap_delayed_invl_page(m);
 3860                 }
 3861         }
 3862         if (pmap == kernel_pmap) {
 3863                 pmap_remove_kernel_pde(pmap, pdq, sva);
 3864         } else {
 3865                 mpte = pmap_remove_pt_page(pmap, sva);
 3866                 if (mpte != NULL) {
 3867                         pmap_resident_count_dec(pmap, 1);
 3868                         KASSERT(mpte->wire_count == NPTEPG,
 3869                             ("pmap_remove_pde: pte page wire count error"));
 3870                         mpte->wire_count = 0;
 3871                         pmap_add_delayed_free_list(mpte, free, FALSE);
 3872                         atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 3873                 }
 3874         }
 3875         return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
 3876 }
 3877 
 3878 /*
 3879  * pmap_remove_pte: unmap a single 4KB page mapping from a pmap.
 3880  */
 3881 static int
 3882 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 
 3883     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
 3884 {
 3885         struct md_page *pvh;
 3886         pt_entry_t oldpte, PG_A, PG_M, PG_RW;
 3887         vm_page_t m;
 3888 
 3889         PG_A = pmap_accessed_bit(pmap);
 3890         PG_M = pmap_modified_bit(pmap);
 3891         PG_RW = pmap_rw_bit(pmap);
 3892 
 3893         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3894         oldpte = pte_load_clear(ptq);
 3895         if (oldpte & PG_W)
 3896                 pmap->pm_stats.wired_count -= 1;
 3897         pmap_resident_count_dec(pmap, 1);
 3898         if (oldpte & PG_MANAGED) {
 3899                 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
 3900                 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 3901                         vm_page_dirty(m);
 3902                 if (oldpte & PG_A)
 3903                         vm_page_aflag_set(m, PGA_REFERENCED);
 3904                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 3905                 pmap_pvh_free(&m->md, pmap, va);
 3906                 if (TAILQ_EMPTY(&m->md.pv_list) &&
 3907                     (m->flags & PG_FICTITIOUS) == 0) {
 3908                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 3909                         if (TAILQ_EMPTY(&pvh->pv_list))
 3910                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
 3911                 }
 3912                 pmap_delayed_invl_page(m);
 3913         }
 3914         return (pmap_unuse_pt(pmap, va, ptepde, free));
 3915 }
 3916 
 3917 /*
 3918  * Remove a single page from a process address space
 3919  */
 3920 static void
 3921 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
 3922     struct spglist *free)
 3923 {
 3924         struct rwlock *lock;
 3925         pt_entry_t *pte, PG_V;
 3926 
 3927         PG_V = pmap_valid_bit(pmap);
 3928         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3929         if ((*pde & PG_V) == 0)
 3930                 return;
 3931         pte = pmap_pde_to_pte(pde, va);
 3932         if ((*pte & PG_V) == 0)
 3933                 return;
 3934         lock = NULL;
 3935         pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
 3936         if (lock != NULL)
 3937                 rw_wunlock(lock);
 3938         pmap_invalidate_page(pmap, va);
 3939 }
 3940 
 3941 /*
 3942  *      Remove the given range of addresses from the specified map.
 3943  *
 3944  *      It is assumed that the start and end are properly
 3945  *      rounded to the page size.
 3946  */
 3947 void
 3948 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 3949 {
 3950         struct rwlock *lock;
 3951         vm_offset_t va, va_next;
 3952         pml4_entry_t *pml4e;
 3953         pdp_entry_t *pdpe;
 3954         pd_entry_t ptpaddr, *pde;
 3955         pt_entry_t *pte, PG_G, PG_V;
 3956         struct spglist free;
 3957         int anyvalid;
 3958 
 3959         PG_G = pmap_global_bit(pmap);
 3960         PG_V = pmap_valid_bit(pmap);
 3961 
 3962         /*
 3963          * Perform an unsynchronized read.  This is, however, safe.
 3964          */
 3965         if (pmap->pm_stats.resident_count == 0)
 3966                 return;
 3967 
 3968         anyvalid = 0;
 3969         SLIST_INIT(&free);
 3970 
 3971         pmap_delayed_invl_started();
 3972         PMAP_LOCK(pmap);
 3973 
 3974         /*
 3975          * Special handling for removing a single page: it is a very
 3976          * common operation, so it is worth short-circuiting some of the
 3977          * general code below.
 3978          */
 3979         if (sva + PAGE_SIZE == eva) {
 3980                 pde = pmap_pde(pmap, sva);
 3981                 if (pde && (*pde & PG_PS) == 0) {
 3982                         pmap_remove_page(pmap, sva, pde, &free);
 3983                         goto out;
 3984                 }
 3985         }
 3986 
 3987         lock = NULL;
 3988         for (; sva < eva; sva = va_next) {
 3989 
 3990                 if (pmap->pm_stats.resident_count == 0)
 3991                         break;
 3992 
 3993                 pml4e = pmap_pml4e(pmap, sva);
 3994                 if ((*pml4e & PG_V) == 0) {
 3995                         va_next = (sva + NBPML4) & ~PML4MASK;
 3996                         if (va_next < sva)
 3997                                 va_next = eva;
 3998                         continue;
 3999                 }
 4000 
 4001                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 4002                 if ((*pdpe & PG_V) == 0) {
 4003                         va_next = (sva + NBPDP) & ~PDPMASK;
 4004                         if (va_next < sva)
 4005                                 va_next = eva;
 4006                         continue;
 4007                 }
 4008 
 4009                 /*
 4010                  * Calculate index for next page table.
 4011                  */
 4012                 va_next = (sva + NBPDR) & ~PDRMASK;
 4013                 if (va_next < sva)
 4014                         va_next = eva;
 4015 
 4016                 pde = pmap_pdpe_to_pde(pdpe, sva);
 4017                 ptpaddr = *pde;
 4018 
 4019                 /*
 4020                  * Weed out invalid mappings.
 4021                  */
 4022                 if (ptpaddr == 0)
 4023                         continue;
 4024 
 4025                 /*
 4026                  * Check for large page.
 4027                  */
 4028                 if ((ptpaddr & PG_PS) != 0) {
 4029                         /*
 4030                          * Are we removing the entire large page?  If not,
 4031                          * demote the mapping and fall through.
 4032                          */
 4033                         if (sva + NBPDR == va_next && eva >= va_next) {
 4034                                 /*
 4035                                  * The TLB entry for a PG_G mapping is
 4036                                  * invalidated by pmap_remove_pde().
 4037                                  */
 4038                                 if ((ptpaddr & PG_G) == 0)
 4039                                         anyvalid = 1;
 4040                                 pmap_remove_pde(pmap, pde, sva, &free, &lock);
 4041                                 continue;
 4042                         } else if (!pmap_demote_pde_locked(pmap, pde, sva,
 4043                             &lock)) {
 4044                                 /* The large page mapping was destroyed. */
 4045                                 continue;
 4046                         } else
 4047                                 ptpaddr = *pde;
 4048                 }
 4049 
 4050                 /*
 4051                  * Limit our scan to either the end of the va represented
 4052                  * by the current page table page, or to the end of the
 4053                  * range being removed.
 4054                  */
 4055                 if (va_next > eva)
 4056                         va_next = eva;
 4057 
 4058                 va = va_next;
 4059                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 4060                     sva += PAGE_SIZE) {
 4061                         if (*pte == 0) {
 4062                                 if (va != va_next) {
 4063                                         pmap_invalidate_range(pmap, va, sva);
 4064                                         va = va_next;
 4065                                 }
 4066                                 continue;
 4067                         }
 4068                         if ((*pte & PG_G) == 0)
 4069                                 anyvalid = 1;
 4070                         else if (va == va_next)
 4071                                 va = sva;
 4072                         if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free,
 4073                             &lock)) {
 4074                                 sva += PAGE_SIZE;
 4075                                 break;
 4076                         }
 4077                 }
 4078                 if (va != va_next)
 4079                         pmap_invalidate_range(pmap, va, sva);
 4080         }
 4081         if (lock != NULL)
 4082                 rw_wunlock(lock);
 4083 out:
 4084         if (anyvalid)
 4085                 pmap_invalidate_all(pmap);
 4086         PMAP_UNLOCK(pmap);
 4087         pmap_delayed_invl_finished();
 4088         pmap_free_zero_pages(&free);
 4089 }
 4090 
 4091 /*
 4092  *      Routine:        pmap_remove_all
 4093  *      Function:
 4094  *              Removes this physical page from
 4095  *              all physical maps in which it resides.
 4096  *              Reflects back modify bits to the pager.
 4097  *
 4098  *      Notes:
 4099  *              Original versions of this routine were very
 4100  *              inefficient because they iteratively called
 4101  *              pmap_remove(), which is slow.
 4102  */
 4103 
 4104 void
 4105 pmap_remove_all(vm_page_t m)
 4106 {
 4107         struct md_page *pvh;
 4108         pv_entry_t pv;
 4109         pmap_t pmap;
 4110         struct rwlock *lock;
 4111         pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
 4112         pd_entry_t *pde;
 4113         vm_offset_t va;
 4114         struct spglist free;
 4115         int pvh_gen, md_gen;
 4116 
 4117         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 4118             ("pmap_remove_all: page %p is not managed", m));
 4119         SLIST_INIT(&free);
 4120         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 4121         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 4122             pa_to_pvh(VM_PAGE_TO_PHYS(m));
 4123 retry:
 4124         rw_wlock(lock);
 4125         while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
 4126                 pmap = PV_PMAP(pv);
 4127                 if (!PMAP_TRYLOCK(pmap)) {
 4128                         pvh_gen = pvh->pv_gen;
 4129                         rw_wunlock(lock);
 4130                         PMAP_LOCK(pmap);
 4131                         rw_wlock(lock);
 4132                         if (pvh_gen != pvh->pv_gen) {
 4133                                 rw_wunlock(lock);
 4134                                 PMAP_UNLOCK(pmap);
 4135                                 goto retry;
 4136                         }
 4137                 }
 4138                 va = pv->pv_va;
 4139                 pde = pmap_pde(pmap, va);
 4140                 (void)pmap_demote_pde_locked(pmap, pde, va, &lock);
 4141                 PMAP_UNLOCK(pmap);
 4142         }
 4143         while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 4144                 pmap = PV_PMAP(pv);
 4145                 if (!PMAP_TRYLOCK(pmap)) {
 4146                         pvh_gen = pvh->pv_gen;
 4147                         md_gen = m->md.pv_gen;
 4148                         rw_wunlock(lock);
 4149                         PMAP_LOCK(pmap);
 4150                         rw_wlock(lock);
 4151                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 4152                                 rw_wunlock(lock);
 4153                                 PMAP_UNLOCK(pmap);
 4154                                 goto retry;
 4155                         }
 4156                 }
 4157                 PG_A = pmap_accessed_bit(pmap);
 4158                 PG_M = pmap_modified_bit(pmap);
 4159                 PG_RW = pmap_rw_bit(pmap);
 4160                 pmap_resident_count_dec(pmap, 1);
 4161                 pde = pmap_pde(pmap, pv->pv_va);
 4162                 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
 4163                     " a 2mpage in page %p's pv list", m));
 4164                 pte = pmap_pde_to_pte(pde, pv->pv_va);
 4165                 tpte = pte_load_clear(pte);
 4166                 if (tpte & PG_W)
 4167                         pmap->pm_stats.wired_count--;
 4168                 if (tpte & PG_A)
 4169                         vm_page_aflag_set(m, PGA_REFERENCED);
 4170 
 4171                 /*
 4172                  * Update the vm_page_t clean and reference bits.
 4173                  */
 4174                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 4175                         vm_page_dirty(m);
 4176                 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
 4177                 pmap_invalidate_page(pmap, pv->pv_va);
 4178                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 4179                 m->md.pv_gen++;
 4180                 free_pv_entry(pmap, pv);
 4181                 PMAP_UNLOCK(pmap);
 4182         }
 4183         vm_page_aflag_clear(m, PGA_WRITEABLE);
 4184         rw_wunlock(lock);
 4185         pmap_delayed_invl_wait(m);
 4186         pmap_free_zero_pages(&free);
 4187 }
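
The two while loops in pmap_remove_all() rely on a trylock-plus-generation-count idiom: the pmap lock ranks ahead of the PV list lock that is already held, so the code snapshots pv_gen, reorders the locks, and restarts if the list changed in between.  Below is a minimal out-of-kernel sketch of that idiom using POSIX threads; demo_page, list_lock, pmap_lock, and gen are illustrative stand-ins, not part of pmap.c.

#include <pthread.h>
#include <stdint.h>

struct demo_page {
        pthread_mutex_t list_lock;      /* stands in for the PV list lock */
        pthread_mutex_t pmap_lock;      /* stands in for PMAP_LOCK()/PMAP_TRYLOCK() */
        uint64_t        gen;            /* stands in for pv_gen; bumped under
                                           list_lock whenever the list changes */
};

static void
remove_all_sketch(struct demo_page *d)
{
        uint64_t gen_snap;

retry:
        pthread_mutex_lock(&d->list_lock);
        if (pthread_mutex_trylock(&d->pmap_lock) != 0) {
                /*
                 * Lock order is pmap_lock before list_lock, so we must not
                 * block on pmap_lock while holding list_lock.  Snapshot the
                 * generation, drop the list lock, take both locks in order,
                 * and restart if the list changed while it was unlocked.
                 */
                gen_snap = d->gen;
                pthread_mutex_unlock(&d->list_lock);
                pthread_mutex_lock(&d->pmap_lock);
                pthread_mutex_lock(&d->list_lock);
                if (gen_snap != d->gen) {
                        pthread_mutex_unlock(&d->list_lock);
                        pthread_mutex_unlock(&d->pmap_lock);
                        goto retry;
                }
        }
        /* ... operate on the list with both locks held ... */
        pthread_mutex_unlock(&d->list_lock);
        pthread_mutex_unlock(&d->pmap_lock);
}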
 4188 
 4189 /*
 4190  * pmap_protect_pde: do the things to protect a 2mpage in a process
 4191  */
 4192 static boolean_t
 4193 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
 4194 {
 4195         pd_entry_t newpde, oldpde;
 4196         vm_offset_t eva, va;
 4197         vm_page_t m;
 4198         boolean_t anychanged;
 4199         pt_entry_t PG_G, PG_M, PG_RW;
 4200 
 4201         PG_G = pmap_global_bit(pmap);
 4202         PG_M = pmap_modified_bit(pmap);
 4203         PG_RW = pmap_rw_bit(pmap);
 4204 
 4205         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4206         KASSERT((sva & PDRMASK) == 0,
 4207             ("pmap_protect_pde: sva is not 2mpage aligned"));
 4208         anychanged = FALSE;
 4209 retry:
 4210         oldpde = newpde = *pde;
 4211         if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
 4212             (PG_MANAGED | PG_M | PG_RW)) {
 4213                 eva = sva + NBPDR;
 4214                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 4215                     va < eva; va += PAGE_SIZE, m++)
 4216                         vm_page_dirty(m);
 4217         }
 4218         if ((prot & VM_PROT_WRITE) == 0)
 4219                 newpde &= ~(PG_RW | PG_M);
 4220         if ((prot & VM_PROT_EXECUTE) == 0)
 4221                 newpde |= pg_nx;
 4222         if (newpde != oldpde) {
 4223                 /*
 4224                  * As an optimization to future operations on this PDE, clear
 4225                  * PG_PROMOTED.  The impending invalidation will remove any
 4226                  * lingering 4KB page mappings from the TLB.
 4227                  */
 4228                 if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED))
 4229                         goto retry;
 4230                 if ((oldpde & PG_G) != 0)
 4231                         pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
 4232                 else
 4233                         anychanged = TRUE;
 4234         }
 4235         return (anychanged);
 4236 }
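
pmap_protect_pde() cannot simply store the new PDE: the MMU may set the accessed or modified bit in the same word at any moment, so the update goes through atomic_cmpset_long() and is retried on failure (the same pattern appears in the 4KB loop of pmap_protect() below).  The following is a portable sketch of that read-modify-compare-and-swap loop using C11 atomics; the DEMO_* names are stand-ins for the PG_* bits.

#include <stdatomic.h>
#include <stdint.h>

#define DEMO_PG_RW      0x002UL         /* stands in for PG_RW */
#define DEMO_PG_M       0x040UL         /* stands in for PG_M */

/*
 * Clear the writable and modified bits of an entry that the hardware may be
 * updating concurrently; retry until the swap succeeds so that a
 * concurrently-set accessed or modified bit is not lost.
 */
static void
clear_rw_m(_Atomic uint64_t *entry)
{
        uint64_t oldv, newv;

        oldv = atomic_load(entry);
        do {
                newv = oldv & ~(DEMO_PG_RW | DEMO_PG_M);
                if (newv == oldv)
                        return;         /* already read-only and clean */
        } while (!atomic_compare_exchange_weak(entry, &oldv, newv));
}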
 4237 
 4238 /*
 4239  *      Set the physical protection on the
 4240  *      specified range of this map as requested.
 4241  */
 4242 void
 4243 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 4244 {
 4245         vm_offset_t va_next;
 4246         pml4_entry_t *pml4e;
 4247         pdp_entry_t *pdpe;
 4248         pd_entry_t ptpaddr, *pde;
 4249         pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
 4250         boolean_t anychanged;
 4251 
 4252         KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
 4253         if (prot == VM_PROT_NONE) {
 4254                 pmap_remove(pmap, sva, eva);
 4255                 return;
 4256         }
 4257 
 4258         if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
 4259             (VM_PROT_WRITE|VM_PROT_EXECUTE))
 4260                 return;
 4261 
 4262         PG_G = pmap_global_bit(pmap);
 4263         PG_M = pmap_modified_bit(pmap);
 4264         PG_V = pmap_valid_bit(pmap);
 4265         PG_RW = pmap_rw_bit(pmap);
 4266         anychanged = FALSE;
 4267 
 4268         PMAP_LOCK(pmap);
 4269         for (; sva < eva; sva = va_next) {
 4270 
 4271                 pml4e = pmap_pml4e(pmap, sva);
 4272                 if ((*pml4e & PG_V) == 0) {
 4273                         va_next = (sva + NBPML4) & ~PML4MASK;
 4274                         if (va_next < sva)
 4275                                 va_next = eva;
 4276                         continue;
 4277                 }
 4278 
 4279                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 4280                 if ((*pdpe & PG_V) == 0) {
 4281                         va_next = (sva + NBPDP) & ~PDPMASK;
 4282                         if (va_next < sva)
 4283                                 va_next = eva;
 4284                         continue;
 4285                 }
 4286 
 4287                 va_next = (sva + NBPDR) & ~PDRMASK;
 4288                 if (va_next < sva)
 4289                         va_next = eva;
 4290 
 4291                 pde = pmap_pdpe_to_pde(pdpe, sva);
 4292                 ptpaddr = *pde;
 4293 
 4294                 /*
 4295                  * Weed out invalid mappings.
 4296                  */
 4297                 if (ptpaddr == 0)
 4298                         continue;
 4299 
 4300                 /*
 4301                  * Check for large page.
 4302                  */
 4303                 if ((ptpaddr & PG_PS) != 0) {
 4304                         /*
 4305                          * Are we protecting the entire large page?  If not,
 4306                          * demote the mapping and fall through.
 4307                          */
 4308                         if (sva + NBPDR == va_next && eva >= va_next) {
 4309                                 /*
 4310                                  * The TLB entry for a PG_G mapping is
 4311                                  * invalidated by pmap_protect_pde().
 4312                                  */
 4313                                 if (pmap_protect_pde(pmap, pde, sva, prot))
 4314                                         anychanged = TRUE;
 4315                                 continue;
 4316                         } else if (!pmap_demote_pde(pmap, pde, sva)) {
 4317                                 /*
 4318                                  * The large page mapping was destroyed.
 4319                                  */
 4320                                 continue;
 4321                         }
 4322                 }
 4323 
 4324                 if (va_next > eva)
 4325                         va_next = eva;
 4326 
 4327                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 4328                     sva += PAGE_SIZE) {
 4329                         pt_entry_t obits, pbits;
 4330                         vm_page_t m;
 4331 
 4332 retry:
 4333                         obits = pbits = *pte;
 4334                         if ((pbits & PG_V) == 0)
 4335                                 continue;
 4336 
 4337                         if ((prot & VM_PROT_WRITE) == 0) {
 4338                                 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
 4339                                     (PG_MANAGED | PG_M | PG_RW)) {
 4340                                         m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
 4341                                         vm_page_dirty(m);
 4342                                 }
 4343                                 pbits &= ~(PG_RW | PG_M);
 4344                         }
 4345                         if ((prot & VM_PROT_EXECUTE) == 0)
 4346                                 pbits |= pg_nx;
 4347 
 4348                         if (pbits != obits) {
 4349                                 if (!atomic_cmpset_long(pte, obits, pbits))
 4350                                         goto retry;
 4351                                 if (obits & PG_G)
 4352                                         pmap_invalidate_page(pmap, sva);
 4353                                 else
 4354                                         anychanged = TRUE;
 4355                         }
 4356                 }
 4357         }
 4358         if (anychanged)
 4359                 pmap_invalidate_all(pmap);
 4360         PMAP_UNLOCK(pmap);
 4361 }
 4362 
 4363 /*
 4364  * Tries to promote the 512, contiguous 4KB page mappings that are within a
 4365  * single page table page (PTP) to a single 2MB page mapping.  For promotion
 4366  * to occur, two conditions must be met: (1) the 4KB page mappings must map
 4367  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
 4368  * identical characteristics. 
 4369  */
 4370 static void
 4371 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
 4372     struct rwlock **lockp)
 4373 {
 4374         pd_entry_t newpde;
 4375         pt_entry_t *firstpte, oldpte, pa, *pte;
 4376         pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V;
 4377         vm_page_t mpte;
 4378         int PG_PTE_CACHE;
 4379 
 4380         PG_A = pmap_accessed_bit(pmap);
 4381         PG_G = pmap_global_bit(pmap);
 4382         PG_M = pmap_modified_bit(pmap);
 4383         PG_V = pmap_valid_bit(pmap);
 4384         PG_RW = pmap_rw_bit(pmap);
 4385         PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
 4386 
 4387         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4388 
 4389         /*
 4390          * Examine the first PTE in the specified PTP.  Abort if this PTE is
 4391          * either invalid, unused, or does not map the first 4KB physical page
 4392          * within a 2MB page. 
 4393          */
 4394         firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
 4395 setpde:
 4396         newpde = *firstpte;
 4397         if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
 4398                 atomic_add_long(&pmap_pde_p_failures, 1);
 4399                 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 4400                     " in pmap %p", va, pmap);
 4401                 return;
 4402         }
 4403         if ((newpde & (PG_M | PG_RW)) == PG_RW) {
 4404                 /*
 4405                  * When PG_M is already clear, PG_RW can be cleared without
 4406                  * a TLB invalidation.
 4407                  */
 4408                 if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
 4409                         goto setpde;
 4410                 newpde &= ~PG_RW;
 4411         }
 4412 
 4413         /*
 4414          * Examine each of the other PTEs in the specified PTP.  Abort if this
 4415          * PTE maps an unexpected 4KB physical page or does not have identical
 4416          * characteristics to the first PTE.
 4417          */
 4418         pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
 4419         for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
 4420 setpte:
 4421                 oldpte = *pte;
 4422                 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
 4423                         atomic_add_long(&pmap_pde_p_failures, 1);
 4424                         CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 4425                             " in pmap %p", va, pmap);
 4426                         return;
 4427                 }
 4428                 if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
 4429                         /*
 4430                          * When PG_M is already clear, PG_RW can be cleared
 4431                          * without a TLB invalidation.
 4432                          */
 4433                         if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
 4434                                 goto setpte;
 4435                         oldpte &= ~PG_RW;
 4436                         CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
 4437                             " in pmap %p", (oldpte & PG_FRAME & PDRMASK) |
 4438                             (va & ~PDRMASK), pmap);
 4439                 }
 4440                 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
 4441                         atomic_add_long(&pmap_pde_p_failures, 1);
 4442                         CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 4443                             " in pmap %p", va, pmap);
 4444                         return;
 4445                 }
 4446                 pa -= PAGE_SIZE;
 4447         }
 4448 
 4449         /*
 4450          * Save the page table page in its current state until the PDE
 4451          * mapping the superpage is demoted by pmap_demote_pde() or
 4452          * destroyed by pmap_remove_pde(). 
 4453          */
 4454         mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 4455         KASSERT(mpte >= vm_page_array &&
 4456             mpte < &vm_page_array[vm_page_array_size],
 4457             ("pmap_promote_pde: page table page is out of range"));
 4458         KASSERT(mpte->pindex == pmap_pde_pindex(va),
 4459             ("pmap_promote_pde: page table page's pindex is wrong"));
 4460         if (pmap_insert_pt_page(pmap, mpte)) {
 4461                 atomic_add_long(&pmap_pde_p_failures, 1);
 4462                 CTR2(KTR_PMAP,
 4463                     "pmap_promote_pde: failure for va %#lx in pmap %p", va,
 4464                     pmap);
 4465                 return;
 4466         }
 4467 
 4468         /*
 4469          * Promote the pv entries.
 4470          */
 4471         if ((newpde & PG_MANAGED) != 0)
 4472                 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
 4473 
 4474         /*
 4475          * Propagate the PAT index to its proper position.
 4476          */
 4477         newpde = pmap_swap_pat(pmap, newpde);
 4478 
 4479         /*
 4480          * Map the superpage.
 4481          */
 4482         if (workaround_erratum383)
 4483                 pmap_update_pde(pmap, va, pde, PG_PS | newpde);
 4484         else
 4485                 pde_store(pde, PG_PROMOTED | PG_PS | newpde);
 4486 
 4487         atomic_add_long(&pmap_pde_promotions, 1);
 4488         CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
 4489             " in pmap %p", va, pmap);
 4490 }
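
As the header comment of pmap_promote_pde() states, promotion needs 512 PTEs that map a 2MB-aligned, physically contiguous range with identical attributes.  Stripped of the locking, the PG_A/PG_RW fixups, and the failure counters, the core check reduces to the sketch below; can_promote() and the DEMO_* names are illustrative, with the frame mask and promote mask standing in for PG_FRAME and PG_PTE_PROMOTE.

#include <stdbool.h>
#include <stdint.h>

#define DEMO_NPTEPG     512                     /* PTEs per page table page */
#define DEMO_PAGE_SIZE  4096UL
#define DEMO_FRAME_MASK 0x000ffffffffff000UL    /* stands in for PG_FRAME */

/*
 * Return true if the 512 entries map a 2MB-aligned, physically contiguous
 * range and agree on every attribute bit selected by promote_mask.
 */
static bool
can_promote(const uint64_t pte[DEMO_NPTEPG], uint64_t promote_mask)
{
        uint64_t base, attrs;
        int i;

        base = pte[0] & DEMO_FRAME_MASK;
        attrs = pte[0] & promote_mask;
        if ((base & (DEMO_NPTEPG * DEMO_PAGE_SIZE - 1)) != 0)
                return (false);         /* first frame is not 2MB aligned */
        for (i = 1; i < DEMO_NPTEPG; i++) {
                if ((pte[i] & DEMO_FRAME_MASK) != base + i * DEMO_PAGE_SIZE)
                        return (false); /* not physically contiguous */
                if ((pte[i] & promote_mask) != attrs)
                        return (false); /* attributes differ */
        }
        return (true);
}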
 4491 
 4492 /*
 4493  *      Insert the given physical page (m) at
 4494  *      the specified virtual address (va) in the
 4495  *      target physical map with the protection requested.
 4496  *
 4497  *      If specified, the page will be wired down, meaning
 4498  *      that the related pte cannot be reclaimed.
 4499  *
 4500  *      NB:  This is the only routine which MAY NOT lazy-evaluate
 4501  *      or lose information.  That is, this routine must actually
 4502  *      insert this page into the given map NOW.
 4503  *
 4504  *      When destroying both a page table and PV entry, this function
 4505  *      performs the TLB invalidation before releasing the PV list
 4506  *      lock, so we do not need pmap_delayed_invl_page() calls here.
 4507  */
 4508 int
 4509 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
 4510     u_int flags, int8_t psind __unused)
 4511 {
 4512         struct rwlock *lock;
 4513         pd_entry_t *pde;
 4514         pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
 4515         pt_entry_t newpte, origpte;
 4516         pv_entry_t pv;
 4517         vm_paddr_t opa, pa;
 4518         vm_page_t mpte, om;
 4519         boolean_t nosleep;
 4520 
 4521         PG_A = pmap_accessed_bit(pmap);
 4522         PG_G = pmap_global_bit(pmap);
 4523         PG_M = pmap_modified_bit(pmap);
 4524         PG_V = pmap_valid_bit(pmap);
 4525         PG_RW = pmap_rw_bit(pmap);
 4526 
 4527         va = trunc_page(va);
 4528         KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
 4529         KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
 4530             ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
 4531             va));
 4532         KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
 4533             va >= kmi.clean_eva,
 4534             ("pmap_enter: managed mapping within the clean submap"));
 4535         if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
 4536                 VM_OBJECT_ASSERT_LOCKED(m->object);
 4537         pa = VM_PAGE_TO_PHYS(m);
 4538         newpte = (pt_entry_t)(pa | PG_A | PG_V);
 4539         if ((flags & VM_PROT_WRITE) != 0)
 4540                 newpte |= PG_M;
 4541         if ((prot & VM_PROT_WRITE) != 0)
 4542                 newpte |= PG_RW;
 4543         KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
 4544             ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
 4545         if ((prot & VM_PROT_EXECUTE) == 0)
 4546                 newpte |= pg_nx;
 4547         if ((flags & PMAP_ENTER_WIRED) != 0)
 4548                 newpte |= PG_W;
 4549         if (va < VM_MAXUSER_ADDRESS)
 4550                 newpte |= PG_U;
 4551         if (pmap == kernel_pmap)
 4552                 newpte |= PG_G;
 4553         newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0);
 4554 
 4555         /*
 4556          * Set modified bit gratuitously for writeable mappings if
 4557          * the page is unmanaged. We do not want to take a fault
 4558          * to do the dirty bit accounting for these mappings.
 4559          */
 4560         if ((m->oflags & VPO_UNMANAGED) != 0) {
 4561                 if ((newpte & PG_RW) != 0)
 4562                         newpte |= PG_M;
 4563         } else
 4564                 newpte |= PG_MANAGED;
 4565 
 4566         mpte = NULL;
 4567 
 4568         lock = NULL;
 4569         PMAP_LOCK(pmap);
 4570 
 4571         /*
 4572          * In the case that a page table page is not
 4573          * resident, we are creating it here.
 4574          */
 4575 retry:
 4576         pde = pmap_pde(pmap, va);
 4577         if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
 4578             pmap_demote_pde_locked(pmap, pde, va, &lock))) {
 4579                 pte = pmap_pde_to_pte(pde, va);
 4580                 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
 4581                         mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 4582                         mpte->wire_count++;
 4583                 }
 4584         } else if (va < VM_MAXUSER_ADDRESS) {
 4585                 /*
 4586                  * We get here if the PTE page isn't mapped or has been
 4587                  * deallocated.
 4588                  */
 4589                 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
 4590                 mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va),
 4591                     nosleep ? NULL : &lock);
 4592                 if (mpte == NULL && nosleep) {
 4593                         if (lock != NULL)
 4594                                 rw_wunlock(lock);
 4595                         PMAP_UNLOCK(pmap);
 4596                         return (KERN_RESOURCE_SHORTAGE);
 4597                 }
 4598                 goto retry;
 4599         } else
 4600                 panic("pmap_enter: invalid page directory va=%#lx", va);
 4601 
 4602         origpte = *pte;
 4603 
 4604         /*
 4605          * Is the specified virtual address already mapped?
 4606          */
 4607         if ((origpte & PG_V) != 0) {
 4608                 /*
 4609                  * Wiring change, just update stats. We don't worry about
 4610                  * wiring PT pages as they remain resident as long as there
 4611                  * are valid mappings in them. Hence, if a user page is wired,
 4612                  * the PT page will be also.
 4613                  */
 4614                 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
 4615                         pmap->pm_stats.wired_count++;
 4616                 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
 4617                         pmap->pm_stats.wired_count--;
 4618 
 4619                 /*
 4620                  * Remove the extra PT page reference.
 4621                  */
 4622                 if (mpte != NULL) {
 4623                         mpte->wire_count--;
 4624                         KASSERT(mpte->wire_count > 0,
 4625                             ("pmap_enter: missing reference to page table page,"
 4626                              " va: 0x%lx", va));
 4627                 }
 4628 
 4629                 /*
 4630                  * Has the physical page changed?
 4631                  */
 4632                 opa = origpte & PG_FRAME;
 4633                 if (opa == pa) {
 4634                         /*
 4635                          * No, might be a protection or wiring change.
 4636                          */
 4637                         if ((origpte & PG_MANAGED) != 0 &&
 4638                             (newpte & PG_RW) != 0)
 4639                                 vm_page_aflag_set(m, PGA_WRITEABLE);
 4640                         if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
 4641                                 goto unchanged;
 4642                         goto validate;
 4643                 }
 4644         } else {
 4645                 /*
 4646                  * Increment the counters.
 4647                  */
 4648                 if ((newpte & PG_W) != 0)
 4649                         pmap->pm_stats.wired_count++;
 4650                 pmap_resident_count_inc(pmap, 1);
 4651         }
 4652 
 4653         /*
 4654          * Enter on the PV list if part of our managed memory.
 4655          */
 4656         if ((newpte & PG_MANAGED) != 0) {
 4657                 pv = get_pv_entry(pmap, &lock);
 4658                 pv->pv_va = va;
 4659                 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
 4660                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 4661                 m->md.pv_gen++;
 4662                 if ((newpte & PG_RW) != 0)
 4663                         vm_page_aflag_set(m, PGA_WRITEABLE);
 4664         }
 4665 
 4666         /*
 4667          * Update the PTE.
 4668          */
 4669         if ((origpte & PG_V) != 0) {
 4670 validate:
 4671                 origpte = pte_load_store(pte, newpte);
 4672                 opa = origpte & PG_FRAME;
 4673                 if (opa != pa) {
 4674                         if ((origpte & PG_MANAGED) != 0) {
 4675                                 om = PHYS_TO_VM_PAGE(opa);
 4676                                 if ((origpte & (PG_M | PG_RW)) == (PG_M |
 4677                                     PG_RW))
 4678                                         vm_page_dirty(om);
 4679                                 if ((origpte & PG_A) != 0)
 4680                                         vm_page_aflag_set(om, PGA_REFERENCED);
 4681                                 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
 4682                                 pmap_pvh_free(&om->md, pmap, va);
 4683                                 if ((om->aflags & PGA_WRITEABLE) != 0 &&
 4684                                     TAILQ_EMPTY(&om->md.pv_list) &&
 4685                                     ((om->flags & PG_FICTITIOUS) != 0 ||
 4686                                     TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
 4687                                         vm_page_aflag_clear(om, PGA_WRITEABLE);
 4688                         }
 4689                 } else if ((newpte & PG_M) == 0 && (origpte & (PG_M |
 4690                     PG_RW)) == (PG_M | PG_RW)) {
 4691                         if ((origpte & PG_MANAGED) != 0)
 4692                                 vm_page_dirty(m);
 4693 
 4694                         /*
 4695                          * Although the PTE may still have PG_RW set, TLB
 4696                          * invalidation may nonetheless be required because
 4697                          * the PTE no longer has PG_M set.
 4698                          */
 4699                 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
 4700                         /*
 4701                          * This PTE change does not require TLB invalidation.
 4702                          */
 4703                         goto unchanged;
 4704                 }
 4705                 if ((origpte & PG_A) != 0)
 4706                         pmap_invalidate_page(pmap, va);
 4707         } else
 4708                 pte_store(pte, newpte);
 4709 
 4710 unchanged:
 4711 
 4712         /*
 4713          * If both the page table page and the reservation are fully
 4714          * populated, then attempt promotion.
 4715          */
 4716         if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
 4717             pmap_ps_enabled(pmap) &&
 4718             (m->flags & PG_FICTITIOUS) == 0 &&
 4719             vm_reserv_level_iffullpop(m) == 0)
 4720                 pmap_promote_pde(pmap, pde, va, &lock);
 4721 
 4722         if (lock != NULL)
 4723                 rw_wunlock(lock);
 4724         PMAP_UNLOCK(pmap);
 4725         return (KERN_SUCCESS);
 4726 }
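
For orientation, a hypothetical caller of pmap_enter() might look like the kernel-context fragment below.  This is a sketch only, not code taken from the VM system; pmap, va, and m are assumed to be set up by the caller, and the constants are the ones tested above (PMAP_ENTER_WIRED, PMAP_ENTER_NOSLEEP, KERN_RESOURCE_SHORTAGE).

        int rv;

        /*
         * Hypothetical caller: establish a wired, writable mapping for
         * page "m" at "va" without sleeping for a page table page.
         */
        rv = pmap_enter(pmap, va, m, VM_PROT_READ | VM_PROT_WRITE,
            VM_PROT_WRITE | PMAP_ENTER_WIRED | PMAP_ENTER_NOSLEEP, 0);
        if (rv == KERN_RESOURCE_SHORTAGE) {
                /*
                 * A page table page could not be allocated without
                 * sleeping; retry later or call again without
                 * PMAP_ENTER_NOSLEEP.
                 */
        }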
 4727 
 4728 /*
 4729  * Tries to create a 2MB page mapping.  Returns TRUE if successful and FALSE
 4730  * otherwise.  Fails if (1) a page table page cannot be allocated without
 4731  * blocking, (2) a mapping already exists at the specified virtual address, or
 4732  * (3) a pv entry cannot be allocated without reclaiming another pv entry. 
 4733  */
 4734 static boolean_t
 4735 pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
 4736     struct rwlock **lockp)
 4737 {
 4738         pd_entry_t *pde, newpde;
 4739         pt_entry_t PG_V;
 4740         vm_page_t mpde;
 4741         struct spglist free;
 4742 
 4743         PG_V = pmap_valid_bit(pmap);
 4744         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4745 
 4746         if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) {
 4747                 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 4748                     " in pmap %p", va, pmap);
 4749                 return (FALSE);
 4750         }
 4751         pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde));
 4752         pde = &pde[pmap_pde_index(va)];
 4753         if ((*pde & PG_V) != 0) {
 4754                 KASSERT(mpde->wire_count > 1,
 4755                     ("pmap_enter_pde: mpde's wire count is too low"));
 4756                 mpde->wire_count--;
 4757                 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 4758                     " in pmap %p", va, pmap);
 4759                 return (FALSE);
 4760         }
 4761         newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
 4762             PG_PS | PG_V;
 4763         if ((m->oflags & VPO_UNMANAGED) == 0) {
 4764                 newpde |= PG_MANAGED;
 4765 
 4766                 /*
 4767                  * Abort this mapping if its PV entry could not be created.
 4768                  */
 4769                 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m),
 4770                     lockp)) {
 4771                         SLIST_INIT(&free);
 4772                         if (pmap_unwire_ptp(pmap, va, mpde, &free)) {
 4773                                 /*
 4774                                  * Although "va" is not mapped, paging-
 4775                                  * structure caches could nonetheless have
 4776                                  * entries that refer to the freed page table
 4777                                  * pages.  Invalidate those entries.
 4778                                  */
 4779                                 pmap_invalidate_page(pmap, va);
 4780                                 pmap_free_zero_pages(&free);
 4781                         }
 4782                         CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 4783                             " in pmap %p", va, pmap);
 4784                         return (FALSE);
 4785                 }
 4786         }
 4787         if ((prot & VM_PROT_EXECUTE) == 0)
 4788                 newpde |= pg_nx;
 4789         if (va < VM_MAXUSER_ADDRESS)
 4790                 newpde |= PG_U;
 4791 
 4792         /*
 4793          * Increment counters.
 4794          */
 4795         pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
 4796 
 4797         /*
 4798          * Map the superpage.  (This is not a promoted mapping; there will not
 4799          * be any lingering 4KB page mappings in the TLB.)
 4800          */
 4801         pde_store(pde, newpde);
 4802 
 4803         atomic_add_long(&pmap_pde_mappings, 1);
 4804         CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
 4805             " in pmap %p", va, pmap);
 4806         return (TRUE);
 4807 }
 4808 
 4809 /*
 4810  * Maps a sequence of resident pages belonging to the same object.
 4811  * The sequence begins with the given page m_start.  This page is
 4812  * mapped at the given virtual address start.  Each subsequent page is
 4813  * mapped at a virtual address that is offset from start by the same
 4814  * amount as the page is offset from m_start within the object.  The
 4815  * last page in the sequence is the page with the largest offset from
 4816  * m_start that can be mapped at a virtual address less than the given
 4817  * virtual address end.  Not every virtual page between start and end
 4818  * is mapped; only those for which a resident page exists with the
 4819  * corresponding offset from m_start are mapped.
 4820  */
 4821 void
 4822 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
 4823     vm_page_t m_start, vm_prot_t prot)
 4824 {
 4825         struct rwlock *lock;
 4826         vm_offset_t va;
 4827         vm_page_t m, mpte;
 4828         vm_pindex_t diff, psize;
 4829 
 4830         VM_OBJECT_ASSERT_LOCKED(m_start->object);
 4831 
 4832         psize = atop(end - start);
 4833         mpte = NULL;
 4834         m = m_start;
 4835         lock = NULL;
 4836         PMAP_LOCK(pmap);
 4837         while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
 4838                 va = start + ptoa(diff);
 4839                 if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
 4840                     m->psind == 1 && pmap_ps_enabled(pmap) &&
 4841                     pmap_enter_pde(pmap, va, m, prot, &lock))
 4842                         m = &m[NBPDR / PAGE_SIZE - 1];
 4843                 else
 4844                         mpte = pmap_enter_quick_locked(pmap, va, m, prot,
 4845                             mpte, &lock);
 4846                 m = TAILQ_NEXT(m, listq);
 4847         }
 4848         if (lock != NULL)
 4849                 rw_wunlock(lock);
 4850         PMAP_UNLOCK(pmap);
 4851 }
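
A hypothetical caller that prefaults an object-backed range could use pmap_enter_object() roughly as in the fragment below.  This is a sketch: map, object, pindex, start, and size are assumed, and vm_page_find_least(), vm_map_pmap(), and the object lock macros belong to the machine-independent VM layer rather than this file.

        vm_page_t m_start;

        /*
         * Hypothetical caller: map every already-resident page of "object"
         * in [start, start + size) with read-only protection.
         */
        VM_OBJECT_RLOCK(object);
        m_start = vm_page_find_least(object, pindex);
        if (m_start != NULL)
                pmap_enter_object(vm_map_pmap(map), start, start + size,
                    m_start, VM_PROT_READ);
        VM_OBJECT_RUNLOCK(object);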
 4852 
 4853 /*
 4854  * This code makes some *MAJOR* assumptions:
 4855  * 1. The current pmap and the given pmap exist.
 4856  * 2. Not wired.
 4857  * 3. Read access.
 4858  * 4. No page table pages.
 4859  * but is *MUCH* faster than pmap_enter...
 4860  */
 4861 
 4862 void
 4863 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 4864 {
 4865         struct rwlock *lock;
 4866 
 4867         lock = NULL;
 4868         PMAP_LOCK(pmap);
 4869         (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
 4870         if (lock != NULL)
 4871                 rw_wunlock(lock);
 4872         PMAP_UNLOCK(pmap);
 4873 }
 4874 
 4875 static vm_page_t
 4876 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
 4877     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
 4878 {
 4879         struct spglist free;
 4880         pt_entry_t *pte, PG_V;
 4881         vm_paddr_t pa;
 4882 
 4883         KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
 4884             (m->oflags & VPO_UNMANAGED) != 0,
 4885             ("pmap_enter_quick_locked: managed mapping within the clean submap"));
 4886         PG_V = pmap_valid_bit(pmap);
 4887         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4888 
 4889         /*
 4890          * In the case that a page table page is not
 4891          * resident, we are creating it here.
 4892          */
 4893         if (va < VM_MAXUSER_ADDRESS) {
 4894                 vm_pindex_t ptepindex;
 4895                 pd_entry_t *ptepa;
 4896 
 4897                 /*
 4898                  * Calculate the page table page index.
 4899                  */
 4900                 ptepindex = pmap_pde_pindex(va);
 4901                 if (mpte && (mpte->pindex == ptepindex)) {
 4902                         mpte->wire_count++;
 4903                 } else {
 4904                         /*
 4905                          * Get the page directory entry
 4906                          */
 4907                         ptepa = pmap_pde(pmap, va);
 4908 
 4909                         /*
 4910                          * If the page table page is mapped, we just increment
 4911                          * its wire count.  Otherwise, we
 4912                          * attempt to allocate a page table page.  If this
 4913                          * attempt fails, we don't retry.  Instead, we give up.
 4914                          */
 4915                         if (ptepa && (*ptepa & PG_V) != 0) {
 4916                                 if (*ptepa & PG_PS)
 4917                                         return (NULL);
 4918                                 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
 4919                                 mpte->wire_count++;
 4920                         } else {
 4921                                 /*
 4922                                  * Pass NULL instead of the PV list lock
 4923                                  * pointer, because we don't intend to sleep.
 4924                                  */
 4925                                 mpte = _pmap_allocpte(pmap, ptepindex, NULL);
 4926                                 if (mpte == NULL)
 4927                                         return (mpte);
 4928                         }
 4929                 }
 4930                 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
 4931                 pte = &pte[pmap_pte_index(va)];
 4932         } else {
 4933                 mpte = NULL;
 4934                 pte = vtopte(va);
 4935         }
 4936         if (*pte) {
 4937                 if (mpte != NULL) {
 4938                         mpte->wire_count--;
 4939                         mpte = NULL;
 4940                 }
 4941                 return (mpte);
 4942         }
 4943 
 4944         /*
 4945          * Enter on the PV list if part of our managed memory.
 4946          */
 4947         if ((m->oflags & VPO_UNMANAGED) == 0 &&
 4948             !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
 4949                 if (mpte != NULL) {
 4950                         SLIST_INIT(&free);
 4951                         if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
 4952                                 /*
 4953                                  * Although "va" is not mapped, paging-
 4954                                  * structure caches could nonetheless have
 4955                                  * entries that refer to the freed page table
 4956                                  * pages.  Invalidate those entries.
 4957                                  */
 4958                                 pmap_invalidate_page(pmap, va);
 4959                                 pmap_free_zero_pages(&free);
 4960                         }
 4961                         mpte = NULL;
 4962                 }
 4963                 return (mpte);
 4964         }
 4965 
 4966         /*
 4967          * Increment counters
 4968          */
 4969         pmap_resident_count_inc(pmap, 1);
 4970 
 4971         pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0);
 4972         if ((prot & VM_PROT_EXECUTE) == 0)
 4973                 pa |= pg_nx;
 4974 
 4975         /*
 4976          * Now validate mapping with RO protection
 4977          */
 4978         if ((m->oflags & VPO_UNMANAGED) != 0)
 4979                 pte_store(pte, pa | PG_V | PG_U);
 4980         else
 4981                 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
 4982         return (mpte);
 4983 }
 4984 
 4985 /*
 4986  * Make a temporary mapping for a physical address.  This is only intended
 4987  * to be used for panic dumps.
 4988  */
 4989 void *
 4990 pmap_kenter_temporary(vm_paddr_t pa, int i)
 4991 {
 4992         vm_offset_t va;
 4993 
 4994         va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
 4995         pmap_kenter(va, pa);
 4996         invlpg(va);
 4997         return ((void *)crashdumpmap);
 4998 }
 4999 
 5000 /*
 5001  * This code maps large physical mmap regions into the
 5002  * processor address space.  Note that some shortcuts
 5003  * are taken, but the code works.
 5004  */
 5005 void
 5006 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
 5007     vm_pindex_t pindex, vm_size_t size)
 5008 {
 5009         pd_entry_t *pde;
 5010         pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 5011         vm_paddr_t pa, ptepa;
 5012         vm_page_t p, pdpg;
 5013         int pat_mode;
 5014 
 5015         PG_A = pmap_accessed_bit(pmap);
 5016         PG_M = pmap_modified_bit(pmap);
 5017         PG_V = pmap_valid_bit(pmap);
 5018         PG_RW = pmap_rw_bit(pmap);
 5019 
 5020         VM_OBJECT_ASSERT_WLOCKED(object);
 5021         KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
 5022             ("pmap_object_init_pt: non-device object"));
 5023         if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
 5024                 if (!pmap_ps_enabled(pmap))
 5025                         return;
 5026                 if (!vm_object_populate(object, pindex, pindex + atop(size)))
 5027                         return;
 5028                 p = vm_page_lookup(object, pindex);
 5029                 KASSERT(p->valid == VM_PAGE_BITS_ALL,
 5030                     ("pmap_object_init_pt: invalid page %p", p));
 5031                 pat_mode = p->md.pat_mode;
 5032 
 5033                 /*
 5034                  * Abort the mapping if the first page is not physically
 5035                  * aligned to a 2MB page boundary.
 5036                  */
 5037                 ptepa = VM_PAGE_TO_PHYS(p);
 5038                 if (ptepa & (NBPDR - 1))
 5039                         return;
 5040 
 5041                 /*
 5042                  * Skip the first page.  Abort the mapping if the rest of
 5043                  * the pages are not physically contiguous or have differing
 5044                  * memory attributes.
 5045                  */
 5046                 p = TAILQ_NEXT(p, listq);
 5047                 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
 5048                     pa += PAGE_SIZE) {
 5049                         KASSERT(p->valid == VM_PAGE_BITS_ALL,
 5050                             ("pmap_object_init_pt: invalid page %p", p));
 5051                         if (pa != VM_PAGE_TO_PHYS(p) ||
 5052                             pat_mode != p->md.pat_mode)
 5053                                 return;
 5054                         p = TAILQ_NEXT(p, listq);
 5055                 }
 5056 
 5057                 /*
 5058                  * Map using 2MB pages.  Since "ptepa" is 2M aligned and
 5059                  * "size" is a multiple of 2M, adding the PAT setting to "pa"
 5060                  * will not affect the termination of this loop.
 5061                  */ 
 5062                 PMAP_LOCK(pmap);
 5063                 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
 5064                     pa < ptepa + size; pa += NBPDR) {
 5065                         pdpg = pmap_allocpde(pmap, addr, NULL);
 5066                         if (pdpg == NULL) {
 5067                                 /*
 5068                                  * The creation of mappings below is only an
 5069                                  * optimization.  If a page directory page
 5070                                  * cannot be allocated without blocking,
 5071                                  * continue on to the next mapping rather than
 5072                                  * blocking.
 5073                                  */
 5074                                 addr += NBPDR;
 5075                                 continue;
 5076                         }
 5077                         pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
 5078                         pde = &pde[pmap_pde_index(addr)];
 5079                         if ((*pde & PG_V) == 0) {
 5080                                 pde_store(pde, pa | PG_PS | PG_M | PG_A |
 5081                                     PG_U | PG_RW | PG_V);
 5082                                 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
 5083                                 atomic_add_long(&pmap_pde_mappings, 1);
 5084                         } else {
 5085                                 /* Continue on if the PDE is already valid. */
 5086                                 pdpg->wire_count--;
 5087                                 KASSERT(pdpg->wire_count > 0,
 5088                                     ("pmap_object_init_pt: missing reference "
 5089                                     "to page directory page, va: 0x%lx", addr));
 5090                         }
 5091                         addr += NBPDR;
 5092                 }
 5093                 PMAP_UNLOCK(pmap);
 5094         }
 5095 }
 5096 
 5097 /*
 5098  *      Clear the wired attribute from the mappings for the specified range of
 5099  *      addresses in the given pmap.  Every valid mapping within that range
 5100  *      must have the wired attribute set.  In contrast, invalid mappings
 5101  *      cannot have the wired attribute set, so they are ignored.
 5102  *
 5103  *      The wired attribute of the page table entry is not a hardware
 5104  *      feature, so there is no need to invalidate any TLB entries.
 5105  *      Since pmap_demote_pde() for the wired entry must never fail,
 5106  *      pmap_delayed_invl_started()/finished() calls around the
 5107  *      function are not needed.
 5108  */
 5109 void
 5110 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 5111 {
 5112         vm_offset_t va_next;
 5113         pml4_entry_t *pml4e;
 5114         pdp_entry_t *pdpe;
 5115         pd_entry_t *pde;
 5116         pt_entry_t *pte, PG_V;
 5117 
 5118         PG_V = pmap_valid_bit(pmap);
 5119         PMAP_LOCK(pmap);
 5120         for (; sva < eva; sva = va_next) {
 5121                 pml4e = pmap_pml4e(pmap, sva);
 5122                 if ((*pml4e & PG_V) == 0) {
 5123                         va_next = (sva + NBPML4) & ~PML4MASK;
 5124                         if (va_next < sva)
 5125                                 va_next = eva;
 5126                         continue;
 5127                 }
 5128                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 5129                 if ((*pdpe & PG_V) == 0) {
 5130                         va_next = (sva + NBPDP) & ~PDPMASK;
 5131                         if (va_next < sva)
 5132                                 va_next = eva;
 5133                         continue;
 5134                 }
 5135                 va_next = (sva + NBPDR) & ~PDRMASK;
 5136                 if (va_next < sva)
 5137                         va_next = eva;
 5138                 pde = pmap_pdpe_to_pde(pdpe, sva);
 5139                 if ((*pde & PG_V) == 0)
 5140                         continue;
 5141                 if ((*pde & PG_PS) != 0) {
 5142                         if ((*pde & PG_W) == 0)
 5143                                 panic("pmap_unwire: pde %#jx is missing PG_W",
 5144                                     (uintmax_t)*pde);
 5145 
 5146                         /*
 5147                          * Are we unwiring the entire large page?  If not,
 5148                          * demote the mapping and fall through.
 5149                          */
 5150                         if (sva + NBPDR == va_next && eva >= va_next) {
 5151                                 atomic_clear_long(pde, PG_W);
 5152                                 pmap->pm_stats.wired_count -= NBPDR /
 5153                                     PAGE_SIZE;
 5154                                 continue;
 5155                         } else if (!pmap_demote_pde(pmap, pde, sva))
 5156                                 panic("pmap_unwire: demotion failed");
 5157                 }
 5158                 if (va_next > eva)
 5159                         va_next = eva;
 5160                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 5161                     sva += PAGE_SIZE) {
 5162                         if ((*pte & PG_V) == 0)
 5163                                 continue;
 5164                         if ((*pte & PG_W) == 0)
 5165                                 panic("pmap_unwire: pte %#jx is missing PG_W",
 5166                                     (uintmax_t)*pte);
 5167 
 5168                         /*
 5169                          * PG_W must be cleared atomically.  Although the pmap
 5170                          * lock synchronizes access to PG_W, another processor
 5171                          * could be setting PG_M and/or PG_A concurrently.
 5172                          */
 5173                         atomic_clear_long(pte, PG_W);
 5174                         pmap->pm_stats.wired_count--;
 5175                 }
 5176         }
 5177         PMAP_UNLOCK(pmap);
 5178 }
 5179 
 5180 /*
 5181  *      Copy the range specified by src_addr/len
 5182  *      from the source map to the range dst_addr/len
 5183  *      in the destination map.
 5184  *
 5185  *      This routine is only advisory and need not do anything.
 5186  */
 5187 
 5188 void
 5189 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
 5190     vm_offset_t src_addr)
 5191 {
 5192         struct rwlock *lock;
 5193         struct spglist free;
 5194         vm_offset_t addr;
 5195         vm_offset_t end_addr = src_addr + len;
 5196         vm_offset_t va_next;
 5197         pt_entry_t PG_A, PG_M, PG_V;
 5198 
 5199         if (dst_addr != src_addr)
 5200                 return;
 5201 
 5202         if (dst_pmap->pm_type != src_pmap->pm_type)
 5203                 return;
 5204 
 5205         /*
 5206          * EPT page table entries that require emulation of A/D bits are
 5207          * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
 5208          * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
 5209          * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
 5210          * implementations flag an EPT misconfiguration for exec-only
 5211          * mappings we skip this function entirely for emulated pmaps.
 5212          */
 5213         if (pmap_emulate_ad_bits(dst_pmap))
 5214                 return;
 5215 
 5216         lock = NULL;
 5217         if (dst_pmap < src_pmap) {
 5218                 PMAP_LOCK(dst_pmap);
 5219                 PMAP_LOCK(src_pmap);
 5220         } else {
 5221                 PMAP_LOCK(src_pmap);
 5222                 PMAP_LOCK(dst_pmap);
 5223         }
 5224 
 5225         PG_A = pmap_accessed_bit(dst_pmap);
 5226         PG_M = pmap_modified_bit(dst_pmap);
 5227         PG_V = pmap_valid_bit(dst_pmap);
 5228 
 5229         for (addr = src_addr; addr < end_addr; addr = va_next) {
 5230                 pt_entry_t *src_pte, *dst_pte;
 5231                 vm_page_t dstmpde, dstmpte, srcmpte;
 5232                 pml4_entry_t *pml4e;
 5233                 pdp_entry_t *pdpe;
 5234                 pd_entry_t srcptepaddr, *pde;
 5235 
 5236                 KASSERT(addr < UPT_MIN_ADDRESS,
 5237                     ("pmap_copy: invalid to pmap_copy page tables"));
 5238 
 5239                 pml4e = pmap_pml4e(src_pmap, addr);
 5240                 if ((*pml4e & PG_V) == 0) {
 5241                         va_next = (addr + NBPML4) & ~PML4MASK;
 5242                         if (va_next < addr)
 5243                                 va_next = end_addr;
 5244                         continue;
 5245                 }
 5246 
 5247                 pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
 5248                 if ((*pdpe & PG_V) == 0) {
 5249                         va_next = (addr + NBPDP) & ~PDPMASK;
 5250                         if (va_next < addr)
 5251                                 va_next = end_addr;
 5252                         continue;
 5253                 }
 5254 
 5255                 va_next = (addr + NBPDR) & ~PDRMASK;
 5256                 if (va_next < addr)
 5257                         va_next = end_addr;
 5258 
 5259                 pde = pmap_pdpe_to_pde(pdpe, addr);
 5260                 srcptepaddr = *pde;
 5261                 if (srcptepaddr == 0)
 5262                         continue;
 5263                         
 5264                 if (srcptepaddr & PG_PS) {
 5265                         if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
 5266                                 continue;
 5267                         dstmpde = pmap_allocpde(dst_pmap, addr, NULL);
 5268                         if (dstmpde == NULL)
 5269                                 break;
 5270                         pde = (pd_entry_t *)
 5271                             PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
 5272                         pde = &pde[pmap_pde_index(addr)];
 5273                         if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
 5274                             pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
 5275                             PG_PS_FRAME, &lock))) {
 5276                                 *pde = srcptepaddr & ~PG_W;
 5277                                 pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE);
 5278                                 atomic_add_long(&pmap_pde_mappings, 1);
 5279                         } else
 5280                                 dstmpde->wire_count--;
 5281                         continue;
 5282                 }
 5283 
 5284                 srcptepaddr &= PG_FRAME;
 5285                 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
 5286                 KASSERT(srcmpte->wire_count > 0,
 5287                     ("pmap_copy: source page table page is unused"));
 5288 
 5289                 if (va_next > end_addr)
 5290                         va_next = end_addr;
 5291 
 5292                 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
 5293                 src_pte = &src_pte[pmap_pte_index(addr)];
 5294                 dstmpte = NULL;
 5295                 while (addr < va_next) {
 5296                         pt_entry_t ptetemp;
 5297                         ptetemp = *src_pte;
 5298                         /*
 5299                          * We only copy mappings of managed pages.
 5300                          */
 5301                         if ((ptetemp & PG_MANAGED) != 0) {
 5302                                 if (dstmpte != NULL &&
 5303                                     dstmpte->pindex == pmap_pde_pindex(addr))
 5304                                         dstmpte->wire_count++;
 5305                                 else if ((dstmpte = pmap_allocpte(dst_pmap,
 5306                                     addr, NULL)) == NULL)
 5307                                         goto out;
 5308                                 dst_pte = (pt_entry_t *)
 5309                                     PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
 5310                                 dst_pte = &dst_pte[pmap_pte_index(addr)];
 5311                                 if (*dst_pte == 0 &&
 5312                                     pmap_try_insert_pv_entry(dst_pmap, addr,
 5313                                     PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
 5314                                     &lock)) {
 5315                                         /*
 5316                                          * Clear the wired, modified, and
 5317                                          * accessed (referenced) bits
 5318                                          * during the copy.
 5319                                          */
 5320                                         *dst_pte = ptetemp & ~(PG_W | PG_M |
 5321                                             PG_A);
 5322                                         pmap_resident_count_inc(dst_pmap, 1);
 5323                                 } else {
 5324                                         SLIST_INIT(&free);
 5325                                         if (pmap_unwire_ptp(dst_pmap, addr,
 5326                                             dstmpte, &free)) {
 5327                                                 /*
 5328                                                  * Although "addr" is not
 5329                                                  * mapped, paging-structure
 5330                                                  * caches could nonetheless
 5331                                                  * have entries that refer to
 5332                                                  * the freed page table pages.
 5333                                                  * Invalidate those entries.
 5334                                                  */
 5335                                                 pmap_invalidate_page(dst_pmap,
 5336                                                     addr);
 5337                                                 pmap_free_zero_pages(&free);
 5338                                         }
 5339                                         goto out;
 5340                                 }
 5341                                 if (dstmpte->wire_count >= srcmpte->wire_count)
 5342                                         break;
 5343                         }
 5344                         addr += PAGE_SIZE;
 5345                         src_pte++;
 5346                 }
 5347         }
 5348 out:
 5349         if (lock != NULL)
 5350                 rw_wunlock(lock);
 5351         PMAP_UNLOCK(src_pmap);
 5352         PMAP_UNLOCK(dst_pmap);
 5353 }
 5354 
 5355 /*
 5356  *      pmap_zero_page zeros the specified hardware page through its
 5357  *      direct map address using pagezero().
 5358  */
 5359 void
 5360 pmap_zero_page(vm_page_t m)
 5361 {
 5362         vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 5363 
 5364         pagezero((void *)va);
 5365 }
 5366 
 5367 /*
 5368  *      pmap_zero_page_area zeros the specified region of the given hardware
 5369  *      page through its direct map address, using pagezero() or bzero().
 5370  *
 5371  *      off and size may not cover an area beyond a single hardware page.
 5372  */
 5373 void
 5374 pmap_zero_page_area(vm_page_t m, int off, int size)
 5375 {
 5376         vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 5377 
 5378         if (off == 0 && size == PAGE_SIZE)
 5379                 pagezero((void *)va);
 5380         else
 5381                 bzero((char *)va + off, size);
 5382 }
 5383 
 5384 /*
 5385  *      pmap_zero_page_idle zeros the specified hardware page through its
 5386  *      direct map address using pagezero().  This
 5387  *      is intended to be called from the vm_pagezero process only and
 5388  *      outside of Giant.
 5389  */
 5390 void
 5391 pmap_zero_page_idle(vm_page_t m)
 5392 {
 5393         vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 5394 
 5395         pagezero((void *)va);
 5396 }
 5397 
 5398 /*
 5399  *      pmap_copy_page copies the specified (machine independent)
 5400  *      page through the direct map, copying one machine dependent
 5401  *      page at a time with pagecopy() rather than by creating a
 5402  *      temporary kernel mapping.
 5403  */
 5404 void
 5405 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
 5406 {
 5407         vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
 5408         vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
 5409 
 5410         pagecopy((void *)src, (void *)dst);
 5411 }
 5412 
 5413 int unmapped_buf_allowed = 1;
 5414 
 5415 void
 5416 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
 5417     vm_offset_t b_offset, int xfersize)
 5418 {
 5419         void *a_cp, *b_cp;
 5420         vm_page_t pages[2];
 5421         vm_offset_t vaddr[2], a_pg_offset, b_pg_offset;
 5422         int cnt;
 5423         boolean_t mapped;
 5424 
 5425         while (xfersize > 0) {
 5426                 a_pg_offset = a_offset & PAGE_MASK;
 5427                 pages[0] = ma[a_offset >> PAGE_SHIFT];
 5428                 b_pg_offset = b_offset & PAGE_MASK;
 5429                 pages[1] = mb[b_offset >> PAGE_SHIFT];
 5430                 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
 5431                 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
 5432                 mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE);
 5433                 a_cp = (char *)vaddr[0] + a_pg_offset;
 5434                 b_cp = (char *)vaddr[1] + b_pg_offset;
 5435                 bcopy(a_cp, b_cp, cnt);
 5436                 if (__predict_false(mapped))
 5437                         pmap_unmap_io_transient(pages, vaddr, 2, FALSE);
 5438                 a_offset += cnt;
 5439                 b_offset += cnt;
 5440                 xfersize -= cnt;
 5441         }
 5442 }
 5443 
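/*
 * Illustrative userspace sketch (not part of pmap.c): pmap_copy_pages()
 * above chunks the transfer by whichever limit is reached first -- the
 * bytes left in the current source page, the bytes left in the current
 * destination page, or the bytes left to copy.  The same loop over
 * ordinary page-sized buffers; demo_copy_pages() and the DEMO_* constants
 * are hypothetical and assume 4KB pages.
 */
#include <stdint.h>
#include <string.h>

#define DEMO_PAGE_SHIFT 12
#define DEMO_PAGE_SIZE  ((size_t)1 << DEMO_PAGE_SHIFT)
#define DEMO_PAGE_MASK  (DEMO_PAGE_SIZE - 1)
#define DEMO_MIN(a, b)  ((a) < (b) ? (a) : (b))

static void
demo_copy_pages(uint8_t *const ma[], size_t a_offset,
    uint8_t *const mb[], size_t b_offset, size_t xfersize)
{
        size_t a_pg_offset, b_pg_offset, cnt;

        while (xfersize > 0) {
                /* Offsets within the current source and destination pages. */
                a_pg_offset = a_offset & DEMO_PAGE_MASK;
                b_pg_offset = b_offset & DEMO_PAGE_MASK;
                /* Never copy past the end of either page. */
                cnt = DEMO_MIN(xfersize, DEMO_PAGE_SIZE - a_pg_offset);
                cnt = DEMO_MIN(cnt, DEMO_PAGE_SIZE - b_pg_offset);
                memcpy(mb[b_offset >> DEMO_PAGE_SHIFT] + b_pg_offset,
                    ma[a_offset >> DEMO_PAGE_SHIFT] + a_pg_offset, cnt);
                a_offset += cnt;
                b_offset += cnt;
                xfersize -= cnt;
        }
}
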
 5444 /*
 5445  * Returns true if the pmap's pv is one of the first
 5446  * 16 pvs linked to from this page.  This count may
 5447  * be changed upwards or downwards in the future; it
 5448  * is only necessary that true be returned for a small
 5449  * subset of pmaps for proper page aging.
 5450  */
 5451 boolean_t
 5452 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 5453 {
 5454         struct md_page *pvh;
 5455         struct rwlock *lock;
 5456         pv_entry_t pv;
 5457         int loops = 0;
 5458         boolean_t rv;
 5459 
 5460         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 5461             ("pmap_page_exists_quick: page %p is not managed", m));
 5462         rv = FALSE;
 5463         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 5464         rw_rlock(lock);
 5465         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 5466                 if (PV_PMAP(pv) == pmap) {
 5467                         rv = TRUE;
 5468                         break;
 5469                 }
 5470                 loops++;
 5471                 if (loops >= 16)
 5472                         break;
 5473         }
 5474         if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
 5475                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 5476                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 5477                         if (PV_PMAP(pv) == pmap) {
 5478                                 rv = TRUE;
 5479                                 break;
 5480                         }
 5481                         loops++;
 5482                         if (loops >= 16)
 5483                                 break;
 5484                 }
 5485         }
 5486         rw_runlock(lock);
 5487         return (rv);
 5488 }
 5489 
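/*
 * Illustrative userspace sketch (not part of pmap.c): the "quick exists"
 * test above deliberately inspects only the first 16 pv entries, trading
 * accuracy for a bounded amount of work.  A minimal analogue over a plain
 * singly linked list; struct demo_pv and demo_exists_quick() are
 * hypothetical names.
 */
#include <stdbool.h>
#include <stddef.h>

struct demo_pv {
        const void      *owner;         /* stands in for PV_PMAP(pv) */
        struct demo_pv  *next;
};

static bool
demo_exists_quick(const struct demo_pv *head, const void *owner)
{
        const struct demo_pv *pv;
        int loops = 0;

        /* Give up after 16 entries even if the owner might appear later. */
        for (pv = head; pv != NULL; pv = pv->next) {
                if (pv->owner == owner)
                        return (true);
                if (++loops >= 16)
                        break;
        }
        return (false);
}
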
 5490 /*
 5491  *      pmap_page_wired_mappings:
 5492  *
 5493  *      Return the number of managed mappings to the given physical page
 5494  *      that are wired.
 5495  */
 5496 int
 5497 pmap_page_wired_mappings(vm_page_t m)
 5498 {
 5499         struct rwlock *lock;
 5500         struct md_page *pvh;
 5501         pmap_t pmap;
 5502         pt_entry_t *pte;
 5503         pv_entry_t pv;
 5504         int count, md_gen, pvh_gen;
 5505 
 5506         if ((m->oflags & VPO_UNMANAGED) != 0)
 5507                 return (0);
 5508         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 5509         rw_rlock(lock);
 5510 restart:
 5511         count = 0;
 5512         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 5513                 pmap = PV_PMAP(pv);
 5514                 if (!PMAP_TRYLOCK(pmap)) {
 5515                         md_gen = m->md.pv_gen;
 5516                         rw_runlock(lock);
 5517                         PMAP_LOCK(pmap);
 5518                         rw_rlock(lock);
 5519                         if (md_gen != m->md.pv_gen) {
 5520                                 PMAP_UNLOCK(pmap);
 5521                                 goto restart;
 5522                         }
 5523                 }
 5524                 pte = pmap_pte(pmap, pv->pv_va);
 5525                 if ((*pte & PG_W) != 0)
 5526                         count++;
 5527                 PMAP_UNLOCK(pmap);
 5528         }
 5529         if ((m->flags & PG_FICTITIOUS) == 0) {
 5530                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 5531                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 5532                         pmap = PV_PMAP(pv);
 5533                         if (!PMAP_TRYLOCK(pmap)) {
 5534                                 md_gen = m->md.pv_gen;
 5535                                 pvh_gen = pvh->pv_gen;
 5536                                 rw_runlock(lock);
 5537                                 PMAP_LOCK(pmap);
 5538                                 rw_rlock(lock);
 5539                                 if (md_gen != m->md.pv_gen ||
 5540                                     pvh_gen != pvh->pv_gen) {
 5541                                         PMAP_UNLOCK(pmap);
 5542                                         goto restart;
 5543                                 }
 5544                         }
 5545                         pte = pmap_pde(pmap, pv->pv_va);
 5546                         if ((*pte & PG_W) != 0)
 5547                                 count++;
 5548                         PMAP_UNLOCK(pmap);
 5549                 }
 5550         }
 5551         rw_runlock(lock);
 5552         return (count);
 5553 }
 5554 
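/*
 * Illustrative userspace sketch (not part of pmap.c): the PMAP_TRYLOCK /
 * pv_gen pattern above is a lock-order workaround -- if the pmap lock
 * cannot be taken opportunistically while the pv list lock is held, the
 * list lock is dropped, both locks are taken in the safe order, and a
 * generation counter detects whether the list changed in the unlocked
 * window.  A pthread analogue with hypothetical names (demo_list_lock,
 * demo_inner, demo_gen, demo_locked_walk):
 */
#include <pthread.h>

static pthread_mutex_t demo_list_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t demo_inner = PTHREAD_MUTEX_INITIALIZER;
static unsigned demo_gen;       /* bumped (under demo_list_lock) on change */

static void
demo_locked_walk(void (*visit)(void))
{
        unsigned gen;

        pthread_mutex_lock(&demo_list_lock);
restart:
        if (pthread_mutex_trylock(&demo_inner) != 0) {
                /* Remember the generation, then relock in the safe order. */
                gen = demo_gen;
                pthread_mutex_unlock(&demo_list_lock);
                pthread_mutex_lock(&demo_inner);
                pthread_mutex_lock(&demo_list_lock);
                if (gen != demo_gen) {
                        /* The list changed while it was unlocked; restart. */
                        pthread_mutex_unlock(&demo_inner);
                        goto restart;
                }
        }
        visit();
        pthread_mutex_unlock(&demo_inner);
        pthread_mutex_unlock(&demo_list_lock);
}
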
 5555 /*
 5556  * Returns TRUE if the given page is mapped individually or as part of
 5557  * a 2mpage.  Otherwise, returns FALSE.
 5558  */
 5559 boolean_t
 5560 pmap_page_is_mapped(vm_page_t m)
 5561 {
 5562         struct rwlock *lock;
 5563         boolean_t rv;
 5564 
 5565         if ((m->oflags & VPO_UNMANAGED) != 0)
 5566                 return (FALSE);
 5567         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 5568         rw_rlock(lock);
 5569         rv = !TAILQ_EMPTY(&m->md.pv_list) ||
 5570             ((m->flags & PG_FICTITIOUS) == 0 &&
 5571             !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
 5572         rw_runlock(lock);
 5573         return (rv);
 5574 }
 5575 
 5576 /*
 5577  * Destroy all managed, non-wired mappings in the given user-space
 5578  * pmap.  This pmap cannot be active on any processor besides the
 5579  * caller.
 5580  *
 5581  * This function cannot be applied to the kernel pmap.  Moreover, it
 5582  * is not intended for general use.  It is only to be used during
 5583  * process termination.  Consequently, it can be implemented in ways
 5584  * that make it faster than pmap_remove().  First, it can more quickly
 5585  * destroy mappings by iterating over the pmap's collection of PV
 5586  * entries, rather than searching the page table.  Second, it doesn't
 5587  * have to test and clear the page table entries atomically, because
 5588  * no processor is currently accessing the user address space.  In
 5589  * particular, a page table entry's dirty bit won't change state once
 5590  * this function starts.
 5591  */
 5592 void
 5593 pmap_remove_pages(pmap_t pmap)
 5594 {
 5595         pd_entry_t ptepde;
 5596         pt_entry_t *pte, tpte;
 5597         pt_entry_t PG_M, PG_RW, PG_V;
 5598         struct spglist free;
 5599         vm_page_t m, mpte, mt;
 5600         pv_entry_t pv;
 5601         struct md_page *pvh;
 5602         struct pv_chunk *pc, *npc;
 5603         struct rwlock *lock;
 5604         int64_t bit;
 5605         uint64_t inuse, bitmask;
 5606         int allfree, field, freed, idx;
 5607         boolean_t superpage;
 5608         vm_paddr_t pa;
 5609 
 5610         /*
 5611          * Assert that the given pmap is only active on the current
 5612          * CPU.  Unfortunately, we cannot block another CPU from
 5613          * activating the pmap while this function is executing.
 5614          */
 5615         KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
 5616 #ifdef INVARIANTS
 5617         {
 5618                 cpuset_t other_cpus;
 5619 
 5620                 other_cpus = all_cpus;
 5621                 critical_enter();
 5622                 CPU_CLR(PCPU_GET(cpuid), &other_cpus);
 5623                 CPU_AND(&other_cpus, &pmap->pm_active);
 5624                 critical_exit();
 5625                 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
 5626         }
 5627 #endif
 5628 
 5629         lock = NULL;
 5630         PG_M = pmap_modified_bit(pmap);
 5631         PG_V = pmap_valid_bit(pmap);
 5632         PG_RW = pmap_rw_bit(pmap);
 5633 
 5634         SLIST_INIT(&free);
 5635         PMAP_LOCK(pmap);
 5636         TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 5637                 allfree = 1;
 5638                 freed = 0;
 5639                 for (field = 0; field < _NPCM; field++) {
 5640                         inuse = ~pc->pc_map[field] & pc_freemask[field];
 5641                         while (inuse != 0) {
 5642                                 bit = bsfq(inuse);
 5643                                 bitmask = 1UL << bit;
 5644                                 idx = field * 64 + bit;
 5645                                 pv = &pc->pc_pventry[idx];
 5646                                 inuse &= ~bitmask;
 5647 
 5648                                 pte = pmap_pdpe(pmap, pv->pv_va);
 5649                                 ptepde = *pte;
 5650                                 pte = pmap_pdpe_to_pde(pte, pv->pv_va);
 5651                                 tpte = *pte;
 5652                                 if ((tpte & (PG_PS | PG_V)) == PG_V) {
 5653                                         superpage = FALSE;
 5654                                         ptepde = tpte;
 5655                                         pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
 5656                                             PG_FRAME);
 5657                                         pte = &pte[pmap_pte_index(pv->pv_va)];
 5658                                         tpte = *pte;
 5659                                 } else {
 5660                                         /*
 5661                                          * Keep track of whether 'tpte' is a
 5662                                          * superpage explicitly instead of
 5663                                          * relying on PG_PS being set.
 5664                                          *
 5665                                          * This is because PG_PS is numerically
 5666                                          * identical to PG_PTE_PAT and thus a
 5667                                          * regular page could be mistaken for
 5668                                          * a superpage.
 5669                                          */
 5670                                         superpage = TRUE;
 5671                                 }
 5672 
 5673                                 if ((tpte & PG_V) == 0) {
 5674                                         panic("bad pte va %lx pte %lx",
 5675                                             pv->pv_va, tpte);
 5676                                 }
 5677 
 5678 /*
 5679  * We cannot remove wired pages from a process' mapping at this time
 5680  */
 5681                                 if (tpte & PG_W) {
 5682                                         allfree = 0;
 5683                                         continue;
 5684                                 }
 5685 
 5686                                 if (superpage)
 5687                                         pa = tpte & PG_PS_FRAME;
 5688                                 else
 5689                                         pa = tpte & PG_FRAME;
 5690 
 5691                                 m = PHYS_TO_VM_PAGE(pa);
 5692                                 KASSERT(m->phys_addr == pa,
 5693                                     ("vm_page_t %p phys_addr mismatch %016jx %016jx",
 5694                                     m, (uintmax_t)m->phys_addr,
 5695                                     (uintmax_t)tpte));
 5696 
 5697                                 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
 5698                                     m < &vm_page_array[vm_page_array_size],
 5699                                     ("pmap_remove_pages: bad tpte %#jx",
 5700                                     (uintmax_t)tpte));
 5701 
 5702                                 pte_clear(pte);
 5703 
 5704                                 /*
 5705                                  * Update the vm_page_t clean/reference bits.
 5706                                  */
 5707                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 5708                                         if (superpage) {
 5709                                                 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 5710                                                         vm_page_dirty(mt);
 5711                                         } else
 5712                                                 vm_page_dirty(m);
 5713                                 }
 5714 
 5715                                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 5716 
 5717                                 /* Mark free */
 5718                                 pc->pc_map[field] |= bitmask;
 5719                                 if (superpage) {
 5720                                         pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
 5721                                         pvh = pa_to_pvh(tpte & PG_PS_FRAME);
 5722                                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 5723                                         pvh->pv_gen++;
 5724                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
 5725                                                 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 5726                                                         if ((mt->aflags & PGA_WRITEABLE) != 0 &&
 5727                                                             TAILQ_EMPTY(&mt->md.pv_list))
 5728                                                                 vm_page_aflag_clear(mt, PGA_WRITEABLE);
 5729                                         }
 5730                                         mpte = pmap_remove_pt_page(pmap, pv->pv_va);
 5731                                         if (mpte != NULL) {
 5732                                                 pmap_resident_count_dec(pmap, 1);
 5733                                                 KASSERT(mpte->wire_count == NPTEPG,
 5734                                                     ("pmap_remove_pages: pte page wire count error"));
 5735                                                 mpte->wire_count = 0;
 5736                                                 pmap_add_delayed_free_list(mpte, &free, FALSE);
 5737                                                 atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 5738                                         }
 5739                                 } else {
 5740                                         pmap_resident_count_dec(pmap, 1);
 5741                                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 5742                                         m->md.pv_gen++;
 5743                                         if ((m->aflags & PGA_WRITEABLE) != 0 &&
 5744                                             TAILQ_EMPTY(&m->md.pv_list) &&
 5745                                             (m->flags & PG_FICTITIOUS) == 0) {
 5746                                                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 5747                                                 if (TAILQ_EMPTY(&pvh->pv_list))
 5748                                                         vm_page_aflag_clear(m, PGA_WRITEABLE);
 5749                                         }
 5750                                 }
 5751                                 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
 5752                                 freed++;
 5753                         }
 5754                 }
 5755                 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
 5756                 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
 5757                 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 5758                 if (allfree) {
 5759                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 5760                         free_pv_chunk(pc);
 5761                 }
 5762         }
 5763         if (lock != NULL)
 5764                 rw_wunlock(lock);
 5765         pmap_invalidate_all(pmap);
 5766         PMAP_UNLOCK(pmap);
 5767         pmap_free_zero_pages(&free);
 5768 }
 5769 
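/*
 * Illustrative userspace sketch (not part of pmap.c): the chunk walk in
 * pmap_remove_pages() visits allocated pv entries by scanning a 64-bit
 * bitmap with bsfq() and clearing each bit as it is consumed.  The same
 * idiom with the GCC/Clang builtin __builtin_ctzll(); demo_visit_used()
 * and demo_print_idx() are hypothetical names.
 */
#include <stdint.h>
#include <stdio.h>

static void
demo_visit_used(uint64_t free_map, uint64_t free_mask, void (*visit)(int))
{
        uint64_t inuse;
        int bit;

        /* A clear bit in free_map means "in use", as with pc_map[] above. */
        inuse = ~free_map & free_mask;
        while (inuse != 0) {
                bit = __builtin_ctzll(inuse);   /* lowest set bit, like bsfq */
                visit(bit);
                inuse &= ~(1ULL << bit);        /* this entry is done */
        }
}

static void
demo_print_idx(int idx)
{

        printf("in-use entry at index %d\n", idx);
}

int
main(void)
{

        /* Entries 0, 3, and 63 are allocated; everything else is free. */
        demo_visit_used(~((1ULL << 0) | (1ULL << 3) | (1ULL << 63)), ~0ULL,
            demo_print_idx);
        return (0);
}
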
 5770 static boolean_t
 5771 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
 5772 {
 5773         struct rwlock *lock;
 5774         pv_entry_t pv;
 5775         struct md_page *pvh;
 5776         pt_entry_t *pte, mask;
 5777         pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 5778         pmap_t pmap;
 5779         int md_gen, pvh_gen;
 5780         boolean_t rv;
 5781 
 5782         rv = FALSE;
 5783         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 5784         rw_rlock(lock);
 5785 restart:
 5786         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 5787                 pmap = PV_PMAP(pv);
 5788                 if (!PMAP_TRYLOCK(pmap)) {
 5789                         md_gen = m->md.pv_gen;
 5790                         rw_runlock(lock);
 5791                         PMAP_LOCK(pmap);
 5792                         rw_rlock(lock);
 5793                         if (md_gen != m->md.pv_gen) {
 5794                                 PMAP_UNLOCK(pmap);
 5795                                 goto restart;
 5796                         }
 5797                 }
 5798                 pte = pmap_pte(pmap, pv->pv_va);
 5799                 mask = 0;
 5800                 if (modified) {
 5801                         PG_M = pmap_modified_bit(pmap);
 5802                         PG_RW = pmap_rw_bit(pmap);
 5803                         mask |= PG_RW | PG_M;
 5804                 }
 5805                 if (accessed) {
 5806                         PG_A = pmap_accessed_bit(pmap);
 5807                         PG_V = pmap_valid_bit(pmap);
 5808                         mask |= PG_V | PG_A;
 5809                 }
 5810                 rv = (*pte & mask) == mask;
 5811                 PMAP_UNLOCK(pmap);
 5812                 if (rv)
 5813                         goto out;
 5814         }
 5815         if ((m->flags & PG_FICTITIOUS) == 0) {
 5816                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 5817                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 5818                         pmap = PV_PMAP(pv);
 5819                         if (!PMAP_TRYLOCK(pmap)) {
 5820                                 md_gen = m->md.pv_gen;
 5821                                 pvh_gen = pvh->pv_gen;
 5822                                 rw_runlock(lock);
 5823                                 PMAP_LOCK(pmap);
 5824                                 rw_rlock(lock);
 5825                                 if (md_gen != m->md.pv_gen ||
 5826                                     pvh_gen != pvh->pv_gen) {
 5827                                         PMAP_UNLOCK(pmap);
 5828                                         goto restart;
 5829                                 }
 5830                         }
 5831                         pte = pmap_pde(pmap, pv->pv_va);
 5832                         mask = 0;
 5833                         if (modified) {
 5834                                 PG_M = pmap_modified_bit(pmap);
 5835                                 PG_RW = pmap_rw_bit(pmap);
 5836                                 mask |= PG_RW | PG_M;
 5837                         }
 5838                         if (accessed) {
 5839                                 PG_A = pmap_accessed_bit(pmap);
 5840                                 PG_V = pmap_valid_bit(pmap);
 5841                                 mask |= PG_V | PG_A;
 5842                         }
 5843                         rv = (*pte & mask) == mask;
 5844                         PMAP_UNLOCK(pmap);
 5845                         if (rv)
 5846                                 goto out;
 5847                 }
 5848         }
 5849 out:
 5850         rw_runlock(lock);
 5851         return (rv);
 5852 }
 5853 
 5854 /*
 5855  *      pmap_is_modified:
 5856  *
 5857  *      Return whether or not the specified physical page was modified
 5858  *      in any physical maps.
 5859  */
 5860 boolean_t
 5861 pmap_is_modified(vm_page_t m)
 5862 {
 5863 
 5864         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 5865             ("pmap_is_modified: page %p is not managed", m));
 5866 
 5867         /*
 5868          * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
 5869          * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
 5870          * is clear, no PTEs can have PG_M set.
 5871          */
 5872         VM_OBJECT_ASSERT_WLOCKED(m->object);
 5873         if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
 5874                 return (FALSE);
 5875         return (pmap_page_test_mappings(m, FALSE, TRUE));
 5876 }
 5877 
 5878 /*
 5879  *      pmap_is_prefaultable:
 5880  *
 5881  *      Return whether or not the specified virtual address is eligible
 5882  *      for prefault.
 5883  */
 5884 boolean_t
 5885 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 5886 {
 5887         pd_entry_t *pde;
 5888         pt_entry_t *pte, PG_V;
 5889         boolean_t rv;
 5890 
 5891         PG_V = pmap_valid_bit(pmap);
 5892         rv = FALSE;
 5893         PMAP_LOCK(pmap);
 5894         pde = pmap_pde(pmap, addr);
 5895         if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
 5896                 pte = pmap_pde_to_pte(pde, addr);
 5897                 rv = (*pte & PG_V) == 0;
 5898         }
 5899         PMAP_UNLOCK(pmap);
 5900         return (rv);
 5901 }
 5902 
 5903 /*
 5904  *      pmap_is_referenced:
 5905  *
 5906  *      Return whether or not the specified physical page was referenced
 5907  *      in any physical maps.
 5908  */
 5909 boolean_t
 5910 pmap_is_referenced(vm_page_t m)
 5911 {
 5912 
 5913         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 5914             ("pmap_is_referenced: page %p is not managed", m));
 5915         return (pmap_page_test_mappings(m, TRUE, FALSE));
 5916 }
 5917 
 5918 /*
 5919  * Clear the write and modified bits in each of the given page's mappings.
 5920  */
 5921 void
 5922 pmap_remove_write(vm_page_t m)
 5923 {
 5924         struct md_page *pvh;
 5925         pmap_t pmap;
 5926         struct rwlock *lock;
 5927         pv_entry_t next_pv, pv;
 5928         pd_entry_t *pde;
 5929         pt_entry_t oldpte, *pte, PG_M, PG_RW;
 5930         vm_offset_t va;
 5931         int pvh_gen, md_gen;
 5932 
 5933         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 5934             ("pmap_remove_write: page %p is not managed", m));
 5935 
 5936         /*
 5937          * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
 5938          * set by another thread while the object is locked.  Thus,
 5939          * if PGA_WRITEABLE is clear, no page table entries need updating.
 5940          */
 5941         VM_OBJECT_ASSERT_WLOCKED(m->object);
 5942         if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
 5943                 return;
 5944         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 5945         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 5946             pa_to_pvh(VM_PAGE_TO_PHYS(m));
 5947 retry_pv_loop:
 5948         rw_wlock(lock);
 5949         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 5950                 pmap = PV_PMAP(pv);
 5951                 if (!PMAP_TRYLOCK(pmap)) {
 5952                         pvh_gen = pvh->pv_gen;
 5953                         rw_wunlock(lock);
 5954                         PMAP_LOCK(pmap);
 5955                         rw_wlock(lock);
 5956                         if (pvh_gen != pvh->pv_gen) {
 5957                                 PMAP_UNLOCK(pmap);
 5958                                 rw_wunlock(lock);
 5959                                 goto retry_pv_loop;
 5960                         }
 5961                 }
 5962                 PG_RW = pmap_rw_bit(pmap);
 5963                 va = pv->pv_va;
 5964                 pde = pmap_pde(pmap, va);
 5965                 if ((*pde & PG_RW) != 0)
 5966                         (void)pmap_demote_pde_locked(pmap, pde, va, &lock);
 5967                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 5968                     ("inconsistent pv lock %p %p for page %p",
 5969                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 5970                 PMAP_UNLOCK(pmap);
 5971         }
 5972         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 5973                 pmap = PV_PMAP(pv);
 5974                 if (!PMAP_TRYLOCK(pmap)) {
 5975                         pvh_gen = pvh->pv_gen;
 5976                         md_gen = m->md.pv_gen;
 5977                         rw_wunlock(lock);
 5978                         PMAP_LOCK(pmap);
 5979                         rw_wlock(lock);
 5980                         if (pvh_gen != pvh->pv_gen ||
 5981                             md_gen != m->md.pv_gen) {
 5982                                 PMAP_UNLOCK(pmap);
 5983                                 rw_wunlock(lock);
 5984                                 goto retry_pv_loop;
 5985                         }
 5986                 }
 5987                 PG_M = pmap_modified_bit(pmap);
 5988                 PG_RW = pmap_rw_bit(pmap);
 5989                 pde = pmap_pde(pmap, pv->pv_va);
 5990                 KASSERT((*pde & PG_PS) == 0,
 5991                     ("pmap_remove_write: found a 2mpage in page %p's pv list",
 5992                     m));
 5993                 pte = pmap_pde_to_pte(pde, pv->pv_va);
 5994 retry:
 5995                 oldpte = *pte;
 5996                 if (oldpte & PG_RW) {
 5997                         if (!atomic_cmpset_long(pte, oldpte, oldpte &
 5998                             ~(PG_RW | PG_M)))
 5999                                 goto retry;
 6000                         if ((oldpte & PG_M) != 0)
 6001                                 vm_page_dirty(m);
 6002                         pmap_invalidate_page(pmap, pv->pv_va);
 6003                 }
 6004                 PMAP_UNLOCK(pmap);
 6005         }
 6006         rw_wunlock(lock);
 6007         vm_page_aflag_clear(m, PGA_WRITEABLE);
 6008         pmap_delayed_invl_wait(m);
 6009 }
 6010 
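/*
 * Illustrative userspace sketch (not part of pmap.c): the retry loop above
 * atomically strips the writable and modified bits from a pte and, if the
 * modified bit was set in the value actually replaced, propagates that
 * dirtiness to the page.  A C11-atomics analogue; DEMO_RW, DEMO_M, and
 * demo_revoke_write() are hypothetical stand-ins for PG_RW, PG_M, and the
 * loop above.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define DEMO_RW 0x2ULL
#define DEMO_M  0x40ULL

/* Returns true if the entry was dirty when write access was revoked. */
static bool
demo_revoke_write(_Atomic uint64_t *pte)
{
        uint64_t oldpte;

        oldpte = atomic_load(pte);
        while ((oldpte & DEMO_RW) != 0) {
                if (atomic_compare_exchange_weak(pte, &oldpte,
                    oldpte & ~(DEMO_RW | DEMO_M)))
                        return ((oldpte & DEMO_M) != 0);
                /* A failed CAS refreshed oldpte; re-test and retry. */
        }
        return (false);
}
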
 6011 static __inline boolean_t
 6012 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
 6013 {
 6014 
 6015         if (!pmap_emulate_ad_bits(pmap))
 6016                 return (TRUE);
 6017 
 6018         KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
 6019 
 6020         /*
 6021          * XWR = 010 or 110 will cause an unconditional EPT misconfiguration
 6022          * so we don't allow the referenced (aka EPT_PG_READ) bit to be cleared
 6023          * if the EPT_PG_WRITE bit is set.
 6024          */
 6025         if ((pte & EPT_PG_WRITE) != 0)
 6026                 return (FALSE);
 6027 
 6028         /*
 6029          * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY flag is set.
 6030          */
 6031         if ((pte & EPT_PG_EXECUTE) == 0 ||
 6032             ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
 6033                 return (TRUE);
 6034         else
 6035                 return (FALSE);
 6036 }
 6037 
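/*
 * Illustrative userspace sketch (not part of pmap.c): the rule encoded in
 * safe_to_clear_referenced() -- write permission without read permission is
 * an EPT misconfiguration, and execute-only is legal only when the hardware
 * supports it -- can be tabulated directly.  Based only on the comments
 * above; demo_ept_xwr_valid() is a hypothetical name.
 */
#include <stdbool.h>
#include <stdio.h>

static bool
demo_ept_xwr_valid(bool x, bool w, bool r, bool exec_only_ok)
{

        if (w && !r)            /* XWR = 010 or 110: misconfiguration */
                return (false);
        if (x && !w && !r)      /* XWR = 100: execute-only */
                return (exec_only_ok);
        return (true);
}

int
main(void)
{
        int xwr;

        for (xwr = 0; xwr < 8; xwr++)
                printf("XWR=%d%d%d valid=%d with-exec-only=%d\n",
                    (xwr >> 2) & 1, (xwr >> 1) & 1, xwr & 1,
                    demo_ept_xwr_valid(xwr & 4, xwr & 2, xwr & 1, false),
                    demo_ept_xwr_valid(xwr & 4, xwr & 2, xwr & 1, true));
        return (0);
}
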
 6038 #define PMAP_TS_REFERENCED_MAX  5
 6039 
 6040 /*
 6041  *      pmap_ts_referenced:
 6042  *
 6043  *      Return a count of reference bits for a page, clearing those bits.
 6044  *      It is not necessary for every reference bit to be cleared, but it
 6045  *      is necessary that 0 only be returned when there are truly no
 6046  *      reference bits set.
 6047  *
 6048  *      XXX: The exact number of bits to check and clear is a matter that
 6049  *      should be tested and standardized at some point in the future for
 6050  *      optimal aging of shared pages.
 6051  *
 6052  *      As an optimization, update the page's dirty field if a modified bit is
 6053  *      found while counting reference bits.  This opportunistic update can be
 6054  *      performed at low cost and can eliminate the need for some future calls
 6055  *      to pmap_is_modified().  However, since this function stops after
 6056  *      finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
 6057  *      dirty pages.  Those dirty pages will only be detected by a future call
 6058  *      to pmap_is_modified().
 6059  *
 6060  *      A DI block is not needed within this function, because
 6061  *      invalidations are performed before the PV list lock is
 6062  *      released.
 6063  */
 6064 int
 6065 pmap_ts_referenced(vm_page_t m)
 6066 {
 6067         struct md_page *pvh;
 6068         pv_entry_t pv, pvf;
 6069         pmap_t pmap;
 6070         struct rwlock *lock;
 6071         pd_entry_t oldpde, *pde;
 6072         pt_entry_t *pte, PG_A, PG_M, PG_RW;
 6073         vm_offset_t va;
 6074         vm_paddr_t pa;
 6075         int cleared, md_gen, not_cleared, pvh_gen;
 6076         struct spglist free;
 6077         boolean_t demoted;
 6078 
 6079         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 6080             ("pmap_ts_referenced: page %p is not managed", m));
 6081         SLIST_INIT(&free);
 6082         cleared = 0;
 6083         pa = VM_PAGE_TO_PHYS(m);
 6084         lock = PHYS_TO_PV_LIST_LOCK(pa);
 6085         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
 6086         rw_wlock(lock);
 6087 retry:
 6088         not_cleared = 0;
 6089         if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
 6090                 goto small_mappings;
 6091         pv = pvf;
 6092         do {
 6093                 if (pvf == NULL)
 6094                         pvf = pv;
 6095                 pmap = PV_PMAP(pv);
 6096                 if (!PMAP_TRYLOCK(pmap)) {
 6097                         pvh_gen = pvh->pv_gen;
 6098                         rw_wunlock(lock);
 6099                         PMAP_LOCK(pmap);
 6100                         rw_wlock(lock);
 6101                         if (pvh_gen != pvh->pv_gen) {
 6102                                 PMAP_UNLOCK(pmap);
 6103                                 goto retry;
 6104                         }
 6105                 }
 6106                 PG_A = pmap_accessed_bit(pmap);
 6107                 PG_M = pmap_modified_bit(pmap);
 6108                 PG_RW = pmap_rw_bit(pmap);
 6109                 va = pv->pv_va;
 6110                 pde = pmap_pde(pmap, pv->pv_va);
 6111                 oldpde = *pde;
 6112                 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 6113                         /*
 6114                          * Although "oldpde" is mapping a 2MB page, because
 6115                          * this function is called at a 4KB page granularity,
 6116                          * we only update the 4KB page under test.
 6117                          */
 6118                         vm_page_dirty(m);
 6119                 }
 6120                 if ((*pde & PG_A) != 0) {
 6121                         /*
 6122                          * Since this reference bit is shared by 512 4KB
 6123                          * pages, it should not be cleared every time it is
 6124                          * tested.  Apply a simple "hash" function on the
 6125                          * physical page number, the virtual superpage number,
 6126                          * and the pmap address to select one 4KB page out of
 6127                          * the 512 on which testing the reference bit will
 6128                          * result in clearing that reference bit.  This
 6129                          * function is designed to avoid the selection of the
 6130                          * same 4KB page for every 2MB page mapping.
 6131                          *
 6132                          * On demotion, a mapping that hasn't been referenced
 6133                          * is simply destroyed.  To avoid the possibility of a
 6134                          * subsequent page fault on a demoted wired mapping,
 6135                          * always leave its reference bit set.  Moreover,
 6136                          * since the superpage is wired, the current state of
 6137                          * its reference bit won't affect page replacement.
 6138                          */
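                               /*
                                * Because the 2MB frame is physically aligned,
                                * the low nine bits of (pa >> PAGE_SHIFT) take
                                * each value from 0 to 511 exactly once across
                                * the superpage, so for a given (va, pmap)
                                * pair exactly one of the 512 constituent 4KB
                                * pages passes the test below.
                                */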
 6139                         if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
 6140                             (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
 6141                             (*pde & PG_W) == 0) {
 6142                                 if (safe_to_clear_referenced(pmap, oldpde)) {
 6143                                         atomic_clear_long(pde, PG_A);
 6144                                         pmap_invalidate_page(pmap, pv->pv_va);
 6145                                         demoted = FALSE;
 6146                                 } else if (pmap_demote_pde_locked(pmap, pde,
 6147                                     pv->pv_va, &lock)) {
 6148                                         /*
 6149                                          * Remove the mapping to a single page
 6150                                          * so that a subsequent access may
 6151                                          * repromote.  Since the underlying
 6152                                          * page table page is fully populated,
 6153                                          * this removal never frees a page
 6154                                          * table page.
 6155                                          */
 6156                                         demoted = TRUE;
 6157                                         va += VM_PAGE_TO_PHYS(m) - (oldpde &
 6158                                             PG_PS_FRAME);
 6159                                         pte = pmap_pde_to_pte(pde, va);
 6160                                         pmap_remove_pte(pmap, pte, va, *pde,
 6161                                             NULL, &lock);
 6162                                         pmap_invalidate_page(pmap, va);
 6163                                 } else
 6164                                         demoted = TRUE;
 6165 
 6166                                 if (demoted) {
 6167                                         /*
 6168                                          * The superpage mapping was removed
 6169                                          * entirely and therefore 'pv' is no
 6170                                          * longer valid.
 6171                                          */
 6172                                         if (pvf == pv)
 6173                                                 pvf = NULL;
 6174                                         pv = NULL;
 6175                                 }
 6176                                 cleared++;
 6177                                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 6178                                     ("inconsistent pv lock %p %p for page %p",
 6179                                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 6180                         } else
 6181                                 not_cleared++;
 6182                 }
 6183                 PMAP_UNLOCK(pmap);
 6184                 /* Rotate the PV list if it has more than one entry. */
 6185                 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
 6186                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 6187                         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 6188                         pvh->pv_gen++;
 6189                 }
 6190                 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
 6191                         goto out;
 6192         } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
 6193 small_mappings:
 6194         if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
 6195                 goto out;
 6196         pv = pvf;
 6197         do {
 6198                 if (pvf == NULL)
 6199                         pvf = pv;
 6200                 pmap = PV_PMAP(pv);
 6201                 if (!PMAP_TRYLOCK(pmap)) {
 6202                         pvh_gen = pvh->pv_gen;
 6203                         md_gen = m->md.pv_gen;
 6204                         rw_wunlock(lock);
 6205                         PMAP_LOCK(pmap);
 6206                         rw_wlock(lock);
 6207                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 6208                                 PMAP_UNLOCK(pmap);
 6209                                 goto retry;
 6210                         }
 6211                 }
 6212                 PG_A = pmap_accessed_bit(pmap);
 6213                 PG_M = pmap_modified_bit(pmap);
 6214                 PG_RW = pmap_rw_bit(pmap);
 6215                 pde = pmap_pde(pmap, pv->pv_va);
 6216                 KASSERT((*pde & PG_PS) == 0,
 6217                     ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
 6218                     m));
 6219                 pte = pmap_pde_to_pte(pde, pv->pv_va);
 6220                 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 6221                         vm_page_dirty(m);
 6222                 if ((*pte & PG_A) != 0) {
 6223                         if (safe_to_clear_referenced(pmap, *pte)) {
 6224                                 atomic_clear_long(pte, PG_A);
 6225                                 pmap_invalidate_page(pmap, pv->pv_va);
 6226                                 cleared++;
 6227                         } else if ((*pte & PG_W) == 0) {
 6228                                 /*
 6229                                  * Wired pages cannot be paged out so
 6230                                  * doing accessed bit emulation for
 6231                                  * them is wasted effort. We do the
 6232                                  * hard work for unwired pages only.
 6233                                  */
 6234                                 pmap_remove_pte(pmap, pte, pv->pv_va,
 6235                                     *pde, &free, &lock);
 6236                                 pmap_invalidate_page(pmap, pv->pv_va);
 6237                                 cleared++;
 6238                                 if (pvf == pv)
 6239                                         pvf = NULL;
 6240                                 pv = NULL;
 6241                                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 6242                                     ("inconsistent pv lock %p %p for page %p",
 6243                                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 6244                         } else
 6245                                 not_cleared++;
 6246                 }
 6247                 PMAP_UNLOCK(pmap);
 6248                 /* Rotate the PV list if it has more than one entry. */
 6249                 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
 6250                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 6251                         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 6252                         m->md.pv_gen++;
 6253                 }
 6254         } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
 6255             not_cleared < PMAP_TS_REFERENCED_MAX);
 6256 out:
 6257         rw_wunlock(lock);
 6258         pmap_free_zero_pages(&free);
 6259         return (cleared + not_cleared);
 6260 }
 6261 
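/*
 * Illustrative userspace sketch (not part of pmap.c): the selection hash in
 * pmap_ts_referenced() picks exactly one 4KB page out of each aligned 2MB
 * superpage for a given (superpage va, pmap) pair, so the shared reference
 * bit is sampled rather than cleared on every call.  This property can be
 * checked directly; the DEMO_* constants mirror PAGE_SHIFT, PDRSHIFT, and
 * NPTEPG, demo_selected_index() is hypothetical, and the addresses used in
 * main() are made up.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SHIFT 12
#define DEMO_PDRSHIFT   21
#define DEMO_NPTEPG     512

static int
demo_selected_index(uint64_t pa_2m, uint64_t va_2m, uint64_t pmap_addr)
{
        uint64_t pa;
        int hits = 0, idx = -1, i;

        for (i = 0; i < DEMO_NPTEPG; i++) {
                pa = pa_2m + ((uint64_t)i << DEMO_PAGE_SHIFT);
                if ((((pa >> DEMO_PAGE_SHIFT) ^ (va_2m >> DEMO_PDRSHIFT) ^
                    pmap_addr) & (DEMO_NPTEPG - 1)) == 0) {
                        hits++;
                        idx = i;
                }
        }
        /* Exactly one page within the superpage satisfies the hash. */
        return (hits == 1 ? idx : -1);
}

int
main(void)
{

        printf("selected page index: %d\n", demo_selected_index(0x40000000,
            0x7f0000200000, 0xfffff80011223344));
        printf("selected page index: %d\n", demo_selected_index(0x40000000,
            0x7f0000400000, 0xfffff80011223344));
        return (0);
}
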
 6262 /*
 6263  *      Apply the given advice to the specified range of addresses within the
 6264  *      given pmap.  Depending on the advice, clear the referenced and/or
 6265  *      modified flags in each mapping and set the mapped page's dirty field.
 6266  */
 6267 void
 6268 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
 6269 {
 6270         struct rwlock *lock;
 6271         pml4_entry_t *pml4e;
 6272         pdp_entry_t *pdpe;
 6273         pd_entry_t oldpde, *pde;
 6274         pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
 6275         vm_offset_t va, va_next;
 6276         vm_page_t m;
 6277         boolean_t anychanged;
 6278 
 6279         if (advice != MADV_DONTNEED && advice != MADV_FREE)
 6280                 return;
 6281 
 6282         /*
 6283          * A/D bit emulation requires an alternate code path when clearing
 6284          * the modified and accessed bits below. Since this function is
 6285          * advisory in nature we skip it entirely for pmaps that require
 6286          * A/D bit emulation.
 6287          */
 6288         if (pmap_emulate_ad_bits(pmap))
 6289                 return;
 6290 
 6291         PG_A = pmap_accessed_bit(pmap);
 6292         PG_G = pmap_global_bit(pmap);
 6293         PG_M = pmap_modified_bit(pmap);
 6294         PG_V = pmap_valid_bit(pmap);
 6295         PG_RW = pmap_rw_bit(pmap);
 6296         anychanged = FALSE;
 6297         pmap_delayed_invl_started();
 6298         PMAP_LOCK(pmap);
 6299         for (; sva < eva; sva = va_next) {
 6300                 pml4e = pmap_pml4e(pmap, sva);
 6301                 if ((*pml4e & PG_V) == 0) {
 6302                         va_next = (sva + NBPML4) & ~PML4MASK;
 6303                         if (va_next < sva)
 6304                                 va_next = eva;
 6305                         continue;
 6306                 }
 6307                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 6308                 if ((*pdpe & PG_V) == 0) {
 6309                         va_next = (sva + NBPDP) & ~PDPMASK;
 6310                         if (va_next < sva)
 6311                                 va_next = eva;
 6312                         continue;
 6313                 }
 6314                 va_next = (sva + NBPDR) & ~PDRMASK;
 6315                 if (va_next < sva)
 6316                         va_next = eva;
 6317                 pde = pmap_pdpe_to_pde(pdpe, sva);
 6318                 oldpde = *pde;
 6319                 if ((oldpde & PG_V) == 0)
 6320                         continue;
 6321                 else if ((oldpde & PG_PS) != 0) {
 6322                         if ((oldpde & PG_MANAGED) == 0)
 6323                                 continue;
 6324                         lock = NULL;
 6325                         if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
 6326                                 if (lock != NULL)
 6327                                         rw_wunlock(lock);
 6328 
 6329                                 /*
 6330                                  * The large page mapping was destroyed.
 6331                                  */
 6332                                 continue;
 6333                         }
 6334 
 6335                         /*
 6336                          * Unless the page mappings are wired, remove the
 6337                          * mapping to a single page so that a subsequent
 6338                          * access may repromote.  Since the underlying page
 6339                          * table page is fully populated, this removal never
 6340                          * frees a page table page.
 6341                          */
 6342                         if ((oldpde & PG_W) == 0) {
 6343                                 pte = pmap_pde_to_pte(pde, sva);
 6344                                 KASSERT((*pte & PG_V) != 0,
 6345                                     ("pmap_advise: invalid PTE"));
 6346                                 pmap_remove_pte(pmap, pte, sva, *pde, NULL,
 6347                                     &lock);
 6348                                 anychanged = TRUE;
 6349                         }
 6350                         if (lock != NULL)
 6351                                 rw_wunlock(lock);
 6352                 }
 6353                 if (va_next > eva)
 6354                         va_next = eva;
 6355                 va = va_next;
 6356                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 6357                     sva += PAGE_SIZE) {
 6358                         if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
 6359                                 goto maybe_invlrng;
 6360                         else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 6361                                 if (advice == MADV_DONTNEED) {
 6362                                         /*
 6363                                          * Future calls to pmap_is_modified()
 6364                                          * can be avoided by making the page
 6365                                          * dirty now.
 6366                                          */
 6367                                         m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
 6368                                         vm_page_dirty(m);
 6369                                 }
 6370                                 atomic_clear_long(pte, PG_M | PG_A);
 6371                         } else if ((*pte & PG_A) != 0)
 6372                                 atomic_clear_long(pte, PG_A);
 6373                         else
 6374                                 goto maybe_invlrng;
 6375 
 6376                         if ((*pte & PG_G) != 0) {
 6377                                 if (va == va_next)
 6378                                         va = sva;
 6379                         } else
 6380                                 anychanged = TRUE;
 6381                         continue;
 6382 maybe_invlrng:
 6383                         if (va != va_next) {
 6384                                 pmap_invalidate_range(pmap, va, sva);
 6385                                 va = va_next;
 6386                         }
 6387                 }
 6388                 if (va != va_next)
 6389                         pmap_invalidate_range(pmap, va, sva);
 6390         }
 6391         if (anychanged)
 6392                 pmap_invalidate_all(pmap);
 6393         PMAP_UNLOCK(pmap);
 6394         pmap_delayed_invl_finished();
 6395 }
 6396 
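/*
 * Illustrative userspace sketch (not part of pmap.c): pmap_advise() is
 * reached through the madvise(2) path, for example when a process declares
 * that it no longer needs the contents of a dirtied anonymous region.  A
 * minimal caller (the region size and fill pattern are arbitrary):
 */
#include <sys/mman.h>

#include <err.h>
#include <string.h>

int
main(void)
{
        size_t len = 16 * 4096;
        char *p;

        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
            -1, 0);
        if (p == MAP_FAILED)
                err(1, "mmap");
        memset(p, 0xa5, len);           /* dirty the pages */

        /*
         * Declare the contents disposable; for managed, modified mappings
         * this ultimately clears the accessed and modified bits as above.
         */
        if (madvise(p, len, MADV_FREE) != 0)
                err(1, "madvise");

        if (munmap(p, len) != 0)
                err(1, "munmap");
        return (0);
}
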
 6397 /*
 6398  *      Clear the modify bits on the specified physical page.
 6399  */
 6400 void
 6401 pmap_clear_modify(vm_page_t m)
 6402 {
 6403         struct md_page *pvh;
 6404         pmap_t pmap;
 6405         pv_entry_t next_pv, pv;
 6406         pd_entry_t oldpde, *pde;
 6407         pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V;
 6408         struct rwlock *lock;
 6409         vm_offset_t va;
 6410         int md_gen, pvh_gen;
 6411 
 6412         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 6413             ("pmap_clear_modify: page %p is not managed", m));
 6414         VM_OBJECT_ASSERT_WLOCKED(m->object);
 6415         KASSERT(!vm_page_xbusied(m),
 6416             ("pmap_clear_modify: page %p is exclusive busied", m));
 6417 
 6418         /*
 6419          * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
 6420          * If the object containing the page is locked and the page is not
 6421          * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
 6422          */
 6423         if ((m->aflags & PGA_WRITEABLE) == 0)
 6424                 return;
 6425         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 6426             pa_to_pvh(VM_PAGE_TO_PHYS(m));
 6427         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 6428         rw_wlock(lock);
 6429 restart:
 6430         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 6431                 pmap = PV_PMAP(pv);
 6432                 if (!PMAP_TRYLOCK(pmap)) {
 6433                         pvh_gen = pvh->pv_gen;
 6434                         rw_wunlock(lock);
 6435                         PMAP_LOCK(pmap);
 6436                         rw_wlock(lock);
 6437                         if (pvh_gen != pvh->pv_gen) {
 6438                                 PMAP_UNLOCK(pmap);
 6439                                 goto restart;
 6440                         }
 6441                 }
 6442                 PG_M = pmap_modified_bit(pmap);
 6443                 PG_V = pmap_valid_bit(pmap);
 6444                 PG_RW = pmap_rw_bit(pmap);
 6445                 va = pv->pv_va;
 6446                 pde = pmap_pde(pmap, va);
 6447                 oldpde = *pde;
 6448                 if ((oldpde & PG_RW) != 0) {
 6449                         if (pmap_demote_pde_locked(pmap, pde, va, &lock)) {
 6450                                 if ((oldpde & PG_W) == 0) {
 6451                                         /*
 6452                                          * Write protect the mapping to a
 6453                                          * single page so that a subsequent
 6454                                          * write access may repromote.
 6455                                          */
 6456                                         va += VM_PAGE_TO_PHYS(m) - (oldpde &
 6457                                             PG_PS_FRAME);
 6458                                         pte = pmap_pde_to_pte(pde, va);
 6459                                         oldpte = *pte;
 6460                                         if ((oldpte & PG_V) != 0) {
 6461                                                 while (!atomic_cmpset_long(pte,
 6462                                                     oldpte,
 6463                                                     oldpte & ~(PG_M | PG_RW)))
 6464                                                         oldpte = *pte;
 6465                                                 vm_page_dirty(m);
 6466                                                 pmap_invalidate_page(pmap, va);
 6467                                         }
 6468                                 }
 6469                         }
 6470                 }
 6471                 PMAP_UNLOCK(pmap);
 6472         }
 6473         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 6474                 pmap = PV_PMAP(pv);
 6475                 if (!PMAP_TRYLOCK(pmap)) {
 6476                         md_gen = m->md.pv_gen;
 6477                         pvh_gen = pvh->pv_gen;
 6478                         rw_wunlock(lock);
 6479                         PMAP_LOCK(pmap);
 6480                         rw_wlock(lock);
 6481                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 6482                                 PMAP_UNLOCK(pmap);
 6483                                 goto restart;
 6484                         }
 6485                 }
 6486                 PG_M = pmap_modified_bit(pmap);
 6487                 PG_RW = pmap_rw_bit(pmap);
 6488                 pde = pmap_pde(pmap, pv->pv_va);
 6489                 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
 6490                     " a 2mpage in page %p's pv list", m));
 6491                 pte = pmap_pde_to_pte(pde, pv->pv_va);
 6492                 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 6493                         atomic_clear_long(pte, PG_M);
 6494                         pmap_invalidate_page(pmap, pv->pv_va);
 6495                 }
 6496                 PMAP_UNLOCK(pmap);
 6497         }
 6498         rw_wunlock(lock);
 6499 }
 6500 
 6501 /*
 6502  * Miscellaneous support routines follow
 6503  */
 6504 
 6505 /* Adjust the cache mode for a 4KB page mapped via a PTE. */
 6506 static __inline void
 6507 pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask)
 6508 {
 6509         u_int opte, npte;
 6510 
 6511         /*
 6512          * The cache mode bits are all in the low 32 bits of the
 6513          * PTE, so we can just spin on updating the low 32 bits.
 6514          */
 6515         do {
 6516                 opte = *(u_int *)pte;
 6517                 npte = opte & ~mask;
 6518                 npte |= cache_bits;
 6519         } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
 6520 }
 6521 
 6522 /* Adjust the cache mode for a 2MB page mapped via a PDE. */
 6523 static __inline void
 6524 pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask)
 6525 {
 6526         u_int opde, npde;
 6527 
 6528         /*
 6529          * The cache mode bits are all in the low 32 bits of the
 6530          * PDE, so we can just spin on updating the low 32 bits.
 6531          */
 6532         do {
 6533                 opde = *(u_int *)pde;
 6534                 npde = opde & ~mask;
 6535                 npde |= cache_bits;
 6536         } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
 6537 }
 6538 
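/*
 * Illustrative userspace sketch (not part of pmap.c): pmap_pte_attr() and
 * pmap_pde_attr() update only the cache-control bits of an entry with a
 * compare-and-swap retry loop, leaving all other bits untouched and
 * skipping the store entirely when nothing would change.  The same idiom
 * with C11 atomics; demo_update_bits() is a hypothetical name and 'bits'
 * is assumed to lie within 'mask'.
 */
#include <stdatomic.h>
#include <stdint.h>

static void
demo_update_bits(_Atomic uint32_t *word, uint32_t bits, uint32_t mask)
{
        uint32_t oldv, newv;

        do {
                oldv = atomic_load(word);
                newv = (oldv & ~mask) | bits;   /* replace only masked bits */
        } while (newv != oldv &&
            !atomic_compare_exchange_weak(word, &oldv, newv));
}
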
 6539 /*
 6540  * Map a set of physical memory pages into the kernel virtual
 6541  * address space. Return a pointer to where it is mapped. This
 6542  * routine is intended to be used for mapping device memory,
 6543  * NOT real memory.
 6544  */
 6545 void *
 6546 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
 6547 {
 6548         struct pmap_preinit_mapping *ppim;
 6549         vm_offset_t va, offset;
 6550         vm_size_t tmpsize;
 6551         int i;
 6552 
 6553         offset = pa & PAGE_MASK;
 6554         size = round_page(offset + size);
 6555         pa = trunc_page(pa);
 6556 
 6557         if (!pmap_initialized) {
 6558                 va = 0;
 6559                 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 6560                         ppim = pmap_preinit_mapping + i;
 6561                         if (ppim->va == 0) {
 6562                                 ppim->pa = pa;
 6563                                 ppim->sz = size;
 6564                                 ppim->mode = mode;
 6565                                 ppim->va = virtual_avail;
 6566                                 virtual_avail += size;
 6567                                 va = ppim->va;
 6568                                 break;
 6569                         }
 6570                 }
 6571                 if (va == 0)
 6572                         panic("%s: too many preinit mappings", __func__);
 6573         } else {
 6574                 /*
 6575                  * If we have a preinit mapping, re-use it.
 6576                  */
 6577                 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 6578                         ppim = pmap_preinit_mapping + i;
 6579                         if (ppim->pa == pa && ppim->sz == size &&
 6580                             ppim->mode == mode)
 6581                                 return ((void *)(ppim->va + offset));
 6582                 }
 6583                 /*
 6584                  * If the specified range of physical addresses fits within
 6585                  * the direct map window, use the direct map.
 6586                  */
 6587                 if (pa < dmaplimit && pa + size < dmaplimit) {
 6588                         va = PHYS_TO_DMAP(pa);
 6589                         if (!pmap_change_attr(va, size, mode))
 6590                                 return ((void *)(va + offset));
 6591                 }
 6592                 va = kva_alloc(size);
 6593                 if (va == 0)
 6594                         panic("%s: Couldn't allocate KVA", __func__);
 6595         }
 6596         for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
 6597                 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
 6598         pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
 6599         pmap_invalidate_cache_range(va, va + tmpsize, FALSE);
 6600         return ((void *)(va + offset));
 6601 }
 6602 
 6603 void *
 6604 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
 6605 {
 6606 
 6607         return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
 6608 }
 6609 
 6610 void *
 6611 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
 6612 {
 6613 
 6614         return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
 6615 }
 6616 
 6617 void
 6618 pmap_unmapdev(vm_offset_t va, vm_size_t size)
 6619 {
 6620         struct pmap_preinit_mapping *ppim;
 6621         vm_offset_t offset;
 6622         int i;
 6623 
 6624         /* If pmap_mapdev() gave out a direct map region, do nothing. */
 6625         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
 6626                 return;
 6627         offset = va & PAGE_MASK;
 6628         size = round_page(offset + size);
 6629         va = trunc_page(va);
 6630         for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 6631                 ppim = pmap_preinit_mapping + i;
 6632                 if (ppim->va == va && ppim->sz == size) {
 6633                         if (pmap_initialized)
 6634                                 return;
 6635                         ppim->pa = 0;
 6636                         ppim->va = 0;
 6637                         ppim->sz = 0;
 6638                         ppim->mode = 0;
 6639                         if (va + size == virtual_avail)
 6640                                 virtual_avail = va;
 6641                         return;
 6642                 }
 6643         }
 6644         if (pmap_initialized)
 6645                 kva_free(va, size);
 6646 }
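
/*
 * Example usage of the mapping routines above (a sketch only; "sc",
 * "bar_pa" and "bar_size" are hypothetical driver state): a driver might
 * map a device register window write-combined at attach time and undo the
 * mapping at detach time:
 *
 *	sc->sc_regs = pmap_mapdev_attr(bar_pa, bar_size, PAT_WRITE_COMBINING);
 *	...
 *	pmap_unmapdev((vm_offset_t)sc->sc_regs, bar_size);
 *
 * pmap_mapdev() and pmap_mapbios() are convenience wrappers that pick
 * PAT_UNCACHEABLE and PAT_WRITE_BACK, respectively.
 */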
 6647 
 6648 /*
 6649  * Tries to demote a 1GB page mapping.
 6650  */
 6651 static boolean_t
 6652 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
 6653 {
 6654         pdp_entry_t newpdpe, oldpdpe;
 6655         pd_entry_t *firstpde, newpde, *pde;
 6656         pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 6657         vm_paddr_t mpdepa;
 6658         vm_page_t mpde;
 6659 
 6660         PG_A = pmap_accessed_bit(pmap);
 6661         PG_M = pmap_modified_bit(pmap);
 6662         PG_V = pmap_valid_bit(pmap);
 6663         PG_RW = pmap_rw_bit(pmap);
 6664 
 6665         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 6666         oldpdpe = *pdpe;
 6667         KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
 6668             ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
 6669         if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
 6670             VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 6671                 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
 6672                     " in pmap %p", va, pmap);
 6673                 return (FALSE);
 6674         }
 6675         mpdepa = VM_PAGE_TO_PHYS(mpde);
 6676         firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa);
 6677         newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
 6678         KASSERT((oldpdpe & PG_A) != 0,
 6679             ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
 6680         KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
 6681             ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
 6682         newpde = oldpdpe;
 6683 
 6684         /*
 6685          * Initialize the page directory page.
 6686          */
 6687         for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
 6688                 *pde = newpde;
 6689                 newpde += NBPDR;
 6690         }
 6691 
 6692         /*
 6693          * Demote the mapping.
 6694          */
 6695         *pdpe = newpdpe;
 6696 
 6697         /*
 6698          * Invalidate a stale recursive mapping of the page directory page.
 6699          */
 6700         pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
 6701 
 6702         pmap_pdpe_demotions++;
 6703         CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
 6704             " in pmap %p", va, pmap);
 6705         return (TRUE);
 6706 }
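
/*
 * To illustrate the demotion performed above: with NPDEPG == 512 and
 * NBPDR == 2MB, the newly initialized page directory page satisfies
 *
 *	firstpde[i] == oldpdpe + i * NBPDR,	i = 0, ..., 511,
 *
 * so the original 1GB physical range is re-described as 512 contiguous 2MB
 * mappings that carry the same attribute bits.  PG_PS remains set in each
 * entry, which is what marks a PDE as a 2MB mapping rather than a pointer
 * to a page table.
 */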
 6707 
 6708 /*
 6709  * Sets the memory attribute for the specified page.
 6710  */
 6711 void
 6712 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
 6713 {
 6714 
 6715         m->md.pat_mode = ma;
 6716 
 6717         /*
 6718          * If "m" is a normal page, update its direct mapping.  This update
 6719          * can be relied upon to perform any cache operations that are
 6720          * required for data coherence.
 6721          */
 6722         if ((m->flags & PG_FICTITIOUS) == 0 &&
 6723             pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
 6724             m->md.pat_mode))
 6725                 panic("memory attribute change on the direct map failed");
 6726 }
 6727 
 6728 /*
 6729  * Changes the specified virtual address range's memory type to that given by
 6730  * the parameter "mode".  The specified virtual address range must be
 6731  * completely contained within either the direct map or the kernel map.  If
 6732  * the virtual address range is contained within the kernel map, then the
 6733  * memory type for each of the corresponding ranges of the direct map is also
 6734  * changed.  (The corresponding ranges of the direct map are those ranges that
 6735  * map the same physical pages as the specified virtual address range.)  These
 6736  * changes to the direct map are necessary because Intel describes the
 6737  * behavior of their processors as "undefined" if two or more mappings to the
 6738  * same physical page have different memory types.
 6739  *
 6740  * Returns zero if the change completed successfully, and either EINVAL or
 6741  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
 6742  * of the virtual address range was not mapped, and ENOMEM is returned if
 6743  * there was insufficient memory available to complete the change.  In the
 6744  * latter case, the memory type may have been changed on some part of the
 6745  * virtual address range or the direct map.
 6746  */
 6747 int
 6748 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
 6749 {
 6750         int error;
 6751 
 6752         PMAP_LOCK(kernel_pmap);
 6753         error = pmap_change_attr_locked(va, size, mode);
 6754         PMAP_UNLOCK(kernel_pmap);
 6755         return (error);
 6756 }
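
/*
 * Example usage (a sketch; "fb_va" and "fb_size" are hypothetical): a
 * caller that has a framebuffer mapped in the kernel map and wants it
 * write-combined would do
 *
 *	error = pmap_change_attr(fb_va, fb_size, PAT_WRITE_COMBINING);
 *
 * and check for EINVAL (part of the range unmapped) or ENOMEM (a large
 * mapping could not be demoted), as described in the comment above.  The
 * corresponding direct map range is updated as part of the same call.
 */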
 6757 
 6758 static int
 6759 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
 6760 {
 6761         vm_offset_t base, offset, tmpva;
 6762         vm_paddr_t pa_start, pa_end, pa_end1;
 6763         pdp_entry_t *pdpe;
 6764         pd_entry_t *pde;
 6765         pt_entry_t *pte;
 6766         int cache_bits_pte, cache_bits_pde, error;
 6767         boolean_t changed;
 6768 
 6769         PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
 6770         base = trunc_page(va);
 6771         offset = va & PAGE_MASK;
 6772         size = round_page(offset + size);
 6773 
 6774         /*
 6775          * Only supported on kernel virtual addresses, including the direct
 6776          * map but excluding the recursive map.
 6777          */
 6778         if (base < DMAP_MIN_ADDRESS)
 6779                 return (EINVAL);
 6780 
 6781         cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1);
 6782         cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0);
 6783         changed = FALSE;
 6784 
 6785         /*
 6786          * Pages that aren't mapped aren't supported.  Also break down 2MB pages
 6787          * into 4KB pages if required.
 6788          */
 6789         for (tmpva = base; tmpva < base + size; ) {
 6790                 pdpe = pmap_pdpe(kernel_pmap, tmpva);
 6791                 if (pdpe == NULL || *pdpe == 0)
 6792                         return (EINVAL);
 6793                 if (*pdpe & PG_PS) {
 6794                         /*
 6795                          * If the current 1GB page already has the required
 6796                          * memory type, then we need not demote this page. Just
 6797                          * increment tmpva to the next 1GB page frame.
 6798                          */
 6799                         if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) {
 6800                                 tmpva = trunc_1gpage(tmpva) + NBPDP;
 6801                                 continue;
 6802                         }
 6803 
 6804                         /*
 6805                          * If the current offset aligns with a 1GB page frame
 6806                          * and there is at least 1GB left within the range, then
 6807                          * we need not break down this page into 2MB pages.
 6808                          */
 6809                         if ((tmpva & PDPMASK) == 0 &&
 6810                             tmpva + PDPMASK < base + size) {
 6811                                 tmpva += NBPDP;
 6812                                 continue;
 6813                         }
 6814                         if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
 6815                                 return (ENOMEM);
 6816                 }
 6817                 pde = pmap_pdpe_to_pde(pdpe, tmpva);
 6818                 if (*pde == 0)
 6819                         return (EINVAL);
 6820                 if (*pde & PG_PS) {
 6821                         /*
 6822                          * If the current 2MB page already has the required
 6823                          * memory type, then we need not demote this page. Just
 6824                          * increment tmpva to the next 2MB page frame.
 6825                          */
 6826                         if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) {
 6827                                 tmpva = trunc_2mpage(tmpva) + NBPDR;
 6828                                 continue;
 6829                         }
 6830 
 6831                         /*
 6832                          * If the current offset aligns with a 2MB page frame
 6833                          * and there is at least 2MB left within the range, then
 6834                          * we need not break down this page into 4KB pages.
 6835                          */
 6836                         if ((tmpva & PDRMASK) == 0 &&
 6837                             tmpva + PDRMASK < base + size) {
 6838                                 tmpva += NBPDR;
 6839                                 continue;
 6840                         }
 6841                         if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
 6842                                 return (ENOMEM);
 6843                 }
 6844                 pte = pmap_pde_to_pte(pde, tmpva);
 6845                 if (*pte == 0)
 6846                         return (EINVAL);
 6847                 tmpva += PAGE_SIZE;
 6848         }
 6849         error = 0;
 6850 
 6851         /*
 6852          * Ok, all the pages exist, so run through them updating their
 6853          * cache mode if required.
 6854          */
 6855         pa_start = pa_end = 0;
 6856         for (tmpva = base; tmpva < base + size; ) {
 6857                 pdpe = pmap_pdpe(kernel_pmap, tmpva);
 6858                 if (*pdpe & PG_PS) {
 6859                         if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) {
 6860                                 pmap_pde_attr(pdpe, cache_bits_pde,
 6861                                     X86_PG_PDE_CACHE);
 6862                                 changed = TRUE;
 6863                         }
 6864                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
 6865                             (*pdpe & PG_PS_FRAME) < dmaplimit) {
 6866                                 if (pa_start == pa_end) {
 6867                                         /* Start physical address run. */
 6868                                         pa_start = *pdpe & PG_PS_FRAME;
 6869                                         pa_end = pa_start + NBPDP;
 6870                                 } else if (pa_end == (*pdpe & PG_PS_FRAME))
 6871                                         pa_end += NBPDP;
 6872                                 else {
 6873                                         /* Run ended, update direct map. */
 6874                                         error = pmap_change_attr_locked(
 6875                                             PHYS_TO_DMAP(pa_start),
 6876                                             pa_end - pa_start, mode);
 6877                                         if (error != 0)
 6878                                                 break;
 6879                                         /* Start physical address run. */
 6880                                         pa_start = *pdpe & PG_PS_FRAME;
 6881                                         pa_end = pa_start + NBPDP;
 6882                                 }
 6883                         }
 6884                         tmpva = trunc_1gpage(tmpva) + NBPDP;
 6885                         continue;
 6886                 }
 6887                 pde = pmap_pdpe_to_pde(pdpe, tmpva);
 6888                 if (*pde & PG_PS) {
 6889                         if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) {
 6890                                 pmap_pde_attr(pde, cache_bits_pde,
 6891                                     X86_PG_PDE_CACHE);
 6892                                 changed = TRUE;
 6893                         }
 6894                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
 6895                             (*pde & PG_PS_FRAME) < dmaplimit) {
 6896                                 if (pa_start == pa_end) {
 6897                                         /* Start physical address run. */
 6898                                         pa_start = *pde & PG_PS_FRAME;
 6899                                         pa_end = pa_start + NBPDR;
 6900                                 } else if (pa_end == (*pde & PG_PS_FRAME))
 6901                                         pa_end += NBPDR;
 6902                                 else {
 6903                                         /* Run ended, update direct map. */
 6904                                         error = pmap_change_attr_locked(
 6905                                             PHYS_TO_DMAP(pa_start),
 6906                                             pa_end - pa_start, mode);
 6907                                         if (error != 0)
 6908                                                 break;
 6909                                         /* Start physical address run. */
 6910                                         pa_start = *pde & PG_PS_FRAME;
 6911                                         pa_end = pa_start + NBPDR;
 6912                                 }
 6913                         }
 6914                         tmpva = trunc_2mpage(tmpva) + NBPDR;
 6915                 } else {
 6916                         pte = pmap_pde_to_pte(pde, tmpva);
 6917                         if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) {
 6918                                 pmap_pte_attr(pte, cache_bits_pte,
 6919                                     X86_PG_PTE_CACHE);
 6920                                 changed = TRUE;
 6921                         }
 6922                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
 6923                             (*pte & PG_FRAME) < dmaplimit) {
 6924                                 if (pa_start == pa_end) {
 6925                                         /* Start physical address run. */
 6926                                         pa_start = *pte & PG_FRAME;
 6927                                         pa_end = pa_start + PAGE_SIZE;
 6928                                 } else if (pa_end == (*pte & PG_FRAME))
 6929                                         pa_end += PAGE_SIZE;
 6930                                 else {
 6931                                         /* Run ended, update direct map. */
 6932                                         error = pmap_change_attr_locked(
 6933                                             PHYS_TO_DMAP(pa_start),
 6934                                             pa_end - pa_start, mode);
 6935                                         if (error != 0)
 6936                                                 break;
 6937                                         /* Start physical address run. */
 6938                                         pa_start = *pte & PG_FRAME;
 6939                                         pa_end = pa_start + PAGE_SIZE;
 6940                                 }
 6941                         }
 6942                         tmpva += PAGE_SIZE;
 6943                 }
 6944         }
 6945         if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
 6946                 pa_end1 = MIN(pa_end, dmaplimit);
 6947                 if (pa_start != pa_end1)
 6948                         error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
 6949                             pa_end1 - pa_start, mode);
 6950         }
 6951 
 6952         /*
 6953          * Invalidate the TLB and flush the CPU caches if anything changed,
 6954          * so that no stale translations or wrongly cached data remain.
 6955          */
 6956         if (changed) {
 6957                 pmap_invalidate_range(kernel_pmap, base, tmpva);
 6958                 pmap_invalidate_cache_range(base, tmpva, FALSE);
 6959         }
 6960         return (error);
 6961 }
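
/*
 * A note on the second loop above: pa_start and pa_end accumulate a run of
 * physically contiguous pages so that the recursive call updating the
 * direct map is issued once per run instead of once per page.  For
 * example, three consecutive 4KB pages at physical addresses P, P + 4KB
 * and P + 8KB result in a single recursive pmap_change_attr_locked() call
 * covering [P, P + 12KB).
 */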
 6962 
 6963 /*
 6964  * Demotes any mapping within the direct map region that covers more than the
 6965  * specified range of physical addresses.  This range's size must be a power
 6966  * of two and its starting address must be a multiple of its size.  Since the
 6967  * demotion does not change any attributes of the mapping, a TLB invalidation
 6968  * is not mandatory.  The caller may, however, request a TLB invalidation.
 6969  */
 6970 void
 6971 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
 6972 {
 6973         pdp_entry_t *pdpe;
 6974         pd_entry_t *pde;
 6975         vm_offset_t va;
 6976         boolean_t changed;
 6977 
 6978         if (len == 0)
 6979                 return;
 6980         KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
 6981         KASSERT((base & (len - 1)) == 0,
 6982             ("pmap_demote_DMAP: base is not a multiple of len"));
 6983         if (len < NBPDP && base < dmaplimit) {
 6984                 va = PHYS_TO_DMAP(base);
 6985                 changed = FALSE;
 6986                 PMAP_LOCK(kernel_pmap);
 6987                 pdpe = pmap_pdpe(kernel_pmap, va);
 6988                 if ((*pdpe & X86_PG_V) == 0)
 6989                         panic("pmap_demote_DMAP: invalid PDPE");
 6990                 if ((*pdpe & PG_PS) != 0) {
 6991                         if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
 6992                                 panic("pmap_demote_DMAP: PDPE failed");
 6993                         changed = TRUE;
 6994                 }
 6995                 if (len < NBPDR) {
 6996                         pde = pmap_pdpe_to_pde(pdpe, va);
 6997                         if ((*pde & X86_PG_V) == 0)
 6998                                 panic("pmap_demote_DMAP: invalid PDE");
 6999                         if ((*pde & PG_PS) != 0) {
 7000                                 if (!pmap_demote_pde(kernel_pmap, pde, va))
 7001                                         panic("pmap_demote_DMAP: PDE failed");
 7002                                 changed = TRUE;
 7003                         }
 7004                 }
 7005                 if (changed && invalidate)
 7006                         pmap_invalidate_page(kernel_pmap, va);
 7007                 PMAP_UNLOCK(kernel_pmap);
 7008         }
 7009 }
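
/*
 * Example usage (a sketch; "pa" is a hypothetical physical address): to
 * ensure that a particular 4KB page of the direct map is not covered by a
 * 2MB or 1GB mapping, a caller may invoke
 *
 *	pmap_demote_DMAP(trunc_page(pa), PAGE_SIZE, TRUE);
 *
 * PAGE_SIZE is a power of two and trunc_page(pa) is a multiple of it, so
 * the assertions above are satisfied; passing TRUE requests the optional
 * TLB invalidation.
 */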
 7010 
 7011 /*
 7012  * perform the pmap work for mincore
 7013  */
 7014 int
 7015 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
 7016 {
 7017         pd_entry_t *pdep;
 7018         pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
 7019         vm_paddr_t pa;
 7020         int val;
 7021 
 7022         PG_A = pmap_accessed_bit(pmap);
 7023         PG_M = pmap_modified_bit(pmap);
 7024         PG_V = pmap_valid_bit(pmap);
 7025         PG_RW = pmap_rw_bit(pmap);
 7026 
 7027         PMAP_LOCK(pmap);
 7028 retry:
 7029         pdep = pmap_pde(pmap, addr);
 7030         if (pdep != NULL && (*pdep & PG_V)) {
 7031                 if (*pdep & PG_PS) {
 7032                         pte = *pdep;
 7033                         /* Compute the physical address of the 4KB page. */
 7034                         pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
 7035                             PG_FRAME;
 7036                         val = MINCORE_SUPER;
 7037                 } else {
 7038                         pte = *pmap_pde_to_pte(pdep, addr);
 7039                         pa = pte & PG_FRAME;
 7040                         val = 0;
 7041                 }
 7042         } else {
 7043                 pte = 0;
 7044                 pa = 0;
 7045                 val = 0;
 7046         }
 7047         if ((pte & PG_V) != 0) {
 7048                 val |= MINCORE_INCORE;
 7049                 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 7050                         val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
 7051                 if ((pte & PG_A) != 0)
 7052                         val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
 7053         }
 7054         if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
 7055             (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
 7056             (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
 7057                 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
 7058                 if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
 7059                         goto retry;
 7060         } else
 7061                 PA_UNLOCK_COND(*locked_pa);
 7062         PMAP_UNLOCK(pmap);
 7063         return (val);
 7064 }
 7065 
 7066 static uint64_t
 7067 pmap_pcid_alloc(pmap_t pmap, u_int cpuid)
 7068 {
 7069         uint32_t gen, new_gen, pcid_next;
 7070 
 7071         CRITICAL_ASSERT(curthread);
 7072         gen = PCPU_GET(pcid_gen);
 7073         if (!pti && (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN ||
 7074             pmap->pm_pcids[cpuid].pm_gen == gen))
 7075                 return (CR3_PCID_SAVE);
 7076         pcid_next = PCPU_GET(pcid_next);
 7077         KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) ||
 7078             (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN),
 7079             ("cpu %d pcid_next %#x", cpuid, pcid_next));
 7080         if ((!pti && pcid_next == PMAP_PCID_OVERMAX) ||
 7081             (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) {
 7082                 new_gen = gen + 1;
 7083                 if (new_gen == 0)
 7084                         new_gen = 1;
 7085                 PCPU_SET(pcid_gen, new_gen);
 7086                 pcid_next = PMAP_PCID_KERN + 1;
 7087         } else {
 7088                 new_gen = gen;
 7089         }
 7090         pmap->pm_pcids[cpuid].pm_pcid = pcid_next;
 7091         pmap->pm_pcids[cpuid].pm_gen = new_gen;
 7092         PCPU_SET(pcid_next, pcid_next + 1);
 7093         return (0);
 7094 }
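
/*
 * A note on the return values above: CR3_PCID_SAVE means that the pmap's
 * PCID on this CPU is still valid for the current generation, so the
 * caller may reload %cr3 without flushing the TLB entries tagged with that
 * PCID.  A return of 0 means a fresh PCID was assigned (possibly after
 * bumping the per-CPU generation), so the TLB contents for it must be
 * treated as stale.
 */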
 7095 
 7096 void
 7097 pmap_activate_sw(struct thread *td)
 7098 {
 7099         pmap_t oldpmap, pmap;
 7100         struct invpcid_descr d;
 7101         uint64_t cached, cr3, kcr3, ucr3;
 7102         register_t rflags;
 7103         u_int cpuid;
 7104 
 7105         oldpmap = PCPU_GET(curpmap);
 7106         pmap = vmspace_pmap(td->td_proc->p_vmspace);
 7107         if (oldpmap == pmap)
 7108                 return;
 7109         cpuid = PCPU_GET(cpuid);
 7110 #ifdef SMP
 7111         CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
 7112 #else
 7113         CPU_SET(cpuid, &pmap->pm_active);
 7114 #endif
 7115         cr3 = rcr3();
 7116         if (pmap_pcid_enabled) {
 7117                 cached = pmap_pcid_alloc(pmap, cpuid);
 7118                 KASSERT(pmap->pm_pcids[cpuid].pm_pcid >= 0 &&
 7119                     pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX,
 7120                     ("pmap %p cpu %d pcid %#x", pmap, cpuid,
 7121                     pmap->pm_pcids[cpuid].pm_pcid));
 7122                 KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN ||
 7123                     pmap == kernel_pmap,
 7124                     ("non-kernel pmap thread %p pmap %p cpu %d pcid %#x",
 7125                     td, pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid));
 7126 
 7127                 /*
 7128                  * If the INVPCID instruction is not available,
 7129                  * invltlb_pcid_handler() is used to handle the
 7130                  * invalidate_all IPI, which checks for curpmap ==
 7131                  * smp_tlb_pmap.  The sequence of operations below
 7132                  * has a window where %CR3 is loaded with the new
 7133                  * pmap's PML4 address, but curpmap is not yet
 7134                  * updated.  This causes the invltlb IPI handler,
 7135                  * called between the updates, to execute as a NOP,
 7136                  * which leaves stale TLB entries.
 7137                  *
 7138                  * Note that the most typical use of
 7139                  * pmap_activate_sw(), from the context switch, is
 7140                  * immune to this race, because interrupts are
 7141                  * disabled (while the thread lock is owned), and the
 7142                  * IPI happens after curpmap is updated.  Protect
 7143                  * other callers in a similar way, by disabling
 7144                  * interrupts around the %cr3 register reload and the
 7145                  * curpmap assignment.
 7146                  */
 7147                 if (!invpcid_works)
 7148                         rflags = intr_disable();
 7149 
 7150                 if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) {
 7151                         load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid |
 7152                             cached);
 7153                         if (cached)
 7154                                 PCPU_INC(pm_save_cnt);
 7155                 }
 7156                 PCPU_SET(curpmap, pmap);
 7157                 if (pti) {
 7158                         kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid;
 7159                         ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid |
 7160                             PMAP_PCID_USER_PT;
 7161 
 7162                         /*
 7163                          * Manually invalidate translations cached
 7164                          * from the user page table, which are not
 7165                          * flushed by reload of cr3 with the kernel
 7166                          * page table pointer above.
 7167                          */
 7168                         if (pmap->pm_ucr3 != PMAP_NO_CR3) {
 7169                                 if (invpcid_works) {
 7170                                         d.pcid = PMAP_PCID_USER_PT |
 7171                                             pmap->pm_pcids[cpuid].pm_pcid;
 7172                                         d.pad = 0;
 7173                                         d.addr = 0;
 7174                                         invpcid(&d, INVPCID_CTX);
 7175                                 } else {
 7176                                         pmap_pti_pcid_invalidate(ucr3, kcr3);
 7177                                 }
 7178                         }
 7179 
 7180                         PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE);
 7181                         PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE);
 7182                 }
 7183                 if (!invpcid_works)
 7184                         intr_restore(rflags);
 7185         } else if (cr3 != pmap->pm_cr3) {
 7186                 load_cr3(pmap->pm_cr3);
 7187                 PCPU_SET(curpmap, pmap);
 7188                 if (pti) {
 7189                         PCPU_SET(kcr3, pmap->pm_cr3);
 7190                         PCPU_SET(ucr3, pmap->pm_ucr3);
 7191                 }
 7192         }
 7193 #ifdef SMP
 7194         CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
 7195 #else
 7196         CPU_CLR(cpuid, &oldpmap->pm_active);
 7197 #endif
 7198 }
 7199 
 7200 void
 7201 pmap_activate(struct thread *td)
 7202 {
 7203 
 7204         critical_enter();
 7205         pmap_activate_sw(td);
 7206         critical_exit();
 7207 }
 7208 
 7209 void
 7210 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
 7211 {
 7212 }
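
/*
 * The empty body above is intentional: amd64 hardware keeps the
 * instruction cache coherent with data stores, so no explicit icache
 * synchronization is required here.
 */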
 7213 
 7214 /*
 7215  *      Increase the starting virtual address of the given mapping if a
 7216  *      different alignment might result in more superpage mappings.
 7217  */
 7218 void
 7219 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
 7220     vm_offset_t *addr, vm_size_t size)
 7221 {
 7222         vm_offset_t superpage_offset;
 7223 
 7224         if (size < NBPDR)
 7225                 return;
 7226         if (object != NULL && (object->flags & OBJ_COLORED) != 0)
 7227                 offset += ptoa(object->pg_color);
 7228         superpage_offset = offset & PDRMASK;
 7229         if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
 7230             (*addr & PDRMASK) == superpage_offset)
 7231                 return;
 7232         if ((*addr & PDRMASK) < superpage_offset)
 7233                 *addr = (*addr & ~PDRMASK) + superpage_offset;
 7234         else
 7235                 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
 7236 }
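
/*
 * A worked example of the adjustment above (made-up numbers, assuming the
 * object contributes no color adjustment): with NBPDR == 2MB, suppose
 * offset == 0x1ab000, so superpage_offset == 0x1ab000, and the caller
 * proposes *addr == 0x800000 with size == 0x400000.  Since
 * (*addr & PDRMASK) == 0 is less than superpage_offset, *addr becomes
 * 0x9ab000.  Object offset 0x200000 then falls at virtual address
 * 0xa00000, so both are 2MB aligned and the aligned portion of the range
 * is eligible for superpage mappings.
 */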
 7237 
 7238 #ifdef INVARIANTS
 7239 static unsigned long num_dirty_emulations;
 7240 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
 7241              &num_dirty_emulations, 0, NULL);
 7242 
 7243 static unsigned long num_accessed_emulations;
 7244 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
 7245              &num_accessed_emulations, 0, NULL);
 7246 
 7247 static unsigned long num_superpage_accessed_emulations;
 7248 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW,
 7249              &num_superpage_accessed_emulations, 0, NULL);
 7250 
 7251 static unsigned long ad_emulation_superpage_promotions;
 7252 SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW,
 7253              &ad_emulation_superpage_promotions, 0, NULL);
 7254 #endif  /* INVARIANTS */
 7255 
 7256 int
 7257 pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
 7258 {
 7259         int rv;
 7260         struct rwlock *lock;
 7261         vm_page_t m, mpte;
 7262         pd_entry_t *pde;
 7263         pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
 7264 
 7265         KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
 7266             ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));
 7267 
 7268         if (!pmap_emulate_ad_bits(pmap))
 7269                 return (-1);
 7270 
 7271         PG_A = pmap_accessed_bit(pmap);
 7272         PG_M = pmap_modified_bit(pmap);
 7273         PG_V = pmap_valid_bit(pmap);
 7274         PG_RW = pmap_rw_bit(pmap);
 7275 
 7276         rv = -1;
 7277         lock = NULL;
 7278         PMAP_LOCK(pmap);
 7279 
 7280         pde = pmap_pde(pmap, va);
 7281         if (pde == NULL || (*pde & PG_V) == 0)
 7282                 goto done;
 7283 
 7284         if ((*pde & PG_PS) != 0) {
 7285                 if (ftype == VM_PROT_READ) {
 7286 #ifdef INVARIANTS
 7287                         atomic_add_long(&num_superpage_accessed_emulations, 1);
 7288 #endif
 7289                         *pde |= PG_A;
 7290                         rv = 0;
 7291                 }
 7292                 goto done;
 7293         }
 7294 
 7295         pte = pmap_pde_to_pte(pde, va);
 7296         if ((*pte & PG_V) == 0)
 7297                 goto done;
 7298 
 7299         if (ftype == VM_PROT_WRITE) {
 7300                 if ((*pte & PG_RW) == 0)
 7301                         goto done;
 7302                 /*
 7303                  * Set the modified and accessed bits simultaneously.
 7304                  *
 7305                  * Intel EPT PTEs that do software emulation of A/D bits map
 7306                  * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively.
 7307                  * An EPT misconfiguration is triggered if the PTE is writable
 7308                  * but not readable (WR=10). This is avoided by setting PG_A
 7309                  * and PG_M simultaneously.
 7310                  */
 7311                 *pte |= PG_M | PG_A;
 7312         } else {
 7313                 *pte |= PG_A;
 7314         }
 7315 
 7316         /* try to promote the mapping */
 7317         if (va < VM_MAXUSER_ADDRESS)
 7318                 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 7319         else
 7320                 mpte = NULL;
 7321 
 7322         m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
 7323 
 7324         if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
 7325             pmap_ps_enabled(pmap) &&
 7326             (m->flags & PG_FICTITIOUS) == 0 &&
 7327             vm_reserv_level_iffullpop(m) == 0) {
 7328                 pmap_promote_pde(pmap, pde, va, &lock);
 7329 #ifdef INVARIANTS
 7330                 atomic_add_long(&ad_emulation_superpage_promotions, 1);
 7331 #endif
 7332         }
 7333 #ifdef INVARIANTS
 7334         if (ftype == VM_PROT_WRITE)
 7335                 atomic_add_long(&num_dirty_emulations, 1);
 7336         else
 7337                 atomic_add_long(&num_accessed_emulations, 1);
 7338 #endif
 7339         rv = 0;         /* success */
 7340 done:
 7341         if (lock != NULL)
 7342                 rw_wunlock(lock);
 7343         PMAP_UNLOCK(pmap);
 7344         return (rv);
 7345 }
 7346 
 7347 void
 7348 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
 7349 {
 7350         pml4_entry_t *pml4;
 7351         pdp_entry_t *pdp;
 7352         pd_entry_t *pde;
 7353         pt_entry_t *pte, PG_V;
 7354         int idx;
 7355 
 7356         idx = 0;
 7357         PG_V = pmap_valid_bit(pmap);
 7358         PMAP_LOCK(pmap);
 7359 
 7360         pml4 = pmap_pml4e(pmap, va);
 7361         ptr[idx++] = *pml4;
 7362         if ((*pml4 & PG_V) == 0)
 7363                 goto done;
 7364 
 7365         pdp = pmap_pml4e_to_pdpe(pml4, va);
 7366         ptr[idx++] = *pdp;
 7367         if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0)
 7368                 goto done;
 7369 
 7370         pde = pmap_pdpe_to_pde(pdp, va);
 7371         ptr[idx++] = *pde;
 7372         if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
 7373                 goto done;
 7374 
 7375         pte = pmap_pde_to_pte(pde, va);
 7376         ptr[idx++] = *pte;
 7377 
 7378 done:
 7379         PMAP_UNLOCK(pmap);
 7380         *num = idx;
 7381 }
 7382 
 7383 /**
 7384  * Get the kernel virtual address of a set of physical pages. If there are
 7385  * physical addresses not covered by the DMAP, perform a transient mapping
 7386  * that will be removed when calling pmap_unmap_io_transient.
 7387  *
 7388  * \param page        The pages for which the caller wishes to obtain
 7389  *                    kernel virtual addresses.
 7390  * \param vaddr       On return, contains the kernel virtual addresses
 7391  *                    of the pages passed in the page parameter.
 7392  * \param count       Number of pages passed in.
 7393  * \param can_fault   TRUE if the thread using the mapped pages can take
 7394  *                    page faults, FALSE otherwise.
 7395  *
 7396  * \returns TRUE if the caller must call pmap_unmap_io_transient when
 7397  *          finished or FALSE otherwise.
 7398  *
 7399  */
 7400 boolean_t
 7401 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
 7402     boolean_t can_fault)
 7403 {
 7404         vm_paddr_t paddr;
 7405         boolean_t needs_mapping;
 7406         pt_entry_t *pte;
 7407         int cache_bits, error, i;
 7408 
 7409         /*
 7410          * Allocate any KVA space that we need.  This is done in a separate
 7411          * loop to avoid calling vmem_alloc() while the thread is pinned.
 7412          */
 7413         needs_mapping = FALSE;
 7414         for (i = 0; i < count; i++) {
 7415                 paddr = VM_PAGE_TO_PHYS(page[i]);
 7416                 if (__predict_false(paddr >= dmaplimit)) {
 7417                         error = vmem_alloc(kernel_arena, PAGE_SIZE,
 7418                             M_BESTFIT | M_WAITOK, &vaddr[i]);
 7419                         KASSERT(error == 0, ("vmem_alloc failed: %d", error));
 7420                         needs_mapping = TRUE;
 7421                 } else {
 7422                         vaddr[i] = PHYS_TO_DMAP(paddr);
 7423                 }
 7424         }
 7425 
 7426         /* Exit early if everything is covered by the DMAP */
 7427         if (!needs_mapping)
 7428                 return (FALSE);
 7429 
 7430         /*
 7431          * NB:  The sequence of updating a page table followed by accesses
 7432          * to the corresponding pages used in the !DMAP case is subject to
 7433          * the situation described in the "AMD64 Architecture Programmer's
 7434          * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
 7435          * Coherency Considerations".  Therefore, issuing the INVLPG right
 7436          * after modifying the PTE bits is crucial.
 7437          */
 7438         if (!can_fault)
 7439                 sched_pin();
 7440         for (i = 0; i < count; i++) {
 7441                 paddr = VM_PAGE_TO_PHYS(page[i]);
 7442                 if (paddr >= dmaplimit) {
 7443                         if (can_fault) {
 7444                                 /*
 7445                                  * Slow path: since we can take page faults
 7446                                  * while the mappings are active, don't pin
 7447                                  * the thread to the CPU; instead, add a
 7448                                  * global mapping visible to all CPUs.
 7449                                  */
 7450                                 pmap_qenter(vaddr[i], &page[i], 1);
 7451                         } else {
 7452                                 pte = vtopte(vaddr[i]);
 7453                                 cache_bits = pmap_cache_bits(kernel_pmap,
 7454                                     page[i]->md.pat_mode, 0);
 7455                                 pte_store(pte, paddr | X86_PG_RW | X86_PG_V |
 7456                                     cache_bits);
 7457                                 invlpg(vaddr[i]);
 7458                         }
 7459                 }
 7460         }
 7461 
 7462         return (needs_mapping);
 7463 }
 7464 
 7465 void
 7466 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
 7467     boolean_t can_fault)
 7468 {
 7469         vm_paddr_t paddr;
 7470         int i;
 7471 
 7472         if (!can_fault)
 7473                 sched_unpin();
 7474         for (i = 0; i < count; i++) {
 7475                 paddr = VM_PAGE_TO_PHYS(page[i]);
 7476                 if (paddr >= dmaplimit) {
 7477                         if (can_fault)
 7478                                 pmap_qremove(vaddr[i], 1);
 7479                         vmem_free(kernel_arena, vaddr[i], PAGE_SIZE);
 7480                 }
 7481         }
 7482 }
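
/*
 * Example usage of the two routines above (a sketch; "m", "va" and "n" are
 * hypothetical):
 *
 *	boolean_t mapped;
 *
 *	mapped = pmap_map_io_transient(m, va, n, FALSE);
 *	... access the pages through va[0 .. n-1]; with can_fault == FALSE
 *	... the thread is pinned and must not fault or sleep ...
 *	if (mapped)
 *		pmap_unmap_io_transient(m, va, n, FALSE);
 *
 * The return value of pmap_map_io_transient() gates the unmap call, as the
 * function comment requires.
 */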
 7483 
 7484 vm_offset_t
 7485 pmap_quick_enter_page(vm_page_t m)
 7486 {
 7487         vm_paddr_t paddr;
 7488 
 7489         paddr = VM_PAGE_TO_PHYS(m);
 7490         if (paddr < dmaplimit)
 7491                 return (PHYS_TO_DMAP(paddr));
 7492         mtx_lock_spin(&qframe_mtx);
 7493         KASSERT(*vtopte(qframe) == 0, ("qframe busy"));
 7494         pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A |
 7495             X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0));
 7496         return (qframe);
 7497 }
 7498 
 7499 void
 7500 pmap_quick_remove_page(vm_offset_t addr)
 7501 {
 7502 
 7503         if (addr != qframe)
 7504                 return;
 7505         pte_store(vtopte(qframe), 0);
 7506         invlpg(qframe);
 7507         mtx_unlock_spin(&qframe_mtx);
 7508 }
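
/*
 * Example usage of the quick-enter pair above (a sketch; "m" and "buf" are
 * hypothetical): a short-lived mapping of a single page for a brief copy:
 *
 *	vm_offset_t qva;
 *
 *	qva = pmap_quick_enter_page(m);
 *	memcpy(buf, (void *)qva, PAGE_SIZE);
 *	pmap_quick_remove_page(qva);
 *
 * For pages below dmaplimit the direct map address is returned and the
 * remove call is a no-op; otherwise the single qframe slot is held under
 * qframe_mtx, so the mapping must be released promptly.
 */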
 7509 
 7510 static vm_page_t
 7511 pmap_pti_alloc_page(void)
 7512 {
 7513         vm_page_t m;
 7514 
 7515         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 7516         m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_NOBUSY |
 7517             VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 7518         return (m);
 7519 }
 7520 
 7521 static bool
 7522 pmap_pti_free_page(vm_page_t m)
 7523 {
 7524 
 7525         KASSERT(m->wire_count > 0, ("page %p not wired", m));
 7526         m->wire_count--;
 7527         if (m->wire_count != 0)
 7528                 return (false);
 7529         atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 7530         vm_page_free_zero(m);
 7531         return (true);
 7532 }
 7533 
 7534 static void
 7535 pmap_pti_init(void)
 7536 {
 7537         vm_page_t pml4_pg;
 7538         pdp_entry_t *pdpe;
 7539         vm_offset_t va;
 7540         int i;
 7541 
 7542         if (!pti)
 7543                 return;
 7544         pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL);
 7545         VM_OBJECT_WLOCK(pti_obj);
 7546         pml4_pg = pmap_pti_alloc_page();
 7547         pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg));
 7548         for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS &&
 7549             va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) {
 7550                 pdpe = pmap_pti_pdpe(va);
 7551                 pmap_pti_wire_pte(pdpe);
 7552         }
 7553         pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0],
 7554             (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false);
 7555         pmap_pti_add_kva_locked((vm_offset_t)gdt, (vm_offset_t)gdt +
 7556             sizeof(struct user_segment_descriptor) * NGDT * MAXCPU, false);
 7557         pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt +
 7558             sizeof(struct gate_descriptor) * NIDT, false);
 7559         pmap_pti_add_kva_locked((vm_offset_t)common_tss,
 7560             (vm_offset_t)common_tss + sizeof(struct amd64tss) * MAXCPU, false);
 7561         CPU_FOREACH(i) {
 7562                 /* Doublefault stack IST 1 */
 7563                 va = common_tss[i].tss_ist1;
 7564                 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
 7565                 /* NMI stack IST 2 */
 7566                 va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu);
 7567                 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
 7568                 /* MC# stack IST 3 */
 7569                 va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu);
 7570                 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
 7571                 /* DB# stack IST 4 */
 7572                 va = common_tss[i].tss_ist4 + sizeof(struct nmi_pcpu);
 7573                 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
 7574         }
 7575         pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE,
 7576             (vm_offset_t)etext, true);
 7577         pti_finalized = true;
 7578         VM_OBJECT_WUNLOCK(pti_obj);
 7579 }
 7580 SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL);
 7581 
 7582 static pdp_entry_t *
 7583 pmap_pti_pdpe(vm_offset_t va)
 7584 {
 7585         pml4_entry_t *pml4e;
 7586         pdp_entry_t *pdpe;
 7587         vm_page_t m;
 7588         vm_pindex_t pml4_idx;
 7589         vm_paddr_t mphys;
 7590 
 7591         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 7592 
 7593         pml4_idx = pmap_pml4e_index(va);
 7594         pml4e = &pti_pml4[pml4_idx];
 7595         m = NULL;
 7596         if (*pml4e == 0) {
 7597                 if (pti_finalized)
 7598                         panic("pml4 alloc after finalization\n");
 7599                 m = pmap_pti_alloc_page();
 7600                 if (*pml4e != 0) {
 7601                         pmap_pti_free_page(m);
 7602                         mphys = *pml4e & ~PAGE_MASK;
 7603                 } else {
 7604                         mphys = VM_PAGE_TO_PHYS(m);
 7605                         *pml4e = mphys | X86_PG_RW | X86_PG_V;
 7606                 }
 7607         } else {
 7608                 mphys = *pml4e & ~PAGE_MASK;
 7609         }
 7610         pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va);
 7611         return (pdpe);
 7612 }
 7613 
 7614 static void
 7615 pmap_pti_wire_pte(void *pte)
 7616 {
 7617         vm_page_t m;
 7618 
 7619         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 7620         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
 7621         m->wire_count++;
 7622 }
 7623 
 7624 static void
 7625 pmap_pti_unwire_pde(void *pde, bool only_ref)
 7626 {
 7627         vm_page_t m;
 7628 
 7629         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 7630         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde));
 7631         MPASS(m->wire_count > 0);
 7632         MPASS(only_ref || m->wire_count > 1);
 7633         pmap_pti_free_page(m);
 7634 }
 7635 
 7636 static void
 7637 pmap_pti_unwire_pte(void *pte, vm_offset_t va)
 7638 {
 7639         vm_page_t m;
 7640         pd_entry_t *pde;
 7641 
 7642         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 7643         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
 7644         MPASS(m->wire_count > 0);
 7645         if (pmap_pti_free_page(m)) {
 7646                 pde = pmap_pti_pde(va);
 7647                 MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V);
 7648                 *pde = 0;
 7649                 pmap_pti_unwire_pde(pde, false);
 7650         }
 7651 }
 7652 
 7653 static pd_entry_t *
 7654 pmap_pti_pde(vm_offset_t va)
 7655 {
 7656         pdp_entry_t *pdpe;
 7657         pd_entry_t *pde;
 7658         vm_page_t m;
 7659         vm_pindex_t pd_idx;
 7660         vm_paddr_t mphys;
 7661 
 7662         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 7663 
 7664         pdpe = pmap_pti_pdpe(va);
 7665         if (*pdpe == 0) {
 7666                 m = pmap_pti_alloc_page();
 7667                 if (*pdpe != 0) {
 7668                         pmap_pti_free_page(m);
 7669                         MPASS((*pdpe & X86_PG_PS) == 0);
 7670                         mphys = *pdpe & ~PAGE_MASK;
 7671                 } else {
 7672                         mphys = VM_PAGE_TO_PHYS(m);
 7673                         *pdpe = mphys | X86_PG_RW | X86_PG_V;
 7674                 }
 7675         } else {
 7676                 MPASS((*pdpe & X86_PG_PS) == 0);
 7677                 mphys = *pdpe & ~PAGE_MASK;
 7678         }
 7679 
 7680         pde = (pd_entry_t *)PHYS_TO_DMAP(mphys);
 7681         pd_idx = pmap_pde_index(va);
 7682         pde += pd_idx;
 7683         return (pde);
 7684 }
 7685 
 7686 static pt_entry_t *
 7687 pmap_pti_pte(vm_offset_t va, bool *unwire_pde)
 7688 {
 7689         pd_entry_t *pde;
 7690         pt_entry_t *pte;
 7691         vm_page_t m;
 7692         vm_paddr_t mphys;
 7693 
 7694         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 7695 
 7696         pde = pmap_pti_pde(va);
 7697         if (unwire_pde != NULL) {
 7698                 *unwire_pde = true;
 7699                 pmap_pti_wire_pte(pde);
 7700         }
 7701         if (*pde == 0) {
 7702                 m = pmap_pti_alloc_page();
 7703                 if (*pde != 0) {
 7704                         pmap_pti_free_page(m);
 7705                         MPASS((*pde & X86_PG_PS) == 0);
 7706                         mphys = *pde & ~(PAGE_MASK | pg_nx);
 7707                 } else {
 7708                         mphys = VM_PAGE_TO_PHYS(m);
 7709                         *pde = mphys | X86_PG_RW | X86_PG_V;
 7710                         if (unwire_pde != NULL)
 7711                                 *unwire_pde = false;
 7712                 }
 7713         } else {
 7714                 MPASS((*pde & X86_PG_PS) == 0);
 7715                 mphys = *pde & ~(PAGE_MASK | pg_nx);
 7716         }
 7717 
 7718         pte = (pt_entry_t *)PHYS_TO_DMAP(mphys);
 7719         pte += pmap_pte_index(va);
 7720 
 7721         return (pte);
 7722 }
 7723 
 7724 static void
 7725 pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec)
 7726 {
 7727         vm_paddr_t pa;
 7728         pd_entry_t *pde;
 7729         pt_entry_t *pte, ptev;
 7730         bool unwire_pde;
 7731 
 7732         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 7733 
 7734         sva = trunc_page(sva);
 7735         MPASS(sva > VM_MAXUSER_ADDRESS);
 7736         eva = round_page(eva);
 7737         MPASS(sva < eva);
 7738         for (; sva < eva; sva += PAGE_SIZE) {
 7739                 pte = pmap_pti_pte(sva, &unwire_pde);
 7740                 pa = pmap_kextract(sva);
 7741                 ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A |
 7742                     (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap,
 7743                     VM_MEMATTR_DEFAULT, FALSE);
 7744                 if (*pte == 0) {
 7745                         pte_store(pte, ptev);
 7746                         pmap_pti_wire_pte(pte);
 7747                 } else {
 7748                         KASSERT(!pti_finalized,
 7749                             ("pti overlap after fin %#lx %#lx %#lx",
 7750                             sva, *pte, ptev));
 7751                         KASSERT(*pte == ptev,
 7752                             ("pti non-identical pte after fin %#lx %#lx %#lx",
 7753                             sva, *pte, ptev));
 7754                 }
 7755                 if (unwire_pde) {
 7756                         pde = pmap_pti_pde(sva);
 7757                         pmap_pti_unwire_pde(pde, true);
 7758                 }
 7759         }
 7760 }
 7761 
 7762 void
 7763 pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec)
 7764 {
 7765 
 7766         if (!pti)
 7767                 return;
 7768         VM_OBJECT_WLOCK(pti_obj);
 7769         pmap_pti_add_kva_locked(sva, eva, exec);
 7770         VM_OBJECT_WUNLOCK(pti_obj);
 7771 }
 7772 
 7773 void
 7774 pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva)
 7775 {
 7776         pt_entry_t *pte;
 7777         vm_offset_t va;
 7778 
 7779         if (!pti)
 7780                 return;
 7781         sva = rounddown2(sva, PAGE_SIZE);
 7782         MPASS(sva > VM_MAXUSER_ADDRESS);
 7783         eva = roundup2(eva, PAGE_SIZE);
 7784         MPASS(sva < eva);
 7785         VM_OBJECT_WLOCK(pti_obj);
 7786         for (va = sva; va < eva; va += PAGE_SIZE) {
 7787                 pte = pmap_pti_pte(va, NULL);
 7788                 KASSERT((*pte & X86_PG_V) != 0,
 7789                     ("invalid pte va %#lx pte %#lx pt %#lx", va,
 7790                     (u_long)pte, *pte));
 7791                 pte_clear(pte);
 7792                 pmap_pti_unwire_pte(pte, va);
 7793         }
 7794         pmap_invalidate_range(kernel_pmap, sva, eva);
 7795         VM_OBJECT_WUNLOCK(pti_obj);
 7796 }
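
/*
 * Example usage (a sketch; "tramp_va" is hypothetical): kernel text or data
 * that must stay visible in the PTI user page table, such as an entry
 * trampoline page allocated after boot, is exposed and later hidden with
 *
 *	pmap_pti_add_kva(tramp_va, tramp_va + PAGE_SIZE, true);
 *	...
 *	pmap_pti_remove_kva(tramp_va, tramp_va + PAGE_SIZE);
 *
 * Both routines are no-ops when PTI is disabled.
 */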
 7797 
 7798 #include "opt_ddb.h"
 7799 #ifdef DDB
 7800 #include <ddb/ddb.h>
 7801 
 7802 DB_SHOW_COMMAND(pte, pmap_print_pte)
 7803 {
 7804         pmap_t pmap;
 7805         pml4_entry_t *pml4;
 7806         pdp_entry_t *pdp;
 7807         pd_entry_t *pde;
 7808         pt_entry_t *pte, PG_V;
 7809         vm_offset_t va;
 7810 
 7811         if (have_addr) {
 7812                 va = (vm_offset_t)addr;
 7813                 pmap = PCPU_GET(curpmap); /* XXX */
 7814         } else {
 7815                 db_printf("show pte addr\n");
 7816                 return;
 7817         }
 7818         PG_V = pmap_valid_bit(pmap);
 7819         pml4 = pmap_pml4e(pmap, va);
 7820         db_printf("VA %#016lx pml4e %#016lx", va, *pml4);
 7821         if ((*pml4 & PG_V) == 0) {
 7822                 db_printf("\n");
 7823                 return;
 7824         }
 7825         pdp = pmap_pml4e_to_pdpe(pml4, va);
 7826         db_printf(" pdpe %#016lx", *pdp);
 7827         if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
 7828                 db_printf("\n");
 7829                 return;
 7830         }
 7831         pde = pmap_pdpe_to_pde(pdp, va);
 7832         db_printf(" pde %#016lx", *pde);
 7833         if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
 7834                 db_printf("\n");
 7835                 return;
 7836         }
 7837         pte = pmap_pde_to_pte(pde, va);
 7838         db_printf(" pte %#016lx\n", *pte);
 7839 }
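
/*
 * Example ddb usage of the command above (illustrative address and output):
 *
 *	db> show pte 0xffffffff80200000
 *	VA 0xffffffff80200000 pml4e 0x... pdpe 0x... pde 0x... pte 0x...
 *
 * The walk stops early at the first invalid entry or at a superpage
 * mapping, printing only the levels that were reached.
 */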
 7840 
 7841 DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
 7842 {
 7843         vm_paddr_t a;
 7844 
 7845         if (have_addr) {
 7846                 a = (vm_paddr_t)addr;
 7847                 db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
 7848         } else {
 7849                 db_printf("show phys2dmap addr\n");
 7850         }
 7851 }
 7852 #endif
