The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/i386/i386/pmap.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * SPDX-License-Identifier: BSD-4-Clause
    3  *
    4  * Copyright (c) 1991 Regents of the University of California.
    5  * All rights reserved.
    6  * Copyright (c) 1994 John S. Dyson
    7  * All rights reserved.
    8  * Copyright (c) 1994 David Greenman
    9  * All rights reserved.
   10  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
   11  * All rights reserved.
   12  *
   13  * This code is derived from software contributed to Berkeley by
   14  * the Systems Programming Group of the University of Utah Computer
   15  * Science Department and William Jolitz of UUNET Technologies Inc.
   16  *
   17  * Redistribution and use in source and binary forms, with or without
   18  * modification, are permitted provided that the following conditions
   19  * are met:
   20  * 1. Redistributions of source code must retain the above copyright
   21  *    notice, this list of conditions and the following disclaimer.
   22  * 2. Redistributions in binary form must reproduce the above copyright
   23  *    notice, this list of conditions and the following disclaimer in the
   24  *    documentation and/or other materials provided with the distribution.
   25  * 3. All advertising materials mentioning features or use of this software
   26  *    must display the following acknowledgement:
   27  *      This product includes software developed by the University of
   28  *      California, Berkeley and its contributors.
   29  * 4. Neither the name of the University nor the names of its contributors
   30  *    may be used to endorse or promote products derived from this software
   31  *    without specific prior written permission.
   32  *
   33  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   34  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   35  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   36  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   37  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   38  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   39  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   40  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   41  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   42  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   43  * SUCH DAMAGE.
   44  *
   45  *      from:   @(#)pmap.c      7.7 (Berkeley)  5/12/91
   46  */
   47 /*-
   48  * Copyright (c) 2003 Networks Associates Technology, Inc.
   49  * All rights reserved.
   50  * Copyright (c) 2018 The FreeBSD Foundation
   51  * All rights reserved.
   52  *
   53  * This software was developed for the FreeBSD Project by Jake Burkholder,
   54  * Safeport Network Services, and Network Associates Laboratories, the
   55  * Security Research Division of Network Associates, Inc. under
   56  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
   57  * CHATS research program.
   58  *
   59  * Portions of this software were developed by
   60  * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
   61  * the FreeBSD Foundation.
   62  *
   63  * Redistribution and use in source and binary forms, with or without
   64  * modification, are permitted provided that the following conditions
   65  * are met:
   66  * 1. Redistributions of source code must retain the above copyright
   67  *    notice, this list of conditions and the following disclaimer.
   68  * 2. Redistributions in binary form must reproduce the above copyright
   69  *    notice, this list of conditions and the following disclaimer in the
   70  *    documentation and/or other materials provided with the distribution.
   71  *
   72  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   73  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   74  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   75  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   76  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   77  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   78  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   79  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   80  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   81  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   82  * SUCH DAMAGE.
   83  */
   84 
   85 #include <sys/cdefs.h>
   86 __FBSDID("$FreeBSD: releng/12.0/sys/i386/i386/pmap.c 340401 2018-11-13 18:21:47Z markj $");
   87 
   88 /*
   89  *      Manages physical address maps.
   90  *
   91  *      Since the information managed by this module is
   92  *      also stored by the logical address mapping module,
   93  *      this module may throw away valid virtual-to-physical
   94  *      mappings at almost any time.  However, invalidations
   95  *      of virtual-to-physical mappings must be done as
   96  *      requested.
   97  *
   98  *      In order to cope with hardware architectures which
   99  *      make virtual-to-physical map invalidates expensive,
  100  *      this module may delay invalidate or reduced protection
  101  *      operations until such time as they are actually
  102  *      necessary.  This module is given full information as
  103  *      to which processors are currently using which maps,
  104  *      and to when physical maps must be made correct.
  105  */
  106 
  107 #include "opt_apic.h"
  108 #include "opt_cpu.h"
  109 #include "opt_pmap.h"
  110 #include "opt_smp.h"
  111 #include "opt_vm.h"
  112 
  113 #include <sys/param.h>
  114 #include <sys/systm.h>
  115 #include <sys/kernel.h>
  116 #include <sys/ktr.h>
  117 #include <sys/lock.h>
  118 #include <sys/malloc.h>
  119 #include <sys/mman.h>
  120 #include <sys/msgbuf.h>
  121 #include <sys/mutex.h>
  122 #include <sys/proc.h>
  123 #include <sys/rwlock.h>
  124 #include <sys/sf_buf.h>
  125 #include <sys/sx.h>
  126 #include <sys/vmmeter.h>
  127 #include <sys/sched.h>
  128 #include <sys/sysctl.h>
  129 #include <sys/smp.h>
  130 #include <sys/vmem.h>
  131 
  132 #include <vm/vm.h>
  133 #include <vm/vm_param.h>
  134 #include <vm/vm_kern.h>
  135 #include <vm/vm_page.h>
  136 #include <vm/vm_map.h>
  137 #include <vm/vm_object.h>
  138 #include <vm/vm_extern.h>
  139 #include <vm/vm_pageout.h>
  140 #include <vm/vm_pager.h>
  141 #include <vm/vm_phys.h>
  142 #include <vm/vm_radix.h>
  143 #include <vm/vm_reserv.h>
  144 #include <vm/uma.h>
  145 
  146 #ifdef DEV_APIC
  147 #include <sys/bus.h>
  148 #include <machine/intr_machdep.h>
  149 #include <x86/apicvar.h>
  150 #endif
  151 #include <x86/ifunc.h>
  152 #include <machine/bootinfo.h>
  153 #include <machine/cpu.h>
  154 #include <machine/cputypes.h>
  155 #include <machine/md_var.h>
  156 #include <machine/pcb.h>
  157 #include <machine/specialreg.h>
  158 #ifdef SMP
  159 #include <machine/smp.h>
  160 #endif
  161 
  162 #ifndef PMAP_SHPGPERPROC
  163 #define PMAP_SHPGPERPROC 200
  164 #endif
  165 
  166 #if !defined(DIAGNOSTIC)
  167 #ifdef __GNUC_GNU_INLINE__
  168 #define PMAP_INLINE     __attribute__((__gnu_inline__)) inline
  169 #else
  170 #define PMAP_INLINE     extern inline
  171 #endif
  172 #else
  173 #define PMAP_INLINE
  174 #endif
  175 
  176 #ifdef PV_STATS
  177 #define PV_STAT(x)      do { x ; } while (0)
  178 #else
  179 #define PV_STAT(x)      do { } while (0)
  180 #endif
  181 
  182 #define pa_index(pa)    ((pa) >> PDRSHIFT)
  183 #define pa_to_pvh(pa)   (&pv_table[pa_index(pa)])
  184 
  185 /*
  186  * Get PDEs and PTEs for user/kernel address space
  187  */
  188 #define pmap_pde(m, v)  (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
  189 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
  190 
  191 #define pmap_pde_v(pte)         ((*(int *)pte & PG_V) != 0)
  192 #define pmap_pte_w(pte)         ((*(int *)pte & PG_W) != 0)
  193 #define pmap_pte_m(pte)         ((*(int *)pte & PG_M) != 0)
  194 #define pmap_pte_u(pte)         ((*(int *)pte & PG_A) != 0)
  195 #define pmap_pte_v(pte)         ((*(int *)pte & PG_V) != 0)
  196 
  197 #define pmap_pte_set_w(pte, v)  ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
  198     atomic_clear_int((u_int *)(pte), PG_W))
  199 #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
  200 
  201 struct pmap kernel_pmap_store;
  202 
  203 vm_offset_t virtual_avail;      /* VA of first avail page (after kernel bss) */
  204 vm_offset_t virtual_end;        /* VA of last avail page (end of kernel AS) */
  205 static int pgeflag = 0;         /* PG_G or-in */
  206 static int pseflag = 0;         /* PG_PS or-in */
  207 
  208 static int nkpt = NKPT;
  209 vm_offset_t kernel_vm_end = /* 0 + */ NKPT * NBPDR;
  210 
  211 #if defined(PAE) || defined(PAE_TABLES)
  212 pt_entry_t pg_nx;
  213 static uma_zone_t pdptzone;
  214 #endif
  215 
  216 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
  217 
  218 static int pat_works = 1;
  219 SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
  220     "Is page attribute table fully functional?");
  221 
  222 static int pg_ps_enabled = 1;
  223 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
  224     &pg_ps_enabled, 0, "Are large page mappings enabled?");
  225 
  226 #define PAT_INDEX_SIZE  8
  227 static int pat_index[PAT_INDEX_SIZE];   /* cache mode to PAT index conversion */
  228 
  229 /*
  230  * pmap_mapdev support pre initialization (i.e. console)
  231  */
  232 #define PMAP_PREINIT_MAPPING_COUNT      8
  233 static struct pmap_preinit_mapping {
  234         vm_paddr_t      pa;
  235         vm_offset_t     va;
  236         vm_size_t       sz;
  237         int             mode;
  238 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
  239 static int pmap_initialized;
  240 
  241 static struct rwlock_padalign pvh_global_lock;
  242 
  243 /*
  244  * Data for the pv entry allocation mechanism
  245  */
  246 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
  247 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
  248 static struct md_page *pv_table;
  249 static int shpgperproc = PMAP_SHPGPERPROC;
  250 
  251 struct pv_chunk *pv_chunkbase;          /* KVA block for pv_chunks */
  252 int pv_maxchunks;                       /* How many chunks we have KVA for */
  253 vm_offset_t pv_vafree;                  /* freelist stored in the PTE */
  254 
  255 /*
  256  * All those kernel PT submaps that BSD is so fond of
  257  */
  258 pt_entry_t *CMAP3;
  259 static pd_entry_t *KPTD;
  260 caddr_t ptvmmap = 0;
  261 caddr_t CADDR3;
  262 
  263 /*
  264  * Crashdump maps.
  265  */
  266 static caddr_t crashdumpmap;
  267 
  268 static pt_entry_t *PMAP1 = NULL, *PMAP2, *PMAP3;
  269 static pt_entry_t *PADDR1 = NULL, *PADDR2, *PADDR3;
  270 #ifdef SMP
  271 static int PMAP1cpu, PMAP3cpu;
  272 static int PMAP1changedcpu;
  273 SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, 
  274            &PMAP1changedcpu, 0,
  275            "Number of times pmap_pte_quick changed CPU with same PMAP1");
  276 #endif
  277 static int PMAP1changed;
  278 SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, 
  279            &PMAP1changed, 0,
  280            "Number of times pmap_pte_quick changed PMAP1");
  281 static int PMAP1unchanged;
  282 SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, 
  283            &PMAP1unchanged, 0,
  284            "Number of times pmap_pte_quick didn't change PMAP1");
  285 static struct mtx PMAP2mutex;
  286 
  287 int pti;
  288 
  289 /*
  290  * Internal flags for pmap_enter()'s helper functions.
  291  */
  292 #define PMAP_ENTER_NORECLAIM    0x1000000       /* Don't reclaim PV entries. */
  293 #define PMAP_ENTER_NOREPLACE    0x2000000       /* Don't replace mappings. */
  294 
  295 static void     free_pv_chunk(struct pv_chunk *pc);
  296 static void     free_pv_entry(pmap_t pmap, pv_entry_t pv);
  297 static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
  298 static void     pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
  299 static bool     pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde,
  300                     u_int flags);
  301 #if VM_NRESERVLEVEL > 0
  302 static void     pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
  303 #endif
  304 static void     pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
  305 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
  306                     vm_offset_t va);
  307 static int      pmap_pvh_wired_mappings(struct md_page *pvh, int count);
  308 
  309 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
  310 static bool     pmap_enter_4mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
  311                     vm_prot_t prot);
  312 static int      pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde,
  313                     u_int flags, vm_page_t m);
  314 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
  315     vm_page_t m, vm_prot_t prot, vm_page_t mpte);
  316 static void pmap_flush_page(vm_page_t m);
  317 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
  318 static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva,
  319     vm_offset_t eva);
  320 static void pmap_invalidate_cache_range_all(vm_offset_t sva,
  321     vm_offset_t eva);
  322 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
  323                     pd_entry_t pde);
  324 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
  325 static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
  326 static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
  327 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
  328 static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
  329 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
  330 #if VM_NRESERVLEVEL > 0
  331 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
  332 #endif
  333 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
  334     vm_prot_t prot);
  335 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
  336 static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
  337     struct spglist *free);
  338 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
  339     struct spglist *free);
  340 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
  341 static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
  342     struct spglist *free);
  343 static bool     pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
  344                     struct spglist *free);
  345 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
  346                                         vm_offset_t va);
  347 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
  348 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
  349     vm_page_t m);
  350 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
  351     pd_entry_t newpde);
  352 static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
  353 
  354 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags);
  355 
  356 static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags);
  357 static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free);
  358 static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
  359 static void pmap_pte_release(pt_entry_t *pte);
  360 static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *);
  361 #if defined(PAE) || defined(PAE_TABLES)
  362 static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain,
  363     uint8_t *flags, int wait);
  364 #endif
  365 static void pmap_init_trm(void);
  366 
  367 static __inline void pagezero(void *page);
  368 
  369 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
  370 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
  371 
  372 void pmap_cold(void);
  373 extern char _end[];
  374 u_long physfree;        /* phys addr of next free page */
  375 u_long vm86phystk;      /* PA of vm86/bios stack */
  376 u_long vm86paddr;       /* address of vm86 region */
  377 int vm86pa;             /* phys addr of vm86 region */
  378 u_long KERNend;         /* phys addr end of kernel (just after bss) */
  379 pd_entry_t *IdlePTD;    /* phys addr of kernel PTD */
  380 #if defined(PAE) || defined(PAE_TABLES)
  381 pdpt_entry_t *IdlePDPT; /* phys addr of kernel PDPT */
  382 #endif
  383 pt_entry_t *KPTmap;     /* address of kernel page tables */
  384 u_long KPTphys;         /* phys addr of kernel page tables */
  385 extern u_long tramp_idleptd;
  386 
  387 static u_long
  388 allocpages(u_int cnt, u_long *physfree)
  389 {
  390         u_long res;
  391 
  392         res = *physfree;
  393         *physfree += PAGE_SIZE * cnt;
  394         bzero((void *)res, PAGE_SIZE * cnt);
  395         return (res);
  396 }
  397 
  398 static void
  399 pmap_cold_map(u_long pa, u_long va, u_long cnt)
  400 {
  401         pt_entry_t *pt;
  402 
  403         for (pt = (pt_entry_t *)KPTphys + atop(va); cnt > 0;
  404             cnt--, pt++, va += PAGE_SIZE, pa += PAGE_SIZE)
  405                 *pt = pa | PG_V | PG_RW | PG_A | PG_M;
  406 }
  407 
  408 static void
  409 pmap_cold_mapident(u_long pa, u_long cnt)
  410 {
  411 
  412         pmap_cold_map(pa, pa, cnt);
  413 }
  414 
  415 _Static_assert(2 * NBPDR == KERNBASE, "Broken double-map of zero PTD");
  416 
  417 /*
  418  * Called from locore.s before paging is enabled.  Sets up the first
  419  * kernel page table.  Since kernel is mapped with PA == VA, this code
  420  * does not require relocations.
  421  */
  422 void
  423 pmap_cold(void)
  424 {
  425         pt_entry_t *pt;
  426         u_long a;
  427         u_int cr3, ncr4;
  428 
  429         physfree = (u_long)&_end;
  430         if (bootinfo.bi_esymtab != 0)
  431                 physfree = bootinfo.bi_esymtab;
  432         if (bootinfo.bi_kernend != 0)
  433                 physfree = bootinfo.bi_kernend;
  434         physfree = roundup2(physfree, NBPDR);
  435         KERNend = physfree;
  436 
  437         /* Allocate Kernel Page Tables */
  438         KPTphys = allocpages(NKPT, &physfree);
  439         KPTmap = (pt_entry_t *)KPTphys;
  440 
  441         /* Allocate Page Table Directory */
  442 #if defined(PAE) || defined(PAE_TABLES)
  443         /* XXX only need 32 bytes (easier for now) */
  444         IdlePDPT = (pdpt_entry_t *)allocpages(1, &physfree);
  445 #endif
  446         IdlePTD = (pd_entry_t *)allocpages(NPGPTD, &physfree);
  447 
  448         /*
  449          * Allocate KSTACK.  Leave a guard page between IdlePTD and
  450          * proc0kstack, to control stack overflow for thread0 and
  451          * prevent corruption of the page table.  We leak the guard
  452          * physical memory due to 1:1 mappings.
  453          */
  454         allocpages(1, &physfree);
  455         proc0kstack = allocpages(TD0_KSTACK_PAGES, &physfree);
  456 
  457         /* vm86/bios stack */
  458         vm86phystk = allocpages(1, &physfree);
  459 
  460         /* pgtable + ext + IOPAGES */
  461         vm86paddr = vm86pa = allocpages(3, &physfree);
  462 
  463         /* Install page tables into PTD.  Page table page 1 is wasted. */
  464         for (a = 0; a < NKPT; a++)
  465                 IdlePTD[a] = (KPTphys + ptoa(a)) | PG_V | PG_RW | PG_A | PG_M;
  466 
  467 #if defined(PAE) || defined(PAE_TABLES)
  468         /* PAE install PTD pointers into PDPT */
  469         for (a = 0; a < NPGPTD; a++)
  470                 IdlePDPT[a] = ((u_int)IdlePTD + ptoa(a)) | PG_V;
  471 #endif
  472 
  473         /*
  474          * Install recursive mapping for kernel page tables into
  475          * itself.
  476          */
  477         for (a = 0; a < NPGPTD; a++)
  478                 IdlePTD[PTDPTDI + a] = ((u_int)IdlePTD + ptoa(a)) | PG_V |
  479                     PG_RW;
  480 
  481         /*
  482          * Initialize page table pages mapping physical address zero
  483          * through the (physical) end of the kernel.  Many of these
  484          * pages must be reserved, and we reserve them all and map
  485          * them linearly for convenience.  We do this even if we've
  486          * enabled PSE above; we'll just switch the corresponding
  487          * kernel PDEs before we turn on paging.
  488          *
  489          * This and all other page table entries allow read and write
  490          * access for various reasons.  Kernel mappings never have any
  491          * access restrictions.
  492          */
  493         pmap_cold_mapident(0, atop(NBPDR));
  494         pmap_cold_map(0, NBPDR, atop(NBPDR));
  495         pmap_cold_mapident(KERNBASE, atop(KERNend - KERNBASE));
  496 
  497         /* Map page table directory */
  498 #if defined(PAE) || defined(PAE_TABLES)
  499         pmap_cold_mapident((u_long)IdlePDPT, 1);
  500 #endif
  501         pmap_cold_mapident((u_long)IdlePTD, NPGPTD);
  502 
  503         /* Map early KPTmap.  It is really pmap_cold_mapident. */
  504         pmap_cold_map(KPTphys, (u_long)KPTmap, NKPT);
  505 
  506         /* Map proc0kstack */
  507         pmap_cold_mapident(proc0kstack, TD0_KSTACK_PAGES);
  508         /* ISA hole already mapped */
  509 
  510         pmap_cold_mapident(vm86phystk, 1);
  511         pmap_cold_mapident(vm86pa, 3);
  512 
  513         /* Map page 0 into the vm86 page table */
  514         *(pt_entry_t *)vm86pa = 0 | PG_RW | PG_U | PG_A | PG_M | PG_V;
  515 
  516         /* ...likewise for the ISA hole for vm86 */
  517         for (pt = (pt_entry_t *)vm86pa + atop(ISA_HOLE_START), a = 0;
  518             a < atop(ISA_HOLE_LENGTH); a++, pt++)
  519                 *pt = (ISA_HOLE_START + ptoa(a)) | PG_RW | PG_U | PG_A |
  520                     PG_M | PG_V;
  521 
  522         /* Enable PSE, PGE, VME, and PAE if configured. */
  523         ncr4 = 0;
  524         if ((cpu_feature & CPUID_PSE) != 0) {
  525                 ncr4 |= CR4_PSE;
  526                 pseflag = PG_PS;
  527                 /*
  528                  * Superpage mapping of the kernel text.  Existing 4k
  529                  * page table pages are wasted.
  530                  */
  531                 for (a = KERNBASE; a < KERNend; a += NBPDR)
  532                         IdlePTD[a >> PDRSHIFT] = a | PG_PS | PG_A | PG_M |
  533                             PG_RW | PG_V;
  534         }
  535         if ((cpu_feature & CPUID_PGE) != 0) {
  536                 ncr4 |= CR4_PGE;
  537                 pgeflag = PG_G;
  538         }
  539         ncr4 |= (cpu_feature & CPUID_VME) != 0 ? CR4_VME : 0;
  540 #if defined(PAE) || defined(PAE_TABLES)
  541         ncr4 |= CR4_PAE;
  542 #endif
  543         if (ncr4 != 0)
  544                 load_cr4(rcr4() | ncr4);
  545 
  546         /* Now enable paging */
  547 #if defined(PAE) || defined(PAE_TABLES)
  548         cr3 = (u_int)IdlePDPT;
  549 #else
  550         cr3 = (u_int)IdlePTD;
  551 #endif
  552         tramp_idleptd = cr3;
  553         load_cr3(cr3);
  554         load_cr0(rcr0() | CR0_PG);
  555 
  556         /*
  557          * Now running relocated at KERNBASE where the system is
  558          * linked to run.
  559          */
  560 
  561         /*
  562          * Remove the lowest part of the double mapping of low memory
  563          * to get some null pointer checks.
  564          */
  565         IdlePTD[0] = 0;
  566         load_cr3(cr3);          /* invalidate TLB */
  567 }
  568 
  569 /*
  570  *      Bootstrap the system enough to run with virtual memory.
  571  *
  572  *      On the i386 this is called after mapping has already been enabled
  573  *      in locore.s with the page table created in pmap_cold(),
  574  *      and just syncs the pmap module with what has already been done.
  575  */
  576 void
  577 pmap_bootstrap(vm_paddr_t firstaddr)
  578 {
  579         vm_offset_t va;
  580         pt_entry_t *pte, *unused;
  581         struct pcpu *pc;
  582         u_long res;
  583         int i;
  584 
  585         res = atop(firstaddr - (vm_paddr_t)KERNLOAD);
  586 
  587         /*
  588          * Add a physical memory segment (vm_phys_seg) corresponding to the
  589          * preallocated kernel page table pages so that vm_page structures
  590          * representing these pages will be created.  The vm_page structures
  591          * are required for promotion of the corresponding kernel virtual
  592          * addresses to superpage mappings.
  593          */
  594         vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
  595 
  596         /*
  597          * Initialize the first available kernel virtual address.  However,
  598          * using "firstaddr" may waste a few pages of the kernel virtual
  599          * address space, because locore may not have mapped every physical
  600          * page that it allocated.  Preferably, locore would provide a first
  601          * unused virtual address in addition to "firstaddr".
  602          */
  603         virtual_avail = (vm_offset_t)firstaddr;
  604         virtual_end = VM_MAX_KERNEL_ADDRESS;
  605 
  606         /*
  607          * Initialize the kernel pmap (which is statically allocated).
  608          * Count bootstrap data as being resident in case any of this data is
  609          * later unmapped (using pmap_remove()) and freed.
  610          */
  611         PMAP_LOCK_INIT(kernel_pmap);
  612         kernel_pmap->pm_pdir = IdlePTD;
  613 #if defined(PAE) || defined(PAE_TABLES)
  614         kernel_pmap->pm_pdpt = IdlePDPT;
  615 #endif
  616         CPU_FILL(&kernel_pmap->pm_active);      /* don't allow deactivation */
  617         kernel_pmap->pm_stats.resident_count = res;
  618         TAILQ_INIT(&kernel_pmap->pm_pvchunk);
  619 
  620         /*
  621          * Initialize the global pv list lock.
  622          */
  623         rw_init(&pvh_global_lock, "pmap pv global");
  624 
  625         /*
  626          * Reserve some special page table entries/VA space for temporary
  627          * mapping of pages.
  628          */
  629 #define SYSMAP(c, p, v, n)      \
  630         v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
  631 
  632         va = virtual_avail;
  633         pte = vtopte(va);
  634 
  635 
  636         /*
  637          * Initialize temporary map objects on the current CPU for use
  638          * during early boot.
  639          * CMAP1/CMAP2 are used for zeroing and copying pages.
  640          * CMAP3 is used for the boot-time memory test.
  641          */
  642         pc = get_pcpu();
  643         mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
  644         SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1)
  645         SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1)
  646         SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1)
  647 
  648         SYSMAP(caddr_t, CMAP3, CADDR3, 1);
  649 
  650         /*
  651          * Crashdump maps.
  652          */
  653         SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
  654 
  655         /*
  656          * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
  657          */
  658         SYSMAP(caddr_t, unused, ptvmmap, 1)
  659 
  660         /*
  661          * msgbufp is used to map the system message buffer.
  662          */
  663         SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))
  664 
  665         /*
  666          * KPTmap is used by pmap_kextract().
  667          *
  668          * KPTmap is first initialized by locore.  However, that initial
  669          * KPTmap can only support NKPT page table pages.  Here, a larger
  670          * KPTmap is created that can support KVA_PAGES page table pages.
  671          */
  672         SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)
  673 
  674         for (i = 0; i < NKPT; i++)
  675                 KPTD[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V;
  676 
  677         /*
  678          * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(),
  679          * respectively.
  680          */
  681         SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
  682         SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)
  683         SYSMAP(pt_entry_t *, PMAP3, PADDR3, 1)
  684 
  685         mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
  686 
  687         virtual_avail = va;
  688 
  689         /*
  690          * Initialize the PAT MSR if present.
  691          * pmap_init_pat() clears and sets CR4_PGE, which, as a
  692          * side-effect, invalidates stale PG_G TLB entries that might
  693          * have been created in our pre-boot environment.  We assume
  694          * that PAT support implies PGE and in reverse, PGE presence
  695          * comes with PAT.  Both features were added for Pentium Pro.
  696          */
  697         pmap_init_pat();
  698 }
  699 
  700 static void
  701 pmap_init_reserved_pages(void)
  702 {
  703         struct pcpu *pc;
  704         vm_offset_t pages;
  705         int i;
  706 
  707         CPU_FOREACH(i) {
  708                 pc = pcpu_find(i);
  709                 mtx_init(&pc->pc_copyout_mlock, "cpmlk", NULL, MTX_DEF |
  710                     MTX_NEW);
  711                 pc->pc_copyout_maddr = kva_alloc(ptoa(2));
  712                 if (pc->pc_copyout_maddr == 0)
  713                         panic("unable to allocate non-sleepable copyout KVA");
  714                 sx_init(&pc->pc_copyout_slock, "cpslk");
  715                 pc->pc_copyout_saddr = kva_alloc(ptoa(2));
  716                 if (pc->pc_copyout_saddr == 0)
  717                         panic("unable to allocate sleepable copyout KVA");
  718                 pc->pc_pmap_eh_va = kva_alloc(ptoa(1));
  719                 if (pc->pc_pmap_eh_va == 0)
  720                         panic("unable to allocate pmap_extract_and_hold KVA");
  721                 pc->pc_pmap_eh_ptep = (char *)vtopte(pc->pc_pmap_eh_va);
  722 
  723                 /*
  724                  * Skip if the mappings have already been initialized,
  725                  * i.e. this is the BSP.
  726                  */
  727                 if (pc->pc_cmap_addr1 != 0)
  728                         continue;
  729 
  730                 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
  731                 pages = kva_alloc(PAGE_SIZE * 3);
  732                 if (pages == 0)
  733                         panic("unable to allocate CMAP KVA");
  734                 pc->pc_cmap_pte1 = vtopte(pages);
  735                 pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE);
  736                 pc->pc_cmap_addr1 = (caddr_t)pages;
  737                 pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE);
  738                 pc->pc_qmap_addr = pages + atop(2);
  739         }
  740 }
  741  
  742 SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL);
  743 
  744 /*
  745  * Setup the PAT MSR.
  746  */
  747 void
  748 pmap_init_pat(void)
  749 {
  750         int pat_table[PAT_INDEX_SIZE];
  751         uint64_t pat_msr;
  752         u_long cr0, cr4;
  753         int i;
  754 
  755         /* Set default PAT index table. */
  756         for (i = 0; i < PAT_INDEX_SIZE; i++)
  757                 pat_table[i] = -1;
  758         pat_table[PAT_WRITE_BACK] = 0;
  759         pat_table[PAT_WRITE_THROUGH] = 1;
  760         pat_table[PAT_UNCACHEABLE] = 3;
  761         pat_table[PAT_WRITE_COMBINING] = 3;
  762         pat_table[PAT_WRITE_PROTECTED] = 3;
  763         pat_table[PAT_UNCACHED] = 3;
  764 
  765         /*
  766          * Bail if this CPU doesn't implement PAT.
  767          * We assume that PAT support implies PGE.
  768          */
  769         if ((cpu_feature & CPUID_PAT) == 0) {
  770                 for (i = 0; i < PAT_INDEX_SIZE; i++)
  771                         pat_index[i] = pat_table[i];
  772                 pat_works = 0;
  773                 return;
  774         }
  775 
  776         /*
  777          * Due to some Intel errata, we can only safely use the lower 4
  778          * PAT entries.
  779          *
  780          *   Intel Pentium III Processor Specification Update
  781          * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
  782          * or Mode C Paging)
  783          *
  784          *   Intel Pentium IV  Processor Specification Update
  785          * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
  786          */
  787         if (cpu_vendor_id == CPU_VENDOR_INTEL &&
  788             !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
  789                 pat_works = 0;
  790 
  791         /* Initialize default PAT entries. */
  792         pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
  793             PAT_VALUE(1, PAT_WRITE_THROUGH) |
  794             PAT_VALUE(2, PAT_UNCACHED) |
  795             PAT_VALUE(3, PAT_UNCACHEABLE) |
  796             PAT_VALUE(4, PAT_WRITE_BACK) |
  797             PAT_VALUE(5, PAT_WRITE_THROUGH) |
  798             PAT_VALUE(6, PAT_UNCACHED) |
  799             PAT_VALUE(7, PAT_UNCACHEABLE);
  800 
  801         if (pat_works) {
  802                 /*
  803                  * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
  804                  * Program 5 and 6 as WP and WC.
  805                  * Leave 4 and 7 as WB and UC.
  806                  */
  807                 pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
  808                 pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
  809                     PAT_VALUE(6, PAT_WRITE_COMBINING);
  810                 pat_table[PAT_UNCACHED] = 2;
  811                 pat_table[PAT_WRITE_PROTECTED] = 5;
  812                 pat_table[PAT_WRITE_COMBINING] = 6;
  813         } else {
  814                 /*
  815                  * Just replace PAT Index 2 with WC instead of UC-.
  816                  */
  817                 pat_msr &= ~PAT_MASK(2);
  818                 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
  819                 pat_table[PAT_WRITE_COMBINING] = 2;
  820         }
  821 
  822         /* Disable PGE. */
  823         cr4 = rcr4();
  824         load_cr4(cr4 & ~CR4_PGE);
  825 
  826         /* Disable caches (CD = 1, NW = 0). */
  827         cr0 = rcr0();
  828         load_cr0((cr0 & ~CR0_NW) | CR0_CD);
  829 
  830         /* Flushes caches and TLBs. */
  831         wbinvd();
  832         invltlb();
  833 
  834         /* Update PAT and index table. */
  835         wrmsr(MSR_PAT, pat_msr);
  836         for (i = 0; i < PAT_INDEX_SIZE; i++)
  837                 pat_index[i] = pat_table[i];
  838 
  839         /* Flush caches and TLBs again. */
  840         wbinvd();
  841         invltlb();
  842 
  843         /* Restore caches and PGE. */
  844         load_cr0(cr0);
  845         load_cr4(cr4);
  846 }
  847 
  848 /*
  849  * Initialize a vm_page's machine-dependent fields.
  850  */
  851 void
  852 pmap_page_init(vm_page_t m)
  853 {
  854 
  855         TAILQ_INIT(&m->md.pv_list);
  856         m->md.pat_mode = PAT_WRITE_BACK;
  857 }
  858 
  859 #if defined(PAE) || defined(PAE_TABLES)
  860 static void *
  861 pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
  862     int wait)
  863 {
  864 
  865         /* Inform UMA that this allocator uses kernel_map/object. */
  866         *flags = UMA_SLAB_KERNEL;
  867         return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain),
  868             bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
  869 }
  870 #endif
  871 
  872 /*
  873  * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
  874  * Requirements:
  875  *  - Must deal with pages in order to ensure that none of the PG_* bits
  876  *    are ever set, PG_V in particular.
  877  *  - Assumes we can write to ptes without pte_store() atomic ops, even
  878  *    on PAE systems.  This should be ok.
  879  *  - Assumes nothing will ever test these addresses for 0 to indicate
  880  *    no mapping instead of correctly checking PG_V.
  881  *  - Assumes a vm_offset_t will fit in a pte (true for i386).
  882  * Because PG_V is never set, there can be no mappings to invalidate.
  883  */
  884 static vm_offset_t
  885 pmap_ptelist_alloc(vm_offset_t *head)
  886 {
  887         pt_entry_t *pte;
  888         vm_offset_t va;
  889 
  890         va = *head;
  891         if (va == 0)
  892                 panic("pmap_ptelist_alloc: exhausted ptelist KVA");
  893         pte = vtopte(va);
  894         *head = *pte;
  895         if (*head & PG_V)
  896                 panic("pmap_ptelist_alloc: va with PG_V set!");
  897         *pte = 0;
  898         return (va);
  899 }
  900 
  901 static void
  902 pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
  903 {
  904         pt_entry_t *pte;
  905 
  906         if (va & PG_V)
  907                 panic("pmap_ptelist_free: freeing va with PG_V set!");
  908         pte = vtopte(va);
  909         *pte = *head;           /* virtual! PG_V is 0 though */
  910         *head = va;
  911 }
  912 
  913 static void
  914 pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
  915 {
  916         int i;
  917         vm_offset_t va;
  918 
  919         *head = 0;
  920         for (i = npages - 1; i >= 0; i--) {
  921                 va = (vm_offset_t)base + i * PAGE_SIZE;
  922                 pmap_ptelist_free(head, va);
  923         }
  924 }
  925 
  926 
  927 /*
  928  *      Initialize the pmap module.
  929  *      Called by vm_init, to initialize any structures that the pmap
  930  *      system needs to map virtual memory.
  931  */
  932 void
  933 pmap_init(void)
  934 {
  935         struct pmap_preinit_mapping *ppim;
  936         vm_page_t mpte;
  937         vm_size_t s;
  938         int i, pv_npg;
  939 
  940         /*
  941          * Initialize the vm page array entries for the kernel pmap's
  942          * page table pages.
  943          */ 
  944         PMAP_LOCK(kernel_pmap);
  945         for (i = 0; i < NKPT; i++) {
  946                 mpte = PHYS_TO_VM_PAGE(KPTphys + ptoa(i));
  947                 KASSERT(mpte >= vm_page_array &&
  948                     mpte < &vm_page_array[vm_page_array_size],
  949                     ("pmap_init: page table page is out of range"));
  950                 mpte->pindex = i + KPTDI;
  951                 mpte->phys_addr = KPTphys + ptoa(i);
  952                 mpte->wire_count = 1;
  953                 if (pseflag != 0 &&
  954                     KERNBASE <= i << PDRSHIFT && i << PDRSHIFT < KERNend &&
  955                     pmap_insert_pt_page(kernel_pmap, mpte))
  956                         panic("pmap_init: pmap_insert_pt_page failed");
  957         }
  958         PMAP_UNLOCK(kernel_pmap);
  959         vm_wire_add(NKPT);
  960 
  961         /*
  962          * Initialize the address space (zone) for the pv entries.  Set a
  963          * high water mark so that the system can recover from excessive
  964          * numbers of pv entries.
  965          */
  966         TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
  967         pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count;
  968         TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
  969         pv_entry_max = roundup(pv_entry_max, _NPCPV);
  970         pv_entry_high_water = 9 * (pv_entry_max / 10);
  971 
  972         /*
  973          * If the kernel is running on a virtual machine, then it must assume
  974          * that MCA is enabled by the hypervisor.  Moreover, the kernel must
  975          * be prepared for the hypervisor changing the vendor and family that
  976          * are reported by CPUID.  Consequently, the workaround for AMD Family
  977          * 10h Erratum 383 is enabled if the processor's feature set does not
  978          * include at least one feature that is only supported by older Intel
  979          * or newer AMD processors.
  980          */
  981         if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
  982             (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
  983             CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
  984             AMDID2_FMA4)) == 0)
  985                 workaround_erratum383 = 1;
  986 
  987         /*
  988          * Are large page mappings supported and enabled?
  989          */
  990         TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
  991         if (pseflag == 0)
  992                 pg_ps_enabled = 0;
  993         else if (pg_ps_enabled) {
  994                 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
  995                     ("pmap_init: can't assign to pagesizes[1]"));
  996                 pagesizes[1] = NBPDR;
  997         }
  998 
  999         /*
 1000          * Calculate the size of the pv head table for superpages.
 1001          * Handle the possibility that "vm_phys_segs[...].end" is zero.
 1002          */
 1003         pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end -
 1004             PAGE_SIZE) / NBPDR + 1;
 1005 
 1006         /*
 1007          * Allocate memory for the pv head table for superpages.
 1008          */
 1009         s = (vm_size_t)(pv_npg * sizeof(struct md_page));
 1010         s = round_page(s);
 1011         pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
 1012         for (i = 0; i < pv_npg; i++)
 1013                 TAILQ_INIT(&pv_table[i].pv_list);
 1014 
 1015         pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
 1016         pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks);
 1017         if (pv_chunkbase == NULL)
 1018                 panic("pmap_init: not enough kvm for pv chunks");
 1019         pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
 1020 #if defined(PAE) || defined(PAE_TABLES)
 1021         pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
 1022             NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
 1023             UMA_ZONE_VM | UMA_ZONE_NOFREE);
 1024         uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
 1025 #endif
 1026 
 1027         pmap_initialized = 1;
 1028         pmap_init_trm();
 1029 
 1030         if (!bootverbose)
 1031                 return;
 1032         for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 1033                 ppim = pmap_preinit_mapping + i;
 1034                 if (ppim->va == 0)
 1035                         continue;
 1036                 printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i,
 1037                     (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode);
 1038         }
 1039 
 1040 }
 1041 
 1042 
 1043 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
 1044         "Max number of PV entries");
 1045 SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
 1046         "Page share factor per proc");
 1047 
 1048 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
 1049     "2/4MB page mapping counters");
 1050 
 1051 static u_long pmap_pde_demotions;
 1052 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
 1053     &pmap_pde_demotions, 0, "2/4MB page demotions");
 1054 
 1055 static u_long pmap_pde_mappings;
 1056 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
 1057     &pmap_pde_mappings, 0, "2/4MB page mappings");
 1058 
 1059 static u_long pmap_pde_p_failures;
 1060 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
 1061     &pmap_pde_p_failures, 0, "2/4MB page promotion failures");
 1062 
 1063 static u_long pmap_pde_promotions;
 1064 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
 1065     &pmap_pde_promotions, 0, "2/4MB page promotions");
 1066 
 1067 /***************************************************
 1068  * Low level helper routines.....
 1069  ***************************************************/
 1070 
 1071 boolean_t
 1072 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
 1073 {
 1074 
 1075         return (mode >= 0 && mode < PAT_INDEX_SIZE &&
 1076             pat_index[(int)mode] >= 0);
 1077 }
 1078 
 1079 /*
 1080  * Determine the appropriate bits to set in a PTE or PDE for a specified
 1081  * caching mode.
 1082  */
 1083 int
 1084 pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
 1085 {
 1086         int cache_bits, pat_flag, pat_idx;
 1087 
 1088         if (!pmap_is_valid_memattr(pmap, mode))
 1089                 panic("Unknown caching mode %d\n", mode);
 1090 
 1091         /* The PAT bit is different for PTE's and PDE's. */
 1092         pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
 1093 
 1094         /* Map the caching mode to a PAT index. */
 1095         pat_idx = pat_index[mode];
 1096 
 1097         /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
 1098         cache_bits = 0;
 1099         if (pat_idx & 0x4)
 1100                 cache_bits |= pat_flag;
 1101         if (pat_idx & 0x2)
 1102                 cache_bits |= PG_NC_PCD;
 1103         if (pat_idx & 0x1)
 1104                 cache_bits |= PG_NC_PWT;
 1105         return (cache_bits);
 1106 }
 1107 
 1108 bool
 1109 pmap_ps_enabled(pmap_t pmap __unused)
 1110 {
 1111 
 1112         return (pg_ps_enabled);
 1113 }
 1114 
 1115 /*
 1116  * The caller is responsible for maintaining TLB consistency.
 1117  */
 1118 static void
 1119 pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
 1120 {
 1121         pd_entry_t *pde;
 1122 
 1123         pde = pmap_pde(kernel_pmap, va);
 1124         pde_store(pde, newpde);
 1125 }
 1126 
 1127 /*
 1128  * After changing the page size for the specified virtual address in the page
 1129  * table, flush the corresponding entries from the processor's TLB.  Only the
 1130  * calling processor's TLB is affected.
 1131  *
 1132  * The calling thread must be pinned to a processor.
 1133  */
 1134 static void
 1135 pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
 1136 {
 1137 
 1138         if ((newpde & PG_PS) == 0)
 1139                 /* Demotion: flush a specific 2MB page mapping. */
 1140                 invlpg(va);
 1141         else /* if ((newpde & PG_G) == 0) */
 1142                 /*
 1143                  * Promotion: flush every 4KB page mapping from the TLB
 1144                  * because there are too many to flush individually.
 1145                  */
 1146                 invltlb();
 1147 }
 1148 
 1149 void
 1150 invltlb_glob(void)
 1151 {
 1152 
 1153         invltlb();
 1154 }
 1155 
 1156 
 1157 #ifdef SMP
 1158 /*
 1159  * For SMP, these functions have to use the IPI mechanism for coherence.
 1160  *
 1161  * N.B.: Before calling any of the following TLB invalidation functions,
 1162  * the calling processor must ensure that all stores updating a non-
 1163  * kernel page table are globally performed.  Otherwise, another
 1164  * processor could cache an old, pre-update entry without being
 1165  * invalidated.  This can happen one of two ways: (1) The pmap becomes
 1166  * active on another processor after its pm_active field is checked by
 1167  * one of the following functions but before a store updating the page
 1168  * table is globally performed. (2) The pmap becomes active on another
 1169  * processor before its pm_active field is checked but due to
 1170  * speculative loads one of the following functions stills reads the
 1171  * pmap as inactive on the other processor.
 1172  * 
 1173  * The kernel page table is exempt because its pm_active field is
 1174  * immutable.  The kernel page table is always active on every
 1175  * processor.
 1176  */
 1177 void
 1178 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 1179 {
 1180         cpuset_t *mask, other_cpus;
 1181         u_int cpuid;
 1182 
 1183         sched_pin();
 1184         if (pmap == kernel_pmap) {
 1185                 invlpg(va);
 1186                 mask = &all_cpus;
 1187         } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) {
 1188                 mask = &all_cpus;
 1189         } else {
 1190                 cpuid = PCPU_GET(cpuid);
 1191                 other_cpus = all_cpus;
 1192                 CPU_CLR(cpuid, &other_cpus);
 1193                 CPU_AND(&other_cpus, &pmap->pm_active);
 1194                 mask = &other_cpus;
 1195         }
 1196         smp_masked_invlpg(*mask, va, pmap);
 1197         sched_unpin();
 1198 }
 1199 
 1200 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
 1201 #define PMAP_INVLPG_THRESHOLD   (4 * 1024 * PAGE_SIZE)
 1202 
 1203 void
 1204 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 1205 {
 1206         cpuset_t *mask, other_cpus;
 1207         vm_offset_t addr;
 1208         u_int cpuid;
 1209 
 1210         if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
 1211                 pmap_invalidate_all(pmap);
 1212                 return;
 1213         }
 1214 
 1215         sched_pin();
 1216         if (pmap == kernel_pmap) {
 1217                 for (addr = sva; addr < eva; addr += PAGE_SIZE)
 1218                         invlpg(addr);
 1219                 mask = &all_cpus;
 1220         } else  if (!CPU_CMP(&pmap->pm_active, &all_cpus)) {
 1221                 mask = &all_cpus;
 1222         } else {
 1223                 cpuid = PCPU_GET(cpuid);
 1224                 other_cpus = all_cpus;
 1225                 CPU_CLR(cpuid, &other_cpus);
 1226                 CPU_AND(&other_cpus, &pmap->pm_active);
 1227                 mask = &other_cpus;
 1228         }
 1229         smp_masked_invlpg_range(*mask, sva, eva, pmap);
 1230         sched_unpin();
 1231 }
 1232 
 1233 void
 1234 pmap_invalidate_all(pmap_t pmap)
 1235 {
 1236         cpuset_t *mask, other_cpus;
 1237         u_int cpuid;
 1238 
 1239         sched_pin();
 1240         if (pmap == kernel_pmap) {
 1241                 invltlb();
 1242                 mask = &all_cpus;
 1243         } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) {
 1244                 mask = &all_cpus;
 1245         } else {
 1246                 cpuid = PCPU_GET(cpuid);
 1247                 other_cpus = all_cpus;
 1248                 CPU_CLR(cpuid, &other_cpus);
 1249                 CPU_AND(&other_cpus, &pmap->pm_active);
 1250                 mask = &other_cpus;
 1251         }
 1252         smp_masked_invltlb(*mask, pmap);
 1253         sched_unpin();
 1254 }
 1255 
 1256 void
 1257 pmap_invalidate_cache(void)
 1258 {
 1259 
 1260         sched_pin();
 1261         wbinvd();
 1262         smp_cache_flush();
 1263         sched_unpin();
 1264 }
 1265 
 1266 struct pde_action {
 1267         cpuset_t invalidate;    /* processors that invalidate their TLB */
 1268         vm_offset_t va;
 1269         pd_entry_t *pde;
 1270         pd_entry_t newpde;
 1271         u_int store;            /* processor that updates the PDE */
 1272 };
 1273 
 1274 static void
 1275 pmap_update_pde_kernel(void *arg)
 1276 {
 1277         struct pde_action *act = arg;
 1278         pd_entry_t *pde;
 1279 
 1280         if (act->store == PCPU_GET(cpuid)) {
 1281                 pde = pmap_pde(kernel_pmap, act->va);
 1282                 pde_store(pde, act->newpde);
 1283         }
 1284 }
 1285 
 1286 static void
 1287 pmap_update_pde_user(void *arg)
 1288 {
 1289         struct pde_action *act = arg;
 1290 
 1291         if (act->store == PCPU_GET(cpuid))
 1292                 pde_store(act->pde, act->newpde);
 1293 }
 1294 
 1295 static void
 1296 pmap_update_pde_teardown(void *arg)
 1297 {
 1298         struct pde_action *act = arg;
 1299 
 1300         if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
 1301                 pmap_update_pde_invalidate(act->va, act->newpde);
 1302 }
 1303 
 1304 /*
 1305  * Change the page size for the specified virtual address in a way that
 1306  * prevents any possibility of the TLB ever having two entries that map the
 1307  * same virtual address using different page sizes.  This is the recommended
 1308  * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
 1309  * machine check exception for a TLB state that is improperly diagnosed as a
 1310  * hardware error.
 1311  */
 1312 static void
 1313 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 1314 {
 1315         struct pde_action act;
 1316         cpuset_t active, other_cpus;
 1317         u_int cpuid;
 1318 
 1319         sched_pin();
 1320         cpuid = PCPU_GET(cpuid);
 1321         other_cpus = all_cpus;
 1322         CPU_CLR(cpuid, &other_cpus);
 1323         if (pmap == kernel_pmap)
 1324                 active = all_cpus;
 1325         else
 1326                 active = pmap->pm_active;
 1327         if (CPU_OVERLAP(&active, &other_cpus)) {
 1328                 act.store = cpuid;
 1329                 act.invalidate = active;
 1330                 act.va = va;
 1331                 act.pde = pde;
 1332                 act.newpde = newpde;
 1333                 CPU_SET(cpuid, &active);
 1334                 smp_rendezvous_cpus(active,
 1335                     smp_no_rendezvous_barrier, pmap == kernel_pmap ?
 1336                     pmap_update_pde_kernel : pmap_update_pde_user,
 1337                     pmap_update_pde_teardown, &act);
 1338         } else {
 1339                 if (pmap == kernel_pmap)
 1340                         pmap_kenter_pde(va, newpde);
 1341                 else
 1342                         pde_store(pde, newpde);
 1343                 if (CPU_ISSET(cpuid, &active))
 1344                         pmap_update_pde_invalidate(va, newpde);
 1345         }
 1346         sched_unpin();
 1347 }
 1348 #else /* !SMP */
 1349 /*
 1350  * Normal, non-SMP, 486+ invalidation functions.
 1351  * We inline these within pmap.c for speed.
 1352  */
 1353 PMAP_INLINE void
 1354 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 1355 {
 1356 
 1357         if (pmap == kernel_pmap)
 1358                 invlpg(va);
 1359 }
 1360 
 1361 PMAP_INLINE void
 1362 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 1363 {
 1364         vm_offset_t addr;
 1365 
 1366         if (pmap == kernel_pmap)
 1367                 for (addr = sva; addr < eva; addr += PAGE_SIZE)
 1368                         invlpg(addr);
 1369 }
 1370 
 1371 PMAP_INLINE void
 1372 pmap_invalidate_all(pmap_t pmap)
 1373 {
 1374 
 1375         if (pmap == kernel_pmap)
 1376                 invltlb();
 1377 }
 1378 
 1379 PMAP_INLINE void
 1380 pmap_invalidate_cache(void)
 1381 {
 1382 
 1383         wbinvd();
 1384 }
 1385 
 1386 static void
 1387 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 1388 {
 1389 
 1390         if (pmap == kernel_pmap)
 1391                 pmap_kenter_pde(va, newpde);
 1392         else
 1393                 pde_store(pde, newpde);
 1394         if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
 1395                 pmap_update_pde_invalidate(va, newpde);
 1396 }
 1397 #endif /* !SMP */
 1398 
 1399 static void
 1400 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
 1401 {
 1402 
 1403         /*
 1404          * When the PDE has PG_PROMOTED set, the 2- or 4MB page mapping was
 1405          * created by a promotion that did not invalidate the 512 or 1024 4KB
 1406          * page mappings that might exist in the TLB.  Consequently, at this
 1407          * point, the TLB may hold both 4KB and 2- or 4MB page mappings for
 1408          * the address range [va, va + NBPDR).  Therefore, the entire range
 1409          * must be invalidated here.  In contrast, when PG_PROMOTED is clear,
 1410          * the TLB will not hold any 4KB page mappings for the address range
 1411          * [va, va + NBPDR), and so a single INVLPG suffices to invalidate the
 1412          * 2- or 4MB page mapping from the TLB.
 1413          */
 1414         if ((pde & PG_PROMOTED) != 0)
 1415                 pmap_invalidate_range(pmap, va, va + NBPDR - 1);
 1416         else
 1417                 pmap_invalidate_page(pmap, va);
 1418 }
 1419 
 1420 DEFINE_IFUNC(, void, pmap_invalidate_cache_range, (vm_offset_t, vm_offset_t),
 1421     static)
 1422 {
 1423 
 1424         if ((cpu_feature & CPUID_SS) != 0)
 1425                 return (pmap_invalidate_cache_range_selfsnoop);
 1426         if ((cpu_feature & CPUID_CLFSH) != 0)
 1427                 return (pmap_force_invalidate_cache_range);
 1428         return (pmap_invalidate_cache_range_all);
 1429 }
 1430 
 1431 #define PMAP_CLFLUSH_THRESHOLD  (2 * 1024 * 1024)
 1432 
 1433 static void
 1434 pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva)
 1435 {
 1436 
 1437         KASSERT((sva & PAGE_MASK) == 0,
 1438             ("pmap_invalidate_cache_range: sva not page-aligned"));
 1439         KASSERT((eva & PAGE_MASK) == 0,
 1440             ("pmap_invalidate_cache_range: eva not page-aligned"));
 1441 }
 1442 
 1443 static void
 1444 pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva)
 1445 {
 1446 
 1447         pmap_invalidate_cache_range_check_align(sva, eva);
 1448 }
 1449 
 1450 void
 1451 pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
 1452 {
 1453 
 1454         sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);
 1455         if (eva - sva >= PMAP_CLFLUSH_THRESHOLD) {
 1456                 /*
 1457                  * The supplied range is bigger than 2MB.
 1458                  * Globally invalidate cache.
 1459                  */
 1460                 pmap_invalidate_cache();
 1461                 return;
 1462         }
 1463 
 1464 #ifdef DEV_APIC
 1465         /*
 1466          * XXX: Some CPUs fault, hang, or trash the local APIC
 1467          * registers if we use CLFLUSH on the local APIC
 1468          * range.  The local APIC is always uncached, so we
 1469          * don't need to flush for that range anyway.
 1470          */
 1471         if (pmap_kextract(sva) == lapic_paddr)
 1472                 return;
 1473 #endif
 1474 
 1475         if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) {
 1476                 /*
 1477                  * Do per-cache line flush.  Use the sfence
 1478                  * instruction to insure that previous stores are
 1479                  * included in the write-back.  The processor
 1480                  * propagates flush to other processors in the cache
 1481                  * coherence domain.
 1482                  */
 1483                 sfence();
 1484                 for (; sva < eva; sva += cpu_clflush_line_size)
 1485                         clflushopt(sva);
 1486                 sfence();
 1487         } else {
 1488                 /*
 1489                  * Writes are ordered by CLFLUSH on Intel CPUs.
 1490                  */
 1491                 if (cpu_vendor_id != CPU_VENDOR_INTEL)
 1492                         mfence();
 1493                 for (; sva < eva; sva += cpu_clflush_line_size)
 1494                         clflush(sva);
 1495                 if (cpu_vendor_id != CPU_VENDOR_INTEL)
 1496                         mfence();
 1497         }
 1498 }
 1499 
 1500 static void
 1501 pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva)
 1502 {
 1503 
 1504         pmap_invalidate_cache_range_check_align(sva, eva);
 1505         pmap_invalidate_cache();
 1506 }
 1507 
 1508 void
 1509 pmap_invalidate_cache_pages(vm_page_t *pages, int count)
 1510 {
 1511         int i;
 1512 
 1513         if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
 1514             (cpu_feature & CPUID_CLFSH) == 0) {
 1515                 pmap_invalidate_cache();
 1516         } else {
 1517                 for (i = 0; i < count; i++)
 1518                         pmap_flush_page(pages[i]);
 1519         }
 1520 }
 1521 
 1522 /*
 1523  * Are we current address space or kernel?
 1524  */
 1525 static __inline int
 1526 pmap_is_current(pmap_t pmap)
 1527 {
 1528 
 1529         return (pmap == kernel_pmap);
 1530 }
 1531 
 1532 /*
 1533  * If the given pmap is not the current or kernel pmap, the returned pte must
 1534  * be released by passing it to pmap_pte_release().
 1535  */
 1536 pt_entry_t *
 1537 pmap_pte(pmap_t pmap, vm_offset_t va)
 1538 {
 1539         pd_entry_t newpf;
 1540         pd_entry_t *pde;
 1541 
 1542         pde = pmap_pde(pmap, va);
 1543         if (*pde & PG_PS)
 1544                 return (pde);
 1545         if (*pde != 0) {
 1546                 /* are we current address space or kernel? */
 1547                 if (pmap_is_current(pmap))
 1548                         return (vtopte(va));
 1549                 mtx_lock(&PMAP2mutex);
 1550                 newpf = *pde & PG_FRAME;
 1551                 if ((*PMAP2 & PG_FRAME) != newpf) {
 1552                         *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
 1553                         pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
 1554                 }
 1555                 return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
 1556         }
 1557         return (NULL);
 1558 }
 1559 
 1560 /*
 1561  * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
 1562  * being NULL.
 1563  */
 1564 static __inline void
 1565 pmap_pte_release(pt_entry_t *pte)
 1566 {
 1567 
 1568         if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
 1569                 mtx_unlock(&PMAP2mutex);
 1570 }
 1571 
 1572 /*
 1573  * NB:  The sequence of updating a page table followed by accesses to the
 1574  * corresponding pages is subject to the situation described in the "AMD64
 1575  * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23,
 1576  * "7.3.1 Special Coherency Considerations".  Therefore, issuing the INVLPG
 1577  * right after modifying the PTE bits is crucial.
 1578  */
 1579 static __inline void
 1580 invlcaddr(void *caddr)
 1581 {
 1582 
 1583         invlpg((u_int)caddr);
 1584 }
 1585 
 1586 /*
 1587  * Super fast pmap_pte routine best used when scanning
 1588  * the pv lists.  This eliminates many coarse-grained
 1589  * invltlb calls.  Note that many of the pv list
 1590  * scans are across different pmaps.  It is very wasteful
 1591  * to do an entire invltlb for checking a single mapping.
 1592  *
 1593  * If the given pmap is not the current pmap, pvh_global_lock
 1594  * must be held and curthread pinned to a CPU.
 1595  */
 1596 static pt_entry_t *
 1597 pmap_pte_quick(pmap_t pmap, vm_offset_t va)
 1598 {
 1599         pd_entry_t newpf;
 1600         pd_entry_t *pde;
 1601 
 1602         pde = pmap_pde(pmap, va);
 1603         if (*pde & PG_PS)
 1604                 return (pde);
 1605         if (*pde != 0) {
 1606                 /* are we current address space or kernel? */
 1607                 if (pmap_is_current(pmap))
 1608                         return (vtopte(va));
 1609                 rw_assert(&pvh_global_lock, RA_WLOCKED);
 1610                 KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 1611                 newpf = *pde & PG_FRAME;
 1612                 if ((*PMAP1 & PG_FRAME) != newpf) {
 1613                         *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
 1614 #ifdef SMP
 1615                         PMAP1cpu = PCPU_GET(cpuid);
 1616 #endif
 1617                         invlcaddr(PADDR1);
 1618                         PMAP1changed++;
 1619                 } else
 1620 #ifdef SMP
 1621                 if (PMAP1cpu != PCPU_GET(cpuid)) {
 1622                         PMAP1cpu = PCPU_GET(cpuid);
 1623                         invlcaddr(PADDR1);
 1624                         PMAP1changedcpu++;
 1625                 } else
 1626 #endif
 1627                         PMAP1unchanged++;
 1628                 return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
 1629         }
 1630         return (0);
 1631 }
 1632 
 1633 static pt_entry_t *
 1634 pmap_pte_quick3(pmap_t pmap, vm_offset_t va)
 1635 {
 1636         pd_entry_t newpf;
 1637         pd_entry_t *pde;
 1638 
 1639         pde = pmap_pde(pmap, va);
 1640         if (*pde & PG_PS)
 1641                 return (pde);
 1642         if (*pde != 0) {
 1643                 rw_assert(&pvh_global_lock, RA_WLOCKED);
 1644                 KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 1645                 newpf = *pde & PG_FRAME;
 1646                 if ((*PMAP3 & PG_FRAME) != newpf) {
 1647                         *PMAP3 = newpf | PG_RW | PG_V | PG_A | PG_M;
 1648 #ifdef SMP
 1649                         PMAP3cpu = PCPU_GET(cpuid);
 1650 #endif
 1651                         invlcaddr(PADDR3);
 1652                         PMAP1changed++;
 1653                 } else
 1654 #ifdef SMP
 1655                 if (PMAP3cpu != PCPU_GET(cpuid)) {
 1656                         PMAP3cpu = PCPU_GET(cpuid);
 1657                         invlcaddr(PADDR3);
 1658                         PMAP1changedcpu++;
 1659                 } else
 1660 #endif
 1661                         PMAP1unchanged++;
 1662                 return (PADDR3 + (i386_btop(va) & (NPTEPG - 1)));
 1663         }
 1664         return (0);
 1665 }
 1666 
 1667 static pt_entry_t
 1668 pmap_pte_ufast(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
 1669 {
 1670         pt_entry_t *eh_ptep, pte, *ptep;
 1671 
 1672         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 1673         pde &= PG_FRAME;
 1674         critical_enter();
 1675         eh_ptep = (pt_entry_t *)PCPU_GET(pmap_eh_ptep);
 1676         if ((*eh_ptep & PG_FRAME) != pde) {
 1677                 *eh_ptep = pde | PG_RW | PG_V | PG_A | PG_M;
 1678                 invlcaddr((void *)PCPU_GET(pmap_eh_va));
 1679         }
 1680         ptep = (pt_entry_t *)PCPU_GET(pmap_eh_va) + (i386_btop(va) &
 1681             (NPTEPG - 1));
 1682         pte = *ptep;
 1683         critical_exit();
 1684         return (pte);
 1685 }
 1686 
 1687 /*
 1688  *      Routine:        pmap_extract
 1689  *      Function:
 1690  *              Extract the physical page address associated
 1691  *              with the given map/virtual_address pair.
 1692  */
 1693 vm_paddr_t 
 1694 pmap_extract(pmap_t pmap, vm_offset_t va)
 1695 {
 1696         vm_paddr_t rtval;
 1697         pt_entry_t pte;
 1698         pd_entry_t pde;
 1699 
 1700         rtval = 0;
 1701         PMAP_LOCK(pmap);
 1702         pde = pmap->pm_pdir[va >> PDRSHIFT];
 1703         if (pde != 0) {
 1704                 if ((pde & PG_PS) != 0)
 1705                         rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
 1706                 else {
 1707                         pte = pmap_pte_ufast(pmap, va, pde);
 1708                         rtval = (pte & PG_FRAME) | (va & PAGE_MASK);
 1709                 }
 1710         }
 1711         PMAP_UNLOCK(pmap);
 1712         return (rtval);
 1713 }
 1714 
 1715 /*
 1716  *      Routine:        pmap_extract_and_hold
 1717  *      Function:
 1718  *              Atomically extract and hold the physical page
 1719  *              with the given pmap and virtual address pair
 1720  *              if that mapping permits the given protection.
 1721  */
 1722 vm_page_t
 1723 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 1724 {
 1725         pd_entry_t pde;
 1726         pt_entry_t pte;
 1727         vm_page_t m;
 1728         vm_paddr_t pa;
 1729 
 1730         pa = 0;
 1731         m = NULL;
 1732         PMAP_LOCK(pmap);
 1733 retry:
 1734         pde = *pmap_pde(pmap, va);
 1735         if (pde != 0) {
 1736                 if (pde & PG_PS) {
 1737                         if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
 1738                                 if (vm_page_pa_tryrelock(pmap, (pde &
 1739                                     PG_PS_FRAME) | (va & PDRMASK), &pa))
 1740                                         goto retry;
 1741                                 m = PHYS_TO_VM_PAGE(pa);
 1742                         }
 1743                 } else {
 1744                         pte = pmap_pte_ufast(pmap, va, pde);
 1745                         if (pte != 0 &&
 1746                             ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
 1747                                 if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
 1748                                     &pa))
 1749                                         goto retry;
 1750                                 m = PHYS_TO_VM_PAGE(pa);
 1751                         }
 1752                 }
 1753                 if (m != NULL)
 1754                         vm_page_hold(m);
 1755         }
 1756         PA_UNLOCK_COND(pa);
 1757         PMAP_UNLOCK(pmap);
 1758         return (m);
 1759 }
 1760 
 1761 /***************************************************
 1762  * Low level mapping routines.....
 1763  ***************************************************/
 1764 
 1765 /*
 1766  * Add a wired page to the kva.
 1767  * Note: not SMP coherent.
 1768  *
 1769  * This function may be used before pmap_bootstrap() is called.
 1770  */
 1771 PMAP_INLINE void 
 1772 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
 1773 {
 1774         pt_entry_t *pte;
 1775 
 1776         pte = vtopte(va);
 1777         pte_store(pte, pa | PG_RW | PG_V);
 1778 }
 1779 
 1780 static __inline void
 1781 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
 1782 {
 1783         pt_entry_t *pte;
 1784 
 1785         pte = vtopte(va);
 1786         pte_store(pte, pa | PG_RW | PG_V | pmap_cache_bits(kernel_pmap,
 1787             mode, 0));
 1788 }
 1789 
 1790 /*
 1791  * Remove a page from the kernel pagetables.
 1792  * Note: not SMP coherent.
 1793  *
 1794  * This function may be used before pmap_bootstrap() is called.
 1795  */
 1796 PMAP_INLINE void
 1797 pmap_kremove(vm_offset_t va)
 1798 {
 1799         pt_entry_t *pte;
 1800 
 1801         pte = vtopte(va);
 1802         pte_clear(pte);
 1803 }
 1804 
 1805 /*
 1806  *      Used to map a range of physical addresses into kernel
 1807  *      virtual address space.
 1808  *
 1809  *      The value passed in '*virt' is a suggested virtual address for
 1810  *      the mapping. Architectures which can support a direct-mapped
 1811  *      physical to virtual region can return the appropriate address
 1812  *      within that region, leaving '*virt' unchanged. Other
 1813  *      architectures should map the pages starting at '*virt' and
 1814  *      update '*virt' with the first usable address after the mapped
 1815  *      region.
 1816  */
 1817 vm_offset_t
 1818 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
 1819 {
 1820         vm_offset_t va, sva;
 1821         vm_paddr_t superpage_offset;
 1822         pd_entry_t newpde;
 1823 
 1824         va = *virt;
 1825         /*
 1826          * Does the physical address range's size and alignment permit at
 1827          * least one superpage mapping to be created?
 1828          */ 
 1829         superpage_offset = start & PDRMASK;
 1830         if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) {
 1831                 /*
 1832                  * Increase the starting virtual address so that its alignment
 1833                  * does not preclude the use of superpage mappings.
 1834                  */
 1835                 if ((va & PDRMASK) < superpage_offset)
 1836                         va = (va & ~PDRMASK) + superpage_offset;
 1837                 else if ((va & PDRMASK) > superpage_offset)
 1838                         va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset;
 1839         }
 1840         sva = va;
 1841         while (start < end) {
 1842                 if ((start & PDRMASK) == 0 && end - start >= NBPDR &&
 1843                     pseflag != 0) {
 1844                         KASSERT((va & PDRMASK) == 0,
 1845                             ("pmap_map: misaligned va %#x", va));
 1846                         newpde = start | PG_PS | PG_RW | PG_V;
 1847                         pmap_kenter_pde(va, newpde);
 1848                         va += NBPDR;
 1849                         start += NBPDR;
 1850                 } else {
 1851                         pmap_kenter(va, start);
 1852                         va += PAGE_SIZE;
 1853                         start += PAGE_SIZE;
 1854                 }
 1855         }
 1856         pmap_invalidate_range(kernel_pmap, sva, va);
 1857         *virt = va;
 1858         return (sva);
 1859 }
 1860 
 1861 
 1862 /*
 1863  * Add a list of wired pages to the kva
 1864  * this routine is only used for temporary
 1865  * kernel mappings that do not need to have
 1866  * page modification or references recorded.
 1867  * Note that old mappings are simply written
 1868  * over.  The page *must* be wired.
 1869  * Note: SMP coherent.  Uses a ranged shootdown IPI.
 1870  */
 1871 void
 1872 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
 1873 {
 1874         pt_entry_t *endpte, oldpte, pa, *pte;
 1875         vm_page_t m;
 1876 
 1877         oldpte = 0;
 1878         pte = vtopte(sva);
 1879         endpte = pte + count;
 1880         while (pte < endpte) {
 1881                 m = *ma++;
 1882                 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(kernel_pmap,
 1883                     m->md.pat_mode, 0);
 1884                 if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
 1885                         oldpte |= *pte;
 1886 #if defined(PAE) || defined(PAE_TABLES)
 1887                         pte_store(pte, pa | pg_nx | PG_RW | PG_V);
 1888 #else
 1889                         pte_store(pte, pa | PG_RW | PG_V);
 1890 #endif
 1891                 }
 1892                 pte++;
 1893         }
 1894         if (__predict_false((oldpte & PG_V) != 0))
 1895                 pmap_invalidate_range(kernel_pmap, sva, sva + count *
 1896                     PAGE_SIZE);
 1897 }
 1898 
 1899 /*
 1900  * This routine tears out page mappings from the
 1901  * kernel -- it is meant only for temporary mappings.
 1902  * Note: SMP coherent.  Uses a ranged shootdown IPI.
 1903  */
 1904 void
 1905 pmap_qremove(vm_offset_t sva, int count)
 1906 {
 1907         vm_offset_t va;
 1908 
 1909         va = sva;
 1910         while (count-- > 0) {
 1911                 pmap_kremove(va);
 1912                 va += PAGE_SIZE;
 1913         }
 1914         pmap_invalidate_range(kernel_pmap, sva, va);
 1915 }
 1916 
 1917 /***************************************************
 1918  * Page table page management routines.....
 1919  ***************************************************/
 1920 /*
 1921  * Schedule the specified unused page table page to be freed.  Specifically,
 1922  * add the page to the specified list of pages that will be released to the
 1923  * physical memory manager after the TLB has been updated.
 1924  */
 1925 static __inline void
 1926 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
 1927     boolean_t set_PG_ZERO)
 1928 {
 1929 
 1930         if (set_PG_ZERO)
 1931                 m->flags |= PG_ZERO;
 1932         else
 1933                 m->flags &= ~PG_ZERO;
 1934         SLIST_INSERT_HEAD(free, m, plinks.s.ss);
 1935 }
 1936 
 1937 /*
 1938  * Inserts the specified page table page into the specified pmap's collection
 1939  * of idle page table pages.  Each of a pmap's page table pages is responsible
 1940  * for mapping a distinct range of virtual addresses.  The pmap's collection is
 1941  * ordered by this virtual address range.
 1942  */
 1943 static __inline int
 1944 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
 1945 {
 1946 
 1947         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 1948         return (vm_radix_insert(&pmap->pm_root, mpte));
 1949 }
 1950 
 1951 /*
 1952  * Removes the page table page mapping the specified virtual address from the
 1953  * specified pmap's collection of idle page table pages, and returns it.
 1954  * Otherwise, returns NULL if there is no page table page corresponding to the
 1955  * specified virtual address.
 1956  */
 1957 static __inline vm_page_t
 1958 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
 1959 {
 1960 
 1961         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 1962         return (vm_radix_remove(&pmap->pm_root, va >> PDRSHIFT));
 1963 }
 1964 
 1965 /*
 1966  * Decrements a page table page's wire count, which is used to record the
 1967  * number of valid page table entries within the page.  If the wire count
 1968  * drops to zero, then the page table page is unmapped.  Returns TRUE if the
 1969  * page table page was unmapped and FALSE otherwise.
 1970  */
 1971 static inline boolean_t
 1972 pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
 1973 {
 1974 
 1975         --m->wire_count;
 1976         if (m->wire_count == 0) {
 1977                 _pmap_unwire_ptp(pmap, m, free);
 1978                 return (TRUE);
 1979         } else
 1980                 return (FALSE);
 1981 }
 1982 
 1983 static void
 1984 _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
 1985 {
 1986 
 1987         /*
 1988          * unmap the page table page
 1989          */
 1990         pmap->pm_pdir[m->pindex] = 0;
 1991         --pmap->pm_stats.resident_count;
 1992 
 1993         /*
 1994          * There is not need to invalidate the recursive mapping since
 1995          * we never instantiate such mapping for the usermode pmaps,
 1996          * and never remove page table pages from the kernel pmap.
 1997          * Put page on a list so that it is released since all TLB
 1998          * shootdown is done.
 1999          */
 2000         MPASS(pmap != kernel_pmap);
 2001         pmap_add_delayed_free_list(m, free, TRUE);
 2002 }
 2003 
 2004 /*
 2005  * After removing a page table entry, this routine is used to
 2006  * conditionally free the page, and manage the hold/wire counts.
 2007  */
 2008 static int
 2009 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free)
 2010 {
 2011         pd_entry_t ptepde;
 2012         vm_page_t mpte;
 2013 
 2014         if (pmap == kernel_pmap)
 2015                 return (0);
 2016         ptepde = *pmap_pde(pmap, va);
 2017         mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
 2018         return (pmap_unwire_ptp(pmap, mpte, free));
 2019 }
 2020 
 2021 /*
 2022  * Initialize the pmap for the swapper process.
 2023  */
 2024 void
 2025 pmap_pinit0(pmap_t pmap)
 2026 {
 2027 
 2028         PMAP_LOCK_INIT(pmap);
 2029         pmap->pm_pdir = IdlePTD;
 2030 #if defined(PAE) || defined(PAE_TABLES)
 2031         pmap->pm_pdpt = IdlePDPT;
 2032 #endif
 2033         pmap->pm_root.rt_root = 0;
 2034         CPU_ZERO(&pmap->pm_active);
 2035         TAILQ_INIT(&pmap->pm_pvchunk);
 2036         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 2037         pmap_activate_boot(pmap);
 2038 }
 2039 
 2040 /*
 2041  * Initialize a preallocated and zeroed pmap structure,
 2042  * such as one in a vmspace structure.
 2043  */
 2044 int
 2045 pmap_pinit(pmap_t pmap)
 2046 {
 2047         vm_page_t m;
 2048         int i;
 2049 
 2050         /*
 2051          * No need to allocate page table space yet but we do need a valid
 2052          * page directory table.
 2053          */
 2054         if (pmap->pm_pdir == NULL) {
 2055                 pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD);
 2056                 if (pmap->pm_pdir == NULL)
 2057                         return (0);
 2058 #if defined(PAE) || defined(PAE_TABLES)
 2059                 pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
 2060                 KASSERT(((vm_offset_t)pmap->pm_pdpt &
 2061                     ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
 2062                     ("pmap_pinit: pdpt misaligned"));
 2063                 KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
 2064                     ("pmap_pinit: pdpt above 4g"));
 2065 #endif
 2066                 pmap->pm_root.rt_root = 0;
 2067         }
 2068         KASSERT(vm_radix_is_empty(&pmap->pm_root),
 2069             ("pmap_pinit: pmap has reserved page table page(s)"));
 2070 
 2071         /*
 2072          * allocate the page directory page(s)
 2073          */
 2074         for (i = 0; i < NPGPTD;) {
 2075                 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 2076                     VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 2077                 if (m == NULL) {
 2078                         vm_wait(NULL);
 2079                 } else {
 2080                         pmap->pm_ptdpg[i] = m;
 2081 #if defined(PAE) || defined(PAE_TABLES)
 2082                         pmap->pm_pdpt[i] = VM_PAGE_TO_PHYS(m) | PG_V;
 2083 #endif
 2084                         i++;
 2085                 }
 2086         }
 2087 
 2088         pmap_qenter((vm_offset_t)pmap->pm_pdir, pmap->pm_ptdpg, NPGPTD);
 2089 
 2090         for (i = 0; i < NPGPTD; i++)
 2091                 if ((pmap->pm_ptdpg[i]->flags & PG_ZERO) == 0)
 2092                         pagezero(pmap->pm_pdir + (i * NPDEPG));
 2093 
 2094         /* Install the trampoline mapping. */
 2095         pmap->pm_pdir[TRPTDI] = PTD[TRPTDI];
 2096 
 2097         CPU_ZERO(&pmap->pm_active);
 2098         TAILQ_INIT(&pmap->pm_pvchunk);
 2099         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 2100 
 2101         return (1);
 2102 }
 2103 
 2104 /*
 2105  * this routine is called if the page table page is not
 2106  * mapped correctly.
 2107  */
 2108 static vm_page_t
 2109 _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags)
 2110 {
 2111         vm_paddr_t ptepa;
 2112         vm_page_t m;
 2113 
 2114         /*
 2115          * Allocate a page table page.
 2116          */
 2117         if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
 2118             VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
 2119                 if ((flags & PMAP_ENTER_NOSLEEP) == 0) {
 2120                         PMAP_UNLOCK(pmap);
 2121                         rw_wunlock(&pvh_global_lock);
 2122                         vm_wait(NULL);
 2123                         rw_wlock(&pvh_global_lock);
 2124                         PMAP_LOCK(pmap);
 2125                 }
 2126 
 2127                 /*
 2128                  * Indicate the need to retry.  While waiting, the page table
 2129                  * page may have been allocated.
 2130                  */
 2131                 return (NULL);
 2132         }
 2133         if ((m->flags & PG_ZERO) == 0)
 2134                 pmap_zero_page(m);
 2135 
 2136         /*
 2137          * Map the pagetable page into the process address space, if
 2138          * it isn't already there.
 2139          */
 2140 
 2141         pmap->pm_stats.resident_count++;
 2142 
 2143         ptepa = VM_PAGE_TO_PHYS(m);
 2144         pmap->pm_pdir[ptepindex] =
 2145                 (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
 2146 
 2147         return (m);
 2148 }
 2149 
 2150 static vm_page_t
 2151 pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags)
 2152 {
 2153         u_int ptepindex;
 2154         pd_entry_t ptepa;
 2155         vm_page_t m;
 2156 
 2157         /*
 2158          * Calculate pagetable page index
 2159          */
 2160         ptepindex = va >> PDRSHIFT;
 2161 retry:
 2162         /*
 2163          * Get the page directory entry
 2164          */
 2165         ptepa = pmap->pm_pdir[ptepindex];
 2166 
 2167         /*
 2168          * This supports switching from a 4MB page to a
 2169          * normal 4K page.
 2170          */
 2171         if (ptepa & PG_PS) {
 2172                 (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
 2173                 ptepa = pmap->pm_pdir[ptepindex];
 2174         }
 2175 
 2176         /*
 2177          * If the page table page is mapped, we just increment the
 2178          * hold count, and activate it.
 2179          */
 2180         if (ptepa) {
 2181                 m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
 2182                 m->wire_count++;
 2183         } else {
 2184                 /*
 2185                  * Here if the pte page isn't mapped, or if it has
 2186                  * been deallocated. 
 2187                  */
 2188                 m = _pmap_allocpte(pmap, ptepindex, flags);
 2189                 if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0)
 2190                         goto retry;
 2191         }
 2192         return (m);
 2193 }
 2194 
 2195 
 2196 /***************************************************
 2197 * Pmap allocation/deallocation routines.
 2198  ***************************************************/
 2199 
 2200 /*
 2201  * Release any resources held by the given physical map.
 2202  * Called when a pmap initialized by pmap_pinit is being released.
 2203  * Should only be called if the map contains no valid mappings.
 2204  */
 2205 void
 2206 pmap_release(pmap_t pmap)
 2207 {
 2208         vm_page_t m;
 2209         int i;
 2210 
 2211         KASSERT(pmap->pm_stats.resident_count == 0,
 2212             ("pmap_release: pmap resident count %ld != 0",
 2213             pmap->pm_stats.resident_count));
 2214         KASSERT(vm_radix_is_empty(&pmap->pm_root),
 2215             ("pmap_release: pmap has reserved page table page(s)"));
 2216         KASSERT(CPU_EMPTY(&pmap->pm_active),
 2217             ("releasing active pmap %p", pmap));
 2218 
 2219         pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
 2220 
 2221         for (i = 0; i < NPGPTD; i++) {
 2222                 m = pmap->pm_ptdpg[i];
 2223 #if defined(PAE) || defined(PAE_TABLES)
 2224                 KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
 2225                     ("pmap_release: got wrong ptd page"));
 2226 #endif
 2227                 vm_page_unwire_noq(m);
 2228                 vm_page_free(m);
 2229         }
 2230 }
 2231 
 2232 static int
 2233 kvm_size(SYSCTL_HANDLER_ARGS)
 2234 {
 2235         unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
 2236 
 2237         return (sysctl_handle_long(oidp, &ksize, 0, req));
 2238 }
 2239 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 
 2240     0, 0, kvm_size, "IU", "Size of KVM");
 2241 
 2242 static int
 2243 kvm_free(SYSCTL_HANDLER_ARGS)
 2244 {
 2245         unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
 2246 
 2247         return (sysctl_handle_long(oidp, &kfree, 0, req));
 2248 }
 2249 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 
 2250     0, 0, kvm_free, "IU", "Amount of KVM free");
 2251 
 2252 /*
 2253  * grow the number of kernel page table entries, if needed
 2254  */
 2255 void
 2256 pmap_growkernel(vm_offset_t addr)
 2257 {
 2258         vm_paddr_t ptppaddr;
 2259         vm_page_t nkpg;
 2260         pd_entry_t newpdir;
 2261 
 2262         mtx_assert(&kernel_map->system_mtx, MA_OWNED);
 2263         addr = roundup2(addr, NBPDR);
 2264         if (addr - 1 >= vm_map_max(kernel_map))
 2265                 addr = vm_map_max(kernel_map);
 2266         while (kernel_vm_end < addr) {
 2267                 if (pdir_pde(PTD, kernel_vm_end)) {
 2268                         kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
 2269                         if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
 2270                                 kernel_vm_end = vm_map_max(kernel_map);
 2271                                 break;
 2272                         }
 2273                         continue;
 2274                 }
 2275 
 2276                 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
 2277                     VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
 2278                     VM_ALLOC_ZERO);
 2279                 if (nkpg == NULL)
 2280                         panic("pmap_growkernel: no memory to grow kernel");
 2281 
 2282                 nkpt++;
 2283 
 2284                 if ((nkpg->flags & PG_ZERO) == 0)
 2285                         pmap_zero_page(nkpg);
 2286                 ptppaddr = VM_PAGE_TO_PHYS(nkpg);
 2287                 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
 2288                 pdir_pde(KPTD, kernel_vm_end) = newpdir;
 2289 
 2290                 pmap_kenter_pde(kernel_vm_end, newpdir);
 2291                 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
 2292                 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
 2293                         kernel_vm_end = vm_map_max(kernel_map);
 2294                         break;
 2295                 }
 2296         }
 2297 }
 2298 
 2299 
 2300 /***************************************************
 2301  * page management routines.
 2302  ***************************************************/
 2303 
 2304 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
 2305 CTASSERT(_NPCM == 11);
 2306 CTASSERT(_NPCPV == 336);
 2307 
 2308 static __inline struct pv_chunk *
 2309 pv_to_chunk(pv_entry_t pv)
 2310 {
 2311 
 2312         return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
 2313 }
 2314 
 2315 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
 2316 
 2317 #define PC_FREE0_9      0xfffffffful    /* Free values for index 0 through 9 */
 2318 #define PC_FREE10       0x0000fffful    /* Free values for index 10 */
 2319 
 2320 static const uint32_t pc_freemask[_NPCM] = {
 2321         PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
 2322         PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
 2323         PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
 2324         PC_FREE0_9, PC_FREE10
 2325 };
 2326 
 2327 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
 2328         "Current number of pv entries");
 2329 
 2330 #ifdef PV_STATS
 2331 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
 2332 
 2333 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
 2334         "Current number of pv entry chunks");
 2335 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
 2336         "Current number of pv entry chunks allocated");
 2337 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
 2338         "Current number of pv entry chunks frees");
 2339 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
 2340         "Number of times tried to get a chunk page but failed.");
 2341 
 2342 static long pv_entry_frees, pv_entry_allocs;
 2343 static int pv_entry_spare;
 2344 
 2345 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
 2346         "Current number of pv entry frees");
 2347 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
 2348         "Current number of pv entry allocs");
 2349 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
 2350         "Current number of spare pv entries");
 2351 #endif
 2352 
 2353 /*
 2354  * We are in a serious low memory condition.  Resort to
 2355  * drastic measures to free some pages so we can allocate
 2356  * another pv entry chunk.
 2357  */
 2358 static vm_page_t
 2359 pmap_pv_reclaim(pmap_t locked_pmap)
 2360 {
 2361         struct pch newtail;
 2362         struct pv_chunk *pc;
 2363         struct md_page *pvh;
 2364         pd_entry_t *pde;
 2365         pmap_t pmap;
 2366         pt_entry_t *pte, tpte;
 2367         pv_entry_t pv;
 2368         vm_offset_t va;
 2369         vm_page_t m, m_pc;
 2370         struct spglist free;
 2371         uint32_t inuse;
 2372         int bit, field, freed;
 2373 
 2374         PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
 2375         pmap = NULL;
 2376         m_pc = NULL;
 2377         SLIST_INIT(&free);
 2378         TAILQ_INIT(&newtail);
 2379         while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
 2380             SLIST_EMPTY(&free))) {
 2381                 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 2382                 if (pmap != pc->pc_pmap) {
 2383                         if (pmap != NULL) {
 2384                                 pmap_invalidate_all(pmap);
 2385                                 if (pmap != locked_pmap)
 2386                                         PMAP_UNLOCK(pmap);
 2387                         }
 2388                         pmap = pc->pc_pmap;
 2389                         /* Avoid deadlock and lock recursion. */
 2390                         if (pmap > locked_pmap)
 2391                                 PMAP_LOCK(pmap);
 2392                         else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
 2393                                 pmap = NULL;
 2394                                 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
 2395                                 continue;
 2396                         }
 2397                 }
 2398 
 2399                 /*
 2400                  * Destroy every non-wired, 4 KB page mapping in the chunk.
 2401                  */
 2402                 freed = 0;
 2403                 for (field = 0; field < _NPCM; field++) {
 2404                         for (inuse = ~pc->pc_map[field] & pc_freemask[field];
 2405                             inuse != 0; inuse &= ~(1UL << bit)) {
 2406                                 bit = bsfl(inuse);
 2407                                 pv = &pc->pc_pventry[field * 32 + bit];
 2408                                 va = pv->pv_va;
 2409                                 pde = pmap_pde(pmap, va);
 2410                                 if ((*pde & PG_PS) != 0)
 2411                                         continue;
 2412                                 pte = pmap_pte(pmap, va);
 2413                                 tpte = *pte;
 2414                                 if ((tpte & PG_W) == 0)
 2415                                         tpte = pte_load_clear(pte);
 2416                                 pmap_pte_release(pte);
 2417                                 if ((tpte & PG_W) != 0)
 2418                                         continue;
 2419                                 KASSERT(tpte != 0,
 2420                                     ("pmap_pv_reclaim: pmap %p va %x zero pte",
 2421                                     pmap, va));
 2422                                 if ((tpte & PG_G) != 0)
 2423                                         pmap_invalidate_page(pmap, va);
 2424                                 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
 2425                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 2426                                         vm_page_dirty(m);
 2427                                 if ((tpte & PG_A) != 0)
 2428                                         vm_page_aflag_set(m, PGA_REFERENCED);
 2429                                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 2430                                 if (TAILQ_EMPTY(&m->md.pv_list) &&
 2431                                     (m->flags & PG_FICTITIOUS) == 0) {
 2432                                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 2433                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
 2434                                                 vm_page_aflag_clear(m,
 2435                                                     PGA_WRITEABLE);
 2436                                         }
 2437                                 }
 2438                                 pc->pc_map[field] |= 1UL << bit;
 2439                                 pmap_unuse_pt(pmap, va, &free);
 2440                                 freed++;
 2441                         }
 2442                 }
 2443                 if (freed == 0) {
 2444                         TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
 2445                         continue;
 2446                 }
 2447                 /* Every freed mapping is for a 4 KB page. */
 2448                 pmap->pm_stats.resident_count -= freed;
 2449                 PV_STAT(pv_entry_frees += freed);
 2450                 PV_STAT(pv_entry_spare += freed);
 2451                 pv_entry_count -= freed;
 2452                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 2453                 for (field = 0; field < _NPCM; field++)
 2454                         if (pc->pc_map[field] != pc_freemask[field]) {
 2455                                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
 2456                                     pc_list);
 2457                                 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
 2458 
 2459                                 /*
 2460                                  * One freed pv entry in locked_pmap is
 2461                                  * sufficient.
 2462                                  */
 2463                                 if (pmap == locked_pmap)
 2464                                         goto out;
 2465                                 break;
 2466                         }
 2467                 if (field == _NPCM) {
 2468                         PV_STAT(pv_entry_spare -= _NPCPV);
 2469                         PV_STAT(pc_chunk_count--);
 2470                         PV_STAT(pc_chunk_frees++);
 2471                         /* Entire chunk is free; return it. */
 2472                         m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
 2473                         pmap_qremove((vm_offset_t)pc, 1);
 2474                         pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
 2475                         break;
 2476                 }
 2477         }
 2478 out:
 2479         TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
 2480         if (pmap != NULL) {
 2481                 pmap_invalidate_all(pmap);
 2482                 if (pmap != locked_pmap)
 2483                         PMAP_UNLOCK(pmap);
 2484         }
 2485         if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) {
 2486                 m_pc = SLIST_FIRST(&free);
 2487                 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
 2488                 /* Recycle a freed page table page. */
 2489                 m_pc->wire_count = 1;
 2490         }
 2491         vm_page_free_pages_toq(&free, true);
 2492         return (m_pc);
 2493 }
 2494 
 2495 /*
 2496  * free the pv_entry back to the free list
 2497  */
 2498 static void
 2499 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 2500 {
 2501         struct pv_chunk *pc;
 2502         int idx, field, bit;
 2503 
 2504         rw_assert(&pvh_global_lock, RA_WLOCKED);
 2505         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2506         PV_STAT(pv_entry_frees++);
 2507         PV_STAT(pv_entry_spare++);
 2508         pv_entry_count--;
 2509         pc = pv_to_chunk(pv);
 2510         idx = pv - &pc->pc_pventry[0];
 2511         field = idx / 32;
 2512         bit = idx % 32;
 2513         pc->pc_map[field] |= 1ul << bit;
 2514         for (idx = 0; idx < _NPCM; idx++)
 2515                 if (pc->pc_map[idx] != pc_freemask[idx]) {
 2516                         /*
 2517                          * 98% of the time, pc is already at the head of the
 2518                          * list.  If it isn't already, move it to the head.
 2519                          */
 2520                         if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
 2521                             pc)) {
 2522                                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 2523                                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
 2524                                     pc_list);
 2525                         }
 2526                         return;
 2527                 }
 2528         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 2529         free_pv_chunk(pc);
 2530 }
 2531 
 2532 static void
 2533 free_pv_chunk(struct pv_chunk *pc)
 2534 {
 2535         vm_page_t m;
 2536 
 2537         TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 2538         PV_STAT(pv_entry_spare -= _NPCPV);
 2539         PV_STAT(pc_chunk_count--);
 2540         PV_STAT(pc_chunk_frees++);
 2541         /* entire chunk is free, return it */
 2542         m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
 2543         pmap_qremove((vm_offset_t)pc, 1);
 2544         vm_page_unwire(m, PQ_NONE);
 2545         vm_page_free(m);
 2546         pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
 2547 }
 2548 
 2549 /*
 2550  * get a new pv_entry, allocating a block from the system
 2551  * when needed.
 2552  */
 2553 static pv_entry_t
 2554 get_pv_entry(pmap_t pmap, boolean_t try)
 2555 {
 2556         static const struct timeval printinterval = { 60, 0 };
 2557         static struct timeval lastprint;
 2558         int bit, field;
 2559         pv_entry_t pv;
 2560         struct pv_chunk *pc;
 2561         vm_page_t m;
 2562 
 2563         rw_assert(&pvh_global_lock, RA_WLOCKED);
 2564         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2565         PV_STAT(pv_entry_allocs++);
 2566         pv_entry_count++;
 2567         if (pv_entry_count > pv_entry_high_water)
 2568                 if (ratecheck(&lastprint, &printinterval))
 2569                         printf("Approaching the limit on PV entries, consider "
 2570                             "increasing either the vm.pmap.shpgperproc or the "
 2571                             "vm.pmap.pv_entry_max tunable.\n");
 2572 retry:
 2573         pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 2574         if (pc != NULL) {
 2575                 for (field = 0; field < _NPCM; field++) {
 2576                         if (pc->pc_map[field]) {
 2577                                 bit = bsfl(pc->pc_map[field]);
 2578                                 break;
 2579                         }
 2580                 }
 2581                 if (field < _NPCM) {
 2582                         pv = &pc->pc_pventry[field * 32 + bit];
 2583                         pc->pc_map[field] &= ~(1ul << bit);
 2584                         /* If this was the last item, move it to tail */
 2585                         for (field = 0; field < _NPCM; field++)
 2586                                 if (pc->pc_map[field] != 0) {
 2587                                         PV_STAT(pv_entry_spare--);
 2588                                         return (pv);    /* not full, return */
 2589                                 }
 2590                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 2591                         TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 2592                         PV_STAT(pv_entry_spare--);
 2593                         return (pv);
 2594                 }
 2595         }
 2596         /*
 2597          * Access to the ptelist "pv_vafree" is synchronized by the pvh
 2598          * global lock.  If "pv_vafree" is currently non-empty, it will
 2599          * remain non-empty until pmap_ptelist_alloc() completes.
 2600          */
 2601         if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 2602             VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 2603                 if (try) {
 2604                         pv_entry_count--;
 2605                         PV_STAT(pc_chunk_tryfail++);
 2606                         return (NULL);
 2607                 }
 2608                 m = pmap_pv_reclaim(pmap);
 2609                 if (m == NULL)
 2610                         goto retry;
 2611         }
 2612         PV_STAT(pc_chunk_count++);
 2613         PV_STAT(pc_chunk_allocs++);
 2614         pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
 2615         pmap_qenter((vm_offset_t)pc, &m, 1);
 2616         pc->pc_pmap = pmap;
 2617         pc->pc_map[0] = pc_freemask[0] & ~1ul;  /* preallocated bit 0 */
 2618         for (field = 1; field < _NPCM; field++)
 2619                 pc->pc_map[field] = pc_freemask[field];
 2620         TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 2621         pv = &pc->pc_pventry[0];
 2622         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 2623         PV_STAT(pv_entry_spare += _NPCPV - 1);
 2624         return (pv);
 2625 }
 2626 
 2627 static __inline pv_entry_t
 2628 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 2629 {
 2630         pv_entry_t pv;
 2631 
 2632         rw_assert(&pvh_global_lock, RA_WLOCKED);
 2633         TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 2634                 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
 2635                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 2636                         break;
 2637                 }
 2638         }
 2639         return (pv);
 2640 }
 2641 
 2642 static void
 2643 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
 2644 {
 2645         struct md_page *pvh;
 2646         pv_entry_t pv;
 2647         vm_offset_t va_last;
 2648         vm_page_t m;
 2649 
 2650         rw_assert(&pvh_global_lock, RA_WLOCKED);
 2651         KASSERT((pa & PDRMASK) == 0,
 2652             ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
 2653 
 2654         /*
 2655          * Transfer the 4mpage's pv entry for this mapping to the first
 2656          * page's pv list.
 2657          */
 2658         pvh = pa_to_pvh(pa);
 2659         va = trunc_4mpage(va);
 2660         pv = pmap_pvh_remove(pvh, pmap, va);
 2661         KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
 2662         m = PHYS_TO_VM_PAGE(pa);
 2663         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 2664         /* Instantiate the remaining NPTEPG - 1 pv entries. */
 2665         va_last = va + NBPDR - PAGE_SIZE;
 2666         do {
 2667                 m++;
 2668                 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 2669                     ("pmap_pv_demote_pde: page %p is not managed", m));
 2670                 va += PAGE_SIZE;
 2671                 pmap_insert_entry(pmap, va, m);
 2672         } while (va < va_last);
 2673 }
 2674 
 2675 #if VM_NRESERVLEVEL > 0
 2676 static void
 2677 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
 2678 {
 2679         struct md_page *pvh;
 2680         pv_entry_t pv;
 2681         vm_offset_t va_last;
 2682         vm_page_t m;
 2683 
 2684         rw_assert(&pvh_global_lock, RA_WLOCKED);
 2685         KASSERT((pa & PDRMASK) == 0,
 2686             ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
 2687 
 2688         /*
 2689          * Transfer the first page's pv entry for this mapping to the
 2690          * 4mpage's pv list.  Aside from avoiding the cost of a call
 2691          * to get_pv_entry(), a transfer avoids the possibility that
 2692          * get_pv_entry() calls pmap_collect() and that pmap_collect()
 2693          * removes one of the mappings that is being promoted.
 2694          */
 2695         m = PHYS_TO_VM_PAGE(pa);
 2696         va = trunc_4mpage(va);
 2697         pv = pmap_pvh_remove(&m->md, pmap, va);
 2698         KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
 2699         pvh = pa_to_pvh(pa);
 2700         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 2701         /* Free the remaining NPTEPG - 1 pv entries. */
 2702         va_last = va + NBPDR - PAGE_SIZE;
 2703         do {
 2704                 m++;
 2705                 va += PAGE_SIZE;
 2706                 pmap_pvh_free(&m->md, pmap, va);
 2707         } while (va < va_last);
 2708 }
 2709 #endif /* VM_NRESERVLEVEL > 0 */
 2710 
 2711 static void
 2712 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 2713 {
 2714         pv_entry_t pv;
 2715 
 2716         pv = pmap_pvh_remove(pvh, pmap, va);
 2717         KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
 2718         free_pv_entry(pmap, pv);
 2719 }
 2720 
 2721 static void
 2722 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
 2723 {
 2724         struct md_page *pvh;
 2725 
 2726         rw_assert(&pvh_global_lock, RA_WLOCKED);
 2727         pmap_pvh_free(&m->md, pmap, va);
 2728         if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
 2729                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 2730                 if (TAILQ_EMPTY(&pvh->pv_list))
 2731                         vm_page_aflag_clear(m, PGA_WRITEABLE);
 2732         }
 2733 }
 2734 
 2735 /*
 2736  * Create a pv entry for page at pa for
 2737  * (pmap, va).
 2738  */
 2739 static void
 2740 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
 2741 {
 2742         pv_entry_t pv;
 2743 
 2744         rw_assert(&pvh_global_lock, RA_WLOCKED);
 2745         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2746         pv = get_pv_entry(pmap, FALSE);
 2747         pv->pv_va = va;
 2748         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 2749 }
 2750 
 2751 /*
 2752  * Conditionally create a pv entry.
 2753  */
 2754 static boolean_t
 2755 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
 2756 {
 2757         pv_entry_t pv;
 2758 
 2759         rw_assert(&pvh_global_lock, RA_WLOCKED);
 2760         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2761         if (pv_entry_count < pv_entry_high_water && 
 2762             (pv = get_pv_entry(pmap, TRUE)) != NULL) {
 2763                 pv->pv_va = va;
 2764                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 2765                 return (TRUE);
 2766         } else
 2767                 return (FALSE);
 2768 }
 2769 
 2770 /*
 2771  * Create the pv entries for each of the pages within a superpage.
 2772  */
 2773 static bool
 2774 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags)
 2775 {
 2776         struct md_page *pvh;
 2777         pv_entry_t pv;
 2778         bool noreclaim;
 2779 
 2780         rw_assert(&pvh_global_lock, RA_WLOCKED);
 2781         noreclaim = (flags & PMAP_ENTER_NORECLAIM) != 0;
 2782         if ((noreclaim && pv_entry_count >= pv_entry_high_water) ||
 2783             (pv = get_pv_entry(pmap, noreclaim)) == NULL)
 2784                 return (false);
 2785         pv->pv_va = va;
 2786         pvh = pa_to_pvh(pde & PG_PS_FRAME);
 2787         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 2788         return (true);
 2789 }
 2790 
 2791 /*
 2792  * Fills a page table page with mappings to consecutive physical pages.
 2793  */
 2794 static void
 2795 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
 2796 {
 2797         pt_entry_t *pte;
 2798 
 2799         for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
 2800                 *pte = newpte;  
 2801                 newpte += PAGE_SIZE;
 2802         }
 2803 }
 2804 
 2805 /*
 2806  * Tries to demote a 2- or 4MB page mapping.  If demotion fails, the
 2807  * 2- or 4MB page mapping is invalidated.
 2808  */
 2809 static boolean_t
 2810 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 2811 {
 2812         pd_entry_t newpde, oldpde;
 2813         pt_entry_t *firstpte, newpte;
 2814         vm_paddr_t mptepa;
 2815         vm_page_t mpte;
 2816         struct spglist free;
 2817         vm_offset_t sva;
 2818 
 2819         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2820         oldpde = *pde;
 2821         KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
 2822             ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
 2823         if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
 2824             NULL) {
 2825                 KASSERT((oldpde & PG_W) == 0,
 2826                     ("pmap_demote_pde: page table page for a wired mapping"
 2827                     " is missing"));
 2828 
 2829                 /*
 2830                  * Invalidate the 2- or 4MB page mapping and return
 2831                  * "failure" if the mapping was never accessed or the
 2832                  * allocation of the new page table page fails.
 2833                  */
 2834                 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
 2835                     va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
 2836                     VM_ALLOC_WIRED)) == NULL) {
 2837                         SLIST_INIT(&free);
 2838                         sva = trunc_4mpage(va);
 2839                         pmap_remove_pde(pmap, pde, sva, &free);
 2840                         if ((oldpde & PG_G) == 0)
 2841                                 pmap_invalidate_pde_page(pmap, sva, oldpde);
 2842                         vm_page_free_pages_toq(&free, true);
 2843                         CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
 2844                             " in pmap %p", va, pmap);
 2845                         return (FALSE);
 2846                 }
 2847                 if (pmap != kernel_pmap)
 2848                         pmap->pm_stats.resident_count++;
 2849         }
 2850         mptepa = VM_PAGE_TO_PHYS(mpte);
 2851 
 2852         /*
 2853          * If the page mapping is in the kernel's address space, then the
 2854          * KPTmap can provide access to the page table page.  Otherwise,
 2855          * temporarily map the page table page (mpte) into the kernel's
 2856          * address space at either PADDR1 or PADDR2. 
 2857          */
 2858         if (pmap == kernel_pmap)
 2859                 firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
 2860         else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) {
 2861                 if ((*PMAP1 & PG_FRAME) != mptepa) {
 2862                         *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
 2863 #ifdef SMP
 2864                         PMAP1cpu = PCPU_GET(cpuid);
 2865 #endif
 2866                         invlcaddr(PADDR1);
 2867                         PMAP1changed++;
 2868                 } else
 2869 #ifdef SMP
 2870                 if (PMAP1cpu != PCPU_GET(cpuid)) {
 2871                         PMAP1cpu = PCPU_GET(cpuid);
 2872                         invlcaddr(PADDR1);
 2873                         PMAP1changedcpu++;
 2874                 } else
 2875 #endif
 2876                         PMAP1unchanged++;
 2877                 firstpte = PADDR1;
 2878         } else {
 2879                 mtx_lock(&PMAP2mutex);
 2880                 if ((*PMAP2 & PG_FRAME) != mptepa) {
 2881                         *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
 2882                         pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
 2883                 }
 2884                 firstpte = PADDR2;
 2885         }
 2886         newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
 2887         KASSERT((oldpde & PG_A) != 0,
 2888             ("pmap_demote_pde: oldpde is missing PG_A"));
 2889         KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
 2890             ("pmap_demote_pde: oldpde is missing PG_M"));
 2891         newpte = oldpde & ~PG_PS;
 2892         if ((newpte & PG_PDE_PAT) != 0)
 2893                 newpte ^= PG_PDE_PAT | PG_PTE_PAT;
 2894 
 2895         /*
 2896          * If the page table page is new, initialize it.
 2897          */
 2898         if (mpte->wire_count == 1) {
 2899                 mpte->wire_count = NPTEPG;
 2900                 pmap_fill_ptp(firstpte, newpte);
 2901         }
 2902         KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
 2903             ("pmap_demote_pde: firstpte and newpte map different physical"
 2904             " addresses"));
 2905 
 2906         /*
 2907          * If the mapping has changed attributes, update the page table
 2908          * entries.
 2909          */ 
 2910         if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
 2911                 pmap_fill_ptp(firstpte, newpte);
 2912         
 2913         /*
 2914          * Demote the mapping.  This pmap is locked.  The old PDE has
 2915          * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
 2916          * set.  Thus, there is no danger of a race with another
 2917          * processor changing the setting of PG_A and/or PG_M between
 2918          * the read above and the store below. 
 2919          */
 2920         if (workaround_erratum383)
 2921                 pmap_update_pde(pmap, va, pde, newpde);
 2922         else if (pmap == kernel_pmap)
 2923                 pmap_kenter_pde(va, newpde);
 2924         else
 2925                 pde_store(pde, newpde); 
 2926         if (firstpte == PADDR2)
 2927                 mtx_unlock(&PMAP2mutex);
 2928 
 2929         /*
 2930          * Invalidate the recursive mapping of the page table page.
 2931          */
 2932         pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
 2933 
 2934         /*
 2935          * Demote the pv entry.  This depends on the earlier demotion
 2936          * of the mapping.  Specifically, the (re)creation of a per-
 2937          * page pv entry might trigger the execution of pmap_collect(),
 2938          * which might reclaim a newly (re)created per-page pv entry
 2939          * and destroy the associated mapping.  In order to destroy
 2940          * the mapping, the PDE must have already changed from mapping
 2941          * the 2mpage to referencing the page table page.
 2942          */
 2943         if ((oldpde & PG_MANAGED) != 0)
 2944                 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
 2945 
 2946         pmap_pde_demotions++;
 2947         CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
 2948             " in pmap %p", va, pmap);
 2949         return (TRUE);
 2950 }
 2951 
 2952 /*
 2953  * Removes a 2- or 4MB page mapping from the kernel pmap.
 2954  */
 2955 static void
 2956 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 2957 {
 2958         pd_entry_t newpde;
 2959         vm_paddr_t mptepa;
 2960         vm_page_t mpte;
 2961 
 2962         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2963         mpte = pmap_remove_pt_page(pmap, va);
 2964         if (mpte == NULL)
 2965                 panic("pmap_remove_kernel_pde: Missing pt page.");
 2966 
 2967         mptepa = VM_PAGE_TO_PHYS(mpte);
 2968         newpde = mptepa | PG_M | PG_A | PG_RW | PG_V;
 2969 
 2970         /*
 2971          * Initialize the page table page.
 2972          */
 2973         pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]);
 2974 
 2975         /*
 2976          * Remove the mapping.
 2977          */
 2978         if (workaround_erratum383)
 2979                 pmap_update_pde(pmap, va, pde, newpde);
 2980         else 
 2981                 pmap_kenter_pde(va, newpde);
 2982 
 2983         /*
 2984          * Invalidate the recursive mapping of the page table page.
 2985          */
 2986         pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
 2987 }
 2988 
 2989 /*
 2990  * pmap_remove_pde: do the things to unmap a superpage in a process
 2991  */
 2992 static void
 2993 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
 2994     struct spglist *free)
 2995 {
 2996         struct md_page *pvh;
 2997         pd_entry_t oldpde;
 2998         vm_offset_t eva, va;
 2999         vm_page_t m, mpte;
 3000 
 3001         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3002         KASSERT((sva & PDRMASK) == 0,
 3003             ("pmap_remove_pde: sva is not 4mpage aligned"));
 3004         oldpde = pte_load_clear(pdq);
 3005         if (oldpde & PG_W)
 3006                 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
 3007 
 3008         /*
 3009          * Machines that don't support invlpg, also don't support
 3010          * PG_G.
 3011          */
 3012         if ((oldpde & PG_G) != 0)
 3013                 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
 3014 
 3015         pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
 3016         if (oldpde & PG_MANAGED) {
 3017                 pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
 3018                 pmap_pvh_free(pvh, pmap, sva);
 3019                 eva = sva + NBPDR;
 3020                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 3021                     va < eva; va += PAGE_SIZE, m++) {
 3022                         if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
 3023                                 vm_page_dirty(m);
 3024                         if (oldpde & PG_A)
 3025                                 vm_page_aflag_set(m, PGA_REFERENCED);
 3026                         if (TAILQ_EMPTY(&m->md.pv_list) &&
 3027                             TAILQ_EMPTY(&pvh->pv_list))
 3028                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
 3029                 }
 3030         }
 3031         if (pmap == kernel_pmap) {
 3032                 pmap_remove_kernel_pde(pmap, pdq, sva);
 3033         } else {
 3034                 mpte = pmap_remove_pt_page(pmap, sva);
 3035                 if (mpte != NULL) {
 3036                         pmap->pm_stats.resident_count--;
 3037                         KASSERT(mpte->wire_count == NPTEPG,
 3038                             ("pmap_remove_pde: pte page wire count error"));
 3039                         mpte->wire_count = 0;
 3040                         pmap_add_delayed_free_list(mpte, free, FALSE);
 3041                 }
 3042         }
 3043 }
 3044 
 3045 /*
 3046  * pmap_remove_pte: do the things to unmap a page in a process
 3047  */
 3048 static int
 3049 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
 3050     struct spglist *free)
 3051 {
 3052         pt_entry_t oldpte;
 3053         vm_page_t m;
 3054 
 3055         rw_assert(&pvh_global_lock, RA_WLOCKED);
 3056         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3057         oldpte = pte_load_clear(ptq);
 3058         KASSERT(oldpte != 0,
 3059             ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va));
 3060         if (oldpte & PG_W)
 3061                 pmap->pm_stats.wired_count -= 1;
 3062         /*
 3063          * Machines that don't support invlpg, also don't support
 3064          * PG_G.
 3065          */
 3066         if (oldpte & PG_G)
 3067                 pmap_invalidate_page(kernel_pmap, va);
 3068         pmap->pm_stats.resident_count -= 1;
 3069         if (oldpte & PG_MANAGED) {
 3070                 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
 3071                 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 3072                         vm_page_dirty(m);
 3073                 if (oldpte & PG_A)
 3074                         vm_page_aflag_set(m, PGA_REFERENCED);
 3075                 pmap_remove_entry(pmap, m, va);
 3076         }
 3077         return (pmap_unuse_pt(pmap, va, free));
 3078 }
 3079 
 3080 /*
 3081  * Remove a single page from a process address space
 3082  */
 3083 static void
 3084 pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free)
 3085 {
 3086         pt_entry_t *pte;
 3087 
 3088         rw_assert(&pvh_global_lock, RA_WLOCKED);
 3089         KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 3090         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3091         if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
 3092                 return;
 3093         pmap_remove_pte(pmap, pte, va, free);
 3094         pmap_invalidate_page(pmap, va);
 3095 }
 3096 
 3097 /*
 3098  * Removes the specified range of addresses from the page table page.
 3099  */
 3100 static bool
 3101 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
 3102     struct spglist *free)
 3103 {
 3104         pt_entry_t *pte;
 3105         bool anyvalid;
 3106 
 3107         rw_assert(&pvh_global_lock, RA_WLOCKED);
 3108         KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 3109         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3110         anyvalid = false;
 3111         for (pte = pmap_pte_quick(pmap, sva); sva != eva; pte++,
 3112             sva += PAGE_SIZE) {
 3113                 if (*pte == 0)
 3114                         continue;
 3115 
 3116                 /*
 3117                  * The TLB entry for a PG_G mapping is invalidated by
 3118                  * pmap_remove_pte().
 3119                  */
 3120                 if ((*pte & PG_G) == 0)
 3121                         anyvalid = true;
 3122 
 3123                 if (pmap_remove_pte(pmap, pte, sva, free))
 3124                         break;
 3125         }
 3126         return (anyvalid);
 3127 }
 3128 
 3129 /*
 3130  *      Remove the given range of addresses from the specified map.
 3131  *
 3132  *      It is assumed that the start and end are properly
 3133  *      rounded to the page size.
 3134  */
 3135 void
 3136 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 3137 {
 3138         vm_offset_t pdnxt;
 3139         pd_entry_t ptpaddr;
 3140         struct spglist free;
 3141         int anyvalid;
 3142 
 3143         /*
 3144          * Perform an unsynchronized read.  This is, however, safe.
 3145          */
 3146         if (pmap->pm_stats.resident_count == 0)
 3147                 return;
 3148 
 3149         anyvalid = 0;
 3150         SLIST_INIT(&free);
 3151 
 3152         rw_wlock(&pvh_global_lock);
 3153         sched_pin();
 3154         PMAP_LOCK(pmap);
 3155 
 3156         /*
 3157          * special handling of removing one page.  a very
 3158          * common operation and easy to short circuit some
 3159          * code.
 3160          */
 3161         if ((sva + PAGE_SIZE == eva) && 
 3162             ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
 3163                 pmap_remove_page(pmap, sva, &free);
 3164                 goto out;
 3165         }
 3166 
 3167         for (; sva < eva; sva = pdnxt) {
 3168                 u_int pdirindex;
 3169 
 3170                 /*
 3171                  * Calculate index for next page table.
 3172                  */
 3173                 pdnxt = (sva + NBPDR) & ~PDRMASK;
 3174                 if (pdnxt < sva)
 3175                         pdnxt = eva;
 3176                 if (pmap->pm_stats.resident_count == 0)
 3177                         break;
 3178 
 3179                 pdirindex = sva >> PDRSHIFT;
 3180                 ptpaddr = pmap->pm_pdir[pdirindex];
 3181 
 3182                 /*
 3183                  * Weed out invalid mappings. Note: we assume that the page
 3184                  * directory table is always allocated, and in kernel virtual.
 3185                  */
 3186                 if (ptpaddr == 0)
 3187                         continue;
 3188 
 3189                 /*
 3190                  * Check for large page.
 3191                  */
 3192                 if ((ptpaddr & PG_PS) != 0) {
 3193                         /*
 3194                          * Are we removing the entire large page?  If not,
 3195                          * demote the mapping and fall through.
 3196                          */
 3197                         if (sva + NBPDR == pdnxt && eva >= pdnxt) {
 3198                                 /*
 3199                                  * The TLB entry for a PG_G mapping is
 3200                                  * invalidated by pmap_remove_pde().
 3201                                  */
 3202                                 if ((ptpaddr & PG_G) == 0)
 3203                                         anyvalid = 1;
 3204                                 pmap_remove_pde(pmap,
 3205                                     &pmap->pm_pdir[pdirindex], sva, &free);
 3206                                 continue;
 3207                         } else if (!pmap_demote_pde(pmap,
 3208                             &pmap->pm_pdir[pdirindex], sva)) {
 3209                                 /* The large page mapping was destroyed. */
 3210                                 continue;
 3211                         }
 3212                 }
 3213 
 3214                 /*
 3215                  * Limit our scan to either the end of the va represented
 3216                  * by the current page table page, or to the end of the
 3217                  * range being removed.
 3218                  */
 3219                 if (pdnxt > eva)
 3220                         pdnxt = eva;
 3221 
 3222                 if (pmap_remove_ptes(pmap, sva, pdnxt, &free))
 3223                         anyvalid = 1;
 3224         }
 3225 out:
 3226         sched_unpin();
 3227         if (anyvalid)
 3228                 pmap_invalidate_all(pmap);
 3229         rw_wunlock(&pvh_global_lock);
 3230         PMAP_UNLOCK(pmap);
 3231         vm_page_free_pages_toq(&free, true);
 3232 }
 3233 
 3234 /*
 3235  *      Routine:        pmap_remove_all
 3236  *      Function:
 3237  *              Removes this physical page from
 3238  *              all physical maps in which it resides.
 3239  *              Reflects back modify bits to the pager.
 3240  *
 3241  *      Notes:
 3242  *              Original versions of this routine were very
 3243  *              inefficient because they iteratively called
 3244  *              pmap_remove (slow...)
 3245  */
 3246 
 3247 void
 3248 pmap_remove_all(vm_page_t m)
 3249 {
 3250         struct md_page *pvh;
 3251         pv_entry_t pv;
 3252         pmap_t pmap;
 3253         pt_entry_t *pte, tpte;
 3254         pd_entry_t *pde;
 3255         vm_offset_t va;
 3256         struct spglist free;
 3257 
 3258         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 3259             ("pmap_remove_all: page %p is not managed", m));
 3260         SLIST_INIT(&free);
 3261         rw_wlock(&pvh_global_lock);
 3262         sched_pin();
 3263         if ((m->flags & PG_FICTITIOUS) != 0)
 3264                 goto small_mappings;
 3265         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 3266         while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
 3267                 va = pv->pv_va;
 3268                 pmap = PV_PMAP(pv);
 3269                 PMAP_LOCK(pmap);
 3270                 pde = pmap_pde(pmap, va);
 3271                 (void)pmap_demote_pde(pmap, pde, va);
 3272                 PMAP_UNLOCK(pmap);
 3273         }
 3274 small_mappings:
 3275         while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 3276                 pmap = PV_PMAP(pv);
 3277                 PMAP_LOCK(pmap);
 3278                 pmap->pm_stats.resident_count--;
 3279                 pde = pmap_pde(pmap, pv->pv_va);
 3280                 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
 3281                     " a 4mpage in page %p's pv list", m));
 3282                 pte = pmap_pte_quick(pmap, pv->pv_va);
 3283                 tpte = pte_load_clear(pte);
 3284                 KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte",
 3285                     pmap, pv->pv_va));
 3286                 if (tpte & PG_W)
 3287                         pmap->pm_stats.wired_count--;
 3288                 if (tpte & PG_A)
 3289                         vm_page_aflag_set(m, PGA_REFERENCED);
 3290 
 3291                 /*
 3292                  * Update the vm_page_t clean and reference bits.
 3293                  */
 3294                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 3295                         vm_page_dirty(m);
 3296                 pmap_unuse_pt(pmap, pv->pv_va, &free);
 3297                 pmap_invalidate_page(pmap, pv->pv_va);
 3298                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 3299                 free_pv_entry(pmap, pv);
 3300                 PMAP_UNLOCK(pmap);
 3301         }
 3302         vm_page_aflag_clear(m, PGA_WRITEABLE);
 3303         sched_unpin();
 3304         rw_wunlock(&pvh_global_lock);
 3305         vm_page_free_pages_toq(&free, true);
 3306 }
 3307 
 3308 /*
 3309  * pmap_protect_pde: do the things to protect a 4mpage in a process
 3310  */
 3311 static boolean_t
 3312 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
 3313 {
 3314         pd_entry_t newpde, oldpde;
 3315         vm_offset_t eva, va;
 3316         vm_page_t m;
 3317         boolean_t anychanged;
 3318 
 3319         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3320         KASSERT((sva & PDRMASK) == 0,
 3321             ("pmap_protect_pde: sva is not 4mpage aligned"));
 3322         anychanged = FALSE;
 3323 retry:
 3324         oldpde = newpde = *pde;
 3325         if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
 3326             (PG_MANAGED | PG_M | PG_RW)) {
 3327                 eva = sva + NBPDR;
 3328                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 3329                     va < eva; va += PAGE_SIZE, m++)
 3330                         vm_page_dirty(m);
 3331         }
 3332         if ((prot & VM_PROT_WRITE) == 0)
 3333                 newpde &= ~(PG_RW | PG_M);
 3334 #if defined(PAE) || defined(PAE_TABLES)
 3335         if ((prot & VM_PROT_EXECUTE) == 0)
 3336                 newpde |= pg_nx;
 3337 #endif
 3338         if (newpde != oldpde) {
 3339                 /*
 3340                  * As an optimization to future operations on this PDE, clear
 3341                  * PG_PROMOTED.  The impending invalidation will remove any
 3342                  * lingering 4KB page mappings from the TLB.
 3343                  */
 3344                 if (!pde_cmpset(pde, oldpde, newpde & ~PG_PROMOTED))
 3345                         goto retry;
 3346                 if ((oldpde & PG_G) != 0)
 3347                         pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
 3348                 else
 3349                         anychanged = TRUE;
 3350         }
 3351         return (anychanged);
 3352 }
 3353 
 3354 /*
 3355  *      Set the physical protection on the
 3356  *      specified range of this map as requested.
 3357  */
 3358 void
 3359 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 3360 {
 3361         vm_offset_t pdnxt;
 3362         pd_entry_t ptpaddr;
 3363         pt_entry_t *pte;
 3364         boolean_t anychanged, pv_lists_locked;
 3365 
 3366         KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
 3367         if (prot == VM_PROT_NONE) {
 3368                 pmap_remove(pmap, sva, eva);
 3369                 return;
 3370         }
 3371 
 3372 #if defined(PAE) || defined(PAE_TABLES)
 3373         if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
 3374             (VM_PROT_WRITE|VM_PROT_EXECUTE))
 3375                 return;
 3376 #else
 3377         if (prot & VM_PROT_WRITE)
 3378                 return;
 3379 #endif
 3380 
 3381         if (pmap_is_current(pmap))
 3382                 pv_lists_locked = FALSE;
 3383         else {
 3384                 pv_lists_locked = TRUE;
 3385 resume:
 3386                 rw_wlock(&pvh_global_lock);
 3387                 sched_pin();
 3388         }
 3389         anychanged = FALSE;
 3390 
 3391         PMAP_LOCK(pmap);
 3392         for (; sva < eva; sva = pdnxt) {
 3393                 pt_entry_t obits, pbits;
 3394                 u_int pdirindex;
 3395 
 3396                 pdnxt = (sva + NBPDR) & ~PDRMASK;
 3397                 if (pdnxt < sva)
 3398                         pdnxt = eva;
 3399 
 3400                 pdirindex = sva >> PDRSHIFT;
 3401                 ptpaddr = pmap->pm_pdir[pdirindex];
 3402 
 3403                 /*
 3404                  * Weed out invalid mappings. Note: we assume that the page
 3405                  * directory table is always allocated, and in kernel virtual.
 3406                  */
 3407                 if (ptpaddr == 0)
 3408                         continue;
 3409 
 3410                 /*
 3411                  * Check for large page.
 3412                  */
 3413                 if ((ptpaddr & PG_PS) != 0) {
 3414                         /*
 3415                          * Are we protecting the entire large page?  If not,
 3416                          * demote the mapping and fall through.
 3417                          */
 3418                         if (sva + NBPDR == pdnxt && eva >= pdnxt) {
 3419                                 /*
 3420                                  * The TLB entry for a PG_G mapping is
 3421                                  * invalidated by pmap_protect_pde().
 3422                                  */
 3423                                 if (pmap_protect_pde(pmap,
 3424                                     &pmap->pm_pdir[pdirindex], sva, prot))
 3425                                         anychanged = TRUE;
 3426                                 continue;
 3427                         } else {
 3428                                 if (!pv_lists_locked) {
 3429                                         pv_lists_locked = TRUE;
 3430                                         if (!rw_try_wlock(&pvh_global_lock)) {
 3431                                                 if (anychanged)
 3432                                                         pmap_invalidate_all(
 3433                                                             pmap);
 3434                                                 PMAP_UNLOCK(pmap);
 3435                                                 goto resume;
 3436                                         }
 3437                                         sched_pin();
 3438                                 }
 3439                                 if (!pmap_demote_pde(pmap,
 3440                                     &pmap->pm_pdir[pdirindex], sva)) {
 3441                                         /*
 3442                                          * The large page mapping was
 3443                                          * destroyed.
 3444                                          */
 3445                                         continue;
 3446                                 }
 3447                         }
 3448                 }
 3449 
 3450                 if (pdnxt > eva)
 3451                         pdnxt = eva;
 3452 
 3453                 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
 3454                     sva += PAGE_SIZE) {
 3455                         vm_page_t m;
 3456 
 3457 retry:
 3458                         /*
 3459                          * Regardless of whether a pte is 32 or 64 bits in
 3460                          * size, PG_RW, PG_A, and PG_M are among the least
 3461                          * significant 32 bits.
 3462                          */
 3463                         obits = pbits = *pte;
 3464                         if ((pbits & PG_V) == 0)
 3465                                 continue;
 3466 
 3467                         if ((prot & VM_PROT_WRITE) == 0) {
 3468                                 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
 3469                                     (PG_MANAGED | PG_M | PG_RW)) {
 3470                                         m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
 3471                                         vm_page_dirty(m);
 3472                                 }
 3473                                 pbits &= ~(PG_RW | PG_M);
 3474                         }
 3475 #if defined(PAE) || defined(PAE_TABLES)
 3476                         if ((prot & VM_PROT_EXECUTE) == 0)
 3477                                 pbits |= pg_nx;
 3478 #endif
 3479 
 3480                         if (pbits != obits) {
 3481 #if defined(PAE) || defined(PAE_TABLES)
 3482                                 if (!atomic_cmpset_64(pte, obits, pbits))
 3483                                         goto retry;
 3484 #else
 3485                                 if (!atomic_cmpset_int((u_int *)pte, obits,
 3486                                     pbits))
 3487                                         goto retry;
 3488 #endif
 3489                                 if (obits & PG_G)
 3490                                         pmap_invalidate_page(pmap, sva);
 3491                                 else
 3492                                         anychanged = TRUE;
 3493                         }
 3494                 }
 3495         }
 3496         if (anychanged)
 3497                 pmap_invalidate_all(pmap);
 3498         if (pv_lists_locked) {
 3499                 sched_unpin();
 3500                 rw_wunlock(&pvh_global_lock);
 3501         }
 3502         PMAP_UNLOCK(pmap);
 3503 }
 3504 
 3505 #if VM_NRESERVLEVEL > 0
 3506 /*
 3507  * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are
 3508  * within a single page table page (PTP) to a single 2- or 4MB page mapping.
 3509  * For promotion to occur, two conditions must be met: (1) the 4KB page
 3510  * mappings must map aligned, contiguous physical memory and (2) the 4KB page
 3511  * mappings must have identical characteristics.
 3512  *
 3513  * Managed (PG_MANAGED) mappings within the kernel address space are not
 3514  * promoted.  The reason is that kernel PDEs are replicated in each pmap but
 3515  * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
 3516  * pmap.
 3517  */
 3518 static void
 3519 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 3520 {
 3521         pd_entry_t newpde;
 3522         pt_entry_t *firstpte, oldpte, pa, *pte;
 3523         vm_offset_t oldpteva;
 3524         vm_page_t mpte;
 3525 
 3526         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3527 
 3528         /*
 3529          * Examine the first PTE in the specified PTP.  Abort if this PTE is
 3530          * either invalid, unused, or does not map the first 4KB physical page
 3531          * within a 2- or 4MB page.
 3532          */
 3533         firstpte = pmap_pte_quick(pmap, trunc_4mpage(va));
 3534 setpde:
 3535         newpde = *firstpte;
 3536         if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
 3537                 pmap_pde_p_failures++;
 3538                 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
 3539                     " in pmap %p", va, pmap);
 3540                 return;
 3541         }
 3542         if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
 3543                 pmap_pde_p_failures++;
 3544                 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
 3545                     " in pmap %p", va, pmap);
 3546                 return;
 3547         }
 3548         if ((newpde & (PG_M | PG_RW)) == PG_RW) {
 3549                 /*
 3550                  * When PG_M is already clear, PG_RW can be cleared without
 3551                  * a TLB invalidation.
 3552                  */
 3553                 if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
 3554                     ~PG_RW))  
 3555                         goto setpde;
 3556                 newpde &= ~PG_RW;
 3557         }
 3558 
 3559         /* 
 3560          * Examine each of the other PTEs in the specified PTP.  Abort if this
 3561          * PTE maps an unexpected 4KB physical page or does not have identical
 3562          * characteristics to the first PTE.
 3563          */
 3564         pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
 3565         for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
 3566 setpte:
 3567                 oldpte = *pte;
 3568                 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
 3569                         pmap_pde_p_failures++;
 3570                         CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
 3571                             " in pmap %p", va, pmap);
 3572                         return;
 3573                 }
 3574                 if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
 3575                         /*
 3576                          * When PG_M is already clear, PG_RW can be cleared
 3577                          * without a TLB invalidation.
 3578                          */
 3579                         if (!atomic_cmpset_int((u_int *)pte, oldpte,
 3580                             oldpte & ~PG_RW))
 3581                                 goto setpte;
 3582                         oldpte &= ~PG_RW;
 3583                         oldpteva = (oldpte & PG_FRAME & PDRMASK) |
 3584                             (va & ~PDRMASK);
 3585                         CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
 3586                             " in pmap %p", oldpteva, pmap);
 3587                 }
 3588                 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
 3589                         pmap_pde_p_failures++;
 3590                         CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
 3591                             " in pmap %p", va, pmap);
 3592                         return;
 3593                 }
 3594                 pa -= PAGE_SIZE;
 3595         }
 3596 
 3597         /*
 3598          * Save the page table page in its current state until the PDE
 3599          * mapping the superpage is demoted by pmap_demote_pde() or
 3600          * destroyed by pmap_remove_pde(). 
 3601          */
 3602         mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 3603         KASSERT(mpte >= vm_page_array &&
 3604             mpte < &vm_page_array[vm_page_array_size],
 3605             ("pmap_promote_pde: page table page is out of range"));
 3606         KASSERT(mpte->pindex == va >> PDRSHIFT,
 3607             ("pmap_promote_pde: page table page's pindex is wrong"));
 3608         if (pmap_insert_pt_page(pmap, mpte)) {
 3609                 pmap_pde_p_failures++;
 3610                 CTR2(KTR_PMAP,
 3611                     "pmap_promote_pde: failure for va %#x in pmap %p", va,
 3612                     pmap);
 3613                 return;
 3614         }
 3615 
 3616         /*
 3617          * Promote the pv entries.
 3618          */
 3619         if ((newpde & PG_MANAGED) != 0)
 3620                 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
 3621 
 3622         /*
 3623          * Propagate the PAT index to its proper position.
 3624          */
 3625         if ((newpde & PG_PTE_PAT) != 0)
 3626                 newpde ^= PG_PDE_PAT | PG_PTE_PAT;
 3627 
 3628         /*
 3629          * Map the superpage.
 3630          */
 3631         if (workaround_erratum383)
 3632                 pmap_update_pde(pmap, va, pde, PG_PS | newpde);
 3633         else if (pmap == kernel_pmap)
 3634                 pmap_kenter_pde(va, PG_PROMOTED | PG_PS | newpde);
 3635         else
 3636                 pde_store(pde, PG_PROMOTED | PG_PS | newpde);
 3637 
 3638         pmap_pde_promotions++;
 3639         CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
 3640             " in pmap %p", va, pmap);
 3641 }
 3642 #endif /* VM_NRESERVLEVEL > 0 */
 3643 
 3644 /*
 3645  *      Insert the given physical page (p) at
 3646  *      the specified virtual address (v) in the
 3647  *      target physical map with the protection requested.
 3648  *
 3649  *      If specified, the page will be wired down, meaning
 3650  *      that the related pte can not be reclaimed.
 3651  *
 3652  *      NB:  This is the only routine which MAY NOT lazy-evaluate
 3653  *      or lose information.  That is, this routine must actually
 3654  *      insert this page into the given map NOW.
 3655  */
 3656 int
 3657 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
 3658     u_int flags, int8_t psind)
 3659 {
 3660         pd_entry_t *pde;
 3661         pt_entry_t *pte;
 3662         pt_entry_t newpte, origpte;
 3663         pv_entry_t pv;
 3664         vm_paddr_t opa, pa;
 3665         vm_page_t mpte, om;
 3666         int rv;
 3667 
 3668         va = trunc_page(va);
 3669         KASSERT((pmap == kernel_pmap && va < VM_MAX_KERNEL_ADDRESS) ||
 3670             (pmap != kernel_pmap && va < VM_MAXUSER_ADDRESS),
 3671             ("pmap_enter: toobig k%d %#x", pmap == kernel_pmap, va));
 3672         KASSERT(va < PMAP_TRM_MIN_ADDRESS,
 3673             ("pmap_enter: invalid to pmap_enter into trampoline (va: 0x%x)",
 3674             va));
 3675         KASSERT(pmap != kernel_pmap || (m->oflags & VPO_UNMANAGED) != 0 ||
 3676             va < kmi.clean_sva || va >= kmi.clean_eva,
 3677             ("pmap_enter: managed mapping within the clean submap"));
 3678         if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
 3679                 VM_OBJECT_ASSERT_LOCKED(m->object);
 3680         KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
 3681             ("pmap_enter: flags %u has reserved bits set", flags));
 3682         pa = VM_PAGE_TO_PHYS(m);
 3683         newpte = (pt_entry_t)(pa | PG_A | PG_V);
 3684         if ((flags & VM_PROT_WRITE) != 0)
 3685                 newpte |= PG_M;
 3686         if ((prot & VM_PROT_WRITE) != 0)
 3687                 newpte |= PG_RW;
 3688         KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
 3689             ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
 3690 #if defined(PAE) || defined(PAE_TABLES)
 3691         if ((prot & VM_PROT_EXECUTE) == 0)
 3692                 newpte |= pg_nx;
 3693 #endif
 3694         if ((flags & PMAP_ENTER_WIRED) != 0)
 3695                 newpte |= PG_W;
 3696         if (pmap != kernel_pmap)
 3697                 newpte |= PG_U;
 3698         newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0);
 3699         if ((m->oflags & VPO_UNMANAGED) == 0)
 3700                 newpte |= PG_MANAGED;
 3701 
 3702         rw_wlock(&pvh_global_lock);
 3703         PMAP_LOCK(pmap);
 3704         sched_pin();
 3705         if (psind == 1) {
 3706                 /* Assert the required virtual and physical alignment. */ 
 3707                 KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned"));
 3708                 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
 3709                 rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m);
 3710                 goto out;
 3711         }
 3712 
 3713         pde = pmap_pde(pmap, va);
 3714         if (pmap != kernel_pmap) {
 3715                 /*
 3716                  * va is for UVA.
 3717                  * In the case that a page table page is not resident,
 3718                  * we are creating it here.  pmap_allocpte() handles
 3719                  * demotion.
 3720                  */
 3721                 mpte = pmap_allocpte(pmap, va, flags);
 3722                 if (mpte == NULL) {
 3723                         KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0,
 3724                             ("pmap_allocpte failed with sleep allowed"));
 3725                         rv = KERN_RESOURCE_SHORTAGE;
 3726                         goto out;
 3727                 }
 3728         } else {
 3729                 /*
 3730                  * va is for KVA, so pmap_demote_pde() will never fail
 3731                  * to install a page table page.  PG_V is also
 3732                  * asserted by pmap_demote_pde().
 3733                  */
 3734                 mpte = NULL;
 3735                 KASSERT(pde != NULL && (*pde & PG_V) != 0,
 3736                     ("KVA %#x invalid pde pdir %#jx", va,
 3737                     (uintmax_t)pmap->pm_pdir[PTDPTDI]));
 3738                 if ((*pde & PG_PS) != 0)
 3739                         pmap_demote_pde(pmap, pde, va);
 3740         }
 3741         pte = pmap_pte_quick(pmap, va);
 3742 
 3743         /*
 3744          * Page Directory table entry is not valid, which should not
 3745          * happen.  We should have either allocated the page table
 3746          * page or demoted the existing mapping above.
 3747          */
 3748         if (pte == NULL) {
 3749                 panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
 3750                     (uintmax_t)pmap->pm_pdir[PTDPTDI], va);
 3751         }
 3752 
 3753         origpte = *pte;
 3754         pv = NULL;
 3755 
 3756         /*
 3757          * Is the specified virtual address already mapped?
 3758          */
 3759         if ((origpte & PG_V) != 0) {
 3760                 /*
 3761                  * Wiring change, just update stats. We don't worry about
 3762                  * wiring PT pages as they remain resident as long as there
 3763                  * are valid mappings in them. Hence, if a user page is wired,
 3764                  * the PT page will be also.
 3765                  */
 3766                 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
 3767                         pmap->pm_stats.wired_count++;
 3768                 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
 3769                         pmap->pm_stats.wired_count--;
 3770 
 3771                 /*
 3772                  * Remove the extra PT page reference.
 3773                  */
 3774                 if (mpte != NULL) {
 3775                         mpte->wire_count--;
 3776                         KASSERT(mpte->wire_count > 0,
 3777                             ("pmap_enter: missing reference to page table page,"
 3778                              " va: 0x%x", va));
 3779                 }
 3780 
 3781                 /*
 3782                  * Has the physical page changed?
 3783                  */
 3784                 opa = origpte & PG_FRAME;
 3785                 if (opa == pa) {
 3786                         /*
 3787                          * No, might be a protection or wiring change.
 3788                          */
 3789                         if ((origpte & PG_MANAGED) != 0 &&
 3790                             (newpte & PG_RW) != 0)
 3791                                 vm_page_aflag_set(m, PGA_WRITEABLE);
 3792                         if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
 3793                                 goto unchanged;
 3794                         goto validate;
 3795                 }
 3796 
 3797                 /*
 3798                  * The physical page has changed.  Temporarily invalidate
 3799                  * the mapping.  This ensures that all threads sharing the
 3800                  * pmap keep a consistent view of the mapping, which is
 3801                  * necessary for the correct handling of COW faults.  It
 3802                  * also permits reuse of the old mapping's PV entry,
 3803                  * avoiding an allocation.
 3804                  *
 3805                  * For consistency, handle unmanaged mappings the same way.
 3806                  */
 3807                 origpte = pte_load_clear(pte);
 3808                 KASSERT((origpte & PG_FRAME) == opa,
 3809                     ("pmap_enter: unexpected pa update for %#x", va));
 3810                 if ((origpte & PG_MANAGED) != 0) {
 3811                         om = PHYS_TO_VM_PAGE(opa);
 3812 
 3813                         /*
 3814                          * The pmap lock is sufficient to synchronize with
 3815                          * concurrent calls to pmap_page_test_mappings() and
 3816                          * pmap_ts_referenced().
 3817                          */
 3818                         if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 3819                                 vm_page_dirty(om);
 3820                         if ((origpte & PG_A) != 0)
 3821                                 vm_page_aflag_set(om, PGA_REFERENCED);
 3822                         pv = pmap_pvh_remove(&om->md, pmap, va);
 3823                         if ((newpte & PG_MANAGED) == 0)
 3824                                 free_pv_entry(pmap, pv);
 3825                         if ((om->aflags & PGA_WRITEABLE) != 0 &&
 3826                             TAILQ_EMPTY(&om->md.pv_list) &&
 3827                             ((om->flags & PG_FICTITIOUS) != 0 ||
 3828                             TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
 3829                                 vm_page_aflag_clear(om, PGA_WRITEABLE);
 3830                 }
 3831                 if ((origpte & PG_A) != 0)
 3832                         pmap_invalidate_page(pmap, va);
 3833                 origpte = 0;
 3834         } else {
 3835                 /*
 3836                  * Increment the counters.
 3837                  */
 3838                 if ((newpte & PG_W) != 0)
 3839                         pmap->pm_stats.wired_count++;
 3840                 pmap->pm_stats.resident_count++;
 3841         }
 3842 
 3843         /*
 3844          * Enter on the PV list if part of our managed memory.
 3845          */
 3846         if ((newpte & PG_MANAGED) != 0) {
 3847                 if (pv == NULL) {
 3848                         pv = get_pv_entry(pmap, FALSE);
 3849                         pv->pv_va = va;
 3850                 }
 3851                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 3852                 if ((newpte & PG_RW) != 0)
 3853                         vm_page_aflag_set(m, PGA_WRITEABLE);
 3854         }
 3855 
 3856         /*
 3857          * Update the PTE.
 3858          */
 3859         if ((origpte & PG_V) != 0) {
 3860 validate:
 3861                 origpte = pte_load_store(pte, newpte);
 3862                 KASSERT((origpte & PG_FRAME) == pa,
 3863                     ("pmap_enter: unexpected pa update for %#x", va));
 3864                 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) ==
 3865                     (PG_M | PG_RW)) {
 3866                         if ((origpte & PG_MANAGED) != 0)
 3867                                 vm_page_dirty(m);
 3868 
 3869                         /*
 3870                          * Although the PTE may still have PG_RW set, TLB
 3871                          * invalidation may nonetheless be required because
 3872                          * the PTE no longer has PG_M set.
 3873                          */
 3874                 }
 3875 #if defined(PAE) || defined(PAE_TABLES)
 3876                 else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
 3877                         /*
 3878                          * This PTE change does not require TLB invalidation.
 3879                          */
 3880                         goto unchanged;
 3881                 }
 3882 #endif
 3883                 if ((origpte & PG_A) != 0)
 3884                         pmap_invalidate_page(pmap, va);
 3885         } else
 3886                 pte_store(pte, newpte);
 3887 
 3888 unchanged:
 3889 
 3890 #if VM_NRESERVLEVEL > 0
 3891         /*
 3892          * If both the page table page and the reservation are fully
 3893          * populated, then attempt promotion.
 3894          */
 3895         if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
 3896             pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
 3897             vm_reserv_level_iffullpop(m) == 0)
 3898                 pmap_promote_pde(pmap, pde, va);
 3899 #endif
 3900 
 3901         rv = KERN_SUCCESS;
 3902 out:
 3903         sched_unpin();
 3904         rw_wunlock(&pvh_global_lock);
 3905         PMAP_UNLOCK(pmap);
 3906         return (rv);
 3907 }
 3908 
 3909 /*
 3910  * Tries to create a read- and/or execute-only 2 or 4 MB page mapping.  Returns
 3911  * true if successful.  Returns false if (1) a mapping already exists at the
 3912  * specified virtual address or (2) a PV entry cannot be allocated without
 3913  * reclaiming another PV entry.
 3914  */
 3915 static bool
 3916 pmap_enter_4mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 3917 {
 3918         pd_entry_t newpde;
 3919 
 3920         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3921         newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
 3922             PG_PS | PG_V;
 3923         if ((m->oflags & VPO_UNMANAGED) == 0)
 3924                 newpde |= PG_MANAGED;
 3925 #if defined(PAE) || defined(PAE_TABLES)
 3926         if ((prot & VM_PROT_EXECUTE) == 0)
 3927                 newpde |= pg_nx;
 3928 #endif
 3929         if (pmap != kernel_pmap)
 3930                 newpde |= PG_U;
 3931         return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP |
 3932             PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL) ==
 3933             KERN_SUCCESS);
 3934 }
 3935 
 3936 /*
 3937  * Tries to create the specified 2 or 4 MB page mapping.  Returns KERN_SUCCESS
 3938  * if the mapping was created, and either KERN_FAILURE or
 3939  * KERN_RESOURCE_SHORTAGE otherwise.  Returns KERN_FAILURE if
 3940  * PMAP_ENTER_NOREPLACE was specified and a mapping already exists at the
 3941  * specified virtual address.  Returns KERN_RESOURCE_SHORTAGE if
 3942  * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
 3943  *
 3944  * The parameter "m" is only used when creating a managed, writeable mapping.
 3945  */
 3946 static int
 3947 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
 3948     vm_page_t m)
 3949 {
 3950         struct spglist free;
 3951         pd_entry_t oldpde, *pde;
 3952         vm_page_t mt;
 3953 
 3954         rw_assert(&pvh_global_lock, RA_WLOCKED);
 3955         KASSERT((newpde & (PG_M | PG_RW)) != PG_RW,
 3956             ("pmap_enter_pde: newpde is missing PG_M"));
 3957         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3958         pde = pmap_pde(pmap, va);
 3959         oldpde = *pde;
 3960         if ((oldpde & PG_V) != 0) {
 3961                 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
 3962                         CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 3963                             " in pmap %p", va, pmap);
 3964                         return (KERN_FAILURE);
 3965                 }
 3966                 /* Break the existing mapping(s). */
 3967                 SLIST_INIT(&free);
 3968                 if ((oldpde & PG_PS) != 0) {
 3969                         /*
 3970                          * If the PDE resulted from a promotion, then a
 3971                          * reserved PT page could be freed.
 3972                          */
 3973                         (void)pmap_remove_pde(pmap, pde, va, &free);
 3974                         if ((oldpde & PG_G) == 0)
 3975                                 pmap_invalidate_pde_page(pmap, va, oldpde);
 3976                 } else {
 3977                         if (pmap_remove_ptes(pmap, va, va + NBPDR, &free))
 3978                                pmap_invalidate_all(pmap);
 3979                 }
 3980                 vm_page_free_pages_toq(&free, true);
 3981                 if (pmap == kernel_pmap) {
 3982                         mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 3983                         if (pmap_insert_pt_page(pmap, mt)) {
 3984                                 /*
 3985                                  * XXX Currently, this can't happen because
 3986                                  * we do not perform pmap_enter(psind == 1)
 3987                                  * on the kernel pmap.
 3988                                  */
 3989                                 panic("pmap_enter_pde: trie insert failed");
 3990                         }
 3991                 } else
 3992                         KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p",
 3993                             pde));
 3994         }
 3995         if ((newpde & PG_MANAGED) != 0) {
 3996                 /*
 3997                  * Abort this mapping if its PV entry could not be created.
 3998                  */
 3999                 if (!pmap_pv_insert_pde(pmap, va, newpde, flags)) {
 4000                         CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 4001                             " in pmap %p", va, pmap);
 4002                         return (KERN_RESOURCE_SHORTAGE);
 4003                 }
 4004                 if ((newpde & PG_RW) != 0) {
 4005                         for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 4006                                 vm_page_aflag_set(mt, PGA_WRITEABLE);
 4007                 }
 4008         }
 4009 
 4010         /*
 4011          * Increment counters.
 4012          */
 4013         if ((newpde & PG_W) != 0)
 4014                 pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE;
 4015         pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
 4016 
 4017         /*
 4018          * Map the superpage.  (This is not a promoted mapping; there will not
 4019          * be any lingering 4KB page mappings in the TLB.)
 4020          */
 4021         pde_store(pde, newpde);
 4022 
 4023         pmap_pde_mappings++;
 4024         CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
 4025             " in pmap %p", va, pmap);
 4026         return (KERN_SUCCESS);
 4027 }
 4028 
 4029 /*
 4030  * Maps a sequence of resident pages belonging to the same object.
 4031  * The sequence begins with the given page m_start.  This page is
 4032  * mapped at the given virtual address start.  Each subsequent page is
 4033  * mapped at a virtual address that is offset from start by the same
 4034  * amount as the page is offset from m_start within the object.  The
 4035  * last page in the sequence is the page with the largest offset from
 4036  * m_start that can be mapped at a virtual address less than the given
 4037  * virtual address end.  Not every virtual page between start and end
 4038  * is mapped; only those for which a resident page exists with the
 4039  * corresponding offset from m_start are mapped.
 4040  */
 4041 void
 4042 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
 4043     vm_page_t m_start, vm_prot_t prot)
 4044 {
 4045         vm_offset_t va;
 4046         vm_page_t m, mpte;
 4047         vm_pindex_t diff, psize;
 4048 
 4049         VM_OBJECT_ASSERT_LOCKED(m_start->object);
 4050 
 4051         psize = atop(end - start);
 4052         mpte = NULL;
 4053         m = m_start;
 4054         rw_wlock(&pvh_global_lock);
 4055         PMAP_LOCK(pmap);
 4056         while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
 4057                 va = start + ptoa(diff);
 4058                 if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
 4059                     m->psind == 1 && pg_ps_enabled &&
 4060                     pmap_enter_4mpage(pmap, va, m, prot))
 4061                         m = &m[NBPDR / PAGE_SIZE - 1];
 4062                 else
 4063                         mpte = pmap_enter_quick_locked(pmap, va, m, prot,
 4064                             mpte);
 4065                 m = TAILQ_NEXT(m, listq);
 4066         }
 4067         rw_wunlock(&pvh_global_lock);
 4068         PMAP_UNLOCK(pmap);
 4069 }
 4070 
 4071 /*
 4072  * this code makes some *MAJOR* assumptions:
 4073  * 1. Current pmap & pmap exists.
 4074  * 2. Not wired.
 4075  * 3. Read access.
 4076  * 4. No page table pages.
 4077  * but is *MUCH* faster than pmap_enter...
 4078  */
 4079 
 4080 void
 4081 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 4082 {
 4083 
 4084         rw_wlock(&pvh_global_lock);
 4085         PMAP_LOCK(pmap);
 4086         (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
 4087         rw_wunlock(&pvh_global_lock);
 4088         PMAP_UNLOCK(pmap);
 4089 }
 4090 
 4091 static vm_page_t
 4092 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
 4093     vm_prot_t prot, vm_page_t mpte)
 4094 {
 4095         pt_entry_t *pte;
 4096         vm_paddr_t pa;
 4097         struct spglist free;
 4098 
 4099         KASSERT(pmap != kernel_pmap || va < kmi.clean_sva ||
 4100             va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0,
 4101             ("pmap_enter_quick_locked: managed mapping within the clean submap"));
 4102         rw_assert(&pvh_global_lock, RA_WLOCKED);
 4103         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4104 
 4105         /*
 4106          * In the case that a page table page is not
 4107          * resident, we are creating it here.
 4108          */
 4109         if (pmap != kernel_pmap) {
 4110                 u_int ptepindex;
 4111                 pd_entry_t ptepa;
 4112 
 4113                 /*
 4114                  * Calculate pagetable page index
 4115                  */
 4116                 ptepindex = va >> PDRSHIFT;
 4117                 if (mpte && (mpte->pindex == ptepindex)) {
 4118                         mpte->wire_count++;
 4119                 } else {
 4120                         /*
 4121                          * Get the page directory entry
 4122                          */
 4123                         ptepa = pmap->pm_pdir[ptepindex];
 4124 
 4125                         /*
 4126                          * If the page table page is mapped, we just increment
 4127                          * the hold count, and activate it.
 4128                          */
 4129                         if (ptepa) {
 4130                                 if (ptepa & PG_PS)
 4131                                         return (NULL);
 4132                                 mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
 4133                                 mpte->wire_count++;
 4134                         } else {
 4135                                 mpte = _pmap_allocpte(pmap, ptepindex,
 4136                                     PMAP_ENTER_NOSLEEP);
 4137                                 if (mpte == NULL)
 4138                                         return (mpte);
 4139                         }
 4140                 }
 4141         } else {
 4142                 mpte = NULL;
 4143         }
 4144 
 4145         sched_pin();
 4146         pte = pmap_pte_quick(pmap, va);
 4147         if (*pte) {
 4148                 if (mpte != NULL) {
 4149                         mpte->wire_count--;
 4150                         mpte = NULL;
 4151                 }
 4152                 sched_unpin();
 4153                 return (mpte);
 4154         }
 4155 
 4156         /*
 4157          * Enter on the PV list if part of our managed memory.
 4158          */
 4159         if ((m->oflags & VPO_UNMANAGED) == 0 &&
 4160             !pmap_try_insert_pv_entry(pmap, va, m)) {
 4161                 if (mpte != NULL) {
 4162                         SLIST_INIT(&free);
 4163                         if (pmap_unwire_ptp(pmap, mpte, &free)) {
 4164                                 pmap_invalidate_page(pmap, va);
 4165                                 vm_page_free_pages_toq(&free, true);
 4166                         }
 4167                         
 4168                         mpte = NULL;
 4169                 }
 4170                 sched_unpin();
 4171                 return (mpte);
 4172         }
 4173 
 4174         /*
 4175          * Increment counters
 4176          */
 4177         pmap->pm_stats.resident_count++;
 4178 
 4179         pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0);
 4180 #if defined(PAE) || defined(PAE_TABLES)
 4181         if ((prot & VM_PROT_EXECUTE) == 0)
 4182                 pa |= pg_nx;
 4183 #endif
 4184 
 4185         /*
 4186          * Now validate mapping with RO protection
 4187          */
 4188         if ((m->oflags & VPO_UNMANAGED) != 0)
 4189                 pte_store(pte, pa | PG_V | PG_U);
 4190         else
 4191                 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
 4192         sched_unpin();
 4193         return (mpte);
 4194 }
 4195 
 4196 /*
 4197  * Make a temporary mapping for a physical address.  This is only intended
 4198  * to be used for panic dumps.
 4199  */
 4200 void *
 4201 pmap_kenter_temporary(vm_paddr_t pa, int i)
 4202 {
 4203         vm_offset_t va;
 4204 
 4205         va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
 4206         pmap_kenter(va, pa);
 4207         invlpg(va);
 4208         return ((void *)crashdumpmap);
 4209 }
 4210 
 4211 /*
 4212  * This code maps large physical mmap regions into the
 4213  * processor address space.  Note that some shortcuts
 4214  * are taken, but the code works.
 4215  */
 4216 void
 4217 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
 4218     vm_pindex_t pindex, vm_size_t size)
 4219 {
 4220         pd_entry_t *pde;
 4221         vm_paddr_t pa, ptepa;
 4222         vm_page_t p;
 4223         int pat_mode;
 4224 
 4225         VM_OBJECT_ASSERT_WLOCKED(object);
 4226         KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
 4227             ("pmap_object_init_pt: non-device object"));
 4228         if (pg_ps_enabled &&
 4229             (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
 4230                 if (!vm_object_populate(object, pindex, pindex + atop(size)))
 4231                         return;
 4232                 p = vm_page_lookup(object, pindex);
 4233                 KASSERT(p->valid == VM_PAGE_BITS_ALL,
 4234                     ("pmap_object_init_pt: invalid page %p", p));
 4235                 pat_mode = p->md.pat_mode;
 4236 
 4237                 /*
 4238                  * Abort the mapping if the first page is not physically
 4239                  * aligned to a 2/4MB page boundary.
 4240                  */
 4241                 ptepa = VM_PAGE_TO_PHYS(p);
 4242                 if (ptepa & (NBPDR - 1))
 4243                         return;
 4244 
 4245                 /*
 4246                  * Skip the first page.  Abort the mapping if the rest of
 4247                  * the pages are not physically contiguous or have differing
 4248                  * memory attributes.
 4249                  */
 4250                 p = TAILQ_NEXT(p, listq);
 4251                 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
 4252                     pa += PAGE_SIZE) {
 4253                         KASSERT(p->valid == VM_PAGE_BITS_ALL,
 4254                             ("pmap_object_init_pt: invalid page %p", p));
 4255                         if (pa != VM_PAGE_TO_PHYS(p) ||
 4256                             pat_mode != p->md.pat_mode)
 4257                                 return;
 4258                         p = TAILQ_NEXT(p, listq);
 4259                 }
 4260 
 4261                 /*
 4262                  * Map using 2/4MB pages.  Since "ptepa" is 2/4M aligned and
 4263                  * "size" is a multiple of 2/4M, adding the PAT setting to
 4264                  * "pa" will not affect the termination of this loop.
 4265                  */
 4266                 PMAP_LOCK(pmap);
 4267                 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
 4268                     pa < ptepa + size; pa += NBPDR) {
 4269                         pde = pmap_pde(pmap, addr);
 4270                         if (*pde == 0) {
 4271                                 pde_store(pde, pa | PG_PS | PG_M | PG_A |
 4272                                     PG_U | PG_RW | PG_V);
 4273                                 pmap->pm_stats.resident_count += NBPDR /
 4274                                     PAGE_SIZE;
 4275                                 pmap_pde_mappings++;
 4276                         }
 4277                         /* Else continue on if the PDE is already valid. */
 4278                         addr += NBPDR;
 4279                 }
 4280                 PMAP_UNLOCK(pmap);
 4281         }
 4282 }
 4283 
 4284 /*
 4285  *      Clear the wired attribute from the mappings for the specified range of
 4286  *      addresses in the given pmap.  Every valid mapping within that range
 4287  *      must have the wired attribute set.  In contrast, invalid mappings
 4288  *      cannot have the wired attribute set, so they are ignored.
 4289  *
 4290  *      The wired attribute of the page table entry is not a hardware feature,
 4291  *      so there is no need to invalidate any TLB entries.
 4292  */
 4293 void
 4294 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 4295 {
 4296         vm_offset_t pdnxt;
 4297         pd_entry_t *pde;
 4298         pt_entry_t *pte;
 4299         boolean_t pv_lists_locked;
 4300 
 4301         if (pmap_is_current(pmap))
 4302                 pv_lists_locked = FALSE;
 4303         else {
 4304                 pv_lists_locked = TRUE;
 4305 resume:
 4306                 rw_wlock(&pvh_global_lock);
 4307                 sched_pin();
 4308         }
 4309         PMAP_LOCK(pmap);
 4310         for (; sva < eva; sva = pdnxt) {
 4311                 pdnxt = (sva + NBPDR) & ~PDRMASK;
 4312                 if (pdnxt < sva)
 4313                         pdnxt = eva;
 4314                 pde = pmap_pde(pmap, sva);
 4315                 if ((*pde & PG_V) == 0)
 4316                         continue;
 4317                 if ((*pde & PG_PS) != 0) {
 4318                         if ((*pde & PG_W) == 0)
 4319                                 panic("pmap_unwire: pde %#jx is missing PG_W",
 4320                                     (uintmax_t)*pde);
 4321 
 4322                         /*
 4323                          * Are we unwiring the entire large page?  If not,
 4324                          * demote the mapping and fall through.
 4325                          */
 4326                         if (sva + NBPDR == pdnxt && eva >= pdnxt) {
 4327                                 /*
 4328                                  * Regardless of whether a pde (or pte) is 32
 4329                                  * or 64 bits in size, PG_W is among the least
 4330                                  * significant 32 bits.
 4331                                  */
 4332                                 atomic_clear_int((u_int *)pde, PG_W);
 4333                                 pmap->pm_stats.wired_count -= NBPDR /
 4334                                     PAGE_SIZE;
 4335                                 continue;
 4336                         } else {
 4337                                 if (!pv_lists_locked) {
 4338                                         pv_lists_locked = TRUE;
 4339                                         if (!rw_try_wlock(&pvh_global_lock)) {
 4340                                                 PMAP_UNLOCK(pmap);
 4341                                                 /* Repeat sva. */
 4342                                                 goto resume;
 4343                                         }
 4344                                         sched_pin();
 4345                                 }
 4346                                 if (!pmap_demote_pde(pmap, pde, sva))
 4347                                         panic("pmap_unwire: demotion failed");
 4348                         }
 4349                 }
 4350                 if (pdnxt > eva)
 4351                         pdnxt = eva;
 4352                 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
 4353                     sva += PAGE_SIZE) {
 4354                         if ((*pte & PG_V) == 0)
 4355                                 continue;
 4356                         if ((*pte & PG_W) == 0)
 4357                                 panic("pmap_unwire: pte %#jx is missing PG_W",
 4358                                     (uintmax_t)*pte);
 4359 
 4360                         /*
 4361                          * PG_W must be cleared atomically.  Although the pmap
 4362                          * lock synchronizes access to PG_W, another processor
 4363                          * could be setting PG_M and/or PG_A concurrently.
 4364                          *
 4365                          * PG_W is among the least significant 32 bits.
 4366                          */
 4367                         atomic_clear_int((u_int *)pte, PG_W);
 4368                         pmap->pm_stats.wired_count--;
 4369                 }
 4370         }
 4371         if (pv_lists_locked) {
 4372                 sched_unpin();
 4373                 rw_wunlock(&pvh_global_lock);
 4374         }
 4375         PMAP_UNLOCK(pmap);
 4376 }
 4377 
 4378 
 4379 /*
 4380  *      Copy the range specified by src_addr/len
 4381  *      from the source map to the range dst_addr/len
 4382  *      in the destination map.
 4383  *
 4384  *      This routine is only advisory and need not do anything.  Since
 4385  *      current pmap is always the kernel pmap when executing in
 4386  *      kernel, and we do not copy from the kernel pmap to a user
 4387  *      pmap, this optimization is not usable in 4/4G full split i386
 4388  *      world.
 4389  */
 4390 
 4391 void
 4392 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
 4393     vm_offset_t src_addr)
 4394 {
 4395         struct spglist free;
 4396         pt_entry_t *src_pte, *dst_pte, ptetemp;
 4397         pd_entry_t srcptepaddr;
 4398         vm_page_t dstmpte, srcmpte;
 4399         vm_offset_t addr, end_addr, pdnxt;
 4400         u_int ptepindex;
 4401 
 4402         if (dst_addr != src_addr)
 4403                 return;
 4404 
 4405         end_addr = src_addr + len;
 4406 
 4407         rw_wlock(&pvh_global_lock);
 4408         if (dst_pmap < src_pmap) {
 4409                 PMAP_LOCK(dst_pmap);
 4410                 PMAP_LOCK(src_pmap);
 4411         } else {
 4412                 PMAP_LOCK(src_pmap);
 4413                 PMAP_LOCK(dst_pmap);
 4414         }
 4415         sched_pin();
 4416         for (addr = src_addr; addr < end_addr; addr = pdnxt) {
 4417                 KASSERT(addr < PMAP_TRM_MIN_ADDRESS,
 4418                     ("pmap_copy: invalid to pmap_copy the trampoline"));
 4419 
 4420                 pdnxt = (addr + NBPDR) & ~PDRMASK;
 4421                 if (pdnxt < addr)
 4422                         pdnxt = end_addr;
 4423                 ptepindex = addr >> PDRSHIFT;
 4424 
 4425                 srcptepaddr = src_pmap->pm_pdir[ptepindex];
 4426                 if (srcptepaddr == 0)
 4427                         continue;
 4428 
 4429                 if (srcptepaddr & PG_PS) {
 4430                         if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
 4431                                 continue;
 4432                         if (dst_pmap->pm_pdir[ptepindex] == 0 &&
 4433                             ((srcptepaddr & PG_MANAGED) == 0 ||
 4434                             pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr,
 4435                             PMAP_ENTER_NORECLAIM))) {
 4436                                 dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
 4437                                     ~PG_W;
 4438                                 dst_pmap->pm_stats.resident_count +=
 4439                                     NBPDR / PAGE_SIZE;
 4440                                 pmap_pde_mappings++;
 4441                         }
 4442                         continue;
 4443                 }
 4444 
 4445                 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
 4446                 KASSERT(srcmpte->wire_count > 0,
 4447                     ("pmap_copy: source page table page is unused"));
 4448 
 4449                 if (pdnxt > end_addr)
 4450                         pdnxt = end_addr;
 4451 
 4452                 src_pte = pmap_pte_quick3(src_pmap, addr);
 4453                 while (addr < pdnxt) {
 4454                         ptetemp = *src_pte;
 4455                         /*
 4456                          * we only virtual copy managed pages
 4457                          */
 4458                         if ((ptetemp & PG_MANAGED) != 0) {
 4459                                 dstmpte = pmap_allocpte(dst_pmap, addr,
 4460                                     PMAP_ENTER_NOSLEEP);
 4461                                 if (dstmpte == NULL)
 4462                                         goto out;
 4463                                 dst_pte = pmap_pte_quick(dst_pmap, addr);
 4464                                 if (*dst_pte == 0 &&
 4465                                     pmap_try_insert_pv_entry(dst_pmap, addr,
 4466                                     PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
 4467                                         /*
 4468                                          * Clear the wired, modified, and
 4469                                          * accessed (referenced) bits
 4470                                          * during the copy.
 4471                                          */
 4472                                         *dst_pte = ptetemp & ~(PG_W | PG_M |
 4473                                             PG_A);
 4474                                         dst_pmap->pm_stats.resident_count++;
 4475                                 } else {
 4476                                         SLIST_INIT(&free);
 4477                                         if (pmap_unwire_ptp(dst_pmap, dstmpte,
 4478                                             &free)) {
 4479                                                 pmap_invalidate_page(dst_pmap,
 4480                                                     addr);
 4481                                                 vm_page_free_pages_toq(&free,
 4482                                                     true);
 4483                                         }
 4484                                         goto out;
 4485                                 }
 4486                                 if (dstmpte->wire_count >= srcmpte->wire_count)
 4487                                         break;
 4488                         }
 4489                         addr += PAGE_SIZE;
 4490                         src_pte++;
 4491                 }
 4492         }
 4493 out:
 4494         sched_unpin();
 4495         rw_wunlock(&pvh_global_lock);
 4496         PMAP_UNLOCK(src_pmap);
 4497         PMAP_UNLOCK(dst_pmap);
 4498 }
 4499 
 4500 /*
 4501  * Zero 1 page of virtual memory mapped from a hardware page by the caller.
 4502  */
 4503 static __inline void
 4504 pagezero(void *page)
 4505 {
 4506 #if defined(I686_CPU)
 4507         if (cpu_class == CPUCLASS_686) {
 4508                 if (cpu_feature & CPUID_SSE2)
 4509                         sse2_pagezero(page);
 4510                 else
 4511                         i686_pagezero(page);
 4512         } else
 4513 #endif
 4514                 bzero(page, PAGE_SIZE);
 4515 }
 4516 
 4517 /*
 4518  * Zero the specified hardware page.
 4519  */
 4520 void
 4521 pmap_zero_page(vm_page_t m)
 4522 {
 4523         pt_entry_t *cmap_pte2;
 4524         struct pcpu *pc;
 4525 
 4526         sched_pin();
 4527         pc = get_pcpu();
 4528         cmap_pte2 = pc->pc_cmap_pte2;
 4529         mtx_lock(&pc->pc_cmap_lock);
 4530         if (*cmap_pte2)
 4531                 panic("pmap_zero_page: CMAP2 busy");
 4532         *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
 4533             pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
 4534         invlcaddr(pc->pc_cmap_addr2);
 4535         pagezero(pc->pc_cmap_addr2);
 4536         *cmap_pte2 = 0;
 4537 
 4538         /*
 4539          * Unpin the thread before releasing the lock.  Otherwise the thread
 4540          * could be rescheduled while still bound to the current CPU, only
 4541          * to unpin itself immediately upon resuming execution.
 4542          */
 4543         sched_unpin();
 4544         mtx_unlock(&pc->pc_cmap_lock);
 4545 }
 4546 
 4547 /*
 4548  * Zero an an area within a single hardware page.  off and size must not
 4549  * cover an area beyond a single hardware page.
 4550  */
 4551 void
 4552 pmap_zero_page_area(vm_page_t m, int off, int size)
 4553 {
 4554         pt_entry_t *cmap_pte2;
 4555         struct pcpu *pc;
 4556 
 4557         sched_pin();
 4558         pc = get_pcpu();
 4559         cmap_pte2 = pc->pc_cmap_pte2;
 4560         mtx_lock(&pc->pc_cmap_lock);
 4561         if (*cmap_pte2)
 4562                 panic("pmap_zero_page_area: CMAP2 busy");
 4563         *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
 4564             pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
 4565         invlcaddr(pc->pc_cmap_addr2);
 4566         if (off == 0 && size == PAGE_SIZE) 
 4567                 pagezero(pc->pc_cmap_addr2);
 4568         else
 4569                 bzero(pc->pc_cmap_addr2 + off, size);
 4570         *cmap_pte2 = 0;
 4571         sched_unpin();
 4572         mtx_unlock(&pc->pc_cmap_lock);
 4573 }
 4574 
 4575 /*
 4576  * Copy 1 specified hardware page to another.
 4577  */
 4578 void
 4579 pmap_copy_page(vm_page_t src, vm_page_t dst)
 4580 {
 4581         pt_entry_t *cmap_pte1, *cmap_pte2;
 4582         struct pcpu *pc;
 4583 
 4584         sched_pin();
 4585         pc = get_pcpu();
 4586         cmap_pte1 = pc->pc_cmap_pte1; 
 4587         cmap_pte2 = pc->pc_cmap_pte2;
 4588         mtx_lock(&pc->pc_cmap_lock);
 4589         if (*cmap_pte1)
 4590                 panic("pmap_copy_page: CMAP1 busy");
 4591         if (*cmap_pte2)
 4592                 panic("pmap_copy_page: CMAP2 busy");
 4593         *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A |
 4594             pmap_cache_bits(kernel_pmap, src->md.pat_mode, 0);
 4595         invlcaddr(pc->pc_cmap_addr1);
 4596         *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M |
 4597             pmap_cache_bits(kernel_pmap, dst->md.pat_mode, 0);
 4598         invlcaddr(pc->pc_cmap_addr2);
 4599         bcopy(pc->pc_cmap_addr1, pc->pc_cmap_addr2, PAGE_SIZE);
 4600         *cmap_pte1 = 0;
 4601         *cmap_pte2 = 0;
 4602         sched_unpin();
 4603         mtx_unlock(&pc->pc_cmap_lock);
 4604 }
 4605 
 4606 int unmapped_buf_allowed = 1;
 4607 
 4608 void
 4609 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
 4610     vm_offset_t b_offset, int xfersize)
 4611 {
 4612         vm_page_t a_pg, b_pg;
 4613         char *a_cp, *b_cp;
 4614         vm_offset_t a_pg_offset, b_pg_offset;
 4615         pt_entry_t *cmap_pte1, *cmap_pte2;
 4616         struct pcpu *pc;
 4617         int cnt;
 4618 
 4619         sched_pin();
 4620         pc = get_pcpu();
 4621         cmap_pte1 = pc->pc_cmap_pte1; 
 4622         cmap_pte2 = pc->pc_cmap_pte2;
 4623         mtx_lock(&pc->pc_cmap_lock);
 4624         if (*cmap_pte1 != 0)
 4625                 panic("pmap_copy_pages: CMAP1 busy");
 4626         if (*cmap_pte2 != 0)
 4627                 panic("pmap_copy_pages: CMAP2 busy");
 4628         while (xfersize > 0) {
 4629                 a_pg = ma[a_offset >> PAGE_SHIFT];
 4630                 a_pg_offset = a_offset & PAGE_MASK;
 4631                 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
 4632                 b_pg = mb[b_offset >> PAGE_SHIFT];
 4633                 b_pg_offset = b_offset & PAGE_MASK;
 4634                 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
 4635                 *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A |
 4636                     pmap_cache_bits(kernel_pmap, a_pg->md.pat_mode, 0);
 4637                 invlcaddr(pc->pc_cmap_addr1);
 4638                 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A |
 4639                     PG_M | pmap_cache_bits(kernel_pmap, b_pg->md.pat_mode, 0);
 4640                 invlcaddr(pc->pc_cmap_addr2);
 4641                 a_cp = pc->pc_cmap_addr1 + a_pg_offset;
 4642                 b_cp = pc->pc_cmap_addr2 + b_pg_offset;
 4643                 bcopy(a_cp, b_cp, cnt);
 4644                 a_offset += cnt;
 4645                 b_offset += cnt;
 4646                 xfersize -= cnt;
 4647         }
 4648         *cmap_pte1 = 0;
 4649         *cmap_pte2 = 0;
 4650         sched_unpin();
 4651         mtx_unlock(&pc->pc_cmap_lock);
 4652 }
 4653 
 4654 /*
 4655  * Returns true if the pmap's pv is one of the first
 4656  * 16 pvs linked to from this page.  This count may
 4657  * be changed upwards or downwards in the future; it
 4658  * is only necessary that true be returned for a small
 4659  * subset of pmaps for proper page aging.
 4660  */
 4661 boolean_t
 4662 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 4663 {
 4664         struct md_page *pvh;
 4665         pv_entry_t pv;
 4666         int loops = 0;
 4667         boolean_t rv;
 4668 
 4669         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 4670             ("pmap_page_exists_quick: page %p is not managed", m));
 4671         rv = FALSE;
 4672         rw_wlock(&pvh_global_lock);
 4673         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 4674                 if (PV_PMAP(pv) == pmap) {
 4675                         rv = TRUE;
 4676                         break;
 4677                 }
 4678                 loops++;
 4679                 if (loops >= 16)
 4680                         break;
 4681         }
 4682         if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
 4683                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 4684                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 4685                         if (PV_PMAP(pv) == pmap) {
 4686                                 rv = TRUE;
 4687                                 break;
 4688                         }
 4689                         loops++;
 4690                         if (loops >= 16)
 4691                                 break;
 4692                 }
 4693         }
 4694         rw_wunlock(&pvh_global_lock);
 4695         return (rv);
 4696 }
 4697 
 4698 /*
 4699  *      pmap_page_wired_mappings:
 4700  *
 4701  *      Return the number of managed mappings to the given physical page
 4702  *      that are wired.
 4703  */
 4704 int
 4705 pmap_page_wired_mappings(vm_page_t m)
 4706 {
 4707         int count;
 4708 
 4709         count = 0;
 4710         if ((m->oflags & VPO_UNMANAGED) != 0)
 4711                 return (count);
 4712         rw_wlock(&pvh_global_lock);
 4713         count = pmap_pvh_wired_mappings(&m->md, count);
 4714         if ((m->flags & PG_FICTITIOUS) == 0) {
 4715             count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
 4716                 count);
 4717         }
 4718         rw_wunlock(&pvh_global_lock);
 4719         return (count);
 4720 }
 4721 
 4722 /*
 4723  *      pmap_pvh_wired_mappings:
 4724  *
 4725  *      Return the updated number "count" of managed mappings that are wired.
 4726  */
 4727 static int
 4728 pmap_pvh_wired_mappings(struct md_page *pvh, int count)
 4729 {
 4730         pmap_t pmap;
 4731         pt_entry_t *pte;
 4732         pv_entry_t pv;
 4733 
 4734         rw_assert(&pvh_global_lock, RA_WLOCKED);
 4735         sched_pin();
 4736         TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 4737                 pmap = PV_PMAP(pv);
 4738                 PMAP_LOCK(pmap);
 4739                 pte = pmap_pte_quick(pmap, pv->pv_va);
 4740                 if ((*pte & PG_W) != 0)
 4741                         count++;
 4742                 PMAP_UNLOCK(pmap);
 4743         }
 4744         sched_unpin();
 4745         return (count);
 4746 }
 4747 
 4748 /*
 4749  * Returns TRUE if the given page is mapped individually or as part of
 4750  * a 4mpage.  Otherwise, returns FALSE.
 4751  */
 4752 boolean_t
 4753 pmap_page_is_mapped(vm_page_t m)
 4754 {
 4755         boolean_t rv;
 4756 
 4757         if ((m->oflags & VPO_UNMANAGED) != 0)
 4758                 return (FALSE);
 4759         rw_wlock(&pvh_global_lock);
 4760         rv = !TAILQ_EMPTY(&m->md.pv_list) ||
 4761             ((m->flags & PG_FICTITIOUS) == 0 &&
 4762             !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
 4763         rw_wunlock(&pvh_global_lock);
 4764         return (rv);
 4765 }
 4766 
 4767 /*
 4768  * Remove all pages from specified address space
 4769  * this aids process exit speeds.  Also, this code
 4770  * is special cased for current process only, but
 4771  * can have the more generic (and slightly slower)
 4772  * mode enabled.  This is much faster than pmap_remove
 4773  * in the case of running down an entire address space.
 4774  */
 4775 void
 4776 pmap_remove_pages(pmap_t pmap)
 4777 {
 4778         pt_entry_t *pte, tpte;
 4779         vm_page_t m, mpte, mt;
 4780         pv_entry_t pv;
 4781         struct md_page *pvh;
 4782         struct pv_chunk *pc, *npc;
 4783         struct spglist free;
 4784         int field, idx;
 4785         int32_t bit;
 4786         uint32_t inuse, bitmask;
 4787         int allfree;
 4788 
 4789         if (pmap != PCPU_GET(curpmap)) {
 4790                 printf("warning: pmap_remove_pages called with non-current pmap\n");
 4791                 return;
 4792         }
 4793         SLIST_INIT(&free);
 4794         rw_wlock(&pvh_global_lock);
 4795         PMAP_LOCK(pmap);
 4796         sched_pin();
 4797         TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 4798                 KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap,
 4799                     pc->pc_pmap));
 4800                 allfree = 1;
 4801                 for (field = 0; field < _NPCM; field++) {
 4802                         inuse = ~pc->pc_map[field] & pc_freemask[field];
 4803                         while (inuse != 0) {
 4804                                 bit = bsfl(inuse);
 4805                                 bitmask = 1UL << bit;
 4806                                 idx = field * 32 + bit;
 4807                                 pv = &pc->pc_pventry[idx];
 4808                                 inuse &= ~bitmask;
 4809 
 4810                                 pte = pmap_pde(pmap, pv->pv_va);
 4811                                 tpte = *pte;
 4812                                 if ((tpte & PG_PS) == 0) {
 4813                                         pte = pmap_pte_quick(pmap, pv->pv_va);
 4814                                         tpte = *pte & ~PG_PTE_PAT;
 4815                                 }
 4816 
 4817                                 if (tpte == 0) {
 4818                                         printf(
 4819                                             "TPTE at %p  IS ZERO @ VA %08x\n",
 4820                                             pte, pv->pv_va);
 4821                                         panic("bad pte");
 4822                                 }
 4823 
 4824 /*
 4825  * We cannot remove wired pages from a process' mapping at this time
 4826  */
 4827                                 if (tpte & PG_W) {
 4828                                         allfree = 0;
 4829                                         continue;
 4830                                 }
 4831 
 4832                                 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
 4833                                 KASSERT(m->phys_addr == (tpte & PG_FRAME),
 4834                                     ("vm_page_t %p phys_addr mismatch %016jx %016jx",
 4835                                     m, (uintmax_t)m->phys_addr,
 4836                                     (uintmax_t)tpte));
 4837 
 4838                                 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
 4839                                     m < &vm_page_array[vm_page_array_size],
 4840                                     ("pmap_remove_pages: bad tpte %#jx",
 4841                                     (uintmax_t)tpte));
 4842 
 4843                                 pte_clear(pte);
 4844 
 4845                                 /*
 4846                                  * Update the vm_page_t clean/reference bits.
 4847                                  */
 4848                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 4849                                         if ((tpte & PG_PS) != 0) {
 4850                                                 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 4851                                                         vm_page_dirty(mt);
 4852                                         } else
 4853                                                 vm_page_dirty(m);
 4854                                 }
 4855 
 4856                                 /* Mark free */
 4857                                 PV_STAT(pv_entry_frees++);
 4858                                 PV_STAT(pv_entry_spare++);
 4859                                 pv_entry_count--;
 4860                                 pc->pc_map[field] |= bitmask;
 4861                                 if ((tpte & PG_PS) != 0) {
 4862                                         pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
 4863                                         pvh = pa_to_pvh(tpte & PG_PS_FRAME);
 4864                                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 4865                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
 4866                                                 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 4867                                                         if (TAILQ_EMPTY(&mt->md.pv_list))
 4868                                                                 vm_page_aflag_clear(mt, PGA_WRITEABLE);
 4869                                         }
 4870                                         mpte = pmap_remove_pt_page(pmap, pv->pv_va);
 4871                                         if (mpte != NULL) {
 4872                                                 pmap->pm_stats.resident_count--;
 4873                                                 KASSERT(mpte->wire_count == NPTEPG,
 4874                                                     ("pmap_remove_pages: pte page wire count error"));
 4875                                                 mpte->wire_count = 0;
 4876                                                 pmap_add_delayed_free_list(mpte, &free, FALSE);
 4877                                         }
 4878                                 } else {
 4879                                         pmap->pm_stats.resident_count--;
 4880                                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 4881                                         if (TAILQ_EMPTY(&m->md.pv_list) &&
 4882                                             (m->flags & PG_FICTITIOUS) == 0) {
 4883                                                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 4884                                                 if (TAILQ_EMPTY(&pvh->pv_list))
 4885                                                         vm_page_aflag_clear(m, PGA_WRITEABLE);
 4886                                         }
 4887                                         pmap_unuse_pt(pmap, pv->pv_va, &free);
 4888                                 }
 4889                         }
 4890                 }
 4891                 if (allfree) {
 4892                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 4893                         free_pv_chunk(pc);
 4894                 }
 4895         }
 4896         sched_unpin();
 4897         pmap_invalidate_all(pmap);
 4898         rw_wunlock(&pvh_global_lock);
 4899         PMAP_UNLOCK(pmap);
 4900         vm_page_free_pages_toq(&free, true);
 4901 }
 4902 
 4903 /*
 4904  *      pmap_is_modified:
 4905  *
 4906  *      Return whether or not the specified physical page was modified
 4907  *      in any physical maps.
 4908  */
 4909 boolean_t
 4910 pmap_is_modified(vm_page_t m)
 4911 {
 4912         boolean_t rv;
 4913 
 4914         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 4915             ("pmap_is_modified: page %p is not managed", m));
 4916 
 4917         /*
 4918          * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
 4919          * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
 4920          * is clear, no PTEs can have PG_M set.
 4921          */
 4922         VM_OBJECT_ASSERT_WLOCKED(m->object);
 4923         if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
 4924                 return (FALSE);
 4925         rw_wlock(&pvh_global_lock);
 4926         rv = pmap_is_modified_pvh(&m->md) ||
 4927             ((m->flags & PG_FICTITIOUS) == 0 &&
 4928             pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
 4929         rw_wunlock(&pvh_global_lock);
 4930         return (rv);
 4931 }
 4932 
 4933 /*
 4934  * Returns TRUE if any of the given mappings were used to modify
 4935  * physical memory.  Otherwise, returns FALSE.  Both page and 2mpage
 4936  * mappings are supported.
 4937  */
 4938 static boolean_t
 4939 pmap_is_modified_pvh(struct md_page *pvh)
 4940 {
 4941         pv_entry_t pv;
 4942         pt_entry_t *pte;
 4943         pmap_t pmap;
 4944         boolean_t rv;
 4945 
 4946         rw_assert(&pvh_global_lock, RA_WLOCKED);
 4947         rv = FALSE;
 4948         sched_pin();
 4949         TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 4950                 pmap = PV_PMAP(pv);
 4951                 PMAP_LOCK(pmap);
 4952                 pte = pmap_pte_quick(pmap, pv->pv_va);
 4953                 rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
 4954                 PMAP_UNLOCK(pmap);
 4955                 if (rv)
 4956                         break;
 4957         }
 4958         sched_unpin();
 4959         return (rv);
 4960 }
 4961 
 4962 /*
 4963  *      pmap_is_prefaultable:
 4964  *
 4965  *      Return whether or not the specified virtual address is elgible
 4966  *      for prefault.
 4967  */
 4968 boolean_t
 4969 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 4970 {
 4971         pd_entry_t pde;
 4972         boolean_t rv;
 4973 
 4974         rv = FALSE;
 4975         PMAP_LOCK(pmap);
 4976         pde = *pmap_pde(pmap, addr);
 4977         if (pde != 0 && (pde & PG_PS) == 0)
 4978                 rv = pmap_pte_ufast(pmap, addr, pde) == 0;
 4979         PMAP_UNLOCK(pmap);
 4980         return (rv);
 4981 }
 4982 
 4983 /*
 4984  *      pmap_is_referenced:
 4985  *
 4986  *      Return whether or not the specified physical page was referenced
 4987  *      in any physical maps.
 4988  */
 4989 boolean_t
 4990 pmap_is_referenced(vm_page_t m)
 4991 {
 4992         boolean_t rv;
 4993 
 4994         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 4995             ("pmap_is_referenced: page %p is not managed", m));
 4996         rw_wlock(&pvh_global_lock);
 4997         rv = pmap_is_referenced_pvh(&m->md) ||
 4998             ((m->flags & PG_FICTITIOUS) == 0 &&
 4999             pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
 5000         rw_wunlock(&pvh_global_lock);
 5001         return (rv);
 5002 }
 5003 
 5004 /*
 5005  * Returns TRUE if any of the given mappings were referenced and FALSE
 5006  * otherwise.  Both page and 4mpage mappings are supported.
 5007  */
 5008 static boolean_t
 5009 pmap_is_referenced_pvh(struct md_page *pvh)
 5010 {
 5011         pv_entry_t pv;
 5012         pt_entry_t *pte;
 5013         pmap_t pmap;
 5014         boolean_t rv;
 5015 
 5016         rw_assert(&pvh_global_lock, RA_WLOCKED);
 5017         rv = FALSE;
 5018         sched_pin();
 5019         TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 5020                 pmap = PV_PMAP(pv);
 5021                 PMAP_LOCK(pmap);
 5022                 pte = pmap_pte_quick(pmap, pv->pv_va);
 5023                 rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
 5024                 PMAP_UNLOCK(pmap);
 5025                 if (rv)
 5026                         break;
 5027         }
 5028         sched_unpin();
 5029         return (rv);
 5030 }
 5031 
 5032 /*
 5033  * Clear the write and modified bits in each of the given page's mappings.
 5034  */
 5035 void
 5036 pmap_remove_write(vm_page_t m)
 5037 {
 5038         struct md_page *pvh;
 5039         pv_entry_t next_pv, pv;
 5040         pmap_t pmap;
 5041         pd_entry_t *pde;
 5042         pt_entry_t oldpte, *pte;
 5043         vm_offset_t va;
 5044 
 5045         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 5046             ("pmap_remove_write: page %p is not managed", m));
 5047 
 5048         /*
 5049          * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
 5050          * set by another thread while the object is locked.  Thus,
 5051          * if PGA_WRITEABLE is clear, no page table entries need updating.
 5052          */
 5053         VM_OBJECT_ASSERT_WLOCKED(m->object);
 5054         if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
 5055                 return;
 5056         rw_wlock(&pvh_global_lock);
 5057         sched_pin();
 5058         if ((m->flags & PG_FICTITIOUS) != 0)
 5059                 goto small_mappings;
 5060         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 5061         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 5062                 va = pv->pv_va;
 5063                 pmap = PV_PMAP(pv);
 5064                 PMAP_LOCK(pmap);
 5065                 pde = pmap_pde(pmap, va);
 5066                 if ((*pde & PG_RW) != 0)
 5067                         (void)pmap_demote_pde(pmap, pde, va);
 5068                 PMAP_UNLOCK(pmap);
 5069         }
 5070 small_mappings:
 5071         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 5072                 pmap = PV_PMAP(pv);
 5073                 PMAP_LOCK(pmap);
 5074                 pde = pmap_pde(pmap, pv->pv_va);
 5075                 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found"
 5076                     " a 4mpage in page %p's pv list", m));
 5077                 pte = pmap_pte_quick(pmap, pv->pv_va);
 5078 retry:
 5079                 oldpte = *pte;
 5080                 if ((oldpte & PG_RW) != 0) {
 5081                         /*
 5082                          * Regardless of whether a pte is 32 or 64 bits
 5083                          * in size, PG_RW and PG_M are among the least
 5084                          * significant 32 bits.
 5085                          */
 5086                         if (!atomic_cmpset_int((u_int *)pte, oldpte,
 5087                             oldpte & ~(PG_RW | PG_M)))
 5088                                 goto retry;
 5089                         if ((oldpte & PG_M) != 0)
 5090                                 vm_page_dirty(m);
 5091                         pmap_invalidate_page(pmap, pv->pv_va);
 5092                 }
 5093                 PMAP_UNLOCK(pmap);
 5094         }
 5095         vm_page_aflag_clear(m, PGA_WRITEABLE);
 5096         sched_unpin();
 5097         rw_wunlock(&pvh_global_lock);
 5098 }
 5099 
 5100 /*
 5101  *      pmap_ts_referenced:
 5102  *
 5103  *      Return a count of reference bits for a page, clearing those bits.
 5104  *      It is not necessary for every reference bit to be cleared, but it
 5105  *      is necessary that 0 only be returned when there are truly no
 5106  *      reference bits set.
 5107  *
 5108  *      As an optimization, update the page's dirty field if a modified bit is
 5109  *      found while counting reference bits.  This opportunistic update can be
 5110  *      performed at low cost and can eliminate the need for some future calls
 5111  *      to pmap_is_modified().  However, since this function stops after
 5112  *      finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
 5113  *      dirty pages.  Those dirty pages will only be detected by a future call
 5114  *      to pmap_is_modified().
 5115  */
 5116 int
 5117 pmap_ts_referenced(vm_page_t m)
 5118 {
 5119         struct md_page *pvh;
 5120         pv_entry_t pv, pvf;
 5121         pmap_t pmap;
 5122         pd_entry_t *pde;
 5123         pt_entry_t *pte;
 5124         vm_paddr_t pa;
 5125         int rtval = 0;
 5126 
 5127         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 5128             ("pmap_ts_referenced: page %p is not managed", m));
 5129         pa = VM_PAGE_TO_PHYS(m);
 5130         pvh = pa_to_pvh(pa);
 5131         rw_wlock(&pvh_global_lock);
 5132         sched_pin();
 5133         if ((m->flags & PG_FICTITIOUS) != 0 ||
 5134             (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
 5135                 goto small_mappings;
 5136         pv = pvf;
 5137         do {
 5138                 pmap = PV_PMAP(pv);
 5139                 PMAP_LOCK(pmap);
 5140                 pde = pmap_pde(pmap, pv->pv_va);
 5141                 if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 5142                         /*
 5143                          * Although "*pde" is mapping a 2/4MB page, because
 5144                          * this function is called at a 4KB page granularity,
 5145                          * we only update the 4KB page under test.
 5146                          */
 5147                         vm_page_dirty(m);
 5148                 }
 5149                 if ((*pde & PG_A) != 0) {
 5150                         /*
 5151                          * Since this reference bit is shared by either 1024
 5152                          * or 512 4KB pages, it should not be cleared every
 5153                          * time it is tested.  Apply a simple "hash" function
 5154                          * on the physical page number, the virtual superpage
 5155                          * number, and the pmap address to select one 4KB page
 5156                          * out of the 1024 or 512 on which testing the
 5157                          * reference bit will result in clearing that bit.
 5158                          * This function is designed to avoid the selection of
 5159                          * the same 4KB page for every 2- or 4MB page mapping.
 5160                          *
 5161                          * On demotion, a mapping that hasn't been referenced
 5162                          * is simply destroyed.  To avoid the possibility of a
 5163                          * subsequent page fault on a demoted wired mapping,
 5164                          * always leave its reference bit set.  Moreover,
 5165                          * since the superpage is wired, the current state of
 5166                          * its reference bit won't affect page replacement.
 5167                          */
 5168                         if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
 5169                             (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
 5170                             (*pde & PG_W) == 0) {
 5171                                 atomic_clear_int((u_int *)pde, PG_A);
 5172                                 pmap_invalidate_page(pmap, pv->pv_va);
 5173                         }
 5174                         rtval++;
 5175                 }
 5176                 PMAP_UNLOCK(pmap);
 5177                 /* Rotate the PV list if it has more than one entry. */
 5178                 if (TAILQ_NEXT(pv, pv_next) != NULL) {
 5179                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 5180                         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 5181                 }
 5182                 if (rtval >= PMAP_TS_REFERENCED_MAX)
 5183                         goto out;
 5184         } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
 5185 small_mappings:
 5186         if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
 5187                 goto out;
 5188         pv = pvf;
 5189         do {
 5190                 pmap = PV_PMAP(pv);
 5191                 PMAP_LOCK(pmap);
 5192                 pde = pmap_pde(pmap, pv->pv_va);
 5193                 KASSERT((*pde & PG_PS) == 0,
 5194                     ("pmap_ts_referenced: found a 4mpage in page %p's pv list",
 5195                     m));
 5196                 pte = pmap_pte_quick(pmap, pv->pv_va);
 5197                 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 5198                         vm_page_dirty(m);
 5199                 if ((*pte & PG_A) != 0) {
 5200                         atomic_clear_int((u_int *)pte, PG_A);
 5201                         pmap_invalidate_page(pmap, pv->pv_va);
 5202                         rtval++;
 5203                 }
 5204                 PMAP_UNLOCK(pmap);
 5205                 /* Rotate the PV list if it has more than one entry. */
 5206                 if (TAILQ_NEXT(pv, pv_next) != NULL) {
 5207                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 5208                         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 5209                 }
 5210         } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval <
 5211             PMAP_TS_REFERENCED_MAX);
 5212 out:
 5213         sched_unpin();
 5214         rw_wunlock(&pvh_global_lock);
 5215         return (rtval);
 5216 }
 5217 
 5218 /*
 5219  *      Apply the given advice to the specified range of addresses within the
 5220  *      given pmap.  Depending on the advice, clear the referenced and/or
 5221  *      modified flags in each mapping and set the mapped page's dirty field.
 5222  */
 5223 void
 5224 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
 5225 {
 5226         pd_entry_t oldpde, *pde;
 5227         pt_entry_t *pte;
 5228         vm_offset_t va, pdnxt;
 5229         vm_page_t m;
 5230         boolean_t anychanged, pv_lists_locked;
 5231 
 5232         if (advice != MADV_DONTNEED && advice != MADV_FREE)
 5233                 return;
 5234         if (pmap_is_current(pmap))
 5235                 pv_lists_locked = FALSE;
 5236         else {
 5237                 pv_lists_locked = TRUE;
 5238 resume:
 5239                 rw_wlock(&pvh_global_lock);
 5240                 sched_pin();
 5241         }
 5242         anychanged = FALSE;
 5243         PMAP_LOCK(pmap);
 5244         for (; sva < eva; sva = pdnxt) {
 5245                 pdnxt = (sva + NBPDR) & ~PDRMASK;
 5246                 if (pdnxt < sva)
 5247                         pdnxt = eva;
 5248                 pde = pmap_pde(pmap, sva);
 5249                 oldpde = *pde;
 5250                 if ((oldpde & PG_V) == 0)
 5251                         continue;
 5252                 else if ((oldpde & PG_PS) != 0) {
 5253                         if ((oldpde & PG_MANAGED) == 0)
 5254                                 continue;
 5255                         if (!pv_lists_locked) {
 5256                                 pv_lists_locked = TRUE;
 5257                                 if (!rw_try_wlock(&pvh_global_lock)) {
 5258                                         if (anychanged)
 5259                                                 pmap_invalidate_all(pmap);
 5260                                         PMAP_UNLOCK(pmap);
 5261                                         goto resume;
 5262                                 }
 5263                                 sched_pin();
 5264                         }
 5265                         if (!pmap_demote_pde(pmap, pde, sva)) {
 5266                                 /*
 5267                                  * The large page mapping was destroyed.
 5268                                  */
 5269                                 continue;
 5270                         }
 5271 
 5272                         /*
 5273                          * Unless the page mappings are wired, remove the
 5274                          * mapping to a single page so that a subsequent
 5275                          * access may repromote.  Since the underlying page
 5276                          * table page is fully populated, this removal never
 5277                          * frees a page table page.
 5278                          */
 5279                         if ((oldpde & PG_W) == 0) {
 5280                                 pte = pmap_pte_quick(pmap, sva);
 5281                                 KASSERT((*pte & PG_V) != 0,
 5282                                     ("pmap_advise: invalid PTE"));
 5283                                 pmap_remove_pte(pmap, pte, sva, NULL);
 5284                                 anychanged = TRUE;
 5285                         }
 5286                 }
 5287                 if (pdnxt > eva)
 5288                         pdnxt = eva;
 5289                 va = pdnxt;
 5290                 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
 5291                     sva += PAGE_SIZE) {
 5292                         if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
 5293                                 goto maybe_invlrng;
 5294                         else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 5295                                 if (advice == MADV_DONTNEED) {
 5296                                         /*
 5297                                          * Future calls to pmap_is_modified()
 5298                                          * can be avoided by making the page
 5299                                          * dirty now.
 5300                                          */
 5301                                         m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
 5302                                         vm_page_dirty(m);
 5303                                 }
 5304                                 atomic_clear_int((u_int *)pte, PG_M | PG_A);
 5305                         } else if ((*pte & PG_A) != 0)
 5306                                 atomic_clear_int((u_int *)pte, PG_A);
 5307                         else
 5308                                 goto maybe_invlrng;
 5309                         if ((*pte & PG_G) != 0) {
 5310                                 if (va == pdnxt)
 5311                                         va = sva;
 5312                         } else
 5313                                 anychanged = TRUE;
 5314                         continue;
 5315 maybe_invlrng:
 5316                         if (va != pdnxt) {
 5317                                 pmap_invalidate_range(pmap, va, sva);
 5318                                 va = pdnxt;
 5319                         }
 5320                 }
 5321                 if (va != pdnxt)
 5322                         pmap_invalidate_range(pmap, va, sva);
 5323         }
 5324         if (anychanged)
 5325                 pmap_invalidate_all(pmap);
 5326         if (pv_lists_locked) {
 5327                 sched_unpin();
 5328                 rw_wunlock(&pvh_global_lock);
 5329         }
 5330         PMAP_UNLOCK(pmap);
 5331 }
 5332 
 5333 /*
 5334  *      Clear the modify bits on the specified physical page.
 5335  */
 5336 void
 5337 pmap_clear_modify(vm_page_t m)
 5338 {
 5339         struct md_page *pvh;
 5340         pv_entry_t next_pv, pv;
 5341         pmap_t pmap;
 5342         pd_entry_t oldpde, *pde;
 5343         pt_entry_t oldpte, *pte;
 5344         vm_offset_t va;
 5345 
 5346         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 5347             ("pmap_clear_modify: page %p is not managed", m));
 5348         VM_OBJECT_ASSERT_WLOCKED(m->object);
 5349         KASSERT(!vm_page_xbusied(m),
 5350             ("pmap_clear_modify: page %p is exclusive busied", m));
 5351 
 5352         /*
 5353          * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
 5354          * If the object containing the page is locked and the page is not
 5355          * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
 5356          */
 5357         if ((m->aflags & PGA_WRITEABLE) == 0)
 5358                 return;
 5359         rw_wlock(&pvh_global_lock);
 5360         sched_pin();
 5361         if ((m->flags & PG_FICTITIOUS) != 0)
 5362                 goto small_mappings;
 5363         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 5364         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 5365                 va = pv->pv_va;
 5366                 pmap = PV_PMAP(pv);
 5367                 PMAP_LOCK(pmap);
 5368                 pde = pmap_pde(pmap, va);
 5369                 oldpde = *pde;
 5370                 if ((oldpde & PG_RW) != 0) {
 5371                         if (pmap_demote_pde(pmap, pde, va)) {
 5372                                 if ((oldpde & PG_W) == 0) {
 5373                                         /*
 5374                                          * Write protect the mapping to a
 5375                                          * single page so that a subsequent
 5376                                          * write access may repromote.
 5377                                          */
 5378                                         va += VM_PAGE_TO_PHYS(m) - (oldpde &
 5379                                             PG_PS_FRAME);
 5380                                         pte = pmap_pte_quick(pmap, va);
 5381                                         oldpte = *pte;
 5382                                         if ((oldpte & PG_V) != 0) {
 5383                                                 /*
 5384                                                  * Regardless of whether a pte is 32 or 64 bits
 5385                                                  * in size, PG_RW and PG_M are among the least
 5386                                                  * significant 32 bits.
 5387                                                  */
 5388                                                 while (!atomic_cmpset_int((u_int *)pte,
 5389                                                     oldpte,
 5390                                                     oldpte & ~(PG_M | PG_RW)))
 5391                                                         oldpte = *pte;
 5392                                                 vm_page_dirty(m);
 5393                                                 pmap_invalidate_page(pmap, va);
 5394                                         }
 5395                                 }
 5396                         }
 5397                 }
 5398                 PMAP_UNLOCK(pmap);
 5399         }
 5400 small_mappings:
 5401         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 5402                 pmap = PV_PMAP(pv);
 5403                 PMAP_LOCK(pmap);
 5404                 pde = pmap_pde(pmap, pv->pv_va);
 5405                 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
 5406                     " a 4mpage in page %p's pv list", m));
 5407                 pte = pmap_pte_quick(pmap, pv->pv_va);
 5408                 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 5409                         /*
 5410                          * Regardless of whether a pte is 32 or 64 bits
 5411                          * in size, PG_M is among the least significant
 5412                          * 32 bits. 
 5413                          */
 5414                         atomic_clear_int((u_int *)pte, PG_M);
 5415                         pmap_invalidate_page(pmap, pv->pv_va);
 5416                 }
 5417                 PMAP_UNLOCK(pmap);
 5418         }
 5419         sched_unpin();
 5420         rw_wunlock(&pvh_global_lock);
 5421 }
 5422 
 5423 /*
 5424  * Miscellaneous support routines follow
 5425  */
 5426 
 5427 /* Adjust the cache mode for a 4KB page mapped via a PTE. */
 5428 static __inline void
 5429 pmap_pte_attr(pt_entry_t *pte, int cache_bits)
 5430 {
 5431         u_int opte, npte;
 5432 
 5433         /*
 5434          * The cache mode bits are all in the low 32-bits of the
 5435          * PTE, so we can just spin on updating the low 32-bits.
 5436          */
 5437         do {
 5438                 opte = *(u_int *)pte;
 5439                 npte = opte & ~PG_PTE_CACHE;
 5440                 npte |= cache_bits;
 5441         } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
 5442 }
 5443 
 5444 /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
 5445 static __inline void
 5446 pmap_pde_attr(pd_entry_t *pde, int cache_bits)
 5447 {
 5448         u_int opde, npde;
 5449 
 5450         /*
 5451          * The cache mode bits are all in the low 32-bits of the
 5452          * PDE, so we can just spin on updating the low 32-bits.
 5453          */
 5454         do {
 5455                 opde = *(u_int *)pde;
 5456                 npde = opde & ~PG_PDE_CACHE;
 5457                 npde |= cache_bits;
 5458         } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
 5459 }
 5460 
 5461 /*
 5462  * Map a set of physical memory pages into the kernel virtual
 5463  * address space. Return a pointer to where it is mapped. This
 5464  * routine is intended to be used for mapping device memory,
 5465  * NOT real memory.
 5466  */
 5467 void *
 5468 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
 5469 {
 5470         struct pmap_preinit_mapping *ppim;
 5471         vm_offset_t va, offset;
 5472         vm_size_t tmpsize;
 5473         int i;
 5474 
 5475         offset = pa & PAGE_MASK;
 5476         size = round_page(offset + size);
 5477         pa = pa & PG_FRAME;
 5478 
 5479         if (pa < PMAP_MAP_LOW && pa + size <= PMAP_MAP_LOW)
 5480                 va = pa + PMAP_MAP_LOW;
 5481         else if (!pmap_initialized) {
 5482                 va = 0;
 5483                 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 5484                         ppim = pmap_preinit_mapping + i;
 5485                         if (ppim->va == 0) {
 5486                                 ppim->pa = pa;
 5487                                 ppim->sz = size;
 5488                                 ppim->mode = mode;
 5489                                 ppim->va = virtual_avail;
 5490                                 virtual_avail += size;
 5491                                 va = ppim->va;
 5492                                 break;
 5493                         }
 5494                 }
 5495                 if (va == 0)
 5496                         panic("%s: too many preinit mappings", __func__);
 5497         } else {
 5498                 /*
 5499                  * If we have a preinit mapping, re-use it.
 5500                  */
 5501                 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 5502                         ppim = pmap_preinit_mapping + i;
 5503                         if (ppim->pa == pa && ppim->sz == size &&
 5504                             ppim->mode == mode)
 5505                                 return ((void *)(ppim->va + offset));
 5506                 }
 5507                 va = kva_alloc(size);
 5508                 if (va == 0)
 5509                         panic("%s: Couldn't allocate KVA", __func__);
 5510         }
 5511         for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
 5512                 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
 5513         pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
 5514         pmap_invalidate_cache_range(va, va + size);
 5515         return ((void *)(va + offset));
 5516 }
 5517 
 5518 void *
 5519 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
 5520 {
 5521 
 5522         return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
 5523 }
 5524 
 5525 void *
 5526 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
 5527 {
 5528 
 5529         return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
 5530 }
 5531 
 5532 void
 5533 pmap_unmapdev(vm_offset_t va, vm_size_t size)
 5534 {
 5535         struct pmap_preinit_mapping *ppim;
 5536         vm_offset_t offset;
 5537         int i;
 5538 
 5539         if (va >= PMAP_MAP_LOW && va <= KERNBASE && va + size <= KERNBASE)
 5540                 return;
 5541         offset = va & PAGE_MASK;
 5542         size = round_page(offset + size);
 5543         va = trunc_page(va);
 5544         for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 5545                 ppim = pmap_preinit_mapping + i;
 5546                 if (ppim->va == va && ppim->sz == size) {
 5547                         if (pmap_initialized)
 5548                                 return;
 5549                         ppim->pa = 0;
 5550                         ppim->va = 0;
 5551                         ppim->sz = 0;
 5552                         ppim->mode = 0;
 5553                         if (va + size == virtual_avail)
 5554                                 virtual_avail = va;
 5555                         return;
 5556                 }
 5557         }
 5558         if (pmap_initialized)
 5559                 kva_free(va, size);
 5560 }
 5561 
 5562 /*
 5563  * Sets the memory attribute for the specified page.
 5564  */
 5565 void
 5566 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
 5567 {
 5568 
 5569         m->md.pat_mode = ma;
 5570         if ((m->flags & PG_FICTITIOUS) != 0)
 5571                 return;
 5572 
 5573         /*
 5574          * If "m" is a normal page, flush it from the cache.
 5575          * See pmap_invalidate_cache_range().
 5576          *
 5577          * First, try to find an existing mapping of the page by sf
 5578          * buffer. sf_buf_invalidate_cache() modifies mapping and
 5579          * flushes the cache.
 5580          */    
 5581         if (sf_buf_invalidate_cache(m))
 5582                 return;
 5583 
 5584         /*
 5585          * If page is not mapped by sf buffer, but CPU does not
 5586          * support self snoop, map the page transient and do
 5587          * invalidation. In the worst case, whole cache is flushed by
 5588          * pmap_invalidate_cache_range().
 5589          */
 5590         if ((cpu_feature & CPUID_SS) == 0)
 5591                 pmap_flush_page(m);
 5592 }
 5593 
 5594 static void
 5595 pmap_flush_page(vm_page_t m)
 5596 {
 5597         pt_entry_t *cmap_pte2;
 5598         struct pcpu *pc;
 5599         vm_offset_t sva, eva;
 5600         bool useclflushopt;
 5601 
 5602         useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
 5603         if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) {
 5604                 sched_pin();
 5605                 pc = get_pcpu();
 5606                 cmap_pte2 = pc->pc_cmap_pte2; 
 5607                 mtx_lock(&pc->pc_cmap_lock);
 5608                 if (*cmap_pte2)
 5609                         panic("pmap_flush_page: CMAP2 busy");
 5610                 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
 5611                     PG_A | PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode,
 5612                     0);
 5613                 invlcaddr(pc->pc_cmap_addr2);
 5614                 sva = (vm_offset_t)pc->pc_cmap_addr2;
 5615                 eva = sva + PAGE_SIZE;
 5616 
 5617                 /*
 5618                  * Use mfence or sfence despite the ordering implied by
 5619                  * mtx_{un,}lock() because clflush on non-Intel CPUs
 5620                  * and clflushopt are not guaranteed to be ordered by
 5621                  * any other instruction.
 5622                  */
 5623                 if (useclflushopt)
 5624                         sfence();
 5625                 else if (cpu_vendor_id != CPU_VENDOR_INTEL)
 5626                         mfence();
 5627                 for (; sva < eva; sva += cpu_clflush_line_size) {
 5628                         if (useclflushopt)
 5629                                 clflushopt(sva);
 5630                         else
 5631                                 clflush(sva);
 5632                 }
 5633                 if (useclflushopt)
 5634                         sfence();
 5635                 else if (cpu_vendor_id != CPU_VENDOR_INTEL)
 5636                         mfence();
 5637                 *cmap_pte2 = 0;
 5638                 sched_unpin();
 5639                 mtx_unlock(&pc->pc_cmap_lock);
 5640         } else
 5641                 pmap_invalidate_cache();
 5642 }
 5643 
 5644 /*
 5645  * Changes the specified virtual address range's memory type to that given by
 5646  * the parameter "mode".  The specified virtual address range must be
 5647  * completely contained within either the kernel map.
 5648  *
 5649  * Returns zero if the change completed successfully, and either EINVAL or
 5650  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
 5651  * of the virtual address range was not mapped, and ENOMEM is returned if
 5652  * there was insufficient memory available to complete the change.
 5653  */
 5654 int
 5655 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
 5656 {
 5657         vm_offset_t base, offset, tmpva;
 5658         pd_entry_t *pde;
 5659         pt_entry_t *pte;
 5660         int cache_bits_pte, cache_bits_pde;
 5661         boolean_t changed;
 5662 
 5663         base = trunc_page(va);
 5664         offset = va & PAGE_MASK;
 5665         size = round_page(offset + size);
 5666 
 5667         /*
 5668          * Only supported on kernel virtual addresses above the recursive map.
 5669          */
 5670         if (base < VM_MIN_KERNEL_ADDRESS)
 5671                 return (EINVAL);
 5672 
 5673         cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1);
 5674         cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0);
 5675         changed = FALSE;
 5676 
 5677         /*
 5678          * Pages that aren't mapped aren't supported.  Also break down
 5679          * 2/4MB pages into 4KB pages if required.
 5680          */
 5681         PMAP_LOCK(kernel_pmap);
 5682         for (tmpva = base; tmpva < base + size; ) {
 5683                 pde = pmap_pde(kernel_pmap, tmpva);
 5684                 if (*pde == 0) {
 5685                         PMAP_UNLOCK(kernel_pmap);
 5686                         return (EINVAL);
 5687                 }
 5688                 if (*pde & PG_PS) {
 5689                         /*
 5690                          * If the current 2/4MB page already has
 5691                          * the required memory type, then we need not
 5692                          * demote this page.  Just increment tmpva to
 5693                          * the next 2/4MB page frame.
 5694                          */
 5695                         if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
 5696                                 tmpva = trunc_4mpage(tmpva) + NBPDR;
 5697                                 continue;
 5698                         }
 5699 
 5700                         /*
 5701                          * If the current offset aligns with a 2/4MB
 5702                          * page frame and there is at least 2/4MB left
 5703                          * within the range, then we need not break
 5704                          * down this page into 4KB pages.
 5705                          */
 5706                         if ((tmpva & PDRMASK) == 0 &&
 5707                             tmpva + PDRMASK < base + size) {
 5708                                 tmpva += NBPDR;
 5709                                 continue;
 5710                         }
 5711                         if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
 5712                                 PMAP_UNLOCK(kernel_pmap);
 5713                                 return (ENOMEM);
 5714                         }
 5715                 }
 5716                 pte = vtopte(tmpva);
 5717                 if (*pte == 0) {
 5718                         PMAP_UNLOCK(kernel_pmap);
 5719                         return (EINVAL);
 5720                 }
 5721                 tmpva += PAGE_SIZE;
 5722         }
 5723         PMAP_UNLOCK(kernel_pmap);
 5724 
 5725         /*
 5726          * Ok, all the pages exist, so run through them updating their
 5727          * cache mode if required.
 5728          */
 5729         for (tmpva = base; tmpva < base + size; ) {
 5730                 pde = pmap_pde(kernel_pmap, tmpva);
 5731                 if (*pde & PG_PS) {
 5732                         if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
 5733                                 pmap_pde_attr(pde, cache_bits_pde);
 5734                                 changed = TRUE;
 5735                         }
 5736                         tmpva = trunc_4mpage(tmpva) + NBPDR;
 5737                 } else {
 5738                         pte = vtopte(tmpva);
 5739                         if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
 5740                                 pmap_pte_attr(pte, cache_bits_pte);
 5741                                 changed = TRUE;
 5742                         }
 5743                         tmpva += PAGE_SIZE;
 5744                 }
 5745         }
 5746 
 5747         /*
 5748          * Flush CPU caches to make sure any data isn't cached that
 5749          * shouldn't be, etc.
 5750          */
 5751         if (changed) {
 5752                 pmap_invalidate_range(kernel_pmap, base, tmpva);
 5753                 pmap_invalidate_cache_range(base, tmpva);
 5754         }
 5755         return (0);
 5756 }
 5757 
 5758 /*
 5759  * perform the pmap work for mincore
 5760  */
 5761 int
 5762 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
 5763 {
 5764         pd_entry_t pde;
 5765         pt_entry_t pte;
 5766         vm_paddr_t pa;
 5767         int val;
 5768 
 5769         PMAP_LOCK(pmap);
 5770 retry:
 5771         pde = *pmap_pde(pmap, addr);
 5772         if (pde != 0) {
 5773                 if ((pde & PG_PS) != 0) {
 5774                         pte = pde;
 5775                         /* Compute the physical address of the 4KB page. */
 5776                         pa = ((pde & PG_PS_FRAME) | (addr & PDRMASK)) &
 5777                             PG_FRAME;
 5778                         val = MINCORE_SUPER;
 5779                 } else {
 5780                         pte = pmap_pte_ufast(pmap, addr, pde);
 5781                         pa = pte & PG_FRAME;
 5782                         val = 0;
 5783                 }
 5784         } else {
 5785                 pte = 0;
 5786                 pa = 0;
 5787                 val = 0;
 5788         }
 5789         if ((pte & PG_V) != 0) {
 5790                 val |= MINCORE_INCORE;
 5791                 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 5792                         val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
 5793                 if ((pte & PG_A) != 0)
 5794                         val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
 5795         }
 5796         if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
 5797             (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
 5798             (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
 5799                 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
 5800                 if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
 5801                         goto retry;
 5802         } else
 5803                 PA_UNLOCK_COND(*locked_pa);
 5804         PMAP_UNLOCK(pmap);
 5805         return (val);
 5806 }
 5807 
 5808 void
 5809 pmap_activate(struct thread *td)
 5810 {
 5811         pmap_t  pmap, oldpmap;
 5812         u_int   cpuid;
 5813         u_int32_t  cr3;
 5814 
 5815         critical_enter();
 5816         pmap = vmspace_pmap(td->td_proc->p_vmspace);
 5817         oldpmap = PCPU_GET(curpmap);
 5818         cpuid = PCPU_GET(cpuid);
 5819 #if defined(SMP)
 5820         CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
 5821         CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
 5822 #else
 5823         CPU_CLR(cpuid, &oldpmap->pm_active);
 5824         CPU_SET(cpuid, &pmap->pm_active);
 5825 #endif
 5826 #if defined(PAE) || defined(PAE_TABLES)
 5827         cr3 = vtophys(pmap->pm_pdpt);
 5828 #else
 5829         cr3 = vtophys(pmap->pm_pdir);
 5830 #endif
 5831         /*
 5832          * pmap_activate is for the current thread on the current cpu
 5833          */
 5834         td->td_pcb->pcb_cr3 = cr3;
 5835         PCPU_SET(curpmap, pmap);
 5836         critical_exit();
 5837 }
 5838 
 5839 void
 5840 pmap_activate_boot(pmap_t pmap)
 5841 {
 5842         u_int cpuid;
 5843 
 5844         cpuid = PCPU_GET(cpuid);
 5845 #if defined(SMP)
 5846         CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
 5847 #else
 5848         CPU_SET(cpuid, &pmap->pm_active);
 5849 #endif
 5850         PCPU_SET(curpmap, pmap);
 5851 }
 5852 
 5853 void
 5854 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
 5855 {
 5856 }
 5857 
 5858 /*
 5859  *      Increase the starting virtual address of the given mapping if a
 5860  *      different alignment might result in more superpage mappings.
 5861  */
 5862 void
 5863 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
 5864     vm_offset_t *addr, vm_size_t size)
 5865 {
 5866         vm_offset_t superpage_offset;
 5867 
 5868         if (size < NBPDR)
 5869                 return;
 5870         if (object != NULL && (object->flags & OBJ_COLORED) != 0)
 5871                 offset += ptoa(object->pg_color);
 5872         superpage_offset = offset & PDRMASK;
 5873         if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
 5874             (*addr & PDRMASK) == superpage_offset)
 5875                 return;
 5876         if ((*addr & PDRMASK) < superpage_offset)
 5877                 *addr = (*addr & ~PDRMASK) + superpage_offset;
 5878         else
 5879                 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
 5880 }
 5881 
 5882 vm_offset_t
 5883 pmap_quick_enter_page(vm_page_t m)
 5884 {
 5885         vm_offset_t qaddr;
 5886         pt_entry_t *pte;
 5887 
 5888         critical_enter();
 5889         qaddr = PCPU_GET(qmap_addr);
 5890         pte = vtopte(qaddr);
 5891 
 5892         KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy"));
 5893         *pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
 5894             pmap_cache_bits(kernel_pmap, pmap_page_get_memattr(m), 0);
 5895         invlpg(qaddr);
 5896 
 5897         return (qaddr);
 5898 }
 5899 
 5900 void
 5901 pmap_quick_remove_page(vm_offset_t addr)
 5902 {
 5903         vm_offset_t qaddr;
 5904         pt_entry_t *pte;
 5905 
 5906         qaddr = PCPU_GET(qmap_addr);
 5907         pte = vtopte(qaddr);
 5908 
 5909         KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use"));
 5910         KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address"));
 5911 
 5912         *pte = 0;
 5913         critical_exit();
 5914 }
 5915 
 5916 static vmem_t *pmap_trm_arena;
 5917 static vmem_addr_t pmap_trm_arena_last = PMAP_TRM_MIN_ADDRESS;
 5918 static int trm_guard = PAGE_SIZE;
 5919 
 5920 static int
 5921 pmap_trm_import(void *unused __unused, vmem_size_t size, int flags,
 5922     vmem_addr_t *addrp)
 5923 {
 5924         vm_page_t m;
 5925         vmem_addr_t af, addr, prev_addr;
 5926         pt_entry_t *trm_pte;
 5927 
 5928         prev_addr = atomic_load_long(&pmap_trm_arena_last);
 5929         size = round_page(size) + trm_guard;
 5930         for (;;) {
 5931                 if (prev_addr + size < prev_addr || prev_addr + size < size ||
 5932                     prev_addr + size > PMAP_TRM_MAX_ADDRESS)
 5933                         return (ENOMEM);
 5934                 addr = prev_addr + size;
 5935                 if (atomic_fcmpset_int(&pmap_trm_arena_last, &prev_addr, addr))
 5936                         break;
 5937         }
 5938         prev_addr += trm_guard;
 5939         trm_pte = PTmap + atop(prev_addr);
 5940         for (af = prev_addr; af < addr; af += PAGE_SIZE) {
 5941                 m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY |
 5942                     VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
 5943                 pte_store(&trm_pte[atop(af - prev_addr)], VM_PAGE_TO_PHYS(m) |
 5944                     PG_M | PG_A | PG_RW | PG_V | pgeflag |
 5945                     pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE));
 5946         }
 5947         *addrp = prev_addr;
 5948         return (0);
 5949 }
 5950 
 5951 static
 5952 void pmap_init_trm(void)
 5953 {
 5954         vm_page_t pd_m;
 5955 
 5956         TUNABLE_INT_FETCH("machdep.trm_guard", &trm_guard);
 5957         if ((trm_guard & PAGE_MASK) != 0)
 5958                 trm_guard = 0;
 5959         pmap_trm_arena = vmem_create("i386trampoline", 0, 0, 1, 0, M_WAITOK);
 5960         vmem_set_import(pmap_trm_arena, pmap_trm_import, NULL, NULL, PAGE_SIZE);
 5961         pd_m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY |
 5962             VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_WAITOK | VM_ALLOC_ZERO);
 5963         if ((pd_m->flags & PG_ZERO) == 0)
 5964                 pmap_zero_page(pd_m);
 5965         PTD[TRPTDI] = VM_PAGE_TO_PHYS(pd_m) | PG_M | PG_A | PG_RW | PG_V |
 5966             pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, TRUE);
 5967 }
 5968 
 5969 void *
 5970 pmap_trm_alloc(size_t size, int flags)
 5971 {
 5972         vmem_addr_t res;
 5973         int error;
 5974 
 5975         MPASS((flags & ~(M_WAITOK | M_NOWAIT | M_ZERO)) == 0);
 5976         error = vmem_xalloc(pmap_trm_arena, roundup2(size, 4), sizeof(int),
 5977             0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags | M_FIRSTFIT, &res);
 5978         if (error != 0)
 5979                 return (NULL);
 5980         if ((flags & M_ZERO) != 0)
 5981                 bzero((void *)res, size);
 5982         return ((void *)res);
 5983 }
 5984 
 5985 void
 5986 pmap_trm_free(void *addr, size_t size)
 5987 {
 5988 
 5989         vmem_free(pmap_trm_arena, (uintptr_t)addr, roundup2(size, 4));
 5990 }
 5991 
 5992 #if defined(PMAP_DEBUG)
 5993 pmap_pid_dump(int pid)
 5994 {
 5995         pmap_t pmap;
 5996         struct proc *p;
 5997         int npte = 0;
 5998         int index;
 5999 
 6000         sx_slock(&allproc_lock);
 6001         FOREACH_PROC_IN_SYSTEM(p) {
 6002                 if (p->p_pid != pid)
 6003                         continue;
 6004 
 6005                 if (p->p_vmspace) {
 6006                         int i,j;
 6007                         index = 0;
 6008                         pmap = vmspace_pmap(p->p_vmspace);
 6009                         for (i = 0; i < NPDEPTD; i++) {
 6010                                 pd_entry_t *pde;
 6011                                 pt_entry_t *pte;
 6012                                 vm_offset_t base = i << PDRSHIFT;
 6013                                 
 6014                                 pde = &pmap->pm_pdir[i];
 6015                                 if (pde && pmap_pde_v(pde)) {
 6016                                         for (j = 0; j < NPTEPG; j++) {
 6017                                                 vm_offset_t va = base + (j << PAGE_SHIFT);
 6018                                                 if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
 6019                                                         if (index) {
 6020                                                                 index = 0;
 6021                                                                 printf("\n");
 6022                                                         }
 6023                                                         sx_sunlock(&allproc_lock);
 6024                                                         return (npte);
 6025                                                 }
 6026                                                 pte = pmap_pte(pmap, va);
 6027                                                 if (pte && pmap_pte_v(pte)) {
 6028                                                         pt_entry_t pa;
 6029                                                         vm_page_t m;
 6030                                                         pa = *pte;
 6031                                                         m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
 6032                                                         printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
 6033                                                                 va, pa, m->hold_count, m->wire_count, m->flags);
 6034                                                         npte++;
 6035                                                         index++;
 6036                                                         if (index >= 2) {
 6037                                                                 index = 0;
 6038                                                                 printf("\n");
 6039                                                         } else {
 6040                                                                 printf(" ");
 6041                                                         }
 6042                                                 }
 6043                                         }
 6044                                 }
 6045                         }
 6046                 }
 6047         }
 6048         sx_sunlock(&allproc_lock);
 6049         return (npte);
 6050 }
 6051 #endif

Cache object: cea644b9e5b1318f2f8497c8382b87e3


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.