The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/amd64/amd64/pmap.c

Version: -  FREEBSD  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-2  -  FREEBSD-11-1  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-4  -  FREEBSD-10-3  -  FREEBSD-10-2  -  FREEBSD-10-1  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-3  -  FREEBSD-9-2  -  FREEBSD-9-1  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-4  -  FREEBSD-8-3  -  FREEBSD-8-2  -  FREEBSD-8-1  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-4  -  FREEBSD-7-3  -  FREEBSD-7-2  -  FREEBSD-7-1  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-4  -  FREEBSD-6-3  -  FREEBSD-6-2  -  FREEBSD-6-1  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-5  -  FREEBSD-5-4  -  FREEBSD-5-3  -  FREEBSD-5-2  -  FREEBSD-5-1  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD5  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  xnu-1456.1.26  -  xnu-1699.24.8  -  xnu-2050.18.24  -  OPENSOLARIS  -  minix-3-1-1 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 1991 Regents of the University of California.
    3  * All rights reserved.
    4  * Copyright (c) 1994 John S. Dyson
    5  * All rights reserved.
    6  * Copyright (c) 1994 David Greenman
    7  * All rights reserved.
    8  * Copyright (c) 2003 Peter Wemm
    9  * All rights reserved.
   10  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
   11  * All rights reserved.
   12  *
   13  * This code is derived from software contributed to Berkeley by
   14  * the Systems Programming Group of the University of Utah Computer
   15  * Science Department and William Jolitz of UUNET Technologies Inc.
   16  *
   17  * Redistribution and use in source and binary forms, with or without
   18  * modification, are permitted provided that the following conditions
   19  * are met:
   20  * 1. Redistributions of source code must retain the above copyright
   21  *    notice, this list of conditions and the following disclaimer.
   22  * 2. Redistributions in binary form must reproduce the above copyright
   23  *    notice, this list of conditions and the following disclaimer in the
   24  *    documentation and/or other materials provided with the distribution.
   25  * 3. All advertising materials mentioning features or use of this software
   26  *    must display the following acknowledgement:
   27  *      This product includes software developed by the University of
   28  *      California, Berkeley and its contributors.
   29  * 4. Neither the name of the University nor the names of its contributors
   30  *    may be used to endorse or promote products derived from this software
   31  *    without specific prior written permission.
   32  *
   33  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   34  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   35  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   36  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   37  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   38  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   39  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   40  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   41  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   42  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   43  * SUCH DAMAGE.
   44  *
   45  *      from:   @(#)pmap.c      7.7 (Berkeley)  5/12/91
   46  */
   47 /*-
   48  * Copyright (c) 2003 Networks Associates Technology, Inc.
   49  * All rights reserved.
   50  *
   51  * This software was developed for the FreeBSD Project by Jake Burkholder,
   52  * Safeport Network Services, and Network Associates Laboratories, the
   53  * Security Research Division of Network Associates, Inc. under
   54  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
   55  * CHATS research program.
   56  *
   57  * Redistribution and use in source and binary forms, with or without
   58  * modification, are permitted provided that the following conditions
   59  * are met:
   60  * 1. Redistributions of source code must retain the above copyright
   61  *    notice, this list of conditions and the following disclaimer.
   62  * 2. Redistributions in binary form must reproduce the above copyright
   63  *    notice, this list of conditions and the following disclaimer in the
   64  *    documentation and/or other materials provided with the distribution.
   65  *
   66  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   67  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   68  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   69  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   70  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   71  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   72  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   73  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   74  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   75  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   76  * SUCH DAMAGE.
   77  */
   78 
   79 #include <sys/cdefs.h>
   80 __FBSDID("$FreeBSD: releng/8.1/sys/amd64/amd64/pmap.c 206183 2010-04-05 16:11:42Z alc $");
   81 
   82 /*
   83  *      Manages physical address maps.
   84  *
   85  *      In addition to hardware address maps, this
   86  *      module is called upon to provide software-use-only
   87  *      maps which may or may not be stored in the same
   88  *      form as hardware maps.  These pseudo-maps are
   89  *      used to store intermediate results from copy
   90  *      operations to and from address spaces.
   91  *
   92  *      Since the information managed by this module is
   93  *      also stored by the logical address mapping module,
   94  *      this module may throw away valid virtual-to-physical
   95  *      mappings at almost any time.  However, invalidations
   96  *      of virtual-to-physical mappings must be done as
   97  *      requested.
   98  *
   99  *      In order to cope with hardware architectures which
  100  *      make virtual-to-physical map invalidates expensive,
  101  *      this module may delay invalidate or reduced protection
  102  *      operations until such time as they are actually
  103  *      necessary.  This module is given full information as
  104  *      to which processors are currently using which maps,
  105  *      and to when physical maps must be made correct.
  106  */
  107 
  108 #include "opt_msgbuf.h"
  109 #include "opt_pmap.h"
  110 #include "opt_vm.h"
  111 
  112 #include <sys/param.h>
  113 #include <sys/systm.h>
  114 #include <sys/kernel.h>
  115 #include <sys/ktr.h>
  116 #include <sys/lock.h>
  117 #include <sys/malloc.h>
  118 #include <sys/mman.h>
  119 #include <sys/msgbuf.h>
  120 #include <sys/mutex.h>
  121 #include <sys/proc.h>
  122 #include <sys/sx.h>
  123 #include <sys/vmmeter.h>
  124 #include <sys/sched.h>
  125 #include <sys/sysctl.h>
  126 #ifdef SMP
  127 #include <sys/smp.h>
  128 #endif
  129 
  130 #include <vm/vm.h>
  131 #include <vm/vm_param.h>
  132 #include <vm/vm_kern.h>
  133 #include <vm/vm_page.h>
  134 #include <vm/vm_map.h>
  135 #include <vm/vm_object.h>
  136 #include <vm/vm_extern.h>
  137 #include <vm/vm_pageout.h>
  138 #include <vm/vm_pager.h>
  139 #include <vm/vm_reserv.h>
  140 #include <vm/uma.h>
  141 
  142 #include <machine/cpu.h>
  143 #include <machine/cputypes.h>
  144 #include <machine/md_var.h>
  145 #include <machine/pcb.h>
  146 #include <machine/specialreg.h>
  147 #ifdef SMP
  148 #include <machine/smp.h>
  149 #endif
  150 
  151 #ifndef PMAP_SHPGPERPROC
  152 #define PMAP_SHPGPERPROC 200    /* initial value of shpgperproc unless the build already defined it */
  153 #endif
  154 
  155 #if !defined(DIAGNOSTIC)
  156 #define PMAP_INLINE     __gnu89_inline  /* qualifier applied to vtopte() in this file */
  157 #else
  158 #define PMAP_INLINE     /* DIAGNOSTIC builds: expands to nothing */
  159 #endif
  160 
  161 #define PV_STATS        /* defined unconditionally here, so PV_STAT(x) below executes x */
  162 #ifdef PV_STATS
  163 #define PV_STAT(x)      do { x ; } while (0)
  164 #else
  165 #define PV_STAT(x)      do { } while (0)
  166 #endif
  167 
  168 #define pa_index(pa)    ((pa) >> PDRSHIFT)      /* physical address -> number of the PDRSHIFT-sized frame holding it */
  169 #define pa_to_pvh(pa)   (&pv_table[pa_index(pa)])       /* address of the pv_table slot for that frame */
  170 
  171 struct pmap kernel_pmap_store;  /* NOTE(review): presumably the storage behind kernel_pmap - confirm in the pmap header */
  172 
  173 vm_offset_t virtual_avail;      /* VA of first avail page (after kernel bss); set in pmap_bootstrap() */
  174 vm_offset_t virtual_end;        /* VA of last avail page (end of kernel AS); set in pmap_bootstrap() */
  175 
  176 static int ndmpdp;              /* number of direct-map PDP entries, at least 4; set in create_pagetables() */
  177 static vm_paddr_t dmaplimit;    /* ndmpdp << PDPSHIFT; set in create_pagetables() */
  178 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
  179 pt_entry_t pg_nx;
  180 
  181 SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
  182 
  183 static int pg_ps_enabled = 1;   /* default on; pmap_init() re-reads it from the vm.pmap.pg_ps_enabled tunable */
  184 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
  185     "Are large page mappings enabled?");
  186 
  187 static u_int64_t        KPTphys;        /* phys addr of kernel level 1 */
  188 static u_int64_t        KPDphys;        /* phys addr of kernel level 2 */
  189 u_int64_t               KPDPphys;       /* phys addr of kernel level 3 */
  190 u_int64_t               KPML4phys;      /* phys addr of kernel level 4 */
  191 
  192 static u_int64_t        DMPDphys;       /* phys addr of direct mapped level 2 */
  193 static u_int64_t        DMPDPphys;      /* phys addr of direct mapped level 3 */
  194 
  195 /*
  196  * Data for the pv entry allocation mechanism.
  197  * pv_entry_max and pv_entry_high_water (9/10 of the maximum) are computed
  198  * in pmap_init() and recomputed by the sysctl handlers further down;
  199  * pv_table is allocated in pmap_init() and is indexed through pa_to_pvh().
  200  */
  198 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
  199 static struct md_page *pv_table;
  200 static int shpgperproc = PMAP_SHPGPERPROC;
  201 
  202 /*
  203  * All those kernel PT submaps that BSD is so fond of.
  204  * These are assigned by the SYSMAP() sequence in pmap_bootstrap().
  205  */
  205 pt_entry_t *CMAP1 = 0;
  206 caddr_t CADDR1 = 0;
  207 struct msgbuf *msgbufp = 0;
  208 
  209 /*
  210  * Crashdump maps: MAXDUMPPGS pages of KVA reserved in pmap_bootstrap().
  211  */
  212 static caddr_t crashdumpmap;
  213 
  214 static void     free_pv_entry(pmap_t pmap, pv_entry_t pv);
  215 static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
  216 static void     pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
  217 static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
  218 static void     pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
  219 static void     pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
  220 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
  221                     vm_offset_t va);
  222 static int      pmap_pvh_wired_mappings(struct md_page *pvh, int count);
  223 
  224 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
  225 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
  226 static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
  227     vm_offset_t va);
  228 static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
  229     vm_prot_t prot);
  230 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
  231     vm_page_t m, vm_prot_t prot, vm_page_t mpte);
  232 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
  233 static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
  234 static void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva);
  235 static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
  236 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
  237 static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
  238 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
  239 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
  240 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
  241     vm_prot_t prot);
  242 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
  243 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
  244                 vm_page_t *free);
  245 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq,
  246                 vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free);
  247 static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
  248 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
  249     vm_page_t *free);
  250 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
  251                 vm_offset_t va);
  252 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
  253 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
  254     vm_page_t m);
  255 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
  256     pd_entry_t newpde);
  257 static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
  258 
  259 static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags);
  260 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
  261 
  262 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags);
  263 static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
  264                 vm_page_t* free);
  265 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, vm_page_t *);
  266 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
  267 
  268 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
  269 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
  270 
  271 /*
  272  * Move the kernel virtual free pointer to the next
  273  * 2MB.  This is used to help improve performance
  274  * by using a large (2MB) page for much of the kernel
  275  * (.text, .data, .bss)
  276  */
  277 static vm_offset_t
  278 pmap_kmem_choose(vm_offset_t addr)
  279 {
  280         vm_offset_t newaddr = addr;
  281 
  282         newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
  283         return newaddr;
  284 }
  285 
  286 /********************/
  287 /* Inline functions */
  288 /********************/
  289 
  290 /* Return a non-clipped PD index for a given VA */
  291 static __inline vm_pindex_t
  292 pmap_pde_pindex(vm_offset_t va)
  293 {
  294         return va >> PDRSHIFT;
  295 }
  296 
  297 
  298 /* Return various clipped indexes for a given VA */
  299 static __inline vm_pindex_t
  300 pmap_pte_index(vm_offset_t va)
  301 {
  302 
  303         return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
  304 }
  305 
  306 static __inline vm_pindex_t
  307 pmap_pde_index(vm_offset_t va)
  308 {
  309 
  310         return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
  311 }
  312 
  313 static __inline vm_pindex_t
  314 pmap_pdpe_index(vm_offset_t va)
  315 {
  316 
  317         return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
  318 }
  319 
  320 static __inline vm_pindex_t
  321 pmap_pml4e_index(vm_offset_t va)
  322 {
  323 
  324         return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
  325 }
  326 
  327 /* Return a pointer to the PML4 slot that corresponds to a VA */
  328 static __inline pml4_entry_t *
  329 pmap_pml4e(pmap_t pmap, vm_offset_t va)
  330 {
  331 
  332         return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
  333 }
  334 
  335 /* Return a pointer to the PDP slot that corresponds to a VA */
  336 static __inline pdp_entry_t *
  337 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
  338 {
  339         pdp_entry_t *pdpe;
  340 
  341         pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
  342         return (&pdpe[pmap_pdpe_index(va)]);
  343 }
  344 
  345 /* Return a pointer to the PDP slot that corresponds to a VA */
  346 static __inline pdp_entry_t *
  347 pmap_pdpe(pmap_t pmap, vm_offset_t va)
  348 {
  349         pml4_entry_t *pml4e;
  350 
  351         pml4e = pmap_pml4e(pmap, va);
  352         if ((*pml4e & PG_V) == 0)
  353                 return NULL;
  354         return (pmap_pml4e_to_pdpe(pml4e, va));
  355 }
  356 
  357 /* Return a pointer to the PD slot that corresponds to a VA */
  358 static __inline pd_entry_t *
  359 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
  360 {
  361         pd_entry_t *pde;
  362 
  363         pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
  364         return (&pde[pmap_pde_index(va)]);
  365 }
  366 
  367 /* Return a pointer to the PD slot that corresponds to a VA */
  368 static __inline pd_entry_t *
  369 pmap_pde(pmap_t pmap, vm_offset_t va)
  370 {
  371         pdp_entry_t *pdpe;
  372 
  373         pdpe = pmap_pdpe(pmap, va);
  374         if (pdpe == NULL || (*pdpe & PG_V) == 0)
  375                  return NULL;
  376         return (pmap_pdpe_to_pde(pdpe, va));
  377 }
  378 
  379 /* Return a pointer to the PT slot that corresponds to a VA */
  380 static __inline pt_entry_t *
  381 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
  382 {
  383         pt_entry_t *pte;
  384 
  385         pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
  386         return (&pte[pmap_pte_index(va)]);
  387 }
  388 
  389 /* Return a pointer to the PT slot that corresponds to a VA */
  390 static __inline pt_entry_t *
  391 pmap_pte(pmap_t pmap, vm_offset_t va)
  392 {
  393         pd_entry_t *pde;
  394 
  395         pde = pmap_pde(pmap, va);
  396         if (pde == NULL || (*pde & PG_V) == 0)
  397                 return NULL;
  398         if ((*pde & PG_PS) != 0)        /* compat with i386 pmap_pte() */
  399                 return ((pt_entry_t *)pde);
  400         return (pmap_pde_to_pte(pde, va));
  401 }
  402 
  403 
  404 PMAP_INLINE pt_entry_t *
  405 vtopte(vm_offset_t va)
  406 {
  407         u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
  408 
  409         return (PTmap + ((va >> PAGE_SHIFT) & mask));
  410 }
  411 
  412 static __inline pd_entry_t *
  413 vtopde(vm_offset_t va)
  414 {
  415         u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
  416 
  417         return (PDmap + ((va >> PDRSHIFT) & mask));
  418 }
  419 
  420 static u_int64_t
  421 allocpages(vm_paddr_t *firstaddr, int n)
  422 {
  423         u_int64_t ret;
  424 
  425         ret = *firstaddr;
  426         bzero((void *)ret, n * PAGE_SIZE);
  427         *firstaddr += n * PAGE_SIZE;
  428         return (ret);
  429 }
  430 
  431 static void
  432 create_pagetables(vm_paddr_t *firstaddr)
  433 {
  434         int i;
  435 
  436         /* Allocate pages */
  437         KPTphys = allocpages(firstaddr, NKPT);
  438         KPML4phys = allocpages(firstaddr, 1);
  439         KPDPphys = allocpages(firstaddr, NKPML4E);
  440         KPDphys = allocpages(firstaddr, NKPDPE);
  441 
  442         ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
  443         if (ndmpdp < 4)         /* Minimum 4GB of dirmap */
  444                 ndmpdp = 4;
  445         DMPDPphys = allocpages(firstaddr, NDMPML4E);
  446         if (TRUE || (amd_feature & AMDID_PAGE1GB) == 0)
  447                 DMPDphys = allocpages(firstaddr, ndmpdp);
  448         dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
  449 
  450         /* Fill in the underlying page table pages */
  451         /* Read-only from zero to physfree */
  452         /* XXX not fully used, underneath 2M pages */
  453         for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
  454                 ((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
  455                 ((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
  456         }
  457 
  458         /* Now map the page tables at their location within PTmap */
  459         for (i = 0; i < NKPT; i++) {
  460                 ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
  461                 ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
  462         }
  463 
  464         /* Map from zero to end of allocations under 2M pages */
  465         /* This replaces some of the KPTphys entries above */
  466         for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
  467                 ((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
  468                 ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
  469         }
  470 
  471         /* And connect up the PD to the PDP */
  472         for (i = 0; i < NKPDPE; i++) {
  473                 ((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys +
  474                     (i << PAGE_SHIFT);
  475                 ((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
  476         }
  477 
  478         /* Now set up the direct map space using either 2MB or 1GB pages */
  479         /* Preset PG_M and PG_A because demotion expects it */
  480         if (TRUE || (amd_feature & AMDID_PAGE1GB) == 0) {
  481                 for (i = 0; i < NPDEPG * ndmpdp; i++) {
  482                         ((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT;
  483                         ((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS |
  484                             PG_G | PG_M | PG_A;
  485                 }
  486                 /* And the direct map space's PDP */
  487                 for (i = 0; i < ndmpdp; i++) {
  488                         ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys +
  489                             (i << PAGE_SHIFT);
  490                         ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
  491                 }
  492         } else {
  493                 for (i = 0; i < ndmpdp; i++) {
  494                         ((pdp_entry_t *)DMPDPphys)[i] =
  495                             (vm_paddr_t)i << PDPSHIFT;
  496                         ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS |
  497                             PG_G | PG_M | PG_A;
  498                 }
  499         }
  500 
  501         /* And recursively map PML4 to itself in order to get PTmap */
  502         ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
  503         ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
  504 
  505         /* Connect the Direct Map slot up to the PML4 */
  506         ((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys;
  507         ((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U;
  508 
  509         /* Connect the KVA slot up to the PML4 */
  510         ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
  511         ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
  512 }
  513 
  514 /*
  515  *      Bootstrap the system enough to run with virtual memory.
  516  *
  517  *      On amd64 this is called after mapping has already been enabled
  518  *      and just syncs the pmap module with what has already been done.
  519  *      [We can't call it easily with mapping off since the kernel is not
  520  *      mapped with PA == VA, hence we would have to relocate every address
  521  *      from the linked base (virtual) address "KERNBASE" to the actual
  522  *      (physical) address starting relative to 0]
  523  */
  524 void
  525 pmap_bootstrap(vm_paddr_t *firstaddr)
  526 {
  527         vm_offset_t va;
  528         pt_entry_t *pte, *unused;
  529 
  530         /*
  531          * Create an initial set of page tables to run the kernel in.
  532          */
  533         create_pagetables(firstaddr);
  534 
  535         virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
  536         virtual_avail = pmap_kmem_choose(virtual_avail);
  537 
  538         virtual_end = VM_MAX_KERNEL_ADDRESS;
  539 
  540 
  541         /* XXX do %cr0 as well */
  542         load_cr4(rcr4() | CR4_PGE | CR4_PSE);
  543         load_cr3(KPML4phys);
  544 
  545         /*
  546          * Initialize the kernel pmap (which is statically allocated).
  547          */
  548         PMAP_LOCK_INIT(kernel_pmap);
  549         kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
  550         kernel_pmap->pm_root = NULL;
  551         kernel_pmap->pm_active = -1;    /* don't allow deactivation */
  552         TAILQ_INIT(&kernel_pmap->pm_pvchunk);
  553 
  554         /*
  555          * Reserve some special page table entries/VA space for temporary
  556          * mapping of pages.
  557          */
  558 #define SYSMAP(c, p, v, n)      \
  559         v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
  560 
  561         va = virtual_avail;
  562         pte = vtopte(va);
  563 
  564         /*
  565          * CMAP1 is only used for the memory test.
  566          */
  567         SYSMAP(caddr_t, CMAP1, CADDR1, 1)
  568 
  569         /*
  570          * Crashdump maps.
  571          */
  572         SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
  573 
  574         /*
  575          * msgbufp is used to map the system message buffer.
  576          */
  577         SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))
  578 
  579         virtual_avail = va;
  580 
  581         *CMAP1 = 0;
  582 
  583         invltlb();
  584 
  585         /* Initialize the PAT MSR. */
  586         pmap_init_pat();
  587 }
  588 
  589 /*
  590  * Setup the PAT MSR.
  591  */
  592 void
  593 pmap_init_pat(void)
  594 {
  595         uint64_t pat_msr;
  596 
  597         /* Bail if this CPU doesn't implement PAT. */
  598         if (!(cpu_feature & CPUID_PAT))
  599                 panic("no PAT??");
  600 
  601         /*
  602          * Leave the indices 0-3 at the default of WB, WT, UC, and UC-.
  603          * Program 4 and 5 as WP and WC.
  604          * Leave 6 and 7 as UC and UC-.
  605          */
  606         pat_msr = rdmsr(MSR_PAT);
  607         pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5));
  608         pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) |
  609             PAT_VALUE(5, PAT_WRITE_COMBINING);
  610         wrmsr(MSR_PAT, pat_msr);
  611 }
  612 
  613 /*
  614  *      Initialize a vm_page's machine-dependent fields.
  615  */
  616 void
  617 pmap_page_init(vm_page_t m)
  618 {
  619 
  620         TAILQ_INIT(&m->md.pv_list);
  621         m->md.pat_mode = PAT_WRITE_BACK;
  622 }
  623 
  624 /*
  625  *      Initialize the pmap module.
  626  *      Called by vm_init, to initialize any structures that the pmap
  627  *      system needs to map virtual memory.
  628  */
  629 void
  630 pmap_init(void)
  631 {
  632         vm_page_t mpte;
  633         vm_size_t s;
  634         int i, pv_npg;
  635 
  636         /*
  637          * Initialize the vm page array entries for the kernel pmap's
  638          * page table pages.
  639          */ 
  640         for (i = 0; i < NKPT; i++) {
  641                 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
  642                 KASSERT(mpte >= vm_page_array &&
  643                     mpte < &vm_page_array[vm_page_array_size],
  644                     ("pmap_init: page table page is out of range"));
  645                 mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
  646                 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
  647         }
  648 
  649         /*
  650          * Initialize the address space (zone) for the pv entries.  Set a
  651          * high water mark so that the system can recover from excessive
  652          * numbers of pv entries.
  653          */
  654         TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
  655         pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
  656         TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
  657         pv_entry_high_water = 9 * (pv_entry_max / 10);
  658 
  659         /*
  660          * If the kernel is running in a virtual machine on an AMD Family 10h
  661          * processor, then it must assume that MCA is enabled by the virtual
  662          * machine monitor.
  663          */
  664         if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD &&
  665             CPUID_TO_FAMILY(cpu_id) == 0x10)
  666                 workaround_erratum383 = 1;
  667 
  668         /*
  669          * Are large page mappings enabled?
  670          */
  671         TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
  672         if (pg_ps_enabled) {
  673                 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
  674                     ("pmap_init: can't assign to pagesizes[1]"));
  675                 pagesizes[1] = NBPDR;
  676         }
  677 
  678         /*
  679          * Calculate the size of the pv head table for superpages.
  680          */
  681         for (i = 0; phys_avail[i + 1]; i += 2);
  682         pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR;
  683 
  684         /*
  685          * Allocate memory for the pv head table for superpages.
  686          */
  687         s = (vm_size_t)(pv_npg * sizeof(struct md_page));
  688         s = round_page(s);
  689         pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
  690         for (i = 0; i < pv_npg; i++)
  691                 TAILQ_INIT(&pv_table[i].pv_list);
  692 }
  693 
  694 static int
  695 pmap_pventry_proc(SYSCTL_HANDLER_ARGS)
  696 {
  697         int error;
  698 
  699         error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
  700         if (error == 0 && req->newptr) {
  701                 shpgperproc = (pv_entry_max - cnt.v_page_count) / maxproc;
  702                 pv_entry_high_water = 9 * (pv_entry_max / 10);
  703         }
  704         return (error);
  705 }
  706 SYSCTL_PROC(_vm_pmap, OID_AUTO, pv_entry_max, CTLTYPE_INT|CTLFLAG_RW, 
  707     &pv_entry_max, 0, pmap_pventry_proc, "IU", "Max number of PV entries");
  708 
  709 static int
  710 pmap_shpgperproc_proc(SYSCTL_HANDLER_ARGS)
  711 {
  712         int error;
  713 
  714         error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
  715         if (error == 0 && req->newptr) {
  716                 pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
  717                 pv_entry_high_water = 9 * (pv_entry_max / 10);
  718         }
  719         return (error);
  720 }
  721 SYSCTL_PROC(_vm_pmap, OID_AUTO, shpgperproc, CTLTYPE_INT|CTLFLAG_RW, 
  722     &shpgperproc, 0, pmap_shpgperproc_proc, "IU", "Page share factor per proc");
  723 
  724 SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
  725     "2MB page mapping counters");
  726 
  727 static u_long pmap_pde_demotions;       /* exported read-only (CTLFLAG_RD) just below */
  728 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
  729     &pmap_pde_demotions, 0, "2MB page demotions");
  730 
  731 static u_long pmap_pde_mappings;        /* exported read-only (CTLFLAG_RD) just below */
  732 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
  733     &pmap_pde_mappings, 0, "2MB page mappings");
  734 
  735 static u_long pmap_pde_p_failures;      /* exported read-only (CTLFLAG_RD) just below */
  736 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
  737     &pmap_pde_p_failures, 0, "2MB page promotion failures");
  738 
  739 static u_long pmap_pde_promotions;      /* exported read-only (CTLFLAG_RD) just below */
  740 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
  741     &pmap_pde_promotions, 0, "2MB page promotions");
  742 
  743 SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
  744     "1GB page mapping counters");
  745 
  746 static u_long pmap_pdpe_demotions;      /* exported read-only (CTLFLAG_RD) just below */
  747 SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
  748     &pmap_pdpe_demotions, 0, "1GB page demotions");
  749 
  750 
  751 /***************************************************
  752  * Low level helper routines.....
  753  ***************************************************/
  754 
  755 /*
  756  * Determine the appropriate bits to set in a PTE or PDE for a specified
  757  * caching mode.
  758  */
  759 static int
  760 pmap_cache_bits(int mode, boolean_t is_pde)
  761 {
  762         int pat_flag, pat_index, cache_bits;
  763 
  764         /* The PAT bit is different for PTE's and PDE's. */
  765         pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
  766 
  767         /* Map the caching mode to a PAT index. */
  768         switch (mode) {
  769         case PAT_UNCACHEABLE:
  770                 pat_index = 3;
  771                 break;
  772         case PAT_WRITE_THROUGH:
  773                 pat_index = 1;
  774                 break;
  775         case PAT_WRITE_BACK:
  776                 pat_index = 0;
  777                 break;
  778         case PAT_UNCACHED:
  779                 pat_index = 2;
  780                 break;
  781         case PAT_WRITE_COMBINING:
  782                 pat_index = 5;
  783                 break;
  784         case PAT_WRITE_PROTECTED:
  785                 pat_index = 4;
  786                 break;
  787         default:
  788                 panic("Unknown caching mode %d\n", mode);
  789         }
  790 
  791         /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
  792         cache_bits = 0;
  793         if (pat_index & 0x4)
  794                 cache_bits |= pat_flag;
  795         if (pat_index & 0x2)
  796                 cache_bits |= PG_NC_PCD;
  797         if (pat_index & 0x1)
  798                 cache_bits |= PG_NC_PWT;
  799         return (cache_bits);
  800 }
  801 
  802 /*
  803  * After changing the page size for the specified virtual address in the page
  804  * table, flush the corresponding entries from the processor's TLB.  Only the
  805  * calling processor's TLB is affected.
  806  *
  807  * The calling thread must be pinned to a processor.
  808  */
  809 static void
  810 pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
  811 {
  812         u_long cr4;
  813 
  814         if ((newpde & PG_PS) == 0)
  815                 /* Demotion: flush a specific 2MB page mapping. */
  816                 invlpg(va);
  817         else if ((newpde & PG_G) == 0)
  818                 /*
  819                  * Promotion: flush every 4KB page mapping from the TLB
  820                  * because there are too many to flush individually.
  821                  */
  822                 invltlb();
  823         else {
  824                 /*
  825                  * Promotion: flush every 4KB page mapping from the TLB,
  826                  * including any global (PG_G) mappings.
  827                  */
  828                 cr4 = rcr4();
  829                 load_cr4(cr4 & ~CR4_PGE);
  830                 /*
  831                  * Although preemption at this point could be detrimental to
  832                  * performance, it would not lead to an error.  PG_G is simply
  833                  * ignored if CR4.PGE is clear.  Moreover, in case this block
  834                  * is re-entered, the load_cr4() either above or below will
  835                  * modify CR4.PGE flushing the TLB.
  836                  */
  837                 load_cr4(cr4 | CR4_PGE);
  838         }
  839 }
  840 #ifdef SMP
  841 /*
  842  * For SMP, these functions have to use the IPI mechanism for coherence.
  843  *
  844  * N.B.: Before calling any of the following TLB invalidation functions,
  845  * the calling processor must ensure that all stores updating a non-
  846  * kernel page table are globally performed.  Otherwise, another
  847  * processor could cache an old, pre-update entry without being
  848  * invalidated.  This can happen one of two ways: (1) The pmap becomes
  849  * active on another processor after its pm_active field is checked by
  850  * one of the following functions but before a store updating the page
  851  * table is globally performed. (2) The pmap becomes active on another
  852  * processor before its pm_active field is checked but due to
 * speculative loads one of the following functions still reads the
  854  * pmap as inactive on the other processor.
  855  * 
  856  * The kernel page table is exempt because its pm_active field is
  857  * immutable.  The kernel page table is always active on every
  858  * processor.
  859  */
  860 void
  861 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
  862 {
  863         u_int cpumask;
  864         u_int other_cpus;
  865 
  866         sched_pin();
  867         if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
  868                 invlpg(va);
  869                 smp_invlpg(va);
  870         } else {
  871                 cpumask = PCPU_GET(cpumask);
  872                 other_cpus = PCPU_GET(other_cpus);
  873                 if (pmap->pm_active & cpumask)
  874                         invlpg(va);
  875                 if (pmap->pm_active & other_cpus)
  876                         smp_masked_invlpg(pmap->pm_active & other_cpus, va);
  877         }
  878         sched_unpin();
  879 }
  880 
  881 void
  882 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
  883 {
  884         u_int cpumask;
  885         u_int other_cpus;
  886         vm_offset_t addr;
  887 
  888         sched_pin();
  889         if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
  890                 for (addr = sva; addr < eva; addr += PAGE_SIZE)
  891                         invlpg(addr);
  892                 smp_invlpg_range(sva, eva);
  893         } else {
  894                 cpumask = PCPU_GET(cpumask);
  895                 other_cpus = PCPU_GET(other_cpus);
  896                 if (pmap->pm_active & cpumask)
  897                         for (addr = sva; addr < eva; addr += PAGE_SIZE)
  898                                 invlpg(addr);
  899                 if (pmap->pm_active & other_cpus)
  900                         smp_masked_invlpg_range(pmap->pm_active & other_cpus,
  901                             sva, eva);
  902         }
  903         sched_unpin();
  904 }
  905 
  906 void
  907 pmap_invalidate_all(pmap_t pmap)
  908 {
  909         u_int cpumask;
  910         u_int other_cpus;
  911 
  912         sched_pin();
  913         if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
  914                 invltlb();
  915                 smp_invltlb();
  916         } else {
  917                 cpumask = PCPU_GET(cpumask);
  918                 other_cpus = PCPU_GET(other_cpus);
  919                 if (pmap->pm_active & cpumask)
  920                         invltlb();
  921                 if (pmap->pm_active & other_cpus)
  922                         smp_masked_invltlb(pmap->pm_active & other_cpus);
  923         }
  924         sched_unpin();
  925 }
  926 
  927 void
  928 pmap_invalidate_cache(void)
  929 {
  930 
  931         sched_pin();
  932         wbinvd();
  933         smp_cache_flush();
  934         sched_unpin();
  935 }
  936 
  937 struct pde_action {
  938         cpumask_t store;        /* processor that updates the PDE */
  939         cpumask_t invalidate;   /* processors that invalidate their TLB */
  940         vm_offset_t va;
  941         pd_entry_t *pde;
  942         pd_entry_t newpde;
  943 };
  944 
  945 static void
  946 pmap_update_pde_action(void *arg)
  947 {
  948         struct pde_action *act = arg;
  949 
  950         if (act->store == PCPU_GET(cpumask))
  951                 pde_store(act->pde, act->newpde);
  952 }
  953 
  954 static void
  955 pmap_update_pde_teardown(void *arg)
  956 {
  957         struct pde_action *act = arg;
  958 
  959         if ((act->invalidate & PCPU_GET(cpumask)) != 0)
  960                 pmap_update_pde_invalidate(act->va, act->newpde);
  961 }
  962 
  963 /*
  964  * Change the page size for the specified virtual address in a way that
  965  * prevents any possibility of the TLB ever having two entries that map the
  966  * same virtual address using different page sizes.  This is the recommended
  967  * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
  968  * machine check exception for a TLB state that is improperly diagnosed as a
  969  * hardware error.
  970  */
  971 static void
  972 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
  973 {
  974         struct pde_action act;
  975         cpumask_t active, cpumask;
  976 
  977         sched_pin();
  978         cpumask = PCPU_GET(cpumask);
  979         if (pmap == kernel_pmap)
  980                 active = all_cpus;
  981         else
  982                 active = pmap->pm_active;
  983         if ((active & PCPU_GET(other_cpus)) != 0) {
  984                 act.store = cpumask;
  985                 act.invalidate = active;
  986                 act.va = va;
  987                 act.pde = pde;
  988                 act.newpde = newpde;
  989                 smp_rendezvous_cpus(cpumask | active,
  990                     smp_no_rendevous_barrier, pmap_update_pde_action,
  991                     pmap_update_pde_teardown, &act);
  992         } else {
  993                 pde_store(pde, newpde);
  994                 if ((active & cpumask) != 0)
  995                         pmap_update_pde_invalidate(va, newpde);
  996         }
  997         sched_unpin();
  998 }
  999 #else /* !SMP */
 1000 /*
 1001  * Normal, non-SMP, invalidation functions.
 1002  * We inline these within pmap.c for speed.
 1003  */
 1004 PMAP_INLINE void
 1005 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 1006 {
 1007 
 1008         if (pmap == kernel_pmap || pmap->pm_active)
 1009                 invlpg(va);
 1010 }
 1011 
 1012 PMAP_INLINE void
 1013 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 1014 {
 1015         vm_offset_t addr;
 1016 
 1017         if (pmap == kernel_pmap || pmap->pm_active)
 1018                 for (addr = sva; addr < eva; addr += PAGE_SIZE)
 1019                         invlpg(addr);
 1020 }
 1021 
 1022 PMAP_INLINE void
 1023 pmap_invalidate_all(pmap_t pmap)
 1024 {
 1025 
 1026         if (pmap == kernel_pmap || pmap->pm_active)
 1027                 invltlb();
 1028 }
 1029 
 1030 PMAP_INLINE void
 1031 pmap_invalidate_cache(void)
 1032 {
 1033 
 1034         wbinvd();
 1035 }
 1036 
 1037 static void
 1038 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 1039 {
 1040 
 1041         pde_store(pde, newpde);
 1042         if (pmap == kernel_pmap || pmap->pm_active)
 1043                 pmap_update_pde_invalidate(va, newpde);
 1044 }
 1045 #endif /* !SMP */
 1046 
 1047 static void
 1048 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
 1049 {
 1050 
 1051         KASSERT((sva & PAGE_MASK) == 0,
 1052             ("pmap_invalidate_cache_range: sva not page-aligned"));
 1053         KASSERT((eva & PAGE_MASK) == 0,
 1054             ("pmap_invalidate_cache_range: eva not page-aligned"));
 1055 
 1056         if (cpu_feature & CPUID_SS)
 1057                 ; /* If "Self Snoop" is supported, do nothing. */
 1058         else if ((cpu_feature & CPUID_CLFSH) != 0 &&
 1059                  eva - sva < 2 * 1024 * 1024) {
 1060 
 1061                 /*
 1062                  * Otherwise, do per-cache line flush.  Use the mfence
 1063                  * instruction to insure that previous stores are
 1064                  * included in the write-back.  The processor
 1065                  * propagates flush to other processors in the cache
 1066                  * coherence domain.
 1067                  */
 1068                 mfence();
 1069                 for (; sva < eva; sva += cpu_clflush_line_size)
 1070                         clflush(sva);
 1071                 mfence();
 1072         } else {
 1073 
 1074                 /*
 1075                  * No targeted cache flush methods are supported by CPU,
 1076                  * or the supplied range is bigger than 2MB.
 1077                  * Globally invalidate cache.
 1078                  */
 1079                 pmap_invalidate_cache();
 1080         }
 1081 }
 1082 
 1083 /*
 1084  * Are we current address space or kernel?
 1085  */
 1086 static __inline int
 1087 pmap_is_current(pmap_t pmap)
 1088 {
 1089         return (pmap == kernel_pmap ||
 1090             (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME));
 1091 }
 1092 
 1093 /*
 1094  *      Routine:        pmap_extract
 1095  *      Function:
 1096  *              Extract the physical page address associated
 1097  *              with the given map/virtual_address pair.
 1098  */
 1099 vm_paddr_t 
 1100 pmap_extract(pmap_t pmap, vm_offset_t va)
 1101 {
 1102         vm_paddr_t rtval;
 1103         pt_entry_t *pte;
 1104         pd_entry_t pde, *pdep;
 1105 
 1106         rtval = 0;
 1107         PMAP_LOCK(pmap);
 1108         pdep = pmap_pde(pmap, va);
 1109         if (pdep != NULL) {
 1110                 pde = *pdep;
 1111                 if (pde) {
 1112                         if ((pde & PG_PS) != 0)
 1113                                 rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
 1114                         else {
 1115                                 pte = pmap_pde_to_pte(pdep, va);
 1116                                 rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
 1117                         }
 1118                 }
 1119         }
 1120         PMAP_UNLOCK(pmap);
 1121         return (rtval);
 1122 }
 1123 
 1124 /*
 1125  *      Routine:        pmap_extract_and_hold
 1126  *      Function:
 1127  *              Atomically extract and hold the physical page
 1128  *              with the given pmap and virtual address pair
 1129  *              if that mapping permits the given protection.
 1130  */
 1131 vm_page_t
 1132 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 1133 {
 1134         pd_entry_t pde, *pdep;
 1135         pt_entry_t pte;
 1136         vm_page_t m;
 1137 
 1138         m = NULL;
 1139         vm_page_lock_queues();
 1140         PMAP_LOCK(pmap);
 1141         pdep = pmap_pde(pmap, va);
 1142         if (pdep != NULL && (pde = *pdep)) {
 1143                 if (pde & PG_PS) {
 1144                         if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
 1145                                 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
 1146                                     (va & PDRMASK));
 1147                                 vm_page_hold(m);
 1148                         }
 1149                 } else {
 1150                         pte = *pmap_pde_to_pte(pdep, va);
 1151                         if ((pte & PG_V) &&
 1152                             ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
 1153                                 m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
 1154                                 vm_page_hold(m);
 1155                         }
 1156                 }
 1157         }
 1158         vm_page_unlock_queues();
 1159         PMAP_UNLOCK(pmap);
 1160         return (m);
 1161 }
 1162 
 1163 vm_paddr_t
 1164 pmap_kextract(vm_offset_t va)
 1165 {
 1166         pd_entry_t pde;
 1167         vm_paddr_t pa;
 1168 
 1169         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
 1170                 pa = DMAP_TO_PHYS(va);
 1171         } else {
 1172                 pde = *vtopde(va);
 1173                 if (pde & PG_PS) {
 1174                         pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
 1175                 } else {
 1176                         /*
 1177                          * Beware of a concurrent promotion that changes the
 1178                          * PDE at this point!  For example, vtopte() must not
 1179                          * be used to access the PTE because it would use the
 1180                          * new PDE.  It is, however, safe to use the old PDE
 1181                          * because the page table page is preserved by the
 1182                          * promotion.
 1183                          */
 1184                         pa = *pmap_pde_to_pte(&pde, va);
 1185                         pa = (pa & PG_FRAME) | (va & PAGE_MASK);
 1186                 }
 1187         }
 1188         return pa;
 1189 }
 1190 
 1191 /***************************************************
 1192  * Low level mapping routines.....
 1193  ***************************************************/
 1194 
 1195 /*
 1196  * Add a wired page to the kva.
 1197  * Note: not SMP coherent.
 1198  */
 1199 PMAP_INLINE void 
 1200 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
 1201 {
 1202         pt_entry_t *pte;
 1203 
 1204         pte = vtopte(va);
 1205         pte_store(pte, pa | PG_RW | PG_V | PG_G);
 1206 }
 1207 
 1208 static __inline void
 1209 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
 1210 {
 1211         pt_entry_t *pte;
 1212 
 1213         pte = vtopte(va);
 1214         pte_store(pte, pa | PG_RW | PG_V | PG_G | pmap_cache_bits(mode, 0));
 1215 }
 1216 
 1217 /*
 1218  * Remove a page from the kernel pagetables.
 1219  * Note: not SMP coherent.
 1220  */
 1221 PMAP_INLINE void
 1222 pmap_kremove(vm_offset_t va)
 1223 {
 1224         pt_entry_t *pte;
 1225 
 1226         pte = vtopte(va);
 1227         pte_clear(pte);
 1228 }
 1229 
 1230 /*
 1231  *      Used to map a range of physical addresses into kernel
 1232  *      virtual address space.
 1233  *
 1234  *      The value passed in '*virt' is a suggested virtual address for
 1235  *      the mapping. Architectures which can support a direct-mapped
 1236  *      physical to virtual region can return the appropriate address
 1237  *      within that region, leaving '*virt' unchanged. Other
 1238  *      architectures should map the pages starting at '*virt' and
 1239  *      update '*virt' with the first usable address after the mapped
 1240  *      region.
 1241  */
 1242 vm_offset_t
 1243 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
 1244 {
 1245         return PHYS_TO_DMAP(start);
 1246 }
 1247 
 1248 
 1249 /*
 1250  * Add a list of wired pages to the kva
 1251  * this routine is only used for temporary
 1252  * kernel mappings that do not need to have
 1253  * page modification or references recorded.
 1254  * Note that old mappings are simply written
 1255  * over.  The page *must* be wired.
 1256  * Note: SMP coherent.  Uses a ranged shootdown IPI.
 1257  */
 1258 void
 1259 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
 1260 {
 1261         pt_entry_t *endpte, oldpte, *pte;
 1262 
 1263         oldpte = 0;
 1264         pte = vtopte(sva);
 1265         endpte = pte + count;
 1266         while (pte < endpte) {
 1267                 oldpte |= *pte;
 1268                 pte_store(pte, VM_PAGE_TO_PHYS(*ma) | PG_G |
 1269                     pmap_cache_bits((*ma)->md.pat_mode, 0) | PG_RW | PG_V);
 1270                 pte++;
 1271                 ma++;
 1272         }
 1273         if ((oldpte & PG_V) != 0)
 1274                 pmap_invalidate_range(kernel_pmap, sva, sva + count *
 1275                     PAGE_SIZE);
 1276 }
 1277 
 1278 /*
 1279  * This routine tears out page mappings from the
 1280  * kernel -- it is meant only for temporary mappings.
 1281  * Note: SMP coherent.  Uses a ranged shootdown IPI.
 1282  */
 1283 void
 1284 pmap_qremove(vm_offset_t sva, int count)
 1285 {
 1286         vm_offset_t va;
 1287 
 1288         va = sva;
 1289         while (count-- > 0) {
 1290                 pmap_kremove(va);
 1291                 va += PAGE_SIZE;
 1292         }
 1293         pmap_invalidate_range(kernel_pmap, sva, va);
 1294 }
 1295 
 1296 /***************************************************
 1297  * Page table page management routines.....
 1298  ***************************************************/
 1299 static __inline void
 1300 pmap_free_zero_pages(vm_page_t free)
 1301 {
 1302         vm_page_t m;
 1303 
 1304         while (free != NULL) {
 1305                 m = free;
 1306                 free = m->right;
 1307                 /* Preserve the page's PG_ZERO setting. */
 1308                 vm_page_free_toq(m);
 1309         }
 1310 }
 1311 
 1312 /*
 1313  * Schedule the specified unused page table page to be freed.  Specifically,
 1314  * add the page to the specified list of pages that will be released to the
 1315  * physical memory manager after the TLB has been updated.
 1316  */
 1317 static __inline void
 1318 pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
 1319 {
 1320 
 1321         if (set_PG_ZERO)
 1322                 m->flags |= PG_ZERO;
 1323         else
 1324                 m->flags &= ~PG_ZERO;
 1325         m->right = *free;
 1326         *free = m;
 1327 }
 1328         
 1329 /*
 1330  * Inserts the specified page table page into the specified pmap's collection
 1331  * of idle page table pages.  Each of a pmap's page table pages is responsible
 1332  * for mapping a distinct range of virtual addresses.  The pmap's collection is
 1333  * ordered by this virtual address range.
 1334  */
 1335 static void
 1336 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
 1337 {
 1338         vm_page_t root;
 1339 
 1340         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 1341         root = pmap->pm_root;
 1342         if (root == NULL) {
 1343                 mpte->left = NULL;
 1344                 mpte->right = NULL;
 1345         } else {
 1346                 root = vm_page_splay(mpte->pindex, root);
 1347                 if (mpte->pindex < root->pindex) {
 1348                         mpte->left = root->left;
 1349                         mpte->right = root;
 1350                         root->left = NULL;
 1351                 } else if (mpte->pindex == root->pindex)
 1352                         panic("pmap_insert_pt_page: pindex already inserted");
 1353                 else {
 1354                         mpte->right = root->right;
 1355                         mpte->left = root;
 1356                         root->right = NULL;
 1357                 }
 1358         }
 1359         pmap->pm_root = mpte;
 1360 }
 1361 
 1362 /*
 1363  * Looks for a page table page mapping the specified virtual address in the
 1364  * specified pmap's collection of idle page table pages.  Returns NULL if there
 1365  * is no page table page corresponding to the specified virtual address.
 1366  */
 1367 static vm_page_t
 1368 pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
 1369 {
 1370         vm_page_t mpte;
 1371         vm_pindex_t pindex = pmap_pde_pindex(va);
 1372 
 1373         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 1374         if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
 1375                 mpte = vm_page_splay(pindex, mpte);
 1376                 if ((pmap->pm_root = mpte)->pindex != pindex)
 1377                         mpte = NULL;
 1378         }
 1379         return (mpte);
 1380 }
 1381 
 1382 /*
 1383  * Removes the specified page table page from the specified pmap's collection
 1384  * of idle page table pages.  The specified page table page must be a member of
 1385  * the pmap's collection.
 1386  */
 1387 static void
 1388 pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
 1389 {
 1390         vm_page_t root;
 1391 
 1392         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 1393         if (mpte != pmap->pm_root) {
 1394                 root = vm_page_splay(mpte->pindex, pmap->pm_root);
 1395                 KASSERT(mpte == root,
 1396                     ("pmap_remove_pt_page: mpte %p is missing from pmap %p",
 1397                     mpte, pmap));
 1398         }
 1399         if (mpte->left == NULL)
 1400                 root = mpte->right;
 1401         else {
 1402                 root = vm_page_splay(mpte->pindex, mpte->left);
 1403                 root->right = mpte->right;
 1404         }
 1405         pmap->pm_root = root;
 1406 }
 1407 
 1408 /*
 1409  * This routine unholds page table pages, and if the hold count
 1410  * drops to zero, then it decrements the wire count.
 1411  */
 1412 static __inline int
 1413 pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free)
 1414 {
 1415 
 1416         --m->wire_count;
 1417         if (m->wire_count == 0)
 1418                 return _pmap_unwire_pte_hold(pmap, va, m, free);
 1419         else
 1420                 return 0;
 1421 }
 1422 
 1423 static int 
 1424 _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, 
 1425     vm_page_t *free)
 1426 {
 1427 
 1428         /*
 1429          * unmap the page table page
 1430          */
 1431         if (m->pindex >= (NUPDE + NUPDPE)) {
 1432                 /* PDP page */
 1433                 pml4_entry_t *pml4;
 1434                 pml4 = pmap_pml4e(pmap, va);
 1435                 *pml4 = 0;
 1436         } else if (m->pindex >= NUPDE) {
 1437                 /* PD page */
 1438                 pdp_entry_t *pdp;
 1439                 pdp = pmap_pdpe(pmap, va);
 1440                 *pdp = 0;
 1441         } else {
 1442                 /* PTE page */
 1443                 pd_entry_t *pd;
 1444                 pd = pmap_pde(pmap, va);
 1445                 *pd = 0;
 1446         }
 1447         --pmap->pm_stats.resident_count;
 1448         if (m->pindex < NUPDE) {
 1449                 /* We just released a PT, unhold the matching PD */
 1450                 vm_page_t pdpg;
 1451 
 1452                 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
 1453                 pmap_unwire_pte_hold(pmap, va, pdpg, free);
 1454         }
 1455         if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
 1456                 /* We just released a PD, unhold the matching PDP */
 1457                 vm_page_t pdppg;
 1458 
 1459                 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
 1460                 pmap_unwire_pte_hold(pmap, va, pdppg, free);
 1461         }
 1462 
 1463         /*
 1464          * This is a release store so that the ordinary store unmapping
 1465          * the page table page is globally performed before TLB shoot-
 1466          * down is begun.
 1467          */
 1468         atomic_subtract_rel_int(&cnt.v_wire_count, 1);
 1469 
 1470         /* 
 1471          * Put page on a list so that it is released after
 1472          * *ALL* TLB shootdown is done
 1473          */
 1474         pmap_add_delayed_free_list(m, free, TRUE);
 1475         
 1476         return 1;
 1477 }
 1478 
 1479 /*
 1480  * After removing a page table entry, this routine is used to
 1481  * conditionally free the page, and manage the hold/wire counts.
 1482  */
 1483 static int
 1484 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, vm_page_t *free)
 1485 {
 1486         vm_page_t mpte;
 1487 
 1488         if (va >= VM_MAXUSER_ADDRESS)
 1489                 return 0;
 1490         KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
 1491         mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
 1492         return pmap_unwire_pte_hold(pmap, va, mpte, free);
 1493 }
 1494 
 1495 void
 1496 pmap_pinit0(pmap_t pmap)
 1497 {
 1498 
 1499         PMAP_LOCK_INIT(pmap);
 1500         pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
 1501         pmap->pm_root = NULL;
 1502         pmap->pm_active = 0;
 1503         TAILQ_INIT(&pmap->pm_pvchunk);
 1504         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 1505 }
 1506 
 1507 /*
 1508  * Initialize a preallocated and zeroed pmap structure,
 1509  * such as one in a vmspace structure.
 1510  */
 1511 int
 1512 pmap_pinit(pmap_t pmap)
 1513 {
 1514         vm_page_t pml4pg;
 1515         static vm_pindex_t color;
 1516 
 1517         PMAP_LOCK_INIT(pmap);
 1518 
 1519         /*
 1520          * allocate the page directory page
 1521          */
 1522         while ((pml4pg = vm_page_alloc(NULL, color++, VM_ALLOC_NOOBJ |
 1523             VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
 1524                 VM_WAIT;
 1525 
 1526         pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
 1527 
 1528         if ((pml4pg->flags & PG_ZERO) == 0)
 1529                 pagezero(pmap->pm_pml4);
 1530 
 1531         /* Wire in kernel global address entries. */
 1532         pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
 1533         pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U;
 1534 
 1535         /* install self-referential address mapping entry(s) */
 1536         pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
 1537 
 1538         pmap->pm_root = NULL;
 1539         pmap->pm_active = 0;
 1540         TAILQ_INIT(&pmap->pm_pvchunk);
 1541         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 1542 
 1543         return (1);
 1544 }
 1545 
 1546 /*
 1547  * this routine is called if the page table page is not
 1548  * mapped correctly.
 1549  *
 1550  * Note: If a page allocation fails at page table level two or three,
 1551  * one or two pages may be held during the wait, only to be released
 1552  * afterwards.  This conservative approach is easily argued to avoid
 1553  * race conditions.
 1554  */
 1555 static vm_page_t
 1556 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags)
 1557 {
 1558         vm_page_t m, pdppg, pdpg;
 1559 
 1560         KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
 1561             (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
 1562             ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
 1563 
 1564         /*
 1565          * Allocate a page table page.
 1566          */
 1567         if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
 1568             VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
 1569                 if (flags & M_WAITOK) {
 1570                         PMAP_UNLOCK(pmap);
 1571                         vm_page_unlock_queues();
 1572                         VM_WAIT;
 1573                         vm_page_lock_queues();
 1574                         PMAP_LOCK(pmap);
 1575                 }
 1576 
 1577                 /*
 1578                  * Indicate the need to retry.  While waiting, the page table
 1579                  * page may have been allocated.
 1580                  */
 1581                 return (NULL);
 1582         }
 1583         if ((m->flags & PG_ZERO) == 0)
 1584                 pmap_zero_page(m);
 1585 
 1586         /*
 1587          * Map the pagetable page into the process address space, if
 1588          * it isn't already there.
 1589          */
 1590 
 1591         if (ptepindex >= (NUPDE + NUPDPE)) {
 1592                 pml4_entry_t *pml4;
 1593                 vm_pindex_t pml4index;
 1594 
 1595                 /* Wire up a new PDPE page */
 1596                 pml4index = ptepindex - (NUPDE + NUPDPE);
 1597                 pml4 = &pmap->pm_pml4[pml4index];
 1598                 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 1599 
 1600         } else if (ptepindex >= NUPDE) {
 1601                 vm_pindex_t pml4index;
 1602                 vm_pindex_t pdpindex;
 1603                 pml4_entry_t *pml4;
 1604                 pdp_entry_t *pdp;
 1605 
 1606                 /* Wire up a new PDE page */
 1607                 pdpindex = ptepindex - NUPDE;
 1608                 pml4index = pdpindex >> NPML4EPGSHIFT;
 1609 
 1610                 pml4 = &pmap->pm_pml4[pml4index];
 1611                 if ((*pml4 & PG_V) == 0) {
 1612                         /* Have to allocate a new pdp, recurse */
 1613                         if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
 1614                             flags) == NULL) {
 1615                                 --m->wire_count;
 1616                                 atomic_subtract_int(&cnt.v_wire_count, 1);
 1617                                 vm_page_free_zero(m);
 1618                                 return (NULL);
 1619                         }
 1620                 } else {
 1621                         /* Add reference to pdp page */
 1622                         pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
 1623                         pdppg->wire_count++;
 1624                 }
 1625                 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 1626 
 1627                 /* Now find the pdp page */
 1628                 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 1629                 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 1630 
 1631         } else {
 1632                 vm_pindex_t pml4index;
 1633                 vm_pindex_t pdpindex;
 1634                 pml4_entry_t *pml4;
 1635                 pdp_entry_t *pdp;
 1636                 pd_entry_t *pd;
 1637 
 1638                 /* Wire up a new PTE page */
 1639                 pdpindex = ptepindex >> NPDPEPGSHIFT;
 1640                 pml4index = pdpindex >> NPML4EPGSHIFT;
 1641 
 1642                 /* First, find the pdp and check that its valid. */
 1643                 pml4 = &pmap->pm_pml4[pml4index];
 1644                 if ((*pml4 & PG_V) == 0) {
 1645                         /* Have to allocate a new pd, recurse */
 1646                         if (_pmap_allocpte(pmap, NUPDE + pdpindex,
 1647                             flags) == NULL) {
 1648                                 --m->wire_count;
 1649                                 atomic_subtract_int(&cnt.v_wire_count, 1);
 1650                                 vm_page_free_zero(m);
 1651                                 return (NULL);
 1652                         }
 1653                         pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 1654                         pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 1655                 } else {
 1656                         pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 1657                         pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 1658                         if ((*pdp & PG_V) == 0) {
 1659                                 /* Have to allocate a new pd, recurse */
 1660                                 if (_pmap_allocpte(pmap, NUPDE + pdpindex,
 1661                                     flags) == NULL) {
 1662                                         --m->wire_count;
 1663                                         atomic_subtract_int(&cnt.v_wire_count,
 1664                                             1);
 1665                                         vm_page_free_zero(m);
 1666                                         return (NULL);
 1667                                 }
 1668                         } else {
 1669                                 /* Add reference to the pd page */
 1670                                 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
 1671                                 pdpg->wire_count++;
 1672                         }
 1673                 }
 1674                 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
 1675 
 1676                 /* Now we know where the page directory page is */
 1677                 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
 1678                 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 1679         }
 1680 
 1681         pmap->pm_stats.resident_count++;
 1682 
 1683         return m;
 1684 }
 1685 
 1686 static vm_page_t
 1687 pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags)
 1688 {
 1689         vm_pindex_t pdpindex, ptepindex;
 1690         pdp_entry_t *pdpe;
 1691         vm_page_t pdpg;
 1692 
 1693         KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
 1694             (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
 1695             ("pmap_allocpde: flags is neither M_NOWAIT nor M_WAITOK"));
 1696 retry:
 1697         pdpe = pmap_pdpe(pmap, va);
 1698         if (pdpe != NULL && (*pdpe & PG_V) != 0) {
 1699                 /* Add a reference to the pd page. */
 1700                 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
 1701                 pdpg->wire_count++;
 1702         } else {
 1703                 /* Allocate a pd page. */
 1704                 ptepindex = pmap_pde_pindex(va);
 1705                 pdpindex = ptepindex >> NPDPEPGSHIFT;
 1706                 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, flags);
 1707                 if (pdpg == NULL && (flags & M_WAITOK))
 1708                         goto retry;
 1709         }
 1710         return (pdpg);
 1711 }
 1712 
 1713 static vm_page_t
 1714 pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
 1715 {
 1716         vm_pindex_t ptepindex;
 1717         pd_entry_t *pd;
 1718         vm_page_t m;
 1719 
 1720         KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
 1721             (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
 1722             ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
 1723 
 1724         /*
 1725          * Calculate pagetable page index
 1726          */
 1727         ptepindex = pmap_pde_pindex(va);
 1728 retry:
 1729         /*
 1730          * Get the page directory entry
 1731          */
 1732         pd = pmap_pde(pmap, va);
 1733 
 1734         /*
 1735          * This supports switching from a 2MB page to a
 1736          * normal 4K page.
 1737          */
 1738         if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
 1739                 if (!pmap_demote_pde(pmap, pd, va)) {
 1740                         /*
 1741                          * Invalidation of the 2MB page mapping may have caused
 1742                          * the deallocation of the underlying PD page.
 1743                          */
 1744                         pd = NULL;
 1745                 }
 1746         }
 1747 
 1748         /*
 1749          * If the page table page is mapped, we just increment the
 1750          * hold count, and activate it.
 1751          */
 1752         if (pd != NULL && (*pd & PG_V) != 0) {
 1753                 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
 1754                 m->wire_count++;
 1755         } else {
 1756                 /*
 1757                  * Here if the pte page isn't mapped, or if it has been
 1758                  * deallocated.
 1759                  */
 1760                 m = _pmap_allocpte(pmap, ptepindex, flags);
 1761                 if (m == NULL && (flags & M_WAITOK))
 1762                         goto retry;
 1763         }
 1764         return (m);
 1765 }
 1766 
 1767 
 1768 /***************************************************
 1769  * Pmap allocation/deallocation routines.
 1770  ***************************************************/
 1771 
 1772 /*
 1773  * Release any resources held by the given physical map.
 1774  * Called when a pmap initialized by pmap_pinit is being released.
 1775  * Should only be called if the map contains no valid mappings.
 1776  */
 1777 void
 1778 pmap_release(pmap_t pmap)
 1779 {
 1780         vm_page_t m;
 1781 
 1782         KASSERT(pmap->pm_stats.resident_count == 0,
 1783             ("pmap_release: pmap resident count %ld != 0",
 1784             pmap->pm_stats.resident_count));
 1785         KASSERT(pmap->pm_root == NULL,
 1786             ("pmap_release: pmap has reserved page table page(s)"));
 1787 
 1788         m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
 1789 
 1790         pmap->pm_pml4[KPML4I] = 0;      /* KVA */
 1791         pmap->pm_pml4[DMPML4I] = 0;     /* Direct Map */
 1792         pmap->pm_pml4[PML4PML4I] = 0;   /* Recursive Mapping */
 1793 
 1794         m->wire_count--;
 1795         atomic_subtract_int(&cnt.v_wire_count, 1);
 1796         vm_page_free_zero(m);
 1797         PMAP_LOCK_DESTROY(pmap);
 1798 }
 1799 
 1800 static int
 1801 kvm_size(SYSCTL_HANDLER_ARGS)
 1802 {
 1803         unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
 1804 
 1805         return sysctl_handle_long(oidp, &ksize, 0, req);
 1806 }
 1807 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 
 1808     0, 0, kvm_size, "LU", "Size of KVM");
 1809 
 1810 static int
 1811 kvm_free(SYSCTL_HANDLER_ARGS)
 1812 {
 1813         unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
 1814 
 1815         return sysctl_handle_long(oidp, &kfree, 0, req);
 1816 }
 1817 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 
 1818     0, 0, kvm_free, "LU", "Amount of KVM free");
 1819 
 1820 /*
 1821  * grow the number of kernel page table entries, if needed
 1822  */
 1823 void
 1824 pmap_growkernel(vm_offset_t addr)
 1825 {
 1826         vm_paddr_t paddr;
 1827         vm_page_t nkpg;
 1828         pd_entry_t *pde, newpdir;
 1829         pdp_entry_t *pdpe;
 1830 
 1831         mtx_assert(&kernel_map->system_mtx, MA_OWNED);
 1832 
 1833         /*
 1834          * Return if "addr" is within the range of kernel page table pages
 1835          * that were preallocated during pmap bootstrap.  Moreover, leave
 1836          * "kernel_vm_end" and the kernel page table as they were.
 1837          *
 1838          * The correctness of this action is based on the following
 1839          * argument: vm_map_findspace() allocates contiguous ranges of the
 1840          * kernel virtual address space.  It calls this function if a range
 1841          * ends after "kernel_vm_end".  If the kernel is mapped between
 1842          * "kernel_vm_end" and "addr", then the range cannot begin at
 1843          * "kernel_vm_end".  In fact, its beginning address cannot be less
 1844          * than the kernel.  Thus, there is no immediate need to allocate
 1845          * any new kernel page table pages between "kernel_vm_end" and
 1846          * "KERNBASE".
 1847          */
 1848         if (KERNBASE < addr && addr <= KERNBASE + NKPT * NBPDR)
 1849                 return;
 1850 
 1851         addr = roundup2(addr, NBPDR);
 1852         if (addr - 1 >= kernel_map->max_offset)
 1853                 addr = kernel_map->max_offset;
 1854         while (kernel_vm_end < addr) {
 1855                 pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
 1856                 if ((*pdpe & PG_V) == 0) {
 1857                         /* We need a new PDP entry */
 1858                         nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
 1859                             VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
 1860                             VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 1861                         if (nkpg == NULL)
 1862                                 panic("pmap_growkernel: no memory to grow kernel");
 1863                         if ((nkpg->flags & PG_ZERO) == 0)
 1864                                 pmap_zero_page(nkpg);
 1865                         paddr = VM_PAGE_TO_PHYS(nkpg);
 1866                         *pdpe = (pdp_entry_t)
 1867                                 (paddr | PG_V | PG_RW | PG_A | PG_M);
 1868                         continue; /* try again */
 1869                 }
 1870                 pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
 1871                 if ((*pde & PG_V) != 0) {
 1872                         kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
 1873                         if (kernel_vm_end - 1 >= kernel_map->max_offset) {
 1874                                 kernel_vm_end = kernel_map->max_offset;
 1875                                 break;                       
 1876                         }
 1877                         continue;
 1878                 }
 1879 
 1880                 nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
 1881                     VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
 1882                     VM_ALLOC_ZERO);
 1883                 if (nkpg == NULL)
 1884                         panic("pmap_growkernel: no memory to grow kernel");
 1885                 if ((nkpg->flags & PG_ZERO) == 0)
 1886                         pmap_zero_page(nkpg);
 1887                 paddr = VM_PAGE_TO_PHYS(nkpg);
 1888                 newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M);
 1889                 pde_store(pde, newpdir);
 1890 
 1891                 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
 1892                 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
 1893                         kernel_vm_end = kernel_map->max_offset;
 1894                         break;                       
 1895                 }
 1896         }
 1897 }
 1898 
 1899 
 1900 /***************************************************
 1901  * page management routines.
 1902  ***************************************************/
 1903 
 1904 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
 1905 CTASSERT(_NPCM == 3);
 1906 CTASSERT(_NPCPV == 168);
 1907 
 1908 static __inline struct pv_chunk *
 1909 pv_to_chunk(pv_entry_t pv)
 1910 {
 1911 
 1912         return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK);
 1913 }
 1914 
 1915 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
 1916 
 1917 #define PC_FREE0        0xfffffffffffffffful
 1918 #define PC_FREE1        0xfffffffffffffffful
 1919 #define PC_FREE2        0x000000fffffffffful
 1920 
 1921 static uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
 1922 
 1923 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
 1924         "Current number of pv entries");
 1925 
 1926 #ifdef PV_STATS
 1927 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
 1928 
 1929 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
 1930         "Current number of pv entry chunks");
 1931 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
 1932         "Current number of pv entry chunks allocated");
 1933 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
 1934         "Current number of pv entry chunks frees");
 1935 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
 1936         "Number of times tried to get a chunk page but failed.");
 1937 
 1938 static long pv_entry_frees, pv_entry_allocs;
 1939 static int pv_entry_spare;
 1940 
 1941 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
 1942         "Current number of pv entry frees");
 1943 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
 1944         "Current number of pv entry allocs");
 1945 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
 1946         "Current number of spare pv entries");
 1947 
 1948 static int pmap_collect_inactive, pmap_collect_active;
 1949 
 1950 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
 1951         "Current number times pmap_collect called on inactive queue");
 1952 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
 1953         "Current number times pmap_collect called on active queue");
 1954 #endif
 1955 
 1956 /*
 1957  * We are in a serious low memory condition.  Resort to
 1958  * drastic measures to free some pages so we can allocate
 1959  * another pv entry chunk.  This is normally called to
 1960  * unmap inactive pages, and if necessary, active pages.
 1961  *
 1962  * We do not, however, unmap 2mpages because subsequent accesses will
 1963  * allocate per-page pv entries until repromotion occurs, thereby
 1964  * exacerbating the shortage of free pv entries.
 1965  */
 1966 static void
 1967 pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
 1968 {
 1969         struct md_page *pvh;
 1970         pd_entry_t *pde;
 1971         pmap_t pmap;
 1972         pt_entry_t *pte, tpte;
 1973         pv_entry_t next_pv, pv;
 1974         vm_offset_t va;
 1975         vm_page_t m, free;
 1976 
 1977         TAILQ_FOREACH(m, &vpq->pl, pageq) {
 1978                 if (m->hold_count || m->busy)
 1979                         continue;
 1980                 TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
 1981                         va = pv->pv_va;
 1982                         pmap = PV_PMAP(pv);
 1983                         /* Avoid deadlock and lock recursion. */
 1984                         if (pmap > locked_pmap)
 1985                                 PMAP_LOCK(pmap);
 1986                         else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
 1987                                 continue;
 1988                         pmap->pm_stats.resident_count--;
 1989                         pde = pmap_pde(pmap, va);
 1990                         KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found"
 1991                             " a 2mpage in page %p's pv list", m));
 1992                         pte = pmap_pde_to_pte(pde, va);
 1993                         tpte = pte_load_clear(pte);
 1994                         KASSERT((tpte & PG_W) == 0,
 1995                             ("pmap_collect: wired pte %#lx", tpte));
 1996                         if (tpte & PG_A)
 1997                                 vm_page_flag_set(m, PG_REFERENCED);
 1998                         if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 1999                                 vm_page_dirty(m);
 2000                         free = NULL;
 2001                         pmap_unuse_pt(pmap, va, *pde, &free);
 2002                         pmap_invalidate_page(pmap, va);
 2003                         pmap_free_zero_pages(free);
 2004                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 2005                         if (TAILQ_EMPTY(&m->md.pv_list)) {
 2006                                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 2007                                 if (TAILQ_EMPTY(&pvh->pv_list))
 2008                                         vm_page_flag_clear(m, PG_WRITEABLE);
 2009                         }
 2010                         free_pv_entry(pmap, pv);
 2011                         if (pmap != locked_pmap)
 2012                                 PMAP_UNLOCK(pmap);
 2013                 }
 2014         }
 2015 }
 2016 
 2017 
 2018 /*
 2019  * free the pv_entry back to the free list
 2020  */
 2021 static void
 2022 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 2023 {
 2024         vm_page_t m;
 2025         struct pv_chunk *pc;
 2026         int idx, field, bit;
 2027 
 2028         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2029         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2030         PV_STAT(pv_entry_frees++);
 2031         PV_STAT(pv_entry_spare++);
 2032         pv_entry_count--;
 2033         pc = pv_to_chunk(pv);
 2034         idx = pv - &pc->pc_pventry[0];
 2035         field = idx / 64;
 2036         bit = idx % 64;
 2037         pc->pc_map[field] |= 1ul << bit;
 2038         /* move to head of list */
 2039         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 2040         if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
 2041             pc->pc_map[2] != PC_FREE2) {
 2042                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 2043                 return;
 2044         }
 2045         PV_STAT(pv_entry_spare -= _NPCPV);
 2046         PV_STAT(pc_chunk_count--);
 2047         PV_STAT(pc_chunk_frees++);
 2048         /* entire chunk is free, return it */
 2049         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 2050         dump_drop_page(m->phys_addr);
 2051         vm_page_unwire(m, 0);
 2052         vm_page_free(m);
 2053 }
 2054 
 2055 /*
 2056  * get a new pv_entry, allocating a block from the system
 2057  * when needed.
 2058  */
 2059 static pv_entry_t
 2060 get_pv_entry(pmap_t pmap, int try)
 2061 {
 2062         static const struct timeval printinterval = { 60, 0 };
 2063         static struct timeval lastprint;
 2064         static vm_pindex_t colour;
 2065         struct vpgqueues *pq;
 2066         int bit, field;
 2067         pv_entry_t pv;
 2068         struct pv_chunk *pc;
 2069         vm_page_t m;
 2070 
 2071         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2072         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2073         PV_STAT(pv_entry_allocs++);
 2074         pv_entry_count++;
 2075         if (pv_entry_count > pv_entry_high_water)
 2076                 if (ratecheck(&lastprint, &printinterval))
 2077                         printf("Approaching the limit on PV entries, consider "
 2078                             "increasing either the vm.pmap.shpgperproc or the "
 2079                             "vm.pmap.pv_entry_max sysctl.\n");
 2080         pq = NULL;
 2081 retry:
 2082         pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 2083         if (pc != NULL) {
 2084                 for (field = 0; field < _NPCM; field++) {
 2085                         if (pc->pc_map[field]) {
 2086                                 bit = bsfq(pc->pc_map[field]);
 2087                                 break;
 2088                         }
 2089                 }
 2090                 if (field < _NPCM) {
 2091                         pv = &pc->pc_pventry[field * 64 + bit];
 2092                         pc->pc_map[field] &= ~(1ul << bit);
 2093                         /* If this was the last item, move it to tail */
 2094                         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
 2095                             pc->pc_map[2] == 0) {
 2096                                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 2097                                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 2098                         }
 2099                         PV_STAT(pv_entry_spare--);
 2100                         return (pv);
 2101                 }
 2102         }
 2103         /* No free items, allocate another chunk */
 2104         m = vm_page_alloc(NULL, colour, (pq == &vm_page_queues[PQ_ACTIVE] ?
 2105             VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ |
 2106             VM_ALLOC_WIRED);
 2107         if (m == NULL) {
 2108                 if (try) {
 2109                         pv_entry_count--;
 2110                         PV_STAT(pc_chunk_tryfail++);
 2111                         return (NULL);
 2112                 }
 2113                 /*
 2114                  * Reclaim pv entries: At first, destroy mappings to inactive
 2115                  * pages.  After that, if a pv chunk entry is still needed,
 2116                  * destroy mappings to active pages.
 2117                  */
 2118                 if (pq == NULL) {
 2119                         PV_STAT(pmap_collect_inactive++);
 2120                         pq = &vm_page_queues[PQ_INACTIVE];
 2121                 } else if (pq == &vm_page_queues[PQ_INACTIVE]) {
 2122                         PV_STAT(pmap_collect_active++);
 2123                         pq = &vm_page_queues[PQ_ACTIVE];
 2124                 } else
 2125                         panic("get_pv_entry: increase vm.pmap.shpgperproc");
 2126                 pmap_collect(pmap, pq);
 2127                 goto retry;
 2128         }
 2129         PV_STAT(pc_chunk_count++);
 2130         PV_STAT(pc_chunk_allocs++);
 2131         colour++;
 2132         dump_add_page(m->phys_addr);
 2133         pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 2134         pc->pc_pmap = pmap;
 2135         pc->pc_map[0] = PC_FREE0 & ~1ul;        /* preallocated bit 0 */
 2136         pc->pc_map[1] = PC_FREE1;
 2137         pc->pc_map[2] = PC_FREE2;
 2138         pv = &pc->pc_pventry[0];
 2139         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 2140         PV_STAT(pv_entry_spare += _NPCPV - 1);
 2141         return (pv);
 2142 }
 2143 
 2144 /*
 2145  * First find and then remove the pv entry for the specified pmap and virtual
 2146  * address from the specified pv list.  Returns the pv entry if found and NULL
 2147  * otherwise.  This operation can be performed on pv lists for either 4KB or
 2148  * 2MB page mappings.
 2149  */
 2150 static __inline pv_entry_t
 2151 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 2152 {
 2153         pv_entry_t pv;
 2154 
 2155         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2156         TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
 2157                 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
 2158                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
 2159                         break;
 2160                 }
 2161         }
 2162         return (pv);
 2163 }
 2164 
 2165 /*
 2166  * After demotion from a 2MB page mapping to 512 4KB page mappings,
 2167  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
 2168  * entries for each of the 4KB page mappings.
 2169  */
 2170 static void
 2171 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
 2172 {
 2173         struct md_page *pvh;
 2174         pv_entry_t pv;
 2175         vm_offset_t va_last;
 2176         vm_page_t m;
 2177 
 2178         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2179         KASSERT((pa & PDRMASK) == 0,
 2180             ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
 2181 
 2182         /*
 2183          * Transfer the 2mpage's pv entry for this mapping to the first
 2184          * page's pv list.
 2185          */
 2186         pvh = pa_to_pvh(pa);
 2187         va = trunc_2mpage(va);
 2188         pv = pmap_pvh_remove(pvh, pmap, va);
 2189         KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
 2190         m = PHYS_TO_VM_PAGE(pa);
 2191         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 2192         /* Instantiate the remaining NPTEPG - 1 pv entries. */
 2193         va_last = va + NBPDR - PAGE_SIZE;
 2194         do {
 2195                 m++;
 2196                 KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
 2197                     ("pmap_pv_demote_pde: page %p is not managed", m));
 2198                 va += PAGE_SIZE;
 2199                 pmap_insert_entry(pmap, va, m);
 2200         } while (va < va_last);
 2201 }
 2202 
 2203 /*
 2204  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
 2205  * replace the many pv entries for the 4KB page mappings by a single pv entry
 2206  * for the 2MB page mapping.
 2207  */
 2208 static void
 2209 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
 2210 {
 2211         struct md_page *pvh;
 2212         pv_entry_t pv;
 2213         vm_offset_t va_last;
 2214         vm_page_t m;
 2215 
 2216         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2217         KASSERT((pa & PDRMASK) == 0,
 2218             ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
 2219 
 2220         /*
 2221          * Transfer the first page's pv entry for this mapping to the
 2222          * 2mpage's pv list.  Aside from avoiding the cost of a call
 2223          * to get_pv_entry(), a transfer avoids the possibility that
 2224          * get_pv_entry() calls pmap_collect() and that pmap_collect()
 2225          * removes one of the mappings that is being promoted.
 2226          */
 2227         m = PHYS_TO_VM_PAGE(pa);
 2228         va = trunc_2mpage(va);
 2229         pv = pmap_pvh_remove(&m->md, pmap, va);
 2230         KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
 2231         pvh = pa_to_pvh(pa);
 2232         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
 2233         /* Free the remaining NPTEPG - 1 pv entries. */
 2234         va_last = va + NBPDR - PAGE_SIZE;
 2235         do {
 2236                 m++;
 2237                 va += PAGE_SIZE;
 2238                 pmap_pvh_free(&m->md, pmap, va);
 2239         } while (va < va_last);
 2240 }
 2241 
 2242 /*
 2243  * First find and then destroy the pv entry for the specified pmap and virtual
 2244  * address.  This operation can be performed on pv lists for either 4KB or 2MB
 2245  * page mappings.
 2246  */
 2247 static void
 2248 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 2249 {
 2250         pv_entry_t pv;
 2251 
 2252         pv = pmap_pvh_remove(pvh, pmap, va);
 2253         KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
 2254         free_pv_entry(pmap, pv);
 2255 }
 2256 
 2257 static void
 2258 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
 2259 {
 2260         struct md_page *pvh;
 2261 
 2262         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2263         pmap_pvh_free(&m->md, pmap, va);
 2264         if (TAILQ_EMPTY(&m->md.pv_list)) {
 2265                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 2266                 if (TAILQ_EMPTY(&pvh->pv_list))
 2267                         vm_page_flag_clear(m, PG_WRITEABLE);
 2268         }
 2269 }
 2270 
 2271 /*
 2272  * Create a pv entry for page at pa for
 2273  * (pmap, va).
 2274  */
 2275 static void
 2276 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
 2277 {
 2278         pv_entry_t pv;
 2279 
 2280         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2281         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2282         pv = get_pv_entry(pmap, FALSE);
 2283         pv->pv_va = va;
 2284         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 2285 }
 2286 
 2287 /*
 2288  * Conditionally create a pv entry.
 2289  */
 2290 static boolean_t
 2291 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
 2292 {
 2293         pv_entry_t pv;
 2294 
 2295         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2296         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2297         if (pv_entry_count < pv_entry_high_water && 
 2298             (pv = get_pv_entry(pmap, TRUE)) != NULL) {
 2299                 pv->pv_va = va;
 2300                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 2301                 return (TRUE);
 2302         } else
 2303                 return (FALSE);
 2304 }
 2305 
 2306 /*
 2307  * Create the pv entry for a 2MB page mapping.
 2308  */
 2309 static boolean_t
 2310 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
 2311 {
 2312         struct md_page *pvh;
 2313         pv_entry_t pv;
 2314 
 2315         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2316         if (pv_entry_count < pv_entry_high_water && 
 2317             (pv = get_pv_entry(pmap, TRUE)) != NULL) {
 2318                 pv->pv_va = va;
 2319                 pvh = pa_to_pvh(pa);
 2320                 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
 2321                 return (TRUE);
 2322         } else
 2323                 return (FALSE);
 2324 }
 2325 
 2326 /*
 2327  * Fills a page table page with mappings to consecutive physical pages.
 2328  */
 2329 static void
 2330 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
 2331 {
 2332         pt_entry_t *pte;
 2333 
 2334         for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
 2335                 *pte = newpte;
 2336                 newpte += PAGE_SIZE;
 2337         }
 2338 }
 2339 
 2340 /*
 2341  * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
 2342  * mapping is invalidated.
       *
       * The pmap must be locked by the caller (asserted below), and "*pde" must
       * have both PG_PS and PG_V set on entry.
       *
       * Returns TRUE once "*pde" has been rewritten to reference a page table
       * page.  Returns FALSE after removing the 2MB mapping instead, which
       * happens only when no saved page table page is found and either PG_A is
       * clear in the old PDE or a new page table page cannot be allocated.
 2343  */
 2344 static boolean_t
 2345 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 2346 {
 2347         pd_entry_t newpde, oldpde;
 2348         pt_entry_t *firstpte, newpte;
 2349         vm_paddr_t mptepa;
 2350         vm_page_t free, mpte;
 2351 
 2352         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2353         oldpde = *pde;
 2354         KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
 2355             ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
              /*
               * Look for a page table page already associated with this
               * address.  If one is found, it is taken back out with
               * pmap_remove_pt_page() and reused below; otherwise a new page
               * must be allocated.
               */
 2356         mpte = pmap_lookup_pt_page(pmap, va);
 2357         if (mpte != NULL)
 2358                 pmap_remove_pt_page(pmap, mpte);
 2359         else {
 2360                 KASSERT((oldpde & PG_W) == 0,
 2361                     ("pmap_demote_pde: page table page for a wired mapping"
 2362                     " is missing"));
 2363 
 2364                 /*
 2365                  * Invalidate the 2MB page mapping and return "failure" if the
 2366                  * mapping was never accessed or the allocation of the new
 2367                  * page table page fails.  If the 2MB page mapping belongs to
 2368                  * the direct map region of the kernel's address space, then
 2369                  * the page allocation request specifies the highest possible
 2370                  * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
 2371                  * normal.  Page table pages are preallocated for every other
 2372                  * part of the kernel address space, so the direct map region
 2373                  * is the only part of the kernel address space that must be
 2374                  * handled here.
 2375                  */
 2376                 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
 2377                     pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
 2378                     DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
 2379                     VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 2380                         free = NULL;
 2381                         pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free);
 2382                         pmap_invalidate_page(pmap, trunc_2mpage(va));
 2383                         pmap_free_zero_pages(free);
 2384                         CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
 2385                             " in pmap %p", va, pmap);
 2386                         return (FALSE);
 2387                 }
                      /*
                       * The newly allocated page table page is counted as
                       * resident only for user addresses.
                       */
 2388                 if (va < VM_MAXUSER_ADDRESS)
 2389                         pmap->pm_stats.resident_count++;
 2390         }
 2391         mptepa = VM_PAGE_TO_PHYS(mpte);
 2392         firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
              /*
               * Build the replacement PDE.  It references the page table page,
               * carries PG_M, PG_A, PG_RW and PG_V unconditionally, and keeps
               * only PG_U from the old PDE.
               */
 2393         newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
 2394         KASSERT((oldpde & PG_A) != 0,
 2395             ("pmap_demote_pde: oldpde is missing PG_A"));
 2396         KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
 2397             ("pmap_demote_pde: oldpde is missing PG_M"));
              /*
               * The template for the 4KB entries is the old PDE without PG_PS.
               * When PG_PDE_PAT is set, the XOR below clears it and toggles
               * PG_PTE_PAT.
               */
 2398         newpte = oldpde & ~PG_PS;
 2399         if ((newpte & PG_PDE_PAT) != 0)
 2400                 newpte ^= PG_PDE_PAT | PG_PTE_PAT;
 2401 
 2402         /*
 2403          * If the page table page is new, initialize it.  A wire_count of
               * 1 is what identifies the page as new here; it is then raised to
               * NPTEPG, the number of entries that pmap_fill_ptp() writes.
 2404          */
 2405         if (mpte->wire_count == 1) {
 2406                 mpte->wire_count = NPTEPG;
 2407                 pmap_fill_ptp(firstpte, newpte);
 2408         }
 2409         KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
 2410             ("pmap_demote_pde: firstpte and newpte map different physical"
 2411             " addresses"));
 2412 
 2413         /*
 2414          * If the mapping has changed attributes, update the page table
 2415          * entries.  Only the bits in PG_PTE_PROMOTE are compared.
 2416          */
 2417         if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
 2418                 pmap_fill_ptp(firstpte, newpte);
 2419 
 2420         /*
 2421          * Demote the mapping.  This pmap is locked.  The old PDE has
 2422          * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
 2423          * set.  Thus, there is no danger of a race with another
 2424          * processor changing the setting of PG_A and/or PG_M between
 2425          * the read above and the store below.
 2426          */
 2427         if (workaround_erratum383)
 2428                 pmap_update_pde(pmap, va, pde, newpde);
 2429         else
 2430                 pde_store(pde, newpde);
 2431 
 2432         /*
 2433          * Invalidate a stale recursive mapping of the page table page.
               * This is done only for addresses at or above VM_MAXUSER_ADDRESS.
 2434          */
 2435         if (va >= VM_MAXUSER_ADDRESS)
 2436                 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
 2437 
 2438         /*
 2439          * Demote the pv entry.  This depends on the earlier demotion
 2440          * of the mapping.  Specifically, the (re)creation of a per-
 2441          * page pv entry might trigger the execution of pmap_collect(),
 2442          * which might reclaim a newly (re)created per-page pv entry
 2443          * and destroy the associated mapping.  In order to destroy
 2444          * the mapping, the PDE must have already changed from mapping
 2445          * the 2mpage to referencing the page table page.
 2446          */
 2447         if ((oldpde & PG_MANAGED) != 0)
 2448                 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
 2449 
 2450         pmap_pde_demotions++;
 2451         CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
 2452             " in pmap %p", va, pmap);
 2453         return (TRUE);
 2454 }
 2455 
 2456 /*
 2457  * pmap_remove_pde: do the things to unmap a superpage in a process
       *
       * "pdq" is the page directory entry holding the 2MB mapping and "sva" is
       * the 2MB-aligned virtual address that it maps (asserted below).  The
       * pmap must be locked by the caller.  The entry is cleared, the pmap's
       * wired and resident counts are reduced by the number of 4KB pages in a
       * 2MB page, and, for a managed mapping, the modified and referenced
       * state of the old entry is passed on to every constituent vm_page.
       *
       * Page table pages that become free are handed to "free".  The return
       * value is whatever pmap_unuse_pt() returns.
       *
       * Apart from the PG_G case below, no TLB invalidation is issued here;
       * the callers visible in this file do that themselves.
 2458  */
 2459 static int
 2460 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
 2461     vm_page_t *free)
 2462 {
 2463         struct md_page *pvh;
 2464         pd_entry_t oldpde;
 2465         vm_offset_t eva, va;
 2466         vm_page_t m, mpte;
 2467 
 2468         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2469         KASSERT((sva & PDRMASK) == 0,
 2470             ("pmap_remove_pde: sva is not 2mpage aligned"));
 2471         oldpde = pte_load_clear(pdq);
 2472         if (oldpde & PG_W)
 2473                 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
 2474 
 2475         /*
 2476          * Machines that don't support invlpg, also don't support
 2477          * PG_G.
 2478          */
 2479         if (oldpde & PG_G)
 2480                 pmap_invalidate_page(kernel_pmap, sva);
 2481         pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
 2482         if (oldpde & PG_MANAGED) {
                      /*
                       * Release the pv entry recorded for the 2MB mapping, then
                       * walk the 4KB pages that made up the superpage.
                       */
 2483                 pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
 2484                 pmap_pvh_free(pvh, pmap, sva);
 2485                 eva = sva + NBPDR;
 2486                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 2487                     va < eva; va += PAGE_SIZE, m++) {
 2488                         if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
 2489                                 vm_page_dirty(m);
 2490                         if (oldpde & PG_A)
 2491                                 vm_page_flag_set(m, PG_REFERENCED);
                              /*
                               * PG_WRITEABLE is cleared only when neither the
                               * page's own pv list nor the 2MB pv list has
                               * any entries left.
                               */
 2492                         if (TAILQ_EMPTY(&m->md.pv_list) &&
 2493                             TAILQ_EMPTY(&pvh->pv_list))
 2494                                 vm_page_flag_clear(m, PG_WRITEABLE);
 2495                 }
 2496         }
 2497         if (pmap == kernel_pmap) {
                      /*
                       * NOTE(review): pmap_demote_pde() asserts that *pde has
                       * PG_PS and PG_V set, yet *pdq was already passed to
                       * pte_load_clear() above.  If that helper zeroes the
                       * entry, as its name suggests, this call looks
                       * inconsistent with that assertion -- confirm how the
                       * kernel pmap case is meant to work.
                       */
 2498                 if (!pmap_demote_pde(pmap, pdq, sva))
 2499                         panic("pmap_remove_pde: failed demotion");
 2500         } else {
                      /*
                       * For any other pmap, a page table page that was saved
                       * for this address is no longer needed: take it out of
                       * the lookup structure, drop its wiring and queue it on
                       * the delayed free list.
                       */
 2501                 mpte = pmap_lookup_pt_page(pmap, sva);
 2502                 if (mpte != NULL) {
 2503                         pmap_remove_pt_page(pmap, mpte);
 2504                         pmap->pm_stats.resident_count--;
 2505                         KASSERT(mpte->wire_count == NPTEPG,
 2506                             ("pmap_remove_pde: pte page wire count error"));
 2507                         mpte->wire_count = 0;
 2508                         pmap_add_delayed_free_list(mpte, free, FALSE);
 2509                         atomic_subtract_int(&cnt.v_wire_count, 1);
 2510                 }
 2511         }
 2512         return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
 2513 }
 2514 
 2515 /*
 2516  * pmap_remove_pte: do the things to unmap a page in a process
       *
       * "ptq" is the page table entry mapping "va" and "ptepde" is the value
       * of the page directory entry above it.  The pmap must be locked by the
       * caller.  The entry is cleared and the pmap's wired and resident counts
       * are adjusted.  For a managed mapping, the modified and referenced
       * state of the old entry is passed on to the vm_page and the page's pv
       * entry for this mapping is removed.
       *
       * Page table pages that become free are handed to "free".  The return
       * value is whatever pmap_unuse_pt() returns.
       *
       * Apart from the PG_G case below, no TLB invalidation is issued here.
 2517  */
 2518 static int
 2519 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 
 2520     pd_entry_t ptepde, vm_page_t *free)
 2521 {
 2522         pt_entry_t oldpte;
 2523         vm_page_t m;
 2524 
 2525         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2526         oldpte = pte_load_clear(ptq);
 2527         if (oldpte & PG_W)
 2528                 pmap->pm_stats.wired_count -= 1;
 2529         /*
 2530          * Machines that don't support invlpg, also don't support
 2531          * PG_G.
 2532          */
 2533         if (oldpte & PG_G)
 2534                 pmap_invalidate_page(kernel_pmap, va);
 2535         pmap->pm_stats.resident_count -= 1;
 2536         if (oldpte & PG_MANAGED) {
 2537                 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
                      /* Only a writable mapping with PG_M set dirties the page. */
 2538                 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 2539                         vm_page_dirty(m);
 2540                 if (oldpte & PG_A)
 2541                         vm_page_flag_set(m, PG_REFERENCED);
 2542                 pmap_remove_entry(pmap, m, va);
 2543         }
 2544         return (pmap_unuse_pt(pmap, va, ptepde, free));
 2545 }
 2546 
 2547 /*
 2548  * Remove a single page from a process address space
       *
       * "pde" is the page directory entry covering "va".  The pmap must be
       * locked by the caller.  Nothing is done if either the page directory
       * entry or the page table entry for "va" lacks PG_V.  Otherwise the
       * mapping is removed with pmap_remove_pte() and the TLB entry for "va"
       * is invalidated.  Freed page table pages are handed to "free".
       *
       * The entry at "pde" is used as a page table pointer without checking
       * PG_PS; the caller visible in this file checks PG_PS beforehand.
 2549  */
 2550 static void
 2551 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free)
 2552 {
 2553         pt_entry_t *pte;
 2554 
 2555         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2556         if ((*pde & PG_V) == 0)
 2557                 return;
 2558         pte = pmap_pde_to_pte(pde, va);
 2559         if ((*pte & PG_V) == 0)
 2560                 return;
 2561         pmap_remove_pte(pmap, pte, va, *pde, free);
 2562         pmap_invalidate_page(pmap, va);
 2563 }
 2564 
 2565 /*
 2566  *      Remove the given range of addresses from the specified map.
 2567  *
 2568  *      It is assumed that the start and end are properly
 2569  *      rounded to the page size.
       *
       *      The range is [sva, eva).  The page queues lock and the pmap
       *      lock are both acquired here and released before returning.
       *      Page table pages freed along the way are collected on a local
       *      list and released only after both locks have been dropped.
       *
       *      TLB invalidation for mappings without PG_G is deferred: the
       *      loop only records that one was removed, and a single
       *      pmap_invalidate_all() is issued at the end.
 2570  */
 2571 void
 2572 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 2573 {
 2574         vm_offset_t va_next;
 2575         pml4_entry_t *pml4e;
 2576         pdp_entry_t *pdpe;
 2577         pd_entry_t ptpaddr, *pde;
 2578         pt_entry_t *pte;
 2579         vm_page_t free = NULL;
 2580         int anyvalid;
 2581 
 2582         /*
 2583          * Perform an unsynchronized read.  This is, however, safe.
 2584          */
 2585         if (pmap->pm_stats.resident_count == 0)
 2586                 return;
 2587 
 2588         anyvalid = 0;
 2589 
 2590         vm_page_lock_queues();
 2591         PMAP_LOCK(pmap);
 2592 
 2593         /*
 2594          * special handling of removing one page.  a very
 2595          * common operation and easy to short circuit some
 2596          * code.
               *
               * The shortcut is taken only when the page directory entry
               * exists and is not a 2MB mapping.  pmap_remove_page() performs
               * its own TLB invalidation, so "anyvalid" is left at zero.
 2597          */
 2598         if (sva + PAGE_SIZE == eva) {
 2599                 pde = pmap_pde(pmap, sva);
 2600                 if (pde && (*pde & PG_PS) == 0) {
 2601                         pmap_remove_page(pmap, sva, pde, &free);
 2602                         goto out;
 2603                 }
 2604         }
 2605 
 2606         for (; sva < eva; sva = va_next) {
 2607 
                      /* Stop early once the pmap has nothing left resident. */
 2608                 if (pmap->pm_stats.resident_count == 0)
 2609                         break;
 2610 
                      /*
                       * Skip to the next PML4 boundary when this PML4 entry is
                       * invalid.  A result below sva means that the addition
                       * wrapped around, so the walk is ended by jumping to
                       * eva.  The same pattern is used at the two lower
                       * levels.
                       */
 2611                 pml4e = pmap_pml4e(pmap, sva);
 2612                 if ((*pml4e & PG_V) == 0) {
 2613                         va_next = (sva + NBPML4) & ~PML4MASK;
 2614                         if (va_next < sva)
 2615                                 va_next = eva;
 2616                         continue;
 2617                 }
 2618 
 2619                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 2620                 if ((*pdpe & PG_V) == 0) {
 2621                         va_next = (sva + NBPDP) & ~PDPMASK;
 2622                         if (va_next < sva)
 2623                                 va_next = eva;
 2624                         continue;
 2625                 }
 2626 
 2627                 /*
 2628                  * Calculate index for next page table.
 2629                  */
 2630                 va_next = (sva + NBPDR) & ~PDRMASK;
 2631                 if (va_next < sva)
 2632                         va_next = eva;
 2633 
 2634                 pde = pmap_pdpe_to_pde(pdpe, sva);
 2635                 ptpaddr = *pde;
 2636 
 2637                 /*
 2638                  * Weed out invalid mappings.
 2639                  */
 2640                 if (ptpaddr == 0)
 2641                         continue;
 2642 
 2643                 /*
 2644                  * Check for large page.
 2645                  */
 2646                 if ((ptpaddr & PG_PS) != 0) {
 2647                         /*
 2648                          * Are we removing the entire large page?  If not,
 2649                          * demote the mapping and fall through.
                               *
                               * The first test holds only when sva is 2MB
                               * aligned; the second requires the range to
                               * reach the end of the large page.
 2650                          */
 2651                         if (sva + NBPDR == va_next && eva >= va_next) {
 2652                                 /*
 2653                                  * The TLB entry for a PG_G mapping is
 2654                                  * invalidated by pmap_remove_pde().
 2655                                  */
 2656                                 if ((ptpaddr & PG_G) == 0)
 2657                                         anyvalid = 1;
 2658                                 pmap_remove_pde(pmap, pde, sva, &free);
 2659                                 continue;
 2660                         } else if (!pmap_demote_pde(pmap, pde, sva)) {
 2661                                 /* The large page mapping was destroyed. */
 2662                                 continue;
 2663                         } else
                                      /* Reload: the PDE now references a page table page. */
 2664                                 ptpaddr = *pde;
 2665                 }
 2666 
 2667                 /*
 2668                  * Limit our scan to either the end of the va represented
 2669                  * by the current page table page, or to the end of the
 2670                  * range being removed.
 2671                  */
 2672                 if (va_next > eva)
 2673                         va_next = eva;
 2674 
 2675                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 2676                     sva += PAGE_SIZE) {
 2677                         if (*pte == 0)
 2678                                 continue;
 2679 
 2680                         /*
 2681                          * The TLB entry for a PG_G mapping is invalidated
 2682                          * by pmap_remove_pte().
 2683                          */
 2684                         if ((*pte & PG_G) == 0)
 2685                                 anyvalid = 1;
                              /*
                               * A nonzero return ends the scan of this page
                               * table page; presumably it signals that the
                               * page table page itself was released -- confirm
                               * against pmap_unuse_pt().
                               */
 2686                         if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free))
 2687                                 break;
 2688                 }
 2689         }
 2690 out:
 2691         if (anyvalid)
 2692                 pmap_invalidate_all(pmap);
 2693         vm_page_unlock_queues();        
 2694         PMAP_UNLOCK(pmap);
              /* Release the collected page table pages with no locks held. */
 2695         pmap_free_zero_pages(free);
 2696 }
 2697 
 2698 /*
 2699  *      Routine:        pmap_remove_all
 2700  *      Function:
 2701  *              Removes this physical page from
 2702  *              all physical maps in which it resides.
 2703  *              Reflects back modify bits to the pager.
 2704  *
 2705  *      Notes:
 2706  *              Original versions of this routine were very
 2707  *              inefficient because they iteratively called
 2708  *              pmap_remove (slow...)
       *
       *              The page must not be fictitious and the caller must
       *              hold the page queues mutex; both are asserted.  The
       *              work is done in two passes: every 2MB mapping that
       *              covers the page is demoted first, then every 4KB
       *              mapping of the page is removed.  Each pmap is locked
       *              only while one of its mappings is being processed.
 2709  */
 2710 
 2711 void
 2712 pmap_remove_all(vm_page_t m)
 2713 {
 2714         struct md_page *pvh;
 2715         pv_entry_t pv;
 2716         pmap_t pmap;
 2717         pt_entry_t *pte, tpte;
 2718         pd_entry_t *pde;
 2719         vm_offset_t va;
 2720         vm_page_t free;
 2721 
 2722         KASSERT((m->flags & PG_FICTITIOUS) == 0,
 2723             ("pmap_remove_all: page %p is fictitious", m));
 2724         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
              /*
               * Pass 1: demote the 2MB mappings.  The result of
               * pmap_demote_pde() is deliberately ignored.  The loop ends
               * only when pvh->pv_list is empty, so it relies on each call
               * taking the entry off that list; presumably both the success
               * and the failure path of pmap_demote_pde() do so -- confirm.
               */
 2725         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 2726         while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
 2727                 va = pv->pv_va;
 2728                 pmap = PV_PMAP(pv);
 2729                 PMAP_LOCK(pmap);
 2730                 pde = pmap_pde(pmap, va);
 2731                 (void)pmap_demote_pde(pmap, pde, va);
 2732                 PMAP_UNLOCK(pmap);
 2733         }
              /*
               * Pass 2: remove every 4KB mapping of the page, one pv entry at
               * a time, until the page's own pv list is empty.
               */
 2734         while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 2735                 pmap = PV_PMAP(pv);
 2736                 PMAP_LOCK(pmap);
 2737                 pmap->pm_stats.resident_count--;
 2738                 pde = pmap_pde(pmap, pv->pv_va);
 2739                 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
 2740                     " a 2mpage in page %p's pv list", m));
 2741                 pte = pmap_pde_to_pte(pde, pv->pv_va);
 2742                 tpte = pte_load_clear(pte);
 2743                 if (tpte & PG_W)
 2744                         pmap->pm_stats.wired_count--;
 2745                 if (tpte & PG_A)
 2746                         vm_page_flag_set(m, PG_REFERENCED);
 2747 
 2748                 /*
 2749                  * Update the vm_page_t clean and reference bits.
 2750                  */
 2751                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 2752                         vm_page_dirty(m);
                      /*
                       * Drop the reference on the page table page, invalidate
                       * the TLB entry, and only then release any page table
                       * pages that were freed.
                       */
 2753                 free = NULL;
 2754                 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
 2755                 pmap_invalidate_page(pmap, pv->pv_va);
 2756                 pmap_free_zero_pages(free);
 2757                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 2758                 free_pv_entry(pmap, pv);
 2759                 PMAP_UNLOCK(pmap);
 2760         }
              /* With no mappings left, the page can no longer be written. */
 2761         vm_page_flag_clear(m, PG_WRITEABLE);
 2762 }
 2763 
 2764 /*
 2765  * pmap_protect_pde: do the things to protect a 2mpage in a process
       *
       * "pde" holds the 2MB mapping of the 2MB-aligned address "sva"
       * (asserted below) and the pmap must be locked by the caller.  Write
       * permission is taken away when "prot" lacks VM_PROT_WRITE, and pg_nx is
       * set when "prot" lacks VM_PROT_EXECUTE.
       *
       * Returns TRUE when the entry was changed and the caller still has to
       * invalidate the TLB.  Returns FALSE when nothing changed or when the
       * mapping has PG_G set, in which case the invalidation was already done
       * here.
 2766  */
 2767 static boolean_t
 2768 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
 2769 {
 2770         pd_entry_t newpde, oldpde;
 2771         vm_offset_t eva, va;
 2772         vm_page_t m;
 2773         boolean_t anychanged;
 2774 
 2775         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2776         KASSERT((sva & PDRMASK) == 0,
 2777             ("pmap_protect_pde: sva is not 2mpage aligned"));
 2778         anychanged = FALSE;
              /*
               * Everything from here to the compare-and-set is repeated if the
               * entry is found to have changed in the meantime.
               */
 2779 retry:
 2780         oldpde = newpde = *pde;
 2781         if (oldpde & PG_MANAGED) {
 2782                 eva = sva + NBPDR;
 2783                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 2784                     va < eva; va += PAGE_SIZE, m++) {
 2785                         /*
 2786                          * In contrast to the analogous operation on a 4KB page
 2787                          * mapping, the mapping's PG_A flag is not cleared and
 2788                          * the page's PG_REFERENCED flag is not set.  The
 2789                          * reason is that pmap_demote_pde() expects that a 2MB
 2790                          * page mapping with a stored page table page has PG_A
 2791                          * set.
 2792                          */
 2793                         if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
 2794                                 vm_page_dirty(m);
 2795                 }
 2796         }
              /* PG_M is cleared together with PG_RW. */
 2797         if ((prot & VM_PROT_WRITE) == 0)
 2798                 newpde &= ~(PG_RW | PG_M);
 2799         if ((prot & VM_PROT_EXECUTE) == 0)
 2800                 newpde |= pg_nx;
 2801         if (newpde != oldpde) {
 2802                 if (!atomic_cmpset_long(pde, oldpde, newpde))
 2803                         goto retry;
 2804                 if (oldpde & PG_G)
 2805                         pmap_invalidate_page(pmap, sva);
 2806                 else
 2807                         anychanged = TRUE;
 2808         }
 2809         return (anychanged);
 2810 }
 2811 
 2812 /*
 2813  *      Set the physical protection on the
 2814  *      specified range of this map as requested.
       *
       *      The range is [sva, eva).  If "prot" does not include
       *      VM_PROT_READ, the range is removed with pmap_remove() instead.
       *      If "prot" includes both VM_PROT_WRITE and VM_PROT_EXECUTE,
       *      nothing is done, since this routine only ever takes write or
       *      execute permission away.
       *
       *      The page queues lock and the pmap lock are acquired here and
       *      released before returning.  TLB invalidation for mappings
       *      without PG_G is deferred to a single pmap_invalidate_all() at
       *      the end.
 2815  */
 2816 void
 2817 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 2818 {
 2819         vm_offset_t va_next;
 2820         pml4_entry_t *pml4e;
 2821         pdp_entry_t *pdpe;
 2822         pd_entry_t ptpaddr, *pde;
 2823         pt_entry_t *pte;
 2824         int anychanged;
 2825 
 2826         if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
 2827                 pmap_remove(pmap, sva, eva);
 2828                 return;
 2829         }
 2830 
 2831         if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
 2832             (VM_PROT_WRITE|VM_PROT_EXECUTE))
 2833                 return;
 2834 
 2835         anychanged = 0;
 2836 
 2837         vm_page_lock_queues();
 2838         PMAP_LOCK(pmap);
 2839         for (; sva < eva; sva = va_next) {
 2840 
                      /*
                       * Skip to the next PML4 boundary when this PML4 entry is
                       * invalid.  A result below sva means that the addition
                       * wrapped around, so the walk is ended by jumping to
                       * eva.  The same pattern is used at the two lower
                       * levels.
                       */
 2841                 pml4e = pmap_pml4e(pmap, sva);
 2842                 if ((*pml4e & PG_V) == 0) {
 2843                         va_next = (sva + NBPML4) & ~PML4MASK;
 2844                         if (va_next < sva)
 2845                                 va_next = eva;
 2846                         continue;
 2847                 }
 2848 
 2849                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 2850                 if ((*pdpe & PG_V) == 0) {
 2851                         va_next = (sva + NBPDP) & ~PDPMASK;
 2852                         if (va_next < sva)
 2853                                 va_next = eva;
 2854                         continue;
 2855                 }
 2856 
 2857                 va_next = (sva + NBPDR) & ~PDRMASK;
 2858                 if (va_next < sva)
 2859                         va_next = eva;
 2860 
 2861                 pde = pmap_pdpe_to_pde(pdpe, sva);
 2862                 ptpaddr = *pde;
 2863 
 2864                 /*
 2865                  * Weed out invalid mappings.
 2866                  */
 2867                 if (ptpaddr == 0)
 2868                         continue;
 2869 
 2870                 /*
 2871                  * Check for large page.
 2872                  */
 2873                 if ((ptpaddr & PG_PS) != 0) {
 2874                         /*
 2875                          * Are we protecting the entire large page?  If not,
 2876                          * demote the mapping and fall through.
                               *
                               * The first test holds only when sva is 2MB
                               * aligned; the second requires the range to
                               * reach the end of the large page.
 2877                          */
 2878                         if (sva + NBPDR == va_next && eva >= va_next) {
 2879                                 /*
 2880                                  * The TLB entry for a PG_G mapping is
 2881                                  * invalidated by pmap_protect_pde().
 2882                                  */
 2883                                 if (pmap_protect_pde(pmap, pde, sva, prot))
 2884                                         anychanged = 1;
 2885                                 continue;
 2886                         } else if (!pmap_demote_pde(pmap, pde, sva)) {
 2887                                 /* The large page mapping was destroyed. */
 2888                                 continue;
 2889                         }
 2890                 }
 2891 
                      /* Do not scan past the end of the requested range. */
 2892                 if (va_next > eva)
 2893                         va_next = eva;
 2894 
 2895                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 2896                     sva += PAGE_SIZE) {
 2897                         pt_entry_t obits, pbits;
 2898                         vm_page_t m;
 2899 
                              /*
                               * The entry is re-read and the new value is
                               * recomputed whenever the compare-and-set below
                               * finds that the entry changed underneath us.
                               */
 2900 retry:
 2901                         obits = pbits = *pte;
 2902                         if ((pbits & PG_V) == 0)
 2903                                 continue;
 2904                         if (pbits & PG_MANAGED) {
                                      /*
                                       * Move the referenced bit from the
                                       * mapping to the page, and record a
                                       * pending modification on the page.
                                       * "m" is looked up at most once.
                                       */
 2905                                 m = NULL;
 2906                                 if (pbits & PG_A) {
 2907                                         m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
 2908                                         vm_page_flag_set(m, PG_REFERENCED);
 2909                                         pbits &= ~PG_A;
 2910                                 }
 2911                                 if ((pbits & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 2912                                         if (m == NULL)
 2913                                                 m = PHYS_TO_VM_PAGE(pbits &
 2914                                                     PG_FRAME);
 2915                                         vm_page_dirty(m);
 2916                                 }
 2917                         }
 2918 
 2919                         if ((prot & VM_PROT_WRITE) == 0)
 2920                                 pbits &= ~(PG_RW | PG_M);
 2921                         if ((prot & VM_PROT_EXECUTE) == 0)
 2922                                 pbits |= pg_nx;
 2923 
 2924                         if (pbits != obits) {
 2925                                 if (!atomic_cmpset_long(pte, obits, pbits))
 2926                                         goto retry;
                                      /*
                                       * A PG_G mapping is invalidated right
                                       * away; anything else is covered by the
                                       * pmap_invalidate_all() at the end.
                                       */
 2927                                 if (obits & PG_G)
 2928                                         pmap_invalidate_page(pmap, sva);
 2929                                 else
 2930                                         anychanged = 1;
 2931                         }
 2932                 }
 2933         }
 2934         if (anychanged)
 2935                 pmap_invalidate_all(pmap);
 2936         vm_page_unlock_queues();
 2937         PMAP_UNLOCK(pmap);
 2938 }
 2939 
 2940 /*
 2941  * Tries to promote the 512, contiguous 4KB page mappings that are within a
 2942  * single page table page (PTP) to a single 2MB page mapping.  For promotion
 2943  * to occur, two conditions must be met: (1) the 4KB page mappings must map
 2944  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
 2945  * identical characteristics.
       *
       * "pde" is the page directory entry that currently references the PTP
       * and "va" is an address within the range that it maps.  The pmap must
       * be locked by the caller.  Nothing is returned: on failure the
       * mappings are left as 4KB mappings and pmap_pde_p_failures is
       * incremented; on success pmap_pde_promotions is incremented.  Note
       * that a failed attempt may still have cleared PG_RW in entries that
       * had PG_RW set without PG_M.
 2946  */
 2947 static void
 2948 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 2949 {
 2950         pd_entry_t newpde;
 2951         pt_entry_t *firstpte, oldpte, pa, *pte;
 2952         vm_offset_t oldpteva;
 2953         vm_page_t mpte;
 2954 
 2955         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2956 
 2957         /*
 2958          * Examine the first PTE in the specified PTP.  Abort if this PTE is
 2959          * either invalid, unused, or does not map the first 4KB physical page
 2960          * within a 2MB page.
               *
               * The single comparison below checks all three: PG_V and PG_A
               * must be set, and the frame bits selected by PDRMASK must be
               * zero.
 2961          */
 2962         firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
 2963 setpde:
 2964         newpde = *firstpte;
 2965         if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
 2966                 pmap_pde_p_failures++;
 2967                 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 2968                     " in pmap %p", va, pmap);
 2969                 return;
 2970         }
 2971         if ((newpde & (PG_M | PG_RW)) == PG_RW) {
 2972                 /*
 2973                  * When PG_M is already clear, PG_RW can be cleared without
 2974                  * a TLB invalidation.
                       *
                       * If the compare-and-set fails, the PTE changed in the
                       * meantime and the examination restarts at "setpde".
 2975                  */
 2976                 if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
 2977                         goto setpde;
 2978                 newpde &= ~PG_RW;
 2979         }
 2980 
 2981         /*
 2982          * Examine each of the other PTEs in the specified PTP.  Abort if this
 2983          * PTE maps an unexpected 4KB physical page or does not have identical
 2984          * characteristics to the first PTE.
               *
               * "pa" holds the value expected from the current PTE once it is
               * masked with PG_FRAME | PG_A | PG_V.  The scan runs from the
               * last PTE down to the second, so "pa" starts at the last 4KB
               * page of the 2MB frame and drops by PAGE_SIZE per iteration.
 2985          */
 2986         pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
 2987         for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
 2988 setpte:
 2989                 oldpte = *pte;
 2990                 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
 2991                         pmap_pde_p_failures++;
 2992                         CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 2993                             " in pmap %p", va, pmap);
 2994                         return;
 2995                 }
 2996                 if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
 2997                         /*
 2998                          * When PG_M is already clear, PG_RW can be cleared
 2999                          * without a TLB invalidation.
 3000                          */
 3001                         if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
 3002                                 goto setpte;
 3003                         oldpte &= ~PG_RW;
                              /* oldpteva is computed only for the trace record. */
 3004                         oldpteva = (oldpte & PG_FRAME & PDRMASK) |
 3005                             (va & ~PDRMASK);
 3006                         CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
 3007                             " in pmap %p", oldpteva, pmap);
 3008                 }
                      /* Only the bits in PG_PTE_PROMOTE have to match. */
 3009                 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
 3010                         pmap_pde_p_failures++;
 3011                         CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 3012                             " in pmap %p", va, pmap);
 3013                         return;
 3014                 }
 3015                 pa -= PAGE_SIZE;
 3016         }
 3017 
 3018         /*
 3019          * Save the page table page in its current state until the PDE
 3020          * mapping the superpage is demoted by pmap_demote_pde() or
 3021          * destroyed by pmap_remove_pde().
 3022          */
 3023         mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 3024         KASSERT(mpte >= vm_page_array &&
 3025             mpte < &vm_page_array[vm_page_array_size],
 3026             ("pmap_promote_pde: page table page is out of range"));
 3027         KASSERT(mpte->pindex == pmap_pde_pindex(va),
 3028             ("pmap_promote_pde: page table page's pindex is wrong"));
 3029         pmap_insert_pt_page(pmap, mpte);
 3030 
 3031         /*
 3032          * Promote the pv entries.
 3033          */
 3034         if ((newpde & PG_MANAGED) != 0)
 3035                 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
 3036 
 3037         /*
 3038          * Propagate the PAT index to its proper position.  When PG_PTE_PAT
               * is set, the XOR clears it and toggles PG_PDE_PAT.
 3039          */
 3040         if ((newpde & PG_PTE_PAT) != 0)
 3041                 newpde ^= PG_PDE_PAT | PG_PTE_PAT;
 3042 
 3043         /*
 3044          * Map the superpage.  The new PDE is the first PTE's value, as
               * adjusted above, with PG_PS added.
 3045          */
 3046         if (workaround_erratum383)
 3047                 pmap_update_pde(pmap, va, pde, PG_PS | newpde);
 3048         else
 3049                 pde_store(pde, PG_PS | newpde);
 3050 
 3051         pmap_pde_promotions++;
 3052         CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
 3053             " in pmap %p", va, pmap);
 3054 }
 3055 
 3056 /*
 3057  *      Insert the given physical page (p) at
 3058  *      the specified virtual address (v) in the
 3059  *      target physical map with the protection requested.
 3060  *
 3061  *      If specified, the page will be wired down, meaning
 3062  *      that the related pte can not be reclaimed.
 3063  *
 3064  *      NB:  This is the only routine which MAY NOT lazy-evaluate
 3065  *      or lose information.  That is, this routine must actually
 3066  *      insert this page into the given map NOW.
 3067  */
 3068 void
 3069 pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
 3070     vm_prot_t prot, boolean_t wired)
 3071 {
              /*
               * Install a 4KB mapping of page "m" at "va" in "pmap".
               *
               * "prot" decides whether PG_RW and the no-execute bit are placed
               * in the new PTE, "wired" decides PG_W, and "access" is consulted
               * only to decide whether PG_M is preset when the PTE is rewritten.
               *
               * The page queues lock and the pmap lock are both held from just
               * after the assertions until the end of the function.  The
               * function panics if the page directory entry covering "va" is
               * invalid or maps a 2MB page.
               */
 3072         vm_paddr_t pa;
 3073         pd_entry_t *pde;
 3074         pt_entry_t *pte;
 3075         vm_paddr_t opa;
 3076         pt_entry_t origpte, newpte;
 3077         vm_page_t mpte, om;
 3078         boolean_t invlva;
 3079 
 3080         va = trunc_page(va);
 3081         KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
 3082         KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
 3083             ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va));
 3084 
 3085         mpte = NULL;
 3086 
 3087         vm_page_lock_queues();
 3088         PMAP_LOCK(pmap);
 3089 
 3090         /*
 3091          * In the case that a page table page is not
 3092          * resident, we are creating it here.
               * Only user addresses get a page table page here, so "mpte"
               * stays NULL for va >= VM_MAXUSER_ADDRESS.
 3093          */
 3094         if (va < VM_MAXUSER_ADDRESS) {
 3095                 mpte = pmap_allocpte(pmap, va, M_WAITOK);
 3096         }
 3097 
 3098         pde = pmap_pde(pmap, va);
 3099         if (pde != NULL && (*pde & PG_V) != 0) {
 3100                 if ((*pde & PG_PS) != 0)
 3101                         panic("pmap_enter: attempted pmap_enter on 2MB page");
 3102                 pte = pmap_pde_to_pte(pde, va);
 3103         } else
 3104                 panic("pmap_enter: invalid page directory va=%#lx", va);
 3105 
              /*
               * "pa" starts as the frame address of the new page and later has
               * PTE flag bits (PG_MANAGED) OR'ed into it.  "opa" is the frame
               * held by the existing PTE; it is zero when the PTE is empty.
               * "om" is set below only when the old mapping is managed.
               */
 3106         pa = VM_PAGE_TO_PHYS(m);
 3107         om = NULL;
 3108         origpte = *pte;
 3109         opa = origpte & PG_FRAME;
 3110 
 3111         /*
 3112          * Mapping has not changed, must be protection or wiring change.
 3113          */
 3114         if (origpte && (opa == pa)) {
 3115                 /*
 3116                  * Wiring change, just update stats. We don't worry about
 3117                  * wiring PT pages as they remain resident as long as there
 3118                  * are valid mappings in them. Hence, if a user page is wired,
 3119                  * the PT page will be also.
 3120                  */
 3121                 if (wired && ((origpte & PG_W) == 0))
 3122                         pmap->pm_stats.wired_count++;
 3123                 else if (!wired && (origpte & PG_W))
 3124                         pmap->pm_stats.wired_count--;
 3125 
 3126                 /*
 3127                  * Remove extra pte reference
 3128                  */
 3129                 if (mpte)
 3130                         mpte->wire_count--;
 3131 
 3132                 /*
 3133                  * We might be turning off write access to the page,
 3134                  * so we go ahead and sense modify status.
                       * The PV entry already exists in this case, so the PV
                       * insertion below is skipped by jumping to "validate".
 3135                  */
 3136                 if (origpte & PG_MANAGED) {
 3137                         om = m;
 3138                         pa |= PG_MANAGED;
 3139                 }
 3140                 goto validate;
 3141         } 
 3142         /*
 3143          * Mapping has changed, invalidate old range and fall through to
 3144          * handle validating new mapping.
 3145          */
 3146         if (opa) {
 3147                 if (origpte & PG_W)
 3148                         pmap->pm_stats.wired_count--;
 3149                 if (origpte & PG_MANAGED) {
 3150                         om = PHYS_TO_VM_PAGE(opa);
 3151                         pmap_remove_entry(pmap, om, va);
 3152                 }
                      /*
                       * A different page is being replaced in an occupied slot.
                       * Presumably pmap_allocpte() added a reference for this
                       * call (its definition is not shown here); the slot was
                       * already counted, so the extra reference is dropped and
                       * at least one must remain.
                       */
 3153                 if (mpte != NULL) {
 3154                         mpte->wire_count--;
 3155                         KASSERT(mpte->wire_count > 0,
 3156                             ("pmap_enter: missing reference to page table page,"
 3157                              " va: 0x%lx", va));
 3158                 }
 3159         } else
                      /* No previous frame: this is a brand new mapping. */
 3160                 pmap->pm_stats.resident_count++;
 3161 
 3162         /*
 3163          * Enter on the PV list if part of our managed memory.
 3164          */
 3165         if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
 3166                 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
 3167                     ("pmap_enter: managed mapping within the clean submap"));
 3168                 pmap_insert_entry(pmap, va, m);
 3169                 pa |= PG_MANAGED;
 3170         }
 3171 
 3172         /*
 3173          * Increment counters
 3174          */
 3175         if (wired)
 3176                 pmap->pm_stats.wired_count++;
 3177 
 3178 validate:
 3179         /*
 3180          * Now validate mapping with desired protection/wiring.
 3181          */
 3182         newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
 3183         if ((prot & VM_PROT_WRITE) != 0) {
 3184                 newpte |= PG_RW;
 3185                 vm_page_flag_set(m, PG_WRITEABLE);
 3186         }
 3187         if ((prot & VM_PROT_EXECUTE) == 0)
 3188                 newpte |= pg_nx;
 3189         if (wired)
 3190                 newpte |= PG_W;
 3191         if (va < VM_MAXUSER_ADDRESS)
 3192                 newpte |= PG_U;
 3193         if (pmap == kernel_pmap)
 3194                 newpte |= PG_G;
 3195 
 3196         /*
 3197          * if the mapping or permission bits are different, we need
 3198          * to update the pte.
               * The accessed and modified bits of the old entry are ignored
               * in the comparison.
 3199          */
 3200         if ((origpte & ~(PG_M|PG_A)) != newpte) {
 3201                 newpte |= PG_A;
 3202                 if ((access & VM_PROT_WRITE) != 0)
 3203                         newpte |= PG_M;
 3204                 if (origpte & PG_V) {
                              /*
                               * Replacing a valid entry.  The old value is
                               * re-read by the exchange so that PG_A/PG_M are
                               * current.  The TLB entry is invalidated only if
                               * the old entry was accessed and either the frame
                               * changed or execute permission was removed, or
                               * if a modified writable entry loses PG_RW.
                               */
 3205                         invlva = FALSE;
 3206                         origpte = pte_load_store(pte, newpte);
 3207                         if (origpte & PG_A) {
 3208                                 if (origpte & PG_MANAGED)
 3209                                         vm_page_flag_set(om, PG_REFERENCED);
 3210                                 if (opa != VM_PAGE_TO_PHYS(m) || ((origpte &
 3211                                     PG_NX) == 0 && (newpte & PG_NX)))
 3212                                         invlva = TRUE;
 3213                         }
 3214                         if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 3215                                 if ((origpte & PG_MANAGED) != 0)
 3216                                         vm_page_dirty(om);
 3217                                 if ((newpte & PG_RW) == 0)
 3218                                         invlva = TRUE;
 3219                         }
 3220                         if (invlva)
 3221                                 pmap_invalidate_page(pmap, va);
 3222                 } else
 3223                         pte_store(pte, newpte);
 3224         }
 3225 
 3226         /*
 3227          * If both the page table page and the reservation are fully
 3228          * populated, then attempt promotion.
               * A NULL "mpte" (no page table page was allocated above) does
               * not block the attempt.
 3229          */
 3230         if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
 3231             pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0)
 3232                 pmap_promote_pde(pmap, pde, va);
 3233 
 3234         vm_page_unlock_queues();
 3235         PMAP_UNLOCK(pmap);
 3236 }
 3237 
 3238 /*
 3239  * Tries to create a 2MB page mapping.  Returns TRUE if successful and FALSE
 3240  * otherwise.  Fails if (1) a page table page cannot be allocated without
 3241  * blocking, (2) a mapping already exists at the specified virtual address, or
 3242  * (3) a pv entry cannot be allocated without reclaiming another pv entry. 
 3243  */
 3244 static boolean_t
 3245 pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 3246 {
 3247         pd_entry_t *pde, newpde;
 3248         vm_page_t free, mpde;
 3249 
 3250         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 3251         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3252         if ((mpde = pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) {
 3253                 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 3254                     " in pmap %p", va, pmap);
 3255                 return (FALSE);
 3256         }
 3257         pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde));
 3258         pde = &pde[pmap_pde_index(va)];
 3259         if ((*pde & PG_V) != 0) {
 3260                 KASSERT(mpde->wire_count > 1,
 3261                     ("pmap_enter_pde: mpde's wire count is too low"));
 3262                 mpde->wire_count--;
 3263                 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 3264                     " in pmap %p", va, pmap);
 3265                 return (FALSE);
 3266         }
 3267         newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
 3268             PG_PS | PG_V;
 3269         if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
 3270                 newpde |= PG_MANAGED;
 3271 
 3272                 /*
 3273                  * Abort this mapping if its PV entry could not be created.
 3274                  */
 3275                 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
 3276                         free = NULL;
 3277                         if (pmap_unwire_pte_hold(pmap, va, mpde, &free)) {
 3278                                 pmap_invalidate_page(pmap, va);
 3279                                 pmap_free_zero_pages(free);
 3280                         }
 3281                         CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 3282                             " in pmap %p", va, pmap);
 3283                         return (FALSE);
 3284                 }
 3285         }
 3286         if ((prot & VM_PROT_EXECUTE) == 0)
 3287                 newpde |= pg_nx;
 3288         if (va < VM_MAXUSER_ADDRESS)
 3289                 newpde |= PG_U;
 3290 
 3291         /*
 3292          * Increment counters.
 3293          */
 3294         pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
 3295 
 3296         /*
 3297          * Map the superpage.
 3298          */
 3299         pde_store(pde, newpde);
 3300 
 3301         pmap_pde_mappings++;
 3302         CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
 3303             " in pmap %p", va, pmap);
 3304         return (TRUE);
 3305 }
 3306 
 3307 /*
 3308  * Maps a sequence of resident pages belonging to the same object.
 3309  * The sequence begins with the given page m_start.  This page is
 3310  * mapped at the given virtual address start.  Each subsequent page is
 3311  * mapped at a virtual address that is offset from start by the same
 3312  * amount as the page is offset from m_start within the object.  The
 3313  * last page in the sequence is the page with the largest offset from
 3314  * m_start that can be mapped at a virtual address less than the given
 3315  * virtual address end.  Not every virtual page between start and end
 3316  * is mapped; only those for which a resident page exists with the
 3317  * corresponding offset from m_start are mapped.
 3318  */
 3319 void
 3320 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
 3321     vm_page_t m_start, vm_prot_t prot)
 3322 {
 3323         vm_offset_t va;
 3324         vm_page_t m, mpte;
 3325         vm_pindex_t diff, psize;
 3326 
 3327         VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
 3328         psize = atop(end - start);
 3329         mpte = NULL;
 3330         m = m_start;
 3331         PMAP_LOCK(pmap);
 3332         while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
 3333                 va = start + ptoa(diff);
 3334                 if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
 3335                     (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
 3336                     pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
 3337                     pmap_enter_pde(pmap, va, m, prot))
 3338                         m = &m[NBPDR / PAGE_SIZE - 1];
 3339                 else
 3340                         mpte = pmap_enter_quick_locked(pmap, va, m, prot,
 3341                             mpte);
 3342                 m = TAILQ_NEXT(m, listq);
 3343         }
 3344         PMAP_UNLOCK(pmap);
 3345 }
 3346 
 3347 /*
 3348  * this code makes some *MAJOR* assumptions:
 3349  * 1. Current pmap & pmap exists.
 3350  * 2. Not wired.
 3351  * 3. Read access.
 3352  * 4. No page table pages.
 3353  * but is *MUCH* faster than pmap_enter...
 3354  */
 3355 
 3356 void
 3357 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 3358 {
 3359 
 3360         PMAP_LOCK(pmap);
 3361         (void) pmap_enter_quick_locked(pmap, va, m, prot, NULL);
 3362         PMAP_UNLOCK(pmap);
 3363 }
 3364 
 3365 static vm_page_t
 3366 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
 3367     vm_prot_t prot, vm_page_t mpte)
 3368 {
              /*
               * Try to create a 4KB mapping of "m" at "va" without PG_RW and
               * without PG_W.  An existing mapping at "va" is never replaced.
               *
               * "mpte" is a hint: the page table page returned by a previous
               * call, or NULL.  When it covers "va" its wire count is bumped
               * directly instead of looking up the page directory entry.
               *
               * Returns the page table page used for "va" when the mapping was
               * created for a user address.  Returns NULL for addresses at or
               * above VM_MAXUSER_ADDRESS and on every failure path (2MB mapping
               * present, page table page allocation failed with M_NOWAIT, PTE
               * already non-zero, or PV entry insertion failed).
               *
               * The caller must hold the page queues mutex and the pmap lock.
               */
 3369         vm_page_t free;
 3370         pt_entry_t *pte;
 3371         vm_paddr_t pa;
 3372 
 3373         KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
 3374             (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0,
 3375             ("pmap_enter_quick_locked: managed mapping within the clean submap"));
 3376         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 3377         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3378 
 3379         /*
 3380          * In the case that a page table page is not
 3381          * resident, we are creating it here.
 3382          */
 3383         if (va < VM_MAXUSER_ADDRESS) {
 3384                 vm_pindex_t ptepindex;
 3385                 pd_entry_t *ptepa;
 3386 
 3387                 /*
 3388                  * Calculate pagetable page index
 3389                  */
 3390                 ptepindex = pmap_pde_pindex(va);
 3391                 if (mpte && (mpte->pindex == ptepindex)) {
                              /* The hint covers va: just take another reference. */
 3392                         mpte->wire_count++;
 3393                 } else {
 3394                         /*
 3395                          * Get the page directory entry
 3396                          */
 3397                         ptepa = pmap_pde(pmap, va);
 3398 
 3399                         /*
 3400                          * If the page table page is mapped, we just increment
 3401                          * the hold count, and activate it.
 3402                          */
 3403                         if (ptepa && (*ptepa & PG_V) != 0) {
                                      /* va is already covered by a 2MB mapping. */
 3404                                 if (*ptepa & PG_PS)
 3405                                         return (NULL);
 3406                                 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
 3407                                 mpte->wire_count++;
 3408                         } else {
 3409                                 mpte = _pmap_allocpte(pmap, ptepindex,
 3410                                     M_NOWAIT);
 3411                                 if (mpte == NULL)
 3412                                         return (mpte);
 3413                         }
 3414                 }
                      /* Address the PTE through the direct map of the PT page. */
 3415                 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
 3416                 pte = &pte[pmap_pte_index(va)];
 3417         } else {
 3418                 mpte = NULL;
 3419                 pte = vtopte(va);
 3420         }
              /*
               * Something is already entered at va: drop the reference taken
               * above and leave the existing PTE untouched.
               */
 3421         if (*pte) {
 3422                 if (mpte != NULL) {
 3423                         mpte->wire_count--;
 3424                         mpte = NULL;
 3425                 }
 3426                 return (mpte);
 3427         }
 3428 
 3429         /*
 3430          * Enter on the PV list if part of our managed memory.
               * On failure the page table page reference is released; if that
               * frees the page table page, the TLB entry for va is invalidated
               * and the freed pages are handed to pmap_free_zero_pages().
 3431          */
 3432         if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 &&
 3433             !pmap_try_insert_pv_entry(pmap, va, m)) {
 3434                 if (mpte != NULL) {
 3435                         free = NULL;
 3436                         if (pmap_unwire_pte_hold(pmap, va, mpte, &free)) {
 3437                                 pmap_invalidate_page(pmap, va);
 3438                                 pmap_free_zero_pages(free);
 3439                         }
 3440                         mpte = NULL;
 3441                 }
 3442                 return (mpte);
 3443         }
 3444 
 3445         /*
 3446          * Increment counters
 3447          */
 3448         pmap->pm_stats.resident_count++;
 3449 
 3450         pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
 3451         if ((prot & VM_PROT_EXECUTE) == 0)
 3452                 pa |= pg_nx;
 3453 
 3454         /*
 3455          * Now validate mapping with RO protection
               * NOTE(review): PG_U is set on both branches regardless of
               * whether va is below VM_MAXUSER_ADDRESS; confirm whether kernel
               * addresses are ever passed to this function.
 3456          */
 3457         if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
 3458                 pte_store(pte, pa | PG_V | PG_U);
 3459         else
 3460                 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
 3461         return mpte;
 3462 }
 3463 
 3464 /*
 3465  * Make a temporary mapping for a physical address.  This is only intended
 3466  * to be used for panic dumps.
 3467  */
 3468 void *
 3469 pmap_kenter_temporary(vm_paddr_t pa, int i)
 3470 {
 3471         vm_offset_t va;
 3472 
 3473         va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
 3474         pmap_kenter(va, pa);
 3475         invlpg(va);
 3476         return ((void *)crashdumpmap);
 3477 }
 3478 
 3479 /*
 3480  * This code maps large physical mmap regions into the
 3481  * processor address space.  Note that some shortcuts
 3482  * are taken, but the code works.
 3483  */
 3484 void
 3485 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
 3486     vm_pindex_t pindex, vm_size_t size)
 3487 {
 3488         pd_entry_t *pde;
 3489         vm_paddr_t pa, ptepa;
 3490         vm_page_t p, pdpg;
 3491         int pat_mode;
 3492 
 3493         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 3494         KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
 3495             ("pmap_object_init_pt: non-device object"));
 3496         if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
 3497                 if (!vm_object_populate(object, pindex, pindex + atop(size)))
 3498                         return;
 3499                 p = vm_page_lookup(object, pindex);
 3500                 KASSERT(p->valid == VM_PAGE_BITS_ALL,
 3501                     ("pmap_object_init_pt: invalid page %p", p));
 3502                 pat_mode = p->md.pat_mode;
 3503 
 3504                 /*
 3505                  * Abort the mapping if the first page is not physically
 3506                  * aligned to a 2MB page boundary.
 3507                  */
 3508                 ptepa = VM_PAGE_TO_PHYS(p);
 3509                 if (ptepa & (NBPDR - 1))
 3510                         return;
 3511 
 3512                 /*
 3513                  * Skip the first page.  Abort the mapping if the rest of
 3514                  * the pages are not physically contiguous or have differing
 3515                  * memory attributes.
 3516                  */
 3517                 p = TAILQ_NEXT(p, listq);
 3518                 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
 3519                     pa += PAGE_SIZE) {
 3520                         KASSERT(p->valid == VM_PAGE_BITS_ALL,
 3521                             ("pmap_object_init_pt: invalid page %p", p));
 3522                         if (pa != VM_PAGE_TO_PHYS(p) ||
 3523                             pat_mode != p->md.pat_mode)
 3524                                 return;
 3525                         p = TAILQ_NEXT(p, listq);
 3526                 }
 3527 
 3528                 /*
 3529                  * Map using 2MB pages.  Since "ptepa" is 2M aligned and
 3530                  * "size" is a multiple of 2M, adding the PAT setting to "pa"
 3531                  * will not affect the termination of this loop.
 3532                  */ 
 3533                 PMAP_LOCK(pmap);
 3534                 for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
 3535                     size; pa += NBPDR) {
 3536                         pdpg = pmap_allocpde(pmap, addr, M_NOWAIT);
 3537                         if (pdpg == NULL) {
 3538                                 /*
 3539                                  * The creation of mappings below is only an
 3540                                  * optimization.  If a page directory page
 3541                                  * cannot be allocated without blocking,
 3542                                  * continue on to the next mapping rather than
 3543                                  * blocking.
 3544                                  */
 3545                                 addr += NBPDR;
 3546                                 continue;
 3547                         }
 3548                         pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
 3549                         pde = &pde[pmap_pde_index(addr)];
 3550                         if ((*pde & PG_V) == 0) {
 3551                                 pde_store(pde, pa | PG_PS | PG_M | PG_A |
 3552                                     PG_U | PG_RW | PG_V);
 3553                                 pmap->pm_stats.resident_count += NBPDR /
 3554                                     PAGE_SIZE;
 3555                                 pmap_pde_mappings++;
 3556                         } else {
 3557                                 /* Continue on if the PDE is already valid. */
 3558                                 pdpg->wire_count--;
 3559                                 KASSERT(pdpg->wire_count > 0,
 3560                                     ("pmap_object_init_pt: missing reference "
 3561                                     "to page directory page, va: 0x%lx", addr));
 3562                         }
 3563                         addr += NBPDR;
 3564                 }
 3565                 PMAP_UNLOCK(pmap);
 3566         }
 3567 }
 3568 
 3569 /*
 3570  *      Routine:        pmap_change_wiring
 3571  *      Function:       Change the wiring attribute for a map/virtual-address
 3572  *                      pair.
 3573  *      In/out conditions:
 3574  *                      The mapping must already exist in the pmap.
       *      Notes:
       *                      If "va" lies inside a 2MB mapping whose PG_W state
       *                      differs from "wired", the mapping is demoted first
       *                      so that only the 4KB page at "va" is changed; a
       *                      failed demotion panics.  pm_stats.wired_count is
       *                      adjusted only when the PTE's PG_W bit actually
       *                      changes.
 3575  */
 3576 void
 3577 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
 3578 {
 3579         pd_entry_t *pde;
 3580         pt_entry_t *pte;
 3581         boolean_t are_queues_locked;
 3582 
 3583         are_queues_locked = FALSE;
 3584 
 3585         /*
 3586          * Wiring is not a hardware characteristic so there is no need to
 3587          * invalidate TLB.
 3588          */
 3589 retry:
 3590         PMAP_LOCK(pmap);
 3591         pde = pmap_pde(pmap, va);
              /*
               * NOTE(review): "pde" is dereferenced without a NULL or PG_V
               * check; this relies on the precondition that the mapping exists.
               */
 3592         if ((*pde & PG_PS) != 0) {
                      /* True when the requested state differs from the PDE's PG_W. */
 3593                 if (!wired != ((*pde & PG_W) == 0)) {
                              /*
                               * The page queues lock is taken only when a
                               * demotion is needed.  The pmap lock is already
                               * held, so only a trylock is attempted; on
                               * failure the pmap lock is dropped, the page
                               * queues lock is acquired the blocking way, and
                               * everything is re-evaluated from "retry".  The
                               * flag is set first so the retry pass knows the
                               * lock is held.
                               */
 3594                         if (!are_queues_locked) {
 3595                                 are_queues_locked = TRUE;
 3596                                 if (!mtx_trylock(&vm_page_queue_mtx)) {
 3597                                         PMAP_UNLOCK(pmap);
 3598                                         vm_page_lock_queues();
 3599                                         goto retry;
 3600                                 }
 3601                         }
 3602                         if (!pmap_demote_pde(pmap, pde, va))
 3603                                 panic("pmap_change_wiring: demotion failed");
 3604                 } else
                              /* 2MB mapping already has the requested state. */
 3605                         goto out;
 3606         }
 3607         pte = pmap_pde_to_pte(pde, va);
 3608         if (wired && (*pte & PG_W) == 0) {
 3609                 pmap->pm_stats.wired_count++;
 3610                 atomic_set_long(pte, PG_W);
 3611         } else if (!wired && (*pte & PG_W) != 0) {
 3612                 pmap->pm_stats.wired_count--;
 3613                 atomic_clear_long(pte, PG_W);
 3614         }
 3615 out:
 3616         if (are_queues_locked)
 3617                 vm_page_unlock_queues();
 3618         PMAP_UNLOCK(pmap);
 3619 }
 3620 
 3621 
 3622 
 3623 /*
 3624  *      Copy the range specified by src_addr/len
 3625  *      from the source map to the range dst_addr/len
 3626  *      in the destination map.
 3627  *
 3628  *      This routine is only advisory and need not do anything.
       *
       *      Nothing is copied unless dst_addr equals src_addr.  2MB
       *      mappings are copied with PG_W cleared; managed 4KB mappings
       *      are copied with PG_W, PG_M and PG_A cleared; unmanaged 4KB
       *      mappings are skipped.  Destination slots that are already
       *      in use are never overwritten.  The copy stops as soon as a
       *      page directory page, page table page or pv entry cannot be
       *      obtained.
 3629  */
 3630 
 3631 void
 3632 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
 3633     vm_offset_t src_addr)
 3634 {
 3635         vm_page_t   free;
 3636         vm_offset_t addr;
 3637         vm_offset_t end_addr = src_addr + len;
 3638         vm_offset_t va_next;
 3639 
 3640         if (dst_addr != src_addr)
 3641                 return;
 3642 
 3643         vm_page_lock_queues();
              /*
               * The two pmap locks are taken in order of the pmap addresses,
               * presumably so that concurrent copies in opposite directions
               * agree on a lock order.
               */
 3644         if (dst_pmap < src_pmap) {
 3645                 PMAP_LOCK(dst_pmap);
 3646                 PMAP_LOCK(src_pmap);
 3647         } else {
 3648                 PMAP_LOCK(src_pmap);
 3649                 PMAP_LOCK(dst_pmap);
 3650         }
 3651         for (addr = src_addr; addr < end_addr; addr = va_next) {
 3652                 pt_entry_t *src_pte, *dst_pte;
 3653                 vm_page_t dstmpde, dstmpte, srcmpte;
 3654                 pml4_entry_t *pml4e;
 3655                 pdp_entry_t *pdpe;
 3656                 pd_entry_t srcptepaddr, *pde;
 3657 
 3658                 KASSERT(addr < UPT_MIN_ADDRESS,
 3659                     ("pmap_copy: invalid to pmap_copy page tables"));
 3660 
                      /*
                       * Skip whole regions that have no valid upper-level
                       * entry in the source.  A computed "va_next" that is
                       * smaller than "addr" means the addition wrapped, so
                       * the walk is ended by jumping to end_addr.
                       */
 3661                 pml4e = pmap_pml4e(src_pmap, addr);
 3662                 if ((*pml4e & PG_V) == 0) {
 3663                         va_next = (addr + NBPML4) & ~PML4MASK;
 3664                         if (va_next < addr)
 3665                                 va_next = end_addr;
 3666                         continue;
 3667                 }
 3668 
 3669                 pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
 3670                 if ((*pdpe & PG_V) == 0) {
 3671                         va_next = (addr + NBPDP) & ~PDPMASK;
 3672                         if (va_next < addr)
 3673                                 va_next = end_addr;
 3674                         continue;
 3675                 }
 3676 
 3677                 va_next = (addr + NBPDR) & ~PDRMASK;
 3678                 if (va_next < addr)
 3679                         va_next = end_addr;
 3680 
 3681                 pde = pmap_pdpe_to_pde(pdpe, addr);
 3682                 srcptepaddr = *pde;
 3683                 if (srcptepaddr == 0)
 3684                         continue;
 3685                         
                      /*
                       * Source is a 2MB mapping: copy the PDE itself (minus
                       * PG_W) if the destination slot is empty and, for a
                       * managed mapping, a pv entry could be inserted.
                       * Otherwise drop the reference on the destination page
                       * directory page again.
                       */
 3686                 if (srcptepaddr & PG_PS) {
 3687                         dstmpde = pmap_allocpde(dst_pmap, addr, M_NOWAIT);
 3688                         if (dstmpde == NULL)
 3689                                 break;
 3690                         pde = (pd_entry_t *)
 3691                             PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
 3692                         pde = &pde[pmap_pde_index(addr)];
 3693                         if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
 3694                             pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
 3695                             PG_PS_FRAME))) {
 3696                                 *pde = srcptepaddr & ~PG_W;
 3697                                 dst_pmap->pm_stats.resident_count +=
 3698                                     NBPDR / PAGE_SIZE;
 3699                         } else
 3700                                 dstmpde->wire_count--;
 3701                         continue;
 3702                 }
 3703 
 3704                 srcptepaddr &= PG_FRAME;
 3705                 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
 3706                 KASSERT(srcmpte->wire_count > 0,
 3707                     ("pmap_copy: source page table page is unused"));
 3708 
                      /* Do not walk PTEs beyond the requested range. */
 3709                 if (va_next > end_addr)
 3710                         va_next = end_addr;
 3711 
 3712                 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
 3713                 src_pte = &src_pte[pmap_pte_index(addr)];
 3714                 dstmpte = NULL;
 3715                 while (addr < va_next) {
 3716                         pt_entry_t ptetemp;
 3717                         ptetemp = *src_pte;
 3718                         /*
 3719                          * we only virtual copy managed pages
 3720                          */
 3721                         if ((ptetemp & PG_MANAGED) != 0) {
                                      /*
                                       * Reuse the destination page table
                                       * page from the previous iteration
                                       * when it still covers "addr";
                                       * otherwise obtain it without
                                       * blocking.
                                       */
 3722                                 if (dstmpte != NULL &&
 3723                                     dstmpte->pindex == pmap_pde_pindex(addr))
 3724                                         dstmpte->wire_count++;
 3725                                 else if ((dstmpte = pmap_allocpte(dst_pmap,
 3726                                     addr, M_NOWAIT)) == NULL)
 3727                                         goto out;
 3728                                 dst_pte = (pt_entry_t *)
 3729                                     PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
 3730                                 dst_pte = &dst_pte[pmap_pte_index(addr)];
 3731                                 if (*dst_pte == 0 &&
 3732                                     pmap_try_insert_pv_entry(dst_pmap, addr,
 3733                                     PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
 3734                                         /*
 3735                                          * Clear the wired, modified, and
 3736                                          * accessed (referenced) bits
 3737                                          * during the copy.
 3738                                          */
 3739                                         *dst_pte = ptetemp & ~(PG_W | PG_M |
 3740                                             PG_A);
 3741                                         dst_pmap->pm_stats.resident_count++;
 3742                                 } else {
                                              /*
                                               * Slot occupied or no pv entry:
                                               * release the page table page
                                               * reference and abandon the
                                               * whole copy.
                                               */
 3743                                         free = NULL;
 3744                                         if (pmap_unwire_pte_hold(dst_pmap,
 3745                                             addr, dstmpte, &free)) {
 3746                                                 pmap_invalidate_page(dst_pmap,
 3747                                                     addr);
 3748                                                 pmap_free_zero_pages(free);
 3749                                         }
 3750                                         goto out;
 3751                                 }
                                      /*
                                       * Leave this page table page once the
                                       * destination holds at least as many
                                       * references as the source; presumably
                                       * no further source mappings remain to
                                       * be copied from it.
                                       */
 3752                                 if (dstmpte->wire_count >= srcmpte->wire_count)
 3753                                         break;
 3754                         }
 3755                         addr += PAGE_SIZE;
 3756                         src_pte++;
 3757                 }
 3758         }
 3759 out:
 3760         vm_page_unlock_queues();
 3761         PMAP_UNLOCK(src_pmap);
 3762         PMAP_UNLOCK(dst_pmap);
 3763 }       
 3764 
 3765 /*
 3766  *      pmap_zero_page zeros the specified hardware page by mapping 
 3767  *      the page into KVM and using bzero to clear its contents.
 3768  */
 3769 void
 3770 pmap_zero_page(vm_page_t m)
 3771 {
 3772         vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 3773 
 3774         pagezero((void *)va);
 3775 }
 3776 
 3777 /*
 3778  *      pmap_zero_page_area zeros the specified hardware page by mapping 
 3779  *      the page into KVM and using bzero to clear its contents.
 3780  *
 3781  *      off and size may not cover an area beyond a single hardware page.
 3782  */
 3783 void
 3784 pmap_zero_page_area(vm_page_t m, int off, int size)
 3785 {
 3786         vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 3787 
 3788         if (off == 0 && size == PAGE_SIZE)
 3789                 pagezero((void *)va);
 3790         else
 3791                 bzero((char *)va + off, size);
 3792 }
 3793 
 3794 /*
 3795  *      pmap_zero_page_idle zeros the specified hardware page by mapping 
 3796  *      the page into KVM and using bzero to clear its contents.  This
 3797  *      is intended to be called from the vm_pagezero process only and
 3798  *      outside of Giant.
 3799  */
 3800 void
 3801 pmap_zero_page_idle(vm_page_t m)
 3802 {
 3803         vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 3804 
 3805         pagezero((void *)va);
 3806 }
 3807 
 3808 /*
 3809  *      pmap_copy_page copies the specified (machine independent)
 3810  *      page by mapping the page into virtual memory and using
 3811  *      bcopy to copy the page, one machine dependent page at a
 3812  *      time.
 3813  */
 3814 void
 3815 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
 3816 {
 3817         vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
 3818         vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
 3819 
 3820         pagecopy((void *)src, (void *)dst);
 3821 }
 3822 
 3823 /*
 3824  * Returns true if the pmap's pv is one of the first
 3825  * 16 pvs linked to from this page.  This count may
 3826  * be changed upwards or downwards in the future; it
 3827  * is only necessary that true be returned for a small
 3828  * subset of pmaps for proper page aging.
 3829  */
 3830 boolean_t
 3831 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 3832 {
 3833         struct md_page *pvh;
 3834         pv_entry_t pv;
 3835         int loops = 0;
 3836 
 3837         if (m->flags & PG_FICTITIOUS)
 3838                 return FALSE;
 3839 
 3840         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 3841         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 3842                 if (PV_PMAP(pv) == pmap) {
 3843                         return TRUE;
 3844                 }
 3845                 loops++;
 3846                 if (loops >= 16)
 3847                         break;
 3848         }
 3849         if (loops < 16) {
 3850                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 3851                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
 3852                         if (PV_PMAP(pv) == pmap)
 3853                                 return (TRUE);
 3854                         loops++;
 3855                         if (loops >= 16)
 3856                                 break;
 3857                 }
 3858         }
 3859         return (FALSE);
 3860 }
 3861 
 3862 /*
 3863  *      pmap_page_wired_mappings:
 3864  *
 3865  *      Return the number of managed mappings to the given physical page
 3866  *      that are wired.
 3867  */
 3868 int
 3869 pmap_page_wired_mappings(vm_page_t m)
 3870 {
 3871         int count;
 3872 
 3873         count = 0;
 3874         if ((m->flags & PG_FICTITIOUS) != 0)
 3875                 return (count);
 3876         count = pmap_pvh_wired_mappings(&m->md, count);
 3877         return (pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count));
 3878 }
 3879 
 3880 /*
 3881  *      pmap_pvh_wired_mappings:
 3882  *
 3883  *      Return the updated number "count" of managed mappings that are wired.
 3884  */
 3885 static int
 3886 pmap_pvh_wired_mappings(struct md_page *pvh, int count)
 3887 {
 3888         pmap_t pmap;
 3889         pt_entry_t *pte;
 3890         pv_entry_t pv;
 3891 
 3892         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 3893         TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
 3894                 pmap = PV_PMAP(pv);
 3895                 PMAP_LOCK(pmap);
 3896                 pte = pmap_pte(pmap, pv->pv_va);
 3897                 if ((*pte & PG_W) != 0)
 3898                         count++;
 3899                 PMAP_UNLOCK(pmap);
 3900         }
 3901         return (count);
 3902 }
 3903 
 3904 /*
 3905  * Returns TRUE if the given page is mapped individually or as part of
 3906  * a 2mpage.  Otherwise, returns FALSE.
 3907  */
 3908 boolean_t
 3909 pmap_page_is_mapped(vm_page_t m)
 3910 {
 3911         struct md_page *pvh;
 3912 
 3913         if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
 3914                 return (FALSE);
 3915         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 3916         if (TAILQ_EMPTY(&m->md.pv_list)) {
 3917                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 3918                 return (!TAILQ_EMPTY(&pvh->pv_list));
 3919         } else
 3920                 return (TRUE);
 3921 }
 3922 
 3923 /*
 3924  * Remove all pages from specified address space
 3925  * this aids process exit speeds.  Also, this code
 3926  * is special cased for current process only, but
 3927  * can have the more generic (and slightly slower)
 3928  * mode enabled.  This is much faster than pmap_remove
 3929  * in the case of running down an entire address space.
       *
       * If "pmap" is not the pmap of the current thread's process, a
       * warning is printed and nothing is removed.
       *
       * The walk is driven by the pv chunks linked from pmap->pm_pvchunk
       * rather than by the virtual address space: every allocated pv entry
       * in every chunk is visited once.  Wired mappings (PG_W) are left in
       * place, and a chunk that still holds one is not freed.
       *
       * Both the page queues lock and the pmap lock are held for the whole
       * walk.  Pages collected on the local "free" list are handed to
       * pmap_free_zero_pages() only after both locks have been dropped.
 3930  */
 3931 void
 3932 pmap_remove_pages(pmap_t pmap)
 3933 {
 3934         pd_entry_t ptepde;
 3935         pt_entry_t *pte, tpte;
 3936         vm_page_t free = NULL;
 3937         vm_page_t m, mpte, mt;
 3938         pv_entry_t pv;
 3939         struct md_page *pvh;
 3940         struct pv_chunk *pc, *npc;
 3941         int field, idx;
 3942         int64_t bit;
 3943         uint64_t inuse, bitmask;
 3944         int allfree;
 3945 
 3946         if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
 3947                 printf("warning: pmap_remove_pages called with non-current pmap\n");
 3948                 return;
 3949         }
 3950         vm_page_lock_queues();
 3951         PMAP_LOCK(pmap);
 3952         TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 3953                 allfree = 1;
 3954                 for (field = 0; field < _NPCM; field++) {
                              /*
                               * A clear bit in pc_map (within pc_freemask)
                               * marks an allocated pv entry; setting the bit
                               * below marks the entry free again.
                               */
 3955                         inuse = (~(pc->pc_map[field])) & pc_freemask[field];
 3956                         while (inuse != 0) {
 3957                                 bit = bsfq(inuse);
 3958                                 bitmask = 1UL << bit;
 3959                                 idx = field * 64 + bit;
 3960                                 pv = &pc->pc_pventry[idx];
 3961                                 inuse &= ~bitmask;
 3962 
                                      /*
                                       * Walk from the PDP entry to the PD
                                       * entry.  If the PDE is valid and is
                                       * not a 2MB mapping (PG_PS clear),
                                       * continue to the PTE through the
                                       * direct map; "ptepde" then holds the
                                       * PDE instead of the PDP entry.
                                       *
                                       * NOTE(review): PG_PTE_PAT is masked
                                       * out of the PTE copy, presumably so
                                       * the PG_PS tests below cannot match
                                       * a 4KB mapping -- confirm.
                                       */
 3963                                 pte = pmap_pdpe(pmap, pv->pv_va);
 3964                                 ptepde = *pte;
 3965                                 pte = pmap_pdpe_to_pde(pte, pv->pv_va);
 3966                                 tpte = *pte;
 3967                                 if ((tpte & (PG_PS | PG_V)) == PG_V) {
 3968                                         ptepde = tpte;
 3969                                         pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
 3970                                             PG_FRAME);
 3971                                         pte = &pte[pmap_pte_index(pv->pv_va)];
 3972                                         tpte = *pte & ~PG_PTE_PAT;
 3973                                 }
 3974                                 if ((tpte & PG_V) == 0)
 3975                                         panic("bad pte");
 3976 
 3977 /*
 3978  * We cannot remove wired pages from a process' mapping at this time
 3979  */
 3980                                 if (tpte & PG_W) {
 3981                                         allfree = 0;
 3982                                         continue;
 3983                                 }
 3984 
 3985                                 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
 3986                                 KASSERT(m->phys_addr == (tpte & PG_FRAME),
 3987                                     ("vm_page_t %p phys_addr mismatch %016jx %016jx",
 3988                                     m, (uintmax_t)m->phys_addr,
 3989                                     (uintmax_t)tpte));
 3990 
 3991                                 KASSERT(m < &vm_page_array[vm_page_array_size],
 3992                                         ("pmap_remove_pages: bad tpte %#jx",
 3993                                         (uintmax_t)tpte));
 3994 
 3995                                 pte_clear(pte);
 3996 
 3997                                 /*
 3998                                  * Update the vm_page_t clean/reference bits.
 3999                                  */
 4000                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 4001                                         if ((tpte & PG_PS) != 0) {
 4002                                                 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 4003                                                         vm_page_dirty(mt);
 4004                                         } else
 4005                                                 vm_page_dirty(m);
 4006                                 }
 4007 
 4008                                 /* Mark free */
 4009                                 PV_STAT(pv_entry_frees++);
 4010                                 PV_STAT(pv_entry_spare++);
 4011                                 pv_entry_count--;
 4012                                 pc->pc_map[field] |= bitmask;
                                      /*
                                       * Unlink the pv entry from the 2mpage
                                       * list or from the page's own list.
                                       * PG_WRITEABLE is cleared on a page
                                       * once neither list holds a mapping
                                       * of it.
                                       */
 4013                                 if ((tpte & PG_PS) != 0) {
 4014                                         pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
 4015                                         pvh = pa_to_pvh(tpte & PG_PS_FRAME);
 4016                                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
 4017                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
 4018                                                 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 4019                                                         if (TAILQ_EMPTY(&mt->md.pv_list))
 4020                                                                 vm_page_flag_clear(mt, PG_WRITEABLE);
 4021                                         }
                                              /*
                                               * A page table page may have
                                               * been saved for this 2MB
                                               * mapping; if so, release it.
                                               */
 4022                                         mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
 4023                                         if (mpte != NULL) {
 4024                                                 pmap_remove_pt_page(pmap, mpte);
 4025                                                 pmap->pm_stats.resident_count--;
 4026                                                 KASSERT(mpte->wire_count == NPTEPG,
 4027                                                     ("pmap_remove_pages: pte page wire count error"));
 4028                                                 mpte->wire_count = 0;
 4029                                                 pmap_add_delayed_free_list(mpte, &free, FALSE);
 4030                                                 atomic_subtract_int(&cnt.v_wire_count, 1);
 4031                                         }
 4032                                 } else {
 4033                                         pmap->pm_stats.resident_count--;
 4034                                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 4035                                         if (TAILQ_EMPTY(&m->md.pv_list)) {
 4036                                                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 4037                                                 if (TAILQ_EMPTY(&pvh->pv_list))
 4038                                                         vm_page_flag_clear(m, PG_WRITEABLE);
 4039                                         }
 4040                                 }
 4041                                 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
 4042                         }
 4043                 }
                      /*
                       * No wired mapping was skipped in this chunk, so every
                       * pv entry in it is now free: unlink the chunk and
                       * release the page that backs it.
                       */
 4044                 if (allfree) {
 4045                         PV_STAT(pv_entry_spare -= _NPCPV);
 4046                         PV_STAT(pc_chunk_count--);
 4047                         PV_STAT(pc_chunk_frees++);
 4048                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 4049                         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 4050                         dump_drop_page(m->phys_addr);
 4051                         vm_page_unwire(m, 0);
 4052                         vm_page_free(m);
 4053                 }
 4054         }
              /*
               * One pmap_invalidate_all() is issued after the walk instead
               * of an invalidation per removed mapping.
               */
 4055         pmap_invalidate_all(pmap);
 4056         vm_page_unlock_queues();
 4057         PMAP_UNLOCK(pmap);
 4058         pmap_free_zero_pages(free);
 4059 }
 4060 
 4061 /*
 4062  *      pmap_is_modified:
 4063  *
 4064  *      Return whether or not the specified physical page was modified
 4065  *      in any physical maps.
 4066  */
 4067 boolean_t
 4068 pmap_is_modified(vm_page_t m)
 4069 {
 4070 
 4071         if (m->flags & PG_FICTITIOUS)
 4072                 return (FALSE);
 4073         if (pmap_is_modified_pvh(&m->md))
 4074                 return (TRUE);
 4075         return (pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
 4076 }
 4077 
 4078 /*
 4079  * Returns TRUE if any of the given mappings were used to modify
 4080  * physical memory.  Otherwise, returns FALSE.  Both page and 2mpage
 4081  * mappings are supported.
 4082  */
 4083 static boolean_t
 4084 pmap_is_modified_pvh(struct md_page *pvh)
 4085 {
 4086         pv_entry_t pv;
 4087         pt_entry_t *pte;
 4088         pmap_t pmap;
 4089         boolean_t rv;
 4090 
 4091         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 4092         rv = FALSE;
 4093         TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
 4094                 pmap = PV_PMAP(pv);
 4095                 PMAP_LOCK(pmap);
 4096                 pte = pmap_pte(pmap, pv->pv_va);
 4097                 rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
 4098                 PMAP_UNLOCK(pmap);
 4099                 if (rv)
 4100                         break;
 4101         }
 4102         return (rv);
 4103 }
 4104 
 4105 /*
 4106  *      pmap_is_prefaultable:
 4107  *
 4108  *      Return whether or not the specified virtual address is elgible
 4109  *      for prefault.
 4110  */
 4111 boolean_t
 4112 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 4113 {
 4114         pd_entry_t *pde;
 4115         pt_entry_t *pte;
 4116         boolean_t rv;
 4117 
 4118         rv = FALSE;
 4119         PMAP_LOCK(pmap);
 4120         pde = pmap_pde(pmap, addr);
 4121         if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
 4122                 pte = pmap_pde_to_pte(pde, addr);
 4123                 rv = (*pte & PG_V) == 0;
 4124         }
 4125         PMAP_UNLOCK(pmap);
 4126         return (rv);
 4127 }
 4128 
 4129 /*
 4130  * Clear the write and modified bits in each of the given page's mappings.
 4131  */
 4132 void
 4133 pmap_remove_write(vm_page_t m)
 4134 {
 4135         struct md_page *pvh;
 4136         pmap_t pmap;
 4137         pv_entry_t next_pv, pv;
 4138         pd_entry_t *pde;
 4139         pt_entry_t oldpte, *pte;
 4140         vm_offset_t va;
 4141 
 4142         if ((m->flags & PG_FICTITIOUS) != 0 ||
 4143             (m->flags & PG_WRITEABLE) == 0)
 4144                 return;
 4145         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 4146         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 4147         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
 4148                 va = pv->pv_va;
 4149                 pmap = PV_PMAP(pv);
 4150                 PMAP_LOCK(pmap);
 4151                 pde = pmap_pde(pmap, va);
 4152                 if ((*pde & PG_RW) != 0)
 4153                         (void)pmap_demote_pde(pmap, pde, va);
 4154                 PMAP_UNLOCK(pmap);
 4155         }
 4156         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 4157                 pmap = PV_PMAP(pv);
 4158                 PMAP_LOCK(pmap);
 4159                 pde = pmap_pde(pmap, pv->pv_va);
 4160                 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found"
 4161                     " a 2mpage in page %p's pv list", m));
 4162                 pte = pmap_pde_to_pte(pde, pv->pv_va);
 4163 retry:
 4164                 oldpte = *pte;
 4165                 if (oldpte & PG_RW) {
 4166                         if (!atomic_cmpset_long(pte, oldpte, oldpte &
 4167                             ~(PG_RW | PG_M)))
 4168                                 goto retry;
 4169                         if ((oldpte & PG_M) != 0)
 4170                                 vm_page_dirty(m);
 4171                         pmap_invalidate_page(pmap, pv->pv_va);
 4172                 }
 4173                 PMAP_UNLOCK(pmap);
 4174         }
 4175         vm_page_flag_clear(m, PG_WRITEABLE);
 4176 }
 4177 
 4178 /*
 4179  *      pmap_ts_referenced:
 4180  *
 4181  *      Return a count of reference bits for a page, clearing those bits.
 4182  *      It is not necessary for every reference bit to be cleared, but it
 4183  *      is necessary that 0 only be returned when there are truly no
 4184  *      reference bits set.
 4185  *
 4186  *      XXX: The exact number of bits to check and clear is a matter that
 4187  *      should be tested and standardized at some point in the future for
 4188  *      optimal aging of shared pages.
       *
       *      Fictitious pages always yield 0.  The 2mpage mappings that
       *      include the page are processed first, then the mappings on the
       *      page's own pv list.  Either phase stops once the count exceeds
       *      4.  The page queues lock must be held by the caller.
 4189  */
 4190 int
 4191 pmap_ts_referenced(vm_page_t m)
 4192 {
 4193         struct md_page *pvh;
 4194         pv_entry_t pv, pvf, pvn;
 4195         pmap_t pmap;
 4196         pd_entry_t oldpde, *pde;
 4197         pt_entry_t *pte;
 4198         vm_offset_t va;
 4199         int rtval = 0;
 4200 
 4201         if (m->flags & PG_FICTITIOUS)
 4202                 return (rtval);
 4203         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 4204         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
              /*
               * Phase 1: 2mpage mappings.  A referenced (PG_A) mapping is
               * demoted; a reference is counted only when the demotion
               * succeeds and the old mapping was not wired (PG_W).
               */
 4205         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
 4206                 va = pv->pv_va;
 4207                 pmap = PV_PMAP(pv);
 4208                 PMAP_LOCK(pmap);
 4209                 pde = pmap_pde(pmap, va);
 4210                 oldpde = *pde;
 4211                 if ((oldpde & PG_A) != 0) {
 4212                         if (pmap_demote_pde(pmap, pde, va)) {
 4213                                 if ((oldpde & PG_W) == 0) {
 4214                                         /*
 4215                                          * Remove the mapping to a single page
 4216                                          * so that a subsequent access may
 4217                                          * repromote.  Since the underlying
 4218                                          * page table page is fully populated,
 4219                                          * this removal never frees a page
 4220                                          * table page.
 4221                                          */
                                              /*
                                               * Advance "va" by the page's
                                               * offset within the 2MB frame
                                               * so that it names the 4KB
                                               * mapping of "m" itself.
                                               */
 4222                                         va += VM_PAGE_TO_PHYS(m) - (oldpde &
 4223                                             PG_PS_FRAME);
 4224                                         pmap_remove_page(pmap, va, pde, NULL);
 4225                                         rtval++;
 4226                                         if (rtval > 4) {
 4227                                                 PMAP_UNLOCK(pmap);
 4228                                                 return (rtval);
 4229                                         }
 4230                                 }
 4231                         }
 4232                 }
 4233                 PMAP_UNLOCK(pmap);
 4234         }
              /*
               * Phase 2: the page's own mappings.  Each visited pv entry is
               * moved to the tail of the list before it is examined.  The
               * loop ends when the next entry is the first one visited
               * ("pvf"), when the list is exhausted, or when "pvn" is set
               * to NULL because the count exceeded 4.
               */
 4235         if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 4236                 pvf = pv;
 4237                 do {
 4238                         pvn = TAILQ_NEXT(pv, pv_list);
 4239                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 4240                         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 4241                         pmap = PV_PMAP(pv);
 4242                         PMAP_LOCK(pmap);
 4243                         pde = pmap_pde(pmap, pv->pv_va);
 4244                         KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:"
 4245                             " found a 2mpage in page %p's pv list", m));
 4246                         pte = pmap_pde_to_pte(pde, pv->pv_va);
 4247                         if ((*pte & PG_A) != 0) {
 4248                                 atomic_clear_long(pte, PG_A);
 4249                                 pmap_invalidate_page(pmap, pv->pv_va);
 4250                                 rtval++;
 4251                                 if (rtval > 4)
 4252                                         pvn = NULL;
 4253                         }
 4254                         PMAP_UNLOCK(pmap);
 4255                 } while ((pv = pvn) != NULL && pv != pvf);
 4256         }
 4257         return (rtval);
 4258 }
 4259 
 4260 /*
 4261  *      Clear the modify bits on the specified physical page.
 4262  */
 4263 void
 4264 pmap_clear_modify(vm_page_t m)
 4265 {
 4266         struct md_page *pvh;
 4267         pmap_t pmap;
 4268         pv_entry_t next_pv, pv;
 4269         pd_entry_t oldpde, *pde;
 4270         pt_entry_t oldpte, *pte;
 4271         vm_offset_t va;
 4272 
 4273         if ((m->flags & PG_FICTITIOUS) != 0)
 4274                 return;
 4275         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 4276         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 4277         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
 4278                 va = pv->pv_va;
 4279                 pmap = PV_PMAP(pv);
 4280                 PMAP_LOCK(pmap);
 4281                 pde = pmap_pde(pmap, va);
 4282                 oldpde = *pde;
 4283                 if ((oldpde & PG_RW) != 0) {
 4284                         if (pmap_demote_pde(pmap, pde, va)) {
 4285                                 if ((oldpde & PG_W) == 0) {
 4286                                         /*
 4287                                          * Write protect the mapping to a
 4288                                          * single page so that a subsequent
 4289                                          * write access may repromote.
 4290                                          */
 4291                                         va += VM_PAGE_TO_PHYS(m) - (oldpde &
 4292                                             PG_PS_FRAME);
 4293                                         pte = pmap_pde_to_pte(pde, va);
 4294                                         oldpte = *pte;
 4295                                         if ((oldpte & PG_V) != 0) {
 4296                                                 while (!atomic_cmpset_long(pte,
 4297                                                     oldpte,
 4298                                                     oldpte & ~(PG_M | PG_RW)))
 4299                                                         oldpte = *pte;
 4300                                                 vm_page_dirty(m);
 4301                                                 pmap_invalidate_page(pmap, va);
 4302                                         }
 4303                                 }
 4304                         }
 4305                 }
 4306                 PMAP_UNLOCK(pmap);
 4307         }
 4308         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 4309                 pmap = PV_PMAP(pv);
 4310                 PMAP_LOCK(pmap);
 4311                 pde = pmap_pde(pmap, pv->pv_va);
 4312                 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
 4313                     " a 2mpage in page %p's pv list", m));
 4314                 pte = pmap_pde_to_pte(pde, pv->pv_va);
 4315                 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 4316                         atomic_clear_long(pte, PG_M);
 4317                         pmap_invalidate_page(pmap, pv->pv_va);
 4318                 }
 4319                 PMAP_UNLOCK(pmap);
 4320         }
 4321 }
 4322 
 4323 /*
 4324  *      pmap_clear_reference:
 4325  *
 4326  *      Clear the reference bit on the specified physical page.
 4327  */
 4328 void
 4329 pmap_clear_reference(vm_page_t m)
 4330 {
 4331         struct md_page *pvh;
 4332         pmap_t pmap;
 4333         pv_entry_t next_pv, pv;
 4334         pd_entry_t oldpde, *pde;
 4335         pt_entry_t *pte;
 4336         vm_offset_t va;
 4337 
 4338         if ((m->flags & PG_FICTITIOUS) != 0)
 4339                 return;
 4340         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 4341         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 4342         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
 4343                 va = pv->pv_va;
 4344                 pmap = PV_PMAP(pv);
 4345                 PMAP_LOCK(pmap);
 4346                 pde = pmap_pde(pmap, va);
 4347                 oldpde = *pde;
 4348                 if ((oldpde & PG_A) != 0) {
 4349                         if (pmap_demote_pde(pmap, pde, va)) {
 4350                                 /*
 4351                                  * Remove the mapping to a single page so
 4352                                  * that a subsequent access may repromote.
 4353                                  * Since the underlying page table page is
 4354                                  * fully populated, this removal never frees
 4355                                  * a page table page.
 4356                                  */
 4357                                 va += VM_PAGE_TO_PHYS(m) - (oldpde &
 4358                                     PG_PS_FRAME);
 4359                                 pmap_remove_page(pmap, va, pde, NULL);
 4360                         }
 4361                 }
 4362                 PMAP_UNLOCK(pmap);
 4363         }
 4364         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 4365                 pmap = PV_PMAP(pv);
 4366                 PMAP_LOCK(pmap);
 4367                 pde = pmap_pde(pmap, pv->pv_va);
 4368                 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
 4369                     " a 2mpage in page %p's pv list", m));
 4370                 pte = pmap_pde_to_pte(pde, pv->pv_va);
 4371                 if (*pte & PG_A) {
 4372                         atomic_clear_long(pte, PG_A);
 4373                         pmap_invalidate_page(pmap, pv->pv_va);
 4374                 }
 4375                 PMAP_UNLOCK(pmap);
 4376         }
 4377 }
 4378 
 4379 /*
 4380  * Miscellaneous support routines follow
 4381  */
 4382 
 4383 /* Adjust the cache mode for a 4KB page mapped via a PTE. */
 4384 static __inline void
 4385 pmap_pte_attr(pt_entry_t *pte, int cache_bits)
 4386 {
 4387         u_int opte, npte;
 4388 
 4389         /*
 4390          * The cache mode bits are all in the low 32-bits of the
 4391          * PTE, so we can just spin on updating the low 32-bits.
 4392          */
 4393         do {
 4394                 opte = *(u_int *)pte;
 4395                 npte = opte & ~PG_PTE_CACHE;
 4396                 npte |= cache_bits;
 4397         } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
 4398 }
 4399 
 4400 /* Adjust the cache mode for a 2MB page mapped via a PDE. */
 4401 static __inline void
 4402 pmap_pde_attr(pd_entry_t *pde, int cache_bits)
 4403 {
 4404         u_int opde, npde;
 4405 
 4406         /*
 4407          * The cache mode bits are all in the low 32-bits of the
 4408          * PDE, so we can just spin on updating the low 32-bits.
 4409          */
 4410         do {
 4411                 opde = *(u_int *)pde;
 4412                 npde = opde & ~PG_PDE_CACHE;
 4413                 npde |= cache_bits;
 4414         } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
 4415 }
 4416 
 4417 /*
 4418  * Map a set of physical memory pages into the kernel virtual
 4419  * address space. Return a pointer to where it is mapped. This
 4420  * routine is intended to be used for mapping device memory,
 4421  * NOT real memory.
 4422  */
 4423 void *
 4424 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
 4425 {
 4426         vm_offset_t va, offset;
 4427         vm_size_t tmpsize;
 4428 
 4429         /*
 4430          * If the specified range of physical addresses fits within the direct
 4431          * map window, use the direct map. 
 4432          */
 4433         if (pa < dmaplimit && pa + size < dmaplimit) {
 4434                 va = PHYS_TO_DMAP(pa);
 4435                 if (!pmap_change_attr(va, size, mode))
 4436                         return ((void *)va);
 4437         }
 4438         offset = pa & PAGE_MASK;
 4439         size = roundup(offset + size, PAGE_SIZE);
 4440         va = kmem_alloc_nofault(kernel_map, size);
 4441         if (!va)
 4442                 panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
 4443         pa = trunc_page(pa);
 4444         for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
 4445                 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
 4446         pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
 4447         pmap_invalidate_cache_range(va, va + tmpsize);
 4448         return ((void *)(va + offset));
 4449 }
 4450 
 4451 void *
 4452 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
 4453 {
 4454 
 4455         return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
 4456 }
 4457 
 4458 void *
 4459 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
 4460 {
 4461 
 4462         return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
 4463 }
 4464 
 4465 void
 4466 pmap_unmapdev(vm_offset_t va, vm_size_t size)
 4467 {
 4468         vm_offset_t base, offset, tmpva;
 4469 
 4470         /* If we gave a direct map region in pmap_mapdev, do nothing */
 4471         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
 4472                 return;
 4473         base = trunc_page(va);
 4474         offset = va & PAGE_MASK;
 4475         size = roundup(offset + size, PAGE_SIZE);
 4476         for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
 4477                 pmap_kremove(tmpva);
 4478         pmap_invalidate_range(kernel_pmap, va, tmpva);
 4479         kmem_free(kernel_map, base, size);
 4480 }
 4481 
 4482 /*
 4483  * Tries to demote a 1GB page mapping.
 4484  */
 4485 static boolean_t
 4486 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
 4487 {
 4488         pdp_entry_t newpdpe, oldpdpe;
 4489         pd_entry_t *firstpde, newpde, *pde;
 4490         vm_paddr_t mpdepa;
 4491         vm_page_t mpde;
 4492 
 4493         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4494         oldpdpe = *pdpe;
 4495         KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
 4496             ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
 4497         if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
 4498             VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 4499                 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
 4500                     " in pmap %p", va, pmap);
 4501                 return (FALSE);
 4502         }
 4503         mpdepa = VM_PAGE_TO_PHYS(mpde);
 4504         firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa);
 4505         newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
 4506         KASSERT((oldpdpe & PG_A) != 0,
 4507             ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
 4508         KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
 4509             ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
 4510         newpde = oldpdpe;
 4511 
 4512         /*
 4513          * Initialize the page directory page.
 4514          */
 4515         for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
 4516                 *pde = newpde;
 4517                 newpde += NBPDR;
 4518         }
 4519 
 4520         /*
 4521          * Demote the mapping.
 4522          */
 4523         *pdpe = newpdpe;
 4524 
 4525         /*
 4526          * Invalidate a stale recursive mapping of the page directory page.
 4527          */
 4528         pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
 4529 
 4530         pmap_pdpe_demotions++;
 4531         CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
 4532             " in pmap %p", va, pmap);
 4533         return (TRUE);
 4534 }
 4535 
 4536 /*
 4537  * Sets the memory attribute for the specified page.
 4538  */
 4539 void
 4540 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
 4541 {
 4542 
 4543         m->md.pat_mode = ma;
 4544 
 4545         /*
 4546          * If "m" is a normal page, update its direct mapping.  This update
 4547          * can be relied upon to perform any cache operations that are
 4548          * required for data coherence.
 4549          */
 4550         if ((m->flags & PG_FICTITIOUS) == 0 &&
 4551             pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
 4552             m->md.pat_mode))
 4553                 panic("memory attribute change on the direct map failed");
 4554 }
 4555 
 4556 /*
 4557  * Changes the specified virtual address range's memory type to that given by
 4558  * the parameter "mode".  The specified virtual address range must be
 4559  * completely contained within either the direct map or the kernel map.  If
 4560  * the virtual address range is contained within the kernel map, then the
 4561  * memory type for each of the corresponding ranges of the direct map is also
 4562  * changed.  (The corresponding ranges of the direct map are those ranges that
 4563  * map the same physical pages as the specified virtual address range.)  These
 4564  * changes to the direct map are necessary because Intel describes the
 4565  * behavior of their processors as "undefined" if two or more mappings to the
 4566  * same physical page have different memory types.
 4567  *
 4568  * Returns zero if the change completed successfully, and either EINVAL or
 4569  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
 4570  * of the virtual address range was not mapped, and ENOMEM is returned if
 4571  * there was insufficient memory available to complete the change.  In the
 4572  * latter case, the memory type may have been changed on some part of the
 4573  * virtual address range or the direct map.
 4574  */
 4575 int
 4576 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
 4577 {
 4578         int error;
 4579 
 4580         PMAP_LOCK(kernel_pmap);
 4581         error = pmap_change_attr_locked(va, size, mode);
 4582         PMAP_UNLOCK(kernel_pmap);
 4583         return (error);
 4584 }
 4585 
      /*
       * Does the work of pmap_change_attr() with the kernel pmap lock already
       * held by the caller (this is asserted on entry).
       *
       * The range [va, va + size) is first widened to whole pages.  The function
       * then makes two passes over the kernel page tables:
       *
       *   1. A validation pass that fails with EINVAL if any part of the range
       *      is unmapped, and that demotes 1GB and 2MB mappings which do not
       *      already have the requested memory type and are only partially
       *      covered by the range.  ENOMEM is returned if a demotion fails.
       *      No memory type is changed during this pass.
       *
       *   2. An update pass that rewrites the cache bits of every entry whose
       *      memory type differs from the requested one.  For addresses at or
       *      above VM_MIN_KERNEL_ADDRESS it also accumulates runs of physically
       *      contiguous pages and calls itself recursively on the direct map
       *      address of each run.
       *
       * If any entry was modified, the TLB and the CPU caches are flushed for
       * the processed part of the range before returning.
       *
       * Returns 0 on success, or EINVAL / ENOMEM as described above (including
       * an error propagated from a recursive call).
       */
 4586 static int
 4587 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
 4588 {
 4589         vm_offset_t base, offset, tmpva;
 4590         vm_paddr_t pa_start, pa_end;
 4591         pdp_entry_t *pdpe;
 4592         pd_entry_t *pde;
 4593         pt_entry_t *pte;
 4594         int cache_bits_pte, cache_bits_pde, error;
 4595         boolean_t changed;
 4596 
 4597         PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
              /*
               * Widen the request to whole pages: "base" is va rounded down to a
               * page boundary and "size" is grown so that base + size covers the
               * original end of the range.
               */
 4598         base = trunc_page(va);
 4599         offset = va & PAGE_MASK;
 4600         size = roundup(offset + size, PAGE_SIZE);
 4601 
 4602         /*
 4603          * Only supported on kernel virtual addresses, including the direct
 4604          * map but excluding the recursive map.
 4605          */
 4606         if (base < DMAP_MIN_ADDRESS)
 4607                 return (EINVAL);
 4608 
              /*
               * cache_bits_pde is compared against and applied to PG_PS (1GB and
               * 2MB) entries below; cache_bits_pte is used for 4KB entries.
               * NOTE(review): the second argument of pmap_cache_bits() presumably
               * selects the large-page encoding -- confirm against its definition.
               */
 4609         cache_bits_pde = pmap_cache_bits(mode, 1);
 4610         cache_bits_pte = pmap_cache_bits(mode, 0);
 4611         changed = FALSE;
 4612 
 4613         /*
 4614          * Pages that aren't mapped aren't supported.  Also break down 2MB pages
 4615          * into 4KB pages if required.
 4616          */
 4617         for (tmpva = base; tmpva < base + size; ) {
 4618                 pdpe = pmap_pdpe(kernel_pmap, tmpva);
 4619                 if (*pdpe == 0)
 4620                         return (EINVAL);
 4621                 if (*pdpe & PG_PS) {
 4622                         /*
 4623                          * If the current 1GB page already has the required
 4624                          * memory type, then we need not demote this page. Just
 4625                          * increment tmpva to the next 1GB page frame.
 4626                          */
 4627                         if ((*pdpe & PG_PDE_CACHE) == cache_bits_pde) {
 4628                                 tmpva = trunc_1gpage(tmpva) + NBPDP;
 4629                                 continue;
 4630                         }
 4631 
 4632                         /*
 4633                          * If the current offset aligns with a 1GB page frame
 4634                          * and there is at least 1GB left within the range, then
 4635                          * we need not break down this page into 2MB pages.
 4636                          */
 4637                         if ((tmpva & PDPMASK) == 0 &&
 4638                             tmpva + PDPMASK < base + size) {
 4639                                 tmpva += NBPDP;
 4640                                 continue;
 4641                         }
                              /*
                               * The range covers only part of this 1GB page, so
                               * it must be demoted before its type can change.
                               */
 4642                         if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
 4643                                 return (ENOMEM);
 4644                 }
 4645                 pde = pmap_pdpe_to_pde(pdpe, tmpva);
 4646                 if (*pde == 0)
 4647                         return (EINVAL);
 4648                 if (*pde & PG_PS) {
 4649                         /*
 4650                          * If the current 2MB page already has the required
 4651                          * memory type, then we need not demote this page. Just
 4652                          * increment tmpva to the next 2MB page frame.
 4653                          */
 4654                         if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
 4655                                 tmpva = trunc_2mpage(tmpva) + NBPDR;
 4656                                 continue;
 4657                         }
 4658 
 4659                         /*
 4660                          * If the current offset aligns with a 2MB page frame
 4661                          * and there is at least 2MB left within the range, then
 4662                          * we need not break down this page into 4KB pages.
 4663                          */
 4664                         if ((tmpva & PDRMASK) == 0 &&
 4665                             tmpva + PDRMASK < base + size) {
 4666                                 tmpva += NBPDR;
 4667                                 continue;
 4668                         }
                              /*
                               * The range covers only part of this 2MB page, so
                               * it must be demoted before its type can change.
                               */
 4669                         if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
 4670                                 return (ENOMEM);
 4671                 }
 4672                 pte = pmap_pde_to_pte(pde, tmpva);
 4673                 if (*pte == 0)
 4674                         return (EINVAL);
 4675                 tmpva += PAGE_SIZE;
 4676         }
 4677         error = 0;
 4678 
 4679         /*
 4680          * Ok, all the pages exist, so run through them updating their
 4681          * cache mode if required.
 4682          */
              /*
               * [pa_start, pa_end) is the run of physically contiguous pages seen
               * so far at addresses >= VM_MIN_KERNEL_ADDRESS; pa_start == pa_end
               * means that no run is in progress.  Each of the three cases below
               * either starts a run, extends it when the next frame is
               * contiguous, or ends it by applying "mode" to the run's direct map
               * range through a recursive call and then starts a new run.
               * NOTE(review): the recursion presumably terminates after one level
               * because direct map addresses lie below VM_MIN_KERNEL_ADDRESS --
               * confirm against the address space layout.
               * NOTE(review): on the "break" error paths tmpva has not yet been
               * advanced past the entry that was just examined (and possibly
               * modified), so the flush at the end of the function may not cover
               * that entry -- confirm whether this matters.
               */
 4683         pa_start = pa_end = 0;
 4684         for (tmpva = base; tmpva < base + size; ) {
 4685                 pdpe = pmap_pdpe(kernel_pmap, tmpva);
 4686                 if (*pdpe & PG_PS) {
                              /* 1GB mapping: pmap_pde_attr() is also used here. */
 4687                         if ((*pdpe & PG_PDE_CACHE) != cache_bits_pde) {
 4688                                 pmap_pde_attr(pdpe, cache_bits_pde);
 4689                                 changed = TRUE;
 4690                         }
 4691                         if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
 4692                                 if (pa_start == pa_end) {
 4693                                         /* Start physical address run. */
 4694                                         pa_start = *pdpe & PG_PS_FRAME;
 4695                                         pa_end = pa_start + NBPDP;
 4696                                 } else if (pa_end == (*pdpe & PG_PS_FRAME))
 4697                                         pa_end += NBPDP;
 4698                                 else {
 4699                                         /* Run ended, update direct map. */
 4700                                         error = pmap_change_attr_locked(
 4701                                             PHYS_TO_DMAP(pa_start),
 4702                                             pa_end - pa_start, mode);
 4703                                         if (error != 0)
 4704                                                 break;
 4705                                         /* Start physical address run. */
 4706                                         pa_start = *pdpe & PG_PS_FRAME;
 4707                                         pa_end = pa_start + NBPDP;
 4708                                 }
 4709                         }
 4710                         tmpva = trunc_1gpage(tmpva) + NBPDP;
 4711                         continue;
 4712                 }
 4713                 pde = pmap_pdpe_to_pde(pdpe, tmpva);
 4714                 if (*pde & PG_PS) {
                              /* 2MB mapping. */
 4715                         if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
 4716                                 pmap_pde_attr(pde, cache_bits_pde);
 4717                                 changed = TRUE;
 4718                         }
 4719                         if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
 4720                                 if (pa_start == pa_end) {
 4721                                         /* Start physical address run. */
 4722                                         pa_start = *pde & PG_PS_FRAME;
 4723                                         pa_end = pa_start + NBPDR;
 4724                                 } else if (pa_end == (*pde & PG_PS_FRAME))
 4725                                         pa_end += NBPDR;
 4726                                 else {
 4727                                         /* Run ended, update direct map. */
 4728                                         error = pmap_change_attr_locked(
 4729                                             PHYS_TO_DMAP(pa_start),
 4730                                             pa_end - pa_start, mode);
 4731                                         if (error != 0)
 4732                                                 break;
 4733                                         /* Start physical address run. */
 4734                                         pa_start = *pde & PG_PS_FRAME;
 4735                                         pa_end = pa_start + NBPDR;
 4736                                 }
 4737                         }
 4738                         tmpva = trunc_2mpage(tmpva) + NBPDR;
 4739                 } else {
                              /* 4KB mapping. */
 4740                         pte = pmap_pde_to_pte(pde, tmpva);
 4741                         if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
 4742                                 pmap_pte_attr(pte, cache_bits_pte);
 4743                                 changed = TRUE;
 4744                         }
 4745                         if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
 4746                                 if (pa_start == pa_end) {
 4747                                         /* Start physical address run. */
 4748                                         pa_start = *pte & PG_FRAME;
 4749                                         pa_end = pa_start + PAGE_SIZE;
 4750                                 } else if (pa_end == (*pte & PG_FRAME))
 4751                                         pa_end += PAGE_SIZE;
 4752                                 else {
 4753                                         /* Run ended, update direct map. */
 4754                                         error = pmap_change_attr_locked(
 4755                                             PHYS_TO_DMAP(pa_start),
 4756                                             pa_end - pa_start, mode);
 4757                                         if (error != 0)
 4758                                                 break;
 4759                                         /* Start physical address run. */
 4760                                         pa_start = *pte & PG_FRAME;
 4761                                         pa_end = pa_start + PAGE_SIZE;
 4762                                 }
 4763                         }
 4764                         tmpva += PAGE_SIZE;
 4765                 }
 4766         }
              /* Apply "mode" to the direct map range of the final pending run. */
 4767         if (error == 0 && pa_start != pa_end)
 4768                 error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
 4769                     pa_end - pa_start, mode);
 4770 
 4771         /*
 4772          * Flush CPU caches if required to make sure any data isn't cached that
 4773          * shouldn't be, etc.
 4774          */
              /*
               * The flushed range is [base, tmpva), i.e. up to wherever the update
               * loop stopped, and the flush is performed even when an error is
               * being returned.
               */
 4775         if (changed) {
 4776                 pmap_invalidate_range(kernel_pmap, base, tmpva);
 4777                 pmap_invalidate_cache_range(base, tmpva);
 4778         }
 4779         return (error);
 4780 }
 4781 
 4782 /*
 4783  * perform the pmap work for mincore
       *
       * Looks up the mapping of "addr" in "pmap" while holding the pmap lock and
       * returns a mask of MINCORE_* flags describing it.  Zero is returned when
       * the page directory entry is missing or invalid, or when the 4KB page
       * table entry is zero.  For a 2MB mapping MINCORE_SUPER is included.
       *
       * For managed mappings the modified/referenced state is reported twice:
       * the plain MINCORE_MODIFIED / MINCORE_REFERENCED flags reflect the bits
       * of this pmap's own entry, while the *_OTHER flags are also set when the
       * vm_page itself indicates modification or reference.
 4784  */
 4785 int
 4786 pmap_mincore(pmap_t pmap, vm_offset_t addr)
 4787 {
 4788         pd_entry_t *pdep;
 4789         pt_entry_t pte;
 4790         vm_paddr_t pa;
 4791         vm_page_t m;
 4792         int val = 0;
 4793         
              /*
               * Take a snapshot of the entry mapping "addr" (and the physical
               * address it maps) under the pmap lock; everything after the unlock
               * works on this snapshot.
               */
 4794         PMAP_LOCK(pmap);
 4795         pdep = pmap_pde(pmap, addr);
 4796         if (pdep != NULL && (*pdep & PG_V)) {
 4797                 if (*pdep & PG_PS) {
                              /* 2MB mapping: the PDE itself plays the role of the PTE. */
 4798                         pte = *pdep;
 4799                         val = MINCORE_SUPER;
 4800                         /* Compute the physical address of the 4KB page. */
 4801                         pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
 4802                             PG_FRAME;
 4803                 } else {
 4804                         pte = *pmap_pde_to_pte(pdep, addr);
 4805                         pa = pte & PG_FRAME;
 4806                 }
 4807         } else {
 4808                 pte = 0;
 4809                 pa = 0;
 4810         }
 4811         PMAP_UNLOCK(pmap);
 4812 
 4813         if (pte != 0) {
 4814                 val |= MINCORE_INCORE;
                      /* Unmanaged mappings report residency only. */
 4815                 if ((pte & PG_MANAGED) == 0)
 4816                         return val;
 4817 
 4818                 m = PHYS_TO_VM_PAGE(pa);
 4819 
 4820                 /*
 4821                  * Modified by us
 4822                  */
 4823                 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 4824                         val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
 4825                 else {
 4826                         /*
 4827                          * Modified by someone else
 4828                          */
 4829                         vm_page_lock_queues();
 4830                         if (m->dirty || pmap_is_modified(m))
 4831                                 val |= MINCORE_MODIFIED_OTHER;
 4832                         vm_page_unlock_queues();
 4833                 }
 4834                 /*
 4835                  * Referenced by us
 4836                  */
 4837                 if (pte & PG_A)
 4838                         val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
 4839                 else {
 4840                         /*
 4841                          * Referenced by someone else
 4842                          */
                              /*
                               * NOTE(review): pmap_ts_referenced() presumably also
                               * clears the reference bits it finds, which would
                               * explain why PG_REFERENCED is recorded on the page
                               * afterwards -- confirm against its definition.
                               */
 4843                         vm_page_lock_queues();
 4844                         if ((m->flags & PG_REFERENCED) ||
 4845                             pmap_ts_referenced(m)) {
 4846                                 val |= MINCORE_REFERENCED_OTHER;
 4847                                 vm_page_flag_set(m, PG_REFERENCED);
 4848                         }
 4849                         vm_page_unlock_queues();
 4850                 }
 4851         } 
 4852         return val;
 4853 }
 4854 
      /*
       * Switches the current CPU to the address space of the process that owns
       * thread "td".
       *
       * Inside a critical section the function clears this CPU's mask bit in
       * the pm_active field of the pmap recorded in the per-CPU "curpmap" (when
       * one is recorded), sets the bit in the new pmap's pm_active, stores the
       * physical address of the new pmap's PML4 page in the thread's PCB
       * (pcb_cr3), and loads it into %cr3.
       *
       * NOTE(review): the per-CPU "curpmap" is read but not updated here; it is
       * presumably maintained elsewhere (e.g. by the context switch code) --
       * confirm.
       */
 4855 void
 4856 pmap_activate(struct thread *td)
 4857 {
 4858         pmap_t  pmap, oldpmap;
 4859         u_int64_t  cr3;
 4860 
 4861         critical_enter();
 4862         pmap = vmspace_pmap(td->td_proc->p_vmspace);
 4863         oldpmap = PCPU_GET(curpmap);
              /*
               * Despite the indentation, each "if (oldpmap)" below guards only the
               * single statement that follows it (the clear of the old pmap's
               * active bit); the new pmap's bit is always set.  The SMP variant
               * uses atomic operations on pm_active.
               */
 4864 #ifdef SMP
 4865 if (oldpmap)    /* XXX FIXME */
 4866         atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
 4867         atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
 4868 #else
 4869 if (oldpmap)    /* XXX FIXME */
 4870         oldpmap->pm_active &= ~PCPU_GET(cpumask);
 4871         pmap->pm_active |= PCPU_GET(cpumask);
 4872 #endif
              /* pm_pml4 is translated with DMAP_TO_PHYS() to get the %cr3 value. */
 4873         cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4);
 4874         td->td_pcb->pcb_cr3 = cr3;
 4875         load_cr3(cr3);
 4876         critical_exit();
 4877 }
 4878 
      /*
       * Instruction cache synchronization hook for the range [va, va + sz) of
       * pmap "pm".  The body is intentionally empty, so all three arguments are
       * ignored.
       * NOTE(review): presumably no explicit instruction cache maintenance is
       * needed on this architecture -- confirm.
       */
 4879 void
 4880 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
 4881 {
 4882 }
 4883 
 4884 /*
 4885  *      Increase the starting virtual address of the given mapping if a
 4886  *      different alignment might result in more superpage mappings.
       *
       *      "*addr" is the proposed starting virtual address and is updated in
       *      place; it is never decreased.  "offset" is the offset of the
       *      mapping within "object" (which may be NULL), and "size" is the
       *      length of the mapping.  When an adjustment is made, the new
       *      address has the same offset within a 2MB superpage (NBPDR bytes)
       *      as the, possibly color-adjusted, object offset.
       *
       *      Nothing is changed if the mapping is smaller than a superpage, if
       *      it would not contain a whole superpage once its start is rounded
       *      up to the next superpage boundary of the offset, or if "*addr"
       *      already has the desired alignment.
 4887  */
 4888 void
 4889 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
 4890     vm_offset_t *addr, vm_size_t size)
 4891 {
 4892         vm_offset_t superpage_offset;
 4893 
 4894         if (size < NBPDR)
 4895                 return;
              /* For a colored object, the object's page color shifts the offset. */
 4896         if (object != NULL && (object->flags & OBJ_COLORED) != 0)
 4897                 offset += ptoa(object->pg_color);
 4898         superpage_offset = offset & PDRMASK;
              /*
               * (NBPDR - superpage_offset) & PDRMASK is the number of bytes from
               * the offset up to the next superpage boundary (zero when the offset
               * is already aligned); what remains of "size" after that must still
               * be at least one superpage for a realignment to be worthwhile.
               */
 4899         if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
 4900             (*addr & PDRMASK) == superpage_offset)
 4901                 return;
              /*
               * Move "*addr" up to the nearest higher address whose offset within
               * a superpage equals superpage_offset: within the current superpage
               * if that offset is still ahead, otherwise within the next one.
               */
 4902         if ((*addr & PDRMASK) < superpage_offset)
 4903                 *addr = (*addr & ~PDRMASK) + superpage_offset;
 4904         else
 4905                 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
 4906 }

Cache object: e9465dd410388f64db9c260e376d508a


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.