The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/powerpc/aim/mmu_radix.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2018 Matthew Macy
    5  *
    6  * Redistribution and use in source and binary forms, with or without
    7  * modification, are permitted provided that the following conditions
    8  * are met:
    9  *
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  *
   16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
   17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
   19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
   20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
   21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
   25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   26  */
   27 
   28 #include "opt_platform.h"
   29 
   30 #include <sys/cdefs.h>
   31 __FBSDID("$FreeBSD$");
   32 
   33 #include <sys/param.h>
   34 #include <sys/kernel.h>
   35 #include <sys/systm.h>
   36 #include <sys/conf.h>
   37 #include <sys/bitstring.h>
   38 #include <sys/queue.h>
   39 #include <sys/cpuset.h>
   40 #include <sys/endian.h>
   41 #include <sys/kerneldump.h>
   42 #include <sys/ktr.h>
   43 #include <sys/lock.h>
   44 #include <sys/syslog.h>
   45 #include <sys/msgbuf.h>
   46 #include <sys/malloc.h>
   47 #include <sys/mman.h>
   48 #include <sys/mutex.h>
   49 #include <sys/proc.h>
   50 #include <sys/rwlock.h>
   51 #include <sys/sched.h>
   52 #include <sys/sysctl.h>
   53 #include <sys/systm.h>
   54 #include <sys/vmem.h>
   55 #include <sys/vmmeter.h>
   56 #include <sys/smp.h>
   57 
   58 #include <sys/kdb.h>
   59 
   60 #include <dev/ofw/openfirm.h>
   61 
   62 #include <vm/vm.h>
   63 #include <vm/pmap.h>
   64 #include <vm/vm_param.h>
   65 #include <vm/vm_kern.h>
   66 #include <vm/vm_page.h>
   67 #include <vm/vm_map.h>
   68 #include <vm/vm_object.h>
   69 #include <vm/vm_extern.h>
   70 #include <vm/vm_pageout.h>
   71 #include <vm/vm_phys.h>
   72 #include <vm/vm_reserv.h>
   73 #include <vm/vm_dumpset.h>
   74 #include <vm/uma.h>
   75 
   76 #include <machine/_inttypes.h>
   77 #include <machine/cpu.h>
   78 #include <machine/platform.h>
   79 #include <machine/frame.h>
   80 #include <machine/md_var.h>
   81 #include <machine/psl.h>
   82 #include <machine/bat.h>
   83 #include <machine/hid.h>
   84 #include <machine/pte.h>
   85 #include <machine/sr.h>
   86 #include <machine/trap.h>
   87 #include <machine/mmuvar.h>
   88 
   89 /* For pseries bit. */
   90 #include <powerpc/pseries/phyp-hvcall.h>
   91 
   92 #ifdef INVARIANTS
   93 #include <vm/uma_dbg.h>
   94 #endif
   95 
   96 #define PPC_BITLSHIFT(bit)      (sizeof(long)*NBBY - 1 - (bit))
   97 #define PPC_BIT(bit)            (1UL << PPC_BITLSHIFT(bit))
   98 #define PPC_BITLSHIFT_VAL(val, bit) ((val) << PPC_BITLSHIFT(bit))
   99 
  100 #include "opt_ddb.h"
  101 
  102 #ifdef DDB
  103 static void pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va);
  104 #endif
  105 
  106 #define PG_W    RPTE_WIRED
  107 #define PG_V    RPTE_VALID
  108 #define PG_MANAGED      RPTE_MANAGED
  109 #define PG_PROMOTED     RPTE_PROMOTED
  110 #define PG_M    RPTE_C
  111 #define PG_A    RPTE_R
  112 #define PG_X    RPTE_EAA_X
  113 #define PG_RW   RPTE_EAA_W
  114 #define PG_PTE_CACHE RPTE_ATTR_MASK
  115 
  116 #define RPTE_SHIFT 9
  117 #define NLS_MASK ((1UL<<5)-1)
  118 #define RPTE_ENTRIES (1UL<<RPTE_SHIFT)
  119 #define RPTE_MASK (RPTE_ENTRIES-1)
  120 
  121 #define NLB_SHIFT 0
  122 #define NLB_MASK (((1UL<<52)-1) << 8)
  123 
  124 extern int nkpt;
  125 extern caddr_t crashdumpmap;
  126 
  127 #define RIC_FLUSH_TLB 0
  128 #define RIC_FLUSH_PWC 1
  129 #define RIC_FLUSH_ALL 2
  130 
  131 #define POWER9_TLB_SETS_RADIX   128     /* # sets in POWER9 TLB Radix mode */
  132 
  133 #define PPC_INST_TLBIE                  0x7c000264
  134 #define PPC_INST_TLBIEL                 0x7c000224
  135 #define PPC_INST_SLBIA                  0x7c0003e4
  136 
  137 #define ___PPC_RA(a)    (((a) & 0x1f) << 16)
  138 #define ___PPC_RB(b)    (((b) & 0x1f) << 11)
  139 #define ___PPC_RS(s)    (((s) & 0x1f) << 21)
  140 #define ___PPC_RT(t)    ___PPC_RS(t)
  141 #define ___PPC_R(r)     (((r) & 0x1) << 16)
  142 #define ___PPC_PRS(prs) (((prs) & 0x1) << 17)
  143 #define ___PPC_RIC(ric) (((ric) & 0x3) << 18)
  144 
  145 #define PPC_SLBIA(IH)   __XSTRING(.long PPC_INST_SLBIA | \
  146                                        ((IH & 0x7) << 21))
  147 #define PPC_TLBIE_5(rb,rs,ric,prs,r)                            \
  148         __XSTRING(.long PPC_INST_TLBIE |                        \
  149                           ___PPC_RB(rb) | ___PPC_RS(rs) |       \
  150                           ___PPC_RIC(ric) | ___PPC_PRS(prs) |   \
  151                           ___PPC_R(r))
  152 
  153 #define PPC_TLBIEL(rb,rs,ric,prs,r) \
  154          __XSTRING(.long PPC_INST_TLBIEL | \
  155                            ___PPC_RB(rb) | ___PPC_RS(rs) |      \
  156                            ___PPC_RIC(ric) | ___PPC_PRS(prs) |  \
  157                            ___PPC_R(r))
  158 
  159 #define PPC_INVALIDATE_ERAT             PPC_SLBIA(7)
  160 
  161 static __inline void
  162 ttusync(void)
  163 {
  164         __asm __volatile("eieio; tlbsync; ptesync" ::: "memory");
  165 }
  166 
  167 #define TLBIEL_INVAL_SEL_MASK   0xc00   /* invalidation selector */
  168 #define  TLBIEL_INVAL_PAGE      0x000   /* invalidate a single page */
  169 #define  TLBIEL_INVAL_SET_PID   0x400   /* invalidate a set for the current PID */
  170 #define  TLBIEL_INVAL_SET_LPID  0x800   /* invalidate a set for current LPID */
  171 #define  TLBIEL_INVAL_SET       0xc00   /* invalidate a set for all LPIDs */
  172 
  173 #define TLBIE_ACTUAL_PAGE_MASK          0xe0
  174 #define  TLBIE_ACTUAL_PAGE_4K           0x00
  175 #define  TLBIE_ACTUAL_PAGE_64K          0xa0
  176 #define  TLBIE_ACTUAL_PAGE_2M           0x20
  177 #define  TLBIE_ACTUAL_PAGE_1G           0x40
  178 
  179 #define TLBIE_PRS_PARTITION_SCOPE       0x0
  180 #define TLBIE_PRS_PROCESS_SCOPE 0x1
  181 
  182 #define TLBIE_RIC_INVALIDATE_TLB        0x0     /* Invalidate just TLB */
  183 #define TLBIE_RIC_INVALIDATE_PWC        0x1     /* Invalidate just PWC */
  184 #define TLBIE_RIC_INVALIDATE_ALL        0x2     /* Invalidate TLB, PWC,
  185                                                  * cached {proc, part}tab entries
  186                                                  */
  187 #define TLBIE_RIC_INVALIDATE_SEQ        0x3     /* HPT - only:
  188                                                  * Invalidate a range of translations
  189                                                  */
  190 
  191 static __always_inline void
  192 radix_tlbie(uint8_t ric, uint8_t prs, uint16_t is, uint32_t pid, uint32_t lpid,
  193                         vm_offset_t va, uint16_t ap)
  194 {
  195         uint64_t rb, rs;
  196 
  197         MPASS((va & PAGE_MASK) == 0);
  198 
  199         rs = ((uint64_t)pid << 32) | lpid;
  200         rb = va | is | ap;
  201         __asm __volatile(PPC_TLBIE_5(%0, %1, %2, %3, 1) : :
  202                 "r" (rb), "r" (rs), "i" (ric), "i" (prs) : "memory");
  203 }
  204 
  205 static __inline void
  206 radix_tlbie_fixup(uint32_t pid, vm_offset_t va, int ap)
  207 {
  208 
  209         __asm __volatile("ptesync" ::: "memory");
  210         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
  211             TLBIEL_INVAL_PAGE, 0, 0, va, ap);
  212         __asm __volatile("ptesync" ::: "memory");
  213         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
  214             TLBIEL_INVAL_PAGE, pid, 0, va, ap);
  215 }
  216 
  217 static __inline void
  218 radix_tlbie_invlpg_user_4k(uint32_t pid, vm_offset_t va)
  219 {
  220 
  221         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
  222                 TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_4K);
  223         radix_tlbie_fixup(pid, va, TLBIE_ACTUAL_PAGE_4K);
  224 }
  225 
  226 static __inline void
  227 radix_tlbie_invlpg_user_2m(uint32_t pid, vm_offset_t va)
  228 {
  229 
  230         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
  231                 TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_2M);
  232         radix_tlbie_fixup(pid, va, TLBIE_ACTUAL_PAGE_2M);
  233 }
  234 
  235 static __inline void
  236 radix_tlbie_invlpwc_user(uint32_t pid)
  237 {
  238 
  239         radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE,
  240                 TLBIEL_INVAL_SET_PID, pid, 0, 0, 0);
  241 }
  242 
  243 static __inline void
  244 radix_tlbie_flush_user(uint32_t pid)
  245 {
  246 
  247         radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE,
  248                 TLBIEL_INVAL_SET_PID, pid, 0, 0, 0);
  249 }
  250 
  251 static __inline void
  252 radix_tlbie_invlpg_kernel_4k(vm_offset_t va)
  253 {
  254 
  255         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
  256             TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_4K);
  257         radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_4K);
  258 }
  259 
  260 static __inline void
  261 radix_tlbie_invlpg_kernel_2m(vm_offset_t va)
  262 {
  263 
  264         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
  265             TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_2M);
  266         radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_2M);
  267 }
  268 
  269 /* 1GB pages aren't currently supported. */
  270 static __inline __unused void
  271 radix_tlbie_invlpg_kernel_1g(vm_offset_t va)
  272 {
  273 
  274         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
  275             TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_1G);
  276         radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_1G);
  277 }
  278 
  279 static __inline void
  280 radix_tlbie_invlpwc_kernel(void)
  281 {
  282 
  283         radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE,
  284             TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0);
  285 }
  286 
  287 static __inline void
  288 radix_tlbie_flush_kernel(void)
  289 {
  290 
  291         radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE,
  292             TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0);
  293 }
  294 
  295 static __inline vm_pindex_t
  296 pmap_l3e_pindex(vm_offset_t va)
  297 {
  298         return ((va & PG_FRAME) >> L3_PAGE_SIZE_SHIFT);
  299 }
  300 
  301 static __inline vm_pindex_t
  302 pmap_pml3e_index(vm_offset_t va)
  303 {
  304 
  305         return ((va >> L3_PAGE_SIZE_SHIFT) & RPTE_MASK);
  306 }
  307 
  308 static __inline vm_pindex_t
  309 pmap_pml2e_index(vm_offset_t va)
  310 {
  311         return ((va >> L2_PAGE_SIZE_SHIFT) & RPTE_MASK);
  312 }
  313 
  314 static __inline vm_pindex_t
  315 pmap_pml1e_index(vm_offset_t va)
  316 {
  317         return ((va & PG_FRAME) >> L1_PAGE_SIZE_SHIFT);
  318 }
  319 
  320 /* Return various clipped indexes for a given VA */
  321 static __inline vm_pindex_t
  322 pmap_pte_index(vm_offset_t va)
  323 {
  324 
  325         return ((va >> PAGE_SHIFT) & RPTE_MASK);
  326 }
  327 
  328 /* Return a pointer to the PT slot that corresponds to a VA */
  329 static __inline pt_entry_t *
  330 pmap_l3e_to_pte(pt_entry_t *l3e, vm_offset_t va)
  331 {
  332         pt_entry_t *pte;
  333         vm_paddr_t ptepa;
  334 
  335         ptepa = (be64toh(*l3e) & NLB_MASK);
  336         pte = (pt_entry_t *)PHYS_TO_DMAP(ptepa);
  337         return (&pte[pmap_pte_index(va)]);
  338 }
  339 
  340 /* Return a pointer to the PD slot that corresponds to a VA */
  341 static __inline pt_entry_t *
  342 pmap_l2e_to_l3e(pt_entry_t *l2e, vm_offset_t va)
  343 {
  344         pt_entry_t *l3e;
  345         vm_paddr_t l3pa;
  346 
  347         l3pa = (be64toh(*l2e) & NLB_MASK);
  348         l3e = (pml3_entry_t *)PHYS_TO_DMAP(l3pa);
  349         return (&l3e[pmap_pml3e_index(va)]);
  350 }
  351 
  352 /* Return a pointer to the PD slot that corresponds to a VA */
  353 static __inline pt_entry_t *
  354 pmap_l1e_to_l2e(pt_entry_t *l1e, vm_offset_t va)
  355 {
  356         pt_entry_t *l2e;
  357         vm_paddr_t l2pa;
  358 
  359         l2pa = (be64toh(*l1e) & NLB_MASK);
  360 
  361         l2e = (pml2_entry_t *)PHYS_TO_DMAP(l2pa);
  362         return (&l2e[pmap_pml2e_index(va)]);
  363 }
  364 
  365 static __inline pml1_entry_t *
  366 pmap_pml1e(pmap_t pmap, vm_offset_t va)
  367 {
  368 
  369         return (&pmap->pm_pml1[pmap_pml1e_index(va)]);
  370 }
  371 
  372 static pt_entry_t *
  373 pmap_pml2e(pmap_t pmap, vm_offset_t va)
  374 {
  375         pt_entry_t *l1e;
  376 
  377         l1e = pmap_pml1e(pmap, va);
  378         if (l1e == NULL || (be64toh(*l1e) & RPTE_VALID) == 0)
  379                 return (NULL);
  380         return (pmap_l1e_to_l2e(l1e, va));
  381 }
  382 
  383 static __inline pt_entry_t *
  384 pmap_pml3e(pmap_t pmap, vm_offset_t va)
  385 {
  386         pt_entry_t *l2e;
  387 
  388         l2e = pmap_pml2e(pmap, va);
  389         if (l2e == NULL || (be64toh(*l2e) & RPTE_VALID) == 0)
  390                 return (NULL);
  391         return (pmap_l2e_to_l3e(l2e, va));
  392 }
  393 
  394 static __inline pt_entry_t *
  395 pmap_pte(pmap_t pmap, vm_offset_t va)
  396 {
  397         pt_entry_t *l3e;
  398 
  399         l3e = pmap_pml3e(pmap, va);
  400         if (l3e == NULL || (be64toh(*l3e) & RPTE_VALID) == 0)
  401                 return (NULL);
  402         return (pmap_l3e_to_pte(l3e, va));
  403 }
  404 
  405 int nkpt = 64;
  406 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
  407     "Number of kernel page table pages allocated on bootup");
  408 
  409 vm_paddr_t dmaplimit;
  410 
  411 SYSCTL_DECL(_vm_pmap);
  412 
  413 #ifdef INVARIANTS
  414 #define VERBOSE_PMAP 0
  415 #define VERBOSE_PROTECT 0
  416 static int pmap_logging;
  417 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_logging, CTLFLAG_RWTUN,
  418     &pmap_logging, 0, "verbose debug logging");
  419 #endif
  420 
  421 static u_int64_t        KPTphys;        /* phys addr of kernel level 1 */
  422 
  423 //static vm_paddr_t     KERNend;        /* phys addr of end of bootstrap data */
  424 
  425 static vm_offset_t qframe = 0;
  426 static struct mtx qframe_mtx;
  427 
  428 void mmu_radix_activate(struct thread *);
  429 void mmu_radix_advise(pmap_t, vm_offset_t, vm_offset_t, int);
  430 void mmu_radix_align_superpage(vm_object_t, vm_ooffset_t, vm_offset_t *,
  431     vm_size_t);
  432 void mmu_radix_clear_modify(vm_page_t);
  433 void mmu_radix_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t);
  434 int mmu_radix_decode_kernel_ptr(vm_offset_t, int *, vm_offset_t *);
  435 int mmu_radix_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int, int8_t);
  436 void mmu_radix_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t,
  437         vm_prot_t);
  438 void mmu_radix_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t);
  439 vm_paddr_t mmu_radix_extract(pmap_t pmap, vm_offset_t va);
  440 vm_page_t mmu_radix_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t);
  441 void mmu_radix_kenter(vm_offset_t, vm_paddr_t);
  442 vm_paddr_t mmu_radix_kextract(vm_offset_t);
  443 void mmu_radix_kremove(vm_offset_t);
  444 boolean_t mmu_radix_is_modified(vm_page_t);
  445 boolean_t mmu_radix_is_prefaultable(pmap_t, vm_offset_t);
  446 boolean_t mmu_radix_is_referenced(vm_page_t);
  447 void mmu_radix_object_init_pt(pmap_t, vm_offset_t, vm_object_t,
  448         vm_pindex_t, vm_size_t);
  449 boolean_t mmu_radix_page_exists_quick(pmap_t, vm_page_t);
  450 void mmu_radix_page_init(vm_page_t);
  451 boolean_t mmu_radix_page_is_mapped(vm_page_t m);
  452 void mmu_radix_page_set_memattr(vm_page_t, vm_memattr_t);
  453 int mmu_radix_page_wired_mappings(vm_page_t);
  454 int mmu_radix_pinit(pmap_t);
  455 void mmu_radix_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
  456 bool mmu_radix_ps_enabled(pmap_t);
  457 void mmu_radix_qenter(vm_offset_t, vm_page_t *, int);
  458 void mmu_radix_qremove(vm_offset_t, int);
  459 vm_offset_t mmu_radix_quick_enter_page(vm_page_t);
  460 void mmu_radix_quick_remove_page(vm_offset_t);
  461 boolean_t mmu_radix_ts_referenced(vm_page_t);
  462 void mmu_radix_release(pmap_t);
  463 void mmu_radix_remove(pmap_t, vm_offset_t, vm_offset_t);
  464 void mmu_radix_remove_all(vm_page_t);
  465 void mmu_radix_remove_pages(pmap_t);
  466 void mmu_radix_remove_write(vm_page_t);
  467 void mmu_radix_unwire(pmap_t, vm_offset_t, vm_offset_t);
  468 void mmu_radix_zero_page(vm_page_t);
  469 void mmu_radix_zero_page_area(vm_page_t, int, int);
  470 int mmu_radix_change_attr(vm_offset_t, vm_size_t, vm_memattr_t);
  471 void mmu_radix_page_array_startup(long pages);
  472 
  473 #include "mmu_oea64.h"
  474 
  475 /*
  476  * Kernel MMU interface
  477  */
  478 
  479 static void     mmu_radix_bootstrap(vm_offset_t, vm_offset_t);
  480 
  481 static void mmu_radix_copy_page(vm_page_t, vm_page_t);
  482 static void mmu_radix_copy_pages(vm_page_t *ma, vm_offset_t a_offset,
  483     vm_page_t *mb, vm_offset_t b_offset, int xfersize);
  484 static void mmu_radix_growkernel(vm_offset_t);
  485 static void mmu_radix_init(void);
  486 static int mmu_radix_mincore(pmap_t, vm_offset_t, vm_paddr_t *);
  487 static vm_offset_t mmu_radix_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int);
  488 static void mmu_radix_pinit0(pmap_t);
  489 
  490 static void *mmu_radix_mapdev(vm_paddr_t, vm_size_t);
  491 static void *mmu_radix_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t);
  492 static void mmu_radix_unmapdev(void *, vm_size_t);
  493 static void mmu_radix_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t ma);
  494 static boolean_t mmu_radix_dev_direct_mapped(vm_paddr_t, vm_size_t);
  495 static void mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz, void **va);
  496 static void mmu_radix_scan_init(void);
  497 static void     mmu_radix_cpu_bootstrap(int ap);
  498 static void     mmu_radix_tlbie_all(void);
  499 
  500 static struct pmap_funcs mmu_radix_methods = {
  501         .bootstrap = mmu_radix_bootstrap,
  502         .copy_page = mmu_radix_copy_page,
  503         .copy_pages = mmu_radix_copy_pages,
  504         .cpu_bootstrap = mmu_radix_cpu_bootstrap,
  505         .growkernel = mmu_radix_growkernel,
  506         .init = mmu_radix_init,
  507         .map =                  mmu_radix_map,
  508         .mincore =              mmu_radix_mincore,
  509         .pinit = mmu_radix_pinit,
  510         .pinit0 = mmu_radix_pinit0,
  511 
  512         .mapdev = mmu_radix_mapdev,
  513         .mapdev_attr = mmu_radix_mapdev_attr,
  514         .unmapdev = mmu_radix_unmapdev,
  515         .kenter_attr = mmu_radix_kenter_attr,
  516         .dev_direct_mapped = mmu_radix_dev_direct_mapped,
  517         .dumpsys_pa_init = mmu_radix_scan_init,
  518         .dumpsys_map_chunk = mmu_radix_dumpsys_map,
  519         .page_is_mapped = mmu_radix_page_is_mapped,
  520         .ps_enabled = mmu_radix_ps_enabled,
  521         .align_superpage = mmu_radix_align_superpage,
  522         .object_init_pt = mmu_radix_object_init_pt,
  523         .protect = mmu_radix_protect,
  524         /* pmap dispatcher interface */
  525         .clear_modify = mmu_radix_clear_modify,
  526         .copy = mmu_radix_copy,
  527         .enter = mmu_radix_enter,
  528         .enter_object = mmu_radix_enter_object,
  529         .enter_quick = mmu_radix_enter_quick,
  530         .extract = mmu_radix_extract,
  531         .extract_and_hold = mmu_radix_extract_and_hold,
  532         .is_modified = mmu_radix_is_modified,
  533         .is_prefaultable = mmu_radix_is_prefaultable,
  534         .is_referenced = mmu_radix_is_referenced,
  535         .ts_referenced = mmu_radix_ts_referenced,
  536         .page_exists_quick = mmu_radix_page_exists_quick,
  537         .page_init = mmu_radix_page_init,
  538         .page_wired_mappings =  mmu_radix_page_wired_mappings,
  539         .qenter = mmu_radix_qenter,
  540         .qremove = mmu_radix_qremove,
  541         .release = mmu_radix_release,
  542         .remove = mmu_radix_remove,
  543         .remove_all = mmu_radix_remove_all,
  544         .remove_write = mmu_radix_remove_write,
  545         .unwire = mmu_radix_unwire,
  546         .zero_page = mmu_radix_zero_page,
  547         .zero_page_area = mmu_radix_zero_page_area,
  548         .activate = mmu_radix_activate,
  549         .quick_enter_page =  mmu_radix_quick_enter_page,
  550         .quick_remove_page =  mmu_radix_quick_remove_page,
  551         .page_set_memattr = mmu_radix_page_set_memattr,
  552         .page_array_startup =  mmu_radix_page_array_startup,
  553 
  554         /* Internal interfaces */
  555         .kenter = mmu_radix_kenter,
  556         .kextract = mmu_radix_kextract,
  557         .kremove = mmu_radix_kremove,
  558         .change_attr = mmu_radix_change_attr,
  559         .decode_kernel_ptr =  mmu_radix_decode_kernel_ptr,
  560 
  561         .tlbie_all = mmu_radix_tlbie_all,
  562 };
  563 
  564 MMU_DEF(mmu_radix, MMU_TYPE_RADIX, mmu_radix_methods);
  565 
  566 static boolean_t pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va,
  567         struct rwlock **lockp);
  568 static boolean_t pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va);
  569 static int pmap_unuse_pt(pmap_t, vm_offset_t, pml3_entry_t, struct spglist *);
  570 static int pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva,
  571     struct spglist *free, struct rwlock **lockp);
  572 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
  573     pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
  574 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
  575 static bool pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *pde,
  576     struct spglist *free);
  577 static bool     pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
  578         pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp);
  579 
  580 static bool     pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e,
  581                     u_int flags, struct rwlock **lockp);
  582 #if VM_NRESERVLEVEL > 0
  583 static void     pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
  584         struct rwlock **lockp);
  585 #endif
  586 static void     pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
  587 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
  588 static vm_page_t mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
  589         vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate);
  590 
  591 static bool     pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
  592         vm_prot_t prot, struct rwlock **lockp);
  593 static int      pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde,
  594         u_int flags, vm_page_t m, struct rwlock **lockp);
  595 
  596 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
  597 static void free_pv_chunk(struct pv_chunk *pc);
  598 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp);
  599 static vm_page_t pmap_allocl3e(pmap_t pmap, vm_offset_t va,
  600         struct rwlock **lockp);
  601 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
  602         struct rwlock **lockp);
  603 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
  604     struct spglist *free);
  605 static boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free);
  606 
  607 static void pmap_invalidate_page(pmap_t pmap, vm_offset_t start);
  608 static void pmap_invalidate_all(pmap_t pmap);
  609 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush);
  610 
  611 /*
  612  * Internal flags for pmap_enter()'s helper functions.
  613  */
  614 #define PMAP_ENTER_NORECLAIM    0x1000000       /* Don't reclaim PV entries. */
  615 #define PMAP_ENTER_NOREPLACE    0x2000000       /* Don't replace mappings. */
  616 
  617 #define UNIMPLEMENTED() panic("%s not implemented", __func__)
  618 #define UNTESTED() panic("%s not yet tested", __func__)
  619 
  620 /* Number of supported PID bits */
  621 static unsigned int isa3_pid_bits;
  622 
  623 /* PID to start allocating from */
  624 static unsigned int isa3_base_pid;
  625 
  626 #define PROCTAB_SIZE_SHIFT      (isa3_pid_bits + 4)
  627 #define PROCTAB_ENTRIES (1ul << isa3_pid_bits)
  628 
  629 /*
  630  * Map of physical memory regions.
  631  */
  632 static struct   mem_region *regions, *pregions;
  633 static struct   numa_mem_region *numa_pregions;
  634 static u_int    phys_avail_count;
  635 static int      regions_sz, pregions_sz, numa_pregions_sz;
  636 static struct pate *isa3_parttab;
  637 static struct prte *isa3_proctab;
  638 static vmem_t *asid_arena;
  639 
  640 extern void bs_remap_earlyboot(void);
  641 
  642 #define RADIX_PGD_SIZE_SHIFT    16
  643 #define RADIX_PGD_SIZE  (1UL << RADIX_PGD_SIZE_SHIFT)
  644 
  645 #define RADIX_PGD_INDEX_SHIFT   (RADIX_PGD_SIZE_SHIFT-3)
  646 #define NL2EPG (PAGE_SIZE/sizeof(pml2_entry_t))
  647 #define NL3EPG (PAGE_SIZE/sizeof(pml3_entry_t))
  648 
  649 #define NUPML1E         (RADIX_PGD_SIZE/sizeof(uint64_t))       /* number of userland PML1 pages */
  650 #define NUPDPE          (NUPML1E * NL2EPG)/* number of userland PDP pages */
  651 #define NUPDE           (NUPDPE * NL3EPG)       /* number of userland PD entries */
  652 
  653 /* POWER9 only permits a 64k partition table size. */
  654 #define PARTTAB_SIZE_SHIFT      16
  655 #define PARTTAB_SIZE    (1UL << PARTTAB_SIZE_SHIFT)
  656 
  657 #define PARTTAB_HR              (1UL << 63) /* host uses radix */
  658 #define PARTTAB_GR              (1UL << 63) /* guest uses radix must match host */
  659 
  660 /* TLB flush actions. Used as argument to tlbiel_flush() */
  661 enum {
  662         TLB_INVAL_SCOPE_LPID = 2,       /* invalidate TLBs for current LPID */
  663         TLB_INVAL_SCOPE_GLOBAL = 3,     /* invalidate all TLBs */
  664 };
  665 
  666 #define NPV_LIST_LOCKS  MAXCPU
  667 static int pmap_initialized;
  668 static vm_paddr_t proctab0pa;
  669 static vm_paddr_t parttab_phys;
  670 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
  671 
  672 /*
  673  * Data for the pv entry allocation mechanism.
  674  * Updates to pv_invl_gen are protected by the pv_list_locks[]
  675  * elements, but reads are not.
  676  */
  677 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
  678 static struct mtx __exclusive_cache_line pv_chunks_mutex;
  679 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
  680 static struct md_page *pv_table;
  681 static struct md_page pv_dummy;
  682 
  683 #ifdef PV_STATS
  684 #define PV_STAT(x)      do { x ; } while (0)
  685 #else
  686 #define PV_STAT(x)      do { } while (0)
  687 #endif
  688 
  689 #define pa_radix_index(pa)      ((pa) >> L3_PAGE_SIZE_SHIFT)
  690 #define pa_to_pvh(pa)   (&pv_table[pa_radix_index(pa)])
  691 
  692 #define PHYS_TO_PV_LIST_LOCK(pa)        \
  693                         (&pv_list_locks[pa_radix_index(pa) % NPV_LIST_LOCKS])
  694 
  695 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)  do {    \
  696         struct rwlock **_lockp = (lockp);               \
  697         struct rwlock *_new_lock;                       \
  698                                                         \
  699         _new_lock = PHYS_TO_PV_LIST_LOCK(pa);           \
  700         if (_new_lock != *_lockp) {                     \
  701                 if (*_lockp != NULL)                    \
  702                         rw_wunlock(*_lockp);            \
  703                 *_lockp = _new_lock;                    \
  704                 rw_wlock(*_lockp);                      \
  705         }                                               \
  706 } while (0)
  707 
  708 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)        \
  709         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
  710 
  711 #define RELEASE_PV_LIST_LOCK(lockp)             do {    \
  712         struct rwlock **_lockp = (lockp);               \
  713                                                         \
  714         if (*_lockp != NULL) {                          \
  715                 rw_wunlock(*_lockp);                    \
  716                 *_lockp = NULL;                         \
  717         }                                               \
  718 } while (0)
  719 
  720 #define VM_PAGE_TO_PV_LIST_LOCK(m)      \
  721         PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
  722 
  723 /*
  724  * We support 52 bits, hence:
  725  * bits 52 - 31 = 21, 0b10101
  726  * RTS encoding details
  727  * bits 0 - 3 of rts -> bits 6 - 8 unsigned long
  728  * bits 4 - 5 of rts -> bits 62 - 63 of unsigned long
  729  */
  730 #define RTS_SIZE ((0x2UL << 61) | (0x5UL << 5))
  731 
  732 static int powernv_enabled = 1;
  733 
  734 static __always_inline void
  735 tlbiel_radix_set_isa300(uint32_t set, uint32_t is,
  736         uint32_t pid, uint32_t ric, uint32_t prs)
  737 {
  738         uint64_t rb;
  739         uint64_t rs;
  740 
  741         rb = PPC_BITLSHIFT_VAL(set, 51) | PPC_BITLSHIFT_VAL(is, 53);
  742         rs = PPC_BITLSHIFT_VAL((uint64_t)pid, 31);
  743 
  744         __asm __volatile(PPC_TLBIEL(%0, %1, %2, %3, 1)
  745                      : : "r"(rb), "r"(rs), "i"(ric), "i"(prs)
  746                      : "memory");
  747 }
  748 
  749 static void
  750 tlbiel_flush_isa3(uint32_t num_sets, uint32_t is)
  751 {
  752         uint32_t set;
  753 
  754         __asm __volatile("ptesync": : :"memory");
  755 
  756         /*
  757          * Flush the first set of the TLB, and the entire Page Walk Cache
  758          * and partition table entries. Then flush the remaining sets of the
  759          * TLB.
  760          */
  761         if (is == TLB_INVAL_SCOPE_GLOBAL) {
  762                 tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
  763                 for (set = 1; set < num_sets; set++)
  764                         tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);
  765         }
  766 
  767         /* Do the same for process scoped entries. */
  768         tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);
  769         for (set = 1; set < num_sets; set++)
  770                 tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);
  771 
  772         __asm __volatile("ptesync": : :"memory");
  773 }
  774 
  775 static void
  776 mmu_radix_tlbiel_flush(int scope)
  777 {
  778         MPASS(scope == TLB_INVAL_SCOPE_LPID ||
  779                   scope == TLB_INVAL_SCOPE_GLOBAL);
  780 
  781         tlbiel_flush_isa3(POWER9_TLB_SETS_RADIX, scope);
  782         __asm __volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
  783 }
  784 
  785 static void
  786 mmu_radix_tlbie_all(void)
  787 {
  788         if (powernv_enabled)
  789                 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL);
  790         else
  791                 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_LPID);
  792 }
  793 
  794 static void
  795 mmu_radix_init_amor(void)
  796 {
  797         /*
  798         * In HV mode, we init AMOR (Authority Mask Override Register) so that
  799         * the hypervisor and guest can setup IAMR (Instruction Authority Mask
  800         * Register), enable key 0 and set it to 1.
  801         *
  802         * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
  803         */
  804         mtspr(SPR_AMOR, (3ul << 62));
  805 }
  806 
  807 static void
  808 mmu_radix_init_iamr(void)
  809 {
  810         /*
  811          * Radix always uses key0 of the IAMR to determine if an access is
  812          * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
  813          * fetch.
  814          */
  815         mtspr(SPR_IAMR, (1ul << 62));
  816 }
  817 
  818 static void
  819 mmu_radix_pid_set(pmap_t pmap)
  820 {
  821 
  822         mtspr(SPR_PID, pmap->pm_pid);
  823         isync();
  824 }
  825 
  826 /* Quick sort callout for comparing physical addresses. */
  827 static int
  828 pa_cmp(const void *a, const void *b)
  829 {
  830         const vm_paddr_t *pa = a, *pb = b;
  831 
  832         if (*pa < *pb)
  833                 return (-1);
  834         else if (*pa > *pb)
  835                 return (1);
  836         else
  837                 return (0);
  838 }
  839 
  840 #define pte_load_store(ptep, pte)       atomic_swap_long(ptep, pte)
  841 #define pte_load_clear(ptep)            atomic_swap_long(ptep, 0)
  842 #define pte_store(ptep, pte) do {          \
  843         MPASS((pte) & (RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_X));  \
  844         *(u_long *)(ptep) = htobe64((u_long)((pte) | PG_V | RPTE_LEAF)); \
  845 } while (0)
  846 /*
  847  * NB: should only be used for adding directories - not for direct mappings
  848  */
  849 #define pde_store(ptep, pa) do {                                \
  850         *(u_long *)(ptep) = htobe64((u_long)(pa|RPTE_VALID|RPTE_SHIFT)); \
  851 } while (0)
  852 
  853 #define pte_clear(ptep) do {                                    \
  854                 *(u_long *)(ptep) = (u_long)(0);                \
  855 } while (0)
  856 
  857 #define PMAP_PDE_SUPERPAGE      (1 << 8)        /* supports 2MB superpages */
  858 
  859 /*
  860  * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB
  861  * (PTE) page mappings have identical settings for the following fields:
  862  */
  863 #define PG_PTE_PROMOTE  (PG_X | PG_MANAGED | PG_W | PG_PTE_CACHE | \
  864             PG_M | PG_A | RPTE_EAA_MASK | PG_V)
  865 
  866 static __inline void
  867 pmap_resident_count_inc(pmap_t pmap, int count)
  868 {
  869 
  870         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  871         pmap->pm_stats.resident_count += count;
  872 }
  873 
  874 static __inline void
  875 pmap_resident_count_dec(pmap_t pmap, int count)
  876 {
  877 
  878         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  879         KASSERT(pmap->pm_stats.resident_count >= count,
  880             ("pmap %p resident count underflow %ld %d", pmap,
  881             pmap->pm_stats.resident_count, count));
  882         pmap->pm_stats.resident_count -= count;
  883 }
  884 
  885 static void
  886 pagezero(vm_offset_t va)
  887 {
  888         va = trunc_page(va);
  889 
  890         bzero((void *)va, PAGE_SIZE);
  891 }
  892 
  893 static uint64_t
  894 allocpages(int n)
  895 {
  896         u_int64_t ret;
  897 
  898         ret = moea64_bootstrap_alloc(n * PAGE_SIZE, PAGE_SIZE);
  899         for (int i = 0; i < n; i++)
  900                 pagezero(PHYS_TO_DMAP(ret + i * PAGE_SIZE));
  901         return (ret);
  902 }
  903 
  904 static pt_entry_t *
  905 kvtopte(vm_offset_t va)
  906 {
  907         pt_entry_t *l3e;
  908 
  909         l3e = pmap_pml3e(kernel_pmap, va);
  910         if (l3e == NULL || (be64toh(*l3e) & RPTE_VALID) == 0)
  911                 return (NULL);
  912         return (pmap_l3e_to_pte(l3e, va));
  913 }
  914 
  915 void
  916 mmu_radix_kenter(vm_offset_t va, vm_paddr_t pa)
  917 {
  918         pt_entry_t *pte;
  919 
  920         pte = kvtopte(va);
  921         MPASS(pte != NULL);
  922         *pte = htobe64(pa | RPTE_VALID | RPTE_LEAF | RPTE_EAA_R | \
  923             RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A);
  924 }
  925 
  926 bool
  927 mmu_radix_ps_enabled(pmap_t pmap)
  928 {
  929         return (superpages_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
  930 }
  931 
  932 static pt_entry_t *
  933 pmap_nofault_pte(pmap_t pmap, vm_offset_t va, int *is_l3e)
  934 {
  935         pml3_entry_t *l3e;
  936         pt_entry_t *pte;
  937 
  938         va &= PG_PS_FRAME;
  939         l3e = pmap_pml3e(pmap, va);
  940         if (l3e == NULL || (be64toh(*l3e) & PG_V) == 0)
  941                 return (NULL);
  942 
  943         if (be64toh(*l3e) & RPTE_LEAF) {
  944                 *is_l3e = 1;
  945                 return (l3e);
  946         }
  947         *is_l3e = 0;
  948         va &= PG_FRAME;
  949         pte = pmap_l3e_to_pte(l3e, va);
  950         if (pte == NULL || (be64toh(*pte) & PG_V) == 0)
  951                 return (NULL);
  952         return (pte);
  953 }
  954 
  955 int
  956 pmap_nofault(pmap_t pmap, vm_offset_t va, vm_prot_t flags)
  957 {
  958         pt_entry_t *pte;
  959         pt_entry_t startpte, origpte, newpte;
  960         vm_page_t m;
  961         int is_l3e;
  962 
  963         startpte = 0;
  964  retry:
  965         if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL)
  966                 return (KERN_INVALID_ADDRESS);
  967         origpte = newpte = be64toh(*pte);
  968         if (startpte == 0) {
  969                 startpte = origpte;
  970                 if (((flags & VM_PROT_WRITE) && (startpte & PG_M)) ||
  971                     ((flags & VM_PROT_READ) && (startpte & PG_A))) {
  972                         pmap_invalidate_all(pmap);
  973 #ifdef INVARIANTS
  974                         if (VERBOSE_PMAP || pmap_logging)
  975                                 printf("%s(%p, %#lx, %#x) (%#lx) -- invalidate all\n",
  976                                     __func__, pmap, va, flags, origpte);
  977 #endif
  978                         return (KERN_FAILURE);
  979                 }
  980         }
  981 #ifdef INVARIANTS
  982         if (VERBOSE_PMAP || pmap_logging)
  983                 printf("%s(%p, %#lx, %#x) (%#lx)\n", __func__, pmap, va,
  984                     flags, origpte);
  985 #endif
  986         PMAP_LOCK(pmap);
  987         if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL ||
  988             be64toh(*pte) != origpte) {
  989                 PMAP_UNLOCK(pmap);
  990                 return (KERN_FAILURE);
  991         }
  992         m = PHYS_TO_VM_PAGE(newpte & PG_FRAME);
  993         MPASS(m != NULL);
  994         switch (flags) {
  995         case VM_PROT_READ:
  996                 if ((newpte & (RPTE_EAA_R|RPTE_EAA_X)) == 0)
  997                         goto protfail;
  998                 newpte |= PG_A;
  999                 vm_page_aflag_set(m, PGA_REFERENCED);
 1000                 break;
 1001         case VM_PROT_WRITE:
 1002                 if ((newpte & RPTE_EAA_W) == 0)
 1003                         goto protfail;
 1004                 if (is_l3e)
 1005                         goto protfail;
 1006                 newpte |= PG_M;
 1007                 vm_page_dirty(m);
 1008                 break;
 1009         case VM_PROT_EXECUTE:
 1010                 if ((newpte & RPTE_EAA_X) == 0)
 1011                         goto protfail;
 1012                 newpte |= PG_A;
 1013                 vm_page_aflag_set(m, PGA_REFERENCED);
 1014                 break;
 1015         }
 1016 
 1017         if (!atomic_cmpset_long(pte, htobe64(origpte), htobe64(newpte)))
 1018                 goto retry;
 1019         ptesync();
 1020         PMAP_UNLOCK(pmap);
 1021         if (startpte == newpte)
 1022                 return (KERN_FAILURE);
 1023         return (0);
 1024  protfail:
 1025         PMAP_UNLOCK(pmap);
 1026         return (KERN_PROTECTION_FAILURE);
 1027 }
 1028 
 1029 /*
 1030  * Returns TRUE if the given page is mapped individually or as part of
 1031  * a 2mpage.  Otherwise, returns FALSE.
 1032  */
 1033 boolean_t
 1034 mmu_radix_page_is_mapped(vm_page_t m)
 1035 {
 1036         struct rwlock *lock;
 1037         boolean_t rv;
 1038 
 1039         if ((m->oflags & VPO_UNMANAGED) != 0)
 1040                 return (FALSE);
 1041         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 1042         rw_rlock(lock);
 1043         rv = !TAILQ_EMPTY(&m->md.pv_list) ||
 1044             ((m->flags & PG_FICTITIOUS) == 0 &&
 1045             !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
 1046         rw_runlock(lock);
 1047         return (rv);
 1048 }
 1049 
 1050 /*
 1051  * Determine the appropriate bits to set in a PTE or PDE for a specified
 1052  * caching mode.
 1053  */
 1054 static int
 1055 pmap_cache_bits(vm_memattr_t ma)
 1056 {
 1057         if (ma != VM_MEMATTR_DEFAULT) {
 1058                 switch (ma) {
 1059                 case VM_MEMATTR_UNCACHEABLE:
 1060                         return (RPTE_ATTR_GUARDEDIO);
 1061                 case VM_MEMATTR_CACHEABLE:
 1062                         return (RPTE_ATTR_MEM);
 1063                 case VM_MEMATTR_WRITE_BACK:
 1064                 case VM_MEMATTR_PREFETCHABLE:
 1065                 case VM_MEMATTR_WRITE_COMBINING:
 1066                         return (RPTE_ATTR_UNGUARDEDIO);
 1067                 }
 1068         }
 1069         return (0);
 1070 }
 1071 
 1072 static void
 1073 pmap_invalidate_page(pmap_t pmap, vm_offset_t start)
 1074 {
 1075         ptesync();
 1076         if (pmap == kernel_pmap)
 1077                 radix_tlbie_invlpg_kernel_4k(start);
 1078         else
 1079                 radix_tlbie_invlpg_user_4k(pmap->pm_pid, start);
 1080         ttusync();
 1081 }
 1082 
 1083 static void
 1084 pmap_invalidate_page_2m(pmap_t pmap, vm_offset_t start)
 1085 {
 1086         ptesync();
 1087         if (pmap == kernel_pmap)
 1088                 radix_tlbie_invlpg_kernel_2m(start);
 1089         else
 1090                 radix_tlbie_invlpg_user_2m(pmap->pm_pid, start);
 1091         ttusync();
 1092 }
 1093 
 1094 static void
 1095 pmap_invalidate_pwc(pmap_t pmap)
 1096 {
 1097         ptesync();
 1098         if (pmap == kernel_pmap)
 1099                 radix_tlbie_invlpwc_kernel();
 1100         else
 1101                 radix_tlbie_invlpwc_user(pmap->pm_pid);
 1102         ttusync();
 1103 }
 1104 
 1105 static void
 1106 pmap_invalidate_range(pmap_t pmap, vm_offset_t start, vm_offset_t end)
 1107 {
 1108         if (((start - end) >> PAGE_SHIFT) > 8) {
 1109                 pmap_invalidate_all(pmap);
 1110                 return;
 1111         }
 1112         ptesync();
 1113         if (pmap == kernel_pmap) {
 1114                 while (start < end) {
 1115                         radix_tlbie_invlpg_kernel_4k(start);
 1116                         start += PAGE_SIZE;
 1117                 }
 1118         } else {
 1119                 while (start < end) {
 1120                         radix_tlbie_invlpg_user_4k(pmap->pm_pid, start);
 1121                         start += PAGE_SIZE;
 1122                 }
 1123         }
 1124         ttusync();
 1125 }
 1126 
 1127 static void
 1128 pmap_invalidate_all(pmap_t pmap)
 1129 {
 1130         ptesync();
 1131         if (pmap == kernel_pmap)
 1132                 radix_tlbie_flush_kernel();
 1133         else
 1134                 radix_tlbie_flush_user(pmap->pm_pid);
 1135         ttusync();
 1136 }
 1137 
 1138 static void
 1139 pmap_invalidate_l3e_page(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e)
 1140 {
 1141 
 1142         /*
 1143          * When the PDE has PG_PROMOTED set, the 2MB page mapping was created
 1144          * by a promotion that did not invalidate the 512 4KB page mappings
 1145          * that might exist in the TLB.  Consequently, at this point, the TLB
 1146          * may hold both 4KB and 2MB page mappings for the address range [va,
 1147          * va + L3_PAGE_SIZE).  Therefore, the entire range must be invalidated here.
 1148          * In contrast, when PG_PROMOTED is clear, the TLB will not hold any
 1149          * 4KB page mappings for the address range [va, va + L3_PAGE_SIZE), and so a
 1150          * single INVLPG suffices to invalidate the 2MB page mapping from the
 1151          * TLB.
 1152          */
 1153         ptesync();
 1154         if ((l3e & PG_PROMOTED) != 0)
 1155                 pmap_invalidate_range(pmap, va, va + L3_PAGE_SIZE - 1);
 1156         else
 1157                 pmap_invalidate_page_2m(pmap, va);
 1158 
 1159         pmap_invalidate_pwc(pmap);
 1160 }
 1161 
 1162 static __inline struct pv_chunk *
 1163 pv_to_chunk(pv_entry_t pv)
 1164 {
 1165 
 1166         return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
 1167 }
 1168 
 1169 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
 1170 
 1171 #define PC_FREE0        0xfffffffffffffffful
 1172 #define PC_FREE1        ((1ul << (_NPCPV % 64)) - 1)
 1173 
 1174 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1 };
 1175 
 1176 /*
 1177  * Ensure that the number of spare PV entries in the specified pmap meets or
 1178  * exceeds the given count, "needed".
 1179  *
 1180  * The given PV list lock may be released.
 1181  */
 1182 static void
 1183 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
 1184 {
 1185         struct pch new_tail;
 1186         struct pv_chunk *pc;
 1187         vm_page_t m;
 1188         int avail, free;
 1189         bool reclaimed;
 1190 
 1191         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 1192         KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
 1193 
 1194         /*
 1195          * Newly allocated PV chunks must be stored in a private list until
 1196          * the required number of PV chunks have been allocated.  Otherwise,
 1197          * reclaim_pv_chunk() could recycle one of these chunks.  In
 1198          * contrast, these chunks must be added to the pmap upon allocation.
 1199          */
 1200         TAILQ_INIT(&new_tail);
 1201 retry:
 1202         avail = 0;
 1203         TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
 1204                 //              if ((cpu_feature2 & CPUID2_POPCNT) == 0)
 1205                 bit_count((bitstr_t *)pc->pc_map, 0,
 1206                                   sizeof(pc->pc_map) * NBBY, &free);
 1207 #if 0
 1208                 free = popcnt_pc_map_pq(pc->pc_map);
 1209 #endif
 1210                 if (free == 0)
 1211                         break;
 1212                 avail += free;
 1213                 if (avail >= needed)
 1214                         break;
 1215         }
 1216         for (reclaimed = false; avail < needed; avail += _NPCPV) {
 1217                 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
 1218                 if (m == NULL) {
 1219                         m = reclaim_pv_chunk(pmap, lockp);
 1220                         if (m == NULL)
 1221                                 goto retry;
 1222                         reclaimed = true;
 1223                 }
 1224                 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
 1225                 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
 1226                 dump_add_page(m->phys_addr);
 1227                 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 1228                 pc->pc_pmap = pmap;
 1229                 pc->pc_map[0] = PC_FREE0;
 1230                 pc->pc_map[1] = PC_FREE1;
 1231                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 1232                 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
 1233                 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
 1234 
 1235                 /*
 1236                  * The reclaim might have freed a chunk from the current pmap.
 1237                  * If that chunk contained available entries, we need to
 1238                  * re-count the number of available entries.
 1239                  */
 1240                 if (reclaimed)
 1241                         goto retry;
 1242         }
 1243         if (!TAILQ_EMPTY(&new_tail)) {
 1244                 mtx_lock(&pv_chunks_mutex);
 1245                 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
 1246                 mtx_unlock(&pv_chunks_mutex);
 1247         }
 1248 }
 1249 
 1250 /*
 1251  * First find and then remove the pv entry for the specified pmap and virtual
 1252  * address from the specified pv list.  Returns the pv entry if found and NULL
 1253  * otherwise.  This operation can be performed on pv lists for either 4KB or
 1254  * 2MB page mappings.
 1255  */
 1256 static __inline pv_entry_t
 1257 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 1258 {
 1259         pv_entry_t pv;
 1260 
 1261         TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
 1262 #ifdef INVARIANTS
 1263                 if (PV_PMAP(pv) == NULL) {
 1264                         printf("corrupted pv_chunk/pv %p\n", pv);
 1265                         printf("pv_chunk: %64D\n", pv_to_chunk(pv), ":");
 1266                 }
 1267                 MPASS(PV_PMAP(pv) != NULL);
 1268                 MPASS(pv->pv_va != 0);
 1269 #endif
 1270                 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
 1271                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
 1272                         pvh->pv_gen++;
 1273                         break;
 1274                 }
 1275         }
 1276         return (pv);
 1277 }
 1278 
 1279 /*
 1280  * After demotion from a 2MB page mapping to 512 4KB page mappings,
 1281  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
 1282  * entries for each of the 4KB page mappings.
 1283  */
 1284 static void
 1285 pmap_pv_demote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 1286     struct rwlock **lockp)
 1287 {
 1288         struct md_page *pvh;
 1289         struct pv_chunk *pc;
 1290         pv_entry_t pv;
 1291         vm_offset_t va_last;
 1292         vm_page_t m;
 1293         int bit, field;
 1294 
 1295         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 1296         KASSERT((pa & L3_PAGE_MASK) == 0,
 1297             ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
 1298         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 1299 
 1300         /*
 1301          * Transfer the 2mpage's pv entry for this mapping to the first
 1302          * page's pv list.  Once this transfer begins, the pv list lock
 1303          * must not be released until the last pv entry is reinstantiated.
 1304          */
 1305         pvh = pa_to_pvh(pa);
 1306         va = trunc_2mpage(va);
 1307         pv = pmap_pvh_remove(pvh, pmap, va);
 1308         KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
 1309         m = PHYS_TO_VM_PAGE(pa);
 1310         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
 1311 
 1312         m->md.pv_gen++;
 1313         /* Instantiate the remaining NPTEPG - 1 pv entries. */
 1314         PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
 1315         va_last = va + L3_PAGE_SIZE - PAGE_SIZE;
 1316         for (;;) {
 1317                 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 1318                 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0
 1319                     , ("pmap_pv_demote_pde: missing spare"));
 1320                 for (field = 0; field < _NPCM; field++) {
 1321                         while (pc->pc_map[field]) {
 1322                                 bit = cnttzd(pc->pc_map[field]);
 1323                                 pc->pc_map[field] &= ~(1ul << bit);
 1324                                 pv = &pc->pc_pventry[field * 64 + bit];
 1325                                 va += PAGE_SIZE;
 1326                                 pv->pv_va = va;
 1327                                 m++;
 1328                                 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 1329                             ("pmap_pv_demote_pde: page %p is not managed", m));
 1330                                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
 1331 
 1332                                 m->md.pv_gen++;
 1333                                 if (va == va_last)
 1334                                         goto out;
 1335                         }
 1336                 }
 1337                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 1338                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 1339         }
 1340 out:
 1341         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) {
 1342                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 1343                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 1344         }
 1345         PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
 1346         PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
 1347 }
 1348 
 1349 static void
 1350 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap)
 1351 {
 1352 
 1353         if (pmap == NULL)
 1354                 return;
 1355         pmap_invalidate_all(pmap);
 1356         if (pmap != locked_pmap)
 1357                 PMAP_UNLOCK(pmap);
 1358 }
 1359 
 1360 /*
 1361  * We are in a serious low memory condition.  Resort to
 1362  * drastic measures to free some pages so we can allocate
 1363  * another pv entry chunk.
 1364  *
 1365  * Returns NULL if PV entries were reclaimed from the specified pmap.
 1366  *
 1367  * We do not, however, unmap 2mpages because subsequent accesses will
 1368  * allocate per-page pv entries until repromotion occurs, thereby
 1369  * exacerbating the shortage of free pv entries.
 1370  */
 1371 static int active_reclaims = 0;
 1372 static vm_page_t
 1373 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
 1374 {
 1375         struct pv_chunk *pc, *pc_marker, *pc_marker_end;
 1376         struct pv_chunk_header pc_marker_b, pc_marker_end_b;
 1377         struct md_page *pvh;
 1378         pml3_entry_t *l3e;
 1379         pmap_t next_pmap, pmap;
 1380         pt_entry_t *pte, tpte;
 1381         pv_entry_t pv;
 1382         vm_offset_t va;
 1383         vm_page_t m, m_pc;
 1384         struct spglist free;
 1385         uint64_t inuse;
 1386         int bit, field, freed;
 1387 
 1388         PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
 1389         KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
 1390         pmap = NULL;
 1391         m_pc = NULL;
 1392         SLIST_INIT(&free);
 1393         bzero(&pc_marker_b, sizeof(pc_marker_b));
 1394         bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
 1395         pc_marker = (struct pv_chunk *)&pc_marker_b;
 1396         pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
 1397 
 1398         mtx_lock(&pv_chunks_mutex);
 1399         active_reclaims++;
 1400         TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
 1401         TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
 1402         while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
 1403             SLIST_EMPTY(&free)) {
 1404                 next_pmap = pc->pc_pmap;
 1405                 if (next_pmap == NULL) {
 1406                         /*
 1407                          * The next chunk is a marker.  However, it is
 1408                          * not our marker, so active_reclaims must be
 1409                          * > 1.  Consequently, the next_chunk code
 1410                          * will not rotate the pv_chunks list.
 1411                          */
 1412                         goto next_chunk;
 1413                 }
 1414                 mtx_unlock(&pv_chunks_mutex);
 1415 
 1416                 /*
 1417                  * A pv_chunk can only be removed from the pc_lru list
 1418                  * when both pc_chunks_mutex is owned and the
 1419                  * corresponding pmap is locked.
 1420                  */
 1421                 if (pmap != next_pmap) {
 1422                         reclaim_pv_chunk_leave_pmap(pmap, locked_pmap);
 1423                         pmap = next_pmap;
 1424                         /* Avoid deadlock and lock recursion. */
 1425                         if (pmap > locked_pmap) {
 1426                                 RELEASE_PV_LIST_LOCK(lockp);
 1427                                 PMAP_LOCK(pmap);
 1428                                 mtx_lock(&pv_chunks_mutex);
 1429                                 continue;
 1430                         } else if (pmap != locked_pmap) {
 1431                                 if (PMAP_TRYLOCK(pmap)) {
 1432                                         mtx_lock(&pv_chunks_mutex);
 1433                                         continue;
 1434                                 } else {
 1435                                         pmap = NULL; /* pmap is not locked */
 1436                                         mtx_lock(&pv_chunks_mutex);
 1437                                         pc = TAILQ_NEXT(pc_marker, pc_lru);
 1438                                         if (pc == NULL ||
 1439                                             pc->pc_pmap != next_pmap)
 1440                                                 continue;
 1441                                         goto next_chunk;
 1442                                 }
 1443                         }
 1444                 }
 1445 
 1446                 /*
 1447                  * Destroy every non-wired, 4 KB page mapping in the chunk.
 1448                  */
 1449                 freed = 0;
 1450                 for (field = 0; field < _NPCM; field++) {
 1451                         for (inuse = ~pc->pc_map[field] & pc_freemask[field];
 1452                             inuse != 0; inuse &= ~(1UL << bit)) {
 1453                                 bit = cnttzd(inuse);
 1454                                 pv = &pc->pc_pventry[field * 64 + bit];
 1455                                 va = pv->pv_va;
 1456                                 l3e = pmap_pml3e(pmap, va);
 1457                                 if ((be64toh(*l3e) & RPTE_LEAF) != 0)
 1458                                         continue;
 1459                                 pte = pmap_l3e_to_pte(l3e, va);
 1460                                 if ((be64toh(*pte) & PG_W) != 0)
 1461                                         continue;
 1462                                 tpte = be64toh(pte_load_clear(pte));
 1463                                 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
 1464                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 1465                                         vm_page_dirty(m);
 1466                                 if ((tpte & PG_A) != 0)
 1467                                         vm_page_aflag_set(m, PGA_REFERENCED);
 1468                                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 1469                                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
 1470 
 1471                                 m->md.pv_gen++;
 1472                                 if (TAILQ_EMPTY(&m->md.pv_list) &&
 1473                                     (m->flags & PG_FICTITIOUS) == 0) {
 1474                                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 1475                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
 1476                                                 vm_page_aflag_clear(m,
 1477                                                     PGA_WRITEABLE);
 1478                                         }
 1479                                 }
 1480                                 pc->pc_map[field] |= 1UL << bit;
 1481                                 pmap_unuse_pt(pmap, va, be64toh(*l3e), &free);
 1482                                 freed++;
 1483                         }
 1484                 }
 1485                 if (freed == 0) {
 1486                         mtx_lock(&pv_chunks_mutex);
 1487                         goto next_chunk;
 1488                 }
 1489                 /* Every freed mapping is for a 4 KB page. */
 1490                 pmap_resident_count_dec(pmap, freed);
 1491                 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
 1492                 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
 1493                 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 1494                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 1495                 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1) {
 1496                         PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
 1497                         PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
 1498                         PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
 1499                         /* Entire chunk is free; return it. */
 1500                         m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 1501                         dump_drop_page(m_pc->phys_addr);
 1502                         mtx_lock(&pv_chunks_mutex);
 1503                         TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 1504                         break;
 1505                 }
 1506                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 1507                 mtx_lock(&pv_chunks_mutex);
 1508                 /* One freed pv entry in locked_pmap is sufficient. */
 1509                 if (pmap == locked_pmap)
 1510                         break;
 1511 next_chunk:
 1512                 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
 1513                 TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
 1514                 if (active_reclaims == 1 && pmap != NULL) {
 1515                         /*
 1516                          * Rotate the pv chunks list so that we do not
 1517                          * scan the same pv chunks that could not be
 1518                          * freed (because they contained a wired
 1519                          * and/or superpage mapping) on every
 1520                          * invocation of reclaim_pv_chunk().
 1521                          */
 1522                         while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
 1523                                 MPASS(pc->pc_pmap != NULL);
 1524                                 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 1525                                 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 1526                         }
 1527                 }
 1528         }
 1529         TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
 1530         TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
 1531         active_reclaims--;
 1532         mtx_unlock(&pv_chunks_mutex);
 1533         reclaim_pv_chunk_leave_pmap(pmap, locked_pmap);
 1534         if (m_pc == NULL && !SLIST_EMPTY(&free)) {
 1535                 m_pc = SLIST_FIRST(&free);
 1536                 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
 1537                 /* Recycle a freed page table page. */
 1538                 m_pc->ref_count = 1;
 1539         }
 1540         vm_page_free_pages_toq(&free, true);
 1541         return (m_pc);
 1542 }
 1543 
 1544 /*
 1545  * free the pv_entry back to the free list
 1546  */
 1547 static void
 1548 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 1549 {
 1550         struct pv_chunk *pc;
 1551         int idx, field, bit;
 1552 
 1553 #ifdef VERBOSE_PV
 1554         if (pmap != kernel_pmap)
 1555                 printf("%s(%p, %p)\n", __func__, pmap, pv);
 1556 #endif
 1557         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 1558         PV_STAT(atomic_add_long(&pv_entry_frees, 1));
 1559         PV_STAT(atomic_add_int(&pv_entry_spare, 1));
 1560         PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
 1561         pc = pv_to_chunk(pv);
 1562         idx = pv - &pc->pc_pventry[0];
 1563         field = idx / 64;
 1564         bit = idx % 64;
 1565         pc->pc_map[field] |= 1ul << bit;
 1566         if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1) {
 1567                 /* 98% of the time, pc is already at the head of the list. */
 1568                 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
 1569                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 1570                         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 1571                 }
 1572                 return;
 1573         }
 1574         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 1575         free_pv_chunk(pc);
 1576 }
 1577 
 1578 static void
 1579 free_pv_chunk(struct pv_chunk *pc)
 1580 {
 1581         vm_page_t m;
 1582 
 1583         mtx_lock(&pv_chunks_mutex);
 1584         TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 1585         mtx_unlock(&pv_chunks_mutex);
 1586         PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
 1587         PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
 1588         PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
 1589         /* entire chunk is free, return it */
 1590         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 1591         dump_drop_page(m->phys_addr);
 1592         vm_page_unwire_noq(m);
 1593         vm_page_free(m);
 1594 }
 1595 
 1596 /*
 1597  * Returns a new PV entry, allocating a new PV chunk from the system when
 1598  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
 1599  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
 1600  * returned.
 1601  *
 1602  * The given PV list lock may be released.
 1603  */
 1604 static pv_entry_t
 1605 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
 1606 {
 1607         int bit, field;
 1608         pv_entry_t pv;
 1609         struct pv_chunk *pc;
 1610         vm_page_t m;
 1611 
 1612         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 1613         PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
 1614 retry:
 1615         pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 1616         if (pc != NULL) {
 1617                 for (field = 0; field < _NPCM; field++) {
 1618                         if (pc->pc_map[field]) {
 1619                                 bit = cnttzd(pc->pc_map[field]);
 1620                                 break;
 1621                         }
 1622                 }
 1623                 if (field < _NPCM) {
 1624                         pv = &pc->pc_pventry[field * 64 + bit];
 1625                         pc->pc_map[field] &= ~(1ul << bit);
 1626                         /* If this was the last item, move it to tail */
 1627                         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) {
 1628                                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 1629                                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
 1630                                     pc_list);
 1631                         }
 1632                         PV_STAT(atomic_add_long(&pv_entry_count, 1));
 1633                         PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
 1634                         MPASS(PV_PMAP(pv) != NULL);
 1635                         return (pv);
 1636                 }
 1637         }
 1638         /* No free items, allocate another chunk */
 1639         m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
 1640         if (m == NULL) {
 1641                 if (lockp == NULL) {
 1642                         PV_STAT(pc_chunk_tryfail++);
 1643                         return (NULL);
 1644                 }
 1645                 m = reclaim_pv_chunk(pmap, lockp);
 1646                 if (m == NULL)
 1647                         goto retry;
 1648         }
 1649         PV_STAT(atomic_add_int(&pc_chunk_count, 1));
 1650         PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
 1651         dump_add_page(m->phys_addr);
 1652         pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 1653         pc->pc_pmap = pmap;
 1654         pc->pc_map[0] = PC_FREE0 & ~1ul;        /* preallocated bit 0 */
 1655         pc->pc_map[1] = PC_FREE1;
 1656         mtx_lock(&pv_chunks_mutex);
 1657         TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 1658         mtx_unlock(&pv_chunks_mutex);
 1659         pv = &pc->pc_pventry[0];
 1660         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 1661         PV_STAT(atomic_add_long(&pv_entry_count, 1));
 1662         PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
 1663         MPASS(PV_PMAP(pv) != NULL);
 1664         return (pv);
 1665 }
 1666 
 1667 #if VM_NRESERVLEVEL > 0
 1668 /*
 1669  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
 1670  * replace the many pv entries for the 4KB page mappings by a single pv entry
 1671  * for the 2MB page mapping.
 1672  */
 1673 static void
 1674 pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 1675     struct rwlock **lockp)
 1676 {
 1677         struct md_page *pvh;
 1678         pv_entry_t pv;
 1679         vm_offset_t va_last;
 1680         vm_page_t m;
 1681 
 1682         KASSERT((pa & L3_PAGE_MASK) == 0,
 1683             ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
 1684         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 1685 
 1686         /*
 1687          * Transfer the first page's pv entry for this mapping to the 2mpage's
 1688          * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
 1689          * a transfer avoids the possibility that get_pv_entry() calls
 1690          * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
 1691          * mappings that is being promoted.
 1692          */
 1693         m = PHYS_TO_VM_PAGE(pa);
 1694         va = trunc_2mpage(va);
 1695         pv = pmap_pvh_remove(&m->md, pmap, va);
 1696         KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
 1697         pvh = pa_to_pvh(pa);
 1698         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
 1699         pvh->pv_gen++;
 1700         /* Free the remaining NPTEPG - 1 pv entries. */
 1701         va_last = va + L3_PAGE_SIZE - PAGE_SIZE;
 1702         do {
 1703                 m++;
 1704                 va += PAGE_SIZE;
 1705                 pmap_pvh_free(&m->md, pmap, va);
 1706         } while (va < va_last);
 1707 }
 1708 #endif /* VM_NRESERVLEVEL > 0 */
 1709 
 1710 /*
 1711  * First find and then destroy the pv entry for the specified pmap and virtual
 1712  * address.  This operation can be performed on pv lists for either 4KB or 2MB
 1713  * page mappings.
 1714  */
 1715 static void
 1716 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 1717 {
 1718         pv_entry_t pv;
 1719 
 1720         pv = pmap_pvh_remove(pvh, pmap, va);
 1721         KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
 1722         free_pv_entry(pmap, pv);
 1723 }
 1724 
 1725 /*
 1726  * Conditionally create the PV entry for a 4KB page mapping if the required
 1727  * memory can be allocated without resorting to reclamation.
 1728  */
 1729 static boolean_t
 1730 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
 1731     struct rwlock **lockp)
 1732 {
 1733         pv_entry_t pv;
 1734 
 1735         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 1736         /* Pass NULL instead of the lock pointer to disable reclamation. */
 1737         if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
 1738                 pv->pv_va = va;
 1739                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 1740                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
 1741                 m->md.pv_gen++;
 1742                 return (TRUE);
 1743         } else
 1744                 return (FALSE);
 1745 }
 1746 
 1747 vm_paddr_t phys_avail_debug[2 * VM_PHYSSEG_MAX];
 1748 #ifdef INVARIANTS
 1749 static void
 1750 validate_addr(vm_paddr_t addr, vm_size_t size)
 1751 {
 1752         vm_paddr_t end = addr + size;
 1753         bool found = false;
 1754 
 1755         for (int i = 0; i < 2 * phys_avail_count; i += 2) {
 1756                 if (addr >= phys_avail_debug[i] &&
 1757                         end <= phys_avail_debug[i + 1]) {
 1758                         found = true;
 1759                         break;
 1760                 }
 1761         }
 1762         KASSERT(found, ("%#lx-%#lx outside of initial phys_avail array",
 1763                                         addr, end));
 1764 }
 1765 #else
 1766 static void validate_addr(vm_paddr_t addr, vm_size_t size) {}
 1767 #endif
 1768 #define DMAP_PAGE_BITS (RPTE_VALID | RPTE_LEAF | RPTE_EAA_MASK | PG_M | PG_A)
 1769 
 1770 static vm_paddr_t
 1771 alloc_pt_page(void)
 1772 {
 1773         vm_paddr_t page;
 1774 
 1775         page = allocpages(1);
 1776         pagezero(PHYS_TO_DMAP(page));
 1777         return (page);
 1778 }
 1779 
 1780 static void
 1781 mmu_radix_dmap_range(vm_paddr_t start, vm_paddr_t end)
 1782 {
 1783         pt_entry_t *pte, pteval;
 1784         vm_paddr_t page;
 1785 
 1786         if (bootverbose)
 1787                 printf("%s %lx -> %lx\n", __func__, start, end);
 1788         while (start < end) {
 1789                 pteval = start | DMAP_PAGE_BITS;
 1790                 pte = pmap_pml1e(kernel_pmap, PHYS_TO_DMAP(start));
 1791                 if ((be64toh(*pte) & RPTE_VALID) == 0) {
 1792                         page = alloc_pt_page();
 1793                         pde_store(pte, page);
 1794                 }
 1795                 pte = pmap_l1e_to_l2e(pte, PHYS_TO_DMAP(start));
 1796                 if ((start & L2_PAGE_MASK) == 0 &&
 1797                         end - start >= L2_PAGE_SIZE) {
 1798                         start += L2_PAGE_SIZE;
 1799                         goto done;
 1800                 } else if ((be64toh(*pte) & RPTE_VALID) == 0) {
 1801                         page = alloc_pt_page();
 1802                         pde_store(pte, page);
 1803                 }
 1804 
 1805                 pte = pmap_l2e_to_l3e(pte, PHYS_TO_DMAP(start));
 1806                 if ((start & L3_PAGE_MASK) == 0 &&
 1807                         end - start >= L3_PAGE_SIZE) {
 1808                         start += L3_PAGE_SIZE;
 1809                         goto done;
 1810                 } else if ((be64toh(*pte) & RPTE_VALID) == 0) {
 1811                         page = alloc_pt_page();
 1812                         pde_store(pte, page);
 1813                 }
 1814                 pte = pmap_l3e_to_pte(pte, PHYS_TO_DMAP(start));
 1815                 start += PAGE_SIZE;
 1816         done:
 1817                 pte_store(pte, pteval);
 1818         }
 1819 }
 1820 
 1821 static void
 1822 mmu_radix_dmap_populate(vm_size_t hwphyssz)
 1823 {
 1824         vm_paddr_t start, end;
 1825 
 1826         for (int i = 0; i < pregions_sz; i++) {
 1827                 start = pregions[i].mr_start;
 1828                 end = start + pregions[i].mr_size;
 1829                 if (hwphyssz && start >= hwphyssz)
 1830                         break;
 1831                 if (hwphyssz && hwphyssz < end)
 1832                         end = hwphyssz;
 1833                 mmu_radix_dmap_range(start, end);
 1834         }
 1835 }
 1836 
 1837 static void
 1838 mmu_radix_setup_pagetables(vm_size_t hwphyssz)
 1839 {
 1840         vm_paddr_t ptpages, pages;
 1841         pt_entry_t *pte;
 1842         vm_paddr_t l1phys;
 1843 
 1844         bzero(kernel_pmap, sizeof(struct pmap));
 1845         PMAP_LOCK_INIT(kernel_pmap);
 1846 
 1847         ptpages = allocpages(3);
 1848         l1phys = moea64_bootstrap_alloc(RADIX_PGD_SIZE, RADIX_PGD_SIZE);
 1849         validate_addr(l1phys, RADIX_PGD_SIZE);
 1850         if (bootverbose)
 1851                 printf("l1phys=%lx\n", l1phys);
 1852         MPASS((l1phys & (RADIX_PGD_SIZE-1)) == 0);
 1853         for (int i = 0; i < RADIX_PGD_SIZE/PAGE_SIZE; i++)
 1854                 pagezero(PHYS_TO_DMAP(l1phys + i * PAGE_SIZE));
 1855         kernel_pmap->pm_pml1 = (pml1_entry_t *)PHYS_TO_DMAP(l1phys);
 1856 
 1857         mmu_radix_dmap_populate(hwphyssz);
 1858 
 1859         /*
 1860          * Create page tables for first 128MB of KVA
 1861          */
 1862         pages = ptpages;
 1863         pte = pmap_pml1e(kernel_pmap, VM_MIN_KERNEL_ADDRESS);
 1864         *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT);
 1865         pages += PAGE_SIZE;
 1866         pte = pmap_l1e_to_l2e(pte, VM_MIN_KERNEL_ADDRESS);
 1867         *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT);
 1868         pages += PAGE_SIZE;
 1869         pte = pmap_l2e_to_l3e(pte, VM_MIN_KERNEL_ADDRESS);
 1870         /*
 1871          * the kernel page table pages need to be preserved in
 1872          * phys_avail and not overlap with previous  allocations
 1873          */
 1874         pages = allocpages(nkpt);
 1875         if (bootverbose) {
 1876                 printf("phys_avail after dmap populate and nkpt allocation\n");
 1877                 for (int j = 0; j < 2 * phys_avail_count; j+=2)
 1878                         printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n",
 1879                                    j, phys_avail[j], j + 1, phys_avail[j + 1]);
 1880         }
 1881         KPTphys = pages;
 1882         for (int i = 0; i < nkpt; i++, pte++, pages += PAGE_SIZE)
 1883                 *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT);
 1884         kernel_vm_end = VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE;
 1885         if (bootverbose)
 1886                 printf("kernel_pmap pml1 %p\n", kernel_pmap->pm_pml1);
 1887         /*
 1888          * Add a physical memory segment (vm_phys_seg) corresponding to the
 1889          * preallocated kernel page table pages so that vm_page structures
 1890          * representing these pages will be created.  The vm_page structures
 1891          * are required for promotion of the corresponding kernel virtual
 1892          * addresses to superpage mappings.
 1893          */
 1894         vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
 1895 }
 1896 
 1897 static void
 1898 mmu_radix_early_bootstrap(vm_offset_t start, vm_offset_t end)
 1899 {
 1900         vm_paddr_t      kpstart, kpend;
 1901         vm_size_t       physsz, hwphyssz;
 1902         //uint64_t      l2virt;
 1903         int             rm_pavail, proctab_size;
 1904         int             i, j;
 1905 
 1906         kpstart = start & ~DMAP_BASE_ADDRESS;
 1907         kpend = end & ~DMAP_BASE_ADDRESS;
 1908 
 1909         /* Get physical memory regions from firmware */
 1910         mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
 1911         CTR0(KTR_PMAP, "mmu_radix_early_bootstrap: physical memory");
 1912 
 1913         if (2 * VM_PHYSSEG_MAX < regions_sz)
 1914                 panic("mmu_radix_early_bootstrap: phys_avail too small");
 1915 
 1916         if (bootverbose)
 1917                 for (int i = 0; i < regions_sz; i++)
 1918                         printf("regions[%d].mr_start=%lx regions[%d].mr_size=%lx\n",
 1919                             i, regions[i].mr_start, i, regions[i].mr_size);
 1920         /*
 1921          * XXX workaround a simulator bug
 1922          */
 1923         for (int i = 0; i < regions_sz; i++)
 1924                 if (regions[i].mr_start & PAGE_MASK) {
 1925                         regions[i].mr_start += PAGE_MASK;
 1926                         regions[i].mr_start &= ~PAGE_MASK;
 1927                         regions[i].mr_size &= ~PAGE_MASK;
 1928                 }
 1929         if (bootverbose)
 1930                 for (int i = 0; i < pregions_sz; i++)
 1931                         printf("pregions[%d].mr_start=%lx pregions[%d].mr_size=%lx\n",
 1932                             i, pregions[i].mr_start, i, pregions[i].mr_size);
 1933 
 1934         phys_avail_count = 0;
 1935         physsz = 0;
 1936         hwphyssz = 0;
 1937         TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz);
 1938         for (i = 0, j = 0; i < regions_sz; i++) {
 1939                 if (bootverbose)
 1940                         printf("regions[%d].mr_start=%016lx regions[%d].mr_size=%016lx\n",
 1941                             i, regions[i].mr_start, i, regions[i].mr_size);
 1942 
 1943                 if (regions[i].mr_size < PAGE_SIZE)
 1944                         continue;
 1945 
 1946                 if (hwphyssz != 0 &&
 1947                     (physsz + regions[i].mr_size) >= hwphyssz) {
 1948                         if (physsz < hwphyssz) {
 1949                                 phys_avail[j] = regions[i].mr_start;
 1950                                 phys_avail[j + 1] = regions[i].mr_start +
 1951                                     (hwphyssz - physsz);
 1952                                 physsz = hwphyssz;
 1953                                 phys_avail_count++;
 1954                                 dump_avail[j] = phys_avail[j];
 1955                                 dump_avail[j + 1] = phys_avail[j + 1];
 1956                         }
 1957                         break;
 1958                 }
 1959                 phys_avail[j] = regions[i].mr_start;
 1960                 phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size;
 1961                 dump_avail[j] = phys_avail[j];
 1962                 dump_avail[j + 1] = phys_avail[j + 1];
 1963 
 1964                 phys_avail_count++;
 1965                 physsz += regions[i].mr_size;
 1966                 j += 2;
 1967         }
 1968 
 1969         /* Check for overlap with the kernel and exception vectors */
 1970         rm_pavail = 0;
 1971         for (j = 0; j < 2 * phys_avail_count; j+=2) {
 1972                 if (phys_avail[j] < EXC_LAST)
 1973                         phys_avail[j] += EXC_LAST;
 1974 
 1975                 if (phys_avail[j] >= kpstart &&
 1976                     phys_avail[j + 1] <= kpend) {
 1977                         phys_avail[j] = phys_avail[j + 1] = ~0;
 1978                         rm_pavail++;
 1979                         continue;
 1980                 }
 1981 
 1982                 if (kpstart >= phys_avail[j] &&
 1983                     kpstart < phys_avail[j + 1]) {
 1984                         if (kpend < phys_avail[j + 1]) {
 1985                                 phys_avail[2 * phys_avail_count] =
 1986                                     (kpend & ~PAGE_MASK) + PAGE_SIZE;
 1987                                 phys_avail[2 * phys_avail_count + 1] =
 1988                                     phys_avail[j + 1];
 1989                                 phys_avail_count++;
 1990                         }
 1991 
 1992                         phys_avail[j + 1] = kpstart & ~PAGE_MASK;
 1993                 }
 1994 
 1995                 if (kpend >= phys_avail[j] &&
 1996                     kpend < phys_avail[j + 1]) {
 1997                         if (kpstart > phys_avail[j]) {
 1998                                 phys_avail[2 * phys_avail_count] = phys_avail[j];
 1999                                 phys_avail[2 * phys_avail_count + 1] =
 2000                                     kpstart & ~PAGE_MASK;
 2001                                 phys_avail_count++;
 2002                         }
 2003 
 2004                         phys_avail[j] = (kpend & ~PAGE_MASK) +
 2005                             PAGE_SIZE;
 2006                 }
 2007         }
 2008         qsort(phys_avail, 2 * phys_avail_count, sizeof(phys_avail[0]), pa_cmp);
 2009         for (i = 0; i < 2 * phys_avail_count; i++)
 2010                 phys_avail_debug[i] = phys_avail[i];
 2011 
 2012         /* Remove physical available regions marked for removal (~0) */
 2013         if (rm_pavail) {
 2014                 phys_avail_count -= rm_pavail;
 2015                 for (i = 2 * phys_avail_count;
 2016                      i < 2*(phys_avail_count + rm_pavail); i+=2)
 2017                         phys_avail[i] = phys_avail[i + 1] = 0;
 2018         }
 2019         if (bootverbose) {
 2020                 printf("phys_avail ranges after filtering:\n");
 2021                 for (j = 0; j < 2 * phys_avail_count; j+=2)
 2022                         printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n",
 2023                                    j, phys_avail[j], j + 1, phys_avail[j + 1]);
 2024         }
 2025         physmem = btoc(physsz);
 2026 
 2027         /* XXX assume we're running non-virtualized and
 2028          * we don't support BHYVE
 2029          */
 2030         if (isa3_pid_bits == 0)
 2031                 isa3_pid_bits = 20;
 2032         if (powernv_enabled) {
 2033                 parttab_phys =
 2034                     moea64_bootstrap_alloc(PARTTAB_SIZE, PARTTAB_SIZE);
 2035                 validate_addr(parttab_phys, PARTTAB_SIZE);
 2036                 for (int i = 0; i < PARTTAB_SIZE/PAGE_SIZE; i++)
 2037                         pagezero(PHYS_TO_DMAP(parttab_phys + i * PAGE_SIZE));
 2038 
 2039         }
 2040         proctab_size = 1UL << PROCTAB_SIZE_SHIFT;
 2041         proctab0pa = moea64_bootstrap_alloc(proctab_size, proctab_size);
 2042         validate_addr(proctab0pa, proctab_size);
 2043         for (int i = 0; i < proctab_size/PAGE_SIZE; i++)
 2044                 pagezero(PHYS_TO_DMAP(proctab0pa + i * PAGE_SIZE));
 2045 
 2046         mmu_radix_setup_pagetables(hwphyssz);
 2047 }
 2048 
 2049 static void
 2050 mmu_radix_late_bootstrap(vm_offset_t start, vm_offset_t end)
 2051 {
 2052         int             i;
 2053         vm_paddr_t      pa;
 2054         void            *dpcpu;
 2055         vm_offset_t va;
 2056 
 2057         /*
 2058          * Set up the Open Firmware pmap and add its mappings if not in real
 2059          * mode.
 2060          */
 2061         if (bootverbose)
 2062                 printf("%s enter\n", __func__);
 2063 
 2064         /*
 2065          * Calculate the last available physical address, and reserve the
 2066          * vm_page_array (upper bound).
 2067          */
 2068         Maxmem = 0;
 2069         for (i = 0; phys_avail[i + 1] != 0; i += 2)
 2070                 Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1]));
 2071 
 2072         /*
 2073          * Remap any early IO mappings (console framebuffer, etc.)
 2074          */
 2075         bs_remap_earlyboot();
 2076 
 2077         /*
 2078          * Allocate a kernel stack with a guard page for thread0 and map it
 2079          * into the kernel page map.
 2080          */
 2081         pa = allocpages(kstack_pages);
 2082         va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE;
 2083         virtual_avail = va + kstack_pages * PAGE_SIZE;
 2084         CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va);
 2085         thread0.td_kstack = va;
 2086         for (i = 0; i < kstack_pages; i++) {
 2087                 mmu_radix_kenter(va, pa);
 2088                 pa += PAGE_SIZE;
 2089                 va += PAGE_SIZE;
 2090         }
 2091         thread0.td_kstack_pages = kstack_pages;
 2092 
 2093         /*
 2094          * Allocate virtual address space for the message buffer.
 2095          */
 2096         pa = msgbuf_phys = allocpages((msgbufsize + PAGE_MASK)  >> PAGE_SHIFT);
 2097         msgbufp = (struct msgbuf *)PHYS_TO_DMAP(pa);
 2098 
 2099         /*
 2100          * Allocate virtual address space for the dynamic percpu area.
 2101          */
 2102         pa = allocpages(DPCPU_SIZE >> PAGE_SHIFT);
 2103         dpcpu = (void *)PHYS_TO_DMAP(pa);
 2104         dpcpu_init(dpcpu, curcpu);
 2105 
 2106         crashdumpmap = (caddr_t)virtual_avail;
 2107         virtual_avail += MAXDUMPPGS * PAGE_SIZE;
 2108 
 2109         /*
 2110          * Reserve some special page table entries/VA space for temporary
 2111          * mapping of pages.
 2112          */
 2113 }
 2114 
 2115 static void
 2116 mmu_parttab_init(void)
 2117 {
 2118         uint64_t ptcr;
 2119 
 2120         isa3_parttab = (struct pate *)PHYS_TO_DMAP(parttab_phys);
 2121 
 2122         if (bootverbose)
 2123                 printf("%s parttab: %p\n", __func__, isa3_parttab);
 2124         ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12);
 2125         if (bootverbose)
 2126                 printf("setting ptcr %lx\n", ptcr);
 2127         mtspr(SPR_PTCR, ptcr);
 2128 }
 2129 
 2130 static void
 2131 mmu_parttab_update(uint64_t lpid, uint64_t pagetab, uint64_t proctab)
 2132 {
 2133         uint64_t prev;
 2134 
 2135         if (bootverbose)
 2136                 printf("%s isa3_parttab %p lpid %lx pagetab %lx proctab %lx\n", __func__, isa3_parttab,
 2137                            lpid, pagetab, proctab);
 2138         prev = be64toh(isa3_parttab[lpid].pagetab);
 2139         isa3_parttab[lpid].pagetab = htobe64(pagetab);
 2140         isa3_parttab[lpid].proctab = htobe64(proctab);
 2141 
 2142         if (prev & PARTTAB_HR) {
 2143                 __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
 2144                              "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
 2145                 __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
 2146                              "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
 2147         } else {
 2148                 __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
 2149                              "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
 2150         }
 2151         ttusync();
 2152 }
 2153 
 2154 static void
 2155 mmu_radix_parttab_init(void)
 2156 {
 2157         uint64_t pagetab;
 2158 
 2159         mmu_parttab_init();
 2160         pagetab = RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) | \
 2161                          RADIX_PGD_INDEX_SHIFT | PARTTAB_HR;
 2162         mmu_parttab_update(0, pagetab, 0);
 2163 }
 2164 
 2165 static void
 2166 mmu_radix_proctab_register(vm_paddr_t proctabpa, uint64_t table_size)
 2167 {
 2168         uint64_t pagetab, proctab;
 2169 
 2170         pagetab = be64toh(isa3_parttab[0].pagetab);
 2171         proctab = proctabpa | table_size | PARTTAB_GR;
 2172         mmu_parttab_update(0, pagetab, proctab);
 2173 }
 2174 
 2175 static void
 2176 mmu_radix_proctab_init(void)
 2177 {
 2178 
 2179         isa3_base_pid = 1;
 2180 
 2181         isa3_proctab = (void*)PHYS_TO_DMAP(proctab0pa);
 2182         isa3_proctab->proctab0 =
 2183             htobe64(RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) |
 2184                 RADIX_PGD_INDEX_SHIFT);
 2185 
 2186         if (powernv_enabled) {
 2187                 mmu_radix_proctab_register(proctab0pa, PROCTAB_SIZE_SHIFT - 12);
 2188                 __asm __volatile("ptesync" : : : "memory");
 2189                 __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
 2190                              "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
 2191                 __asm __volatile("eieio; tlbsync; ptesync" : : : "memory");
 2192 #ifdef PSERIES
 2193         } else {
 2194                 int64_t rc;
 2195 
 2196                 rc = phyp_hcall(H_REGISTER_PROC_TBL,
 2197                     PROC_TABLE_NEW | PROC_TABLE_RADIX | PROC_TABLE_GTSE,
 2198                     proctab0pa, 0, PROCTAB_SIZE_SHIFT - 12);
 2199                 if (rc != H_SUCCESS)
 2200                         panic("mmu_radix_proctab_init: "
 2201                                 "failed to register process table: rc=%jd",
 2202                                 (intmax_t)rc);
 2203 #endif
 2204         }
 2205 
 2206         if (bootverbose)
 2207                 printf("process table %p and kernel radix PDE: %p\n",
 2208                            isa3_proctab, kernel_pmap->pm_pml1);
 2209         mtmsr(mfmsr() | PSL_DR );
 2210         mtmsr(mfmsr() &  ~PSL_DR);
 2211         kernel_pmap->pm_pid = isa3_base_pid;
 2212         isa3_base_pid++;
 2213 }
 2214 
 2215 void
 2216 mmu_radix_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
 2217     int advice)
 2218 {
 2219         struct rwlock *lock;
 2220         pml1_entry_t *l1e;
 2221         pml2_entry_t *l2e;
 2222         pml3_entry_t oldl3e, *l3e;
 2223         pt_entry_t *pte;
 2224         vm_offset_t va, va_next;
 2225         vm_page_t m;
 2226         bool anychanged;
 2227 
 2228         if (advice != MADV_DONTNEED && advice != MADV_FREE)
 2229                 return;
 2230         anychanged = false;
 2231         PMAP_LOCK(pmap);
 2232         for (; sva < eva; sva = va_next) {
 2233                 l1e = pmap_pml1e(pmap, sva);
 2234                 if ((be64toh(*l1e) & PG_V) == 0) {
 2235                         va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
 2236                         if (va_next < sva)
 2237                                 va_next = eva;
 2238                         continue;
 2239                 }
 2240                 l2e = pmap_l1e_to_l2e(l1e, sva);
 2241                 if ((be64toh(*l2e) & PG_V) == 0) {
 2242                         va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
 2243                         if (va_next < sva)
 2244                                 va_next = eva;
 2245                         continue;
 2246                 }
 2247                 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
 2248                 if (va_next < sva)
 2249                         va_next = eva;
 2250                 l3e = pmap_l2e_to_l3e(l2e, sva);
 2251                 oldl3e = be64toh(*l3e);
 2252                 if ((oldl3e & PG_V) == 0)
 2253                         continue;
 2254                 else if ((oldl3e & RPTE_LEAF) != 0) {
 2255                         if ((oldl3e & PG_MANAGED) == 0)
 2256                                 continue;
 2257                         lock = NULL;
 2258                         if (!pmap_demote_l3e_locked(pmap, l3e, sva, &lock)) {
 2259                                 if (lock != NULL)
 2260                                         rw_wunlock(lock);
 2261 
 2262                                 /*
 2263                                  * The large page mapping was destroyed.
 2264                                  */
 2265                                 continue;
 2266                         }
 2267 
 2268                         /*
 2269                          * Unless the page mappings are wired, remove the
 2270                          * mapping to a single page so that a subsequent
 2271                          * access may repromote.  Choosing the last page
 2272                          * within the address range [sva, min(va_next, eva))
 2273                          * generally results in more repromotions.  Since the
 2274                          * underlying page table page is fully populated, this
 2275                          * removal never frees a page table page.
 2276                          */
 2277                         if ((oldl3e & PG_W) == 0) {
 2278                                 va = eva;
 2279                                 if (va > va_next)
 2280                                         va = va_next;
 2281                                 va -= PAGE_SIZE;
 2282                                 KASSERT(va >= sva,
 2283                                     ("mmu_radix_advise: no address gap"));
 2284                                 pte = pmap_l3e_to_pte(l3e, va);
 2285                                 KASSERT((be64toh(*pte) & PG_V) != 0,
 2286                                     ("pmap_advise: invalid PTE"));
 2287                                 pmap_remove_pte(pmap, pte, va, be64toh(*l3e), NULL,
 2288                                     &lock);
 2289                                 anychanged = true;
 2290                         }
 2291                         if (lock != NULL)
 2292                                 rw_wunlock(lock);
 2293                 }
 2294                 if (va_next > eva)
 2295                         va_next = eva;
 2296                 va = va_next;
 2297                 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next;
 2298                          pte++, sva += PAGE_SIZE) {
 2299                         MPASS(pte == pmap_pte(pmap, sva));
 2300 
 2301                         if ((be64toh(*pte) & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
 2302                                 goto maybe_invlrng;
 2303                         else if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 2304                                 if (advice == MADV_DONTNEED) {
 2305                                         /*
 2306                                          * Future calls to pmap_is_modified()
 2307                                          * can be avoided by making the page
 2308                                          * dirty now.
 2309                                          */
 2310                                         m = PHYS_TO_VM_PAGE(be64toh(*pte) & PG_FRAME);
 2311                                         vm_page_dirty(m);
 2312                                 }
 2313                                 atomic_clear_long(pte, htobe64(PG_M | PG_A));
 2314                         } else if ((be64toh(*pte) & PG_A) != 0)
 2315                                 atomic_clear_long(pte, htobe64(PG_A));
 2316                         else
 2317                                 goto maybe_invlrng;
 2318                         anychanged = true;
 2319                         continue;
 2320 maybe_invlrng:
 2321                         if (va != va_next) {
 2322                                 anychanged = true;
 2323                                 va = va_next;
 2324                         }
 2325                 }
 2326                 if (va != va_next)
 2327                         anychanged = true;
 2328         }
 2329         if (anychanged)
 2330                 pmap_invalidate_all(pmap);
 2331         PMAP_UNLOCK(pmap);
 2332 }
 2333 
 2334 /*
 2335  * Routines used in machine-dependent code
 2336  */
 2337 static void
 2338 mmu_radix_bootstrap(vm_offset_t start, vm_offset_t end)
 2339 {
 2340         uint64_t lpcr;
 2341 
 2342         if (bootverbose)
 2343                 printf("%s\n", __func__);
 2344         hw_direct_map = 1;
 2345         powernv_enabled = (mfmsr() & PSL_HV) ? 1 : 0;
 2346         mmu_radix_early_bootstrap(start, end);
 2347         if (bootverbose)
 2348                 printf("early bootstrap complete\n");
 2349         if (powernv_enabled) {
 2350                 lpcr = mfspr(SPR_LPCR);
 2351                 mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
 2352                 mmu_radix_parttab_init();
 2353                 mmu_radix_init_amor();
 2354                 if (bootverbose)
 2355                         printf("powernv init complete\n");
 2356         }
 2357         mmu_radix_init_iamr();
 2358         mmu_radix_proctab_init();
 2359         mmu_radix_pid_set(kernel_pmap);
 2360         if (powernv_enabled)
 2361                 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL);
 2362         else
 2363                 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_LPID);
 2364 
 2365         mmu_radix_late_bootstrap(start, end);
 2366         numa_mem_regions(&numa_pregions, &numa_pregions_sz);
 2367         if (bootverbose)
 2368                 printf("%s done\n", __func__);
 2369         pmap_bootstrapped = 1;
 2370         dmaplimit = roundup2(powerpc_ptob(Maxmem), L2_PAGE_SIZE);
 2371         PCPU_SET(flags, PCPU_GET(flags) | PC_FLAG_NOSRS);
 2372 }
 2373 
 2374 static void
 2375 mmu_radix_cpu_bootstrap(int ap)
 2376 {
 2377         uint64_t lpcr;
 2378         uint64_t ptcr;
 2379 
 2380         if (powernv_enabled) {
 2381                 lpcr = mfspr(SPR_LPCR);
 2382                 mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
 2383 
 2384                 ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12);
 2385                 mtspr(SPR_PTCR, ptcr);
 2386                 mmu_radix_init_amor();
 2387         }
 2388         mmu_radix_init_iamr();
 2389         mmu_radix_pid_set(kernel_pmap);
 2390         if (powernv_enabled)
 2391                 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL);
 2392         else
 2393                 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_LPID);
 2394 }
 2395 
 2396 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3e, CTLFLAG_RD, 0,
 2397     "2MB page mapping counters");
 2398 
 2399 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_demotions);
 2400 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, demotions, CTLFLAG_RD,
 2401     &pmap_l3e_demotions, "2MB page demotions");
 2402 
 2403 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_mappings);
 2404 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, mappings, CTLFLAG_RD,
 2405     &pmap_l3e_mappings, "2MB page mappings");
 2406 
 2407 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_p_failures);
 2408 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, p_failures, CTLFLAG_RD,
 2409     &pmap_l3e_p_failures, "2MB page promotion failures");
 2410 
 2411 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_promotions);
 2412 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, promotions, CTLFLAG_RD,
 2413     &pmap_l3e_promotions, "2MB page promotions");
 2414 
 2415 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2e, CTLFLAG_RD, 0,
 2416     "1GB page mapping counters");
 2417 
 2418 static COUNTER_U64_DEFINE_EARLY(pmap_l2e_demotions);
 2419 SYSCTL_COUNTER_U64(_vm_pmap_l2e, OID_AUTO, demotions, CTLFLAG_RD,
 2420     &pmap_l2e_demotions, "1GB page demotions");
 2421 
 2422 void
 2423 mmu_radix_clear_modify(vm_page_t m)
 2424 {
 2425         struct md_page *pvh;
 2426         pmap_t pmap;
 2427         pv_entry_t next_pv, pv;
 2428         pml3_entry_t oldl3e, *l3e;
 2429         pt_entry_t oldpte, *pte;
 2430         struct rwlock *lock;
 2431         vm_offset_t va;
 2432         int md_gen, pvh_gen;
 2433 
 2434         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 2435             ("pmap_clear_modify: page %p is not managed", m));
 2436         vm_page_assert_busied(m);
 2437         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
 2438 
 2439         /*
 2440          * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
 2441          * If the object containing the page is locked and the page is not
 2442          * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
 2443          */
 2444         if ((m->a.flags & PGA_WRITEABLE) == 0)
 2445                 return;
 2446         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 2447             pa_to_pvh(VM_PAGE_TO_PHYS(m));
 2448         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 2449         rw_wlock(lock);
 2450 restart:
 2451         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) {
 2452                 pmap = PV_PMAP(pv);
 2453                 if (!PMAP_TRYLOCK(pmap)) {
 2454                         pvh_gen = pvh->pv_gen;
 2455                         rw_wunlock(lock);
 2456                         PMAP_LOCK(pmap);
 2457                         rw_wlock(lock);
 2458                         if (pvh_gen != pvh->pv_gen) {
 2459                                 PMAP_UNLOCK(pmap);
 2460                                 goto restart;
 2461                         }
 2462                 }
 2463                 va = pv->pv_va;
 2464                 l3e = pmap_pml3e(pmap, va);
 2465                 oldl3e = be64toh(*l3e);
 2466                 if ((oldl3e & PG_RW) != 0 &&
 2467                     pmap_demote_l3e_locked(pmap, l3e, va, &lock) &&
 2468                     (oldl3e & PG_W) == 0) {
 2469                         /*
 2470                          * Write protect the mapping to a
 2471                          * single page so that a subsequent
 2472                          * write access may repromote.
 2473                          */
 2474                         va += VM_PAGE_TO_PHYS(m) - (oldl3e &
 2475                             PG_PS_FRAME);
 2476                         pte = pmap_l3e_to_pte(l3e, va);
 2477                         oldpte = be64toh(*pte);
 2478                         while (!atomic_cmpset_long(pte,
 2479                             htobe64(oldpte),
 2480                                 htobe64((oldpte | RPTE_EAA_R) & ~(PG_M | PG_RW))))
 2481                                    oldpte = be64toh(*pte);
 2482                         vm_page_dirty(m);
 2483                         pmap_invalidate_page(pmap, va);
 2484                 }
 2485                 PMAP_UNLOCK(pmap);
 2486         }
 2487         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
 2488                 pmap = PV_PMAP(pv);
 2489                 if (!PMAP_TRYLOCK(pmap)) {
 2490                         md_gen = m->md.pv_gen;
 2491                         pvh_gen = pvh->pv_gen;
 2492                         rw_wunlock(lock);
 2493                         PMAP_LOCK(pmap);
 2494                         rw_wlock(lock);
 2495                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 2496                                 PMAP_UNLOCK(pmap);
 2497                                 goto restart;
 2498                         }
 2499                 }
 2500                 l3e = pmap_pml3e(pmap, pv->pv_va);
 2501                 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_clear_modify: found"
 2502                     " a 2mpage in page %p's pv list", m));
 2503                 pte = pmap_l3e_to_pte(l3e, pv->pv_va);
 2504                 if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 2505                         atomic_clear_long(pte, htobe64(PG_M));
 2506                         pmap_invalidate_page(pmap, pv->pv_va);
 2507                 }
 2508                 PMAP_UNLOCK(pmap);
 2509         }
 2510         rw_wunlock(lock);
 2511 }
 2512 
 2513 void
 2514 mmu_radix_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
 2515     vm_size_t len, vm_offset_t src_addr)
 2516 {
 2517         struct rwlock *lock;
 2518         struct spglist free;
 2519         vm_offset_t addr;
 2520         vm_offset_t end_addr = src_addr + len;
 2521         vm_offset_t va_next;
 2522         vm_page_t dst_pdpg, dstmpte, srcmpte;
 2523         bool invalidate_all;
 2524 
 2525         CTR6(KTR_PMAP,
 2526             "%s(dst_pmap=%p, src_pmap=%p, dst_addr=%lx, len=%lu, src_addr=%lx)\n",
 2527             __func__, dst_pmap, src_pmap, dst_addr, len, src_addr);
 2528 
 2529         if (dst_addr != src_addr)
 2530                 return;
 2531         lock = NULL;
 2532         invalidate_all = false;
 2533         if (dst_pmap < src_pmap) {
 2534                 PMAP_LOCK(dst_pmap);
 2535                 PMAP_LOCK(src_pmap);
 2536         } else {
 2537                 PMAP_LOCK(src_pmap);
 2538                 PMAP_LOCK(dst_pmap);
 2539         }
 2540 
 2541         for (addr = src_addr; addr < end_addr; addr = va_next) {
 2542                 pml1_entry_t *l1e;
 2543                 pml2_entry_t *l2e;
 2544                 pml3_entry_t srcptepaddr, *l3e;
 2545                 pt_entry_t *src_pte, *dst_pte;
 2546 
 2547                 l1e = pmap_pml1e(src_pmap, addr);
 2548                 if ((be64toh(*l1e) & PG_V) == 0) {
 2549                         va_next = (addr + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
 2550                         if (va_next < addr)
 2551                                 va_next = end_addr;
 2552                         continue;
 2553                 }
 2554 
 2555                 l2e = pmap_l1e_to_l2e(l1e, addr);
 2556                 if ((be64toh(*l2e) & PG_V) == 0) {
 2557                         va_next = (addr + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
 2558                         if (va_next < addr)
 2559                                 va_next = end_addr;
 2560                         continue;
 2561                 }
 2562 
 2563                 va_next = (addr + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
 2564                 if (va_next < addr)
 2565                         va_next = end_addr;
 2566 
 2567                 l3e = pmap_l2e_to_l3e(l2e, addr);
 2568                 srcptepaddr = be64toh(*l3e);
 2569                 if (srcptepaddr == 0)
 2570                         continue;
 2571 
 2572                 if (srcptepaddr & RPTE_LEAF) {
 2573                         if ((addr & L3_PAGE_MASK) != 0 ||
 2574                             addr + L3_PAGE_SIZE > end_addr)
 2575                                 continue;
 2576                         dst_pdpg = pmap_allocl3e(dst_pmap, addr, NULL);
 2577                         if (dst_pdpg == NULL)
 2578                                 break;
 2579                         l3e = (pml3_entry_t *)
 2580                             PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg));
 2581                         l3e = &l3e[pmap_pml3e_index(addr)];
 2582                         if (be64toh(*l3e) == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
 2583                             pmap_pv_insert_l3e(dst_pmap, addr, srcptepaddr,
 2584                             PMAP_ENTER_NORECLAIM, &lock))) {
 2585                                 *l3e = htobe64(srcptepaddr & ~PG_W);
 2586                                 pmap_resident_count_inc(dst_pmap,
 2587                                     L3_PAGE_SIZE / PAGE_SIZE);
 2588                                 counter_u64_add(pmap_l3e_mappings, 1);
 2589                         } else
 2590                                 dst_pdpg->ref_count--;
 2591                         continue;
 2592                 }
 2593 
 2594                 srcptepaddr &= PG_FRAME;
 2595                 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
 2596                 KASSERT(srcmpte->ref_count > 0,
 2597                     ("pmap_copy: source page table page is unused"));
 2598 
 2599                 if (va_next > end_addr)
 2600                         va_next = end_addr;
 2601 
 2602                 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
 2603                 src_pte = &src_pte[pmap_pte_index(addr)];
 2604                 dstmpte = NULL;
 2605                 while (addr < va_next) {
 2606                         pt_entry_t ptetemp;
 2607                         ptetemp = be64toh(*src_pte);
 2608                         /*
 2609                          * we only virtual copy managed pages
 2610                          */
 2611                         if ((ptetemp & PG_MANAGED) != 0) {
 2612                                 if (dstmpte != NULL &&
 2613                                     dstmpte->pindex == pmap_l3e_pindex(addr))
 2614                                         dstmpte->ref_count++;
 2615                                 else if ((dstmpte = pmap_allocpte(dst_pmap,
 2616                                     addr, NULL)) == NULL)
 2617                                         goto out;
 2618                                 dst_pte = (pt_entry_t *)
 2619                                     PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
 2620                                 dst_pte = &dst_pte[pmap_pte_index(addr)];
 2621                                 if (be64toh(*dst_pte) == 0 &&
 2622                                     pmap_try_insert_pv_entry(dst_pmap, addr,
 2623                                     PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
 2624                                     &lock)) {
 2625                                         /*
 2626                                          * Clear the wired, modified, and
 2627                                          * accessed (referenced) bits
 2628                                          * during the copy.
 2629                                          */
 2630                                         *dst_pte = htobe64(ptetemp & ~(PG_W | PG_M |
 2631                                             PG_A));
 2632                                         pmap_resident_count_inc(dst_pmap, 1);
 2633                                 } else {
 2634                                         SLIST_INIT(&free);
 2635                                         if (pmap_unwire_ptp(dst_pmap, addr,
 2636                                             dstmpte, &free)) {
 2637                                                 /*
 2638                                                  * Although "addr" is not
 2639                                                  * mapped, paging-structure
 2640                                                  * caches could nonetheless
 2641                                                  * have entries that refer to
 2642                                                  * the freed page table pages.
 2643                                                  * Invalidate those entries.
 2644                                                  */
 2645                                                 invalidate_all = true;
 2646                                                 vm_page_free_pages_toq(&free,
 2647                                                     true);
 2648                                         }
 2649                                         goto out;
 2650                                 }
 2651                                 if (dstmpte->ref_count >= srcmpte->ref_count)
 2652                                         break;
 2653                         }
 2654                         addr += PAGE_SIZE;
 2655                         if (__predict_false((addr & L3_PAGE_MASK) == 0))
 2656                                 src_pte = pmap_pte(src_pmap, addr);
 2657                         else
 2658                                 src_pte++;
 2659                 }
 2660         }
 2661 out:
 2662         if (invalidate_all)
 2663                 pmap_invalidate_all(dst_pmap);
 2664         if (lock != NULL)
 2665                 rw_wunlock(lock);
 2666         PMAP_UNLOCK(src_pmap);
 2667         PMAP_UNLOCK(dst_pmap);
 2668 }
 2669 
 2670 static void
 2671 mmu_radix_copy_page(vm_page_t msrc, vm_page_t mdst)
 2672 {
 2673         vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
 2674         vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
 2675 
 2676         CTR3(KTR_PMAP, "%s(%p, %p)", __func__, src, dst);
 2677         /*
 2678          * XXX slow
 2679          */
 2680         bcopy((void *)src, (void *)dst, PAGE_SIZE);
 2681 }
 2682 
 2683 static void
 2684 mmu_radix_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
 2685     vm_offset_t b_offset, int xfersize)
 2686 {
 2687         void *a_cp, *b_cp;
 2688         vm_offset_t a_pg_offset, b_pg_offset;
 2689         int cnt;
 2690 
 2691         CTR6(KTR_PMAP, "%s(%p, %#x, %p, %#x, %#x)", __func__, ma,
 2692             a_offset, mb, b_offset, xfersize);
 2693         
 2694         while (xfersize > 0) {
 2695                 a_pg_offset = a_offset & PAGE_MASK;
 2696                 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
 2697                 a_cp = (char *)(uintptr_t)PHYS_TO_DMAP(
 2698                     VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])) +
 2699                     a_pg_offset;
 2700                 b_pg_offset = b_offset & PAGE_MASK;
 2701                 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
 2702                 b_cp = (char *)(uintptr_t)PHYS_TO_DMAP(
 2703                     VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])) +
 2704                     b_pg_offset;
 2705                 bcopy(a_cp, b_cp, cnt);
 2706                 a_offset += cnt;
 2707                 b_offset += cnt;
 2708                 xfersize -= cnt;
 2709         }
 2710 }
 2711 
 2712 #if VM_NRESERVLEVEL > 0
 2713 /*
 2714  * Tries to promote the 512, contiguous 4KB page mappings that are within a
 2715  * single page table page (PTP) to a single 2MB page mapping.  For promotion
 2716  * to occur, two conditions must be met: (1) the 4KB page mappings must map
 2717  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
 2718  * identical characteristics.
 2719  */
 2720 static int
 2721 pmap_promote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va,
 2722     struct rwlock **lockp)
 2723 {
 2724         pml3_entry_t newpde;
 2725         pt_entry_t *firstpte, oldpte, pa, *pte;
 2726         vm_page_t mpte;
 2727 
 2728         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 2729 
 2730         /*
 2731          * Examine the first PTE in the specified PTP.  Abort if this PTE is
 2732          * either invalid, unused, or does not map the first 4KB physical page
 2733          * within a 2MB page.
 2734          */
 2735         firstpte = (pt_entry_t *)PHYS_TO_DMAP(be64toh(*pde) & PG_FRAME);
 2736 setpde:
 2737         newpde = be64toh(*firstpte);
 2738         if ((newpde & ((PG_FRAME & L3_PAGE_MASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
 2739                 CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx"
 2740                     " in pmap %p", va, pmap);
 2741                 goto fail;
 2742         }
 2743         if ((newpde & (PG_M | PG_RW)) == PG_RW) {
 2744                 /*
 2745                  * When PG_M is already clear, PG_RW can be cleared without
 2746                  * a TLB invalidation.
 2747                  */
 2748                 if (!atomic_cmpset_long(firstpte, htobe64(newpde), htobe64((newpde | RPTE_EAA_R) & ~RPTE_EAA_W)))
 2749                         goto setpde;
 2750                 newpde &= ~RPTE_EAA_W;
 2751         }
 2752 
 2753         /*
 2754          * Examine each of the other PTEs in the specified PTP.  Abort if this
 2755          * PTE maps an unexpected 4KB physical page or does not have identical
 2756          * characteristics to the first PTE.
 2757          */
 2758         pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + L3_PAGE_SIZE - PAGE_SIZE;
 2759         for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
 2760 setpte:
 2761                 oldpte = be64toh(*pte);
 2762                 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
 2763                         CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx"
 2764                             " in pmap %p", va, pmap);
 2765                         goto fail;
 2766                 }
 2767                 if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
 2768                         /*
 2769                          * When PG_M is already clear, PG_RW can be cleared
 2770                          * without a TLB invalidation.
 2771                          */
 2772                         if (!atomic_cmpset_long(pte, htobe64(oldpte), htobe64((oldpte | RPTE_EAA_R) & ~RPTE_EAA_W)))
 2773                                 goto setpte;
 2774                         oldpte &= ~RPTE_EAA_W;
 2775                         CTR2(KTR_PMAP, "pmap_promote_l3e: protect for va %#lx"
 2776                             " in pmap %p", (oldpte & PG_FRAME & L3_PAGE_MASK) |
 2777                             (va & ~L3_PAGE_MASK), pmap);
 2778                 }
 2779                 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
 2780                         CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx"
 2781                             " in pmap %p", va, pmap);
 2782                         goto fail;
 2783                 }
 2784                 pa -= PAGE_SIZE;
 2785         }
 2786 
 2787         /*
 2788          * Save the page table page in its current state until the PDE
 2789          * mapping the superpage is demoted by pmap_demote_pde() or
 2790          * destroyed by pmap_remove_pde().
 2791          */
 2792         mpte = PHYS_TO_VM_PAGE(be64toh(*pde) & PG_FRAME);
 2793         KASSERT(mpte >= vm_page_array &&
 2794             mpte < &vm_page_array[vm_page_array_size],
 2795             ("pmap_promote_l3e: page table page is out of range"));
 2796         KASSERT(mpte->pindex == pmap_l3e_pindex(va),
 2797             ("pmap_promote_l3e: page table page's pindex is wrong"));
 2798         if (pmap_insert_pt_page(pmap, mpte)) {
 2799                 CTR2(KTR_PMAP,
 2800                     "pmap_promote_l3e: failure for va %#lx in pmap %p", va,
 2801                     pmap);
 2802                 goto fail;
 2803         }
 2804 
 2805         /*
 2806          * Promote the pv entries.
 2807          */
 2808         if ((newpde & PG_MANAGED) != 0)
 2809                 pmap_pv_promote_l3e(pmap, va, newpde & PG_PS_FRAME, lockp);
 2810 
 2811         pte_store(pde, PG_PROMOTED | newpde);
 2812         ptesync();
 2813         counter_u64_add(pmap_l3e_promotions, 1);
 2814         CTR2(KTR_PMAP, "pmap_promote_l3e: success for va %#lx"
 2815             " in pmap %p", va, pmap);
 2816         return (0);
 2817  fail:
 2818         counter_u64_add(pmap_l3e_p_failures, 1);
 2819         return (KERN_FAILURE);
 2820 }
 2821 #endif /* VM_NRESERVLEVEL > 0 */
 2822 
 2823 int
 2824 mmu_radix_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
 2825     vm_prot_t prot, u_int flags, int8_t psind)
 2826 {
 2827         struct rwlock *lock;
 2828         pml3_entry_t *l3e;
 2829         pt_entry_t *pte;
 2830         pt_entry_t newpte, origpte;
 2831         pv_entry_t pv;
 2832         vm_paddr_t opa, pa;
 2833         vm_page_t mpte, om;
 2834         int rv, retrycount;
 2835         boolean_t nosleep, invalidate_all, invalidate_page;
 2836 
 2837         va = trunc_page(va);
 2838         retrycount = 0;
 2839         invalidate_page = invalidate_all = false;
 2840         CTR6(KTR_PMAP, "pmap_enter(%p, %#lx, %p, %#x, %#x, %d)", pmap, va,
 2841             m, prot, flags, psind);
 2842         KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
 2843         KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va),
 2844             ("pmap_enter: managed mapping within the clean submap"));
 2845         if ((m->oflags & VPO_UNMANAGED) == 0)
 2846                 VM_PAGE_OBJECT_BUSY_ASSERT(m);
 2847 
 2848         KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
 2849             ("pmap_enter: flags %u has reserved bits set", flags));
 2850         pa = VM_PAGE_TO_PHYS(m);
 2851         newpte = (pt_entry_t)(pa | PG_A | PG_V | RPTE_LEAF);
 2852         if ((flags & VM_PROT_WRITE) != 0)
 2853                 newpte |= PG_M;
 2854         if ((flags & VM_PROT_READ) != 0)
 2855                 newpte |= PG_A;
 2856         if (prot & VM_PROT_READ)
 2857                 newpte |= RPTE_EAA_R;
 2858         if ((prot & VM_PROT_WRITE) != 0)
 2859                 newpte |= RPTE_EAA_W;
 2860         KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
 2861             ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
 2862 
 2863         if (prot & VM_PROT_EXECUTE)
 2864                 newpte |= PG_X;
 2865         if ((flags & PMAP_ENTER_WIRED) != 0)
 2866                 newpte |= PG_W;
 2867         if (va >= DMAP_MIN_ADDRESS)
 2868                 newpte |= RPTE_EAA_P;
 2869         newpte |= pmap_cache_bits(m->md.mdpg_cache_attrs);
 2870         /*
 2871          * Set modified bit gratuitously for writeable mappings if
 2872          * the page is unmanaged. We do not want to take a fault
 2873          * to do the dirty bit accounting for these mappings.
 2874          */
 2875         if ((m->oflags & VPO_UNMANAGED) != 0) {
 2876                 if ((newpte & PG_RW) != 0)
 2877                         newpte |= PG_M;
 2878         } else
 2879                 newpte |= PG_MANAGED;
 2880 
 2881         lock = NULL;
 2882         PMAP_LOCK(pmap);
 2883         if (psind == 1) {
 2884                 /* Assert the required virtual and physical alignment. */
 2885                 KASSERT((va & L3_PAGE_MASK) == 0, ("pmap_enter: va unaligned"));
 2886                 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
 2887                 rv = pmap_enter_l3e(pmap, va, newpte | RPTE_LEAF, flags, m, &lock);
 2888                 goto out;
 2889         }
 2890         mpte = NULL;
 2891 
 2892         /*
 2893          * In the case that a page table page is not
 2894          * resident, we are creating it here.
 2895          */
 2896 retry:
 2897         l3e = pmap_pml3e(pmap, va);
 2898         if (l3e != NULL && (be64toh(*l3e) & PG_V) != 0 && ((be64toh(*l3e) & RPTE_LEAF) == 0 ||
 2899             pmap_demote_l3e_locked(pmap, l3e, va, &lock))) {
 2900                 pte = pmap_l3e_to_pte(l3e, va);
 2901                 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
 2902                         mpte = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME);
 2903                         mpte->ref_count++;
 2904                 }
 2905         } else if (va < VM_MAXUSER_ADDRESS) {
 2906                 /*
 2907                  * Here if the pte page isn't mapped, or if it has been
 2908                  * deallocated.
 2909                  */
 2910                 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
 2911                 mpte = _pmap_allocpte(pmap, pmap_l3e_pindex(va),
 2912                     nosleep ? NULL : &lock);
 2913                 if (mpte == NULL && nosleep) {
 2914                         rv = KERN_RESOURCE_SHORTAGE;
 2915                         goto out;
 2916                 }
 2917                 if (__predict_false(retrycount++ == 6))
 2918                         panic("too many retries");
 2919                 invalidate_all = true;
 2920                 goto retry;
 2921         } else
 2922                 panic("pmap_enter: invalid page directory va=%#lx", va);
 2923 
 2924         origpte = be64toh(*pte);
 2925         pv = NULL;
 2926 
 2927         /*
 2928          * Is the specified virtual address already mapped?
 2929          */
 2930         if ((origpte & PG_V) != 0) {
 2931 #ifdef INVARIANTS
 2932                 if (VERBOSE_PMAP || pmap_logging) {
 2933                         printf("cow fault pmap_enter(%p, %#lx, %p, %#x, %x, %d) --"
 2934                             " asid=%lu curpid=%d name=%s origpte0x%lx\n",
 2935                             pmap, va, m, prot, flags, psind, pmap->pm_pid,
 2936                             curproc->p_pid, curproc->p_comm, origpte);
 2937 #ifdef DDB
 2938                         pmap_pte_walk(pmap->pm_pml1, va);
 2939 #endif
 2940                 }
 2941 #endif
 2942                 /*
 2943                  * Wiring change, just update stats. We don't worry about
 2944                  * wiring PT pages as they remain resident as long as there
 2945                  * are valid mappings in them. Hence, if a user page is wired,
 2946                  * the PT page will be also.
 2947                  */
 2948                 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
 2949                         pmap->pm_stats.wired_count++;
 2950                 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
 2951                         pmap->pm_stats.wired_count--;
 2952 
 2953                 /*
 2954                  * Remove the extra PT page reference.
 2955                  */
 2956                 if (mpte != NULL) {
 2957                         mpte->ref_count--;
 2958                         KASSERT(mpte->ref_count > 0,
 2959                             ("pmap_enter: missing reference to page table page,"
 2960                              " va: 0x%lx", va));
 2961                 }
 2962 
 2963                 /*
 2964                  * Has the physical page changed?
 2965                  */
 2966                 opa = origpte & PG_FRAME;
 2967                 if (opa == pa) {
 2968                         /*
 2969                          * No, might be a protection or wiring change.
 2970                          */
 2971                         if ((origpte & PG_MANAGED) != 0 &&
 2972                             (newpte & PG_RW) != 0)
 2973                                 vm_page_aflag_set(m, PGA_WRITEABLE);
 2974                         if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) {
 2975                                 if ((newpte & (PG_A|PG_M)) != (origpte & (PG_A|PG_M))) {
 2976                                         if (!atomic_cmpset_long(pte, htobe64(origpte), htobe64(newpte)))
 2977                                                 goto retry;
 2978                                         if ((newpte & PG_M) != (origpte & PG_M))
 2979                                                 vm_page_dirty(m);
 2980                                         if ((newpte & PG_A) != (origpte & PG_A))
 2981                                                 vm_page_aflag_set(m, PGA_REFERENCED);
 2982                                         ptesync();
 2983                                 } else
 2984                                         invalidate_all = true;
 2985                                 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
 2986                                         goto unchanged;
 2987                         }
 2988                         goto validate;
 2989                 }
 2990 
 2991                 /*
 2992                  * The physical page has changed.  Temporarily invalidate
 2993                  * the mapping.  This ensures that all threads sharing the
 2994                  * pmap keep a consistent view of the mapping, which is
 2995                  * necessary for the correct handling of COW faults.  It
 2996                  * also permits reuse of the old mapping's PV entry,
 2997                  * avoiding an allocation.
 2998                  *
 2999                  * For consistency, handle unmanaged mappings the same way.
 3000                  */
 3001                 origpte = be64toh(pte_load_clear(pte));
 3002                 KASSERT((origpte & PG_FRAME) == opa,
 3003                     ("pmap_enter: unexpected pa update for %#lx", va));
 3004                 if ((origpte & PG_MANAGED) != 0) {
 3005                         om = PHYS_TO_VM_PAGE(opa);
 3006 
 3007                         /*
 3008                          * The pmap lock is sufficient to synchronize with
 3009                          * concurrent calls to pmap_page_test_mappings() and
 3010                          * pmap_ts_referenced().
 3011                          */
 3012                         if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 3013                                 vm_page_dirty(om);
 3014                         if ((origpte & PG_A) != 0)
 3015                                 vm_page_aflag_set(om, PGA_REFERENCED);
 3016                         CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
 3017                         pv = pmap_pvh_remove(&om->md, pmap, va);
 3018                         if ((newpte & PG_MANAGED) == 0)
 3019                                 free_pv_entry(pmap, pv);
 3020 #ifdef INVARIANTS
 3021                         else if (origpte & PG_MANAGED) {
 3022                                 if (pv == NULL) {
 3023 #ifdef DDB
 3024                                         pmap_page_print_mappings(om);
 3025 #endif
 3026                                         MPASS(pv != NULL);
 3027                                 }
 3028                         }
 3029 #endif
 3030                         if ((om->a.flags & PGA_WRITEABLE) != 0 &&
 3031                             TAILQ_EMPTY(&om->md.pv_list) &&
 3032                             ((om->flags & PG_FICTITIOUS) != 0 ||
 3033                             TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
 3034                                 vm_page_aflag_clear(om, PGA_WRITEABLE);
 3035                 }
 3036                 if ((origpte & PG_A) != 0)
 3037                         invalidate_page = true;
 3038                 origpte = 0;
 3039         } else {
 3040                 if (pmap != kernel_pmap) {
 3041 #ifdef INVARIANTS
 3042                         if (VERBOSE_PMAP || pmap_logging)
 3043                                 printf("pmap_enter(%p, %#lx, %p, %#x, %x, %d) -- asid=%lu curpid=%d name=%s\n",
 3044                                     pmap, va, m, prot, flags, psind,
 3045                                     pmap->pm_pid, curproc->p_pid,
 3046                                     curproc->p_comm);
 3047 #endif
 3048                 }
 3049 
 3050                 /*
 3051                  * Increment the counters.
 3052                  */
 3053                 if ((newpte & PG_W) != 0)
 3054                         pmap->pm_stats.wired_count++;
 3055                 pmap_resident_count_inc(pmap, 1);
 3056         }
 3057 
 3058         /*
 3059          * Enter on the PV list if part of our managed memory.
 3060          */
 3061         if ((newpte & PG_MANAGED) != 0) {
 3062                 if (pv == NULL) {
 3063                         pv = get_pv_entry(pmap, &lock);
 3064                         pv->pv_va = va;
 3065                 }
 3066 #ifdef VERBOSE_PV
 3067                 else
 3068                         printf("reassigning pv: %p to pmap: %p\n",
 3069                                    pv, pmap);
 3070 #endif
 3071                 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
 3072                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
 3073                 m->md.pv_gen++;
 3074                 if ((newpte & PG_RW) != 0)
 3075                         vm_page_aflag_set(m, PGA_WRITEABLE);
 3076         }
 3077 
 3078         /*
 3079          * Update the PTE.
 3080          */
 3081         if ((origpte & PG_V) != 0) {
 3082 validate:
 3083                 origpte = be64toh(pte_load_store(pte, htobe64(newpte)));
 3084                 KASSERT((origpte & PG_FRAME) == pa,
 3085                     ("pmap_enter: unexpected pa update for %#lx", va));
 3086                 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) ==
 3087                     (PG_M | PG_RW)) {
 3088                         if ((origpte & PG_MANAGED) != 0)
 3089                                 vm_page_dirty(m);
 3090                         invalidate_page = true;
 3091 
 3092                         /*
 3093                          * Although the PTE may still have PG_RW set, TLB
 3094                          * invalidation may nonetheless be required because
 3095                          * the PTE no longer has PG_M set.
 3096                          */
 3097                 } else if ((origpte & PG_X) != 0 || (newpte & PG_X) == 0) {
 3098                         /*
 3099                          * Removing capabilities requires invalidation on POWER
 3100                          */
 3101                         invalidate_page = true;
 3102                         goto unchanged;
 3103                 }
 3104                 if ((origpte & PG_A) != 0)
 3105                         invalidate_page = true;
 3106         } else {
 3107                 pte_store(pte, newpte);
 3108                 ptesync();
 3109         }
 3110 unchanged:
 3111 
 3112 #if VM_NRESERVLEVEL > 0
 3113         /*
 3114          * If both the page table page and the reservation are fully
 3115          * populated, then attempt promotion.
 3116          */
 3117         if ((mpte == NULL || mpte->ref_count == NPTEPG) &&
 3118             mmu_radix_ps_enabled(pmap) &&
 3119             (m->flags & PG_FICTITIOUS) == 0 &&
 3120             vm_reserv_level_iffullpop(m) == 0 &&
 3121                 pmap_promote_l3e(pmap, l3e, va, &lock) == 0)
 3122                 invalidate_all = true;
 3123 #endif
 3124         if (invalidate_all)
 3125                 pmap_invalidate_all(pmap);
 3126         else if (invalidate_page)
 3127                 pmap_invalidate_page(pmap, va);
 3128 
 3129         rv = KERN_SUCCESS;
 3130 out:
 3131         if (lock != NULL)
 3132                 rw_wunlock(lock);
 3133         PMAP_UNLOCK(pmap);
 3134 
 3135         return (rv);
 3136 }
 3137 
 3138 /*
 3139  * Tries to create a read- and/or execute-only 2MB page mapping.  Returns true
 3140  * if successful.  Returns false if (1) a page table page cannot be allocated
 3141  * without sleeping, (2) a mapping already exists at the specified virtual
 3142  * address, or (3) a PV entry cannot be allocated without reclaiming another
 3143  * PV entry.
 3144  */
 3145 static bool
 3146 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
 3147     struct rwlock **lockp)
 3148 {
 3149         pml3_entry_t newpde;
 3150 
 3151         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3152         newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs) |
 3153             RPTE_LEAF | PG_V;
 3154         if ((m->oflags & VPO_UNMANAGED) == 0)
 3155                 newpde |= PG_MANAGED;
 3156         if (prot & VM_PROT_EXECUTE)
 3157                 newpde |= PG_X;
 3158         if (prot & VM_PROT_READ)
 3159                 newpde |= RPTE_EAA_R;
 3160         if (va >= DMAP_MIN_ADDRESS)
 3161                 newpde |= RPTE_EAA_P;
 3162         return (pmap_enter_l3e(pmap, va, newpde, PMAP_ENTER_NOSLEEP |
 3163             PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) ==
 3164             KERN_SUCCESS);
 3165 }
 3166 
 3167 /*
 3168  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
 3169  * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
 3170  * otherwise.  Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
 3171  * a mapping already exists at the specified virtual address.  Returns
 3172  * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
 3173  * page allocation failed.  Returns KERN_RESOURCE_SHORTAGE if
 3174  * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
 3175  *
 3176  * The parameter "m" is only used when creating a managed, writeable mapping.
 3177  */
 3178 static int
 3179 pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, u_int flags,
 3180     vm_page_t m, struct rwlock **lockp)
 3181 {
 3182         struct spglist free;
 3183         pml3_entry_t oldl3e, *l3e;
 3184         vm_page_t mt, pdpg;
 3185 
 3186         KASSERT((newpde & (PG_M | PG_RW)) != PG_RW,
 3187             ("pmap_enter_pde: newpde is missing PG_M"));
 3188         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3189 
 3190         if ((pdpg = pmap_allocl3e(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
 3191             NULL : lockp)) == NULL) {
 3192                 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 3193                     " in pmap %p", va, pmap);
 3194                 return (KERN_RESOURCE_SHORTAGE);
 3195         }
 3196         l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
 3197         l3e = &l3e[pmap_pml3e_index(va)];
 3198         oldl3e = be64toh(*l3e);
 3199         if ((oldl3e & PG_V) != 0) {
 3200                 KASSERT(pdpg->ref_count > 1,
 3201                     ("pmap_enter_pde: pdpg's wire count is too low"));
 3202                 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
 3203                         pdpg->ref_count--;
 3204                         CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 3205                             " in pmap %p", va, pmap);
 3206                         return (KERN_FAILURE);
 3207                 }
 3208                 /* Break the existing mapping(s). */
 3209                 SLIST_INIT(&free);
 3210                 if ((oldl3e & RPTE_LEAF) != 0) {
 3211                         /*
 3212                          * The reference to the PD page that was acquired by
 3213                          * pmap_allocl3e() ensures that it won't be freed.
 3214                          * However, if the PDE resulted from a promotion, then
 3215                          * a reserved PT page could be freed.
 3216                          */
 3217                         (void)pmap_remove_l3e(pmap, l3e, va, &free, lockp);
 3218                         pmap_invalidate_l3e_page(pmap, va, oldl3e);
 3219                 } else {
 3220                         if (pmap_remove_ptes(pmap, va, va + L3_PAGE_SIZE, l3e,
 3221                             &free, lockp))
 3222                                pmap_invalidate_all(pmap);
 3223                 }
 3224                 vm_page_free_pages_toq(&free, true);
 3225                 if (va >= VM_MAXUSER_ADDRESS) {
 3226                         mt = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME);
 3227                         if (pmap_insert_pt_page(pmap, mt)) {
 3228                                 /*
 3229                                  * XXX Currently, this can't happen because
 3230                                  * we do not perform pmap_enter(psind == 1)
 3231                                  * on the kernel pmap.
 3232                                  */
 3233                                 panic("pmap_enter_pde: trie insert failed");
 3234                         }
 3235                 } else
 3236                         KASSERT(be64toh(*l3e) == 0, ("pmap_enter_pde: non-zero pde %p",
 3237                             l3e));
 3238         }
 3239         if ((newpde & PG_MANAGED) != 0) {
 3240                 /*
 3241                  * Abort this mapping if its PV entry could not be created.
 3242                  */
 3243                 if (!pmap_pv_insert_l3e(pmap, va, newpde, flags, lockp)) {
 3244                         SLIST_INIT(&free);
 3245                         if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
 3246                                 /*
 3247                                  * Although "va" is not mapped, paging-
 3248                                  * structure caches could nonetheless have
 3249                                  * entries that refer to the freed page table
 3250                                  * pages.  Invalidate those entries.
 3251                                  */
 3252                                 pmap_invalidate_page(pmap, va);
 3253                                 vm_page_free_pages_toq(&free, true);
 3254                         }
 3255                         CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 3256                             " in pmap %p", va, pmap);
 3257                         return (KERN_RESOURCE_SHORTAGE);
 3258                 }
 3259                 if ((newpde & PG_RW) != 0) {
 3260                         for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++)
 3261                                 vm_page_aflag_set(mt, PGA_WRITEABLE);
 3262                 }
 3263         }
 3264 
 3265         /*
 3266          * Increment counters.
 3267          */
 3268         if ((newpde & PG_W) != 0)
 3269                 pmap->pm_stats.wired_count += L3_PAGE_SIZE / PAGE_SIZE;
 3270         pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE);
 3271 
 3272         /*
 3273          * Map the superpage.  (This is not a promoted mapping; there will not
 3274          * be any lingering 4KB page mappings in the TLB.)
 3275          */
 3276         pte_store(l3e, newpde);
 3277         ptesync();
 3278 
 3279         counter_u64_add(pmap_l3e_mappings, 1);
 3280         CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
 3281             " in pmap %p", va, pmap);
 3282         return (KERN_SUCCESS);
 3283 }
 3284 
 3285 void
 3286 mmu_radix_enter_object(pmap_t pmap, vm_offset_t start,
 3287     vm_offset_t end, vm_page_t m_start, vm_prot_t prot)
 3288 {
 3289 
 3290         struct rwlock *lock;
 3291         vm_offset_t va;
 3292         vm_page_t m, mpte;
 3293         vm_pindex_t diff, psize;
 3294         bool invalidate;
 3295         VM_OBJECT_ASSERT_LOCKED(m_start->object);
 3296 
 3297         CTR6(KTR_PMAP, "%s(%p, %#x, %#x, %p, %#x)", __func__, pmap, start,
 3298             end, m_start, prot);
 3299 
 3300         invalidate = false;
 3301         psize = atop(end - start);
 3302         mpte = NULL;
 3303         m = m_start;
 3304         lock = NULL;
 3305         PMAP_LOCK(pmap);
 3306         while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
 3307                 va = start + ptoa(diff);
 3308                 if ((va & L3_PAGE_MASK) == 0 && va + L3_PAGE_SIZE <= end &&
 3309                     m->psind == 1 && mmu_radix_ps_enabled(pmap) &&
 3310                     pmap_enter_2mpage(pmap, va, m, prot, &lock))
 3311                         m = &m[L3_PAGE_SIZE / PAGE_SIZE - 1];
 3312                 else
 3313                         mpte = mmu_radix_enter_quick_locked(pmap, va, m, prot,
 3314                             mpte, &lock, &invalidate);
 3315                 m = TAILQ_NEXT(m, listq);
 3316         }
 3317         ptesync();
 3318         if (lock != NULL)
 3319                 rw_wunlock(lock);
 3320         if (invalidate)
 3321                 pmap_invalidate_all(pmap);
 3322         PMAP_UNLOCK(pmap);
 3323 }
 3324 
 3325 static vm_page_t
 3326 mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
 3327     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate)
 3328 {
 3329         struct spglist free;
 3330         pt_entry_t *pte;
 3331         vm_paddr_t pa;
 3332 
 3333         KASSERT(!VA_IS_CLEANMAP(va) ||
 3334             (m->oflags & VPO_UNMANAGED) != 0,
 3335             ("mmu_radix_enter_quick_locked: managed mapping within the clean submap"));
 3336         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3337 
 3338         /*
 3339          * In the case that a page table page is not
 3340          * resident, we are creating it here.
 3341          */
 3342         if (va < VM_MAXUSER_ADDRESS) {
 3343                 vm_pindex_t ptepindex;
 3344                 pml3_entry_t *ptepa;
 3345 
 3346                 /*
 3347                  * Calculate pagetable page index
 3348                  */
 3349                 ptepindex = pmap_l3e_pindex(va);
 3350                 if (mpte && (mpte->pindex == ptepindex)) {
 3351                         mpte->ref_count++;
 3352                 } else {
 3353                         /*
 3354                          * Get the page directory entry
 3355                          */
 3356                         ptepa = pmap_pml3e(pmap, va);
 3357 
 3358                         /*
 3359                          * If the page table page is mapped, we just increment
 3360                          * the hold count, and activate it.  Otherwise, we
 3361                          * attempt to allocate a page table page.  If this
 3362                          * attempt fails, we don't retry.  Instead, we give up.
 3363                          */
 3364                         if (ptepa && (be64toh(*ptepa) & PG_V) != 0) {
 3365                                 if (be64toh(*ptepa) & RPTE_LEAF)
 3366                                         return (NULL);
 3367                                 mpte = PHYS_TO_VM_PAGE(be64toh(*ptepa) & PG_FRAME);
 3368                                 mpte->ref_count++;
 3369                         } else {
 3370                                 /*
 3371                                  * Pass NULL instead of the PV list lock
 3372                                  * pointer, because we don't intend to sleep.
 3373                                  */
 3374                                 mpte = _pmap_allocpte(pmap, ptepindex, NULL);
 3375                                 if (mpte == NULL)
 3376                                         return (mpte);
 3377                         }
 3378                 }
 3379                 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
 3380                 pte = &pte[pmap_pte_index(va)];
 3381         } else {
 3382                 mpte = NULL;
 3383                 pte = pmap_pte(pmap, va);
 3384         }
 3385         if (be64toh(*pte)) {
 3386                 if (mpte != NULL) {
 3387                         mpte->ref_count--;
 3388                         mpte = NULL;
 3389                 }
 3390                 return (mpte);
 3391         }
 3392 
 3393         /*
 3394          * Enter on the PV list if part of our managed memory.
 3395          */
 3396         if ((m->oflags & VPO_UNMANAGED) == 0 &&
 3397             !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
 3398                 if (mpte != NULL) {
 3399                         SLIST_INIT(&free);
 3400                         if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
 3401                                 /*
 3402                                  * Although "va" is not mapped, paging-
 3403                                  * structure caches could nonetheless have
 3404                                  * entries that refer to the freed page table
 3405                                  * pages.  Invalidate those entries.
 3406                                  */
 3407                                 *invalidate = true;
 3408                                 vm_page_free_pages_toq(&free, true);
 3409                         }
 3410                         mpte = NULL;
 3411                 }
 3412                 return (mpte);
 3413         }
 3414 
 3415         /*
 3416          * Increment counters
 3417          */
 3418         pmap_resident_count_inc(pmap, 1);
 3419 
 3420         pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs);
 3421         if (prot & VM_PROT_EXECUTE)
 3422                 pa |= PG_X;
 3423         else
 3424                 pa |= RPTE_EAA_R;
 3425         if ((m->oflags & VPO_UNMANAGED) == 0)
 3426                 pa |= PG_MANAGED;
 3427 
 3428         pte_store(pte, pa);
 3429         return (mpte);
 3430 }
 3431 
 3432 void
 3433 mmu_radix_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m,
 3434     vm_prot_t prot)
 3435 {
 3436         struct rwlock *lock;
 3437         bool invalidate;
 3438 
 3439         lock = NULL;
 3440         invalidate = false;
 3441         PMAP_LOCK(pmap);
 3442         mmu_radix_enter_quick_locked(pmap, va, m, prot, NULL, &lock,
 3443             &invalidate);
 3444         ptesync();
 3445         if (lock != NULL)
 3446                 rw_wunlock(lock);
 3447         if (invalidate)
 3448                 pmap_invalidate_all(pmap);
 3449         PMAP_UNLOCK(pmap);
 3450 }
 3451 
 3452 vm_paddr_t
 3453 mmu_radix_extract(pmap_t pmap, vm_offset_t va)
 3454 {
 3455         pml3_entry_t *l3e;
 3456         pt_entry_t *pte;
 3457         vm_paddr_t pa;
 3458 
 3459         l3e = pmap_pml3e(pmap, va);
 3460         if (__predict_false(l3e == NULL))
 3461                 return (0);
 3462         if (be64toh(*l3e) & RPTE_LEAF) {
 3463                 pa = (be64toh(*l3e) & PG_PS_FRAME) | (va & L3_PAGE_MASK);
 3464                 pa |= (va & L3_PAGE_MASK);
 3465         } else {
 3466                 /*
 3467                  * Beware of a concurrent promotion that changes the
 3468                  * PDE at this point!  For example, vtopte() must not
 3469                  * be used to access the PTE because it would use the
 3470                  * new PDE.  It is, however, safe to use the old PDE
 3471                  * because the page table page is preserved by the
 3472                  * promotion.
 3473                  */
 3474                 pte = pmap_l3e_to_pte(l3e, va);
 3475                 if (__predict_false(pte == NULL))
 3476                         return (0);
 3477                 pa = be64toh(*pte);
 3478                 pa = (pa & PG_FRAME) | (va & PAGE_MASK);
 3479                 pa |= (va & PAGE_MASK);
 3480         }
 3481         return (pa);
 3482 }
 3483 
 3484 vm_page_t
 3485 mmu_radix_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 3486 {
 3487         pml3_entry_t l3e, *l3ep;
 3488         pt_entry_t pte;
 3489         vm_page_t m;
 3490 
 3491         m = NULL;
 3492         CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, va, prot);
 3493         PMAP_LOCK(pmap);
 3494         l3ep = pmap_pml3e(pmap, va);
 3495         if (l3ep != NULL && (l3e = be64toh(*l3ep))) {
 3496                 if (l3e & RPTE_LEAF) {
 3497                         if ((l3e & PG_RW) || (prot & VM_PROT_WRITE) == 0)
 3498                                 m = PHYS_TO_VM_PAGE((l3e & PG_PS_FRAME) |
 3499                                     (va & L3_PAGE_MASK));
 3500                 } else {
 3501                         /* Native endian PTE, do not pass to pmap functions */
 3502                         pte = be64toh(*pmap_l3e_to_pte(l3ep, va));
 3503                         if ((pte & PG_V) &&
 3504                             ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0))
 3505                                 m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
 3506                 }
 3507                 if (m != NULL && !vm_page_wire_mapped(m))
 3508                         m = NULL;
 3509         }
 3510         PMAP_UNLOCK(pmap);
 3511         return (m);
 3512 }
 3513 
 3514 static void
 3515 mmu_radix_growkernel(vm_offset_t addr)
 3516 {
 3517         vm_paddr_t paddr;
 3518         vm_page_t nkpg;
 3519         pml3_entry_t *l3e;
 3520         pml2_entry_t *l2e;
 3521 
 3522         CTR2(KTR_PMAP, "%s(%#x)", __func__, addr);
 3523         if (VM_MIN_KERNEL_ADDRESS < addr &&
 3524                 addr < (VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE))
 3525                 return;
 3526 
 3527         addr = roundup2(addr, L3_PAGE_SIZE);
 3528         if (addr - 1 >= vm_map_max(kernel_map))
 3529                 addr = vm_map_max(kernel_map);
 3530         while (kernel_vm_end < addr) {
 3531                 l2e = pmap_pml2e(kernel_pmap, kernel_vm_end);
 3532                 if ((be64toh(*l2e) & PG_V) == 0) {
 3533                         /* We need a new PDP entry */
 3534                         nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
 3535                             VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 3536                         if (nkpg == NULL)
 3537                                 panic("pmap_growkernel: no memory to grow kernel");
 3538                         nkpg->pindex = kernel_vm_end >> L2_PAGE_SIZE_SHIFT;
 3539                         paddr = VM_PAGE_TO_PHYS(nkpg);
 3540                         pde_store(l2e, paddr);
 3541                         continue; /* try again */
 3542                 }
 3543                 l3e = pmap_l2e_to_l3e(l2e, kernel_vm_end);
 3544                 if ((be64toh(*l3e) & PG_V) != 0) {
 3545                         kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
 3546                         if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
 3547                                 kernel_vm_end = vm_map_max(kernel_map);
 3548                                 break;
 3549                         }
 3550                         continue;
 3551                 }
 3552 
 3553                 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
 3554                     VM_ALLOC_ZERO);
 3555                 if (nkpg == NULL)
 3556                         panic("pmap_growkernel: no memory to grow kernel");
 3557                 nkpg->pindex = pmap_l3e_pindex(kernel_vm_end);
 3558                 paddr = VM_PAGE_TO_PHYS(nkpg);
 3559                 pde_store(l3e, paddr);
 3560 
 3561                 kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
 3562                 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
 3563                         kernel_vm_end = vm_map_max(kernel_map);
 3564                         break;
 3565                 }
 3566         }
 3567         ptesync();
 3568 }
 3569 
 3570 static MALLOC_DEFINE(M_RADIX_PGD, "radix_pgd", "radix page table root directory");
 3571 static uma_zone_t zone_radix_pgd;
 3572 
 3573 static int
 3574 radix_pgd_import(void *arg __unused, void **store, int count, int domain __unused,
 3575     int flags)
 3576 {
 3577         int req;
 3578 
 3579         req = VM_ALLOC_WIRED | malloc2vm_flags(flags);
 3580         for (int i = 0; i < count; i++) {
 3581                 vm_page_t m = vm_page_alloc_noobj_contig(req,
 3582                     RADIX_PGD_SIZE / PAGE_SIZE,
 3583                     0, (vm_paddr_t)-1, RADIX_PGD_SIZE, L1_PAGE_SIZE,
 3584                     VM_MEMATTR_DEFAULT);
 3585                 store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 3586         }
 3587         return (count);
 3588 }
 3589 
 3590 static void
 3591 radix_pgd_release(void *arg __unused, void **store, int count)
 3592 {
 3593         vm_page_t m;
 3594         struct spglist free;
 3595         int page_count;
 3596 
 3597         SLIST_INIT(&free);
 3598         page_count = RADIX_PGD_SIZE/PAGE_SIZE;
 3599 
 3600         for (int i = 0; i < count; i++) {
 3601                 /*
 3602                  * XXX selectively remove dmap and KVA entries so we don't
 3603                  * need to bzero
 3604                  */
 3605                 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i]));
 3606                 for (int j = page_count-1; j >= 0; j--) {
 3607                         vm_page_unwire_noq(&m[j]);
 3608                         SLIST_INSERT_HEAD(&free, &m[j], plinks.s.ss);
 3609                 }
 3610                 vm_page_free_pages_toq(&free, false);
 3611         }
 3612 }
 3613 
 3614 static void
 3615 mmu_radix_init(void)
 3616 {
 3617         vm_page_t mpte;
 3618         vm_size_t s;
 3619         int error, i, pv_npg;
 3620 
 3621         /* XXX is this really needed for POWER? */
 3622         /* L1TF, reserve page @0 unconditionally */
 3623         vm_page_blacklist_add(0, bootverbose);
 3624 
 3625         zone_radix_pgd = uma_zcache_create("radix_pgd_cache",
 3626                 RADIX_PGD_SIZE, NULL, NULL,
 3627 #ifdef INVARIANTS
 3628             trash_init, trash_fini,
 3629 #else
 3630             NULL, NULL,
 3631 #endif
 3632                 radix_pgd_import, radix_pgd_release,
 3633                 NULL, UMA_ZONE_NOBUCKET);
 3634 
 3635         /*
 3636          * Initialize the vm page array entries for the kernel pmap's
 3637          * page table pages.
 3638          */
 3639         PMAP_LOCK(kernel_pmap);
 3640         for (i = 0; i < nkpt; i++) {
 3641                 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
 3642                 KASSERT(mpte >= vm_page_array &&
 3643                     mpte < &vm_page_array[vm_page_array_size],
 3644                     ("pmap_init: page table page is out of range size: %lu",
 3645                      vm_page_array_size));
 3646                 mpte->pindex = pmap_l3e_pindex(VM_MIN_KERNEL_ADDRESS) + i;
 3647                 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
 3648                 MPASS(PHYS_TO_VM_PAGE(mpte->phys_addr) == mpte);
 3649                 //pmap_insert_pt_page(kernel_pmap, mpte);
 3650                 mpte->ref_count = 1;
 3651         }
 3652         PMAP_UNLOCK(kernel_pmap);
 3653         vm_wire_add(nkpt);
 3654 
 3655         CTR1(KTR_PMAP, "%s()", __func__);
 3656         TAILQ_INIT(&pv_dummy.pv_list);
 3657 
 3658         /*
 3659          * Are large page mappings enabled?
 3660          */
 3661         TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
 3662         if (superpages_enabled) {
 3663                 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
 3664                     ("pmap_init: can't assign to pagesizes[1]"));
 3665                 pagesizes[1] = L3_PAGE_SIZE;
 3666         }
 3667 
 3668         /*
 3669          * Initialize the pv chunk list mutex.
 3670          */
 3671         mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
 3672 
 3673         /*
 3674          * Initialize the pool of pv list locks.
 3675          */
 3676         for (i = 0; i < NPV_LIST_LOCKS; i++)
 3677                 rw_init(&pv_list_locks[i], "pmap pv list");
 3678 
 3679         /*
 3680          * Calculate the size of the pv head table for superpages.
 3681          */
 3682         pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L3_PAGE_SIZE);
 3683 
 3684         /*
 3685          * Allocate memory for the pv head table for superpages.
 3686          */
 3687         s = (vm_size_t)(pv_npg * sizeof(struct md_page));
 3688         s = round_page(s);
 3689         pv_table = kmem_malloc(s, M_WAITOK | M_ZERO);
 3690         for (i = 0; i < pv_npg; i++)
 3691                 TAILQ_INIT(&pv_table[i].pv_list);
 3692         TAILQ_INIT(&pv_dummy.pv_list);
 3693 
 3694         pmap_initialized = 1;
 3695         mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
 3696         error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
 3697             (vmem_addr_t *)&qframe);
 3698 
 3699         if (error != 0)
 3700                 panic("qframe allocation failed");
 3701         asid_arena = vmem_create("ASID", isa3_base_pid + 1, (1<<isa3_pid_bits),
 3702             1, 1, M_WAITOK);
 3703 }
 3704 
 3705 static boolean_t
 3706 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
 3707 {
 3708         struct rwlock *lock;
 3709         pv_entry_t pv;
 3710         struct md_page *pvh;
 3711         pt_entry_t *pte, mask;
 3712         pmap_t pmap;
 3713         int md_gen, pvh_gen;
 3714         boolean_t rv;
 3715 
 3716         rv = FALSE;
 3717         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 3718         rw_rlock(lock);
 3719 restart:
 3720         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
 3721                 pmap = PV_PMAP(pv);
 3722                 if (!PMAP_TRYLOCK(pmap)) {
 3723                         md_gen = m->md.pv_gen;
 3724                         rw_runlock(lock);
 3725                         PMAP_LOCK(pmap);
 3726                         rw_rlock(lock);
 3727                         if (md_gen != m->md.pv_gen) {
 3728                                 PMAP_UNLOCK(pmap);
 3729                                 goto restart;
 3730                         }
 3731                 }
 3732                 pte = pmap_pte(pmap, pv->pv_va);
 3733                 mask = 0;
 3734                 if (modified)
 3735                         mask |= PG_RW | PG_M;
 3736                 if (accessed)
 3737                         mask |= PG_V | PG_A;
 3738                 rv = (be64toh(*pte) & mask) == mask;
 3739                 PMAP_UNLOCK(pmap);
 3740                 if (rv)
 3741                         goto out;
 3742         }
 3743         if ((m->flags & PG_FICTITIOUS) == 0) {
 3744                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 3745                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
 3746                         pmap = PV_PMAP(pv);
 3747                         if (!PMAP_TRYLOCK(pmap)) {
 3748                                 md_gen = m->md.pv_gen;
 3749                                 pvh_gen = pvh->pv_gen;
 3750                                 rw_runlock(lock);
 3751                                 PMAP_LOCK(pmap);
 3752                                 rw_rlock(lock);
 3753                                 if (md_gen != m->md.pv_gen ||
 3754                                     pvh_gen != pvh->pv_gen) {
 3755                                         PMAP_UNLOCK(pmap);
 3756                                         goto restart;
 3757                                 }
 3758                         }
 3759                         pte = pmap_pml3e(pmap, pv->pv_va);
 3760                         mask = 0;
 3761                         if (modified)
 3762                                 mask |= PG_RW | PG_M;
 3763                         if (accessed)
 3764                                 mask |= PG_V | PG_A;
 3765                         rv = (be64toh(*pte) & mask) == mask;
 3766                         PMAP_UNLOCK(pmap);
 3767                         if (rv)
 3768                                 goto out;
 3769                 }
 3770         }
 3771 out:
 3772         rw_runlock(lock);
 3773         return (rv);
 3774 }
 3775 
 3776 /*
 3777  *      pmap_is_modified:
 3778  *
 3779  *      Return whether or not the specified physical page was modified
 3780  *      in any physical maps.
 3781  */
 3782 boolean_t
 3783 mmu_radix_is_modified(vm_page_t m)
 3784 {
 3785 
 3786         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 3787             ("pmap_is_modified: page %p is not managed", m));
 3788 
 3789         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
 3790         /*
 3791          * If the page is not busied then this check is racy.
 3792          */
 3793         if (!pmap_page_is_write_mapped(m))
 3794                 return (FALSE);
 3795         return (pmap_page_test_mappings(m, FALSE, TRUE));
 3796 }
 3797 
 3798 boolean_t
 3799 mmu_radix_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 3800 {
 3801         pml3_entry_t *l3e;
 3802         pt_entry_t *pte;
 3803         boolean_t rv;
 3804 
 3805         CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr);
 3806         rv = FALSE;
 3807         PMAP_LOCK(pmap);
 3808         l3e = pmap_pml3e(pmap, addr);
 3809         if (l3e != NULL && (be64toh(*l3e) & (RPTE_LEAF | PG_V)) == PG_V) {
 3810                 pte = pmap_l3e_to_pte(l3e, addr);
 3811                 rv = (be64toh(*pte) & PG_V) == 0;
 3812         }
 3813         PMAP_UNLOCK(pmap);
 3814         return (rv);
 3815 }
 3816 
 3817 boolean_t
 3818 mmu_radix_is_referenced(vm_page_t m)
 3819 {
 3820         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 3821             ("pmap_is_referenced: page %p is not managed", m));
 3822         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
 3823         return (pmap_page_test_mappings(m, TRUE, FALSE));
 3824 }
 3825 
 3826 /*
 3827  *      pmap_ts_referenced:
 3828  *
 3829  *      Return a count of reference bits for a page, clearing those bits.
 3830  *      It is not necessary for every reference bit to be cleared, but it
 3831  *      is necessary that 0 only be returned when there are truly no
 3832  *      reference bits set.
 3833  *
 3834  *      As an optimization, update the page's dirty field if a modified bit is
 3835  *      found while counting reference bits.  This opportunistic update can be
 3836  *      performed at low cost and can eliminate the need for some future calls
 3837  *      to pmap_is_modified().  However, since this function stops after
 3838  *      finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
 3839  *      dirty pages.  Those dirty pages will only be detected by a future call
 3840  *      to pmap_is_modified().
 3841  *
 3842  *      A DI block is not needed within this function, because
 3843  *      invalidations are performed before the PV list lock is
 3844  *      released.
 3845  */
 3846 boolean_t
 3847 mmu_radix_ts_referenced(vm_page_t m)
 3848 {
 3849         struct md_page *pvh;
 3850         pv_entry_t pv, pvf;
 3851         pmap_t pmap;
 3852         struct rwlock *lock;
 3853         pml3_entry_t oldl3e, *l3e;
 3854         pt_entry_t *pte;
 3855         vm_paddr_t pa;
 3856         int cleared, md_gen, not_cleared, pvh_gen;
 3857         struct spglist free;
 3858 
 3859         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
 3860         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 3861             ("pmap_ts_referenced: page %p is not managed", m));
 3862         SLIST_INIT(&free);
 3863         cleared = 0;
 3864         pa = VM_PAGE_TO_PHYS(m);
 3865         lock = PHYS_TO_PV_LIST_LOCK(pa);
 3866         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
 3867         rw_wlock(lock);
 3868 retry:
 3869         not_cleared = 0;
 3870         if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
 3871                 goto small_mappings;
 3872         pv = pvf;
 3873         do {
 3874                 if (pvf == NULL)
 3875                         pvf = pv;
 3876                 pmap = PV_PMAP(pv);
 3877                 if (!PMAP_TRYLOCK(pmap)) {
 3878                         pvh_gen = pvh->pv_gen;
 3879                         rw_wunlock(lock);
 3880                         PMAP_LOCK(pmap);
 3881                         rw_wlock(lock);
 3882                         if (pvh_gen != pvh->pv_gen) {
 3883                                 PMAP_UNLOCK(pmap);
 3884                                 goto retry;
 3885                         }
 3886                 }
 3887                 l3e = pmap_pml3e(pmap, pv->pv_va);
 3888                 oldl3e = be64toh(*l3e);
 3889                 if ((oldl3e & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 3890                         /*
 3891                          * Although "oldpde" is mapping a 2MB page, because
 3892                          * this function is called at a 4KB page granularity,
 3893                          * we only update the 4KB page under test.
 3894                          */
 3895                         vm_page_dirty(m);
 3896                 }
 3897                 if ((oldl3e & PG_A) != 0) {
 3898                         /*
 3899                          * Since this reference bit is shared by 512 4KB
 3900                          * pages, it should not be cleared every time it is
 3901                          * tested.  Apply a simple "hash" function on the
 3902                          * physical page number, the virtual superpage number,
 3903                          * and the pmap address to select one 4KB page out of
 3904                          * the 512 on which testing the reference bit will
 3905                          * result in clearing that reference bit.  This
 3906                          * function is designed to avoid the selection of the
 3907                          * same 4KB page for every 2MB page mapping.
 3908                          *
 3909                          * On demotion, a mapping that hasn't been referenced
 3910                          * is simply destroyed.  To avoid the possibility of a
 3911                          * subsequent page fault on a demoted wired mapping,
 3912                          * always leave its reference bit set.  Moreover,
 3913                          * since the superpage is wired, the current state of
 3914                          * its reference bit won't affect page replacement.
 3915                          */
 3916                         if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L3_PAGE_SIZE_SHIFT) ^
 3917                             (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
 3918                             (oldl3e & PG_W) == 0) {
 3919                                 atomic_clear_long(l3e, htobe64(PG_A));
 3920                                 pmap_invalidate_page(pmap, pv->pv_va);
 3921                                 cleared++;
 3922                                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 3923                                     ("inconsistent pv lock %p %p for page %p",
 3924                                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 3925                         } else
 3926                                 not_cleared++;
 3927                 }
 3928                 PMAP_UNLOCK(pmap);
 3929                 /* Rotate the PV list if it has more than one entry. */
 3930                 if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) {
 3931                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
 3932                         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
 3933                         pvh->pv_gen++;
 3934                 }
 3935                 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
 3936                         goto out;
 3937         } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
 3938 small_mappings:
 3939         if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
 3940                 goto out;
 3941         pv = pvf;
 3942         do {
 3943                 if (pvf == NULL)
 3944                         pvf = pv;
 3945                 pmap = PV_PMAP(pv);
 3946                 if (!PMAP_TRYLOCK(pmap)) {
 3947                         pvh_gen = pvh->pv_gen;
 3948                         md_gen = m->md.pv_gen;
 3949                         rw_wunlock(lock);
 3950                         PMAP_LOCK(pmap);
 3951                         rw_wlock(lock);
 3952                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 3953                                 PMAP_UNLOCK(pmap);
 3954                                 goto retry;
 3955                         }
 3956                 }
 3957                 l3e = pmap_pml3e(pmap, pv->pv_va);
 3958                 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0,
 3959                     ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
 3960                     m));
 3961                 pte = pmap_l3e_to_pte(l3e, pv->pv_va);
 3962                 if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW))
 3963                         vm_page_dirty(m);
 3964                 if ((be64toh(*pte) & PG_A) != 0) {
 3965                         atomic_clear_long(pte, htobe64(PG_A));
 3966                         pmap_invalidate_page(pmap, pv->pv_va);
 3967                         cleared++;
 3968                 }
 3969                 PMAP_UNLOCK(pmap);
 3970                 /* Rotate the PV list if it has more than one entry. */
 3971                 if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) {
 3972                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
 3973                         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
 3974                         m->md.pv_gen++;
 3975                 }
 3976         } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
 3977             not_cleared < PMAP_TS_REFERENCED_MAX);
 3978 out:
 3979         rw_wunlock(lock);
 3980         vm_page_free_pages_toq(&free, true);
 3981         return (cleared + not_cleared);
 3982 }
 3983 
 3984 static vm_offset_t
 3985 mmu_radix_map(vm_offset_t *virt __unused, vm_paddr_t start,
 3986     vm_paddr_t end, int prot __unused)
 3987 {
 3988 
 3989         CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, virt, start, end,
 3990                  prot);
 3991         return (PHYS_TO_DMAP(start));
 3992 }
 3993 
 3994 void
 3995 mmu_radix_object_init_pt(pmap_t pmap, vm_offset_t addr,
 3996     vm_object_t object, vm_pindex_t pindex, vm_size_t size)
 3997 {
 3998         pml3_entry_t *l3e;
 3999         vm_paddr_t pa, ptepa;
 4000         vm_page_t p, pdpg;
 4001         vm_memattr_t ma;
 4002 
 4003         CTR6(KTR_PMAP, "%s(%p, %#x, %p, %u, %#x)", __func__, pmap, addr,
 4004             object, pindex, size);
 4005         VM_OBJECT_ASSERT_WLOCKED(object);
 4006         KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
 4007                         ("pmap_object_init_pt: non-device object"));
 4008         /* NB: size can be logically ored with addr here */
 4009         if ((addr & L3_PAGE_MASK) == 0 && (size & L3_PAGE_MASK) == 0) {
 4010                 if (!mmu_radix_ps_enabled(pmap))
 4011                         return;
 4012                 if (!vm_object_populate(object, pindex, pindex + atop(size)))
 4013                         return;
 4014                 p = vm_page_lookup(object, pindex);
 4015                 KASSERT(p->valid == VM_PAGE_BITS_ALL,
 4016                     ("pmap_object_init_pt: invalid page %p", p));
 4017                 ma = p->md.mdpg_cache_attrs;
 4018 
 4019                 /*
 4020                  * Abort the mapping if the first page is not physically
 4021                  * aligned to a 2MB page boundary.
 4022                  */
 4023                 ptepa = VM_PAGE_TO_PHYS(p);
 4024                 if (ptepa & L3_PAGE_MASK)
 4025                         return;
 4026 
 4027                 /*
 4028                  * Skip the first page.  Abort the mapping if the rest of
 4029                  * the pages are not physically contiguous or have differing
 4030                  * memory attributes.
 4031                  */
 4032                 p = TAILQ_NEXT(p, listq);
 4033                 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
 4034                     pa += PAGE_SIZE) {
 4035                         KASSERT(p->valid == VM_PAGE_BITS_ALL,
 4036                             ("pmap_object_init_pt: invalid page %p", p));
 4037                         if (pa != VM_PAGE_TO_PHYS(p) ||
 4038                             ma != p->md.mdpg_cache_attrs)
 4039                                 return;
 4040                         p = TAILQ_NEXT(p, listq);
 4041                 }
 4042 
 4043                 PMAP_LOCK(pmap);
 4044                 for (pa = ptepa | pmap_cache_bits(ma);
 4045                     pa < ptepa + size; pa += L3_PAGE_SIZE) {
 4046                         pdpg = pmap_allocl3e(pmap, addr, NULL);
 4047                         if (pdpg == NULL) {
 4048                                 /*
 4049                                  * The creation of mappings below is only an
 4050                                  * optimization.  If a page directory page
 4051                                  * cannot be allocated without blocking,
 4052                                  * continue on to the next mapping rather than
 4053                                  * blocking.
 4054                                  */
 4055                                 addr += L3_PAGE_SIZE;
 4056                                 continue;
 4057                         }
 4058                         l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
 4059                         l3e = &l3e[pmap_pml3e_index(addr)];
 4060                         if ((be64toh(*l3e) & PG_V) == 0) {
 4061                                 pa |= PG_M | PG_A | PG_RW;
 4062                                 pte_store(l3e, pa);
 4063                                 pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE);
 4064                                 counter_u64_add(pmap_l3e_mappings, 1);
 4065                         } else {
 4066                                 /* Continue on if the PDE is already valid. */
 4067                                 pdpg->ref_count--;
 4068                                 KASSERT(pdpg->ref_count > 0,
 4069                                     ("pmap_object_init_pt: missing reference "
 4070                                     "to page directory page, va: 0x%lx", addr));
 4071                         }
 4072                         addr += L3_PAGE_SIZE;
 4073                 }
 4074                 ptesync();
 4075                 PMAP_UNLOCK(pmap);
 4076         }
 4077 }
 4078 
 4079 boolean_t
 4080 mmu_radix_page_exists_quick(pmap_t pmap, vm_page_t m)
 4081 {
 4082         struct md_page *pvh;
 4083         struct rwlock *lock;
 4084         pv_entry_t pv;
 4085         int loops = 0;
 4086         boolean_t rv;
 4087 
 4088         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 4089             ("pmap_page_exists_quick: page %p is not managed", m));
 4090         CTR3(KTR_PMAP, "%s(%p, %p)", __func__, pmap, m);
 4091         rv = FALSE;
 4092         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 4093         rw_rlock(lock);
 4094         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
 4095                 if (PV_PMAP(pv) == pmap) {
 4096                         rv = TRUE;
 4097                         break;
 4098                 }
 4099                 loops++;
 4100                 if (loops >= 16)
 4101                         break;
 4102         }
 4103         if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
 4104                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 4105                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
 4106                         if (PV_PMAP(pv) == pmap) {
 4107                                 rv = TRUE;
 4108                                 break;
 4109                         }
 4110                         loops++;
 4111                         if (loops >= 16)
 4112                                 break;
 4113                 }
 4114         }
 4115         rw_runlock(lock);
 4116         return (rv);
 4117 }
 4118 
 4119 void
 4120 mmu_radix_page_init(vm_page_t m)
 4121 {
 4122 
 4123         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
 4124         TAILQ_INIT(&m->md.pv_list);
 4125         m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT;
 4126 }
 4127 
 4128 int
 4129 mmu_radix_page_wired_mappings(vm_page_t m)
 4130 {
 4131         struct rwlock *lock;
 4132         struct md_page *pvh;
 4133         pmap_t pmap;
 4134         pt_entry_t *pte;
 4135         pv_entry_t pv;
 4136         int count, md_gen, pvh_gen;
 4137 
 4138         if ((m->oflags & VPO_UNMANAGED) != 0)
 4139                 return (0);
 4140         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
 4141         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 4142         rw_rlock(lock);
 4143 restart:
 4144         count = 0;
 4145         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
 4146                 pmap = PV_PMAP(pv);
 4147                 if (!PMAP_TRYLOCK(pmap)) {
 4148                         md_gen = m->md.pv_gen;
 4149                         rw_runlock(lock);
 4150                         PMAP_LOCK(pmap);
 4151                         rw_rlock(lock);
 4152                         if (md_gen != m->md.pv_gen) {
 4153                                 PMAP_UNLOCK(pmap);
 4154                                 goto restart;
 4155                         }
 4156                 }
 4157                 pte = pmap_pte(pmap, pv->pv_va);
 4158                 if ((be64toh(*pte) & PG_W) != 0)
 4159                         count++;
 4160                 PMAP_UNLOCK(pmap);
 4161         }
 4162         if ((m->flags & PG_FICTITIOUS) == 0) {
 4163                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 4164                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
 4165                         pmap = PV_PMAP(pv);
 4166                         if (!PMAP_TRYLOCK(pmap)) {
 4167                                 md_gen = m->md.pv_gen;
 4168                                 pvh_gen = pvh->pv_gen;
 4169                                 rw_runlock(lock);
 4170                                 PMAP_LOCK(pmap);
 4171                                 rw_rlock(lock);
 4172                                 if (md_gen != m->md.pv_gen ||
 4173                                     pvh_gen != pvh->pv_gen) {
 4174                                         PMAP_UNLOCK(pmap);
 4175                                         goto restart;
 4176                                 }
 4177                         }
 4178                         pte = pmap_pml3e(pmap, pv->pv_va);
 4179                         if ((be64toh(*pte) & PG_W) != 0)
 4180                                 count++;
 4181                         PMAP_UNLOCK(pmap);
 4182                 }
 4183         }
 4184         rw_runlock(lock);
 4185         return (count);
 4186 }
 4187 
 4188 static void
 4189 mmu_radix_update_proctab(int pid, pml1_entry_t l1pa)
 4190 {
 4191         isa3_proctab[pid].proctab0 = htobe64(RTS_SIZE |  l1pa | RADIX_PGD_INDEX_SHIFT);
 4192 }
 4193 
 4194 int
 4195 mmu_radix_pinit(pmap_t pmap)
 4196 {
 4197         vmem_addr_t pid;
 4198         vm_paddr_t l1pa;
 4199 
 4200         CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
 4201 
 4202         /*
 4203          * allocate the page directory page
 4204          */
 4205         pmap->pm_pml1 = uma_zalloc(zone_radix_pgd, M_WAITOK);
 4206 
 4207         for (int j = 0; j <  RADIX_PGD_SIZE_SHIFT; j++)
 4208                 pagezero((vm_offset_t)pmap->pm_pml1 + j * PAGE_SIZE);
 4209         vm_radix_init(&pmap->pm_radix);
 4210         TAILQ_INIT(&pmap->pm_pvchunk);
 4211         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 4212         pmap->pm_flags = PMAP_PDE_SUPERPAGE;
 4213         vmem_alloc(asid_arena, 1, M_FIRSTFIT|M_WAITOK, &pid);
 4214 
 4215         pmap->pm_pid = pid;
 4216         l1pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml1);
 4217         mmu_radix_update_proctab(pid, l1pa);
 4218         __asm __volatile("ptesync;isync" : : : "memory");
 4219 
 4220         return (1);
 4221 }
 4222 
 4223 /*
 4224  * This routine is called if the desired page table page does not exist.
 4225  *
 4226  * If page table page allocation fails, this routine may sleep before
 4227  * returning NULL.  It sleeps only if a lock pointer was given.
 4228  *
 4229  * Note: If a page allocation fails at page table level two or three,
 4230  * one or two pages may be held during the wait, only to be released
 4231  * afterwards.  This conservative approach is easily argued to avoid
 4232  * race conditions.
 4233  */
 4234 static vm_page_t
 4235 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
 4236 {
 4237         vm_page_t m, pdppg, pdpg;
 4238 
 4239         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4240 
 4241         /*
 4242          * Allocate a page table page.
 4243          */
 4244         if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
 4245                 if (lockp != NULL) {
 4246                         RELEASE_PV_LIST_LOCK(lockp);
 4247                         PMAP_UNLOCK(pmap);
 4248                         vm_wait(NULL);
 4249                         PMAP_LOCK(pmap);
 4250                 }
 4251                 /*
 4252                  * Indicate the need to retry.  While waiting, the page table
 4253                  * page may have been allocated.
 4254                  */
 4255                 return (NULL);
 4256         }
 4257         m->pindex = ptepindex;
 4258 
 4259         /*
 4260          * Map the pagetable page into the process address space, if
 4261          * it isn't already there.
 4262          */
 4263 
 4264         if (ptepindex >= (NUPDE + NUPDPE)) {
 4265                 pml1_entry_t *l1e;
 4266                 vm_pindex_t pml1index;
 4267 
 4268                 /* Wire up a new PDPE page */
 4269                 pml1index = ptepindex - (NUPDE + NUPDPE);
 4270                 l1e = &pmap->pm_pml1[pml1index];
 4271                 KASSERT((be64toh(*l1e) & PG_V) == 0,
 4272                     ("%s: L1 entry %#lx is valid", __func__, *l1e));
 4273                 pde_store(l1e, VM_PAGE_TO_PHYS(m));
 4274         } else if (ptepindex >= NUPDE) {
 4275                 vm_pindex_t pml1index;
 4276                 vm_pindex_t pdpindex;
 4277                 pml1_entry_t *l1e;
 4278                 pml2_entry_t *l2e;
 4279 
 4280                 /* Wire up a new l2e page */
 4281                 pdpindex = ptepindex - NUPDE;
 4282                 pml1index = pdpindex >> RPTE_SHIFT;
 4283 
 4284                 l1e = &pmap->pm_pml1[pml1index];
 4285                 if ((be64toh(*l1e) & PG_V) == 0) {
 4286                         /* Have to allocate a new pdp, recurse */
 4287                         if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml1index,
 4288                                 lockp) == NULL) {
 4289                                 vm_page_unwire_noq(m);
 4290                                 vm_page_free_zero(m);
 4291                                 return (NULL);
 4292                         }
 4293                 } else {
 4294                         /* Add reference to l2e page */
 4295                         pdppg = PHYS_TO_VM_PAGE(be64toh(*l1e) & PG_FRAME);
 4296                         pdppg->ref_count++;
 4297                 }
 4298                 l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME);
 4299 
 4300                 /* Now find the pdp page */
 4301                 l2e = &l2e[pdpindex & RPTE_MASK];
 4302                 KASSERT((be64toh(*l2e) & PG_V) == 0,
 4303                     ("%s: L2 entry %#lx is valid", __func__, *l2e));
 4304                 pde_store(l2e, VM_PAGE_TO_PHYS(m));
 4305         } else {
 4306                 vm_pindex_t pml1index;
 4307                 vm_pindex_t pdpindex;
 4308                 pml1_entry_t *l1e;
 4309                 pml2_entry_t *l2e;
 4310                 pml3_entry_t *l3e;
 4311 
 4312                 /* Wire up a new PTE page */
 4313                 pdpindex = ptepindex >> RPTE_SHIFT;
 4314                 pml1index = pdpindex >> RPTE_SHIFT;
 4315 
 4316                 /* First, find the pdp and check that its valid. */
 4317                 l1e = &pmap->pm_pml1[pml1index];
 4318                 if ((be64toh(*l1e) & PG_V) == 0) {
 4319                         /* Have to allocate a new pd, recurse */
 4320                         if (_pmap_allocpte(pmap, NUPDE + pdpindex,
 4321                             lockp) == NULL) {
 4322                                 vm_page_unwire_noq(m);
 4323                                 vm_page_free_zero(m);
 4324                                 return (NULL);
 4325                         }
 4326                         l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME);
 4327                         l2e = &l2e[pdpindex & RPTE_MASK];
 4328                 } else {
 4329                         l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME);
 4330                         l2e = &l2e[pdpindex & RPTE_MASK];
 4331                         if ((be64toh(*l2e) & PG_V) == 0) {
 4332                                 /* Have to allocate a new pd, recurse */
 4333                                 if (_pmap_allocpte(pmap, NUPDE + pdpindex,
 4334                                     lockp) == NULL) {
 4335                                         vm_page_unwire_noq(m);
 4336                                         vm_page_free_zero(m);
 4337                                         return (NULL);
 4338                                 }
 4339                         } else {
 4340                                 /* Add reference to the pd page */
 4341                                 pdpg = PHYS_TO_VM_PAGE(be64toh(*l2e) & PG_FRAME);
 4342                                 pdpg->ref_count++;
 4343                         }
 4344                 }
 4345                 l3e = (pml3_entry_t *)PHYS_TO_DMAP(be64toh(*l2e) & PG_FRAME);
 4346 
 4347                 /* Now we know where the page directory page is */
 4348                 l3e = &l3e[ptepindex & RPTE_MASK];
 4349                 KASSERT((be64toh(*l3e) & PG_V) == 0,
 4350                     ("%s: L3 entry %#lx is valid", __func__, *l3e));
 4351                 pde_store(l3e, VM_PAGE_TO_PHYS(m));
 4352         }
 4353 
 4354         pmap_resident_count_inc(pmap, 1);
 4355         return (m);
 4356 }
 4357 static vm_page_t
 4358 pmap_allocl3e(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 4359 {
 4360         vm_pindex_t pdpindex, ptepindex;
 4361         pml2_entry_t *pdpe;
 4362         vm_page_t pdpg;
 4363 
 4364 retry:
 4365         pdpe = pmap_pml2e(pmap, va);
 4366         if (pdpe != NULL && (be64toh(*pdpe) & PG_V) != 0) {
 4367                 /* Add a reference to the pd page. */
 4368                 pdpg = PHYS_TO_VM_PAGE(be64toh(*pdpe) & PG_FRAME);
 4369                 pdpg->ref_count++;
 4370         } else {
 4371                 /* Allocate a pd page. */
 4372                 ptepindex = pmap_l3e_pindex(va);
 4373                 pdpindex = ptepindex >> RPTE_SHIFT;
 4374                 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
 4375                 if (pdpg == NULL && lockp != NULL)
 4376                         goto retry;
 4377         }
 4378         return (pdpg);
 4379 }
 4380 
 4381 static vm_page_t
 4382 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 4383 {
 4384         vm_pindex_t ptepindex;
 4385         pml3_entry_t *pd;
 4386         vm_page_t m;
 4387 
 4388         /*
 4389          * Calculate pagetable page index
 4390          */
 4391         ptepindex = pmap_l3e_pindex(va);
 4392 retry:
 4393         /*
 4394          * Get the page directory entry
 4395          */
 4396         pd = pmap_pml3e(pmap, va);
 4397 
 4398         /*
 4399          * This supports switching from a 2MB page to a
 4400          * normal 4K page.
 4401          */
 4402         if (pd != NULL && (be64toh(*pd) & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V)) {
 4403                 if (!pmap_demote_l3e_locked(pmap, pd, va, lockp)) {
 4404                         /*
 4405                          * Invalidation of the 2MB page mapping may have caused
 4406                          * the deallocation of the underlying PD page.
 4407                          */
 4408                         pd = NULL;
 4409                 }
 4410         }
 4411 
 4412         /*
 4413          * If the page table page is mapped, we just increment the
 4414          * hold count, and activate it.
 4415          */
 4416         if (pd != NULL && (be64toh(*pd) & PG_V) != 0) {
 4417                 m = PHYS_TO_VM_PAGE(be64toh(*pd) & PG_FRAME);
 4418                 m->ref_count++;
 4419         } else {
 4420                 /*
 4421                  * Here if the pte page isn't mapped, or if it has been
 4422                  * deallocated.
 4423                  */
 4424                 m = _pmap_allocpte(pmap, ptepindex, lockp);
 4425                 if (m == NULL && lockp != NULL)
 4426                         goto retry;
 4427         }
 4428         return (m);
 4429 }
 4430 
 4431 static void
 4432 mmu_radix_pinit0(pmap_t pmap)
 4433 {
 4434 
 4435         CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
 4436         PMAP_LOCK_INIT(pmap);
 4437         pmap->pm_pml1 = kernel_pmap->pm_pml1;
 4438         pmap->pm_pid = kernel_pmap->pm_pid;
 4439 
 4440         vm_radix_init(&pmap->pm_radix);
 4441         TAILQ_INIT(&pmap->pm_pvchunk);
 4442         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 4443         kernel_pmap->pm_flags =
 4444                 pmap->pm_flags = PMAP_PDE_SUPERPAGE;
 4445 }
 4446 /*
 4447  * pmap_protect_l3e: do the things to protect a 2mpage in a process
 4448  */
 4449 static boolean_t
 4450 pmap_protect_l3e(pmap_t pmap, pt_entry_t *l3e, vm_offset_t sva, vm_prot_t prot)
 4451 {
 4452         pt_entry_t newpde, oldpde;
 4453         vm_offset_t eva, va;
 4454         vm_page_t m;
 4455         boolean_t anychanged;
 4456 
 4457         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4458         KASSERT((sva & L3_PAGE_MASK) == 0,
 4459             ("pmap_protect_l3e: sva is not 2mpage aligned"));
 4460         anychanged = FALSE;
 4461 retry:
 4462         oldpde = newpde = be64toh(*l3e);
 4463         if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
 4464             (PG_MANAGED | PG_M | PG_RW)) {
 4465                 eva = sva + L3_PAGE_SIZE;
 4466                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 4467                     va < eva; va += PAGE_SIZE, m++)
 4468                         vm_page_dirty(m);
 4469         }
 4470         if ((prot & VM_PROT_WRITE) == 0) {
 4471                 newpde &= ~(PG_RW | PG_M);
 4472                 newpde |= RPTE_EAA_R;
 4473         }
 4474         if (prot & VM_PROT_EXECUTE)
 4475                 newpde |= PG_X;
 4476         if (newpde != oldpde) {
 4477                 /*
 4478                  * As an optimization to future operations on this PDE, clear
 4479                  * PG_PROMOTED.  The impending invalidation will remove any
 4480                  * lingering 4KB page mappings from the TLB.
 4481                  */
 4482                 if (!atomic_cmpset_long(l3e, htobe64(oldpde), htobe64(newpde & ~PG_PROMOTED)))
 4483                         goto retry;
 4484                 anychanged = TRUE;
 4485         }
 4486         return (anychanged);
 4487 }
 4488 
 4489 void
 4490 mmu_radix_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
 4491     vm_prot_t prot)
 4492 {
 4493         vm_offset_t va_next;
 4494         pml1_entry_t *l1e;
 4495         pml2_entry_t *l2e;
 4496         pml3_entry_t ptpaddr, *l3e;
 4497         pt_entry_t *pte;
 4498         boolean_t anychanged;
 4499 
 4500         CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, pmap, sva, eva,
 4501             prot);
 4502 
 4503         KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
 4504         if (prot == VM_PROT_NONE) {
 4505                 mmu_radix_remove(pmap, sva, eva);
 4506                 return;
 4507         }
 4508 
 4509         if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
 4510             (VM_PROT_WRITE|VM_PROT_EXECUTE))
 4511                 return;
 4512 
 4513 #ifdef INVARIANTS
 4514         if (VERBOSE_PROTECT || pmap_logging)
 4515                 printf("pmap_protect(%p, %#lx, %#lx, %x) - asid: %lu\n",
 4516                            pmap, sva, eva, prot, pmap->pm_pid);
 4517 #endif
 4518         anychanged = FALSE;
 4519 
 4520         PMAP_LOCK(pmap);
 4521         for (; sva < eva; sva = va_next) {
 4522                 l1e = pmap_pml1e(pmap, sva);
 4523                 if ((be64toh(*l1e) & PG_V) == 0) {
 4524                         va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
 4525                         if (va_next < sva)
 4526                                 va_next = eva;
 4527                         continue;
 4528                 }
 4529 
 4530                 l2e = pmap_l1e_to_l2e(l1e, sva);
 4531                 if ((be64toh(*l2e) & PG_V) == 0) {
 4532                         va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
 4533                         if (va_next < sva)
 4534                                 va_next = eva;
 4535                         continue;
 4536                 }
 4537 
 4538                 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
 4539                 if (va_next < sva)
 4540                         va_next = eva;
 4541 
 4542                 l3e = pmap_l2e_to_l3e(l2e, sva);
 4543                 ptpaddr = be64toh(*l3e);
 4544 
 4545                 /*
 4546                  * Weed out invalid mappings.
 4547                  */
 4548                 if (ptpaddr == 0)
 4549                         continue;
 4550 
 4551                 /*
 4552                  * Check for large page.
 4553                  */
 4554                 if ((ptpaddr & RPTE_LEAF) != 0) {
 4555                         /*
 4556                          * Are we protecting the entire large page?  If not,
 4557                          * demote the mapping and fall through.
 4558                          */
 4559                         if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) {
 4560                                 if (pmap_protect_l3e(pmap, l3e, sva, prot))
 4561                                         anychanged = TRUE;
 4562                                 continue;
 4563                         } else if (!pmap_demote_l3e(pmap, l3e, sva)) {
 4564                                 /*
 4565                                  * The large page mapping was destroyed.
 4566                                  */
 4567                                 continue;
 4568                         }
 4569                 }
 4570 
 4571                 if (va_next > eva)
 4572                         va_next = eva;
 4573 
 4574                 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++,
 4575                     sva += PAGE_SIZE) {
 4576                         pt_entry_t obits, pbits;
 4577                         vm_page_t m;
 4578 
 4579 retry:
 4580                         MPASS(pte == pmap_pte(pmap, sva));
 4581                         obits = pbits = be64toh(*pte);
 4582                         if ((pbits & PG_V) == 0)
 4583                                 continue;
 4584 
 4585                         if ((prot & VM_PROT_WRITE) == 0) {
 4586                                 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
 4587                                     (PG_MANAGED | PG_M | PG_RW)) {
 4588                                         m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
 4589                                         vm_page_dirty(m);
 4590                                 }
 4591                                 pbits &= ~(PG_RW | PG_M);
 4592                                 pbits |= RPTE_EAA_R;
 4593                         }
 4594                         if (prot & VM_PROT_EXECUTE)
 4595                                 pbits |= PG_X;
 4596 
 4597                         if (pbits != obits) {
 4598                                 if (!atomic_cmpset_long(pte, htobe64(obits), htobe64(pbits)))
 4599                                         goto retry;
 4600                                 if (obits & (PG_A|PG_M)) {
 4601                                         anychanged = TRUE;
 4602 #ifdef INVARIANTS
 4603                                         if (VERBOSE_PROTECT || pmap_logging)
 4604                                                 printf("%#lx %#lx -> %#lx\n",
 4605                                                     sva, obits, pbits);
 4606 #endif
 4607                                 }
 4608                         }
 4609                 }
 4610         }
 4611         if (anychanged)
 4612                 pmap_invalidate_all(pmap);
 4613         PMAP_UNLOCK(pmap);
 4614 }
 4615 
 4616 void
 4617 mmu_radix_qenter(vm_offset_t sva, vm_page_t *ma, int count)
 4618 {
 4619 
 4620         CTR4(KTR_PMAP, "%s(%#x, %p, %d)", __func__, sva, ma, count);
 4621         pt_entry_t oldpte, pa, *pte;
 4622         vm_page_t m;
 4623         uint64_t cache_bits, attr_bits;
 4624         vm_offset_t va;
 4625 
 4626         oldpte = 0;
 4627         attr_bits = RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A;
 4628         va = sva;
 4629         pte = kvtopte(va);
 4630         while (va < sva + PAGE_SIZE * count) {
 4631                 if (__predict_false((va & L3_PAGE_MASK) == 0))
 4632                         pte = kvtopte(va);
 4633                 MPASS(pte == pmap_pte(kernel_pmap, va));
 4634 
 4635                 /*
 4636                  * XXX there has to be a more efficient way than traversing
 4637                  * the page table every time - but go for correctness for
 4638                  * today
 4639                  */
 4640 
 4641                 m = *ma++;
 4642                 cache_bits = pmap_cache_bits(m->md.mdpg_cache_attrs);
 4643                 pa = VM_PAGE_TO_PHYS(m) | cache_bits | attr_bits;
 4644                 if (be64toh(*pte) != pa) {
 4645                         oldpte |= be64toh(*pte);
 4646                         pte_store(pte, pa);
 4647                 }
 4648                 va += PAGE_SIZE;
 4649                 pte++;
 4650         }
 4651         if (__predict_false((oldpte & RPTE_VALID) != 0))
 4652                 pmap_invalidate_range(kernel_pmap, sva, sva + count *
 4653                     PAGE_SIZE);
 4654         else
 4655                 ptesync();
 4656 }
 4657 
 4658 void
 4659 mmu_radix_qremove(vm_offset_t sva, int count)
 4660 {
 4661         vm_offset_t va;
 4662         pt_entry_t *pte;
 4663 
 4664         CTR3(KTR_PMAP, "%s(%#x, %d)", __func__, sva, count);
 4665         KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode or dmap va %lx", sva));
 4666 
 4667         va = sva;
 4668         pte = kvtopte(va);
 4669         while (va < sva + PAGE_SIZE * count) {
 4670                 if (__predict_false((va & L3_PAGE_MASK) == 0))
 4671                         pte = kvtopte(va);
 4672                 pte_clear(pte);
 4673                 pte++;
 4674                 va += PAGE_SIZE;
 4675         }
 4676         pmap_invalidate_range(kernel_pmap, sva, va);
 4677 }
 4678 
 4679 /***************************************************
 4680  * Page table page management routines.....
 4681  ***************************************************/
 4682 /*
 4683  * Schedule the specified unused page table page to be freed.  Specifically,
 4684  * add the page to the specified list of pages that will be released to the
 4685  * physical memory manager after the TLB has been updated.
 4686  */
 4687 static __inline void
 4688 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
 4689     boolean_t set_PG_ZERO)
 4690 {
 4691 
 4692         if (set_PG_ZERO)
 4693                 m->flags |= PG_ZERO;
 4694         else
 4695                 m->flags &= ~PG_ZERO;
 4696         SLIST_INSERT_HEAD(free, m, plinks.s.ss);
 4697 }
 4698 
 4699 /*
 4700  * Inserts the specified page table page into the specified pmap's collection
 4701  * of idle page table pages.  Each of a pmap's page table pages is responsible
 4702  * for mapping a distinct range of virtual addresses.  The pmap's collection is
 4703  * ordered by this virtual address range.
 4704  */
 4705 static __inline int
 4706 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
 4707 {
 4708 
 4709         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4710         return (vm_radix_insert(&pmap->pm_radix, mpte));
 4711 }
 4712 
 4713 /*
 4714  * Removes the page table page mapping the specified virtual address from the
 4715  * specified pmap's collection of idle page table pages, and returns it.
 4716  * Otherwise, returns NULL if there is no page table page corresponding to the
 4717  * specified virtual address.
 4718  */
 4719 static __inline vm_page_t
 4720 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
 4721 {
 4722 
 4723         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4724         return (vm_radix_remove(&pmap->pm_radix, pmap_l3e_pindex(va)));
 4725 }
 4726 
 4727 /*
 4728  * Decrements a page table page's wire count, which is used to record the
 4729  * number of valid page table entries within the page.  If the wire count
 4730  * drops to zero, then the page table page is unmapped.  Returns TRUE if the
 4731  * page table page was unmapped and FALSE otherwise.
 4732  */
 4733 static inline boolean_t
 4734 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 4735 {
 4736 
 4737         --m->ref_count;
 4738         if (m->ref_count == 0) {
 4739                 _pmap_unwire_ptp(pmap, va, m, free);
 4740                 return (TRUE);
 4741         } else
 4742                 return (FALSE);
 4743 }
 4744 
 4745 static void
 4746 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 4747 {
 4748 
 4749         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4750         /*
 4751          * unmap the page table page
 4752          */
 4753         if (m->pindex >= NUPDE + NUPDPE) {
 4754                 /* PDP page */
 4755                 pml1_entry_t *pml1;
 4756                 pml1 = pmap_pml1e(pmap, va);
 4757                 *pml1 = 0;
 4758         } else if (m->pindex >= NUPDE) {
 4759                 /* PD page */
 4760                 pml2_entry_t *l2e;
 4761                 l2e = pmap_pml2e(pmap, va);
 4762                 *l2e = 0;
 4763         } else {
 4764                 /* PTE page */
 4765                 pml3_entry_t *l3e;
 4766                 l3e = pmap_pml3e(pmap, va);
 4767                 *l3e = 0;
 4768         }
 4769         pmap_resident_count_dec(pmap, 1);
 4770         if (m->pindex < NUPDE) {
 4771                 /* We just released a PT, unhold the matching PD */
 4772                 vm_page_t pdpg;
 4773 
 4774                 pdpg = PHYS_TO_VM_PAGE(be64toh(*pmap_pml2e(pmap, va)) & PG_FRAME);
 4775                 pmap_unwire_ptp(pmap, va, pdpg, free);
 4776         }
 4777         else if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
 4778                 /* We just released a PD, unhold the matching PDP */
 4779                 vm_page_t pdppg;
 4780 
 4781                 pdppg = PHYS_TO_VM_PAGE(be64toh(*pmap_pml1e(pmap, va)) & PG_FRAME);
 4782                 pmap_unwire_ptp(pmap, va, pdppg, free);
 4783         }
 4784 
 4785         /*
 4786          * Put page on a list so that it is released after
 4787          * *ALL* TLB shootdown is done
 4788          */
 4789         pmap_add_delayed_free_list(m, free, TRUE);
 4790 }
 4791 
 4792 /*
 4793  * After removing a page table entry, this routine is used to
 4794  * conditionally free the page, and manage the hold/wire counts.
 4795  */
 4796 static int
 4797 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pml3_entry_t ptepde,
 4798     struct spglist *free)
 4799 {
 4800         vm_page_t mpte;
 4801 
 4802         if (va >= VM_MAXUSER_ADDRESS)
 4803                 return (0);
 4804         KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
 4805         mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
 4806         return (pmap_unwire_ptp(pmap, va, mpte, free));
 4807 }
 4808 
 4809 void
 4810 mmu_radix_release(pmap_t pmap)
 4811 {
 4812 
 4813         CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
 4814         KASSERT(pmap->pm_stats.resident_count == 0,
 4815             ("pmap_release: pmap resident count %ld != 0",
 4816             pmap->pm_stats.resident_count));
 4817         KASSERT(vm_radix_is_empty(&pmap->pm_radix),
 4818             ("pmap_release: pmap has reserved page table page(s)"));
 4819 
 4820         pmap_invalidate_all(pmap);
 4821         isa3_proctab[pmap->pm_pid].proctab0 = 0;
 4822         uma_zfree(zone_radix_pgd, pmap->pm_pml1);
 4823         vmem_free(asid_arena, pmap->pm_pid, 1);
 4824 }
 4825 
 4826 /*
 4827  * Create the PV entry for a 2MB page mapping.  Always returns true unless the
 4828  * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
 4829  * false if the PV entry cannot be allocated without resorting to reclamation.
 4830  */
 4831 static bool
 4832 pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t pde, u_int flags,
 4833     struct rwlock **lockp)
 4834 {
 4835         struct md_page *pvh;
 4836         pv_entry_t pv;
 4837         vm_paddr_t pa;
 4838 
 4839         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4840         /* Pass NULL instead of the lock pointer to disable reclamation. */
 4841         if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
 4842             NULL : lockp)) == NULL)
 4843                 return (false);
 4844         pv->pv_va = va;
 4845         pa = pde & PG_PS_FRAME;
 4846         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 4847         pvh = pa_to_pvh(pa);
 4848         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
 4849         pvh->pv_gen++;
 4850         return (true);
 4851 }
 4852 
 4853 /*
 4854  * Fills a page table page with mappings to consecutive physical pages.
 4855  */
 4856 static void
 4857 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
 4858 {
 4859         pt_entry_t *pte;
 4860 
 4861         for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
 4862                 *pte = htobe64(newpte);
 4863                 newpte += PAGE_SIZE;
 4864         }
 4865 }
 4866 
 4867 static boolean_t
 4868 pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va)
 4869 {
 4870         struct rwlock *lock;
 4871         boolean_t rv;
 4872 
 4873         lock = NULL;
 4874         rv = pmap_demote_l3e_locked(pmap, pde, va, &lock);
 4875         if (lock != NULL)
 4876                 rw_wunlock(lock);
 4877         return (rv);
 4878 }
 4879 
 4880 static boolean_t
 4881 pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va,
 4882     struct rwlock **lockp)
 4883 {
 4884         pml3_entry_t oldpde;
 4885         pt_entry_t *firstpte;
 4886         vm_paddr_t mptepa;
 4887         vm_page_t mpte;
 4888         struct spglist free;
 4889         vm_offset_t sva;
 4890 
 4891         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4892         oldpde = be64toh(*l3e);
 4893         KASSERT((oldpde & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V),
 4894             ("pmap_demote_l3e: oldpde is missing RPTE_LEAF and/or PG_V %lx",
 4895             oldpde));
 4896         if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
 4897             NULL) {
 4898                 KASSERT((oldpde & PG_W) == 0,
 4899                     ("pmap_demote_l3e: page table page for a wired mapping"
 4900                     " is missing"));
 4901 
 4902                 /*
 4903                  * Invalidate the 2MB page mapping and return "failure" if the
 4904                  * mapping was never accessed or the allocation of the new
 4905                  * page table page fails.  If the 2MB page mapping belongs to
 4906                  * the direct map region of the kernel's address space, then
 4907                  * the page allocation request specifies the highest possible
 4908                  * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
 4909                  * normal.  Page table pages are preallocated for every other
 4910                  * part of the kernel address space, so the direct map region
 4911                  * is the only part of the kernel address space that must be
 4912                  * handled here.
 4913                  */
 4914                 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc_noobj(
 4915                     (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS ?
 4916                     VM_ALLOC_INTERRUPT : 0) | VM_ALLOC_WIRED)) == NULL) {
 4917                         SLIST_INIT(&free);
 4918                         sva = trunc_2mpage(va);
 4919                         pmap_remove_l3e(pmap, l3e, sva, &free, lockp);
 4920                         pmap_invalidate_l3e_page(pmap, sva, oldpde);
 4921                         vm_page_free_pages_toq(&free, true);
 4922                         CTR2(KTR_PMAP, "pmap_demote_l3e: failure for va %#lx"
 4923                             " in pmap %p", va, pmap);
 4924                         return (FALSE);
 4925                 }
 4926                 mpte->pindex = pmap_l3e_pindex(va);
 4927                 if (va < VM_MAXUSER_ADDRESS)
 4928                         pmap_resident_count_inc(pmap, 1);
 4929         }
 4930         mptepa = VM_PAGE_TO_PHYS(mpte);
 4931         firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
 4932         KASSERT((oldpde & PG_A) != 0,
 4933             ("pmap_demote_l3e: oldpde is missing PG_A"));
 4934         KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
 4935             ("pmap_demote_l3e: oldpde is missing PG_M"));
 4936 
 4937         /*
 4938          * If the page table page is new, initialize it.
 4939          */
 4940         if (mpte->ref_count == 1) {
 4941                 mpte->ref_count = NPTEPG;
 4942                 pmap_fill_ptp(firstpte, oldpde);
 4943         }
 4944 
 4945         KASSERT((be64toh(*firstpte) & PG_FRAME) == (oldpde & PG_FRAME),
 4946             ("pmap_demote_l3e: firstpte and newpte map different physical"
 4947             " addresses"));
 4948 
 4949         /*
 4950          * If the mapping has changed attributes, update the page table
 4951          * entries.
 4952          */
 4953         if ((be64toh(*firstpte) & PG_PTE_PROMOTE) != (oldpde & PG_PTE_PROMOTE))
 4954                 pmap_fill_ptp(firstpte, oldpde);
 4955 
 4956         /*
 4957          * The spare PV entries must be reserved prior to demoting the
 4958          * mapping, that is, prior to changing the PDE.  Otherwise, the state
 4959          * of the PDE and the PV lists will be inconsistent, which can result
 4960          * in reclaim_pv_chunk() attempting to remove a PV entry from the
 4961          * wrong PV list and pmap_pv_demote_l3e() failing to find the expected
 4962          * PV entry for the 2MB page mapping that is being demoted.
 4963          */
 4964         if ((oldpde & PG_MANAGED) != 0)
 4965                 reserve_pv_entries(pmap, NPTEPG - 1, lockp);
 4966 
 4967         /*
 4968          * Demote the mapping.  This pmap is locked.  The old PDE has
 4969          * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
 4970          * set.  Thus, there is no danger of a race with another
 4971          * processor changing the setting of PG_A and/or PG_M between
 4972          * the read above and the store below.
 4973          */
 4974         pde_store(l3e, mptepa);
 4975         pmap_invalidate_l3e_page(pmap, trunc_2mpage(va), oldpde);
 4976         /*
 4977          * Demote the PV entry.
 4978          */
 4979         if ((oldpde & PG_MANAGED) != 0)
 4980                 pmap_pv_demote_l3e(pmap, va, oldpde & PG_PS_FRAME, lockp);
 4981 
 4982         counter_u64_add(pmap_l3e_demotions, 1);
 4983         CTR2(KTR_PMAP, "pmap_demote_l3e: success for va %#lx"
 4984             " in pmap %p", va, pmap);
 4985         return (TRUE);
 4986 }
 4987 
 4988 /*
 4989  * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
 4990  */
 4991 static void
 4992 pmap_remove_kernel_l3e(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va)
 4993 {
 4994         vm_paddr_t mptepa;
 4995         vm_page_t mpte;
 4996 
 4997         KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
 4998         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4999         mpte = pmap_remove_pt_page(pmap, va);
 5000         if (mpte == NULL)
 5001                 panic("pmap_remove_kernel_pde: Missing pt page.");
 5002 
 5003         mptepa = VM_PAGE_TO_PHYS(mpte);
 5004 
 5005         /*
 5006          * Initialize the page table page.
 5007          */
 5008         pagezero(PHYS_TO_DMAP(mptepa));
 5009 
 5010         /*
 5011          * Demote the mapping.
 5012          */
 5013         pde_store(l3e, mptepa);
 5014         ptesync();
 5015 }
 5016 
 5017 /*
 5018  * pmap_remove_l3e: do the things to unmap a superpage in a process
 5019  */
 5020 static int
 5021 pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva,
 5022     struct spglist *free, struct rwlock **lockp)
 5023 {
 5024         struct md_page *pvh;
 5025         pml3_entry_t oldpde;
 5026         vm_offset_t eva, va;
 5027         vm_page_t m, mpte;
 5028 
 5029         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 5030         KASSERT((sva & L3_PAGE_MASK) == 0,
 5031             ("pmap_remove_l3e: sva is not 2mpage aligned"));
 5032         oldpde = be64toh(pte_load_clear(pdq));
 5033         if (oldpde & PG_W)
 5034                 pmap->pm_stats.wired_count -= (L3_PAGE_SIZE / PAGE_SIZE);
 5035         pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE);
 5036         if (oldpde & PG_MANAGED) {
 5037                 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
 5038                 pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
 5039                 pmap_pvh_free(pvh, pmap, sva);
 5040                 eva = sva + L3_PAGE_SIZE;
 5041                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 5042                     va < eva; va += PAGE_SIZE, m++) {
 5043                         if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
 5044                                 vm_page_dirty(m);
 5045                         if (oldpde & PG_A)
 5046                                 vm_page_aflag_set(m, PGA_REFERENCED);
 5047                         if (TAILQ_EMPTY(&m->md.pv_list) &&
 5048                             TAILQ_EMPTY(&pvh->pv_list))
 5049                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
 5050                 }
 5051         }
 5052         if (pmap == kernel_pmap) {
 5053                 pmap_remove_kernel_l3e(pmap, pdq, sva);
 5054         } else {
 5055                 mpte = pmap_remove_pt_page(pmap, sva);
 5056                 if (mpte != NULL) {
 5057                         pmap_resident_count_dec(pmap, 1);
 5058                         KASSERT(mpte->ref_count == NPTEPG,
 5059                             ("pmap_remove_l3e: pte page wire count error"));
 5060                         mpte->ref_count = 0;
 5061                         pmap_add_delayed_free_list(mpte, free, FALSE);
 5062                 }
 5063         }
 5064         return (pmap_unuse_pt(pmap, sva, be64toh(*pmap_pml2e(pmap, sva)), free));
 5065 }
 5066 
 5067 /*
 5068  * pmap_remove_pte: do the things to unmap a page in a process
 5069  */
 5070 static int
 5071 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
 5072     pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
 5073 {
 5074         struct md_page *pvh;
 5075         pt_entry_t oldpte;
 5076         vm_page_t m;
 5077 
 5078         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 5079         oldpte = be64toh(pte_load_clear(ptq));
 5080         if (oldpte & RPTE_WIRED)
 5081                 pmap->pm_stats.wired_count -= 1;
 5082         pmap_resident_count_dec(pmap, 1);
 5083         if (oldpte & RPTE_MANAGED) {
 5084                 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
 5085                 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 5086                         vm_page_dirty(m);
 5087                 if (oldpte & PG_A)
 5088                         vm_page_aflag_set(m, PGA_REFERENCED);
 5089                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 5090                 pmap_pvh_free(&m->md, pmap, va);
 5091                 if (TAILQ_EMPTY(&m->md.pv_list) &&
 5092                     (m->flags & PG_FICTITIOUS) == 0) {
 5093                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 5094                         if (TAILQ_EMPTY(&pvh->pv_list))
 5095                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
 5096                 }
 5097         }
 5098         return (pmap_unuse_pt(pmap, va, ptepde, free));
 5099 }
 5100 
 5101 /*
 5102  * Remove a single page from a process address space
 5103  */
 5104 static bool
 5105 pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *l3e,
 5106     struct spglist *free)
 5107 {
 5108         struct rwlock *lock;
 5109         pt_entry_t *pte;
 5110         bool invalidate_all;
 5111 
 5112         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 5113         if ((be64toh(*l3e) & RPTE_VALID) == 0) {
 5114                 return (false);
 5115         }
 5116         pte = pmap_l3e_to_pte(l3e, va);
 5117         if ((be64toh(*pte) & RPTE_VALID) == 0) {
 5118                 return (false);
 5119         }
 5120         lock = NULL;
 5121 
 5122         invalidate_all = pmap_remove_pte(pmap, pte, va, be64toh(*l3e), free, &lock);
 5123         if (lock != NULL)
 5124                 rw_wunlock(lock);
 5125         if (!invalidate_all)
 5126                 pmap_invalidate_page(pmap, va);
 5127         return (invalidate_all);
 5128 }
 5129 
 5130 /*
 5131  * Removes the specified range of addresses from the page table page.
 5132  */
 5133 static bool
 5134 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
 5135     pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp)
 5136 {
 5137         pt_entry_t *pte;
 5138         vm_offset_t va;
 5139         bool anyvalid;
 5140 
 5141         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 5142         anyvalid = false;
 5143         va = eva;
 5144         for (pte = pmap_l3e_to_pte(l3e, sva); sva != eva; pte++,
 5145             sva += PAGE_SIZE) {
 5146                 MPASS(pte == pmap_pte(pmap, sva));
 5147                 if (*pte == 0) {
 5148                         if (va != eva) {
 5149                                 anyvalid = true;
 5150                                 va = eva;
 5151                         }
 5152                         continue;
 5153                 }
 5154                 if (va == eva)
 5155                         va = sva;
 5156                 if (pmap_remove_pte(pmap, pte, sva, be64toh(*l3e), free, lockp)) {
 5157                         anyvalid = true;
 5158                         sva += PAGE_SIZE;
 5159                         break;
 5160                 }
 5161         }
 5162         if (anyvalid)
 5163                 pmap_invalidate_all(pmap);
 5164         else if (va != eva)
 5165                 pmap_invalidate_range(pmap, va, sva);
 5166         return (anyvalid);
 5167 }
 5168 
 5169 void
 5170 mmu_radix_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 5171 {
 5172         struct rwlock *lock;
 5173         vm_offset_t va_next;
 5174         pml1_entry_t *l1e;
 5175         pml2_entry_t *l2e;
 5176         pml3_entry_t ptpaddr, *l3e;
 5177         struct spglist free;
 5178         bool anyvalid;
 5179 
 5180         CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva);
 5181 
 5182         /*
 5183          * Perform an unsynchronized read.  This is, however, safe.
 5184          */
 5185         if (pmap->pm_stats.resident_count == 0)
 5186                 return;
 5187 
 5188         anyvalid = false;
 5189         SLIST_INIT(&free);
 5190 
 5191         /* XXX something fishy here */
 5192         sva = (sva + PAGE_MASK) & ~PAGE_MASK;
 5193         eva = (eva + PAGE_MASK) & ~PAGE_MASK;
 5194 
 5195         PMAP_LOCK(pmap);
 5196 
 5197         /*
 5198          * special handling of removing one page.  a very
 5199          * common operation and easy to short circuit some
 5200          * code.
 5201          */
 5202         if (sva + PAGE_SIZE == eva) {
 5203                 l3e = pmap_pml3e(pmap, sva);
 5204                 if (l3e && (be64toh(*l3e) & RPTE_LEAF) == 0) {
 5205                         anyvalid = pmap_remove_page(pmap, sva, l3e, &free);
 5206                         goto out;
 5207                 }
 5208         }
 5209 
 5210         lock = NULL;
 5211         for (; sva < eva; sva = va_next) {
 5212                 if (pmap->pm_stats.resident_count == 0)
 5213                         break;
 5214                 l1e = pmap_pml1e(pmap, sva);
 5215                 if (l1e == NULL || (be64toh(*l1e) & PG_V) == 0) {
 5216                         va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
 5217                         if (va_next < sva)
 5218                                 va_next = eva;
 5219                         continue;
 5220                 }
 5221 
 5222                 l2e = pmap_l1e_to_l2e(l1e, sva);
 5223                 if (l2e == NULL || (be64toh(*l2e) & PG_V) == 0) {
 5224                         va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
 5225                         if (va_next < sva)
 5226                                 va_next = eva;
 5227                         continue;
 5228                 }
 5229 
 5230                 /*
 5231                  * Calculate index for next page table.
 5232                  */
 5233                 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
 5234                 if (va_next < sva)
 5235                         va_next = eva;
 5236 
 5237                 l3e = pmap_l2e_to_l3e(l2e, sva);
 5238                 ptpaddr = be64toh(*l3e);
 5239 
 5240                 /*
 5241                  * Weed out invalid mappings.
 5242                  */
 5243                 if (ptpaddr == 0)
 5244                         continue;
 5245 
 5246                 /*
 5247                  * Check for large page.
 5248                  */
 5249                 if ((ptpaddr & RPTE_LEAF) != 0) {
 5250                         /*
 5251                          * Are we removing the entire large page?  If not,
 5252                          * demote the mapping and fall through.
 5253                          */
 5254                         if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) {
 5255                                 pmap_remove_l3e(pmap, l3e, sva, &free, &lock);
 5256                                 anyvalid = true;
 5257                                 continue;
 5258                         } else if (!pmap_demote_l3e_locked(pmap, l3e, sva,
 5259                             &lock)) {
 5260                                 /* The large page mapping was destroyed. */
 5261                                 continue;
 5262                         } else
 5263                                 ptpaddr = be64toh(*l3e);
 5264                 }
 5265 
 5266                 /*
 5267                  * Limit our scan to either the end of the va represented
 5268                  * by the current page table page, or to the end of the
 5269                  * range being removed.
 5270                  */
 5271                 if (va_next > eva)
 5272                         va_next = eva;
 5273 
 5274                 if (pmap_remove_ptes(pmap, sva, va_next, l3e, &free, &lock))
 5275                         anyvalid = true;
 5276         }
 5277         if (lock != NULL)
 5278                 rw_wunlock(lock);
 5279 out:
 5280         if (anyvalid)
 5281                 pmap_invalidate_all(pmap);
 5282         PMAP_UNLOCK(pmap);
 5283         vm_page_free_pages_toq(&free, true);
 5284 }
 5285 
 5286 void
 5287 mmu_radix_remove_all(vm_page_t m)
 5288 {
 5289         struct md_page *pvh;
 5290         pv_entry_t pv;
 5291         pmap_t pmap;
 5292         struct rwlock *lock;
 5293         pt_entry_t *pte, tpte;
 5294         pml3_entry_t *l3e;
 5295         vm_offset_t va;
 5296         struct spglist free;
 5297         int pvh_gen, md_gen;
 5298 
 5299         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
 5300         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 5301             ("pmap_remove_all: page %p is not managed", m));
 5302         SLIST_INIT(&free);
 5303         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 5304         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 5305             pa_to_pvh(VM_PAGE_TO_PHYS(m));
 5306 retry:
 5307         rw_wlock(lock);
 5308         while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
 5309                 pmap = PV_PMAP(pv);
 5310                 if (!PMAP_TRYLOCK(pmap)) {
 5311                         pvh_gen = pvh->pv_gen;
 5312                         rw_wunlock(lock);
 5313                         PMAP_LOCK(pmap);
 5314                         rw_wlock(lock);
 5315                         if (pvh_gen != pvh->pv_gen) {
 5316                                 rw_wunlock(lock);
 5317                                 PMAP_UNLOCK(pmap);
 5318                                 goto retry;
 5319                         }
 5320                 }
 5321                 va = pv->pv_va;
 5322                 l3e = pmap_pml3e(pmap, va);
 5323                 (void)pmap_demote_l3e_locked(pmap, l3e, va, &lock);
 5324                 PMAP_UNLOCK(pmap);
 5325         }
 5326         while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 5327                 pmap = PV_PMAP(pv);
 5328                 if (!PMAP_TRYLOCK(pmap)) {
 5329                         pvh_gen = pvh->pv_gen;
 5330                         md_gen = m->md.pv_gen;
 5331                         rw_wunlock(lock);
 5332                         PMAP_LOCK(pmap);
 5333                         rw_wlock(lock);
 5334                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 5335                                 rw_wunlock(lock);
 5336                                 PMAP_UNLOCK(pmap);
 5337                                 goto retry;
 5338                         }
 5339                 }
 5340                 pmap_resident_count_dec(pmap, 1);
 5341                 l3e = pmap_pml3e(pmap, pv->pv_va);
 5342                 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_remove_all: found"
 5343                     " a 2mpage in page %p's pv list", m));
 5344                 pte = pmap_l3e_to_pte(l3e, pv->pv_va);
 5345                 tpte = be64toh(pte_load_clear(pte));
 5346                 if (tpte & PG_W)
 5347                         pmap->pm_stats.wired_count--;
 5348                 if (tpte & PG_A)
 5349                         vm_page_aflag_set(m, PGA_REFERENCED);
 5350 
 5351                 /*
 5352                  * Update the vm_page_t clean and reference bits.
 5353                  */
 5354                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 5355                         vm_page_dirty(m);
 5356                 pmap_unuse_pt(pmap, pv->pv_va, be64toh(*l3e), &free);
 5357                 pmap_invalidate_page(pmap, pv->pv_va);
 5358                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
 5359                 m->md.pv_gen++;
 5360                 free_pv_entry(pmap, pv);
 5361                 PMAP_UNLOCK(pmap);
 5362         }
 5363         vm_page_aflag_clear(m, PGA_WRITEABLE);
 5364         rw_wunlock(lock);
 5365         vm_page_free_pages_toq(&free, true);
 5366 }
 5367 
 5368 /*
 5369  * Destroy all managed, non-wired mappings in the given user-space
 5370  * pmap.  This pmap cannot be active on any processor besides the
 5371  * caller.
 5372  *
 5373  * This function cannot be applied to the kernel pmap.  Moreover, it
 5374  * is not intended for general use.  It is only to be used during
 5375  * process termination.  Consequently, it can be implemented in ways
 5376  * that make it faster than pmap_remove().  First, it can more quickly
 5377  * destroy mappings by iterating over the pmap's collection of PV
 5378  * entries, rather than searching the page table.  Second, it doesn't
 5379  * have to test and clear the page table entries atomically, because
 5380  * no processor is currently accessing the user address space.  In
 5381  * particular, a page table entry's dirty bit won't change state once
 5382  * this function starts.
 5383  *
 5384  * Although this function destroys all of the pmap's managed,
 5385  * non-wired mappings, it can delay and batch the invalidation of TLB
 5386  * entries without calling pmap_delayed_invl_started() and
 5387  * pmap_delayed_invl_finished().  Because the pmap is not active on
 5388  * any other processor, none of these TLB entries will ever be used
 5389  * before their eventual invalidation.  Consequently, there is no need
 5390  * for either pmap_remove_all() or pmap_remove_write() to wait for
 5391  * that eventual TLB invalidation.
 5392  */
 5393 
 5394 void
 5395 mmu_radix_remove_pages(pmap_t pmap)
 5396 {
 5397 
 5398         CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
 5399         pml3_entry_t ptel3e;
 5400         pt_entry_t *pte, tpte;
 5401         struct spglist free;
 5402         vm_page_t m, mpte, mt;
 5403         pv_entry_t pv;
 5404         struct md_page *pvh;
 5405         struct pv_chunk *pc, *npc;
 5406         struct rwlock *lock;
 5407         int64_t bit;
 5408         uint64_t inuse, bitmask;
 5409         int allfree, field, idx;
 5410 #ifdef PV_STATS
 5411         int freed;
 5412 #endif
 5413         boolean_t superpage;
 5414         vm_paddr_t pa;
 5415 
 5416         /*
 5417          * Assert that the given pmap is only active on the current
 5418          * CPU.  Unfortunately, we cannot block another CPU from
 5419          * activating the pmap while this function is executing.
 5420          */
 5421         KASSERT(pmap->pm_pid == mfspr(SPR_PID),
 5422             ("non-current asid %lu - expected %lu", pmap->pm_pid,
 5423             mfspr(SPR_PID)));
 5424 
 5425         lock = NULL;
 5426 
 5427         SLIST_INIT(&free);
 5428         PMAP_LOCK(pmap);
 5429         TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 5430                 allfree = 1;
 5431 #ifdef PV_STATS
 5432                 freed = 0;
 5433 #endif
 5434                 for (field = 0; field < _NPCM; field++) {
 5435                         inuse = ~pc->pc_map[field] & pc_freemask[field];
 5436                         while (inuse != 0) {
 5437                                 bit = cnttzd(inuse);
 5438                                 bitmask = 1UL << bit;
 5439                                 idx = field * 64 + bit;
 5440                                 pv = &pc->pc_pventry[idx];
 5441                                 inuse &= ~bitmask;
 5442 
 5443                                 pte = pmap_pml2e(pmap, pv->pv_va);
 5444                                 ptel3e = be64toh(*pte);
 5445                                 pte = pmap_l2e_to_l3e(pte, pv->pv_va);
 5446                                 tpte = be64toh(*pte);
 5447                                 if ((tpte & (RPTE_LEAF | PG_V)) == PG_V) {
 5448                                         superpage = FALSE;
 5449                                         ptel3e = tpte;
 5450                                         pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
 5451                                             PG_FRAME);
 5452                                         pte = &pte[pmap_pte_index(pv->pv_va)];
 5453                                         tpte = be64toh(*pte);
 5454                                 } else {
 5455                                         /*
 5456                                          * Keep track whether 'tpte' is a
 5457                                          * superpage explicitly instead of
 5458                                          * relying on RPTE_LEAF being set.
 5459                                          *
 5460                                          * This is because RPTE_LEAF is numerically
 5461                                          * identical to PG_PTE_PAT and thus a
 5462                                          * regular page could be mistaken for
 5463                                          * a superpage.
 5464                                          */
 5465                                         superpage = TRUE;
 5466                                 }
 5467 
 5468                                 if ((tpte & PG_V) == 0) {
 5469                                         panic("bad pte va %lx pte %lx",
 5470                                             pv->pv_va, tpte);
 5471                                 }
 5472 
 5473 /*
 5474  * We cannot remove wired pages from a process' mapping at this time
 5475  */
 5476                                 if (tpte & PG_W) {
 5477                                         allfree = 0;
 5478                                         continue;
 5479                                 }
 5480 
 5481                                 if (superpage)
 5482                                         pa = tpte & PG_PS_FRAME;
 5483                                 else
 5484                                         pa = tpte & PG_FRAME;
 5485 
 5486                                 m = PHYS_TO_VM_PAGE(pa);
 5487                                 KASSERT(m->phys_addr == pa,
 5488                                     ("vm_page_t %p phys_addr mismatch %016jx %016jx",
 5489                                     m, (uintmax_t)m->phys_addr,
 5490                                     (uintmax_t)tpte));
 5491 
 5492                                 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
 5493                                     m < &vm_page_array[vm_page_array_size],
 5494                                     ("pmap_remove_pages: bad tpte %#jx",
 5495                                     (uintmax_t)tpte));
 5496 
 5497                                 pte_clear(pte);
 5498 
 5499                                 /*
 5500                                  * Update the vm_page_t clean/reference bits.
 5501                                  */
 5502                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 5503                                         if (superpage) {
 5504                                                 for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++)
 5505                                                         vm_page_dirty(mt);
 5506                                         } else
 5507                                                 vm_page_dirty(m);
 5508                                 }
 5509 
 5510                                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 5511 
 5512                                 /* Mark free */
 5513                                 pc->pc_map[field] |= bitmask;
 5514                                 if (superpage) {
 5515                                         pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE);
 5516                                         pvh = pa_to_pvh(tpte & PG_PS_FRAME);
 5517                                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
 5518                                         pvh->pv_gen++;
 5519                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
 5520                                                 for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++)
 5521                                                         if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
 5522                                                             TAILQ_EMPTY(&mt->md.pv_list))
 5523                                                                 vm_page_aflag_clear(mt, PGA_WRITEABLE);
 5524                                         }
 5525                                         mpte = pmap_remove_pt_page(pmap, pv->pv_va);
 5526                                         if (mpte != NULL) {
 5527                                                 pmap_resident_count_dec(pmap, 1);
 5528                                                 KASSERT(mpte->ref_count == NPTEPG,
 5529                                                     ("pmap_remove_pages: pte page wire count error"));
 5530                                                 mpte->ref_count = 0;
 5531                                                 pmap_add_delayed_free_list(mpte, &free, FALSE);
 5532                                         }
 5533                                 } else {
 5534                                         pmap_resident_count_dec(pmap, 1);
 5535 #ifdef VERBOSE_PV
 5536                                         printf("freeing pv (%p, %p)\n",
 5537                                                    pmap, pv);
 5538 #endif
 5539                                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
 5540                                         m->md.pv_gen++;
 5541                                         if ((m->a.flags & PGA_WRITEABLE) != 0 &&
 5542                                             TAILQ_EMPTY(&m->md.pv_list) &&
 5543                                             (m->flags & PG_FICTITIOUS) == 0) {
 5544                                                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 5545                                                 if (TAILQ_EMPTY(&pvh->pv_list))
 5546                                                         vm_page_aflag_clear(m, PGA_WRITEABLE);
 5547                                         }
 5548                                 }
 5549                                 pmap_unuse_pt(pmap, pv->pv_va, ptel3e, &free);
 5550 #ifdef PV_STATS
 5551                                 freed++;
 5552 #endif
 5553                         }
 5554                 }
 5555                 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
 5556                 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
 5557                 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 5558                 if (allfree) {
 5559                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 5560                         free_pv_chunk(pc);
 5561                 }
 5562         }
 5563         if (lock != NULL)
 5564                 rw_wunlock(lock);
 5565         pmap_invalidate_all(pmap);
 5566         PMAP_UNLOCK(pmap);
 5567         vm_page_free_pages_toq(&free, true);
 5568 }
 5569 
 5570 void
 5571 mmu_radix_remove_write(vm_page_t m)
 5572 {
 5573         struct md_page *pvh;
 5574         pmap_t pmap;
 5575         struct rwlock *lock;
 5576         pv_entry_t next_pv, pv;
 5577         pml3_entry_t *l3e;
 5578         pt_entry_t oldpte, *pte;
 5579         int pvh_gen, md_gen;
 5580 
 5581         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
 5582         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 5583             ("pmap_remove_write: page %p is not managed", m));
 5584         vm_page_assert_busied(m);
 5585 
 5586         if (!pmap_page_is_write_mapped(m))
 5587                 return;
 5588         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 5589         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 5590             pa_to_pvh(VM_PAGE_TO_PHYS(m));
 5591 retry_pv_loop:
 5592         rw_wlock(lock);
 5593         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) {
 5594                 pmap = PV_PMAP(pv);
 5595                 if (!PMAP_TRYLOCK(pmap)) {
 5596                         pvh_gen = pvh->pv_gen;
 5597                         rw_wunlock(lock);
 5598                         PMAP_LOCK(pmap);
 5599                         rw_wlock(lock);
 5600                         if (pvh_gen != pvh->pv_gen) {
 5601                                 PMAP_UNLOCK(pmap);
 5602                                 rw_wunlock(lock);
 5603                                 goto retry_pv_loop;
 5604                         }
 5605                 }
 5606                 l3e = pmap_pml3e(pmap, pv->pv_va);
 5607                 if ((be64toh(*l3e) & PG_RW) != 0)
 5608                         (void)pmap_demote_l3e_locked(pmap, l3e, pv->pv_va, &lock);
 5609                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 5610                     ("inconsistent pv lock %p %p for page %p",
 5611                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 5612                 PMAP_UNLOCK(pmap);
 5613         }
 5614         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
 5615                 pmap = PV_PMAP(pv);
 5616                 if (!PMAP_TRYLOCK(pmap)) {
 5617                         pvh_gen = pvh->pv_gen;
 5618                         md_gen = m->md.pv_gen;
 5619                         rw_wunlock(lock);
 5620                         PMAP_LOCK(pmap);
 5621                         rw_wlock(lock);
 5622                         if (pvh_gen != pvh->pv_gen ||
 5623                             md_gen != m->md.pv_gen) {
 5624                                 PMAP_UNLOCK(pmap);
 5625                                 rw_wunlock(lock);
 5626                                 goto retry_pv_loop;
 5627                         }
 5628                 }
 5629                 l3e = pmap_pml3e(pmap, pv->pv_va);
 5630                 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0,
 5631                     ("pmap_remove_write: found a 2mpage in page %p's pv list",
 5632                     m));
 5633                 pte = pmap_l3e_to_pte(l3e, pv->pv_va);
 5634 retry:
 5635                 oldpte = be64toh(*pte);
 5636                 if (oldpte & PG_RW) {
 5637                         if (!atomic_cmpset_long(pte, htobe64(oldpte),
 5638                             htobe64((oldpte | RPTE_EAA_R) & ~(PG_RW | PG_M))))
 5639                                 goto retry;
 5640                         if ((oldpte & PG_M) != 0)
 5641                                 vm_page_dirty(m);
 5642                         pmap_invalidate_page(pmap, pv->pv_va);
 5643                 }
 5644                 PMAP_UNLOCK(pmap);
 5645         }
 5646         rw_wunlock(lock);
 5647         vm_page_aflag_clear(m, PGA_WRITEABLE);
 5648 }
 5649 
 5650 /*
 5651  *      Clear the wired attribute from the mappings for the specified range of
 5652  *      addresses in the given pmap.  Every valid mapping within that range
 5653  *      must have the wired attribute set.  In contrast, invalid mappings
 5654  *      cannot have the wired attribute set, so they are ignored.
 5655  *
 5656  *      The wired attribute of the page table entry is not a hardware
 5657  *      feature, so there is no need to invalidate any TLB entries.
 5658  *      Since pmap_demote_l3e() for the wired entry must never fail,
 5659  *      pmap_delayed_invl_started()/finished() calls around the
 5660  *      function are not needed.
 5661  */
 5662 void
 5663 mmu_radix_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 5664 {
 5665         vm_offset_t va_next;
 5666         pml1_entry_t *l1e;
 5667         pml2_entry_t *l2e;
 5668         pml3_entry_t *l3e;
 5669         pt_entry_t *pte;
 5670 
 5671         CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva);
 5672         PMAP_LOCK(pmap);
 5673         for (; sva < eva; sva = va_next) {
 5674                 l1e = pmap_pml1e(pmap, sva);
 5675                 if ((be64toh(*l1e) & PG_V) == 0) {
 5676                         va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
 5677                         if (va_next < sva)
 5678                                 va_next = eva;
 5679                         continue;
 5680                 }
 5681                 l2e = pmap_l1e_to_l2e(l1e, sva);
 5682                 if ((be64toh(*l2e) & PG_V) == 0) {
 5683                         va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
 5684                         if (va_next < sva)
 5685                                 va_next = eva;
 5686                         continue;
 5687                 }
 5688                 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
 5689                 if (va_next < sva)
 5690                         va_next = eva;
 5691                 l3e = pmap_l2e_to_l3e(l2e, sva);
 5692                 if ((be64toh(*l3e) & PG_V) == 0)
 5693                         continue;
 5694                 if ((be64toh(*l3e) & RPTE_LEAF) != 0) {
 5695                         if ((be64toh(*l3e) & PG_W) == 0)
 5696                                 panic("pmap_unwire: pde %#jx is missing PG_W",
 5697                                     (uintmax_t)(be64toh(*l3e)));
 5698 
 5699                         /*
 5700                          * Are we unwiring the entire large page?  If not,
 5701                          * demote the mapping and fall through.
 5702                          */
 5703                         if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) {
 5704                                 atomic_clear_long(l3e, htobe64(PG_W));
 5705                                 pmap->pm_stats.wired_count -= L3_PAGE_SIZE /
 5706                                     PAGE_SIZE;
 5707                                 continue;
 5708                         } else if (!pmap_demote_l3e(pmap, l3e, sva))
 5709                                 panic("pmap_unwire: demotion failed");
 5710                 }
 5711                 if (va_next > eva)
 5712                         va_next = eva;
 5713                 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++,
 5714                     sva += PAGE_SIZE) {
 5715                         MPASS(pte == pmap_pte(pmap, sva));
 5716                         if ((be64toh(*pte) & PG_V) == 0)
 5717                                 continue;
 5718                         if ((be64toh(*pte) & PG_W) == 0)
 5719                                 panic("pmap_unwire: pte %#jx is missing PG_W",
 5720                                     (uintmax_t)(be64toh(*pte)));
 5721 
 5722                         /*
 5723                          * PG_W must be cleared atomically.  Although the pmap
 5724                          * lock synchronizes access to PG_W, another processor
 5725                          * could be setting PG_M and/or PG_A concurrently.
 5726                          */
 5727                         atomic_clear_long(pte, htobe64(PG_W));
 5728                         pmap->pm_stats.wired_count--;
 5729                 }
 5730         }
 5731         PMAP_UNLOCK(pmap);
 5732 }
 5733 
 5734 void
 5735 mmu_radix_zero_page(vm_page_t m)
 5736 {
 5737         vm_offset_t addr;
 5738 
 5739         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
 5740         addr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 5741         pagezero(addr);
 5742 }
 5743 
 5744 void
 5745 mmu_radix_zero_page_area(vm_page_t m, int off, int size)
 5746 {
 5747         caddr_t addr;
 5748 
 5749         CTR4(KTR_PMAP, "%s(%p, %d, %d)", __func__, m, off, size);
 5750         MPASS(off + size <= PAGE_SIZE);
 5751         addr = (caddr_t)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 5752         memset(addr + off, 0, size);
 5753 }
 5754 
 5755 static int
 5756 mmu_radix_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
 5757 {
 5758         pml3_entry_t *l3ep;
 5759         pt_entry_t pte;
 5760         vm_paddr_t pa;
 5761         int val;
 5762 
 5763         CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr);
 5764         PMAP_LOCK(pmap);
 5765 
 5766         l3ep = pmap_pml3e(pmap, addr);
 5767         if (l3ep != NULL && (be64toh(*l3ep) & PG_V)) {
 5768                 if (be64toh(*l3ep) & RPTE_LEAF) {
 5769                         pte = be64toh(*l3ep);
 5770                         /* Compute the physical address of the 4KB page. */
 5771                         pa = ((be64toh(*l3ep) & PG_PS_FRAME) | (addr & L3_PAGE_MASK)) &
 5772                             PG_FRAME;
 5773                         val = MINCORE_PSIND(1);
 5774                 } else {
 5775                         /* Native endian PTE, do not pass to functions */
 5776                         pte = be64toh(*pmap_l3e_to_pte(l3ep, addr));
 5777                         pa = pte & PG_FRAME;
 5778                         val = 0;
 5779                 }
 5780         } else {
 5781                 pte = 0;
 5782                 pa = 0;
 5783                 val = 0;
 5784         }
 5785         if ((pte & PG_V) != 0) {
 5786                 val |= MINCORE_INCORE;
 5787                 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 5788                         val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
 5789                 if ((pte & PG_A) != 0)
 5790                         val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
 5791         }
 5792         if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
 5793             (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
 5794             (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
 5795                 *locked_pa = pa;
 5796         }
 5797         PMAP_UNLOCK(pmap);
 5798         return (val);
 5799 }
 5800 
 5801 void
 5802 mmu_radix_activate(struct thread *td)
 5803 {
 5804         pmap_t pmap;
 5805         uint32_t curpid;
 5806 
 5807         CTR2(KTR_PMAP, "%s(%p)", __func__, td);
 5808         critical_enter();
 5809         pmap = vmspace_pmap(td->td_proc->p_vmspace);
 5810         curpid = mfspr(SPR_PID);
 5811         if (pmap->pm_pid > isa3_base_pid &&
 5812                 curpid != pmap->pm_pid) {
 5813                 mmu_radix_pid_set(pmap);
 5814         }
 5815         critical_exit();
 5816 }
 5817 
 5818 /*
 5819  *      Increase the starting virtual address of the given mapping if a
 5820  *      different alignment might result in more superpage mappings.
 5821  */
 5822 void
 5823 mmu_radix_align_superpage(vm_object_t object, vm_ooffset_t offset,
 5824     vm_offset_t *addr, vm_size_t size)
 5825 {
 5826 
 5827         CTR5(KTR_PMAP, "%s(%p, %#x, %p, %#x)", __func__, object, offset, addr,
 5828             size);
 5829         vm_offset_t superpage_offset;
 5830 
 5831         if (size < L3_PAGE_SIZE)
 5832                 return;
 5833         if (object != NULL && (object->flags & OBJ_COLORED) != 0)
 5834                 offset += ptoa(object->pg_color);
 5835         superpage_offset = offset & L3_PAGE_MASK;
 5836         if (size - ((L3_PAGE_SIZE - superpage_offset) & L3_PAGE_MASK) < L3_PAGE_SIZE ||
 5837             (*addr & L3_PAGE_MASK) == superpage_offset)
 5838                 return;
 5839         if ((*addr & L3_PAGE_MASK) < superpage_offset)
 5840                 *addr = (*addr & ~L3_PAGE_MASK) + superpage_offset;
 5841         else
 5842                 *addr = ((*addr + L3_PAGE_MASK) & ~L3_PAGE_MASK) + superpage_offset;
 5843 }
 5844 
 5845 static void *
 5846 mmu_radix_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t attr)
 5847 {
 5848         vm_offset_t va, tmpva, ppa, offset;
 5849 
 5850         ppa = trunc_page(pa);
 5851         offset = pa & PAGE_MASK;
 5852         size = roundup2(offset + size, PAGE_SIZE);
 5853         if (pa < powerpc_ptob(Maxmem))
 5854                 panic("bad pa: %#lx less than Maxmem %#lx\n",
 5855                           pa, powerpc_ptob(Maxmem));
 5856         va = kva_alloc(size);
 5857         if (bootverbose)
 5858                 printf("%s(%#lx, %lu, %d)\n", __func__, pa, size, attr);
 5859         KASSERT(size > 0, ("%s(%#lx, %lu, %d)", __func__, pa, size, attr));
 5860 
 5861         if (!va)
 5862                 panic("%s: Couldn't alloc kernel virtual memory", __func__);
 5863 
 5864         for (tmpva = va; size > 0;) {
 5865                 mmu_radix_kenter_attr(tmpva, ppa, attr);
 5866                 size -= PAGE_SIZE;
 5867                 tmpva += PAGE_SIZE;
 5868                 ppa += PAGE_SIZE;
 5869         }
 5870         ptesync();
 5871 
 5872         return ((void *)(va + offset));
 5873 }
 5874 
 5875 static void *
 5876 mmu_radix_mapdev(vm_paddr_t pa, vm_size_t size)
 5877 {
 5878 
 5879         CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size);
 5880 
 5881         return (mmu_radix_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT));
 5882 }
 5883 
 5884 void
 5885 mmu_radix_page_set_memattr(vm_page_t m, vm_memattr_t ma)
 5886 {
 5887 
 5888         CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, m, ma);
 5889         m->md.mdpg_cache_attrs = ma;
 5890 
 5891         /*
 5892          * If "m" is a normal page, update its direct mapping.  This update
 5893          * can be relied upon to perform any cache operations that are
 5894          * required for data coherence.
 5895          */
 5896         if ((m->flags & PG_FICTITIOUS) == 0 &&
 5897             mmu_radix_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)),
 5898             PAGE_SIZE, m->md.mdpg_cache_attrs))
 5899                 panic("memory attribute change on the direct map failed");
 5900 }
 5901 
 5902 static void
 5903 mmu_radix_unmapdev(void *p, vm_size_t size)
 5904 {
 5905         vm_offset_t offset, va;
 5906 
 5907         CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, p, size);
 5908 
 5909         /* If we gave a direct map region in pmap_mapdev, do nothing */
 5910         va = (vm_offset_t)p;
 5911         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
 5912                 return;
 5913 
 5914         offset = va & PAGE_MASK;
 5915         size = round_page(offset + size);
 5916         va = trunc_page(va);
 5917 
 5918         if (pmap_initialized) {
 5919                 mmu_radix_qremove(va, atop(size));
 5920                 kva_free(va, size);
 5921         }
 5922 }
 5923 
 5924 static __inline void
 5925 pmap_pte_attr(pt_entry_t *pte, uint64_t cache_bits, uint64_t mask)
 5926 {
 5927         uint64_t opte, npte;
 5928 
 5929         /*
 5930          * The cache mode bits are all in the low 32-bits of the
 5931          * PTE, so we can just spin on updating the low 32-bits.
 5932          */
 5933         do {
 5934                 opte = be64toh(*pte);
 5935                 npte = opte & ~mask;
 5936                 npte |= cache_bits;
 5937         } while (npte != opte && !atomic_cmpset_long(pte, htobe64(opte), htobe64(npte)));
 5938 }
 5939 
 5940 /*
 5941  * Tries to demote a 1GB page mapping.
 5942  */
 5943 static boolean_t
 5944 pmap_demote_l2e(pmap_t pmap, pml2_entry_t *l2e, vm_offset_t va)
 5945 {
 5946         pml2_entry_t oldpdpe;
 5947         pml3_entry_t *firstpde, newpde, *pde;
 5948         vm_paddr_t pdpgpa;
 5949         vm_page_t pdpg;
 5950 
 5951         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 5952         oldpdpe = be64toh(*l2e);
 5953         KASSERT((oldpdpe & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V),
 5954             ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
 5955         pdpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED);
 5956         if (pdpg == NULL) {
 5957                 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
 5958                     " in pmap %p", va, pmap);
 5959                 return (FALSE);
 5960         }
 5961         pdpg->pindex = va >> L2_PAGE_SIZE_SHIFT;
 5962         pdpgpa = VM_PAGE_TO_PHYS(pdpg);
 5963         firstpde = (pml3_entry_t *)PHYS_TO_DMAP(pdpgpa);
 5964         KASSERT((oldpdpe & PG_A) != 0,
 5965             ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
 5966         KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
 5967             ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
 5968         newpde = oldpdpe;
 5969 
 5970         /*
 5971          * Initialize the page directory page.
 5972          */
 5973         for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
 5974                 *pde = htobe64(newpde);
 5975                 newpde += L3_PAGE_SIZE;
 5976         }
 5977 
 5978         /*
 5979          * Demote the mapping.
 5980          */
 5981         pde_store(l2e, pdpgpa);
 5982 
 5983         /*
 5984          * Flush PWC --- XXX revisit
 5985          */
 5986         pmap_invalidate_all(pmap);
 5987 
 5988         counter_u64_add(pmap_l2e_demotions, 1);
 5989         CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
 5990             " in pmap %p", va, pmap);
 5991         return (TRUE);
 5992 }
 5993 
 5994 vm_paddr_t
 5995 mmu_radix_kextract(vm_offset_t va)
 5996 {
 5997         pml3_entry_t l3e;
 5998         vm_paddr_t pa;
 5999 
 6000         CTR2(KTR_PMAP, "%s(%#x)", __func__, va);
 6001         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
 6002                 pa = DMAP_TO_PHYS(va);
 6003         } else {
 6004                 /* Big-endian PTE on stack */
 6005                 l3e = *pmap_pml3e(kernel_pmap, va);
 6006                 if (be64toh(l3e) & RPTE_LEAF) {
 6007                         pa = (be64toh(l3e) & PG_PS_FRAME) | (va & L3_PAGE_MASK);
 6008                         pa |= (va & L3_PAGE_MASK);
 6009                 } else {
 6010                         /*
 6011                          * Beware of a concurrent promotion that changes the
 6012                          * PDE at this point!  For example, vtopte() must not
 6013                          * be used to access the PTE because it would use the
 6014                          * new PDE.  It is, however, safe to use the old PDE
 6015                          * because the page table page is preserved by the
 6016                          * promotion.
 6017                          */
 6018                         pa = be64toh(*pmap_l3e_to_pte(&l3e, va));
 6019                         pa = (pa & PG_FRAME) | (va & PAGE_MASK);
 6020                         pa |= (va & PAGE_MASK);
 6021                 }
 6022         }
 6023         return (pa);
 6024 }
 6025 
 6026 static pt_entry_t
 6027 mmu_radix_calc_wimg(vm_paddr_t pa, vm_memattr_t ma)
 6028 {
 6029 
 6030         if (ma != VM_MEMATTR_DEFAULT) {
 6031                 return pmap_cache_bits(ma);
 6032         }
 6033 
 6034         /*
 6035          * Assume the page is cache inhibited and access is guarded unless
 6036          * it's in our available memory array.
 6037          */
 6038         for (int i = 0; i < pregions_sz; i++) {
 6039                 if ((pa >= pregions[i].mr_start) &&
 6040                     (pa < (pregions[i].mr_start + pregions[i].mr_size)))
 6041                         return (RPTE_ATTR_MEM);
 6042         }
 6043         return (RPTE_ATTR_GUARDEDIO);
 6044 }
 6045 
 6046 static void
 6047 mmu_radix_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma)
 6048 {
 6049         pt_entry_t *pte, pteval;
 6050         uint64_t cache_bits;
 6051 
 6052         pte = kvtopte(va);
 6053         MPASS(pte != NULL);
 6054         pteval = pa | RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A;
 6055         cache_bits = mmu_radix_calc_wimg(pa, ma);
 6056         pte_store(pte, pteval | cache_bits);
 6057 }
 6058 
 6059 void
 6060 mmu_radix_kremove(vm_offset_t va)
 6061 {
 6062         pt_entry_t *pte;
 6063 
 6064         CTR2(KTR_PMAP, "%s(%#x)", __func__, va);
 6065 
 6066         pte = kvtopte(va);
 6067         pte_clear(pte);
 6068 }
 6069 
 6070 int
 6071 mmu_radix_decode_kernel_ptr(vm_offset_t addr,
 6072     int *is_user, vm_offset_t *decoded)
 6073 {
 6074 
 6075         CTR2(KTR_PMAP, "%s(%#jx)", __func__, (uintmax_t)addr);
 6076         *decoded = addr;
 6077         *is_user = (addr < VM_MAXUSER_ADDRESS);
 6078         return (0);
 6079 }
 6080 
 6081 static boolean_t
 6082 mmu_radix_dev_direct_mapped(vm_paddr_t pa, vm_size_t size)
 6083 {
 6084 
 6085         CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size);
 6086         return (mem_valid(pa, size));
 6087 }
 6088 
 6089 static void
 6090 mmu_radix_scan_init(void)
 6091 {
 6092 
 6093         CTR1(KTR_PMAP, "%s()", __func__);
 6094         UNIMPLEMENTED();
 6095 }
 6096 
 6097 static void
 6098 mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz,
 6099         void **va)
 6100 {
 6101         CTR4(KTR_PMAP, "%s(%#jx, %#zx, %p)", __func__, (uintmax_t)pa, sz, va);
 6102         UNIMPLEMENTED();
 6103 }
 6104 
 6105 vm_offset_t
 6106 mmu_radix_quick_enter_page(vm_page_t m)
 6107 {
 6108         vm_paddr_t paddr;
 6109 
 6110         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
 6111         paddr = VM_PAGE_TO_PHYS(m);
 6112         return (PHYS_TO_DMAP(paddr));
 6113 }
 6114 
 6115 void
 6116 mmu_radix_quick_remove_page(vm_offset_t addr __unused)
 6117 {
 6118         /* no work to do here */
 6119         CTR2(KTR_PMAP, "%s(%#x)", __func__, addr);
 6120 }
 6121 
 6122 static void
 6123 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
 6124 {
 6125         cpu_flush_dcache((void *)sva, eva - sva);
 6126 }
 6127 
 6128 int
 6129 mmu_radix_change_attr(vm_offset_t va, vm_size_t size,
 6130     vm_memattr_t mode)
 6131 {
 6132         int error;
 6133 
 6134         CTR4(KTR_PMAP, "%s(%#x, %#zx, %d)", __func__, va, size, mode);
 6135         PMAP_LOCK(kernel_pmap);
 6136         error = pmap_change_attr_locked(va, size, mode, true);
 6137         PMAP_UNLOCK(kernel_pmap);
 6138         return (error);
 6139 }
 6140 
 6141 static int
 6142 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush)
 6143 {
 6144         vm_offset_t base, offset, tmpva;
 6145         vm_paddr_t pa_start, pa_end, pa_end1;
 6146         pml2_entry_t *l2e;
 6147         pml3_entry_t *l3e;
 6148         pt_entry_t *pte;
 6149         int cache_bits, error;
 6150         boolean_t changed;
 6151 
 6152         PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
 6153         base = trunc_page(va);
 6154         offset = va & PAGE_MASK;
 6155         size = round_page(offset + size);
 6156 
 6157         /*
 6158          * Only supported on kernel virtual addresses, including the direct
 6159          * map but excluding the recursive map.
 6160          */
 6161         if (base < DMAP_MIN_ADDRESS)
 6162                 return (EINVAL);
 6163 
 6164         cache_bits = pmap_cache_bits(mode);
 6165         changed = FALSE;
 6166 
 6167         /*
 6168          * Pages that aren't mapped aren't supported.  Also break down 2MB pages
 6169          * into 4KB pages if required.
 6170          */
 6171         for (tmpva = base; tmpva < base + size; ) {
 6172                 l2e = pmap_pml2e(kernel_pmap, tmpva);
 6173                 if (l2e == NULL || *l2e == 0)
 6174                         return (EINVAL);
 6175                 if (be64toh(*l2e) & RPTE_LEAF) {
 6176                         /*
 6177                          * If the current 1GB page already has the required
 6178                          * memory type, then we need not demote this page. Just
 6179                          * increment tmpva to the next 1GB page frame.
 6180                          */
 6181                         if ((be64toh(*l2e) & RPTE_ATTR_MASK) == cache_bits) {
 6182                                 tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE;
 6183                                 continue;
 6184                         }
 6185 
 6186                         /*
 6187                          * If the current offset aligns with a 1GB page frame
 6188                          * and there is at least 1GB left within the range, then
 6189                          * we need not break down this page into 2MB pages.
 6190                          */
 6191                         if ((tmpva & L2_PAGE_MASK) == 0 &&
 6192                             tmpva + L2_PAGE_MASK < base + size) {
 6193                                 tmpva += L2_PAGE_MASK;
 6194                                 continue;
 6195                         }
 6196                         if (!pmap_demote_l2e(kernel_pmap, l2e, tmpva))
 6197                                 return (ENOMEM);
 6198                 }
 6199                 l3e = pmap_l2e_to_l3e(l2e, tmpva);
 6200                 KASSERT(l3e != NULL, ("no l3e entry for %#lx in %p\n",
 6201                     tmpva, l2e));
 6202                 if (*l3e == 0)
 6203                         return (EINVAL);
 6204                 if (be64toh(*l3e) & RPTE_LEAF) {
 6205                         /*
 6206                          * If the current 2MB page already has the required
 6207                          * memory type, then we need not demote this page. Just
 6208                          * increment tmpva to the next 2MB page frame.
 6209                          */
 6210                         if ((be64toh(*l3e) & RPTE_ATTR_MASK) == cache_bits) {
 6211                                 tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE;
 6212                                 continue;
 6213                         }
 6214 
 6215                         /*
 6216                          * If the current offset aligns with a 2MB page frame
 6217                          * and there is at least 2MB left within the range, then
 6218                          * we need not break down this page into 4KB pages.
 6219                          */
 6220                         if ((tmpva & L3_PAGE_MASK) == 0 &&
 6221                             tmpva + L3_PAGE_MASK < base + size) {
 6222                                 tmpva += L3_PAGE_SIZE;
 6223                                 continue;
 6224                         }
 6225                         if (!pmap_demote_l3e(kernel_pmap, l3e, tmpva))
 6226                                 return (ENOMEM);
 6227                 }
 6228                 pte = pmap_l3e_to_pte(l3e, tmpva);
 6229                 if (*pte == 0)
 6230                         return (EINVAL);
 6231                 tmpva += PAGE_SIZE;
 6232         }
 6233         error = 0;
 6234 
 6235         /*
 6236          * Ok, all the pages exist, so run through them updating their
 6237          * cache mode if required.
 6238          */
 6239         pa_start = pa_end = 0;
 6240         for (tmpva = base; tmpva < base + size; ) {
 6241                 l2e = pmap_pml2e(kernel_pmap, tmpva);
 6242                 if (be64toh(*l2e) & RPTE_LEAF) {
 6243                         if ((be64toh(*l2e) & RPTE_ATTR_MASK) != cache_bits) {
 6244                                 pmap_pte_attr(l2e, cache_bits,
 6245                                     RPTE_ATTR_MASK);
 6246                                 changed = TRUE;
 6247                         }
 6248                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
 6249                             (*l2e & PG_PS_FRAME) < dmaplimit) {
 6250                                 if (pa_start == pa_end) {
 6251                                         /* Start physical address run. */
 6252                                         pa_start = be64toh(*l2e) & PG_PS_FRAME;
 6253                                         pa_end = pa_start + L2_PAGE_SIZE;
 6254                                 } else if (pa_end == (be64toh(*l2e) & PG_PS_FRAME))
 6255                                         pa_end += L2_PAGE_SIZE;
 6256                                 else {
 6257                                         /* Run ended, update direct map. */
 6258                                         error = pmap_change_attr_locked(
 6259                                             PHYS_TO_DMAP(pa_start),
 6260                                             pa_end - pa_start, mode, flush);
 6261                                         if (error != 0)
 6262                                                 break;
 6263                                         /* Start physical address run. */
 6264                                         pa_start = be64toh(*l2e) & PG_PS_FRAME;
 6265                                         pa_end = pa_start + L2_PAGE_SIZE;
 6266                                 }
 6267                         }
 6268                         tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE;
 6269                         continue;
 6270                 }
 6271                 l3e = pmap_l2e_to_l3e(l2e, tmpva);
 6272                 if (be64toh(*l3e) & RPTE_LEAF) {
 6273                         if ((be64toh(*l3e) & RPTE_ATTR_MASK) != cache_bits) {
 6274                                 pmap_pte_attr(l3e, cache_bits,
 6275                                     RPTE_ATTR_MASK);
 6276                                 changed = TRUE;
 6277                         }
 6278                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
 6279                             (be64toh(*l3e) & PG_PS_FRAME) < dmaplimit) {
 6280                                 if (pa_start == pa_end) {
 6281                                         /* Start physical address run. */
 6282                                         pa_start = be64toh(*l3e) & PG_PS_FRAME;
 6283                                         pa_end = pa_start + L3_PAGE_SIZE;
 6284                                 } else if (pa_end == (be64toh(*l3e) & PG_PS_FRAME))
 6285                                         pa_end += L3_PAGE_SIZE;
 6286                                 else {
 6287                                         /* Run ended, update direct map. */
 6288                                         error = pmap_change_attr_locked(
 6289                                             PHYS_TO_DMAP(pa_start),
 6290                                             pa_end - pa_start, mode, flush);
 6291                                         if (error != 0)
 6292                                                 break;
 6293                                         /* Start physical address run. */
 6294                                         pa_start = be64toh(*l3e) & PG_PS_FRAME;
 6295                                         pa_end = pa_start + L3_PAGE_SIZE;
 6296                                 }
 6297                         }
 6298                         tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE;
 6299                 } else {
 6300                         pte = pmap_l3e_to_pte(l3e, tmpva);
 6301                         if ((be64toh(*pte) & RPTE_ATTR_MASK) != cache_bits) {
 6302                                 pmap_pte_attr(pte, cache_bits,
 6303                                     RPTE_ATTR_MASK);
 6304                                 changed = TRUE;
 6305                         }
 6306                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
 6307                             (be64toh(*pte) & PG_FRAME) < dmaplimit) {
 6308                                 if (pa_start == pa_end) {
 6309                                         /* Start physical address run. */
 6310                                         pa_start = be64toh(*pte) & PG_FRAME;
 6311                                         pa_end = pa_start + PAGE_SIZE;
 6312                                 } else if (pa_end == (be64toh(*pte) & PG_FRAME))
 6313                                         pa_end += PAGE_SIZE;
 6314                                 else {
 6315                                         /* Run ended, update direct map. */
 6316                                         error = pmap_change_attr_locked(
 6317                                             PHYS_TO_DMAP(pa_start),
 6318                                             pa_end - pa_start, mode, flush);
 6319                                         if (error != 0)
 6320                                                 break;
 6321                                         /* Start physical address run. */
 6322                                         pa_start = be64toh(*pte) & PG_FRAME;
 6323                                         pa_end = pa_start + PAGE_SIZE;
 6324                                 }
 6325                         }
 6326                         tmpva += PAGE_SIZE;
 6327                 }
 6328         }
 6329         if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
 6330                 pa_end1 = MIN(pa_end, dmaplimit);
 6331                 if (pa_start != pa_end1)
 6332                         error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
 6333                             pa_end1 - pa_start, mode, flush);
 6334         }
 6335 
 6336         /*
 6337          * Flush CPU caches if required to make sure any data isn't cached that
 6338          * shouldn't be, etc.
 6339          */
 6340         if (changed) {
 6341                 pmap_invalidate_all(kernel_pmap);
 6342 
 6343                 if (flush)
 6344                         pmap_invalidate_cache_range(base, tmpva);
 6345         }
 6346         return (error);
 6347 }
 6348 
 6349 /*
 6350  * Allocate physical memory for the vm_page array and map it into KVA,
 6351  * attempting to back the vm_pages with domain-local memory.
 6352  */
 6353 void
 6354 mmu_radix_page_array_startup(long pages)
 6355 {
 6356 #ifdef notyet
 6357         pml2_entry_t *l2e;
 6358         pml3_entry_t *pde;
 6359         pml3_entry_t newl3;
 6360         vm_offset_t va;
 6361         long pfn;
 6362         int domain, i;
 6363 #endif
 6364         vm_paddr_t pa;
 6365         vm_offset_t start, end;
 6366 
 6367         vm_page_array_size = pages;
 6368 
 6369         start = VM_MIN_KERNEL_ADDRESS;
 6370         end = start + pages * sizeof(struct vm_page);
 6371 
 6372         pa = vm_phys_early_alloc(0, end - start);
 6373 
 6374         start = mmu_radix_map(&start, pa, end - start, VM_MEMATTR_DEFAULT);
 6375 #ifdef notyet
 6376         /* TODO: NUMA vm_page_array.  Blocked out until then (copied from amd64). */
 6377         for (va = start; va < end; va += L3_PAGE_SIZE) {
 6378                 pfn = first_page + (va - start) / sizeof(struct vm_page);
 6379                 domain = vm_phys_domain(ptoa(pfn));
 6380                 l2e = pmap_pml2e(kernel_pmap, va);
 6381                 if ((be64toh(*l2e) & PG_V) == 0) {
 6382                         pa = vm_phys_early_alloc(domain, PAGE_SIZE);
 6383                         dump_add_page(pa);
 6384                         pagezero(PHYS_TO_DMAP(pa));
 6385                         pde_store(l2e, (pml2_entry_t)pa);
 6386                 }
 6387                 pde = pmap_l2e_to_l3e(l2e, va);
 6388                 if ((be64toh(*pde) & PG_V) != 0)
 6389                         panic("Unexpected pde %p", pde);
 6390                 pa = vm_phys_early_alloc(domain, L3_PAGE_SIZE);
 6391                 for (i = 0; i < NPDEPG; i++)
 6392                         dump_add_page(pa + i * PAGE_SIZE);
 6393                 newl3 = (pml3_entry_t)(pa | RPTE_EAA_P | RPTE_EAA_R | RPTE_EAA_W);
 6394                 pte_store(pde, newl3);
 6395         }
 6396 #endif
 6397         vm_page_array = (vm_page_t)start;
 6398 }
 6399 
 6400 #ifdef DDB
 6401 #include <sys/kdb.h>
 6402 #include <ddb/ddb.h>
 6403 
 6404 static void
 6405 pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va)
 6406 {
 6407         pml1_entry_t *l1e;
 6408         pml2_entry_t *l2e;
 6409         pml3_entry_t *l3e;
 6410         pt_entry_t *pte;
 6411 
 6412         l1e = &l1[pmap_pml1e_index(va)];
 6413         db_printf("VA %#016lx l1e %#016lx", va, be64toh(*l1e));
 6414         if ((be64toh(*l1e) & PG_V) == 0) {
 6415                 db_printf("\n");
 6416                 return;
 6417         }
 6418         l2e = pmap_l1e_to_l2e(l1e, va);
 6419         db_printf(" l2e %#016lx", be64toh(*l2e));
 6420         if ((be64toh(*l2e) & PG_V) == 0 || (be64toh(*l2e) & RPTE_LEAF) != 0) {
 6421                 db_printf("\n");
 6422                 return;
 6423         }
 6424         l3e = pmap_l2e_to_l3e(l2e, va);
 6425         db_printf(" l3e %#016lx", be64toh(*l3e));
 6426         if ((be64toh(*l3e) & PG_V) == 0 || (be64toh(*l3e) & RPTE_LEAF) != 0) {
 6427                 db_printf("\n");
 6428                 return;
 6429         }
 6430         pte = pmap_l3e_to_pte(l3e, va);
 6431         db_printf(" pte %#016lx\n", be64toh(*pte));
 6432 }
 6433 
 6434 void
 6435 pmap_page_print_mappings(vm_page_t m)
 6436 {
 6437         pmap_t pmap;
 6438         pv_entry_t pv;
 6439 
 6440         db_printf("page %p(%lx)\n", m, m->phys_addr);
 6441         /* need to elide locks if running in ddb */
 6442         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
 6443                 db_printf("pv: %p ", pv);
 6444                 db_printf("va: %#016lx ", pv->pv_va);
 6445                 pmap = PV_PMAP(pv);
 6446                 db_printf("pmap %p  ", pmap);
 6447                 if (pmap != NULL) {
 6448                         db_printf("asid: %lu\n", pmap->pm_pid);
 6449                         pmap_pte_walk(pmap->pm_pml1, pv->pv_va);
 6450                 }
 6451         }
 6452 }
 6453 
 6454 DB_SHOW_COMMAND(pte, pmap_print_pte)
 6455 {
 6456         vm_offset_t va;
 6457         pmap_t pmap;
 6458 
 6459         if (!have_addr) {
 6460                 db_printf("show pte addr\n");
 6461                 return;
 6462         }
 6463         va = (vm_offset_t)addr;
 6464 
 6465         if (va >= DMAP_MIN_ADDRESS)
 6466                 pmap = kernel_pmap;
 6467         else if (kdb_thread != NULL)
 6468                 pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
 6469         else
 6470                 pmap = vmspace_pmap(curthread->td_proc->p_vmspace);
 6471 
 6472         pmap_pte_walk(pmap->pm_pml1, va);
 6473 }
 6474 
 6475 #endif

Cache object: af8e0c182b88ad03311107bcf7839987


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.