FreeBSD/Linux Kernel Cross Reference
sys/powerpc/aim/mmu_oea64.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2008-2015 Nathan Whitehorn
    5  * All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  *
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  *
   17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
   18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
   20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
   21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
   22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
   26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   27  */
   28 
   29 #include <sys/cdefs.h>
   30 __FBSDID("$FreeBSD$");
   31 
   32 /*
   33  * Manages physical address maps.
   34  *
   35  * Since the information managed by this module is also stored by the
   36  * logical address mapping module, this module may throw away valid virtual
   37  * to physical mappings at almost any time.  However, invalidations of
   38  * mappings must be done as requested.
   39  *
   40  * In order to cope with hardware architectures which make virtual to
   41  * physical map invalidates expensive, this module may delay invalidate
    42  * or reduced-protection operations until such time as they are actually
   43  * necessary.  This module is given full information as to which processors
   44  * are currently using which maps, and to when physical maps must be made
   45  * correct.
   46  */
   47 
   48 #include "opt_kstack_pages.h"
   49 
   50 #include <sys/param.h>
   51 #include <sys/kernel.h>
   52 #include <sys/conf.h>
   53 #include <sys/queue.h>
   54 #include <sys/cpuset.h>
   55 #include <sys/kerneldump.h>
   56 #include <sys/ktr.h>
   57 #include <sys/lock.h>
   58 #include <sys/msgbuf.h>
   59 #include <sys/malloc.h>
   60 #include <sys/mman.h>
   61 #include <sys/mutex.h>
   62 #include <sys/proc.h>
   63 #include <sys/rwlock.h>
   64 #include <sys/sched.h>
   65 #include <sys/sysctl.h>
   66 #include <sys/systm.h>
   67 #include <sys/vmmeter.h>
   68 #include <sys/smp.h>
   69 #include <sys/reboot.h>
   70 
   71 #include <sys/kdb.h>
   72 
   73 #include <dev/ofw/openfirm.h>
   74 
   75 #include <vm/vm.h>
   76 #include <vm/pmap.h>
   77 #include <vm/vm_param.h>
   78 #include <vm/vm_kern.h>
   79 #include <vm/vm_page.h>
   80 #include <vm/vm_phys.h>
   81 #include <vm/vm_map.h>
   82 #include <vm/vm_object.h>
   83 #include <vm/vm_extern.h>
   84 #include <vm/vm_pageout.h>
   85 #include <vm/vm_dumpset.h>
   86 #include <vm/vm_reserv.h>
   87 #include <vm/uma.h>
   88 
   89 #include <machine/_inttypes.h>
   90 #include <machine/cpu.h>
   91 #include <machine/ifunc.h>
   92 #include <machine/platform.h>
   93 #include <machine/frame.h>
   94 #include <machine/md_var.h>
   95 #include <machine/psl.h>
   96 #include <machine/bat.h>
   97 #include <machine/hid.h>
   98 #include <machine/pte.h>
   99 #include <machine/sr.h>
  100 #include <machine/trap.h>
  101 #include <machine/mmuvar.h>
  102 
  103 #include "mmu_oea64.h"
  104 
  105 void moea64_release_vsid(uint64_t vsid);
  106 uintptr_t moea64_get_unique_vsid(void);
  107 
  108 #define DISABLE_TRANS(msr)      msr = mfmsr(); mtmsr(msr & ~PSL_DR)
  109 #define ENABLE_TRANS(msr)       mtmsr(msr)
  110 
  111 #define VSID_MAKE(sr, hash)     ((sr) | (((hash) & 0xfffff) << 4))
  112 #define VSID_TO_HASH(vsid)      (((vsid) >> 4) & 0xfffff)
  113 #define VSID_HASH_MASK          0x0000007fffffffffULL
  114 
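      /*
       * For example, VSID_MAKE(0x3, 0x12345) == (0x12345 << 4) | 0x3
       * == 0x123453, and VSID_TO_HASH(0x123453) recovers 0x12345: the low
       * four bits carry the segment register number and the next 20 bits
       * the VSID hash.
       */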
  115 /*
  116  * Locking semantics:
  117  *
  118  * There are two locks of interest: the page locks and the pmap locks, which
  119  * protect their individual PVO lists and are locked in that order. The contents
  120  * of all PVO entries are protected by the locks of their respective pmaps.
  121  * The pmap of any PVO is guaranteed not to change so long as the PVO is linked
  122  * into any list.
  123  *
  124  */
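      /*
       * A typical use of these rules when walking a page's PV list looks
       * like (sketch):
       *
       *      PV_PAGE_LOCK(m);
       *      for each pvo on the page's PV list:
       *              PMAP_LOCK(pvo->pvo_pmap);
       *              ... examine or modify the PVO ...
       *              PMAP_UNLOCK(pvo->pvo_pmap);
       *      PV_PAGE_UNLOCK(m);
       */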
  125 
  126 #define PV_LOCK_COUNT   PA_LOCK_COUNT
  127 static struct mtx_padalign pv_lock[PV_LOCK_COUNT];
  128 
  129 /*
  130  * Cheap NUMA-izing of the pv locks, to reduce contention across domains.
  131  * NUMA domains on POWER9 appear to be indexed as sparse memory spaces, with the
  132  * index at (N << 45).
  133  */
  134 #ifdef __powerpc64__
  135 #define PV_LOCK_IDX(pa) ((pa_index(pa) * (((pa) >> 45) + 1)) % PV_LOCK_COUNT)
  136 #else
  137 #define PV_LOCK_IDX(pa) (pa_index(pa) % PV_LOCK_COUNT)
  138 #endif
  139 #define PV_LOCKPTR(pa)  ((struct mtx *)(&pv_lock[PV_LOCK_IDX(pa)]))
  140 #define PV_LOCK(pa)             mtx_lock(PV_LOCKPTR(pa))
  141 #define PV_UNLOCK(pa)           mtx_unlock(PV_LOCKPTR(pa))
  142 #define PV_LOCKASSERT(pa)       mtx_assert(PV_LOCKPTR(pa), MA_OWNED)
  143 #define PV_PAGE_LOCK(m)         PV_LOCK(VM_PAGE_TO_PHYS(m))
  144 #define PV_PAGE_UNLOCK(m)       PV_UNLOCK(VM_PAGE_TO_PHYS(m))
  145 #define PV_PAGE_LOCKASSERT(m)   PV_LOCKASSERT(VM_PAGE_TO_PHYS(m))
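      /*
       * In the __powerpc64__ variant, a single-domain machine has
       * ((pa) >> 45) == 0 everywhere, so PV_LOCK_IDX() reduces to
       * pa_index(pa) % PV_LOCK_COUNT; with multiple domains the (N + 1)
       * factor spreads pages from different domains across different
       * lock buckets.
       */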
  146 
  147 /* Superpage PV lock */
  148 
  149 #define PV_LOCK_SIZE            (1<<PDRSHIFT)
  150 
  151 static __always_inline void
  152 moea64_sp_pv_lock(vm_paddr_t pa)
  153 {
  154         vm_paddr_t pa_end;
  155 
  156         /* Note: breaking when pa_end is reached to avoid overflows */
  157         pa_end = pa + (HPT_SP_SIZE - PV_LOCK_SIZE);
  158         for (;;) {
  159                 mtx_lock_flags(PV_LOCKPTR(pa), MTX_DUPOK);
  160                 if (pa == pa_end)
  161                         break;
  162                 pa += PV_LOCK_SIZE;
  163         }
  164 }
  165 
  166 static __always_inline void
  167 moea64_sp_pv_unlock(vm_paddr_t pa)
  168 {
  169         vm_paddr_t pa_end;
  170 
  171         /* Note: breaking when pa_end is reached to avoid overflows */
  172         pa_end = pa;
  173         pa += HPT_SP_SIZE - PV_LOCK_SIZE;
  174         for (;;) {
  175                 mtx_unlock_flags(PV_LOCKPTR(pa), MTX_DUPOK);
  176                 if (pa == pa_end)
  177                         break;
  178                 pa -= PV_LOCK_SIZE;
  179         }
  180 }
  181 
  182 #define SP_PV_LOCK_ALIGNED(pa)          moea64_sp_pv_lock(pa)
  183 #define SP_PV_UNLOCK_ALIGNED(pa)        moea64_sp_pv_unlock(pa)
  184 #define SP_PV_LOCK(pa)                  moea64_sp_pv_lock((pa) & ~HPT_SP_MASK)
  185 #define SP_PV_UNLOCK(pa)                moea64_sp_pv_unlock((pa) & ~HPT_SP_MASK)
  186 #define SP_PV_PAGE_LOCK(m)              SP_PV_LOCK(VM_PAGE_TO_PHYS(m))
  187 #define SP_PV_PAGE_UNLOCK(m)            SP_PV_UNLOCK(VM_PAGE_TO_PHYS(m))
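      /*
       * SP_PV_LOCK() therefore acquires every pv_lock covering an
       * HPT_SP_SIZE (superpage-sized) physical range, stepping upward in
       * PV_LOCK_SIZE increments, and SP_PV_UNLOCK() releases them in the
       * reverse order.  MTX_DUPOK is needed because different addresses
       * in the range may hash to the same pv_lock[] entry.
       */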
  188 
  189 struct ofw_map {
  190         cell_t  om_va;
  191         cell_t  om_len;
  192         uint64_t om_pa;
  193         cell_t  om_mode;
  194 };
  195 
  196 extern unsigned char _etext[];
  197 extern unsigned char _end[];
  198 
  199 extern void *slbtrap, *slbtrapend;
  200 
  201 /*
  202  * Map of physical memory regions.
  203  */
  204 static struct   mem_region *regions;
  205 static struct   mem_region *pregions;
  206 static struct   numa_mem_region *numa_pregions;
  207 static u_int    phys_avail_count;
  208 static int      regions_sz, pregions_sz, numapregions_sz;
  209 
  210 extern void bs_remap_earlyboot(void);
  211 
  212 /*
  213  * Lock for the SLB tables.
  214  */
  215 struct mtx      moea64_slb_mutex;
  216 
  217 /*
  218  * PTEG data.
  219  */
  220 u_long          moea64_pteg_count;
  221 u_long          moea64_pteg_mask;
  222 
  223 /*
  224  * PVO data.
  225  */
  226 
  227 uma_zone_t      moea64_pvo_zone; /* zone for pvo entries */
  228 
  229 static struct   pvo_entry *moea64_bpvo_pool;
  230 static int      moea64_bpvo_pool_index = 0;
  231 static int      moea64_bpvo_pool_size = 0;
  232 SYSCTL_INT(_machdep, OID_AUTO, moea64_allocated_bpvo_entries, CTLFLAG_RD,
  233     &moea64_bpvo_pool_index, 0, "");
  234 
  235 #define BPVO_POOL_SIZE  327680 /* Sensible historical default value */
  236 #define BPVO_POOL_EXPANSION_FACTOR      3
  237 #define VSID_NBPW       (sizeof(u_int32_t) * 8)
  238 #ifdef __powerpc64__
  239 #define NVSIDS          (NPMAPS * 16)
  240 #define VSID_HASHMASK   0xffffffffUL
  241 #else
  242 #define NVSIDS          NPMAPS
  243 #define VSID_HASHMASK   0xfffffUL
  244 #endif
  245 static u_int    moea64_vsid_bitmap[NVSIDS / VSID_NBPW];
  246 
  247 static boolean_t moea64_initialized = FALSE;
  248 
  249 #ifdef MOEA64_STATS
  250 /*
  251  * Statistics.
  252  */
  253 u_int   moea64_pte_valid = 0;
  254 u_int   moea64_pte_overflow = 0;
  255 u_int   moea64_pvo_entries = 0;
  256 u_int   moea64_pvo_enter_calls = 0;
  257 u_int   moea64_pvo_remove_calls = 0;
  258 SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_valid, CTLFLAG_RD,
  259     &moea64_pte_valid, 0, "");
  260 SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_overflow, CTLFLAG_RD,
  261     &moea64_pte_overflow, 0, "");
  262 SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_entries, CTLFLAG_RD,
  263     &moea64_pvo_entries, 0, "");
  264 SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_enter_calls, CTLFLAG_RD,
  265     &moea64_pvo_enter_calls, 0, "");
  266 SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_remove_calls, CTLFLAG_RD,
  267     &moea64_pvo_remove_calls, 0, "");
  268 #endif
  269 
  270 vm_offset_t     moea64_scratchpage_va[2];
  271 struct pvo_entry *moea64_scratchpage_pvo[2];
  272 struct  mtx     moea64_scratchpage_mtx;
  273 
  274 uint64_t        moea64_large_page_mask = 0;
  275 uint64_t        moea64_large_page_size = 0;
  276 int             moea64_large_page_shift = 0;
  277 bool            moea64_has_lp_4k_16m = false;
  278 
  279 /*
  280  * PVO calls.
  281  */
  282 static int      moea64_pvo_enter(struct pvo_entry *pvo,
  283                     struct pvo_head *pvo_head, struct pvo_entry **oldpvo);
  284 static void     moea64_pvo_remove_from_pmap(struct pvo_entry *pvo);
  285 static void     moea64_pvo_remove_from_page(struct pvo_entry *pvo);
  286 static void     moea64_pvo_remove_from_page_locked(
  287                     struct pvo_entry *pvo, vm_page_t m);
  288 static struct   pvo_entry *moea64_pvo_find_va(pmap_t, vm_offset_t);
  289 
  290 /*
  291  * Utility routines.
  292  */
  293 static boolean_t        moea64_query_bit(vm_page_t, uint64_t);
  294 static u_int            moea64_clear_bit(vm_page_t, uint64_t);
  295 static void             moea64_kremove(vm_offset_t);
  296 static void             moea64_syncicache(pmap_t pmap, vm_offset_t va,
  297                             vm_paddr_t pa, vm_size_t sz);
  298 static void             moea64_pmap_init_qpages(void);
  299 static void             moea64_remove_locked(pmap_t, vm_offset_t,
  300                             vm_offset_t, struct pvo_dlist *);
  301 
  302 /*
  303  * Superpages data and routines.
  304  */
  305 
  306 /*
  307  * PVO flags (in vaddr) that must match for promotion to succeed.
  308  * Note that protection bits are checked separately, as they reside in
  309  * another field.
  310  */
  311 #define PVO_FLAGS_PROMOTE       (PVO_WIRED | PVO_MANAGED | PVO_PTEGIDX_VALID)
  312 
  313 #define PVO_IS_SP(pvo)          (((pvo)->pvo_vaddr & PVO_LARGE) && \
  314                                  (pvo)->pvo_pmap != kernel_pmap)
  315 
  316 /* Get physical address from PVO. */
  317 #define PVO_PADDR(pvo)          moea64_pvo_paddr(pvo)
  318 
  319 /* MD page flag indicating that the page is a superpage. */
  320 #define MDPG_ATTR_SP            0x40000000
  321 
  322 SYSCTL_DECL(_vm_pmap);
  323 
  324 static SYSCTL_NODE(_vm_pmap, OID_AUTO, sp, CTLFLAG_RD, 0,
  325     "SP page mapping counters");
  326 
  327 static u_long sp_demotions;
  328 SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, demotions, CTLFLAG_RD,
  329     &sp_demotions, 0, "SP page demotions");
  330 
  331 static u_long sp_mappings;
  332 SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, mappings, CTLFLAG_RD,
  333     &sp_mappings, 0, "SP page mappings");
  334 
  335 static u_long sp_p_failures;
  336 SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_failures, CTLFLAG_RD,
  337     &sp_p_failures, 0, "SP page promotion failures");
  338 
  339 static u_long sp_p_fail_pa;
  340 SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_pa, CTLFLAG_RD,
  341     &sp_p_fail_pa, 0, "SP page promotion failure: PAs don't match");
  342 
  343 static u_long sp_p_fail_flags;
  344 SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_flags, CTLFLAG_RD,
  345     &sp_p_fail_flags, 0, "SP page promotion failure: page flags don't match");
  346 
  347 static u_long sp_p_fail_prot;
  348 SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_prot, CTLFLAG_RD,
  349     &sp_p_fail_prot, 0,
  350     "SP page promotion failure: page protections don't match");
  351 
  352 static u_long sp_p_fail_wimg;
  353 SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_wimg, CTLFLAG_RD,
  354     &sp_p_fail_wimg, 0, "SP page promotion failure: WIMG bits don't match");
  355 
  356 static u_long sp_promotions;
  357 SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, promotions, CTLFLAG_RD,
  358     &sp_promotions, 0, "SP page promotions");
  359 
  360 static bool moea64_ps_enabled(pmap_t);
  361 static void moea64_align_superpage(vm_object_t, vm_ooffset_t,
  362     vm_offset_t *, vm_size_t);
  363 
  364 static int moea64_sp_enter(pmap_t pmap, vm_offset_t va,
  365     vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind);
  366 static struct pvo_entry *moea64_sp_remove(struct pvo_entry *sp,
  367     struct pvo_dlist *tofree);
  368 
  369 static void moea64_sp_promote(pmap_t pmap, vm_offset_t va, vm_page_t m);
  370 static void moea64_sp_demote_aligned(struct pvo_entry *sp);
  371 static void moea64_sp_demote(struct pvo_entry *pvo);
  372 
  373 static struct pvo_entry *moea64_sp_unwire(struct pvo_entry *sp);
  374 static struct pvo_entry *moea64_sp_protect(struct pvo_entry *sp,
  375     vm_prot_t prot);
  376 
  377 static int64_t moea64_sp_query(struct pvo_entry *pvo, uint64_t ptebit);
  378 static int64_t moea64_sp_clear(struct pvo_entry *pvo, vm_page_t m,
  379     uint64_t ptebit);
  380 
  381 static __inline bool moea64_sp_pvo_in_range(struct pvo_entry *pvo,
  382     vm_offset_t sva, vm_offset_t eva);
  383 
  384 /*
  385  * Kernel MMU interface
  386  */
  387 void moea64_clear_modify(vm_page_t);
  388 void moea64_copy_page(vm_page_t, vm_page_t);
  389 void moea64_copy_page_dmap(vm_page_t, vm_page_t);
  390 void moea64_copy_pages(vm_page_t *ma, vm_offset_t a_offset,
  391     vm_page_t *mb, vm_offset_t b_offset, int xfersize);
  392 void moea64_copy_pages_dmap(vm_page_t *ma, vm_offset_t a_offset,
  393     vm_page_t *mb, vm_offset_t b_offset, int xfersize);
  394 int moea64_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t,
  395     u_int flags, int8_t psind);
  396 void moea64_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t,
  397     vm_prot_t);
  398 void moea64_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t);
  399 vm_paddr_t moea64_extract(pmap_t, vm_offset_t);
  400 vm_page_t moea64_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t);
  401 void moea64_init(void);
  402 boolean_t moea64_is_modified(vm_page_t);
  403 boolean_t moea64_is_prefaultable(pmap_t, vm_offset_t);
  404 boolean_t moea64_is_referenced(vm_page_t);
  405 int moea64_ts_referenced(vm_page_t);
  406 vm_offset_t moea64_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int);
  407 boolean_t moea64_page_exists_quick(pmap_t, vm_page_t);
  408 void moea64_page_init(vm_page_t);
  409 int moea64_page_wired_mappings(vm_page_t);
  410 int moea64_pinit(pmap_t);
  411 void moea64_pinit0(pmap_t);
  412 void moea64_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
  413 void moea64_qenter(vm_offset_t, vm_page_t *, int);
  414 void moea64_qremove(vm_offset_t, int);
  415 void moea64_release(pmap_t);
  416 void moea64_remove(pmap_t, vm_offset_t, vm_offset_t);
  417 void moea64_remove_pages(pmap_t);
  418 void moea64_remove_all(vm_page_t);
  419 void moea64_remove_write(vm_page_t);
  420 void moea64_unwire(pmap_t, vm_offset_t, vm_offset_t);
  421 void moea64_zero_page(vm_page_t);
  422 void moea64_zero_page_dmap(vm_page_t);
  423 void moea64_zero_page_area(vm_page_t, int, int);
  424 void moea64_activate(struct thread *);
  425 void moea64_deactivate(struct thread *);
  426 void *moea64_mapdev(vm_paddr_t, vm_size_t);
  427 void *moea64_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t);
  428 void moea64_unmapdev(void *, vm_size_t);
  429 vm_paddr_t moea64_kextract(vm_offset_t);
  430 void moea64_page_set_memattr(vm_page_t m, vm_memattr_t ma);
  431 void moea64_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t ma);
  432 void moea64_kenter(vm_offset_t, vm_paddr_t);
  433 boolean_t moea64_dev_direct_mapped(vm_paddr_t, vm_size_t);
  434 static void moea64_sync_icache(pmap_t, vm_offset_t, vm_size_t);
  435 void moea64_dumpsys_map(vm_paddr_t pa, size_t sz,
  436     void **va);
  437 void moea64_scan_init(void);
  438 vm_offset_t moea64_quick_enter_page(vm_page_t m);
  439 vm_offset_t moea64_quick_enter_page_dmap(vm_page_t m);
  440 void moea64_quick_remove_page(vm_offset_t addr);
  441 boolean_t moea64_page_is_mapped(vm_page_t m);
  442 static int moea64_map_user_ptr(pmap_t pm,
  443     volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen);
  444 static int moea64_decode_kernel_ptr(vm_offset_t addr,
  445     int *is_user, vm_offset_t *decoded_addr);
  446 static size_t moea64_scan_pmap(struct bitset *dump_bitset);
  447 static void *moea64_dump_pmap_init(unsigned blkpgs);
  448 #ifdef __powerpc64__
  449 static void moea64_page_array_startup(long);
  450 #endif
  451 static int moea64_mincore(pmap_t, vm_offset_t, vm_paddr_t *);
  452 
  453 static struct pmap_funcs moea64_methods = {
  454         .clear_modify = moea64_clear_modify,
  455         .copy_page = moea64_copy_page,
  456         .copy_pages = moea64_copy_pages,
  457         .enter = moea64_enter,
  458         .enter_object = moea64_enter_object,
  459         .enter_quick = moea64_enter_quick,
  460         .extract = moea64_extract,
  461         .extract_and_hold = moea64_extract_and_hold,
  462         .init = moea64_init,
  463         .is_modified = moea64_is_modified,
  464         .is_prefaultable = moea64_is_prefaultable,
  465         .is_referenced = moea64_is_referenced,
  466         .ts_referenced = moea64_ts_referenced,
  467         .map =                  moea64_map,
  468         .mincore = moea64_mincore,
  469         .page_exists_quick = moea64_page_exists_quick,
  470         .page_init = moea64_page_init,
  471         .page_wired_mappings = moea64_page_wired_mappings,
  472         .pinit = moea64_pinit,
  473         .pinit0 = moea64_pinit0,
  474         .protect = moea64_protect,
  475         .qenter = moea64_qenter,
  476         .qremove = moea64_qremove,
  477         .release = moea64_release,
  478         .remove = moea64_remove,
  479         .remove_pages = moea64_remove_pages,
  480         .remove_all =           moea64_remove_all,
  481         .remove_write = moea64_remove_write,
  482         .sync_icache = moea64_sync_icache,
  483         .unwire = moea64_unwire,
  484         .zero_page =            moea64_zero_page,
  485         .zero_page_area = moea64_zero_page_area,
  486         .activate = moea64_activate,
  487         .deactivate =           moea64_deactivate,
  488         .page_set_memattr = moea64_page_set_memattr,
  489         .quick_enter_page =  moea64_quick_enter_page,
  490         .quick_remove_page =  moea64_quick_remove_page,
  491         .page_is_mapped = moea64_page_is_mapped,
  492 #ifdef __powerpc64__
  493         .page_array_startup = moea64_page_array_startup,
  494 #endif
  495         .ps_enabled = moea64_ps_enabled,
  496         .align_superpage = moea64_align_superpage,
  497 
  498         /* Internal interfaces */
  499         .mapdev = moea64_mapdev,
  500         .mapdev_attr = moea64_mapdev_attr,
  501         .unmapdev = moea64_unmapdev,
  502         .kextract = moea64_kextract,
  503         .kenter = moea64_kenter,
  504         .kenter_attr = moea64_kenter_attr,
  505         .dev_direct_mapped = moea64_dev_direct_mapped,
  506         .dumpsys_pa_init = moea64_scan_init,
  507         .dumpsys_scan_pmap = moea64_scan_pmap,
  508         .dumpsys_dump_pmap_init =    moea64_dump_pmap_init,
  509         .dumpsys_map_chunk = moea64_dumpsys_map,
  510         .map_user_ptr = moea64_map_user_ptr,
  511         .decode_kernel_ptr =  moea64_decode_kernel_ptr,
  512 };
  513 
  514 MMU_DEF(oea64_mmu, "mmu_oea64_base", moea64_methods);
  515 
  516 /*
  517  * Get physical address from PVO.
  518  *
  519  * For superpages, the lower bits are not stored on pvo_pte.pa and must be
  520  * obtained from VA.
  521  */
  522 static __always_inline vm_paddr_t
  523 moea64_pvo_paddr(struct pvo_entry *pvo)
  524 {
  525         vm_paddr_t pa;
  526 
  527         pa = (pvo)->pvo_pte.pa & LPTE_RPGN;
  528 
  529         if (PVO_IS_SP(pvo)) {
  530                 pa &= ~HPT_SP_MASK; /* This is needed to clear LPTE_LP bits. */
  531                 pa |= PVO_VADDR(pvo) & HPT_SP_MASK;
  532         }
  533         return (pa);
  534 }
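      /*
       * For example, with 16 MB superpages HPT_SP_MASK covers the low
       * 24 bits: the superpage base comes from pvo_pte.pa (whose low bits
       * instead hold the LPTE_LP encoding) and the offset within the
       * superpage comes from the VA.
       */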
  535 
  536 static struct pvo_head *
  537 vm_page_to_pvoh(vm_page_t m)
  538 {
  539 
  540         mtx_assert(PV_LOCKPTR(VM_PAGE_TO_PHYS(m)), MA_OWNED);
  541         return (&m->md.mdpg_pvoh);
  542 }
  543 
  544 static struct pvo_entry *
  545 alloc_pvo_entry(int bootstrap)
  546 {
  547         struct pvo_entry *pvo;
  548 
  549         if (!moea64_initialized || bootstrap) {
  550                 if (moea64_bpvo_pool_index >= moea64_bpvo_pool_size) {
   551                         panic("%s: bpvo pool exhausted, index=%d, size=%d, bytes=%zd. "
  552                             "Try setting machdep.moea64_bpvo_pool_size tunable",
  553                             __func__, moea64_bpvo_pool_index,
  554                             moea64_bpvo_pool_size,
  555                             moea64_bpvo_pool_size * sizeof(struct pvo_entry));
  556                 }
  557                 pvo = &moea64_bpvo_pool[
  558                     atomic_fetchadd_int(&moea64_bpvo_pool_index, 1)];
  559                 bzero(pvo, sizeof(*pvo));
  560                 pvo->pvo_vaddr = PVO_BOOTSTRAP;
  561         } else
  562                 pvo = uma_zalloc(moea64_pvo_zone, M_NOWAIT | M_ZERO);
  563 
  564         return (pvo);
  565 }
  566 
  567 static void
  568 init_pvo_entry(struct pvo_entry *pvo, pmap_t pmap, vm_offset_t va)
  569 {
  570         uint64_t vsid;
  571         uint64_t hash;
  572         int shift;
  573 
  574         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  575 
  576         pvo->pvo_pmap = pmap;
  577         va &= ~ADDR_POFF;
  578         pvo->pvo_vaddr |= va;
  579         vsid = va_to_vsid(pmap, va);
  580         pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT)
  581             | (vsid << 16);
  582 
  583         if (pmap == kernel_pmap && (pvo->pvo_vaddr & PVO_LARGE) != 0)
  584                 shift = moea64_large_page_shift;
  585         else
  586                 shift = ADDR_PIDX_SHFT;
  587         hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)va & ADDR_PIDX) >> shift);
  588         pvo->pvo_pte.slot = (hash & moea64_pteg_mask) << 3;
  589 }
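      /*
       * The PTEG slot is computed the same way the hardware hashes
       * translations: XOR the VSID with the virtual page index (using the
       * large-page shift for kernel large mappings), mask with
       * moea64_pteg_mask, and scale by 8 (the << 3) since each PTE group
       * holds eight PTEs.
       */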
  590 
  591 static void
  592 free_pvo_entry(struct pvo_entry *pvo)
  593 {
  594 
  595         if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP))
  596                 uma_zfree(moea64_pvo_zone, pvo);
  597 }
  598 
  599 void
  600 moea64_pte_from_pvo(const struct pvo_entry *pvo, struct lpte *lpte)
  601 {
  602 
  603         lpte->pte_hi = moea64_pte_vpn_from_pvo_vpn(pvo);
  604         lpte->pte_hi |= LPTE_VALID;
  605 
  606         if (pvo->pvo_vaddr & PVO_LARGE)
  607                 lpte->pte_hi |= LPTE_BIG;
  608         if (pvo->pvo_vaddr & PVO_WIRED)
  609                 lpte->pte_hi |= LPTE_WIRED;
  610         if (pvo->pvo_vaddr & PVO_HID)
  611                 lpte->pte_hi |= LPTE_HID;
  612 
  613         lpte->pte_lo = pvo->pvo_pte.pa; /* Includes WIMG bits */
  614         if (pvo->pvo_pte.prot & VM_PROT_WRITE)
  615                 lpte->pte_lo |= LPTE_BW;
  616         else
  617                 lpte->pte_lo |= LPTE_BR;
  618 
  619         if (!(pvo->pvo_pte.prot & VM_PROT_EXECUTE))
  620                 lpte->pte_lo |= LPTE_NOEXEC;
  621 }
  622 
  623 static __inline uint64_t
  624 moea64_calc_wimg(vm_paddr_t pa, vm_memattr_t ma)
  625 {
  626         uint64_t pte_lo;
  627         int i;
  628 
  629         if (ma != VM_MEMATTR_DEFAULT) {
  630                 switch (ma) {
  631                 case VM_MEMATTR_UNCACHEABLE:
  632                         return (LPTE_I | LPTE_G);
  633                 case VM_MEMATTR_CACHEABLE:
  634                         return (LPTE_M);
  635                 case VM_MEMATTR_WRITE_COMBINING:
  636                 case VM_MEMATTR_WRITE_BACK:
  637                 case VM_MEMATTR_PREFETCHABLE:
  638                         return (LPTE_I);
  639                 case VM_MEMATTR_WRITE_THROUGH:
  640                         return (LPTE_W | LPTE_M);
  641                 }
  642         }
  643 
  644         /*
  645          * Assume the page is cache inhibited and access is guarded unless
  646          * it's in our available memory array.
  647          */
  648         pte_lo = LPTE_I | LPTE_G;
  649         for (i = 0; i < pregions_sz; i++) {
  650                 if ((pa >= pregions[i].mr_start) &&
  651                     (pa < (pregions[i].mr_start + pregions[i].mr_size))) {
  652                         pte_lo &= ~(LPTE_I | LPTE_G);
  653                         pte_lo |= LPTE_M;
  654                         break;
  655                 }
  656         }
  657 
  658         return pte_lo;
  659 }
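      /*
       * In summary: UNCACHEABLE maps to I|G, CACHEABLE to M,
       * WRITE_COMBINING/WRITE_BACK/PREFETCHABLE to I, WRITE_THROUGH to
       * W|M, and the default attribute becomes M for addresses that fall
       * inside a known physical memory region and I|G otherwise.
       */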
  660 
  661 /*
  662  * Quick sort callout for comparing memory regions.
  663  */
  664 static int      om_cmp(const void *a, const void *b);
  665 
  666 static int
  667 om_cmp(const void *a, const void *b)
  668 {
  669         const struct    ofw_map *mapa;
  670         const struct    ofw_map *mapb;
  671 
  672         mapa = a;
  673         mapb = b;
  674         if (mapa->om_pa < mapb->om_pa)
  675                 return (-1);
  676         else if (mapa->om_pa > mapb->om_pa)
  677                 return (1);
  678         else
  679                 return (0);
  680 }
  681 
  682 static void
  683 moea64_add_ofw_mappings(phandle_t mmu, size_t sz)
  684 {
   685         struct ofw_map  translations[sz/(4*sizeof(cell_t))]; /* >= 4 cells per entry */
  686         pcell_t         acells, trans_cells[sz/sizeof(cell_t)];
  687         struct pvo_entry *pvo;
  688         register_t      msr;
  689         vm_offset_t     off;
  690         vm_paddr_t      pa_base;
  691         int             i, j;
  692 
  693         bzero(translations, sz);
  694         OF_getencprop(OF_finddevice("/"), "#address-cells", &acells,
  695             sizeof(acells));
  696         if (OF_getencprop(mmu, "translations", trans_cells, sz) == -1)
  697                 panic("moea64_bootstrap: can't get ofw translations");
  698 
  699         CTR0(KTR_PMAP, "moea64_add_ofw_mappings: translations");
  700         sz /= sizeof(cell_t);
  701         for (i = 0, j = 0; i < sz; j++) {
  702                 translations[j].om_va = trans_cells[i++];
  703                 translations[j].om_len = trans_cells[i++];
  704                 translations[j].om_pa = trans_cells[i++];
  705                 if (acells == 2) {
  706                         translations[j].om_pa <<= 32;
  707                         translations[j].om_pa |= trans_cells[i++];
  708                 }
  709                 translations[j].om_mode = trans_cells[i++];
  710         }
  711         KASSERT(i == sz, ("Translations map has incorrect cell count (%d/%zd)",
  712             i, sz));
  713 
  714         sz = j;
  715         qsort(translations, sz, sizeof (*translations), om_cmp);
  716 
  717         for (i = 0; i < sz; i++) {
  718                 pa_base = translations[i].om_pa;
  719               #ifndef __powerpc64__
  720                 if ((translations[i].om_pa >> 32) != 0)
  721                         panic("OFW translations above 32-bit boundary!");
  722               #endif
  723 
  724                 if (pa_base % PAGE_SIZE)
  725                         panic("OFW translation not page-aligned (phys)!");
  726                 if (translations[i].om_va % PAGE_SIZE)
  727                         panic("OFW translation not page-aligned (virt)!");
  728 
  729                 CTR3(KTR_PMAP, "translation: pa=%#zx va=%#x len=%#x",
  730                     pa_base, translations[i].om_va, translations[i].om_len);
  731 
  732                 /* Now enter the pages for this mapping */
  733 
  734                 DISABLE_TRANS(msr);
  735                 for (off = 0; off < translations[i].om_len; off += PAGE_SIZE) {
  736                         /* If this address is direct-mapped, skip remapping */
  737                         if (hw_direct_map &&
  738                             translations[i].om_va == PHYS_TO_DMAP(pa_base) &&
  739                             moea64_calc_wimg(pa_base + off, VM_MEMATTR_DEFAULT)
  740                             == LPTE_M)
  741                                 continue;
  742 
  743                         PMAP_LOCK(kernel_pmap);
  744                         pvo = moea64_pvo_find_va(kernel_pmap,
  745                             translations[i].om_va + off);
  746                         PMAP_UNLOCK(kernel_pmap);
  747                         if (pvo != NULL)
  748                                 continue;
  749 
  750                         moea64_kenter(translations[i].om_va + off,
  751                             pa_base + off);
  752                 }
  753                 ENABLE_TRANS(msr);
  754         }
  755 }
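      /*
       * Each "translations" property entry parsed above is a (va, len,
       * pa, mode) tuple of cells, with pa spanning two cells when
       * #address-cells is 2.  Ranges already covered by the direct map
       * with the default (LPTE_M) attribute, and ranges that are already
       * mapped, are skipped; everything else is wired into the kernel
       * pmap with moea64_kenter().
       */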
  756 
  757 #ifdef __powerpc64__
  758 static void
  759 moea64_probe_large_page(void)
  760 {
  761         uint16_t pvr = mfpvr() >> 16;
  762 
  763         switch (pvr) {
  764         case IBM970:
  765         case IBM970FX:
  766         case IBM970MP:
  767                 powerpc_sync(); isync();
  768                 mtspr(SPR_HID4, mfspr(SPR_HID4) & ~HID4_970_DISABLE_LG_PG);
  769                 powerpc_sync(); isync();
  770                 
  771                 /* FALLTHROUGH */
  772         default:
  773                 if (moea64_large_page_size == 0) {
  774                         moea64_large_page_size = 0x1000000; /* 16 MB */
  775                         moea64_large_page_shift = 24;
  776                 }
  777         }
  778 
  779         moea64_large_page_mask = moea64_large_page_size - 1;
  780 }
  781 
  782 static void
  783 moea64_bootstrap_slb_prefault(vm_offset_t va, int large)
  784 {
  785         struct slb *cache;
  786         struct slb entry;
  787         uint64_t esid, slbe;
  788         uint64_t i;
  789 
  790         cache = PCPU_GET(aim.slb);
  791         esid = va >> ADDR_SR_SHFT;
  792         slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID;
  793 
  794         for (i = 0; i < 64; i++) {
  795                 if (cache[i].slbe == (slbe | i))
  796                         return;
  797         }
  798 
  799         entry.slbe = slbe;
  800         entry.slbv = KERNEL_VSID(esid) << SLBV_VSID_SHIFT;
  801         if (large)
  802                 entry.slbv |= SLBV_L;
  803 
  804         slb_insert_kernel(entry.slbe, entry.slbv);
  805 }
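      /*
       * The prefault is a no-op if the per-CPU SLB cache already has an
       * entry for this ESID; otherwise a kernel SLB entry (tagged SLBV_L
       * for large-page segments) is installed so that later accesses to
       * the segment cannot take an SLB fault.
       */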
  806 #endif
  807 
  808 static int
  809 moea64_kenter_large(vm_offset_t va, vm_paddr_t pa, uint64_t attr, int bootstrap)
  810 {
  811         struct pvo_entry *pvo;
  812         uint64_t pte_lo;
  813         int error;
  814 
  815         pte_lo = LPTE_M;
  816         pte_lo |= attr;
  817 
  818         pvo = alloc_pvo_entry(bootstrap);
  819         pvo->pvo_vaddr |= PVO_WIRED | PVO_LARGE;
  820         init_pvo_entry(pvo, kernel_pmap, va);
  821 
  822         pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE |
  823             VM_PROT_EXECUTE;
  824         pvo->pvo_pte.pa = pa | pte_lo;
  825         error = moea64_pvo_enter(pvo, NULL, NULL);
  826         if (error != 0)
  827                 panic("Error %d inserting large page\n", error);
  828         return (0);
  829 }
  830 
  831 static void
  832 moea64_setup_direct_map(vm_offset_t kernelstart,
  833     vm_offset_t kernelend)
  834 {
  835         register_t msr;
  836         vm_paddr_t pa, pkernelstart, pkernelend;
  837         vm_offset_t size, off;
  838         uint64_t pte_lo;
  839         int i;
  840 
  841         if (moea64_large_page_size == 0)
  842                 hw_direct_map = 0;
  843 
  844         DISABLE_TRANS(msr);
  845         if (hw_direct_map) {
  846                 PMAP_LOCK(kernel_pmap);
  847                 for (i = 0; i < pregions_sz; i++) {
  848                   for (pa = pregions[i].mr_start; pa < pregions[i].mr_start +
  849                      pregions[i].mr_size; pa += moea64_large_page_size) {
  850                         pte_lo = LPTE_M;
  851                         if (pa & moea64_large_page_mask) {
  852                                 pa &= moea64_large_page_mask;
  853                                 pte_lo |= LPTE_G;
  854                         }
  855                         if (pa + moea64_large_page_size >
  856                             pregions[i].mr_start + pregions[i].mr_size)
  857                                 pte_lo |= LPTE_G;
  858 
  859                         moea64_kenter_large(PHYS_TO_DMAP(pa), pa, pte_lo, 1);
  860                   }
  861                 }
  862                 PMAP_UNLOCK(kernel_pmap);
  863         }
  864 
  865         /*
  866          * Make sure the kernel and BPVO pool stay mapped on systems either
  867          * without a direct map or on which the kernel is not already executing
  868          * out of the direct-mapped region.
  869          */
  870         if (kernelstart < DMAP_BASE_ADDRESS) {
  871                 /*
  872                  * For pre-dmap execution, we need to use identity mapping
  873                  * because we will be operating with the mmu on but in the
  874                  * wrong address configuration until we __restartkernel().
  875                  */
  876                 for (pa = kernelstart & ~PAGE_MASK; pa < kernelend;
  877                     pa += PAGE_SIZE)
  878                         moea64_kenter(pa, pa);
  879         } else if (!hw_direct_map) {
  880                 pkernelstart = kernelstart & ~DMAP_BASE_ADDRESS;
  881                 pkernelend = kernelend & ~DMAP_BASE_ADDRESS;
  882                 for (pa = pkernelstart & ~PAGE_MASK; pa < pkernelend;
  883                     pa += PAGE_SIZE)
  884                         moea64_kenter(pa | DMAP_BASE_ADDRESS, pa);
  885         }
  886 
  887         if (!hw_direct_map) {
  888                 size = moea64_bpvo_pool_size*sizeof(struct pvo_entry);
  889                 off = (vm_offset_t)(moea64_bpvo_pool);
  890                 for (pa = off; pa < off + size; pa += PAGE_SIZE)
  891                         moea64_kenter(pa, pa);
  892 
  893                 /* Map exception vectors */
  894                 for (pa = EXC_RSVD; pa < EXC_LAST; pa += PAGE_SIZE)
  895                         moea64_kenter(pa | DMAP_BASE_ADDRESS, pa);
  896         }
  897         ENABLE_TRANS(msr);
  898 
  899         /*
  900          * Allow user to override unmapped_buf_allowed for testing.
  901          * XXXKIB Only direct map implementation was tested.
  902          */
  903         if (!TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed",
  904             &unmapped_buf_allowed))
  905                 unmapped_buf_allowed = hw_direct_map;
  906 }
  907 
  908 /* Quick sort callout for comparing physical addresses. */
  909 static int
  910 pa_cmp(const void *a, const void *b)
  911 {
  912         const vm_paddr_t *pa = a, *pb = b;
  913 
  914         if (*pa < *pb)
  915                 return (-1);
  916         else if (*pa > *pb)
  917                 return (1);
  918         else
  919                 return (0);
  920 }
  921 
  922 void
  923 moea64_early_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend)
  924 {
  925         int             i, j;
  926         vm_size_t       physsz, hwphyssz;
  927         vm_paddr_t      kernelphysstart, kernelphysend;
  928         int             rm_pavail;
  929 
  930         /* Level 0 reservations consist of 4096 pages (16MB superpage). */
  931         vm_level_0_order = 12;
  932 
  933 #ifndef __powerpc64__
  934         /* We don't have a direct map since there is no BAT */
  935         hw_direct_map = 0;
  936 
  937         /* Make sure battable is zero, since we have no BAT */
  938         for (i = 0; i < 16; i++) {
  939                 battable[i].batu = 0;
  940                 battable[i].batl = 0;
  941         }
  942 #else
  943         /* Install trap handlers for SLBs */
  944         bcopy(&slbtrap, (void *)EXC_DSE,(size_t)&slbtrapend - (size_t)&slbtrap);
  945         bcopy(&slbtrap, (void *)EXC_ISE,(size_t)&slbtrapend - (size_t)&slbtrap);
  946         __syncicache((void *)EXC_DSE, 0x80);
  947         __syncicache((void *)EXC_ISE, 0x80);
  948 #endif
  949 
  950         kernelphysstart = kernelstart & ~DMAP_BASE_ADDRESS;
  951         kernelphysend = kernelend & ~DMAP_BASE_ADDRESS;
  952 
  953         /* Get physical memory regions from firmware */
  954         mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
  955         CTR0(KTR_PMAP, "moea64_bootstrap: physical memory");
  956 
  957         if (PHYS_AVAIL_ENTRIES < regions_sz)
  958                 panic("moea64_bootstrap: phys_avail too small");
  959 
  960         phys_avail_count = 0;
  961         physsz = 0;
  962         hwphyssz = 0;
  963         TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz);
  964         for (i = 0, j = 0; i < regions_sz; i++, j += 2) {
  965                 CTR3(KTR_PMAP, "region: %#zx - %#zx (%#zx)",
  966                     regions[i].mr_start, regions[i].mr_start +
  967                     regions[i].mr_size, regions[i].mr_size);
  968                 if (hwphyssz != 0 &&
  969                     (physsz + regions[i].mr_size) >= hwphyssz) {
  970                         if (physsz < hwphyssz) {
  971                                 phys_avail[j] = regions[i].mr_start;
  972                                 phys_avail[j + 1] = regions[i].mr_start +
  973                                     hwphyssz - physsz;
  974                                 physsz = hwphyssz;
  975                                 phys_avail_count++;
  976                                 dump_avail[j] = phys_avail[j];
  977                                 dump_avail[j + 1] = phys_avail[j + 1];
  978                         }
  979                         break;
  980                 }
  981                 phys_avail[j] = regions[i].mr_start;
  982                 phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size;
  983                 phys_avail_count++;
  984                 physsz += regions[i].mr_size;
  985                 dump_avail[j] = phys_avail[j];
  986                 dump_avail[j + 1] = phys_avail[j + 1];
  987         }
  988 
  989         /* Check for overlap with the kernel and exception vectors */
  990         rm_pavail = 0;
  991         for (j = 0; j < 2*phys_avail_count; j+=2) {
  992                 if (phys_avail[j] < EXC_LAST)
  993                         phys_avail[j] += EXC_LAST;
  994 
  995                 if (phys_avail[j] >= kernelphysstart &&
  996                     phys_avail[j+1] <= kernelphysend) {
  997                         phys_avail[j] = phys_avail[j+1] = ~0;
  998                         rm_pavail++;
  999                         continue;
 1000                 }
 1001 
 1002                 if (kernelphysstart >= phys_avail[j] &&
 1003                     kernelphysstart < phys_avail[j+1]) {
 1004                         if (kernelphysend < phys_avail[j+1]) {
 1005                                 phys_avail[2*phys_avail_count] =
 1006                                     (kernelphysend & ~PAGE_MASK) + PAGE_SIZE;
 1007                                 phys_avail[2*phys_avail_count + 1] =
 1008                                     phys_avail[j+1];
 1009                                 phys_avail_count++;
 1010                         }
 1011 
 1012                         phys_avail[j+1] = kernelphysstart & ~PAGE_MASK;
 1013                 }
 1014 
 1015                 if (kernelphysend >= phys_avail[j] &&
 1016                     kernelphysend < phys_avail[j+1]) {
 1017                         if (kernelphysstart > phys_avail[j]) {
 1018                                 phys_avail[2*phys_avail_count] = phys_avail[j];
 1019                                 phys_avail[2*phys_avail_count + 1] =
 1020                                     kernelphysstart & ~PAGE_MASK;
 1021                                 phys_avail_count++;
 1022                         }
 1023 
 1024                         phys_avail[j] = (kernelphysend & ~PAGE_MASK) +
 1025                             PAGE_SIZE;
 1026                 }
 1027         }
 1028 
 1029         /* Remove physical available regions marked for removal (~0) */
 1030         if (rm_pavail) {
 1031                 qsort(phys_avail, 2*phys_avail_count, sizeof(phys_avail[0]),
 1032                         pa_cmp);
 1033                 phys_avail_count -= rm_pavail;
 1034                 for (i = 2*phys_avail_count;
 1035                      i < 2*(phys_avail_count + rm_pavail); i+=2)
 1036                         phys_avail[i] = phys_avail[i+1] = 0;
 1037         }
 1038 
 1039         physmem = btoc(physsz);
 1040 
 1041 #ifdef PTEGCOUNT
 1042         moea64_pteg_count = PTEGCOUNT;
 1043 #else
 1044         moea64_pteg_count = 0x1000;
 1045 
 1046         while (moea64_pteg_count < physmem)
 1047                 moea64_pteg_count <<= 1;
 1048 
 1049         moea64_pteg_count >>= 1;
 1050 #endif /* PTEGCOUNT */
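              /*
               * The loop above leaves moea64_pteg_count at the largest
               * power of two strictly below physmem (in pages), with a
               * floor of 0x800 groups, i.e. at least one 8-PTE group for
               * every two physical pages.
               */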
 1051 }
 1052 
 1053 void
 1054 moea64_mid_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend)
 1055 {
 1056         int             i;
 1057 
 1058         /*
 1059          * Set PTEG mask
 1060          */
 1061         moea64_pteg_mask = moea64_pteg_count - 1;
 1062 
 1063         /*
 1064          * Initialize SLB table lock and page locks
 1065          */
 1066         mtx_init(&moea64_slb_mutex, "SLB table", NULL, MTX_DEF);
 1067         for (i = 0; i < PV_LOCK_COUNT; i++)
 1068                 mtx_init(&pv_lock[i], "page pv", NULL, MTX_DEF);
 1069 
 1070         /*
 1071          * Initialise the bootstrap pvo pool.
 1072          */
 1073         TUNABLE_INT_FETCH("machdep.moea64_bpvo_pool_size", &moea64_bpvo_pool_size);
 1074         if (moea64_bpvo_pool_size == 0) {
 1075                 if (!hw_direct_map)
 1076                         moea64_bpvo_pool_size = ((ptoa((uintmax_t)physmem) * sizeof(struct vm_page)) /
 1077                             (PAGE_SIZE * PAGE_SIZE)) * BPVO_POOL_EXPANSION_FACTOR;
 1078                 else
 1079                         moea64_bpvo_pool_size = BPVO_POOL_SIZE;
 1080         }
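              /*
               * The heuristic above sizes the pool to roughly the number
               * of pages occupied by the vm_page array (which must be
               * mapped one PVO per page when there is no direct map),
               * scaled by BPVO_POOL_EXPANSION_FACTOR for headroom.
               */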
 1081 
 1082         if (boothowto & RB_VERBOSE) {
 1083                 printf("mmu_oea64: bpvo pool entries = %d, bpvo pool size = %zu MB\n",
 1084                     moea64_bpvo_pool_size,
 1085                     moea64_bpvo_pool_size*sizeof(struct pvo_entry) / 1048576);
 1086         }
 1087 
 1088         moea64_bpvo_pool = (struct pvo_entry *)moea64_bootstrap_alloc(
 1089                 moea64_bpvo_pool_size*sizeof(struct pvo_entry), PAGE_SIZE);
 1090         moea64_bpvo_pool_index = 0;
 1091 
 1092         /* Place at address usable through the direct map */
 1093         if (hw_direct_map)
 1094                 moea64_bpvo_pool = (struct pvo_entry *)
 1095                     PHYS_TO_DMAP((uintptr_t)moea64_bpvo_pool);
 1096 
 1097         /*
 1098          * Make sure kernel vsid is allocated as well as VSID 0.
 1099          */
 1100         #ifndef __powerpc64__
 1101         moea64_vsid_bitmap[(KERNEL_VSIDBITS & (NVSIDS - 1)) / VSID_NBPW]
 1102                 |= 1 << (KERNEL_VSIDBITS % VSID_NBPW);
 1103         moea64_vsid_bitmap[0] |= 1;
 1104         #endif
 1105 
 1106         /*
 1107          * Initialize the kernel pmap (which is statically allocated).
 1108          */
 1109         #ifdef __powerpc64__
 1110         for (i = 0; i < 64; i++) {
 1111                 pcpup->pc_aim.slb[i].slbv = 0;
 1112                 pcpup->pc_aim.slb[i].slbe = 0;
 1113         }
 1114         #else
 1115         for (i = 0; i < 16; i++)
 1116                 kernel_pmap->pm_sr[i] = EMPTY_SEGMENT + i;
 1117         #endif
 1118 
 1119         kernel_pmap->pmap_phys = kernel_pmap;
 1120         CPU_FILL(&kernel_pmap->pm_active);
 1121         RB_INIT(&kernel_pmap->pmap_pvo);
 1122 
 1123         PMAP_LOCK_INIT(kernel_pmap);
 1124 
 1125         /*
 1126          * Now map in all the other buffers we allocated earlier
 1127          */
 1128 
 1129         moea64_setup_direct_map(kernelstart, kernelend);
 1130 }
 1131 
 1132 void
 1133 moea64_late_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend)
 1134 {
 1135         ihandle_t       mmui;
 1136         phandle_t       chosen;
 1137         phandle_t       mmu;
 1138         ssize_t         sz;
 1139         int             i;
 1140         vm_offset_t     pa, va;
 1141         void            *dpcpu;
 1142 
 1143         /*
 1144          * Set up the Open Firmware pmap and add its mappings if not in real
 1145          * mode.
 1146          */
 1147 
 1148         chosen = OF_finddevice("/chosen");
 1149         if (chosen != -1 && OF_getencprop(chosen, "mmu", &mmui, 4) != -1) {
 1150                 mmu = OF_instance_to_package(mmui);
 1151                 if (mmu == -1 ||
 1152                     (sz = OF_getproplen(mmu, "translations")) == -1)
 1153                         sz = 0;
 1154                 if (sz > 6144 /* tmpstksz - 2 KB headroom */)
 1155                         panic("moea64_bootstrap: too many ofw translations");
 1156 
 1157                 if (sz > 0)
 1158                         moea64_add_ofw_mappings(mmu, sz);
 1159         }
 1160 
 1161         /*
 1162          * Calculate the last available physical address.
 1163          */
 1164         Maxmem = 0;
 1165         for (i = 0; phys_avail[i + 1] != 0; i += 2)
 1166                 Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1]));
 1167 
 1168         /*
 1169          * Initialize MMU.
 1170          */
 1171         pmap_cpu_bootstrap(0);
 1172         mtmsr(mfmsr() | PSL_DR | PSL_IR);
 1173         pmap_bootstrapped++;
 1174 
 1175         /*
 1176          * Set the start and end of kva.
 1177          */
 1178         virtual_avail = VM_MIN_KERNEL_ADDRESS;
 1179         virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS;
 1180 
 1181         /*
 1182          * Map the entire KVA range into the SLB. We must not fault there.
 1183          */
 1184         #ifdef __powerpc64__
 1185         for (va = virtual_avail; va < virtual_end; va += SEGMENT_LENGTH)
 1186                 moea64_bootstrap_slb_prefault(va, 0);
 1187         #endif
 1188 
 1189         /*
 1190          * Remap any early IO mappings (console framebuffer, etc.)
 1191          */
 1192         bs_remap_earlyboot();
 1193 
 1194         /*
 1195          * Figure out how far we can extend virtual_end into segment 16
 1196          * without running into existing mappings. Segment 16 is guaranteed
 1197          * to contain neither RAM nor devices (at least on Apple hardware),
 1198          * but will generally contain some OFW mappings we should not
 1199          * step on.
 1200          */
 1201 
 1202         #ifndef __powerpc64__   /* KVA is in high memory on PPC64 */
 1203         PMAP_LOCK(kernel_pmap);
 1204         while (virtual_end < VM_MAX_KERNEL_ADDRESS &&
 1205             moea64_pvo_find_va(kernel_pmap, virtual_end+1) == NULL)
 1206                 virtual_end += PAGE_SIZE;
 1207         PMAP_UNLOCK(kernel_pmap);
 1208         #endif
 1209 
 1210         /*
 1211          * Allocate a kernel stack with a guard page for thread0 and map it
 1212          * into the kernel page map.
 1213          */
 1214         pa = moea64_bootstrap_alloc(kstack_pages * PAGE_SIZE, PAGE_SIZE);
 1215         va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE;
 1216         virtual_avail = va + kstack_pages * PAGE_SIZE;
 1217         CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va);
 1218         thread0.td_kstack = va;
 1219         thread0.td_kstack_pages = kstack_pages;
 1220         for (i = 0; i < kstack_pages; i++) {
 1221                 moea64_kenter(va, pa);
 1222                 pa += PAGE_SIZE;
 1223                 va += PAGE_SIZE;
 1224         }
 1225 
 1226         /*
 1227          * Allocate virtual address space for the message buffer.
 1228          */
 1229         pa = msgbuf_phys = moea64_bootstrap_alloc(msgbufsize, PAGE_SIZE);
 1230         msgbufp = (struct msgbuf *)virtual_avail;
 1231         va = virtual_avail;
 1232         virtual_avail += round_page(msgbufsize);
 1233         while (va < virtual_avail) {
 1234                 moea64_kenter(va, pa);
 1235                 pa += PAGE_SIZE;
 1236                 va += PAGE_SIZE;
 1237         }
 1238 
 1239         /*
 1240          * Allocate virtual address space for the dynamic percpu area.
 1241          */
 1242         pa = moea64_bootstrap_alloc(DPCPU_SIZE, PAGE_SIZE);
 1243         dpcpu = (void *)virtual_avail;
 1244         va = virtual_avail;
 1245         virtual_avail += DPCPU_SIZE;
 1246         while (va < virtual_avail) {
 1247                 moea64_kenter(va, pa);
 1248                 pa += PAGE_SIZE;
 1249                 va += PAGE_SIZE;
 1250         }
 1251         dpcpu_init(dpcpu, curcpu);
 1252 
 1253         crashdumpmap = (caddr_t)virtual_avail;
 1254         virtual_avail += MAXDUMPPGS * PAGE_SIZE;
 1255 
 1256         /*
 1257          * Allocate some things for page zeroing. We put this directly
  1258          * in the page table and use MOEA64_PTE_REPLACE to keep the
  1259          * PVO book-keeping and other parts of the VM system from even
  1260          * knowing that this hack exists.
 1261          */
 1262 
 1263         if (!hw_direct_map) {
 1264                 mtx_init(&moea64_scratchpage_mtx, "pvo zero page", NULL,
 1265                     MTX_DEF);
 1266                 for (i = 0; i < 2; i++) {
 1267                         moea64_scratchpage_va[i] = (virtual_end+1) - PAGE_SIZE;
 1268                         virtual_end -= PAGE_SIZE;
 1269 
 1270                         moea64_kenter(moea64_scratchpage_va[i], 0);
 1271 
 1272                         PMAP_LOCK(kernel_pmap);
 1273                         moea64_scratchpage_pvo[i] = moea64_pvo_find_va(
 1274                             kernel_pmap, (vm_offset_t)moea64_scratchpage_va[i]);
 1275                         PMAP_UNLOCK(kernel_pmap);
 1276                 }
 1277         }
 1278 
 1279         numa_mem_regions(&numa_pregions, &numapregions_sz);
 1280 }
 1281 
 1282 static void
 1283 moea64_pmap_init_qpages(void)
 1284 {
 1285         struct pcpu *pc;
 1286         int i;
 1287 
 1288         if (hw_direct_map)
 1289                 return;
 1290 
 1291         CPU_FOREACH(i) {
 1292                 pc = pcpu_find(i);
 1293                 pc->pc_qmap_addr = kva_alloc(PAGE_SIZE);
 1294                 if (pc->pc_qmap_addr == 0)
 1295                         panic("pmap_init_qpages: unable to allocate KVA");
 1296                 PMAP_LOCK(kernel_pmap);
 1297                 pc->pc_aim.qmap_pvo =
 1298                     moea64_pvo_find_va(kernel_pmap, pc->pc_qmap_addr);
 1299                 PMAP_UNLOCK(kernel_pmap);
 1300                 mtx_init(&pc->pc_aim.qmap_lock, "qmap lock", NULL, MTX_DEF);
 1301         }
 1302 }
 1303 
 1304 SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, moea64_pmap_init_qpages, NULL);
 1305 
 1306 /*
 1307  * Activate a user pmap.  This mostly involves setting some non-CPU
 1308  * state.
 1309  */
 1310 void
 1311 moea64_activate(struct thread *td)
 1312 {
 1313         pmap_t  pm;
 1314 
 1315         pm = &td->td_proc->p_vmspace->vm_pmap;
 1316         CPU_SET(PCPU_GET(cpuid), &pm->pm_active);
 1317 
 1318         #ifdef __powerpc64__
 1319         PCPU_SET(aim.userslb, pm->pm_slb);
 1320         __asm __volatile("slbmte %0, %1; isync" ::
 1321             "r"(td->td_pcb->pcb_cpu.aim.usr_vsid), "r"(USER_SLB_SLBE));
 1322         #else
 1323         PCPU_SET(curpmap, pm->pmap_phys);
 1324         mtsrin(USER_SR << ADDR_SR_SHFT, td->td_pcb->pcb_cpu.aim.usr_vsid);
 1325         #endif
 1326 }
 1327 
 1328 void
 1329 moea64_deactivate(struct thread *td)
 1330 {
 1331         pmap_t  pm;
 1332 
 1333         __asm __volatile("isync; slbie %0" :: "r"(USER_ADDR));
 1334 
 1335         pm = &td->td_proc->p_vmspace->vm_pmap;
 1336         CPU_CLR(PCPU_GET(cpuid), &pm->pm_active);
 1337         #ifdef __powerpc64__
 1338         PCPU_SET(aim.userslb, NULL);
 1339         #else
 1340         PCPU_SET(curpmap, NULL);
 1341         #endif
 1342 }
 1343 
 1344 void
 1345 moea64_unwire(pmap_t pm, vm_offset_t sva, vm_offset_t eva)
 1346 {
 1347         struct  pvo_entry key, *pvo;
 1348         vm_page_t m;
 1349         int64_t refchg;
 1350 
 1351         key.pvo_vaddr = sva;
 1352         PMAP_LOCK(pm);
 1353         for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
 1354             pvo != NULL && PVO_VADDR(pvo) < eva;
 1355             pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
 1356                 if (PVO_IS_SP(pvo)) {
 1357                         if (moea64_sp_pvo_in_range(pvo, sva, eva)) {
 1358                                 pvo = moea64_sp_unwire(pvo);
 1359                                 continue;
 1360                         } else {
 1361                                 CTR1(KTR_PMAP, "%s: demote before unwire",
 1362                                     __func__);
 1363                                 moea64_sp_demote(pvo);
 1364                         }
 1365                 }
 1366 
 1367                 if ((pvo->pvo_vaddr & PVO_WIRED) == 0)
 1368                         panic("moea64_unwire: pvo %p is missing PVO_WIRED",
 1369                             pvo);
 1370                 pvo->pvo_vaddr &= ~PVO_WIRED;
 1371                 refchg = moea64_pte_replace(pvo, 0 /* No invalidation */);
 1372                 if ((pvo->pvo_vaddr & PVO_MANAGED) &&
 1373                     (pvo->pvo_pte.prot & VM_PROT_WRITE)) {
 1374                         if (refchg < 0)
 1375                                 refchg = LPTE_CHG;
 1376                         m = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
 1377 
 1378                         refchg |= atomic_readandclear_32(&m->md.mdpg_attrs);
 1379                         if (refchg & LPTE_CHG)
 1380                                 vm_page_dirty(m);
 1381                         if (refchg & LPTE_REF)
 1382                                 vm_page_aflag_set(m, PGA_REFERENCED);
 1383                 }
 1384                 pm->pm_stats.wired_count--;
 1385         }
 1386         PMAP_UNLOCK(pm);
 1387 }
 1388 
 1389 static int
 1390 moea64_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
 1391 {
 1392         struct pvo_entry *pvo;
 1393         vm_paddr_t pa;
 1394         vm_page_t m;
 1395         int val;
 1396         bool managed;
 1397 
 1398         PMAP_LOCK(pmap);
 1399 
 1400         pvo = moea64_pvo_find_va(pmap, addr);
 1401         if (pvo != NULL) {
 1402                 pa = PVO_PADDR(pvo);
 1403                 m = PHYS_TO_VM_PAGE(pa);
 1404                 managed = (pvo->pvo_vaddr & PVO_MANAGED) == PVO_MANAGED;
 1405                 if (PVO_IS_SP(pvo))
 1406                         val = MINCORE_INCORE | MINCORE_PSIND(1);
 1407                 else
 1408                         val = MINCORE_INCORE;
 1409         } else {
 1410                 PMAP_UNLOCK(pmap);
 1411                 return (0);
 1412         }
 1413 
 1414         PMAP_UNLOCK(pmap);
 1415 
 1416         if (m == NULL)
 1417                 return (0);
 1418 
 1419         if (managed) {
 1420                 if (moea64_is_modified(m))
 1421                         val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
 1422 
 1423                 if (moea64_is_referenced(m))
 1424                         val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
 1425         }
 1426 
 1427         if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
 1428             (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
 1429             managed) {
 1430                 *pap = pa;
 1431         }
 1432 
 1433         return (val);
 1434 }
 1435 
 1436 /*
 1437  * This goes through and sets the physical address of our
 1438  * special scratch PTE to the PA we want to zero or copy. Because
 1439  * of locking issues (this can get called in pvo_enter() by
  1440  * the UMA allocator), we can't use most other utility functions here.
 1441  */
 1442 
  1443 static __inline void
  1444 moea64_set_scratchpage_pa(int which, vm_paddr_t pa)
 1445 {
 1446         struct pvo_entry *pvo;
 1447 
 1448         KASSERT(!hw_direct_map, ("Using OEA64 scratchpage with a direct map!"));
 1449         mtx_assert(&moea64_scratchpage_mtx, MA_OWNED);
 1450 
 1451         pvo = moea64_scratchpage_pvo[which];
 1452         PMAP_LOCK(pvo->pvo_pmap);
 1453         pvo->pvo_pte.pa =
 1454             moea64_calc_wimg(pa, VM_MEMATTR_DEFAULT) | (uint64_t)pa;
 1455         moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE);
 1456         PMAP_UNLOCK(pvo->pvo_pmap);
 1457         isync();
 1458 }
 1459 
 1460 void
 1461 moea64_copy_page(vm_page_t msrc, vm_page_t mdst)
 1462 {
 1463         mtx_lock(&moea64_scratchpage_mtx);
 1464 
 1465         moea64_set_scratchpage_pa(0, VM_PAGE_TO_PHYS(msrc));
 1466         moea64_set_scratchpage_pa(1, VM_PAGE_TO_PHYS(mdst));
 1467 
 1468         bcopy((void *)moea64_scratchpage_va[0],
 1469             (void *)moea64_scratchpage_va[1], PAGE_SIZE);
 1470 
 1471         mtx_unlock(&moea64_scratchpage_mtx);
 1472 }
 1473 
 1474 void
 1475 moea64_copy_page_dmap(vm_page_t msrc, vm_page_t mdst)
 1476 {
 1477         vm_offset_t     dst;
 1478         vm_offset_t     src;
 1479 
 1480         dst = VM_PAGE_TO_PHYS(mdst);
 1481         src = VM_PAGE_TO_PHYS(msrc);
 1482 
 1483         bcopy((void *)PHYS_TO_DMAP(src), (void *)PHYS_TO_DMAP(dst),
 1484             PAGE_SIZE);
 1485 }
 1486 
 1487 inline void
 1488 moea64_copy_pages_dmap(vm_page_t *ma, vm_offset_t a_offset,
 1489     vm_page_t *mb, vm_offset_t b_offset, int xfersize)
 1490 {
 1491         void *a_cp, *b_cp;
 1492         vm_offset_t a_pg_offset, b_pg_offset;
 1493         int cnt;
 1494 
 1495         while (xfersize > 0) {
 1496                 a_pg_offset = a_offset & PAGE_MASK;
 1497                 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
 1498                 a_cp = (char *)(uintptr_t)PHYS_TO_DMAP(
 1499                     VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])) +
 1500                     a_pg_offset;
 1501                 b_pg_offset = b_offset & PAGE_MASK;
 1502                 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
 1503                 b_cp = (char *)(uintptr_t)PHYS_TO_DMAP(
 1504                     VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])) +
 1505                     b_pg_offset;
 1506                 bcopy(a_cp, b_cp, cnt);
 1507                 a_offset += cnt;
 1508                 b_offset += cnt;
 1509                 xfersize -= cnt;
 1510         }
 1511 }
 1512 
 1513 void
 1514 moea64_copy_pages(vm_page_t *ma, vm_offset_t a_offset,
 1515     vm_page_t *mb, vm_offset_t b_offset, int xfersize)
 1516 {
 1517         void *a_cp, *b_cp;
 1518         vm_offset_t a_pg_offset, b_pg_offset;
 1519         int cnt;
 1520 
 1521         mtx_lock(&moea64_scratchpage_mtx);
 1522         while (xfersize > 0) {
 1523                 a_pg_offset = a_offset & PAGE_MASK;
 1524                 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
 1525                 moea64_set_scratchpage_pa(0,
 1526                     VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT]));
 1527                 a_cp = (char *)moea64_scratchpage_va[0] + a_pg_offset;
 1528                 b_pg_offset = b_offset & PAGE_MASK;
 1529                 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
 1530                 moea64_set_scratchpage_pa(1,
 1531                     VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT]));
 1532                 b_cp = (char *)moea64_scratchpage_va[1] + b_pg_offset;
 1533                 bcopy(a_cp, b_cp, cnt);
 1534                 a_offset += cnt;
 1535                 b_offset += cnt;
 1536                 xfersize -= cnt;
 1537         }
 1538         mtx_unlock(&moea64_scratchpage_mtx);
 1539 }
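
       /*
        * For illustration, a worked example of the chunking arithmetic shared
        * by both copy loops above (made-up offsets, assuming 4KB pages): with
        * a_offset = 0xf80, b_offset = 0 and xfersize = 0x100, the first pass
        * takes cnt = min(0x100, PAGE_SIZE - 0xf80) = 0x80 and copies up to the
        * end of the first source page; the second pass copies the remaining
        * 0x80 bytes starting at the next source page.  No single bcopy() ever
        * crosses a page boundary in either loop.
        */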
 1540 
 1541 void
 1542 moea64_zero_page_area(vm_page_t m, int off, int size)
 1543 {
 1544         vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
 1545 
 1546         if (size + off > PAGE_SIZE)
  1547                 panic("moea64_zero_page_area: size + off > PAGE_SIZE");
 1548 
 1549         if (hw_direct_map) {
 1550                 bzero((caddr_t)(uintptr_t)PHYS_TO_DMAP(pa) + off, size);
 1551         } else {
 1552                 mtx_lock(&moea64_scratchpage_mtx);
 1553                 moea64_set_scratchpage_pa(0, pa);
 1554                 bzero((caddr_t)moea64_scratchpage_va[0] + off, size);
 1555                 mtx_unlock(&moea64_scratchpage_mtx);
 1556         }
 1557 }
 1558 
 1559 /*
 1560  * Zero a page of physical memory by temporarily mapping it
 1561  */
 1562 void
 1563 moea64_zero_page(vm_page_t m)
 1564 {
 1565         vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
 1566         vm_offset_t va, off;
 1567 
 1568         mtx_lock(&moea64_scratchpage_mtx);
 1569 
 1570         moea64_set_scratchpage_pa(0, pa);
 1571         va = moea64_scratchpage_va[0];
 1572 
 1573         for (off = 0; off < PAGE_SIZE; off += cacheline_size)
 1574                 __asm __volatile("dcbz 0,%0" :: "r"(va + off));
 1575 
 1576         mtx_unlock(&moea64_scratchpage_mtx);
 1577 }
 1578 
 1579 void
 1580 moea64_zero_page_dmap(vm_page_t m)
 1581 {
 1582         vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
 1583         vm_offset_t va, off;
 1584 
 1585         va = PHYS_TO_DMAP(pa);
 1586         for (off = 0; off < PAGE_SIZE; off += cacheline_size)
 1587                 __asm __volatile("dcbz 0,%0" :: "r"(va + off));
 1588 }
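
       /*
        * Both zeroing loops above issue one dcbz per cache line, i.e.
        * PAGE_SIZE / cacheline_size iterations per page (e.g. 4096 / 128 = 32
        * iterations when cacheline_size is 128).  dcbz establishes and zeroes
        * a whole data cache block without first fetching its old contents
        * from memory.
        */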
 1589 
 1590 vm_offset_t
 1591 moea64_quick_enter_page(vm_page_t m)
 1592 {
 1593         struct pvo_entry *pvo;
 1594         vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
 1595 
 1596         /*
 1597          * MOEA64_PTE_REPLACE does some locking, so we can't just grab
 1598          * a critical section and access the PCPU data like on i386.
 1599          * Instead, pin the thread and grab the PCPU lock to prevent
 1600          * a preempting thread from using the same PCPU data.
 1601          */
 1602         sched_pin();
 1603 
 1604         mtx_assert(PCPU_PTR(aim.qmap_lock), MA_NOTOWNED);
 1605         pvo = PCPU_GET(aim.qmap_pvo);
 1606 
 1607         mtx_lock(PCPU_PTR(aim.qmap_lock));
 1608         pvo->pvo_pte.pa = moea64_calc_wimg(pa, pmap_page_get_memattr(m)) |
 1609             (uint64_t)pa;
 1610         moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE);
 1611         isync();
 1612 
 1613         return (PCPU_GET(qmap_addr));
 1614 }
 1615 
 1616 vm_offset_t
 1617 moea64_quick_enter_page_dmap(vm_page_t m)
 1618 {
 1619 
 1620         return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
 1621 }
 1622 
 1623 void
 1624 moea64_quick_remove_page(vm_offset_t addr)
 1625 {
 1626 
 1627         mtx_assert(PCPU_PTR(aim.qmap_lock), MA_OWNED);
 1628         KASSERT(PCPU_GET(qmap_addr) == addr,
 1629             ("moea64_quick_remove_page: invalid address"));
 1630         mtx_unlock(PCPU_PTR(aim.qmap_lock));
  1631         sched_unpin();
 1632 }
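
       /*
        * A sketch of the intended caller pattern for the quick-mapping pair
        * above (hypothetical caller; consumers actually go through the
        * pmap_quick_enter_page()/pmap_quick_remove_page() KPI).  The access
        * window must stay short and non-sleeping, since the thread remains
        * pinned and the per-CPU qmap lock is held in between:
        *
        *	vm_offset_t qva;
        *
        *	qva = pmap_quick_enter_page(m);		// pins thread, locks qmap
        *	bcopy(src, (void *)qva, PAGE_SIZE);	// brief, non-sleeping access
        *	pmap_quick_remove_page(qva);		// unlocks qmap, unpins thread
        */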
 1633 
 1634 boolean_t
 1635 moea64_page_is_mapped(vm_page_t m)
 1636 {
 1637         return (!LIST_EMPTY(&(m)->md.mdpg_pvoh));
 1638 }
 1639 
 1640 /*
 1641  * Map the given physical page at the specified virtual address in the
 1642  * target pmap with the protection requested.  If specified the page
 1643  * will be wired down.
 1644  */
 1645 
 1646 int
 1647 moea64_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
 1648     vm_prot_t prot, u_int flags, int8_t psind)
 1649 {
 1650         struct          pvo_entry *pvo, *oldpvo, *tpvo;
 1651         struct          pvo_head *pvo_head;
 1652         uint64_t        pte_lo;
 1653         int             error;
 1654         vm_paddr_t      pa;
 1655 
 1656         if ((m->oflags & VPO_UNMANAGED) == 0) {
 1657                 if ((flags & PMAP_ENTER_QUICK_LOCKED) == 0)
 1658                         VM_PAGE_OBJECT_BUSY_ASSERT(m);
 1659                 else
 1660                         VM_OBJECT_ASSERT_LOCKED(m->object);
 1661         }
 1662 
 1663         if (psind > 0)
 1664                 return (moea64_sp_enter(pmap, va, m, prot, flags, psind));
 1665 
 1666         pvo = alloc_pvo_entry(0);
 1667         if (pvo == NULL)
 1668                 return (KERN_RESOURCE_SHORTAGE);
 1669         pvo->pvo_pmap = NULL; /* to be filled in later */
 1670         pvo->pvo_pte.prot = prot;
 1671 
 1672         pa = VM_PAGE_TO_PHYS(m);
 1673         pte_lo = moea64_calc_wimg(pa, pmap_page_get_memattr(m));
 1674         pvo->pvo_pte.pa = pa | pte_lo;
 1675 
 1676         if ((flags & PMAP_ENTER_WIRED) != 0)
 1677                 pvo->pvo_vaddr |= PVO_WIRED;
 1678 
 1679         if ((m->oflags & VPO_UNMANAGED) != 0 || !moea64_initialized) {
 1680                 pvo_head = NULL;
 1681         } else {
 1682                 pvo_head = &m->md.mdpg_pvoh;
 1683                 pvo->pvo_vaddr |= PVO_MANAGED;
 1684         }
 1685 
 1686         PV_LOCK(pa);
 1687         PMAP_LOCK(pmap);
 1688         if (pvo->pvo_pmap == NULL)
 1689                 init_pvo_entry(pvo, pmap, va);
 1690 
 1691         if (moea64_ps_enabled(pmap) &&
 1692             (tpvo = moea64_pvo_find_va(pmap, va & ~HPT_SP_MASK)) != NULL &&
 1693             PVO_IS_SP(tpvo)) {
 1694                 /* Demote SP before entering a regular page */
 1695                 CTR2(KTR_PMAP, "%s: demote before enter: va=%#jx",
 1696                     __func__, (uintmax_t)va);
 1697                 moea64_sp_demote_aligned(tpvo);
 1698         }
 1699 
 1700         if (prot & VM_PROT_WRITE)
 1701                 if (pmap_bootstrapped &&
 1702                     (m->oflags & VPO_UNMANAGED) == 0)
 1703                         vm_page_aflag_set(m, PGA_WRITEABLE);
 1704 
 1705         error = moea64_pvo_enter(pvo, pvo_head, &oldpvo);
 1706         if (error == EEXIST) {
 1707                 if (oldpvo->pvo_vaddr == pvo->pvo_vaddr &&
 1708                     oldpvo->pvo_pte.pa == pvo->pvo_pte.pa &&
 1709                     oldpvo->pvo_pte.prot == prot) {
 1710                         /* Identical mapping already exists */
 1711                         error = 0;
 1712 
 1713                         /* If not in page table, reinsert it */
 1714                         if (moea64_pte_synch(oldpvo) < 0) {
 1715                                 STAT_MOEA64(moea64_pte_overflow--);
 1716                                 moea64_pte_insert(oldpvo);
 1717                         }
 1718 
 1719                         /* Then just clean up and go home */
 1720                         PMAP_UNLOCK(pmap);
 1721                         PV_UNLOCK(pa);
 1722                         free_pvo_entry(pvo);
 1723                         pvo = NULL;
 1724                         goto out;
 1725                 } else {
 1726                         /* Otherwise, need to kill it first */
 1727                         KASSERT(oldpvo->pvo_pmap == pmap, ("pmap of old "
 1728                             "mapping does not match new mapping"));
 1729                         moea64_pvo_remove_from_pmap(oldpvo);
 1730                         moea64_pvo_enter(pvo, pvo_head, NULL);
 1731                 }
 1732         }
 1733         PMAP_UNLOCK(pmap);
 1734         PV_UNLOCK(pa);
 1735 
 1736         /* Free any dead pages */
 1737         if (error == EEXIST) {
 1738                 moea64_pvo_remove_from_page(oldpvo);
 1739                 free_pvo_entry(oldpvo);
 1740         }
 1741 
 1742 out:
 1743         /*
 1744          * Flush the page from the instruction cache if this page is
 1745          * mapped executable and cacheable.
 1746          */
 1747         if (pmap != kernel_pmap && (m->a.flags & PGA_EXECUTABLE) == 0 &&
 1748             (pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) {
 1749                 vm_page_aflag_set(m, PGA_EXECUTABLE);
 1750                 moea64_syncicache(pmap, va, pa, PAGE_SIZE);
 1751         }
 1752 
 1753 #if VM_NRESERVLEVEL > 0
 1754         /*
 1755          * Try to promote pages.
 1756          *
 1757          * If the VA of the entered page is not aligned with its PA,
 1758          * don't try page promotion as it is not possible.
 1759          * This reduces the number of promotion failures dramatically.
 1760          */
 1761         if (moea64_ps_enabled(pmap) && pmap != kernel_pmap && pvo != NULL &&
 1762             (pvo->pvo_vaddr & PVO_MANAGED) != 0 &&
 1763             (va & HPT_SP_MASK) == (pa & HPT_SP_MASK) &&
 1764             (m->flags & PG_FICTITIOUS) == 0 &&
 1765             vm_reserv_level_iffullpop(m) == 0)
 1766                 moea64_sp_promote(pmap, va, m);
 1767 #endif
 1768 
 1769         return (KERN_SUCCESS);
 1770 }
 1771 
 1772 static void
 1773 moea64_syncicache(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 1774     vm_size_t sz)
 1775 {
 1776 
 1777         /*
 1778          * This is much trickier than on older systems because
 1779          * we can't sync the icache on physical addresses directly
 1780          * without a direct map. Instead we check a couple of cases
 1781          * where the memory is already mapped in and, failing that,
 1782          * use the same trick we use for page zeroing to create
 1783          * a temporary mapping for this physical address.
 1784          */
 1785 
 1786         if (!pmap_bootstrapped) {
 1787                 /*
 1788                  * If PMAP is not bootstrapped, we are likely to be
 1789                  * in real mode.
 1790                  */
 1791                 __syncicache((void *)(uintptr_t)pa, sz);
 1792         } else if (pmap == kernel_pmap) {
 1793                 __syncicache((void *)va, sz);
 1794         } else if (hw_direct_map) {
 1795                 __syncicache((void *)(uintptr_t)PHYS_TO_DMAP(pa), sz);
 1796         } else {
 1797                 /* Use the scratch page to set up a temp mapping */
 1798 
 1799                 mtx_lock(&moea64_scratchpage_mtx);
 1800 
 1801                 moea64_set_scratchpage_pa(1, pa & ~ADDR_POFF);
 1802                 __syncicache((void *)(moea64_scratchpage_va[1] +
 1803                     (va & ADDR_POFF)), sz);
 1804 
 1805                 mtx_unlock(&moea64_scratchpage_mtx);
 1806         }
 1807 }
 1808 
 1809 /*
 1810  * Maps a sequence of resident pages belonging to the same object.
 1811  * The sequence begins with the given page m_start.  This page is
 1812  * mapped at the given virtual address start.  Each subsequent page is
 1813  * mapped at a virtual address that is offset from start by the same
 1814  * amount as the page is offset from m_start within the object.  The
 1815  * last page in the sequence is the page with the largest offset from
 1816  * m_start that can be mapped at a virtual address less than the given
 1817  * virtual address end.  Not every virtual page between start and end
 1818  * is mapped; only those for which a resident page exists with the
 1819  * corresponding offset from m_start are mapped.
 1820  */
 1821 void
 1822 moea64_enter_object(pmap_t pm, vm_offset_t start, vm_offset_t end,
 1823     vm_page_t m_start, vm_prot_t prot)
 1824 {
 1825         vm_page_t m;
 1826         vm_pindex_t diff, psize;
 1827         vm_offset_t va;
 1828         int8_t psind;
 1829 
 1830         VM_OBJECT_ASSERT_LOCKED(m_start->object);
 1831 
 1832         psize = atop(end - start);
 1833         m = m_start;
 1834         while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
 1835                 va = start + ptoa(diff);
 1836                 if ((va & HPT_SP_MASK) == 0 && va + HPT_SP_SIZE <= end &&
 1837                     m->psind == 1 && moea64_ps_enabled(pm))
 1838                         psind = 1;
 1839                 else
 1840                         psind = 0;
 1841                 moea64_enter(pm, va, m, prot &
 1842                     (VM_PROT_READ | VM_PROT_EXECUTE),
 1843                     PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, psind);
 1844                 if (psind == 1)
 1845                         m = &m[HPT_SP_SIZE / PAGE_SIZE - 1];
 1846                 m = TAILQ_NEXT(m, listq);
 1847         }
 1848 }
 1849 
 1850 void
 1851 moea64_enter_quick(pmap_t pm, vm_offset_t va, vm_page_t m,
 1852     vm_prot_t prot)
 1853 {
 1854 
 1855         moea64_enter(pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE),
 1856             PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, 0);
 1857 }
 1858 
 1859 vm_paddr_t
 1860 moea64_extract(pmap_t pm, vm_offset_t va)
 1861 {
 1862         struct  pvo_entry *pvo;
 1863         vm_paddr_t pa;
 1864 
 1865         PMAP_LOCK(pm);
 1866         pvo = moea64_pvo_find_va(pm, va);
 1867         if (pvo == NULL)
 1868                 pa = 0;
 1869         else
 1870                 pa = PVO_PADDR(pvo) | (va - PVO_VADDR(pvo));
 1871         PMAP_UNLOCK(pm);
 1872 
 1873         return (pa);
 1874 }
 1875 
 1876 /*
 1877  * Atomically extract and hold the physical page with the given
 1878  * pmap and virtual address pair if that mapping permits the given
 1879  * protection.
 1880  */
 1881 vm_page_t
 1882 moea64_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 1883 {
 1884         struct  pvo_entry *pvo;
 1885         vm_page_t m;
 1886 
 1887         m = NULL;
 1888         PMAP_LOCK(pmap);
 1889         pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF);
 1890         if (pvo != NULL && (pvo->pvo_pte.prot & prot) == prot) {
 1891                 m = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
 1892                 if (!vm_page_wire_mapped(m))
 1893                         m = NULL;
 1894         }
 1895         PMAP_UNLOCK(pmap);
 1896         return (m);
 1897 }
 1898 
 1899 static void *
 1900 moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain,
 1901     uint8_t *flags, int wait)
 1902 {
 1903         struct pvo_entry *pvo;
 1904         vm_offset_t va;
 1905         vm_page_t m;
 1906         int needed_lock;
 1907 
 1908         /*
 1909          * This entire routine is a horrible hack to avoid bothering kmem
 1910          * for new KVA addresses. Because this can get called from inside
 1911          * kmem allocation routines, calling kmem for a new address here
  1912          * can lead to recursing on non-recursive mutexes.
 1913          */
 1914 
 1915         *flags = UMA_SLAB_PRIV;
 1916         needed_lock = !PMAP_LOCKED(kernel_pmap);
 1917 
 1918         m = vm_page_alloc_noobj_domain(domain, malloc2vm_flags(wait) |
 1919             VM_ALLOC_WIRED);
 1920         if (m == NULL)
 1921                 return (NULL);
 1922 
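               /*
                * The page is entered 1:1 below VM_MIN_KERNEL_ADDRESS, so its
                * physical address doubles as the kernel virtual address
                * handed back to UMA.
                */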
 1923         va = VM_PAGE_TO_PHYS(m);
 1924 
 1925         pvo = alloc_pvo_entry(1 /* bootstrap */);
 1926 
 1927         pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE;
 1928         pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | LPTE_M;
 1929 
 1930         if (needed_lock)
 1931                 PMAP_LOCK(kernel_pmap);
 1932 
 1933         init_pvo_entry(pvo, kernel_pmap, va);
 1934         pvo->pvo_vaddr |= PVO_WIRED;
 1935 
 1936         moea64_pvo_enter(pvo, NULL, NULL);
 1937 
 1938         if (needed_lock)
 1939                 PMAP_UNLOCK(kernel_pmap);
 1940 
  1941         return ((void *)va);
 1942 }
 1943 
 1944 extern int elf32_nxstack;
 1945 
 1946 void
 1947 moea64_init(void)
 1948 {
 1949 
 1950         CTR0(KTR_PMAP, "moea64_init");
 1951 
 1952         moea64_pvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry),
 1953             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 1954             UMA_ZONE_VM | UMA_ZONE_NOFREE);
 1955 
 1956         /*
 1957          * Are large page mappings enabled?
 1958          *
  1959          * Until HPT superpages are better tested, leave them disabled by
  1960          * default.
 1961          */
 1962         superpages_enabled = 0;
 1963         TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
 1964         if (superpages_enabled) {
 1965                 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
 1966                     ("moea64_init: can't assign to pagesizes[1]"));
 1967 
 1968                 if (moea64_large_page_size == 0) {
 1969                         printf("mmu_oea64: HW does not support large pages. "
 1970                                         "Disabling superpages...\n");
 1971                         superpages_enabled = 0;
 1972                 } else if (!moea64_has_lp_4k_16m) {
 1973                         printf("mmu_oea64: "
 1974                             "HW does not support mixed 4KB/16MB page sizes. "
 1975                             "Disabling superpages...\n");
 1976                         superpages_enabled = 0;
 1977                 } else
 1978                         pagesizes[1] = HPT_SP_SIZE;
 1979         }
 1980 
 1981         if (!hw_direct_map) {
 1982                 uma_zone_set_allocf(moea64_pvo_zone, moea64_uma_page_alloc);
 1983         }
 1984 
 1985 #ifdef COMPAT_FREEBSD32
 1986         elf32_nxstack = 1;
 1987 #endif
 1988 
 1989         moea64_initialized = TRUE;
 1990 }
 1991 
 1992 boolean_t
 1993 moea64_is_referenced(vm_page_t m)
 1994 {
 1995 
 1996         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 1997             ("moea64_is_referenced: page %p is not managed", m));
 1998 
 1999         return (moea64_query_bit(m, LPTE_REF));
 2000 }
 2001 
 2002 boolean_t
 2003 moea64_is_modified(vm_page_t m)
 2004 {
 2005 
 2006         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 2007             ("moea64_is_modified: page %p is not managed", m));
 2008 
 2009         /*
 2010          * If the page is not busied then this check is racy.
 2011          */
 2012         if (!pmap_page_is_write_mapped(m))
 2013                 return (FALSE);
 2014 
 2015         return (moea64_query_bit(m, LPTE_CHG));
 2016 }
 2017 
 2018 boolean_t
 2019 moea64_is_prefaultable(pmap_t pmap, vm_offset_t va)
 2020 {
 2021         struct pvo_entry *pvo;
 2022         boolean_t rv = TRUE;
 2023 
 2024         PMAP_LOCK(pmap);
 2025         pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF);
 2026         if (pvo != NULL)
 2027                 rv = FALSE;
 2028         PMAP_UNLOCK(pmap);
 2029         return (rv);
 2030 }
 2031 
 2032 void
 2033 moea64_clear_modify(vm_page_t m)
 2034 {
 2035 
 2036         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 2037             ("moea64_clear_modify: page %p is not managed", m));
 2038         vm_page_assert_busied(m);
 2039 
 2040         if (!pmap_page_is_write_mapped(m))
 2041                 return;
 2042         moea64_clear_bit(m, LPTE_CHG);
 2043 }
 2044 
 2045 /*
 2046  * Clear the write and modified bits in each of the given page's mappings.
 2047  */
 2048 void
 2049 moea64_remove_write(vm_page_t m)
 2050 {
 2051         struct  pvo_entry *pvo;
 2052         int64_t refchg, ret;
 2053         pmap_t  pmap;
 2054 
 2055         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 2056             ("moea64_remove_write: page %p is not managed", m));
 2057         vm_page_assert_busied(m);
 2058 
 2059         if (!pmap_page_is_write_mapped(m))
 2060                 return;
 2061 
 2062         powerpc_sync();
 2063         PV_PAGE_LOCK(m);
 2064         refchg = 0;
 2065         LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
 2066                 pmap = pvo->pvo_pmap;
 2067                 PMAP_LOCK(pmap);
 2068                 if (!(pvo->pvo_vaddr & PVO_DEAD) &&
 2069                     (pvo->pvo_pte.prot & VM_PROT_WRITE)) {
 2070                         if (PVO_IS_SP(pvo)) {
 2071                                 CTR1(KTR_PMAP, "%s: demote before remwr",
 2072                                     __func__);
 2073                                 moea64_sp_demote(pvo);
 2074                         }
 2075                         pvo->pvo_pte.prot &= ~VM_PROT_WRITE;
 2076                         ret = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE);
 2077                         if (ret < 0)
 2078                                 ret = LPTE_CHG;
 2079                         refchg |= ret;
 2080                         if (pvo->pvo_pmap == kernel_pmap)
 2081                                 isync();
 2082                 }
 2083                 PMAP_UNLOCK(pmap);
 2084         }
 2085         if ((refchg | atomic_readandclear_32(&m->md.mdpg_attrs)) & LPTE_CHG)
 2086                 vm_page_dirty(m);
 2087         vm_page_aflag_clear(m, PGA_WRITEABLE);
 2088         PV_PAGE_UNLOCK(m);
 2089 }
 2090 
 2091 /*
 2092  *      moea64_ts_referenced:
 2093  *
 2094  *      Return a count of reference bits for a page, clearing those bits.
 2095  *      It is not necessary for every reference bit to be cleared, but it
 2096  *      is necessary that 0 only be returned when there are truly no
 2097  *      reference bits set.
 2098  *
 2099  *      XXX: The exact number of bits to check and clear is a matter that
 2100  *      should be tested and standardized at some point in the future for
 2101  *      optimal aging of shared pages.
 2102  */
 2103 int
 2104 moea64_ts_referenced(vm_page_t m)
 2105 {
 2106 
 2107         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 2108             ("moea64_ts_referenced: page %p is not managed", m));
 2109         return (moea64_clear_bit(m, LPTE_REF));
 2110 }
 2111 
 2112 /*
 2113  * Modify the WIMG settings of all mappings for a page.
 2114  */
 2115 void
 2116 moea64_page_set_memattr(vm_page_t m, vm_memattr_t ma)
 2117 {
 2118         struct  pvo_entry *pvo;
 2119         int64_t refchg;
 2120         pmap_t  pmap;
 2121         uint64_t lo;
 2122 
 2123         CTR3(KTR_PMAP, "%s: pa=%#jx, ma=%#x",
 2124             __func__, (uintmax_t)VM_PAGE_TO_PHYS(m), ma);
 2125 
 2126         if ((m->oflags & VPO_UNMANAGED) != 0) {
 2127                 m->md.mdpg_cache_attrs = ma;
 2128                 return;
 2129         }
 2130 
 2131         lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), ma);
 2132 
 2133         PV_PAGE_LOCK(m);
 2134         LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
 2135                 pmap = pvo->pvo_pmap;
 2136                 PMAP_LOCK(pmap);
 2137                 if (!(pvo->pvo_vaddr & PVO_DEAD)) {
 2138                         if (PVO_IS_SP(pvo)) {
 2139                                 CTR1(KTR_PMAP,
 2140                                     "%s: demote before set_memattr", __func__);
 2141                                 moea64_sp_demote(pvo);
 2142                         }
 2143                         pvo->pvo_pte.pa &= ~LPTE_WIMG;
 2144                         pvo->pvo_pte.pa |= lo;
 2145                         refchg = moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE);
 2146                         if (refchg < 0)
 2147                                 refchg = (pvo->pvo_pte.prot & VM_PROT_WRITE) ?
 2148                                     LPTE_CHG : 0;
 2149                         if ((pvo->pvo_vaddr & PVO_MANAGED) &&
 2150                             (pvo->pvo_pte.prot & VM_PROT_WRITE)) {
 2151                                 refchg |=
 2152                                     atomic_readandclear_32(&m->md.mdpg_attrs);
 2153                                 if (refchg & LPTE_CHG)
 2154                                         vm_page_dirty(m);
 2155                                 if (refchg & LPTE_REF)
 2156                                         vm_page_aflag_set(m, PGA_REFERENCED);
 2157                         }
 2158                         if (pvo->pvo_pmap == kernel_pmap)
 2159                                 isync();
 2160                 }
 2161                 PMAP_UNLOCK(pmap);
 2162         }
 2163         m->md.mdpg_cache_attrs = ma;
 2164         PV_PAGE_UNLOCK(m);
 2165 }
 2166 
 2167 /*
 2168  * Map a wired page into kernel virtual address space.
 2169  */
 2170 void
 2171 moea64_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma)
 2172 {
  2173         int             error;
 2174         struct pvo_entry *pvo, *oldpvo;
 2175 
 2176         do {
 2177                 pvo = alloc_pvo_entry(0);
 2178                 if (pvo == NULL)
 2179                         vm_wait(NULL);
 2180         } while (pvo == NULL);
 2181         pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
 2182         pvo->pvo_pte.pa = (pa & ~ADDR_POFF) | moea64_calc_wimg(pa, ma);
 2183         pvo->pvo_vaddr |= PVO_WIRED;
 2184 
 2185         PMAP_LOCK(kernel_pmap);
 2186         oldpvo = moea64_pvo_find_va(kernel_pmap, va);
 2187         if (oldpvo != NULL)
 2188                 moea64_pvo_remove_from_pmap(oldpvo);
 2189         init_pvo_entry(pvo, kernel_pmap, va);
 2190         error = moea64_pvo_enter(pvo, NULL, NULL);
 2191         PMAP_UNLOCK(kernel_pmap);
 2192 
 2193         /* Free any dead pages */
 2194         if (oldpvo != NULL) {
 2195                 moea64_pvo_remove_from_page(oldpvo);
 2196                 free_pvo_entry(oldpvo);
 2197         }
 2198 
 2199         if (error != 0)
 2200                 panic("moea64_kenter: failed to enter va %#zx pa %#jx: %d", va,
 2201                     (uintmax_t)pa, error);
 2202 }
 2203 
 2204 void
 2205 moea64_kenter(vm_offset_t va, vm_paddr_t pa)
 2206 {
 2207 
 2208         moea64_kenter_attr(va, pa, VM_MEMATTR_DEFAULT);
 2209 }
 2210 
 2211 /*
 2212  * Extract the physical page address associated with the given kernel virtual
 2213  * address.
 2214  */
 2215 vm_paddr_t
 2216 moea64_kextract(vm_offset_t va)
 2217 {
 2218         struct          pvo_entry *pvo;
 2219         vm_paddr_t pa;
 2220 
 2221         /*
 2222          * Shortcut the direct-mapped case when applicable.  We never put
 2223          * anything but 1:1 (or 62-bit aliased) mappings below
 2224          * VM_MIN_KERNEL_ADDRESS.
 2225          */
 2226         if (va < VM_MIN_KERNEL_ADDRESS)
 2227                 return (va & ~DMAP_BASE_ADDRESS);
 2228 
 2229         PMAP_LOCK(kernel_pmap);
 2230         pvo = moea64_pvo_find_va(kernel_pmap, va);
 2231         KASSERT(pvo != NULL, ("moea64_kextract: no addr found for %#" PRIxPTR,
 2232             va));
 2233         pa = PVO_PADDR(pvo) | (va - PVO_VADDR(pvo));
 2234         PMAP_UNLOCK(kernel_pmap);
 2235         return (pa);
 2236 }
 2237 
 2238 /*
 2239  * Remove a wired page from kernel virtual address space.
 2240  */
 2241 void
 2242 moea64_kremove(vm_offset_t va)
 2243 {
 2244         moea64_remove(kernel_pmap, va, va + PAGE_SIZE);
 2245 }
 2246 
 2247 /*
 2248  * Provide a kernel pointer corresponding to a given userland pointer.
 2249  * The returned pointer is valid until the next time this function is
 2250  * called in this thread. This is used internally in copyin/copyout.
 2251  */
 2252 static int
 2253 moea64_map_user_ptr(pmap_t pm, volatile const void *uaddr,
 2254     void **kaddr, size_t ulen, size_t *klen)
 2255 {
 2256         size_t l;
 2257 #ifdef __powerpc64__
 2258         struct slb *slb;
 2259 #endif
 2260         register_t slbv;
 2261 
 2262         *kaddr = (char *)USER_ADDR + ((uintptr_t)uaddr & ~SEGMENT_MASK);
 2263         l = ((char *)USER_ADDR + SEGMENT_LENGTH) - (char *)(*kaddr);
 2264         if (l > ulen)
 2265                 l = ulen;
 2266         if (klen)
 2267                 *klen = l;
 2268         else if (l != ulen)
 2269                 return (EFAULT);
 2270 
 2271 #ifdef __powerpc64__
 2272         /* Try lockless look-up first */
 2273         slb = user_va_to_slb_entry(pm, (vm_offset_t)uaddr);
 2274 
 2275         if (slb == NULL) {
 2276                 /* If it isn't there, we need to pre-fault the VSID */
 2277                 PMAP_LOCK(pm);
 2278                 slbv = va_to_vsid(pm, (vm_offset_t)uaddr) << SLBV_VSID_SHIFT;
 2279                 PMAP_UNLOCK(pm);
 2280         } else {
 2281                 slbv = slb->slbv;
 2282         }
 2283 
 2284         /* Mark segment no-execute */
 2285         slbv |= SLBV_N;
 2286 #else
 2287         slbv = va_to_vsid(pm, (vm_offset_t)uaddr);
 2288 
 2289         /* Mark segment no-execute */
 2290         slbv |= SR_N;
 2291 #endif
 2292 
 2293         /* If we have already set this VSID, we can just return */
 2294         if (curthread->td_pcb->pcb_cpu.aim.usr_vsid == slbv)
 2295                 return (0);
 2296 
 2297         __asm __volatile("isync");
 2298         curthread->td_pcb->pcb_cpu.aim.usr_segm =
 2299             (uintptr_t)uaddr >> ADDR_SR_SHFT;
 2300         curthread->td_pcb->pcb_cpu.aim.usr_vsid = slbv;
 2301 #ifdef __powerpc64__
 2302         __asm __volatile ("slbie %0; slbmte %1, %2; isync" ::
 2303             "r"(USER_ADDR), "r"(slbv), "r"(USER_SLB_SLBE));
 2304 #else
 2305         __asm __volatile("mtsr %0,%1; isync" :: "n"(USER_SR), "r"(slbv));
 2306 #endif
 2307 
 2308         return (0);
 2309 }
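
       /*
        * A rough sketch of how a copyin()-style consumer drives the
        * per-segment interface above (hypothetical and simplified; the real
        * copy routines also arrange fault recovery around the access):
        *
        *	while (ulen > 0) {
        *		if (moea64_map_user_ptr(pm, uaddr, &kaddr, ulen, &l))
        *			return (EFAULT);
        *		bcopy(kaddr, kbuf, l);	// user segment now visible at kaddr
        *		uaddr = (const char *)uaddr + l;
        *		kbuf = (char *)kbuf + l;
        *		ulen -= l;
        *	}
        */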
 2310 
 2311 /*
 2312  * Figure out where a given kernel pointer (usually in a fault) points
 2313  * to from the VM's perspective, potentially remapping into userland's
 2314  * address space.
 2315  */
 2316 static int
 2317 moea64_decode_kernel_ptr(vm_offset_t addr, int *is_user,
 2318     vm_offset_t *decoded_addr)
 2319 {
 2320         vm_offset_t user_sr;
 2321 
 2322         if ((addr >> ADDR_SR_SHFT) == (USER_ADDR >> ADDR_SR_SHFT)) {
 2323                 user_sr = curthread->td_pcb->pcb_cpu.aim.usr_segm;
 2324                 addr &= ADDR_PIDX | ADDR_POFF;
 2325                 addr |= user_sr << ADDR_SR_SHFT;
 2326                 *decoded_addr = addr;
 2327                 *is_user = 1;
 2328         } else {
 2329                 *decoded_addr = addr;
 2330                 *is_user = 0;
 2331         }
 2332 
 2333         return (0);
 2334 }
 2335 
 2336 /*
 2337  * Map a range of physical addresses into kernel virtual address space.
 2338  *
 2339  * The value passed in *virt is a suggested virtual address for the mapping.
 2340  * Architectures which can support a direct-mapped physical to virtual region
 2341  * can return the appropriate address within that region, leaving '*virt'
 2342  * unchanged.  Other architectures should map the pages starting at '*virt' and
 2343  * update '*virt' with the first usable address after the mapped region.
 2344  */
 2345 vm_offset_t
 2346 moea64_map(vm_offset_t *virt, vm_paddr_t pa_start,
 2347     vm_paddr_t pa_end, int prot)
 2348 {
 2349         vm_offset_t     sva, va;
 2350 
 2351         if (hw_direct_map) {
 2352                 /*
 2353                  * Check if every page in the region is covered by the direct
  2354                  * map.  The direct map covers all of physical memory, so use
  2355                  * moea64_calc_wimg() as a shortcut: if a page is ordinary
  2356                  * physical memory, the direct map covers it.
 2357                  */
 2358                 for (va = pa_start; va < pa_end; va += PAGE_SIZE)
 2359                         if (moea64_calc_wimg(va, VM_MEMATTR_DEFAULT) != LPTE_M)
 2360                                 break;
 2361                 if (va == pa_end)
 2362                         return (PHYS_TO_DMAP(pa_start));
 2363         }
 2364         sva = *virt;
 2365         va = sva;
 2366         /* XXX respect prot argument */
 2367         for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE)
 2368                 moea64_kenter(va, pa_start);
 2369         *virt = va;
 2370 
 2371         return (sva);
 2372 }
 2373 
 2374 /*
 2375  * Returns true if the pmap's pv is one of the first
 2376  * 16 pvs linked to from this page.  This count may
 2377  * be changed upwards or downwards in the future; it
 2378  * is only necessary that true be returned for a small
 2379  * subset of pmaps for proper page aging.
 2380  */
 2381 boolean_t
 2382 moea64_page_exists_quick(pmap_t pmap, vm_page_t m)
 2383 {
 2384         int loops;
 2385         struct pvo_entry *pvo;
 2386         boolean_t rv;
 2387 
 2388         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 2389             ("moea64_page_exists_quick: page %p is not managed", m));
 2390         loops = 0;
 2391         rv = FALSE;
 2392         PV_PAGE_LOCK(m);
 2393         LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
 2394                 if (!(pvo->pvo_vaddr & PVO_DEAD) && pvo->pvo_pmap == pmap) {
 2395                         rv = TRUE;
 2396                         break;
 2397                 }
 2398                 if (++loops >= 16)
 2399                         break;
 2400         }
 2401         PV_PAGE_UNLOCK(m);
 2402         return (rv);
 2403 }
 2404 
 2405 void
 2406 moea64_page_init(vm_page_t m)
 2407 {
 2408 
 2409         m->md.mdpg_attrs = 0;
 2410         m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT;
 2411         LIST_INIT(&m->md.mdpg_pvoh);
 2412 }
 2413 
 2414 /*
 2415  * Return the number of managed mappings to the given physical page
 2416  * that are wired.
 2417  */
 2418 int
 2419 moea64_page_wired_mappings(vm_page_t m)
 2420 {
 2421         struct pvo_entry *pvo;
 2422         int count;
 2423 
 2424         count = 0;
 2425         if ((m->oflags & VPO_UNMANAGED) != 0)
 2426                 return (count);
 2427         PV_PAGE_LOCK(m);
 2428         LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink)
 2429                 if ((pvo->pvo_vaddr & (PVO_DEAD | PVO_WIRED)) == PVO_WIRED)
 2430                         count++;
 2431         PV_PAGE_UNLOCK(m);
 2432         return (count);
 2433 }
 2434 
 2435 static uintptr_t        moea64_vsidcontext;
 2436 
 2437 uintptr_t
 2438 moea64_get_unique_vsid(void) {
 2439         u_int entropy;
 2440         register_t hash;
 2441         uint32_t mask;
 2442         int i;
 2443 
 2444         entropy = 0;
 2445         __asm __volatile("mftb %0" : "=r"(entropy));
 2446 
 2447         mtx_lock(&moea64_slb_mutex);
 2448         for (i = 0; i < NVSIDS; i += VSID_NBPW) {
 2449                 u_int   n;
 2450 
 2451                 /*
 2452                  * Create a new value by multiplying by a prime and adding in
 2453                  * entropy from the timebase register.  This is to make the
 2454                  * VSID more random so that the PT hash function collides
  2455                  * less often.  (Note that the prime causes gcc to do shifts
 2456                  * instead of a multiply.)
 2457                  */
 2458                 moea64_vsidcontext = (moea64_vsidcontext * 0x1105) + entropy;
 2459                 hash = moea64_vsidcontext & (NVSIDS - 1);
 2460                 if (hash == 0)          /* 0 is special, avoid it */
 2461                         continue;
 2462                 n = hash >> 5;
 2463                 mask = 1 << (hash & (VSID_NBPW - 1));
 2464                 hash = (moea64_vsidcontext & VSID_HASHMASK);
 2465                 if (moea64_vsid_bitmap[n] & mask) {     /* collision? */
 2466                         /* anything free in this bucket? */
 2467                         if (moea64_vsid_bitmap[n] == 0xffffffff) {
 2468                                 entropy = (moea64_vsidcontext >> 20);
 2469                                 continue;
 2470                         }
 2471                         i = ffs(~moea64_vsid_bitmap[n]) - 1;
 2472                         mask = 1 << i;
 2473                         hash &= rounddown2(VSID_HASHMASK, VSID_NBPW);
 2474                         hash |= i;
 2475                 }
 2476                 if (hash == VSID_VRMA)  /* also special, avoid this too */
 2477                         continue;
 2478                 KASSERT(!(moea64_vsid_bitmap[n] & mask),
 2479                     ("Allocating in-use VSID %#zx\n", hash));
 2480                 moea64_vsid_bitmap[n] |= mask;
 2481                 mtx_unlock(&moea64_slb_mutex);
 2482                 return (hash);
 2483         }
 2484 
 2485         mtx_unlock(&moea64_slb_mutex);
  2486         panic("%s: out of segments", __func__);
 2487 }
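
       /*
        * A worked example of the bitmap arithmetic above (made-up value,
        * with VSID_NBPW == 32 as the ">> 5" implies): if the new
        * moea64_vsidcontext yields hash = 0x4d, then n = 0x4d >> 5 = 2 and
        * mask = 1 << (0x4d & 31) = 1 << 13, so bit 13 of
        * moea64_vsid_bitmap[2] records that this VSID is now in use.
        */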
 2488 
 2489 #ifdef __powerpc64__
 2490 int
 2491 moea64_pinit(pmap_t pmap)
 2492 {
 2493 
 2494         RB_INIT(&pmap->pmap_pvo);
 2495 
 2496         pmap->pm_slb_tree_root = slb_alloc_tree();
 2497         pmap->pm_slb = slb_alloc_user_cache();
 2498         pmap->pm_slb_len = 0;
 2499 
 2500         return (1);
 2501 }
 2502 #else
 2503 int
 2504 moea64_pinit(pmap_t pmap)
 2505 {
 2506         int     i;
 2507         uint32_t hash;
 2508 
 2509         RB_INIT(&pmap->pmap_pvo);
 2510 
 2511         if (pmap_bootstrapped)
 2512                 pmap->pmap_phys = (pmap_t)moea64_kextract((vm_offset_t)pmap);
 2513         else
 2514                 pmap->pmap_phys = pmap;
 2515 
 2516         /*
 2517          * Allocate some segment registers for this pmap.
 2518          */
 2519         hash = moea64_get_unique_vsid();
 2520 
 2521         for (i = 0; i < 16; i++)
 2522                 pmap->pm_sr[i] = VSID_MAKE(i, hash);
 2523 
 2524         KASSERT(pmap->pm_sr[0] != 0, ("moea64_pinit: pm_sr[0] = 0"));
 2525 
 2526         return (1);
 2527 }
 2528 #endif
 2529 
 2530 /*
 2531  * Initialize the pmap associated with process 0.
 2532  */
 2533 void
 2534 moea64_pinit0(pmap_t pm)
 2535 {
 2536 
 2537         PMAP_LOCK_INIT(pm);
 2538         moea64_pinit(pm);
 2539         bzero(&pm->pm_stats, sizeof(pm->pm_stats));
 2540 }
 2541 
 2542 /*
 2543  * Set the physical protection on the specified range of this map as requested.
 2544  */
 2545 static void
 2546 moea64_pvo_protect( pmap_t pm, struct pvo_entry *pvo, vm_prot_t prot)
 2547 {
 2548         struct vm_page *pg;
 2549         vm_prot_t oldprot;
 2550         int32_t refchg;
 2551 
 2552         PMAP_LOCK_ASSERT(pm, MA_OWNED);
 2553 
 2554         /*
 2555          * Change the protection of the page.
 2556          */
 2557         oldprot = pvo->pvo_pte.prot;
 2558         pvo->pvo_pte.prot = prot;
 2559         pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
 2560 
 2561         /*
 2562          * If the PVO is in the page table, update mapping
 2563          */
 2564         refchg = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE);
 2565         if (refchg < 0)
 2566                 refchg = (oldprot & VM_PROT_WRITE) ? LPTE_CHG : 0;
 2567 
 2568         if (pm != kernel_pmap && pg != NULL &&
 2569             (pg->a.flags & PGA_EXECUTABLE) == 0 &&
 2570             (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) {
 2571                 if ((pg->oflags & VPO_UNMANAGED) == 0)
 2572                         vm_page_aflag_set(pg, PGA_EXECUTABLE);
 2573                 moea64_syncicache(pm, PVO_VADDR(pvo),
 2574                     PVO_PADDR(pvo), PAGE_SIZE);
 2575         }
 2576 
 2577         /*
 2578          * Update vm about the REF/CHG bits if the page is managed and we have
 2579          * removed write access.
 2580          */
 2581         if (pg != NULL && (pvo->pvo_vaddr & PVO_MANAGED) &&
 2582             (oldprot & VM_PROT_WRITE)) {
 2583                 refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs);
 2584                 if (refchg & LPTE_CHG)
 2585                         vm_page_dirty(pg);
 2586                 if (refchg & LPTE_REF)
 2587                         vm_page_aflag_set(pg, PGA_REFERENCED);
 2588         }
 2589 }
 2590 
 2591 void
 2592 moea64_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva,
 2593     vm_prot_t prot)
 2594 {
 2595         struct  pvo_entry *pvo, key;
 2596 
 2597         CTR4(KTR_PMAP, "moea64_protect: pm=%p sva=%#x eva=%#x prot=%#x", pm,
 2598             sva, eva, prot);
 2599 
 2600         KASSERT(pm == &curproc->p_vmspace->vm_pmap || pm == kernel_pmap,
 2601             ("moea64_protect: non current pmap"));
 2602 
 2603         if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
 2604                 moea64_remove(pm, sva, eva);
 2605                 return;
 2606         }
 2607 
 2608         PMAP_LOCK(pm);
 2609         key.pvo_vaddr = sva;
 2610         for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
 2611             pvo != NULL && PVO_VADDR(pvo) < eva;
 2612             pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
 2613                 if (PVO_IS_SP(pvo)) {
 2614                         if (moea64_sp_pvo_in_range(pvo, sva, eva)) {
 2615                                 pvo = moea64_sp_protect(pvo, prot);
 2616                                 continue;
 2617                         } else {
 2618                                 CTR1(KTR_PMAP, "%s: demote before protect",
 2619                                     __func__);
 2620                                 moea64_sp_demote(pvo);
 2621                         }
 2622                 }
 2623                 moea64_pvo_protect(pm, pvo, prot);
 2624         }
 2625         PMAP_UNLOCK(pm);
 2626 }
 2627 
 2628 /*
 2629  * Map a list of wired pages into kernel virtual address space.  This is
 2630  * intended for temporary mappings which do not need page modification or
 2631  * references recorded.  Existing mappings in the region are overwritten.
 2632  */
 2633 void
 2634 moea64_qenter(vm_offset_t va, vm_page_t *m, int count)
 2635 {
 2636         while (count-- > 0) {
 2637                 moea64_kenter(va, VM_PAGE_TO_PHYS(*m));
 2638                 va += PAGE_SIZE;
 2639                 m++;
 2640         }
 2641 }
 2642 
 2643 /*
 2644  * Remove page mappings from kernel virtual address space.  Intended for
 2645  * temporary mappings entered by moea64_qenter.
 2646  */
 2647 void
 2648 moea64_qremove(vm_offset_t va, int count)
 2649 {
 2650         while (count-- > 0) {
 2651                 moea64_kremove(va);
 2652                 va += PAGE_SIZE;
 2653         }
 2654 }
 2655 
 2656 void
 2657 moea64_release_vsid(uint64_t vsid)
 2658 {
 2659         int idx, mask;
 2660 
 2661         mtx_lock(&moea64_slb_mutex);
 2662         idx = vsid & (NVSIDS-1);
 2663         mask = 1 << (idx % VSID_NBPW);
 2664         idx /= VSID_NBPW;
 2665         KASSERT(moea64_vsid_bitmap[idx] & mask,
 2666             ("Freeing unallocated VSID %#jx", vsid));
 2667         moea64_vsid_bitmap[idx] &= ~mask;
 2668         mtx_unlock(&moea64_slb_mutex);
 2669 }
 2670 
 2671 void
 2672 moea64_release(pmap_t pmap)
 2673 {
 2674 
 2675         /*
 2676          * Free segment registers' VSIDs
 2677          */
 2678     #ifdef __powerpc64__
 2679         slb_free_tree(pmap);
 2680         slb_free_user_cache(pmap->pm_slb);
 2681     #else
 2682         KASSERT(pmap->pm_sr[0] != 0, ("moea64_release: pm_sr[0] = 0"));
 2683 
 2684         moea64_release_vsid(VSID_TO_HASH(pmap->pm_sr[0]));
 2685     #endif
 2686 }
 2687 
 2688 /*
 2689  * Remove all pages mapped by the specified pmap
 2690  */
 2691 void
 2692 moea64_remove_pages(pmap_t pm)
 2693 {
 2694         struct pvo_entry *pvo, *tpvo;
 2695         struct pvo_dlist tofree;
 2696 
 2697         SLIST_INIT(&tofree);
 2698 
 2699         PMAP_LOCK(pm);
 2700         RB_FOREACH_SAFE(pvo, pvo_tree, &pm->pmap_pvo, tpvo) {
 2701                 if (pvo->pvo_vaddr & PVO_WIRED)
 2702                         continue;
 2703 
 2704                 /*
 2705                  * For locking reasons, remove this from the page table and
 2706                  * pmap, but save delinking from the vm_page for a second
  2707                  * pass.
 2708                  */
 2709                 moea64_pvo_remove_from_pmap(pvo);
 2710                 SLIST_INSERT_HEAD(&tofree, pvo, pvo_dlink);
 2711         }
 2712         PMAP_UNLOCK(pm);
 2713 
 2714         while (!SLIST_EMPTY(&tofree)) {
 2715                 pvo = SLIST_FIRST(&tofree);
 2716                 SLIST_REMOVE_HEAD(&tofree, pvo_dlink);
 2717                 moea64_pvo_remove_from_page(pvo);
 2718                 free_pvo_entry(pvo);
 2719         }
 2720 }
 2721 
 2722 static void
 2723 moea64_remove_locked(pmap_t pm, vm_offset_t sva, vm_offset_t eva,
 2724     struct pvo_dlist *tofree)
 2725 {
 2726         struct pvo_entry *pvo, *tpvo, key;
 2727 
 2728         PMAP_LOCK_ASSERT(pm, MA_OWNED);
 2729 
 2730         key.pvo_vaddr = sva;
 2731         for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
 2732             pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
 2733                 if (PVO_IS_SP(pvo)) {
 2734                         if (moea64_sp_pvo_in_range(pvo, sva, eva)) {
 2735                                 tpvo = moea64_sp_remove(pvo, tofree);
 2736                                 continue;
 2737                         } else {
 2738                                 CTR1(KTR_PMAP, "%s: demote before remove",
 2739                                     __func__);
 2740                                 moea64_sp_demote(pvo);
 2741                         }
 2742                 }
 2743                 tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
 2744 
 2745                 /*
 2746                  * For locking reasons, remove this from the page table and
 2747                  * pmap, but save delinking from the vm_page for a second
  2748                  * pass.
 2749                  */
 2750                 moea64_pvo_remove_from_pmap(pvo);
 2751                 SLIST_INSERT_HEAD(tofree, pvo, pvo_dlink);
 2752         }
 2753 }
 2754 
 2755 /*
 2756  * Remove the given range of addresses from the specified map.
 2757  */
 2758 void
 2759 moea64_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva)
 2760 {
 2761         struct pvo_entry *pvo;
 2762         struct pvo_dlist tofree;
 2763 
 2764         /*
 2765          * Perform an unsynchronized read.  This is, however, safe.
 2766          */
 2767         if (pm->pm_stats.resident_count == 0)
 2768                 return;
 2769 
 2770         SLIST_INIT(&tofree);
 2771         PMAP_LOCK(pm);
 2772         moea64_remove_locked(pm, sva, eva, &tofree);
 2773         PMAP_UNLOCK(pm);
 2774 
 2775         while (!SLIST_EMPTY(&tofree)) {
 2776                 pvo = SLIST_FIRST(&tofree);
 2777                 SLIST_REMOVE_HEAD(&tofree, pvo_dlink);
 2778                 moea64_pvo_remove_from_page(pvo);
 2779                 free_pvo_entry(pvo);
 2780         }
 2781 }
 2782 
 2783 /*
 2784  * Remove physical page from all pmaps in which it resides. moea64_pvo_remove()
 2785  * will reflect changes in pte's back to the vm_page.
 2786  */
 2787 void
 2788 moea64_remove_all(vm_page_t m)
 2789 {
 2790         struct  pvo_entry *pvo, *next_pvo;
 2791         struct  pvo_head freequeue;
 2792         int     wasdead;
 2793         pmap_t  pmap;
 2794 
 2795         LIST_INIT(&freequeue);
 2796 
 2797         PV_PAGE_LOCK(m);
 2798         LIST_FOREACH_SAFE(pvo, vm_page_to_pvoh(m), pvo_vlink, next_pvo) {
 2799                 pmap = pvo->pvo_pmap;
 2800                 PMAP_LOCK(pmap);
 2801                 wasdead = (pvo->pvo_vaddr & PVO_DEAD);
 2802                 if (!wasdead) {
 2803                         if (PVO_IS_SP(pvo)) {
 2804                                 CTR1(KTR_PMAP, "%s: demote before remove_all",
 2805                                     __func__);
 2806                                 moea64_sp_demote(pvo);
 2807                         }
 2808                         moea64_pvo_remove_from_pmap(pvo);
 2809                 }
 2810                 moea64_pvo_remove_from_page_locked(pvo, m);
 2811                 if (!wasdead)
 2812                         LIST_INSERT_HEAD(&freequeue, pvo, pvo_vlink);
 2813                 PMAP_UNLOCK(pmap);
  2814
 2815         }
 2816         KASSERT(!pmap_page_is_mapped(m), ("Page still has mappings"));
 2817         KASSERT((m->a.flags & PGA_WRITEABLE) == 0, ("Page still writable"));
 2818         PV_PAGE_UNLOCK(m);
 2819 
 2820         /* Clean up UMA allocations */
 2821         LIST_FOREACH_SAFE(pvo, &freequeue, pvo_vlink, next_pvo)
 2822                 free_pvo_entry(pvo);
 2823 }
 2824 
 2825 /*
 2826  * Allocate a physical page of memory directly from the phys_avail map.
 2827  * Can only be called from moea64_bootstrap before avail start and end are
 2828  * calculated.
 2829  */
 2830 vm_offset_t
 2831 moea64_bootstrap_alloc(vm_size_t size, vm_size_t align)
 2832 {
 2833         vm_offset_t     s, e;
 2834         int             i, j;
 2835 
 2836         size = round_page(size);
 2837         for (i = 0; phys_avail[i + 1] != 0; i += 2) {
 2838                 if (align != 0)
 2839                         s = roundup2(phys_avail[i], align);
 2840                 else
 2841                         s = phys_avail[i];
 2842                 e = s + size;
 2843 
 2844                 if (s < phys_avail[i] || e > phys_avail[i + 1])
 2845                         continue;
 2846 
 2847                 if (s + size > platform_real_maxaddr())
 2848                         continue;
 2849 
 2850                 if (s == phys_avail[i]) {
 2851                         phys_avail[i] += size;
 2852                 } else if (e == phys_avail[i + 1]) {
 2853                         phys_avail[i + 1] -= size;
 2854                 } else {
 2855                         for (j = phys_avail_count * 2; j > i; j -= 2) {
 2856                                 phys_avail[j] = phys_avail[j - 2];
 2857                                 phys_avail[j + 1] = phys_avail[j - 1];
 2858                         }
 2859 
 2860                         phys_avail[i + 3] = phys_avail[i + 1];
 2861                         phys_avail[i + 1] = s;
 2862                         phys_avail[i + 2] = e;
 2863                         phys_avail_count++;
 2864                 }
 2865 
 2866                 return (s);
 2867         }
 2868         panic("moea64_bootstrap_alloc: could not allocate memory");
 2869 }
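
       /*
        * A worked example of the phys_avail bookkeeping above (made-up
        * numbers): given an entry {0x3000, 0x100000} with size = 0x4000 and
        * align = 0x10000, the allocation lands at s = 0x10000, e = 0x14000.
        * Neither end coincides with the entry's bounds, so the final branch
        * splits it into {0x3000, 0x10000} and {0x14000, 0x100000} and bumps
        * phys_avail_count.
        */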
 2870 
 2871 static int
 2872 moea64_pvo_enter(struct pvo_entry *pvo, struct pvo_head *pvo_head,
 2873     struct pvo_entry **oldpvop)
 2874 {
 2875         struct pvo_entry *old_pvo;
 2876         int err;
 2877 
 2878         PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
 2879 
 2880         STAT_MOEA64(moea64_pvo_enter_calls++);
 2881 
 2882         /*
 2883          * Add to pmap list
 2884          */
 2885         old_pvo = RB_INSERT(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo);
 2886 
 2887         if (old_pvo != NULL) {
 2888                 if (oldpvop != NULL)
 2889                         *oldpvop = old_pvo;
 2890                 return (EEXIST);
 2891         }
 2892 
 2893         if (pvo_head != NULL) {
 2894                 LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink);
 2895         }
 2896 
 2897         if (pvo->pvo_vaddr & PVO_WIRED)
 2898                 pvo->pvo_pmap->pm_stats.wired_count++;
 2899         pvo->pvo_pmap->pm_stats.resident_count++;
 2900 
 2901         /*
 2902          * Insert it into the hardware page table
 2903          */
 2904         err = moea64_pte_insert(pvo);
 2905         if (err != 0) {
 2906                 panic("moea64_pvo_enter: overflow");
 2907         }
 2908 
 2909         STAT_MOEA64(moea64_pvo_entries++);
 2910 
 2911         if (pvo->pvo_pmap == kernel_pmap)
 2912                 isync();
 2913 
 2914 #ifdef __powerpc64__
 2915         /*
 2916          * Make sure all our bootstrap mappings are in the SLB as soon
 2917          * as virtual memory is switched on.
 2918          */
 2919         if (!pmap_bootstrapped)
 2920                 moea64_bootstrap_slb_prefault(PVO_VADDR(pvo),
 2921                     pvo->pvo_vaddr & PVO_LARGE);
 2922 #endif
 2923 
 2924         return (0);
 2925 }
 2926 
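/*
 * Typical caller pattern, condensed from moea64_sp_enter() below (error
 * handling and the exact locking calls omitted; this is only a sketch):
 *
 *      pvo = alloc_pvo_entry(0);
 *      pvo->pvo_pte.prot = prot;
 *      pvo->pvo_pte.pa = pa | moea64_calc_wimg(pa, pmap_page_get_memattr(m));
 *      pvo->pvo_vaddr |= PVO_MANAGED;   (plus PVO_WIRED/PVO_LARGE as needed)
 *      init_pvo_entry(pvo, pmap, va);
 *      ... take the PV lock and PMAP_LOCK(pmap) ...
 *      error = moea64_pvo_enter(pvo, pvo_head, NULL);
 *
 * A return value of EEXIST means a PVO for the same VA already exists;
 * the existing entry is handed back through the third argument when it
 * is non-NULL.
 */
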
 2927 static void
 2928 moea64_pvo_remove_from_pmap(struct pvo_entry *pvo)
 2929 {
 2930         struct  vm_page *pg;
 2931         int32_t refchg;
 2932 
 2933         KASSERT(pvo->pvo_pmap != NULL, ("Trying to remove PVO with no pmap"));
 2934         PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
 2935         KASSERT(!(pvo->pvo_vaddr & PVO_DEAD), ("Trying to remove dead PVO"));
 2936 
 2937         /*
 2938          * If there is an active pte entry, we need to deactivate it
 2939          */
 2940         refchg = moea64_pte_unset(pvo);
 2941         if (refchg < 0) {
 2942                 /*
 2943                  * If it was evicted from the page table, be pessimistic and
 2944                  * dirty the page.
 2945                  */
 2946                 if (pvo->pvo_pte.prot & VM_PROT_WRITE)
 2947                         refchg = LPTE_CHG;
 2948                 else
 2949                         refchg = 0;
 2950         }
 2951 
 2952         /*
 2953          * Update our statistics.
 2954          */
 2955         pvo->pvo_pmap->pm_stats.resident_count--;
 2956         if (pvo->pvo_vaddr & PVO_WIRED)
 2957                 pvo->pvo_pmap->pm_stats.wired_count--;
 2958 
 2959         /*
 2960          * Remove this PVO from the pmap list.
 2961          */
 2962         RB_REMOVE(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo);
 2963 
 2964         /*
 2965          * Mark this for the next sweep
 2966          */
 2967         pvo->pvo_vaddr |= PVO_DEAD;
 2968 
 2969         /* Send RC bits to VM */
 2970         if ((pvo->pvo_vaddr & PVO_MANAGED) &&
 2971             (pvo->pvo_pte.prot & VM_PROT_WRITE)) {
 2972                 pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
 2973                 if (pg != NULL) {
 2974                         refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs);
 2975                         if (refchg & LPTE_CHG)
 2976                                 vm_page_dirty(pg);
 2977                         if (refchg & LPTE_REF)
 2978                                 vm_page_aflag_set(pg, PGA_REFERENCED);
 2979                 }
 2980         }
 2981 }
 2982 
 2983 static inline void
 2984 moea64_pvo_remove_from_page_locked(struct pvo_entry *pvo,
 2985     vm_page_t m)
 2986 {
 2987 
 2988         KASSERT(pvo->pvo_vaddr & PVO_DEAD, ("Trying to delink live page"));
 2989 
 2990         /* Use NULL pmaps as a sentinel for races in page deletion */
 2991         if (pvo->pvo_pmap == NULL)
 2992                 return;
 2993         pvo->pvo_pmap = NULL;
 2994 
 2995         /*
 2996          * Update vm about page writeability/executability if managed
 2997          */
 2998         PV_LOCKASSERT(PVO_PADDR(pvo));
 2999         if (pvo->pvo_vaddr & PVO_MANAGED) {
 3000                 if (m != NULL) {
 3001                         LIST_REMOVE(pvo, pvo_vlink);
 3002                         if (LIST_EMPTY(vm_page_to_pvoh(m)))
 3003                                 vm_page_aflag_clear(m,
 3004                                     PGA_WRITEABLE | PGA_EXECUTABLE);
 3005                 }
 3006         }
 3007 
 3008         STAT_MOEA64(moea64_pvo_entries--);
 3009         STAT_MOEA64(moea64_pvo_remove_calls++);
 3010 }
 3011 
 3012 static void
 3013 moea64_pvo_remove_from_page(struct pvo_entry *pvo)
 3014 {
 3015         vm_page_t pg = NULL;
 3016 
 3017         if (pvo->pvo_vaddr & PVO_MANAGED)
 3018                 pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
 3019 
 3020         PV_LOCK(PVO_PADDR(pvo));
 3021         moea64_pvo_remove_from_page_locked(pvo, pg);
 3022         PV_UNLOCK(PVO_PADDR(pvo));
 3023 }
 3024 
 3025 static struct pvo_entry *
 3026 moea64_pvo_find_va(pmap_t pm, vm_offset_t va)
 3027 {
 3028         struct pvo_entry key;
 3029 
 3030         PMAP_LOCK_ASSERT(pm, MA_OWNED);
 3031 
 3032         key.pvo_vaddr = va & ~ADDR_POFF;
 3033         return (RB_FIND(pvo_tree, &pm->pmap_pvo, &key));
 3034 }
 3035 
 3036 static boolean_t
 3037 moea64_query_bit(vm_page_t m, uint64_t ptebit)
 3038 {
 3039         struct  pvo_entry *pvo;
 3040         int64_t ret;
 3041         boolean_t rv;
 3042         vm_page_t sp;
 3043 
 3044         /*
 3045          * See if this bit is stored in the page already.
 3046          *
 3047          * For superpages, the bit is stored in the first vm page.
 3048          */
 3049         if ((m->md.mdpg_attrs & ptebit) != 0 ||
 3050             ((sp = PHYS_TO_VM_PAGE(VM_PAGE_TO_PHYS(m) & ~HPT_SP_MASK)) != NULL &&
 3051              (sp->md.mdpg_attrs & (ptebit | MDPG_ATTR_SP)) ==
 3052              (ptebit | MDPG_ATTR_SP)))
 3053                 return (TRUE);
 3054 
 3055         /*
 3056          * Examine each PTE.  Sync so that any pending REF/CHG bits are
 3057          * flushed to the PTEs.
 3058          */
 3059         rv = FALSE;
 3060         powerpc_sync();
 3061         PV_PAGE_LOCK(m);
 3062         LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
 3063                 if (PVO_IS_SP(pvo)) {
 3064                         ret = moea64_sp_query(pvo, ptebit);
 3065                         /*
 3066                          * If SP was not demoted, check its REF/CHG bits here.
 3067                          */
 3068                         if (ret != -1) {
 3069                                 if ((ret & ptebit) != 0) {
 3070                                         rv = TRUE;
 3071                                         break;
 3072                                 }
 3073                                 continue;
 3074                         }
 3075                         /* else, fallthrough */
 3076                 }
 3077 
 3078                 ret = 0;
 3079 
 3080                 /*
 3081                  * See if this pvo has a valid PTE.  If so, fetch the
 3082                  * REF/CHG bits from the valid PTE.  If the appropriate
 3083                  * ptebit is set, return success.
 3084                  */
 3085                 PMAP_LOCK(pvo->pvo_pmap);
 3086                 if (!(pvo->pvo_vaddr & PVO_DEAD))
 3087                         ret = moea64_pte_synch(pvo);
 3088                 PMAP_UNLOCK(pvo->pvo_pmap);
 3089 
 3090                 if (ret > 0) {
 3091                         atomic_set_32(&m->md.mdpg_attrs,
 3092                             ret & (LPTE_CHG | LPTE_REF));
 3093                         if (ret & ptebit) {
 3094                                 rv = TRUE;
 3095                                 break;
 3096                         }
 3097                 }
 3098         }
 3099         PV_PAGE_UNLOCK(m);
 3100 
 3101         return (rv);
 3102 }
 3103 
 3104 static u_int
 3105 moea64_clear_bit(vm_page_t m, uint64_t ptebit)
 3106 {
 3107         u_int   count;
 3108         struct  pvo_entry *pvo;
 3109         int64_t ret;
 3110 
 3111         /*
 3112          * Sync so that any pending REF/CHG bits are flushed to the PTEs (so
 3113          * we can reset the right ones).
 3114          */
 3115         powerpc_sync();
 3116 
 3117         /*
 3118          * For each pvo entry, clear the pte's ptebit.
 3119          */
 3120         count = 0;
 3121         PV_PAGE_LOCK(m);
 3122         LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
 3123                 if (PVO_IS_SP(pvo)) {
 3124                         if ((ret = moea64_sp_clear(pvo, m, ptebit)) != -1) {
 3125                                 count += ret;
 3126                                 continue;
 3127                         }
 3128                 }
 3129                 ret = 0;
 3130 
 3131                 PMAP_LOCK(pvo->pvo_pmap);
 3132                 if (!(pvo->pvo_vaddr & PVO_DEAD))
 3133                         ret = moea64_pte_clear(pvo, ptebit);
 3134                 PMAP_UNLOCK(pvo->pvo_pmap);
 3135 
 3136                 if (ret > 0 && (ret & ptebit))
 3137                         count++;
 3138         }
 3139         atomic_clear_32(&m->md.mdpg_attrs, ptebit);
 3140         PV_PAGE_UNLOCK(m);
 3141 
 3142         return (count);
 3143 }
 3144 
 3145 boolean_t
 3146 moea64_dev_direct_mapped(vm_paddr_t pa, vm_size_t size)
 3147 {
 3148         struct pvo_entry *pvo, key;
 3149         vm_offset_t ppa;
 3150         int error = 0;
 3151 
 3152         if (hw_direct_map && mem_valid(pa, size) == 0)
 3153                 return (0);
 3154 
 3155         PMAP_LOCK(kernel_pmap);
 3156         ppa = pa & ~ADDR_POFF;
 3157         key.pvo_vaddr = DMAP_BASE_ADDRESS + ppa;
 3158         for (pvo = RB_FIND(pvo_tree, &kernel_pmap->pmap_pvo, &key);
 3159             ppa < pa + size; ppa += PAGE_SIZE,
 3160             pvo = RB_NEXT(pvo_tree, &kernel_pmap->pmap_pvo, pvo)) {
 3161                 if (pvo == NULL || PVO_PADDR(pvo) != ppa) {
 3162                         error = EFAULT;
 3163                         break;
 3164                 }
 3165         }
 3166         PMAP_UNLOCK(kernel_pmap);
 3167 
 3168         return (error);
 3169 }
 3170 
 3171 /*
 3172  * Map a set of physical memory pages into the kernel virtual
 3173  * address space. Return a pointer to where it is mapped. This
 3174  * routine is intended to be used for mapping device memory,
 3175  * NOT real memory.
 3176  */
 3177 void *
 3178 moea64_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t ma)
 3179 {
 3180         vm_offset_t va, tmpva, ppa, offset;
 3181 
 3182         ppa = trunc_page(pa);
 3183         offset = pa & PAGE_MASK;
 3184         size = roundup2(offset + size, PAGE_SIZE);
 3185 
 3186         va = kva_alloc(size);
 3187 
 3188         if (!va)
 3189                 panic("moea64_mapdev: Couldn't alloc kernel virtual memory");
 3190 
 3191         for (tmpva = va; size > 0;) {
 3192                 moea64_kenter_attr(tmpva, ppa, ma);
 3193                 size -= PAGE_SIZE;
 3194                 tmpva += PAGE_SIZE;
 3195                 ppa += PAGE_SIZE;
 3196         }
 3197 
 3198         return ((void *)(va + offset));
 3199 }
 3200 
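/*
 * Example of the offset handling above, assuming 4 KB pages and a
 * hypothetical device register block at pa = 0x3fe00032 with size = 8:
 * ppa = trunc_page(pa) = 0x3fe00000, offset = 0x32 and the mapped size
 * rounds up to a single page, which is entered at the allocated KVA.
 * The returned pointer is va + 0x32, so accesses land on the intended
 * registers.  Passing the same pointer and size to moea64_unmapdev()
 * below re-derives the page-aligned range and releases it.
 */
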
 3201 void *
 3202 moea64_mapdev(vm_paddr_t pa, vm_size_t size)
 3203 {
 3204 
 3205         return moea64_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT);
 3206 }
 3207 
 3208 void
 3209 moea64_unmapdev(void *p, vm_size_t size)
 3210 {
 3211         vm_offset_t base, offset, va;
 3212 
 3213         va = (vm_offset_t)p;
 3214         base = trunc_page(va);
 3215         offset = va & PAGE_MASK;
 3216         size = roundup2(offset + size, PAGE_SIZE);
 3217 
 3218         moea64_qremove(base, atop(size));
 3219         kva_free(base, size);
 3220 }
 3221 
 3222 void
 3223 moea64_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
 3224 {
 3225         struct pvo_entry *pvo;
 3226         vm_offset_t lim;
 3227         vm_paddr_t pa;
 3228         vm_size_t len;
 3229 
 3230         if (__predict_false(pm == NULL))
 3231                 pm = &curthread->td_proc->p_vmspace->vm_pmap;
 3232 
 3233         PMAP_LOCK(pm);
 3234         while (sz > 0) {
 3235                 lim = round_page(va+1);
 3236                 len = MIN(lim - va, sz);
 3237                 pvo = moea64_pvo_find_va(pm, va & ~ADDR_POFF);
 3238                 if (pvo != NULL && !(pvo->pvo_pte.pa & LPTE_I)) {
 3239                         pa = PVO_PADDR(pvo) | (va & ADDR_POFF);
 3240                         moea64_syncicache(pm, va, pa, len);
 3241                 }
 3242                 va += len;
 3243                 sz -= len;
 3244         }
 3245         PMAP_UNLOCK(pm);
 3246 }
 3247 
 3248 void
 3249 moea64_dumpsys_map(vm_paddr_t pa, size_t sz, void **va)
 3250 {
 3251 
 3252         *va = (void *)(uintptr_t)pa;
 3253 }
 3254 
 3255 extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1];
 3256 
 3257 void
 3258 moea64_scan_init(void)
 3259 {
 3260         struct pvo_entry *pvo;
 3261         vm_offset_t va;
 3262         int i;
 3263 
 3264         if (!do_minidump) {
 3265                 /* Initialize phys. segments for dumpsys(). */
 3266                 memset(&dump_map, 0, sizeof(dump_map));
 3267                 mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
 3268                 for (i = 0; i < pregions_sz; i++) {
 3269                         dump_map[i].pa_start = pregions[i].mr_start;
 3270                         dump_map[i].pa_size = pregions[i].mr_size;
 3271                 }
 3272                 return;
 3273         }
 3274 
 3275         /* Virtual segments for minidumps: */
 3276         memset(&dump_map, 0, sizeof(dump_map));
 3277 
 3278         /* 1st: kernel .data and .bss. */
 3279         dump_map[0].pa_start = trunc_page((uintptr_t)_etext);
 3280         dump_map[0].pa_size = round_page((uintptr_t)_end) -
 3281             dump_map[0].pa_start;
 3282 
 3283         /* 2nd: msgbuf and tables (see pmap_bootstrap()). */
 3284         dump_map[1].pa_start = (vm_paddr_t)(uintptr_t)msgbufp->msg_ptr;
 3285         dump_map[1].pa_size = round_page(msgbufp->msg_size);
 3286 
 3287         /* 3rd: kernel VM. */
 3288         va = dump_map[1].pa_start + dump_map[1].pa_size;
 3289         /* Find start of next chunk (from va). */
 3290         while (va < virtual_end) {
 3291                 /* Don't dump the buffer cache. */
 3292                 if (va >= kmi.buffer_sva && va < kmi.buffer_eva) {
 3293                         va = kmi.buffer_eva;
 3294                         continue;
 3295                 }
 3296                 pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF);
 3297                 if (pvo != NULL && !(pvo->pvo_vaddr & PVO_DEAD))
 3298                         break;
 3299                 va += PAGE_SIZE;
 3300         }
 3301         if (va < virtual_end) {
 3302                 dump_map[2].pa_start = va;
 3303                 va += PAGE_SIZE;
 3304                 /* Find last page in chunk. */
 3305                 while (va < virtual_end) {
 3306                         /* Don't run into the buffer cache. */
 3307                         if (va == kmi.buffer_sva)
 3308                                 break;
 3309                         pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF);
 3310                         if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD))
 3311                                 break;
 3312                         va += PAGE_SIZE;
 3313                 }
 3314                 dump_map[2].pa_size = va - dump_map[2].pa_start;
 3315         }
 3316 }
 3317 
 3318 #ifdef __powerpc64__
 3319 
 3320 static size_t
 3321 moea64_scan_pmap(struct bitset *dump_bitset)
 3322 {
 3323         struct pvo_entry *pvo;
 3324         vm_paddr_t pa, pa_end;
 3325         vm_offset_t va, pgva, kstart, kend, kstart_lp, kend_lp;
 3326         uint64_t lpsize;
 3327 
 3328         lpsize = moea64_large_page_size;
 3329         kstart = trunc_page((vm_offset_t)_etext);
 3330         kend = round_page((vm_offset_t)_end);
 3331         kstart_lp = kstart & ~moea64_large_page_mask;
 3332         kend_lp = (kend + moea64_large_page_mask) & ~moea64_large_page_mask;
 3333 
 3334         CTR4(KTR_PMAP, "moea64_scan_pmap: kstart=0x%016lx, kend=0x%016lx, "
 3335             "kstart_lp=0x%016lx, kend_lp=0x%016lx",
 3336             kstart, kend, kstart_lp, kend_lp);
 3337 
 3338         PMAP_LOCK(kernel_pmap);
 3339         RB_FOREACH(pvo, pvo_tree, &kernel_pmap->pmap_pvo) {
 3340                 va = pvo->pvo_vaddr;
 3341 
 3342                 if (va & PVO_DEAD)
 3343                         continue;
 3344 
 3345                 /* Skip DMAP (except kernel area) */
 3346                 if (va >= DMAP_BASE_ADDRESS && va <= DMAP_MAX_ADDRESS) {
 3347                         if (va & PVO_LARGE) {
 3348                                 pgva = va & ~moea64_large_page_mask;
 3349                                 if (pgva < kstart_lp || pgva >= kend_lp)
 3350                                         continue;
 3351                         } else {
 3352                                 pgva = trunc_page(va);
 3353                                 if (pgva < kstart || pgva >= kend)
 3354                                         continue;
 3355                         }
 3356                 }
 3357 
 3358                 pa = PVO_PADDR(pvo);
 3359 
 3360                 if (va & PVO_LARGE) {
 3361                         pa_end = pa + lpsize;
 3362                         for (; pa < pa_end; pa += PAGE_SIZE) {
 3363                                 if (vm_phys_is_dumpable(pa))
 3364                                         vm_page_dump_add(dump_bitset, pa);
 3365                         }
 3366                 } else {
 3367                         if (vm_phys_is_dumpable(pa))
 3368                                 vm_page_dump_add(dump_bitset, pa);
 3369                 }
 3370         }
 3371         PMAP_UNLOCK(kernel_pmap);
 3372 
 3373         return (sizeof(struct lpte) * moea64_pteg_count * 8);
 3374 }
 3375 
 3376 static struct dump_context dump_ctx;
 3377 
 3378 static void *
 3379 moea64_dump_pmap_init(unsigned blkpgs)
 3380 {
 3381         dump_ctx.ptex = 0;
 3382         dump_ctx.ptex_end = moea64_pteg_count * 8;
 3383         dump_ctx.blksz = blkpgs * PAGE_SIZE;
 3384         return (&dump_ctx);
 3385 }
 3386 
 3387 #else
 3388 
 3389 static size_t
 3390 moea64_scan_pmap(struct bitset *dump_bitset __unused)
 3391 {
 3392         return (0);
 3393 }
 3394 
 3395 static void *
 3396 moea64_dump_pmap_init(unsigned blkpgs __unused)
 3397 {
 3398         return (NULL);
 3399 }
 3400 
 3401 #endif
 3402 
 3403 #ifdef __powerpc64__
 3404 static void
 3405 moea64_map_range(vm_offset_t va, vm_paddr_t pa, vm_size_t npages)
 3406 {
 3407 
 3408         for (; npages > 0; --npages) {
 3409                 if (moea64_large_page_size != 0 &&
 3410                     (pa & moea64_large_page_mask) == 0 &&
 3411                     (va & moea64_large_page_mask) == 0 &&
 3412                     npages >= (moea64_large_page_size >> PAGE_SHIFT)) {
 3413                         PMAP_LOCK(kernel_pmap);
 3414                         moea64_kenter_large(va, pa, 0, 0);
 3415                         PMAP_UNLOCK(kernel_pmap);
 3416                         pa += moea64_large_page_size;
 3417                         va += moea64_large_page_size;
 3418                         npages -= (moea64_large_page_size >> PAGE_SHIFT) - 1;
 3419                 } else {
 3420                         moea64_kenter(va, pa);
 3421                         pa += PAGE_SIZE;
 3422                         va += PAGE_SIZE;
 3423                 }
 3424         }
 3425 }
 3426 
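/*
 * Worked example of the loop above, assuming 4 KB base pages and 16 MB
 * large pages (moea64_large_page_size = 0x1000000, i.e. 4096 base pages
 * per large page): for va and pa both 16 MB-aligned and npages = 5000,
 * the first iteration enters one large mapping and consumes 4096 pages
 * (npages -= 4095 in the body plus one more in the loop update); the
 * remaining 904 pages fail the npages test and are entered as
 * individual 4 KB mappings.
 */
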
 3427 static void
 3428 moea64_page_array_startup(long pages)
 3429 {
 3430         long dom_pages[MAXMEMDOM];
 3431         vm_paddr_t pa;
 3432         vm_offset_t va, vm_page_base;
 3433         vm_size_t needed, size;
 3434         int domain;
 3435         int i;
 3436 
 3437         vm_page_base = 0xd000000000000000ULL;
 3438 
 3439         /* Short-circuit single-domain systems. */
 3440         if (vm_ndomains == 1) {
 3441                 size = round_page(pages * sizeof(struct vm_page));
 3442                 pa = vm_phys_early_alloc(0, size);
 3443                 vm_page_base = moea64_map(&vm_page_base,
 3444                     pa, pa + size, VM_PROT_READ | VM_PROT_WRITE);
 3445                 vm_page_array_size = pages;
 3446                 vm_page_array = (vm_page_t)vm_page_base;
 3447                 return;
 3448         }
 3449 
 3450         for (i = 0; i < MAXMEMDOM; i++)
 3451                 dom_pages[i] = 0;
 3452 
 3453         /* Now get the number of pages required per domain. */
 3454         for (i = 0; i < vm_phys_nsegs; i++) {
 3455                 domain = vm_phys_segs[i].domain;
 3456                 KASSERT(domain < MAXMEMDOM,
 3457                     ("Invalid vm_phys_segs NUMA domain %d!\n", domain));
 3458                 /* Get size of vm_page_array needed for this segment. */
 3459                 size = btoc(vm_phys_segs[i].end - vm_phys_segs[i].start);
 3460                 dom_pages[domain] += size;
 3461         }
 3462 
 3463         for (i = 0; phys_avail[i + 1] != 0; i += 2) {
 3464                 domain = vm_phys_domain(phys_avail[i]);
 3465                 KASSERT(domain < MAXMEMDOM,
 3466                     ("Invalid phys_avail NUMA domain %d!\n", domain));
 3467                 size = btoc(phys_avail[i + 1] - phys_avail[i]);
 3468                 dom_pages[domain] += size;
 3469         }
 3470 
 3471         /*
 3472          * Map in chunks that can get us all 16MB pages.  There will be some
 3473          * overlap between domains, but that's acceptable for now.
 3474          */
 3475         vm_page_array_size = 0;
 3476         va = vm_page_base;
 3477         for (i = 0; i < MAXMEMDOM && vm_page_array_size < pages; i++) {
 3478                 if (dom_pages[i] == 0)
 3479                         continue;
 3480                 size = ulmin(pages - vm_page_array_size, dom_pages[i]);
 3481                 size = round_page(size * sizeof(struct vm_page));
 3482                 needed = size;
 3483                 size = roundup2(size, moea64_large_page_size);
 3484                 pa = vm_phys_early_alloc(i, size);
 3485                 vm_page_array_size += size / sizeof(struct vm_page);
 3486                 moea64_map_range(va, pa, size >> PAGE_SHIFT);
 3487                 /* Scoot up domain 0, to reduce the domain page overlap. */
 3488                 if (i == 0)
 3489                         vm_page_base += size - needed;
 3490                 va += size;
 3491         }
 3492         vm_page_array = (vm_page_t)vm_page_base;
 3493         vm_page_array_size = pages;
 3494 }
 3495 #endif
 3496 
 3497 static int64_t
 3498 moea64_null_method(void)
 3499 {
 3500         return (0);
 3501 }
 3502 
 3503 static int64_t moea64_pte_replace_default(struct pvo_entry *pvo, int flags)
 3504 {
 3505         int64_t refchg;
 3506 
 3507         refchg = moea64_pte_unset(pvo);
 3508         moea64_pte_insert(pvo);
 3509 
 3510         return (refchg);
 3511 }
 3512 
 3513 struct moea64_funcs *moea64_ops;
 3514 
 3515 #define DEFINE_OEA64_IFUNC(ret, func, args, def)                \
 3516         DEFINE_IFUNC(, ret, moea64_##func, args) {              \
 3517                 moea64_##func##_t f;                            \
 3518                 if (moea64_ops == NULL)                         \
 3519                         return ((moea64_##func##_t)def);        \
 3520                 f = moea64_ops->func;                           \
 3521                 return (f != NULL ? f : (moea64_##func##_t)def);\
 3522         }
 3523 
 3524 void
 3525 moea64_install(void)
 3526 {
 3527 #ifdef __powerpc64__
 3528         if (hw_direct_map == -1) {
 3529                 moea64_probe_large_page();
 3530 
 3531                 /* Use a direct map if we have large page support */
 3532                 if (moea64_large_page_size > 0)
 3533                         hw_direct_map = 1;
 3534                 else
 3535                         hw_direct_map = 0;
 3536         }
 3537 #endif
 3538 
 3539         /*
 3540          * Default to non-DMAP, and switch over to DMAP functions once we know
 3541          * we have DMAP.
 3542          */
 3543         if (hw_direct_map) {
 3544                 moea64_methods.quick_enter_page = moea64_quick_enter_page_dmap;
 3545                 moea64_methods.quick_remove_page = NULL;
 3546                 moea64_methods.copy_page = moea64_copy_page_dmap;
 3547                 moea64_methods.zero_page = moea64_zero_page_dmap;
 3548                 moea64_methods.copy_pages = moea64_copy_pages_dmap;
 3549         }
 3550 }
 3551 
 3552 DEFINE_OEA64_IFUNC(int64_t, pte_replace, (struct pvo_entry *, int),
 3553     moea64_pte_replace_default)
 3554 DEFINE_OEA64_IFUNC(int64_t, pte_insert, (struct pvo_entry *), moea64_null_method)
 3555 DEFINE_OEA64_IFUNC(int64_t, pte_unset, (struct pvo_entry *), moea64_null_method)
 3556 DEFINE_OEA64_IFUNC(int64_t, pte_clear, (struct pvo_entry *, uint64_t),
 3557     moea64_null_method)
 3558 DEFINE_OEA64_IFUNC(int64_t, pte_synch, (struct pvo_entry *), moea64_null_method)
 3559 DEFINE_OEA64_IFUNC(int64_t, pte_insert_sp, (struct pvo_entry *), moea64_null_method)
 3560 DEFINE_OEA64_IFUNC(int64_t, pte_unset_sp, (struct pvo_entry *), moea64_null_method)
 3561 DEFINE_OEA64_IFUNC(int64_t, pte_replace_sp, (struct pvo_entry *), moea64_null_method)
 3562 
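/*
 * Sketch of how a page-table backend is expected to hook into the ifunc
 * resolvers above.  The backend_* names are placeholders, not taken from
 * any particular implementation:
 *
 *      static struct moea64_funcs backend_funcs = {
 *              .pte_insert = backend_pte_insert,
 *              .pte_unset  = backend_pte_unset,
 *              .pte_clear  = backend_pte_clear,
 *              .pte_synch  = backend_pte_synch,
 *      };
 *
 *      moea64_ops = &backend_funcs;
 *
 * The resolvers read moea64_ops when they run; any member left NULL (and
 * any function with no backend at all) falls back to the default named in
 * the corresponding DEFINE_OEA64_IFUNC() line: moea64_pte_replace_default
 * for pte_replace and moea64_null_method for the rest.
 */
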
 3563 /* Superpage functions */
 3564 
 3565 /* MMU interface */
 3566 
 3567 static bool
 3568 moea64_ps_enabled(pmap_t pmap)
 3569 {
 3570         return (superpages_enabled);
 3571 }
 3572 
 3573 static void
 3574 moea64_align_superpage(vm_object_t object, vm_ooffset_t offset,
 3575     vm_offset_t *addr, vm_size_t size)
 3576 {
 3577         vm_offset_t sp_offset;
 3578 
 3579         if (size < HPT_SP_SIZE)
 3580                 return;
 3581 
 3582         CTR4(KTR_PMAP, "%s: offs=%#jx, addr=%p, size=%#jx",
 3583             __func__, (uintmax_t)offset, addr, (uintmax_t)size);
 3584 
 3585         if (object != NULL && (object->flags & OBJ_COLORED) != 0)
 3586                 offset += ptoa(object->pg_color);
 3587         sp_offset = offset & HPT_SP_MASK;
 3588         if (size - ((HPT_SP_SIZE - sp_offset) & HPT_SP_MASK) < HPT_SP_SIZE ||
 3589             (*addr & HPT_SP_MASK) == sp_offset)
 3590                 return;
 3591         if ((*addr & HPT_SP_MASK) < sp_offset)
 3592                 *addr = (*addr & ~HPT_SP_MASK) + sp_offset;
 3593         else
 3594                 *addr = ((*addr + HPT_SP_MASK) & ~HPT_SP_MASK) + sp_offset;
 3595 }
 3596 
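/*
 * Worked example of the adjustment above, with made-up numbers and
 * assuming 16 MB superpages (HPT_SP_SIZE = 0x1000000): for offset =
 * 0x1234000, size = 64 MB and a proposed *addr = 0x30000000,
 * sp_offset = 0x234000 and *addr is moved up to 0x30234000, so the
 * virtual address and the object offset now agree modulo the superpage
 * size and the mapping is eligible for 16 MB pages.
 */
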
 3597 /* Helpers */
 3598 
 3599 static __inline void
 3600 moea64_pvo_cleanup(struct pvo_dlist *tofree)
 3601 {
 3602         struct pvo_entry *pvo;
 3603 
 3604         /* clean up */
 3605         while (!SLIST_EMPTY(tofree)) {
 3606                 pvo = SLIST_FIRST(tofree);
 3607                 SLIST_REMOVE_HEAD(tofree, pvo_dlink);
 3608                 if (pvo->pvo_vaddr & PVO_DEAD)
 3609                         moea64_pvo_remove_from_page(pvo);
 3610                 free_pvo_entry(pvo);
 3611         }
 3612 }
 3613 
 3614 static __inline uint16_t
 3615 pvo_to_vmpage_flags(struct pvo_entry *pvo)
 3616 {
 3617         uint16_t flags;
 3618 
 3619         flags = 0;
 3620         if ((pvo->pvo_pte.prot & VM_PROT_WRITE) != 0)
 3621                 flags |= PGA_WRITEABLE;
 3622         if ((pvo->pvo_pte.prot & VM_PROT_EXECUTE) != 0)
 3623                 flags |= PGA_EXECUTABLE;
 3624 
 3625         return (flags);
 3626 }
 3627 
 3628 /*
 3629  * Check if the given pvo and its superpage are in the sva-eva range.
 3630  */
 3631 static __inline bool
 3632 moea64_sp_pvo_in_range(struct pvo_entry *pvo, vm_offset_t sva, vm_offset_t eva)
 3633 {
 3634         vm_offset_t spva;
 3635 
 3636         spva = PVO_VADDR(pvo) & ~HPT_SP_MASK;
 3637         if (spva >= sva && spva + HPT_SP_SIZE <= eva) {
 3638                 /*
 3639                  * Because this function is intended to be called from loops
 3640                  * that iterate over ordered pvo entries, if the condition
 3641                  * above is true then the pvo must be the first of its
 3642                  * superpage.
 3643                  */
 3644                 KASSERT(PVO_VADDR(pvo) == spva,
 3645                     ("%s: unexpected unaligned superpage pvo", __func__));
 3646                 return (true);
 3647         }
 3648         return (false);
 3649 }
 3650 
 3651 /*
 3652  * Update vm about the REF/CHG bits if the superpage is managed and
 3653  * has (or had) write access.
 3654  */
 3655 static void
 3656 moea64_sp_refchg_process(struct pvo_entry *sp, vm_page_t m,
 3657     int64_t sp_refchg, vm_prot_t prot)
 3658 {
 3659         vm_page_t m_end;
 3660         int64_t refchg;
 3661 
 3662         if ((sp->pvo_vaddr & PVO_MANAGED) != 0 && (prot & VM_PROT_WRITE) != 0) {
 3663                 for (m_end = &m[HPT_SP_PAGES]; m < m_end; m++) {
 3664                         refchg = sp_refchg |
 3665                             atomic_readandclear_32(&m->md.mdpg_attrs);
 3666                         if (refchg & LPTE_CHG)
 3667                                 vm_page_dirty(m);
 3668                         if (refchg & LPTE_REF)
 3669                                 vm_page_aflag_set(m, PGA_REFERENCED);
 3670                 }
 3671         }
 3672 }
 3673 
 3674 /* Superpage ops */
 3675 
 3676 static int
 3677 moea64_sp_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
 3678     vm_prot_t prot, u_int flags, int8_t psind)
 3679 {
 3680         struct pvo_entry *pvo, **pvos;
 3681         struct pvo_head *pvo_head;
 3682         vm_offset_t sva;
 3683         vm_page_t sm;
 3684         vm_paddr_t pa, spa;
 3685         bool sync;
 3686         struct pvo_dlist tofree;
 3687         int error __diagused, i;
 3688         uint16_t aflags;
 3689 
 3690         KASSERT((va & HPT_SP_MASK) == 0, ("%s: va %#jx unaligned",
 3691             __func__, (uintmax_t)va));
 3692         KASSERT(psind == 1, ("%s: invalid psind: %d", __func__, psind));
 3693         KASSERT(m->psind == 1, ("%s: invalid m->psind: %d",
 3694             __func__, m->psind));
 3695         KASSERT(pmap != kernel_pmap,
 3696             ("%s: function called with kernel pmap", __func__));
 3697 
 3698         CTR5(KTR_PMAP, "%s: va=%#jx, pa=%#jx, prot=%#x, flags=%#x, psind=1",
 3699             __func__, (uintmax_t)va, (uintmax_t)VM_PAGE_TO_PHYS(m),
 3700             prot, flags);
 3701 
 3702         SLIST_INIT(&tofree);
 3703 
 3704         sva = va;
 3705         sm = m;
 3706         spa = pa = VM_PAGE_TO_PHYS(sm);
 3707 
 3708         /* Try to allocate all PVOs first, to make failure handling easier. */
 3709         pvos = malloc(HPT_SP_PAGES * sizeof(struct pvo_entry *), M_TEMP,
 3710             M_NOWAIT);
 3711         if (pvos == NULL) {
 3712                 CTR1(KTR_PMAP, "%s: failed to alloc pvo array", __func__);
 3713                 return (KERN_RESOURCE_SHORTAGE);
 3714         }
 3715 
 3716         for (i = 0; i < HPT_SP_PAGES; i++) {
 3717                 pvos[i] = alloc_pvo_entry(0);
 3718                 if (pvos[i] == NULL) {
 3719                         CTR1(KTR_PMAP, "%s: failed to alloc pvo", __func__);
 3720                         for (i = i - 1; i >= 0; i--)
 3721                                 free_pvo_entry(pvos[i]);
 3722                         free(pvos, M_TEMP);
 3723                         return (KERN_RESOURCE_SHORTAGE);
 3724                 }
 3725         }
 3726 
 3727         SP_PV_LOCK_ALIGNED(spa);
 3728         PMAP_LOCK(pmap);
 3729 
 3730         /* Note: moea64_remove_locked() also clears cached REF/CHG bits. */
 3731         moea64_remove_locked(pmap, va, va + HPT_SP_SIZE, &tofree);
 3732 
 3733         /* Enter pages */
 3734         for (i = 0; i < HPT_SP_PAGES;
 3735             i++, va += PAGE_SIZE, pa += PAGE_SIZE, m++) {
 3736                 pvo = pvos[i];
 3737 
 3738                 pvo->pvo_pte.prot = prot;
 3739                 pvo->pvo_pte.pa = (pa & ~HPT_SP_MASK) | LPTE_LP_4K_16M |
 3740                     moea64_calc_wimg(pa, pmap_page_get_memattr(m));
 3741 
 3742                 if ((flags & PMAP_ENTER_WIRED) != 0)
 3743                         pvo->pvo_vaddr |= PVO_WIRED;
 3744                 pvo->pvo_vaddr |= PVO_LARGE;
 3745 
 3746                 if ((m->oflags & VPO_UNMANAGED) != 0)
 3747                         pvo_head = NULL;
 3748                 else {
 3749                         pvo_head = &m->md.mdpg_pvoh;
 3750                         pvo->pvo_vaddr |= PVO_MANAGED;
 3751                 }
 3752 
 3753                 init_pvo_entry(pvo, pmap, va);
 3754 
 3755                 error = moea64_pvo_enter(pvo, pvo_head, NULL);
 3756                 /*
 3757                  * All superpage PVOs were previously removed, so no errors
 3758                  * should occur while inserting the new ones.
 3759                  */
 3760                 KASSERT(error == 0, ("%s: unexpected error "
 3761                             "when inserting superpage PVO: %d",
 3762                             __func__, error));
 3763         }
 3764 
 3765         PMAP_UNLOCK(pmap);
 3766         SP_PV_UNLOCK_ALIGNED(spa);
 3767 
 3768         sync = (sm->a.flags & PGA_EXECUTABLE) == 0;
 3769         /* Note: moea64_pvo_cleanup() also clears page prot. flags. */
 3770         moea64_pvo_cleanup(&tofree);
 3771         pvo = pvos[0];
 3772 
 3773         /* Set vm page flags */
 3774         aflags = pvo_to_vmpage_flags(pvo);
 3775         if (aflags != 0)
 3776                 for (m = sm; m < &sm[HPT_SP_PAGES]; m++)
 3777                         vm_page_aflag_set(m, aflags);
 3778 
 3779         /*
 3780          * Flush the page from the instruction cache if this page is
 3781          * mapped executable and cacheable.
 3782          */
 3783         if (sync && (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0)
 3784                 moea64_syncicache(pmap, sva, spa, HPT_SP_SIZE);
 3785 
 3786         atomic_add_long(&sp_mappings, 1);
 3787         CTR3(KTR_PMAP, "%s: SP success for va %#jx in pmap %p",
 3788             __func__, (uintmax_t)sva, pmap);
 3789 
 3790         free(pvos, M_TEMP);
 3791         return (KERN_SUCCESS);
 3792 }
 3793 
 3794 static void
 3795 moea64_sp_promote(pmap_t pmap, vm_offset_t va, vm_page_t m)
 3796 {
 3797         struct pvo_entry *first, *pvo;
 3798         vm_paddr_t pa, pa_end;
 3799         vm_offset_t sva, va_end;
 3800         int64_t sp_refchg;
 3801 
 3802         /* This CTR may generate a lot of output. */
 3803         /* CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)va); */
 3804 
 3805         va &= ~HPT_SP_MASK;
 3806         sva = va;
 3807         /* Get superpage */
 3808         pa = VM_PAGE_TO_PHYS(m) & ~HPT_SP_MASK;
 3809         m = PHYS_TO_VM_PAGE(pa);
 3810 
 3811         PMAP_LOCK(pmap);
 3812 
 3813         /*
 3814          * Check if all pages meet promotion criteria.
 3815          *
 3816          * XXX In some cases the loop below may be executed for each or most
 3817          * of the entered pages of a superpage, which can be expensive
 3818          * (although it has not been profiled) and may need some optimization.
 3819          *
 3820          * Some cases where this seems to happen are:
 3821          * - When a superpage is first entered read-only and later becomes
 3822          *   read-write.
 3823          * - When some of the superpage's virtual addresses map to previously
 3824          *   wired/cached pages while others map to pages allocated from a
 3825          *   different physical address range. A common scenario where this
 3826          *   happens is when mmap'ing a file that is already present in the FS
 3827          *   block cache and doesn't fill a superpage.
 3828          */
 3829         first = pvo = moea64_pvo_find_va(pmap, sva);
 3830         for (pa_end = pa + HPT_SP_SIZE;
 3831             pa < pa_end; pa += PAGE_SIZE, va += PAGE_SIZE) {
 3832                 if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD) != 0) {
 3833                         CTR3(KTR_PMAP,
 3834                             "%s: NULL or dead PVO: pmap=%p, va=%#jx",
 3835                             __func__, pmap, (uintmax_t)va);
 3836                         goto error;
 3837                 }
 3838                 if (PVO_PADDR(pvo) != pa) {
 3839                         CTR5(KTR_PMAP, "%s: PAs don't match: "
 3840                             "pmap=%p, va=%#jx, pvo_pa=%#jx, exp_pa=%#jx",
 3841                             __func__, pmap, (uintmax_t)va,
 3842                             (uintmax_t)PVO_PADDR(pvo), (uintmax_t)pa);
 3843                         atomic_add_long(&sp_p_fail_pa, 1);
 3844                         goto error;
 3845                 }
 3846                 if ((first->pvo_vaddr & PVO_FLAGS_PROMOTE) !=
 3847                     (pvo->pvo_vaddr & PVO_FLAGS_PROMOTE)) {
 3848                         CTR5(KTR_PMAP, "%s: PVO flags don't match: "
 3849                             "pmap=%p, va=%#jx, pvo_flags=%#jx, exp_flags=%#jx",
 3850                             __func__, pmap, (uintmax_t)va,
 3851                             (uintmax_t)(pvo->pvo_vaddr & PVO_FLAGS_PROMOTE),
 3852                             (uintmax_t)(first->pvo_vaddr & PVO_FLAGS_PROMOTE));
 3853                         atomic_add_long(&sp_p_fail_flags, 1);
 3854                         goto error;
 3855                 }
 3856                 if (first->pvo_pte.prot != pvo->pvo_pte.prot) {
 3857                         CTR5(KTR_PMAP, "%s: PVO protections don't match: "
 3858                             "pmap=%p, va=%#jx, pvo_prot=%#x, exp_prot=%#x",
 3859                             __func__, pmap, (uintmax_t)va,
 3860                             pvo->pvo_pte.prot, first->pvo_pte.prot);
 3861                         atomic_add_long(&sp_p_fail_prot, 1);
 3862                         goto error;
 3863                 }
 3864                 if ((first->pvo_pte.pa & LPTE_WIMG) !=
 3865                     (pvo->pvo_pte.pa & LPTE_WIMG)) {
 3866                         CTR5(KTR_PMAP, "%s: WIMG bits don't match: "
 3867                             "pmap=%p, va=%#jx, pvo_wimg=%#jx, exp_wimg=%#jx",
 3868                             __func__, pmap, (uintmax_t)va,
 3869                             (uintmax_t)(pvo->pvo_pte.pa & LPTE_WIMG),
 3870                             (uintmax_t)(first->pvo_pte.pa & LPTE_WIMG));
 3871                         atomic_add_long(&sp_p_fail_wimg, 1);
 3872                         goto error;
 3873                 }
 3874 
 3875                 pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo);
 3876         }
 3877 
 3878         /* All OK, promote. */
 3879 
 3880         /*
 3881          * Handle superpage REF/CHG bits. If REF or CHG is set in
 3882          * any page, then it must be set in the superpage.
 3883          *
 3884          * Instead of querying each page, we take advantage of two facts:
 3885          * 1- If a page is being promoted, it was referenced.
 3886          * 2- If promoted pages are writable, they were modified.
 3887          */
 3888         sp_refchg = LPTE_REF |
 3889             ((first->pvo_pte.prot & VM_PROT_WRITE) != 0 ? LPTE_CHG : 0);
 3890 
 3891         /* Promote pages */
 3892 
 3893         for (pvo = first, va_end = PVO_VADDR(pvo) + HPT_SP_SIZE;
 3894             pvo != NULL && PVO_VADDR(pvo) < va_end;
 3895             pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) {
 3896                 pvo->pvo_pte.pa &= ADDR_POFF | ~HPT_SP_MASK;
 3897                 pvo->pvo_pte.pa |= LPTE_LP_4K_16M;
 3898                 pvo->pvo_vaddr |= PVO_LARGE;
 3899         }
 3900         moea64_pte_replace_sp(first);
 3901 
 3902         /* Send REF/CHG bits to VM */
 3903         moea64_sp_refchg_process(first, m, sp_refchg, first->pvo_pte.prot);
 3904 
 3905         /* Use first page to cache REF/CHG bits */
 3906         atomic_set_32(&m->md.mdpg_attrs, sp_refchg | MDPG_ATTR_SP);
 3907 
 3908         PMAP_UNLOCK(pmap);
 3909 
 3910         atomic_add_long(&sp_mappings, 1);
 3911         atomic_add_long(&sp_promotions, 1);
 3912         CTR3(KTR_PMAP, "%s: success for va %#jx in pmap %p",
 3913             __func__, (uintmax_t)sva, pmap);
 3914         return;
 3915 
 3916 error:
 3917         atomic_add_long(&sp_p_failures, 1);
 3918         PMAP_UNLOCK(pmap);
 3919 }
 3920 
 3921 static void
 3922 moea64_sp_demote_aligned(struct pvo_entry *sp)
 3923 {
 3924         struct pvo_entry *pvo;
 3925         vm_offset_t va, va_end;
 3926         vm_paddr_t pa;
 3927         vm_page_t m;
 3928         pmap_t pmap __diagused;
 3929         int64_t refchg;
 3930 
 3931         CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp));
 3932 
 3933         pmap = sp->pvo_pmap;
 3934         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 3935 
 3936         pvo = sp;
 3937 
 3938         /* Demote pages */
 3939 
 3940         va = PVO_VADDR(pvo);
 3941         pa = PVO_PADDR(pvo);
 3942         m = PHYS_TO_VM_PAGE(pa);
 3943 
 3944         for (pvo = sp, va_end = va + HPT_SP_SIZE;
 3945             pvo != NULL && PVO_VADDR(pvo) < va_end;
 3946             pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo),
 3947             va += PAGE_SIZE, pa += PAGE_SIZE) {
 3948                 KASSERT(pvo && PVO_VADDR(pvo) == va,
 3949                     ("%s: missing PVO for va %#jx", __func__, (uintmax_t)va));
 3950 
 3951                 pvo->pvo_vaddr &= ~PVO_LARGE;
 3952                 pvo->pvo_pte.pa &= ~LPTE_RPGN;
 3953                 pvo->pvo_pte.pa |= pa;
 3954 
 3955         }
 3956         refchg = moea64_pte_replace_sp(sp);
 3957 
 3958         /*
 3959          * Clear SP flag
 3960          *
 3961          * XXX It is possible that another pmap has this page mapped as
 3962          *     part of a superpage, but since the SP flag is used only for
 3963          *     caching SP REF/CHG bits, which will be queried if not set
 3964          *     in the cache, it should be OK to clear it here.
 3965          */
 3966         atomic_clear_32(&m->md.mdpg_attrs, MDPG_ATTR_SP);
 3967 
 3968         /*
 3969          * Handle superpage REF/CHG bits. A bit set in the superpage
 3970          * means all pages should consider it set.
 3971          */
 3972         moea64_sp_refchg_process(sp, m, refchg, sp->pvo_pte.prot);
 3973 
 3974         atomic_add_long(&sp_demotions, 1);
 3975         CTR3(KTR_PMAP, "%s: success for va %#jx in pmap %p",
 3976             __func__, (uintmax_t)PVO_VADDR(sp), pmap);
 3977 }
 3978 
 3979 static void
 3980 moea64_sp_demote(struct pvo_entry *pvo)
 3981 {
 3982         PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
 3983 
 3984         if ((PVO_VADDR(pvo) & HPT_SP_MASK) != 0) {
 3985                 pvo = moea64_pvo_find_va(pvo->pvo_pmap,
 3986                     PVO_VADDR(pvo) & ~HPT_SP_MASK);
 3987                 KASSERT(pvo != NULL,
 3988                     ("%s: missing first PVO of superpage", __func__));
 3989         }
 3990         moea64_sp_demote_aligned(pvo);
 3991 }
 3992 
 3993 static struct pvo_entry *
 3994 moea64_sp_unwire(struct pvo_entry *sp)
 3995 {
 3996         struct pvo_entry *pvo, *prev;
 3997         vm_offset_t eva;
 3998         pmap_t pm;
 3999         int64_t ret, refchg;
 4000 
 4001         CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp));
 4002 
 4003         pm = sp->pvo_pmap;
 4004         PMAP_LOCK_ASSERT(pm, MA_OWNED);
 4005 
 4006         eva = PVO_VADDR(sp) + HPT_SP_SIZE;
 4007         refchg = 0;
 4008         for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva;
 4009             prev = pvo, pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
 4010                 if ((pvo->pvo_vaddr & PVO_WIRED) == 0)
 4011                         panic("%s: pvo %p is missing PVO_WIRED",
 4012                             __func__, pvo);
 4013                 pvo->pvo_vaddr &= ~PVO_WIRED;
 4014 
 4015                 ret = moea64_pte_replace(pvo, 0 /* No invalidation */);
 4016                 if (ret < 0)
 4017                         refchg |= LPTE_CHG;
 4018                 else
 4019                         refchg |= ret;
 4020 
 4021                 pm->pm_stats.wired_count--;
 4022         }
 4023 
 4024         /* Send REF/CHG bits to VM */
 4025         moea64_sp_refchg_process(sp, PHYS_TO_VM_PAGE(PVO_PADDR(sp)),
 4026             refchg, sp->pvo_pte.prot);
 4027 
 4028         return (prev);
 4029 }
 4030 
 4031 static struct pvo_entry *
 4032 moea64_sp_protect(struct pvo_entry *sp, vm_prot_t prot)
 4033 {
 4034         struct pvo_entry *pvo, *prev;
 4035         vm_offset_t eva;
 4036         pmap_t pm;
 4037         vm_page_t m, m_end;
 4038         int64_t ret, refchg;
 4039         vm_prot_t oldprot;
 4040 
 4041         CTR3(KTR_PMAP, "%s: va=%#jx, prot=%x",
 4042             __func__, (uintmax_t)PVO_VADDR(sp), prot);
 4043 
 4044         pm = sp->pvo_pmap;
 4045         PMAP_LOCK_ASSERT(pm, MA_OWNED);
 4046 
 4047         oldprot = sp->pvo_pte.prot;
 4048         m = PHYS_TO_VM_PAGE(PVO_PADDR(sp));
 4049         KASSERT(m != NULL, ("%s: missing vm page for pa %#jx",
 4050             __func__, (uintmax_t)PVO_PADDR(sp)));
 4051         eva = PVO_VADDR(sp) + HPT_SP_SIZE;
 4052         refchg = 0;
 4053 
 4054         for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva;
 4055             prev = pvo, pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
 4056                 pvo->pvo_pte.prot = prot;
 4057                 /*
 4058                  * If the PVO is in the page table, update mapping
 4059                  */
 4060                 ret = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE);
 4061                 if (ret < 0)
 4062                         refchg |= LPTE_CHG;
 4063                 else
 4064                         refchg |= ret;
 4065         }
 4066 
 4067         /* Send REF/CHG bits to VM */
 4068         moea64_sp_refchg_process(sp, m, refchg, oldprot);
 4069 
 4070         /* Handle pages that became executable */
 4071         if ((m->a.flags & PGA_EXECUTABLE) == 0 &&
 4072             (sp->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) {
 4073                 if ((m->oflags & VPO_UNMANAGED) == 0)
 4074                         for (m_end = &m[HPT_SP_PAGES]; m < m_end; m++)
 4075                                 vm_page_aflag_set(m, PGA_EXECUTABLE);
 4076                 moea64_syncicache(pm, PVO_VADDR(sp), PVO_PADDR(sp),
 4077                     HPT_SP_SIZE);
 4078         }
 4079 
 4080         return (prev);
 4081 }
 4082 
 4083 static struct pvo_entry *
 4084 moea64_sp_remove(struct pvo_entry *sp, struct pvo_dlist *tofree)
 4085 {
 4086         struct pvo_entry *pvo, *tpvo;
 4087         vm_offset_t eva;
 4088         pmap_t pm __diagused;
 4089 
 4090         CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp));
 4091 
 4092         pm = sp->pvo_pmap;
 4093         PMAP_LOCK_ASSERT(pm, MA_OWNED);
 4094 
 4095         eva = PVO_VADDR(sp) + HPT_SP_SIZE;
 4096         for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
 4097                 tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
 4098 
 4099                 /*
 4100                  * For locking reasons, remove this from the page table and
 4101                  * pmap, but save delinking from the vm_page for a second
 4102                  * pass
 4103                  */
 4104                 moea64_pvo_remove_from_pmap(pvo);
 4105                 SLIST_INSERT_HEAD(tofree, pvo, pvo_dlink);
 4106         }
 4107 
 4108         /*
 4109          * Clear SP bit
 4110          *
 4111          * XXX See comment in moea64_sp_demote_aligned() for why it's
 4112          *     ok to always clear the SP bit on remove/demote.
 4113          */
 4114         atomic_clear_32(&PHYS_TO_VM_PAGE(PVO_PADDR(sp))->md.mdpg_attrs,
 4115             MDPG_ATTR_SP);
 4116 
 4117         return (tpvo);
 4118 }
 4119 
 4120 static int64_t
 4121 moea64_sp_query_locked(struct pvo_entry *pvo, uint64_t ptebit)
 4122 {
 4123         int64_t refchg, ret;
 4124         vm_offset_t eva;
 4125         vm_page_t m;
 4126         pmap_t pmap;
 4127         struct pvo_entry *sp;
 4128 
 4129         pmap = pvo->pvo_pmap;
 4130         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 4131 
 4132         /* Get first SP PVO */
 4133         if ((PVO_VADDR(pvo) & HPT_SP_MASK) != 0) {
 4134                 sp = moea64_pvo_find_va(pmap, PVO_VADDR(pvo) & ~HPT_SP_MASK);
 4135                 KASSERT(sp != NULL, ("%s: missing PVO for va %#jx",
 4136                      __func__, (uintmax_t)(PVO_VADDR(pvo) & ~HPT_SP_MASK)));
 4137         } else
 4138                 sp = pvo;
 4139         eva = PVO_VADDR(sp) + HPT_SP_SIZE;
 4140 
 4141         refchg = 0;
 4142         for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva;
 4143             pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) {
 4144                 ret = moea64_pte_synch(pvo);
 4145                 if (ret > 0) {
 4146                         refchg |= ret & (LPTE_CHG | LPTE_REF);
 4147                         if ((refchg & ptebit) != 0)
 4148                                 break;
 4149                 }
 4150         }
 4151 
 4152         /* Save results */
 4153         if (refchg != 0) {
 4154                 m = PHYS_TO_VM_PAGE(PVO_PADDR(sp));
 4155                 atomic_set_32(&m->md.mdpg_attrs, refchg | MDPG_ATTR_SP);
 4156         }
 4157 
 4158         return (refchg);
 4159 }
 4160 
 4161 static int64_t
 4162 moea64_sp_query(struct pvo_entry *pvo, uint64_t ptebit)
 4163 {
 4164         int64_t refchg;
 4165         pmap_t pmap;
 4166 
 4167         pmap = pvo->pvo_pmap;
 4168         PMAP_LOCK(pmap);
 4169 
 4170         /*
 4171          * Check if SP was demoted/removed before pmap lock was acquired.
 4172          */
 4173         if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) {
 4174                 CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx",
 4175                     __func__, (uintmax_t)PVO_PADDR(pvo));
 4176                 PMAP_UNLOCK(pmap);
 4177                 return (-1);
 4178         }
 4179 
 4180         refchg = moea64_sp_query_locked(pvo, ptebit);
 4181         PMAP_UNLOCK(pmap);
 4182 
 4183         CTR4(KTR_PMAP, "%s: va=%#jx, pa=%#jx: refchg=%#jx",
 4184             __func__, (uintmax_t)PVO_VADDR(pvo),
 4185             (uintmax_t)PVO_PADDR(pvo), (uintmax_t)refchg);
 4186 
 4187         return (refchg);
 4188 }
 4189 
 4190 static int64_t
 4191 moea64_sp_pvo_clear(struct pvo_entry *pvo, uint64_t ptebit)
 4192 {
 4193         int64_t refchg, ret;
 4194         pmap_t pmap;
 4195         struct pvo_entry *sp;
 4196         vm_offset_t eva;
 4197         vm_page_t m;
 4198 
 4199         pmap = pvo->pvo_pmap;
 4200         PMAP_LOCK(pmap);
 4201 
 4202         /*
 4203          * Check if SP was demoted/removed before pmap lock was acquired.
 4204          */
 4205         if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) {
 4206                 CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx",
 4207                     __func__, (uintmax_t)PVO_PADDR(pvo));
 4208                 PMAP_UNLOCK(pmap);
 4209                 return (-1);
 4210         }
 4211 
 4212         /* Get first SP PVO */
 4213         if ((PVO_VADDR(pvo) & HPT_SP_MASK) != 0) {
 4214                 sp = moea64_pvo_find_va(pmap, PVO_VADDR(pvo) & ~HPT_SP_MASK);
 4215                 KASSERT(sp != NULL, ("%s: missing PVO for va %#jx",
 4216                      __func__, (uintmax_t)(PVO_VADDR(pvo) & ~HPT_SP_MASK)));
 4217         } else
 4218                 sp = pvo;
 4219         eva = PVO_VADDR(sp) + HPT_SP_SIZE;
 4220 
 4221         refchg = 0;
 4222         for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva;
 4223             pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) {
 4224                 ret = moea64_pte_clear(pvo, ptebit);
 4225                 if (ret > 0)
 4226                         refchg |= ret & (LPTE_CHG | LPTE_REF);
 4227         }
 4228 
 4229         m = PHYS_TO_VM_PAGE(PVO_PADDR(sp));
 4230         atomic_clear_32(&m->md.mdpg_attrs, ptebit);
 4231         PMAP_UNLOCK(pmap);
 4232 
 4233         CTR4(KTR_PMAP, "%s: va=%#jx, pa=%#jx: refchg=%#jx",
 4234             __func__, (uintmax_t)PVO_VADDR(sp),
 4235             (uintmax_t)PVO_PADDR(sp), (uintmax_t)refchg);
 4236 
 4237         return (refchg);
 4238 }
 4239 
 4240 static int64_t
 4241 moea64_sp_clear(struct pvo_entry *pvo, vm_page_t m, uint64_t ptebit)
 4242 {
 4243         int64_t count, ret;
 4244         pmap_t pmap;
 4245 
 4246         count = 0;
 4247         pmap = pvo->pvo_pmap;
 4248 
 4249         /*
 4250          * Since this reference bit is shared by 4096 4KB pages, it
 4251          * should not be cleared every time it is tested. Apply a
 4252          * simple "hash" function on the physical page number, the
 4253          * virtual superpage number, and the pmap address to select
 4254          * one 4KB page out of the 4096 on which testing the
 4255          * reference bit will result in clearing that reference bit.
 4256          * This function is designed to avoid the selection of the
 4257          * same 4KB page for every 16MB page mapping.
 4258          *
 4259          * Always leave the reference bit of a wired mapping set, as
 4260          * the current state of its reference bit won't affect page
 4261          * replacement.
 4262          */
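        /*
         * For example (made-up values; 4 KB pages and 16 MB superpages, so
         * HPT_SP_PAGES - 1 == 0xfff): with VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT
         * ending in 0x345, PVO_VADDR(pvo) >> HPT_SP_SHIFT ending in 0x007
         * and a pmap pointer ending in 0x342, the XOR of the low 12 bits is
         * 0x345 ^ 0x007 ^ 0x342 = 0, so this 4 KB page is the one whose REF
         * test actually clears the bit; the other 4095 pages of the
         * superpage simply report REF as set.
         */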
 4263         if (ptebit == LPTE_REF && (((VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) ^
 4264             (PVO_VADDR(pvo) >> HPT_SP_SHIFT) ^ (uintptr_t)pmap) &
 4265             (HPT_SP_PAGES - 1)) == 0 && (pvo->pvo_vaddr & PVO_WIRED) == 0) {
 4266                 if ((ret = moea64_sp_pvo_clear(pvo, ptebit)) == -1)
 4267                         return (-1);
 4268 
 4269                 if ((ret & ptebit) != 0)
 4270                         count++;
 4271 
 4272         /*
 4273          * If this page was not selected by the hash function, then assume
 4274          * its REF bit was set.
 4275          */
 4276         } else if (ptebit == LPTE_REF) {
 4277                 count++;
 4278 
 4279         /*
 4280          * To clear the CHG bit of a single SP page, first it must be demoted.
 4281          * But if no CHG bit is set, no bit needs to be cleared and thus no
 4282          * SP demotion is needed.
 4283          */
 4284         } else {
 4285                 CTR4(KTR_PMAP, "%s: ptebit=%#jx, va=%#jx, pa=%#jx",
 4286                     __func__, (uintmax_t)ptebit, (uintmax_t)PVO_VADDR(pvo),
 4287                     (uintmax_t)PVO_PADDR(pvo));
 4288 
 4289                 PMAP_LOCK(pmap);
 4290 
 4291                 /*
 4292                  * Make sure SP wasn't demoted/removed before pmap lock
 4293                  * was acquired.
 4294                  */
 4295                 if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) {
 4296                         CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx",
 4297                             __func__, (uintmax_t)PVO_PADDR(pvo));
 4298                         PMAP_UNLOCK(pmap);
 4299                         return (-1);
 4300                 }
 4301 
 4302                 ret = moea64_sp_query_locked(pvo, ptebit);
 4303                 if ((ret & ptebit) != 0)
 4304                         count++;
 4305                 else {
 4306                         PMAP_UNLOCK(pmap);
 4307                         return (0);
 4308                 }
 4309 
 4310                 moea64_sp_demote(pvo);
 4311                 moea64_pte_clear(pvo, ptebit);
 4312 
 4313                 /*
 4314                  * Write protect the mapping to a single page so that a
 4315                  * subsequent write access may repromote.
 4316                  */
 4317                 if ((pvo->pvo_vaddr & PVO_WIRED) == 0)
 4318                         moea64_pvo_protect(pmap, pvo,
 4319                             pvo->pvo_pte.prot & ~VM_PROT_WRITE);
 4320 
 4321                 PMAP_UNLOCK(pmap);
 4322         }
 4323 
 4324         return (count);
 4325 }
