FreeBSD/Linux Kernel Cross Reference
sys/amd64/amd64/pmap.c
1 /*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 *
13 * This code is derived from software contributed to Berkeley by
14 * the Systems Programming Group of the University of Utah Computer
15 * Science Department and William Jolitz of UUNET Technologies Inc.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 * notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 * notice, this list of conditions and the following disclaimer in the
24 * documentation and/or other materials provided with the distribution.
25 * 3. All advertising materials mentioning features or use of this software
26 * must display the following acknowledgement:
27 * This product includes software developed by the University of
28 * California, Berkeley and its contributors.
29 * 4. Neither the name of the University nor the names of its contributors
30 * may be used to endorse or promote products derived from this software
31 * without specific prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
43 * SUCH DAMAGE.
44 *
45 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
46 */
47 /*-
48 * Copyright (c) 2003 Networks Associates Technology, Inc.
49 * All rights reserved.
50 *
51 * This software was developed for the FreeBSD Project by Jake Burkholder,
52 * Safeport Network Services, and Network Associates Laboratories, the
53 * Security Research Division of Network Associates, Inc. under
54 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
55 * CHATS research program.
56 *
57 * Redistribution and use in source and binary forms, with or without
58 * modification, are permitted provided that the following conditions
59 * are met:
60 * 1. Redistributions of source code must retain the above copyright
61 * notice, this list of conditions and the following disclaimer.
62 * 2. Redistributions in binary form must reproduce the above copyright
63 * notice, this list of conditions and the following disclaimer in the
64 * documentation and/or other materials provided with the distribution.
65 *
66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
67 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
68 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
69 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
70 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
71 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
72 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
73 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
74 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
75 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
76 * SUCH DAMAGE.
77 */
78
79 #define AMD64_NPT_AWARE
80
81 #include <sys/cdefs.h>
82 __FBSDID("$FreeBSD: releng/10.4/sys/amd64/amd64/pmap.c 322063 2017-08-04 21:38:34Z marius $");
83
84 /*
85 * Manages physical address maps.
86 *
87 * Since the information managed by this module is
88 * also stored by the logical address mapping module,
89 * this module may throw away valid virtual-to-physical
90 * mappings at almost any time. However, invalidations
91 * of virtual-to-physical mappings must be done as
92 * requested.
93 *
94 * In order to cope with hardware architectures which
95 * make virtual-to-physical map invalidates expensive,
96 * this module may delay invalidate or reduced protection
97 * operations until such time as they are actually
98 * necessary. This module is given full information as
99 * to which processors are currently using which maps,
100 * and to when physical maps must be made correct.
101 */
102
103 #include "opt_pmap.h"
104 #include "opt_vm.h"
105
106 #include <sys/param.h>
107 #include <sys/bus.h>
108 #include <sys/systm.h>
109 #include <sys/kernel.h>
110 #include <sys/ktr.h>
111 #include <sys/lock.h>
112 #include <sys/malloc.h>
113 #include <sys/mman.h>
114 #include <sys/mutex.h>
115 #include <sys/proc.h>
116 #include <sys/rwlock.h>
117 #include <sys/sx.h>
118 #include <sys/vmmeter.h>
119 #include <sys/sched.h>
120 #include <sys/sysctl.h>
121 #include <sys/_unrhdr.h>
122 #include <sys/smp.h>
123
124 #include <vm/vm.h>
125 #include <vm/vm_param.h>
126 #include <vm/vm_kern.h>
127 #include <vm/vm_page.h>
128 #include <vm/vm_map.h>
129 #include <vm/vm_object.h>
130 #include <vm/vm_extern.h>
131 #include <vm/vm_pageout.h>
132 #include <vm/vm_pager.h>
133 #include <vm/vm_phys.h>
134 #include <vm/vm_radix.h>
135 #include <vm/vm_reserv.h>
136 #include <vm/uma.h>
137
138 #include <machine/intr_machdep.h>
139 #include <machine/apicvar.h>
140 #include <machine/cpu.h>
141 #include <machine/cputypes.h>
142 #include <machine/md_var.h>
143 #include <machine/pcb.h>
144 #include <machine/specialreg.h>
145 #ifdef SMP
146 #include <machine/smp.h>
147 #endif
148
149 static __inline boolean_t
150 pmap_type_guest(pmap_t pmap)
151 {
152
153 return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
154 }
155
156 static __inline boolean_t
157 pmap_emulate_ad_bits(pmap_t pmap)
158 {
159
160 return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
161 }
162
163 static __inline pt_entry_t
164 pmap_valid_bit(pmap_t pmap)
165 {
166 pt_entry_t mask;
167
168 switch (pmap->pm_type) {
169 case PT_X86:
170 case PT_RVI:
171 mask = X86_PG_V;
172 break;
173 case PT_EPT:
174 if (pmap_emulate_ad_bits(pmap))
175 mask = EPT_PG_EMUL_V;
176 else
177 mask = EPT_PG_READ;
178 break;
179 default:
180 panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
181 }
182
183 return (mask);
184 }
185
186 static __inline pt_entry_t
187 pmap_rw_bit(pmap_t pmap)
188 {
189 pt_entry_t mask;
190
191 switch (pmap->pm_type) {
192 case PT_X86:
193 case PT_RVI:
194 mask = X86_PG_RW;
195 break;
196 case PT_EPT:
197 if (pmap_emulate_ad_bits(pmap))
198 mask = EPT_PG_EMUL_RW;
199 else
200 mask = EPT_PG_WRITE;
201 break;
202 default:
203 panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
204 }
205
206 return (mask);
207 }
208
209 static __inline pt_entry_t
210 pmap_global_bit(pmap_t pmap)
211 {
212 pt_entry_t mask;
213
214 switch (pmap->pm_type) {
215 case PT_X86:
216 mask = X86_PG_G;
217 break;
218 case PT_RVI:
219 case PT_EPT:
220 mask = 0;
221 break;
222 default:
223 panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
224 }
225
226 return (mask);
227 }
228
229 static __inline pt_entry_t
230 pmap_accessed_bit(pmap_t pmap)
231 {
232 pt_entry_t mask;
233
234 switch (pmap->pm_type) {
235 case PT_X86:
236 case PT_RVI:
237 mask = X86_PG_A;
238 break;
239 case PT_EPT:
240 if (pmap_emulate_ad_bits(pmap))
241 mask = EPT_PG_READ;
242 else
243 mask = EPT_PG_A;
244 break;
245 default:
246 panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
247 }
248
249 return (mask);
250 }
251
252 static __inline pt_entry_t
253 pmap_modified_bit(pmap_t pmap)
254 {
255 pt_entry_t mask;
256
257 switch (pmap->pm_type) {
258 case PT_X86:
259 case PT_RVI:
260 mask = X86_PG_M;
261 break;
262 case PT_EPT:
263 if (pmap_emulate_ad_bits(pmap))
264 mask = EPT_PG_WRITE;
265 else
266 mask = EPT_PG_M;
267 break;
268 default:
269 panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
270 }
271
272 return (mask);
273 }
274
275 #if !defined(DIAGNOSTIC)
276 #ifdef __GNUC_GNU_INLINE__
277 #define PMAP_INLINE __attribute__((__gnu_inline__)) inline
278 #else
279 #define PMAP_INLINE extern inline
280 #endif
281 #else
282 #define PMAP_INLINE
283 #endif
284
285 #ifdef PV_STATS
286 #define PV_STAT(x) do { x ; } while (0)
287 #else
288 #define PV_STAT(x) do { } while (0)
289 #endif
290
291 #define pa_index(pa) ((pa) >> PDRSHIFT)
292 #define pa_to_pvh(pa) (&pv_table[pa_index(pa)])
293
294 #define NPV_LIST_LOCKS MAXCPU
295
296 #define PHYS_TO_PV_LIST_LOCK(pa) \
297 (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
298
299 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \
300 struct rwlock **_lockp = (lockp); \
301 struct rwlock *_new_lock; \
302 \
303 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \
304 if (_new_lock != *_lockp) { \
305 if (*_lockp != NULL) \
306 rw_wunlock(*_lockp); \
307 *_lockp = _new_lock; \
308 rw_wlock(*_lockp); \
309 } \
310 } while (0)
311
312 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \
313 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
314
315 #define RELEASE_PV_LIST_LOCK(lockp) do { \
316 struct rwlock **_lockp = (lockp); \
317 \
318 if (*_lockp != NULL) { \
319 rw_wunlock(*_lockp); \
320 *_lockp = NULL; \
321 } \
322 } while (0)
323
324 #define VM_PAGE_TO_PV_LIST_LOCK(m) \
325 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
326
327 struct pmap kernel_pmap_store;
328
329 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
330 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
331
332 int nkpt;
333 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
334 "Number of kernel page table pages allocated on bootup");
335
336 static int ndmpdp;
337 vm_paddr_t dmaplimit;
338 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
339 pt_entry_t pg_nx;
340
341 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
342
343 static int pat_works = 1;
344 SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
345 "Is page attribute table fully functional?");
346
347 static int pg_ps_enabled = 1;
348 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
349 "Are large page mappings enabled?");
350
351 #define PAT_INDEX_SIZE 8
352 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */
353
354 static u_int64_t KPTphys; /* phys addr of kernel level 1 */
355 static u_int64_t KPDphys; /* phys addr of kernel level 2 */
356 u_int64_t KPDPphys; /* phys addr of kernel level 3 */
357 u_int64_t KPML4phys; /* phys addr of kernel level 4 */
358
359 static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */
360 static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */
361 static int ndmpdpphys; /* number of DMPDPphys pages */
362
363 /*
364 * pmap_mapdev support pre initialization (i.e. console)
365 */
366 #define PMAP_PREINIT_MAPPING_COUNT 8
367 static struct pmap_preinit_mapping {
368 vm_paddr_t pa;
369 vm_offset_t va;
370 vm_size_t sz;
371 int mode;
372 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
373 static int pmap_initialized;
374
375 static struct rwlock_padalign pvh_global_lock;
376
377 /*
378 * Data for the pv entry allocation mechanism
379 */
380 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
381 static struct mtx pv_chunks_mutex;
382 static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
383 static struct md_page *pv_table;
384 static struct md_page pv_dummy;
385
386 /*
387 * All those kernel PT submaps that BSD is so fond of
388 */
389 pt_entry_t *CMAP1 = 0;
390 caddr_t CADDR1 = 0;
391
392 static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */
393
394 static struct unrhdr pcid_unr;
395 static struct mtx pcid_mtx;
396 int pmap_pcid_enabled = 0;
397 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN, &pmap_pcid_enabled,
398 0, "Is TLB Context ID enabled ?");
399 int invpcid_works = 0;
400 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
401 "Is the invpcid instruction available ?");
402
403 static int
404 pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
405 {
406 int i;
407 uint64_t res;
408
409 res = 0;
410 CPU_FOREACH(i) {
411 res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
412 }
413 return (sysctl_handle_64(oidp, &res, 0, req));
414 }
415 SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
416 CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
417 "Count of saved TLB context on switch");
418
419 /* pmap_copy_pages() over non-DMAP */
420 static struct mtx cpage_lock;
421 static vm_offset_t cpage_a;
422 static vm_offset_t cpage_b;
423
424 /*
425 * Crashdump maps.
426 */
427 static caddr_t crashdumpmap;
428
429 static void free_pv_chunk(struct pv_chunk *pc);
430 static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
431 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
432 static int popcnt_pc_map_elem(uint64_t elem);
433 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
434 static void reserve_pv_entries(pmap_t pmap, int needed,
435 struct rwlock **lockp);
436 static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
437 struct rwlock **lockp);
438 static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
439 struct rwlock **lockp);
440 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
441 struct rwlock **lockp);
442 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
443 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
444 vm_offset_t va);
445
446 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
447 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
448 static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
449 vm_offset_t va, struct rwlock **lockp);
450 static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
451 vm_offset_t va);
452 static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
453 vm_prot_t prot, struct rwlock **lockp);
454 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
455 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
456 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
457 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
458 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
459 pd_entry_t pde);
460 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
461 static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
462 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
463 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
464 struct rwlock **lockp);
465 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
466 vm_prot_t prot);
467 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
468 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
469 struct spglist *free, struct rwlock **lockp);
470 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
471 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
472 static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
473 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
474 struct spglist *free);
475 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
476 vm_page_t m, struct rwlock **lockp);
477 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
478 pd_entry_t newpde);
479 static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
480
481 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
482 struct rwlock **lockp);
483 static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
484 struct rwlock **lockp);
485 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
486 struct rwlock **lockp);
487
488 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
489 struct spglist *free);
490 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
491 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
492
493 /*
494 * Move the kernel virtual free pointer to the next
495 * 2MB. This is used to help improve performance
496 * by using a large (2MB) page for much of the kernel
497 * (.text, .data, .bss)
498 */
499 static vm_offset_t
500 pmap_kmem_choose(vm_offset_t addr)
501 {
502 vm_offset_t newaddr = addr;
503
504 newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
505 return (newaddr);
506 }
507
508 /********************/
509 /* Inline functions */
510 /********************/
511
512 /* Return a non-clipped PD index for a given VA */
513 static __inline vm_pindex_t
514 pmap_pde_pindex(vm_offset_t va)
515 {
516 return (va >> PDRSHIFT);
517 }
518
519
520 /* Return various clipped indexes for a given VA */
521 static __inline vm_pindex_t
522 pmap_pte_index(vm_offset_t va)
523 {
524
525 return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
526 }
527
528 static __inline vm_pindex_t
529 pmap_pde_index(vm_offset_t va)
530 {
531
532 return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
533 }
534
535 static __inline vm_pindex_t
536 pmap_pdpe_index(vm_offset_t va)
537 {
538
539 return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
540 }
541
542 static __inline vm_pindex_t
543 pmap_pml4e_index(vm_offset_t va)
544 {
545
546 return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
547 }
548
549 /* Return a pointer to the PML4 slot that corresponds to a VA */
550 static __inline pml4_entry_t *
551 pmap_pml4e(pmap_t pmap, vm_offset_t va)
552 {
553
554 return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
555 }
556
557 /* Return a pointer to the PDP slot that corresponds to a VA */
558 static __inline pdp_entry_t *
559 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
560 {
561 pdp_entry_t *pdpe;
562
563 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
564 return (&pdpe[pmap_pdpe_index(va)]);
565 }
566
567 /* Return a pointer to the PDP slot that corresponds to a VA */
568 static __inline pdp_entry_t *
569 pmap_pdpe(pmap_t pmap, vm_offset_t va)
570 {
571 pml4_entry_t *pml4e;
572 pt_entry_t PG_V;
573
574 PG_V = pmap_valid_bit(pmap);
575 pml4e = pmap_pml4e(pmap, va);
576 if ((*pml4e & PG_V) == 0)
577 return (NULL);
578 return (pmap_pml4e_to_pdpe(pml4e, va));
579 }
580
581 /* Return a pointer to the PD slot that corresponds to a VA */
582 static __inline pd_entry_t *
583 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
584 {
585 pd_entry_t *pde;
586
587 pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
588 return (&pde[pmap_pde_index(va)]);
589 }
590
591 /* Return a pointer to the PD slot that corresponds to a VA */
592 static __inline pd_entry_t *
593 pmap_pde(pmap_t pmap, vm_offset_t va)
594 {
595 pdp_entry_t *pdpe;
596 pt_entry_t PG_V;
597
598 PG_V = pmap_valid_bit(pmap);
599 pdpe = pmap_pdpe(pmap, va);
600 if (pdpe == NULL || (*pdpe & PG_V) == 0)
601 return (NULL);
602 return (pmap_pdpe_to_pde(pdpe, va));
603 }
604
605 /* Return a pointer to the PT slot that corresponds to a VA */
606 static __inline pt_entry_t *
607 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
608 {
609 pt_entry_t *pte;
610
611 pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
612 return (&pte[pmap_pte_index(va)]);
613 }
614
615 /* Return a pointer to the PT slot that corresponds to a VA */
616 static __inline pt_entry_t *
617 pmap_pte(pmap_t pmap, vm_offset_t va)
618 {
619 pd_entry_t *pde;
620 pt_entry_t PG_V;
621
622 PG_V = pmap_valid_bit(pmap);
623 pde = pmap_pde(pmap, va);
624 if (pde == NULL || (*pde & PG_V) == 0)
625 return (NULL);
626 if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */
627 return ((pt_entry_t *)pde);
628 return (pmap_pde_to_pte(pde, va));
629 }
630
631 static __inline void
632 pmap_resident_count_inc(pmap_t pmap, int count)
633 {
634
635 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
636 pmap->pm_stats.resident_count += count;
637 }
638
639 static __inline void
640 pmap_resident_count_dec(pmap_t pmap, int count)
641 {
642
643 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
644 KASSERT(pmap->pm_stats.resident_count >= count,
645 ("pmap %p resident count underflow %ld %d", pmap,
646 pmap->pm_stats.resident_count, count));
647 pmap->pm_stats.resident_count -= count;
648 }
649
650 PMAP_INLINE pt_entry_t *
651 vtopte(vm_offset_t va)
652 {
653 u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
654
655 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
656
657 return (PTmap + ((va >> PAGE_SHIFT) & mask));
658 }
659
660 static __inline pd_entry_t *
661 vtopde(vm_offset_t va)
662 {
663 u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
664
665 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
666
667 return (PDmap + ((va >> PDRSHIFT) & mask));
668 }
669
670 static u_int64_t
671 allocpages(vm_paddr_t *firstaddr, int n)
672 {
673 u_int64_t ret;
674
675 ret = *firstaddr;
676 bzero((void *)ret, n * PAGE_SIZE);
677 *firstaddr += n * PAGE_SIZE;
678 return (ret);
679 }
680
681 CTASSERT(powerof2(NDMPML4E));
682
683 /* number of kernel PDP slots */
684 #define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG)
685
686 static void
687 nkpt_init(vm_paddr_t addr)
688 {
689 int pt_pages;
690
691 #ifdef NKPT
692 pt_pages = NKPT;
693 #else
694 pt_pages = howmany(addr, 1 << PDRSHIFT);
695 pt_pages += NKPDPE(pt_pages);
696
697 /*
698 * Add some slop beyond the bare minimum required for bootstrapping
699 * the kernel.
700 *
701 * This is quite important when allocating KVA for kernel modules.
702 * The modules are required to be linked in the negative 2GB of
703 * the address space. If we run out of KVA in this region then
704 * pmap_growkernel() will need to allocate page table pages to map
705 * the entire 512GB of KVA space which is an unnecessary tax on
706 * physical memory.
707 *
708 * Secondly, device memory mapped as part of setting up the low-
709 * level console(s) is taken from KVA, starting at virtual_avail.
710 * This is because cninit() is called after pmap_bootstrap() but
711 * before vm_init() and pmap_init(). 20MB for a frame buffer is
712 * not uncommon.
713 */
714 pt_pages += 32; /* 64MB additional slop. */
715 #endif
716 nkpt = pt_pages;
717 }
718
719 static void
720 create_pagetables(vm_paddr_t *firstaddr)
721 {
722 int i, j, ndm1g, nkpdpe;
723 pt_entry_t *pt_p;
724 pd_entry_t *pd_p;
725 pdp_entry_t *pdp_p;
726 pml4_entry_t *p4_p;
727
728 /* Allocate page table pages for the direct map */
729 ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
730 if (ndmpdp < 4) /* Minimum 4GB of dirmap */
731 ndmpdp = 4;
732 ndmpdpphys = howmany(ndmpdp, NPDPEPG);
733 if (ndmpdpphys > NDMPML4E) {
734 /*
735 * Each NDMPML4E allows 512 GB, so limit to that,
736 * and then readjust ndmpdp and ndmpdpphys.
737 */
738 printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
739 Maxmem = atop(NDMPML4E * NBPML4);
740 ndmpdpphys = NDMPML4E;
741 ndmpdp = NDMPML4E * NPDEPG;
742 }
743 DMPDPphys = allocpages(firstaddr, ndmpdpphys);
744 ndm1g = 0;
745 if ((amd_feature & AMDID_PAGE1GB) != 0)
746 ndm1g = ptoa(Maxmem) >> PDPSHIFT;
747 if (ndm1g < ndmpdp)
748 DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
749 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
750
751 /* Allocate pages */
752 KPML4phys = allocpages(firstaddr, 1);
753 KPDPphys = allocpages(firstaddr, NKPML4E);
754
755 /*
756 * Allocate the initial number of kernel page table pages required to
757 * bootstrap. We defer this until after all memory-size dependent
758 * allocations are done (e.g. direct map), so that we don't have to
759 * build in too much slop in our estimate.
760 *
761 * Note that when NKPML4E > 1, we have an empty page underneath
762 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
763 * pages. (pmap_enter requires a PD page to exist for each KPML4E.)
764 */
765 nkpt_init(*firstaddr);
766 nkpdpe = NKPDPE(nkpt);
767
768 KPTphys = allocpages(firstaddr, nkpt);
769 KPDphys = allocpages(firstaddr, nkpdpe);
770
771 /* Fill in the underlying page table pages */
772 /* Nominally read-only (but really R/W) from zero to physfree */
773 /* XXX not fully used, underneath 2M pages */
774 pt_p = (pt_entry_t *)KPTphys;
775 for (i = 0; ptoa(i) < *firstaddr; i++)
776 pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;
777
778 /* Now map the page tables at their location within PTmap */
779 pd_p = (pd_entry_t *)KPDphys;
780 for (i = 0; i < nkpt; i++)
781 pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
782
783 /* Map from zero to end of allocations under 2M pages */
784 /* This replaces some of the KPTphys entries above */
785 for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
786 pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
787 X86_PG_G;
788
789 /* And connect up the PD to the PDP (leaving room for L4 pages) */
790 pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
791 for (i = 0; i < nkpdpe; i++)
792 pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
793 PG_U;
794
795 /*
796 * Now, set up the direct map region using 2MB and/or 1GB pages. If
797 * the end of physical memory is not aligned to a 1GB page boundary,
798 * then the residual physical memory is mapped with 2MB pages. Later,
799 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
800 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
801 * that are partially used.
802 */
803 pd_p = (pd_entry_t *)DMPDphys;
804 for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
805 pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
806 /* Preset PG_M and PG_A because demotion expects it. */
807 pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
808 X86_PG_M | X86_PG_A;
809 }
810 pdp_p = (pdp_entry_t *)DMPDPphys;
811 for (i = 0; i < ndm1g; i++) {
812 pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
813 /* Preset PG_M and PG_A because demotion expects it. */
814 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
815 X86_PG_M | X86_PG_A;
816 }
817 for (j = 0; i < ndmpdp; i++, j++) {
818 pdp_p[i] = DMPDphys + ptoa(j);
819 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U;
820 }
821
822 /* And recursively map PML4 to itself in order to get PTmap */
823 p4_p = (pml4_entry_t *)KPML4phys;
824 p4_p[PML4PML4I] = KPML4phys;
825 p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U;
826
827 /* Connect the Direct Map slot(s) up to the PML4. */
828 for (i = 0; i < ndmpdpphys; i++) {
829 p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
830 p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U;
831 }
832
833 /* Connect the KVA slots up to the PML4 */
834 for (i = 0; i < NKPML4E; i++) {
835 p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
836 p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U;
837 }
838 }
839
840 /*
841 * Bootstrap the system enough to run with virtual memory.
842 *
843 * On amd64 this is called after mapping has already been enabled
844 * and just syncs the pmap module with what has already been done.
845 * [We can't call it easily with mapping off since the kernel is not
846 * mapped with PA == VA, hence we would have to relocate every address
847 * from the linked base (virtual) address "KERNBASE" to the actual
848 * (physical) address starting relative to 0]
849 */
850 void
851 pmap_bootstrap(vm_paddr_t *firstaddr)
852 {
853 vm_offset_t va;
854 pt_entry_t *pte;
855
856 /*
857 * Create an initial set of page tables to run the kernel in.
858 */
859 create_pagetables(firstaddr);
860
861 /*
862 * Add a physical memory segment (vm_phys_seg) corresponding to the
863 * preallocated kernel page table pages so that vm_page structures
864 * representing these pages will be created. The vm_page structures
865 * are required for promotion of the corresponding kernel virtual
866 * addresses to superpage mappings.
867 */
868 vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
869
870 virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
871 virtual_avail = pmap_kmem_choose(virtual_avail);
872
873 virtual_end = VM_MAX_KERNEL_ADDRESS;
874
875
876 /* XXX do %cr0 as well */
877 load_cr4(rcr4() | CR4_PGE | CR4_PSE);
878 load_cr3(KPML4phys);
879 if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
880 load_cr4(rcr4() | CR4_SMEP);
881
882 /*
883 * Initialize the kernel pmap (which is statically allocated).
884 */
885 PMAP_LOCK_INIT(kernel_pmap);
886 kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
887 kernel_pmap->pm_cr3 = KPML4phys;
888 CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */
889 CPU_FILL(&kernel_pmap->pm_save); /* always superset of pm_active */
890 TAILQ_INIT(&kernel_pmap->pm_pvchunk);
891 kernel_pmap->pm_flags = pmap_flags;
892
893 /*
894 * Initialize the global pv list lock.
895 */
896 rw_init(&pvh_global_lock, "pmap pv global");
897
898 /*
899 * Reserve some special page table entries/VA space for temporary
900 * mapping of pages.
901 */
902 #define SYSMAP(c, p, v, n) \
903 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
904
905 va = virtual_avail;
906 pte = vtopte(va);
907
908 /*
909 * Crashdump maps. The first page is reused as CMAP1 for the
910 * memory test.
911 */
912 SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
913 CADDR1 = crashdumpmap;
914
915 virtual_avail = va;
916
917 /*
918 * Initialize the PAT MSR.
919 * pmap_init_pat() clears and sets CR4_PGE, which, as a
920 * side-effect, invalidates stale PG_G TLB entries that might
921 * have been created in our pre-boot environment.
922 */
923 pmap_init_pat();
924
925 /* Initialize TLB Context Id. */
926 TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
927 if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
928 load_cr4(rcr4() | CR4_PCIDE);
929 mtx_init(&pcid_mtx, "pcid", NULL, MTX_DEF);
930 init_unrhdr(&pcid_unr, 1, (1 << 12) - 1, &pcid_mtx);
931 /* Check for INVPCID support */
932 invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID)
933 != 0;
934 kernel_pmap->pm_pcid = 0;
935 #ifndef SMP
936 pmap_pcid_enabled = 0;
937 #endif
938 } else
939 pmap_pcid_enabled = 0;
940 }
941
942 /*
943 * Setup the PAT MSR.
944 */
945 void
946 pmap_init_pat(void)
947 {
948 int pat_table[PAT_INDEX_SIZE];
949 uint64_t pat_msr;
950 u_long cr0, cr4;
951 int i;
952
953 /* Bail if this CPU doesn't implement PAT. */
954 if ((cpu_feature & CPUID_PAT) == 0)
955 panic("no PAT??");
956
957 /* Set default PAT index table. */
958 for (i = 0; i < PAT_INDEX_SIZE; i++)
959 pat_table[i] = -1;
960 pat_table[PAT_WRITE_BACK] = 0;
961 pat_table[PAT_WRITE_THROUGH] = 1;
962 pat_table[PAT_UNCACHEABLE] = 3;
963 pat_table[PAT_WRITE_COMBINING] = 3;
964 pat_table[PAT_WRITE_PROTECTED] = 3;
965 pat_table[PAT_UNCACHED] = 3;
966
967 /* Initialize default PAT entries. */
968 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
969 PAT_VALUE(1, PAT_WRITE_THROUGH) |
970 PAT_VALUE(2, PAT_UNCACHED) |
971 PAT_VALUE(3, PAT_UNCACHEABLE) |
972 PAT_VALUE(4, PAT_WRITE_BACK) |
973 PAT_VALUE(5, PAT_WRITE_THROUGH) |
974 PAT_VALUE(6, PAT_UNCACHED) |
975 PAT_VALUE(7, PAT_UNCACHEABLE);
976
977 if (pat_works) {
978 /*
979 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
980 * Program 5 and 6 as WP and WC.
981 * Leave 4 and 7 as WB and UC.
982 */
983 pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
984 pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
985 PAT_VALUE(6, PAT_WRITE_COMBINING);
986 pat_table[PAT_UNCACHED] = 2;
987 pat_table[PAT_WRITE_PROTECTED] = 5;
988 pat_table[PAT_WRITE_COMBINING] = 6;
989 } else {
990 /*
991 * Just replace PAT Index 2 with WC instead of UC-.
992 */
993 pat_msr &= ~PAT_MASK(2);
994 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
995 pat_table[PAT_WRITE_COMBINING] = 2;
996 }
997
998 /* Disable PGE. */
999 cr4 = rcr4();
1000 load_cr4(cr4 & ~CR4_PGE);
1001
1002 /* Disable caches (CD = 1, NW = 0). */
1003 cr0 = rcr0();
1004 load_cr0((cr0 & ~CR0_NW) | CR0_CD);
1005
1006 /* Flushes caches and TLBs. */
1007 wbinvd();
1008 invltlb();
1009
1010 /* Update PAT and index table. */
1011 wrmsr(MSR_PAT, pat_msr);
1012 for (i = 0; i < PAT_INDEX_SIZE; i++)
1013 pat_index[i] = pat_table[i];
1014
1015 /* Flush caches and TLBs again. */
1016 wbinvd();
1017 invltlb();
1018
1019 /* Restore caches and PGE. */
1020 load_cr0(cr0);
1021 load_cr4(cr4);
1022 }
1023
1024 /*
1025 * Initialize a vm_page's machine-dependent fields.
1026 */
1027 void
1028 pmap_page_init(vm_page_t m)
1029 {
1030
1031 TAILQ_INIT(&m->md.pv_list);
1032 m->md.pat_mode = PAT_WRITE_BACK;
1033 }
1034
1035 /*
1036 * Initialize the pmap module.
1037 * Called by vm_init, to initialize any structures that the pmap
1038 * system needs to map virtual memory.
1039 */
1040 void
1041 pmap_init(void)
1042 {
1043 struct pmap_preinit_mapping *ppim;
1044 vm_page_t mpte;
1045 vm_size_t s;
1046 int i, pv_npg;
1047
1048 /*
1049 * Initialize the vm page array entries for the kernel pmap's
1050 * page table pages.
1051 */
1052 for (i = 0; i < nkpt; i++) {
1053 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
1054 KASSERT(mpte >= vm_page_array &&
1055 mpte < &vm_page_array[vm_page_array_size],
1056 ("pmap_init: page table page is out of range"));
1057 mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
1058 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
1059 }
1060
1061 /*
1062 * If the kernel is running on a virtual machine, then it must assume
1063 * that MCA is enabled by the hypervisor. Moreover, the kernel must
1064 * be prepared for the hypervisor changing the vendor and family that
1065 * are reported by CPUID. Consequently, the workaround for AMD Family
1066 * 10h Erratum 383 is enabled if the processor's feature set does not
1067 * include at least one feature that is only supported by older Intel
1068 * or newer AMD processors.
1069 */
1070 if (vm_guest == VM_GUEST_VM && (cpu_feature & CPUID_SS) == 0 &&
1071 (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
1072 CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
1073 AMDID2_FMA4)) == 0)
1074 workaround_erratum383 = 1;
1075
1076 /*
1077 * Are large page mappings enabled?
1078 */
1079 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
1080 if (pg_ps_enabled) {
1081 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1082 ("pmap_init: can't assign to pagesizes[1]"));
1083 pagesizes[1] = NBPDR;
1084 }
1085
1086 /*
1087 * Initialize the pv chunk list mutex.
1088 */
1089 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
1090
1091 /*
1092 * Initialize the pool of pv list locks.
1093 */
1094 for (i = 0; i < NPV_LIST_LOCKS; i++)
1095 rw_init(&pv_list_locks[i], "pmap pv list");
1096
1097 /*
1098 * Calculate the size of the pv head table for superpages.
1099 */
1100 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
1101
1102 /*
1103 * Allocate memory for the pv head table for superpages.
1104 */
1105 s = (vm_size_t)(pv_npg * sizeof(struct md_page));
1106 s = round_page(s);
1107 pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
1108 M_WAITOK | M_ZERO);
1109 for (i = 0; i < pv_npg; i++)
1110 TAILQ_INIT(&pv_table[i].pv_list);
1111 TAILQ_INIT(&pv_dummy.pv_list);
1112
1113 mtx_init(&cpage_lock, "cpage", NULL, MTX_DEF);
1114 cpage_a = kva_alloc(PAGE_SIZE);
1115 cpage_b = kva_alloc(PAGE_SIZE);
1116
1117 pmap_initialized = 1;
1118 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
1119 ppim = pmap_preinit_mapping + i;
1120 if (ppim->va == 0)
1121 continue;
1122 /* Make the direct map consistent */
1123 if (ppim->pa < dmaplimit && ppim->pa + ppim->sz < dmaplimit) {
1124 (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa),
1125 ppim->sz, ppim->mode);
1126 }
1127 if (!bootverbose)
1128 continue;
1129 printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i,
1130 ppim->pa, ppim->va, ppim->sz, ppim->mode);
1131 }
1132 }
1133
1134 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
1135 "2MB page mapping counters");
1136
1137 static u_long pmap_pde_demotions;
1138 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
1139 &pmap_pde_demotions, 0, "2MB page demotions");
1140
1141 static u_long pmap_pde_mappings;
1142 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
1143 &pmap_pde_mappings, 0, "2MB page mappings");
1144
1145 static u_long pmap_pde_p_failures;
1146 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
1147 &pmap_pde_p_failures, 0, "2MB page promotion failures");
1148
1149 static u_long pmap_pde_promotions;
1150 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
1151 &pmap_pde_promotions, 0, "2MB page promotions");
1152
1153 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
1154 "1GB page mapping counters");
1155
1156 static u_long pmap_pdpe_demotions;
1157 SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
1158 &pmap_pdpe_demotions, 0, "1GB page demotions");
1159
1160 /***************************************************
1161 * Low level helper routines.....
1162 ***************************************************/
1163
1164 static pt_entry_t
1165 pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
1166 {
1167 int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
1168
1169 switch (pmap->pm_type) {
1170 case PT_X86:
1171 case PT_RVI:
1172 /* Verify that both PAT bits are not set at the same time */
1173 KASSERT((entry & x86_pat_bits) != x86_pat_bits,
1174 ("Invalid PAT bits in entry %#lx", entry));
1175
1176 /* Swap the PAT bits if one of them is set */
1177 if ((entry & x86_pat_bits) != 0)
1178 entry ^= x86_pat_bits;
1179 break;
1180 case PT_EPT:
1181 /*
1182 * Nothing to do - the memory attributes are represented
1183 * the same way for regular pages and superpages.
1184 */
1185 break;
1186 default:
1187 panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type);
1188 }
1189
1190 return (entry);
1191 }
1192
1193 /*
1194 * Determine the appropriate bits to set in a PTE or PDE for a specified
1195 * caching mode.
1196 */
1197 static int
1198 pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
1199 {
1200 int cache_bits, pat_flag, pat_idx;
1201
1202 if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
1203 panic("Unknown caching mode %d\n", mode);
1204
1205 switch (pmap->pm_type) {
1206 case PT_X86:
1207 case PT_RVI:
1208 /* The PAT bit is different for PTE's and PDE's. */
1209 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
1210
1211 /* Map the caching mode to a PAT index. */
1212 pat_idx = pat_index[mode];
1213
1214 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
1215 cache_bits = 0;
1216 if (pat_idx & 0x4)
1217 cache_bits |= pat_flag;
1218 if (pat_idx & 0x2)
1219 cache_bits |= PG_NC_PCD;
1220 if (pat_idx & 0x1)
1221 cache_bits |= PG_NC_PWT;
1222 break;
1223
1224 case PT_EPT:
1225 cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
1226 break;
1227
1228 default:
1229 panic("unsupported pmap type %d", pmap->pm_type);
1230 }
1231
1232 return (cache_bits);
1233 }
1234
1235 static int
1236 pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
1237 {
1238 int mask;
1239
1240 switch (pmap->pm_type) {
1241 case PT_X86:
1242 case PT_RVI:
1243 mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
1244 break;
1245 case PT_EPT:
1246 mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
1247 break;
1248 default:
1249 panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
1250 }
1251
1252 return (mask);
1253 }
1254
1255 static __inline boolean_t
1256 pmap_ps_enabled(pmap_t pmap)
1257 {
1258
1259 return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
1260 }
1261
1262 static void
1263 pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
1264 {
1265
1266 switch (pmap->pm_type) {
1267 case PT_X86:
1268 break;
1269 case PT_RVI:
1270 case PT_EPT:
1271 /*
1272 * XXX
1273 * This is a little bogus since the generation number is
1274 * supposed to be bumped up when a region of the address
1275 * space is invalidated in the page tables.
1276 *
1277 * In this case the old PDE entry is valid but yet we want
1278 * to make sure that any mappings using the old entry are
1279 * invalidated in the TLB.
1280 *
1281 * The reason this works as expected is because we rendezvous
1282 * "all" host cpus and force any vcpu context to exit as a
1283 * side-effect.
1284 */
1285 atomic_add_acq_long(&pmap->pm_eptgen, 1);
1286 break;
1287 default:
1288 panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
1289 }
1290 pde_store(pde, newpde);
1291 }
1292
1293 /*
1294 * After changing the page size for the specified virtual address in the page
1295 * table, flush the corresponding entries from the processor's TLB. Only the
1296 * calling processor's TLB is affected.
1297 *
1298 * The calling thread must be pinned to a processor.
1299 */
1300 static void
1301 pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
1302 {
1303 pt_entry_t PG_G;
1304
1305 if (pmap_type_guest(pmap))
1306 return;
1307
1308 KASSERT(pmap->pm_type == PT_X86,
1309 ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
1310
1311 PG_G = pmap_global_bit(pmap);
1312
1313 if ((newpde & PG_PS) == 0)
1314 /* Demotion: flush a specific 2MB page mapping. */
1315 invlpg(va);
1316 else if ((newpde & PG_G) == 0)
1317 /*
1318 * Promotion: flush every 4KB page mapping from the TLB
1319 * because there are too many to flush individually.
1320 */
1321 invltlb();
1322 else {
1323 /*
1324 * Promotion: flush every 4KB page mapping from the TLB,
1325 * including any global (PG_G) mappings.
1326 */
1327 invltlb_globpcid();
1328 }
1329 }
1330 #ifdef SMP
1331
1332 static void
1333 pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va)
1334 {
1335 struct invpcid_descr d;
1336 uint64_t cr3;
1337
1338 if (invpcid_works) {
1339 d.pcid = pmap->pm_pcid;
1340 d.pad = 0;
1341 d.addr = va;
1342 invpcid(&d, INVPCID_ADDR);
1343 return;
1344 }
1345
1346 cr3 = rcr3();
1347 critical_enter();
1348 load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
1349 invlpg(va);
1350 load_cr3(cr3 | CR3_PCID_SAVE);
1351 critical_exit();
1352 }
1353
1354 /*
1355 * For SMP, these functions have to use the IPI mechanism for coherence.
1356 *
1357 * N.B.: Before calling any of the following TLB invalidation functions,
1358 * the calling processor must ensure that all stores updating a non-
1359 * kernel page table are globally performed. Otherwise, another
1360 * processor could cache an old, pre-update entry without being
1361 * invalidated. This can happen one of two ways: (1) The pmap becomes
1362 * active on another processor after its pm_active field is checked by
1363 * one of the following functions but before a store updating the page
1364 * table is globally performed. (2) The pmap becomes active on another
1365 * processor before its pm_active field is checked but due to
1366 * speculative loads one of the following functions stills reads the
1367 * pmap as inactive on the other processor.
1368 *
1369 * The kernel page table is exempt because its pm_active field is
1370 * immutable. The kernel page table is always active on every
1371 * processor.
1372 */
1373
1374 /*
1375 * Interrupt the cpus that are executing in the guest context.
1376 * This will force the vcpu to exit and the cached EPT mappings
1377 * will be invalidated by the host before the next vmresume.
1378 */
1379 static __inline void
1380 pmap_invalidate_ept(pmap_t pmap)
1381 {
1382 int ipinum;
1383
1384 sched_pin();
1385 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1386 ("pmap_invalidate_ept: absurd pm_active"));
1387
1388 /*
1389 * The TLB mappings associated with a vcpu context are not
1390 * flushed each time a different vcpu is chosen to execute.
1391 *
1392 * This is in contrast with a process's vtop mappings that
1393 * are flushed from the TLB on each context switch.
1394 *
1395 * Therefore we need to do more than just a TLB shootdown on
1396 * the active cpus in 'pmap->pm_active'. To do this we keep
1397 * track of the number of invalidations performed on this pmap.
1398 *
1399 * Each vcpu keeps a cache of this counter and compares it
1400 * just before a vmresume. If the counter is out-of-date an
1401 * invept will be done to flush stale mappings from the TLB.
1402 */
1403 atomic_add_acq_long(&pmap->pm_eptgen, 1);
1404
1405 /*
1406 * Force the vcpu to exit and trap back into the hypervisor.
1407 */
1408 ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
1409 ipi_selected(pmap->pm_active, ipinum);
1410 sched_unpin();
1411 }
1412
1413 void
1414 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1415 {
1416 cpuset_t other_cpus;
1417 u_int cpuid;
1418
1419 if (pmap_type_guest(pmap)) {
1420 pmap_invalidate_ept(pmap);
1421 return;
1422 }
1423
1424 KASSERT(pmap->pm_type == PT_X86,
1425 ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
1426
1427 sched_pin();
1428 if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1429 if (!pmap_pcid_enabled) {
1430 invlpg(va);
1431 } else {
1432 if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
1433 if (pmap == PCPU_GET(curpmap))
1434 invlpg(va);
1435 else
1436 pmap_invalidate_page_pcid(pmap, va);
1437 } else {
1438 invltlb_globpcid();
1439 }
1440 }
1441 smp_invlpg(pmap, va);
1442 } else {
1443 cpuid = PCPU_GET(cpuid);
1444 other_cpus = all_cpus;
1445 CPU_CLR(cpuid, &other_cpus);
1446 if (CPU_ISSET(cpuid, &pmap->pm_active))
1447 invlpg(va);
1448 else if (pmap_pcid_enabled) {
1449 if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
1450 pmap_invalidate_page_pcid(pmap, va);
1451 else
1452 invltlb_globpcid();
1453 }
1454 if (pmap_pcid_enabled)
1455 CPU_AND(&other_cpus, &pmap->pm_save);
1456 else
1457 CPU_AND(&other_cpus, &pmap->pm_active);
1458 if (!CPU_EMPTY(&other_cpus))
1459 smp_masked_invlpg(other_cpus, pmap, va);
1460 }
1461 sched_unpin();
1462 }
1463
1464 static void
1465 pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1466 {
1467 struct invpcid_descr d;
1468 uint64_t cr3;
1469 vm_offset_t addr;
1470
1471 if (invpcid_works) {
1472 d.pcid = pmap->pm_pcid;
1473 d.pad = 0;
1474 for (addr = sva; addr < eva; addr += PAGE_SIZE) {
1475 d.addr = addr;
1476 invpcid(&d, INVPCID_ADDR);
1477 }
1478 return;
1479 }
1480
1481 cr3 = rcr3();
1482 critical_enter();
1483 load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
1484 for (addr = sva; addr < eva; addr += PAGE_SIZE)
1485 invlpg(addr);
1486 load_cr3(cr3 | CR3_PCID_SAVE);
1487 critical_exit();
1488 }
1489
1490 void
1491 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1492 {
1493 cpuset_t other_cpus;
1494 vm_offset_t addr;
1495 u_int cpuid;
1496
1497 if (pmap_type_guest(pmap)) {
1498 pmap_invalidate_ept(pmap);
1499 return;
1500 }
1501
1502 KASSERT(pmap->pm_type == PT_X86,
1503 ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
1504
1505 sched_pin();
1506 if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1507 if (!pmap_pcid_enabled) {
1508 for (addr = sva; addr < eva; addr += PAGE_SIZE)
1509 invlpg(addr);
1510 } else {
1511 if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
1512 if (pmap == PCPU_GET(curpmap)) {
1513 for (addr = sva; addr < eva;
1514 addr += PAGE_SIZE)
1515 invlpg(addr);
1516 } else {
1517 pmap_invalidate_range_pcid(pmap,
1518 sva, eva);
1519 }
1520 } else {
1521 invltlb_globpcid();
1522 }
1523 }
1524 smp_invlpg_range(pmap, sva, eva);
1525 } else {
1526 cpuid = PCPU_GET(cpuid);
1527 other_cpus = all_cpus;
1528 CPU_CLR(cpuid, &other_cpus);
1529 if (CPU_ISSET(cpuid, &pmap->pm_active)) {
1530 for (addr = sva; addr < eva; addr += PAGE_SIZE)
1531 invlpg(addr);
1532 } else if (pmap_pcid_enabled) {
1533 if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
1534 pmap_invalidate_range_pcid(pmap, sva, eva);
1535 else
1536 invltlb_globpcid();
1537 }
1538 if (pmap_pcid_enabled)
1539 CPU_AND(&other_cpus, &pmap->pm_save);
1540 else
1541 CPU_AND(&other_cpus, &pmap->pm_active);
1542 if (!CPU_EMPTY(&other_cpus))
1543 smp_masked_invlpg_range(other_cpus, pmap, sva, eva);
1544 }
1545 sched_unpin();
1546 }
1547
1548 void
1549 pmap_invalidate_all(pmap_t pmap)
1550 {
1551 cpuset_t other_cpus;
1552 struct invpcid_descr d;
1553 uint64_t cr3;
1554 u_int cpuid;
1555
1556 if (pmap_type_guest(pmap)) {
1557 pmap_invalidate_ept(pmap);
1558 return;
1559 }
1560
1561 KASSERT(pmap->pm_type == PT_X86,
1562 ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
1563
1564 sched_pin();
1565 cpuid = PCPU_GET(cpuid);
1566 if (pmap == kernel_pmap ||
1567 (pmap_pcid_enabled && !CPU_CMP(&pmap->pm_save, &all_cpus)) ||
1568 !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1569 if (invpcid_works) {
1570 bzero(&d, sizeof(d));
1571 invpcid(&d, INVPCID_CTXGLOB);
1572 } else {
1573 invltlb_globpcid();
1574 }
1575 if (!CPU_ISSET(cpuid, &pmap->pm_active))
1576 CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
1577 smp_invltlb(pmap);
1578 } else {
1579 other_cpus = all_cpus;
1580 CPU_CLR(cpuid, &other_cpus);
1581
1582 /*
1583 * This logic is duplicated in the Xinvltlb shootdown
1584 * IPI handler.
1585 */
1586 if (pmap_pcid_enabled) {
1587 if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
1588 if (invpcid_works) {
1589 d.pcid = pmap->pm_pcid;
1590 d.pad = 0;
1591 d.addr = 0;
1592 invpcid(&d, INVPCID_CTX);
1593 } else {
1594 cr3 = rcr3();
1595 critical_enter();
1596
1597 /*
1598 * Bit 63 is clear, pcid TLB
1599 * entries are invalidated.
1600 */
1601 load_cr3(pmap->pm_cr3);
1602 load_cr3(cr3 | CR3_PCID_SAVE);
1603 critical_exit();
1604 }
1605 } else {
1606 invltlb_globpcid();
1607 }
1608 } else if (CPU_ISSET(cpuid, &pmap->pm_active))
1609 invltlb();
1610 if (!CPU_ISSET(cpuid, &pmap->pm_active))
1611 CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
1612 if (pmap_pcid_enabled)
1613 CPU_AND(&other_cpus, &pmap->pm_save);
1614 else
1615 CPU_AND(&other_cpus, &pmap->pm_active);
1616 if (!CPU_EMPTY(&other_cpus))
1617 smp_masked_invltlb(other_cpus, pmap);
1618 }
1619 sched_unpin();
1620 }
1621
1622 void
1623 pmap_invalidate_cache(void)
1624 {
1625
1626 sched_pin();
1627 wbinvd();
1628 smp_cache_flush();
1629 sched_unpin();
1630 }
1631
1632 struct pde_action {
1633 cpuset_t invalidate; /* processors that invalidate their TLB */
1634 pmap_t pmap;
1635 vm_offset_t va;
1636 pd_entry_t *pde;
1637 pd_entry_t newpde;
1638 u_int store; /* processor that updates the PDE */
1639 };
1640
1641 static void
1642 pmap_update_pde_action(void *arg)
1643 {
1644 struct pde_action *act = arg;
1645
1646 if (act->store == PCPU_GET(cpuid))
1647 pmap_update_pde_store(act->pmap, act->pde, act->newpde);
1648 }
1649
1650 static void
1651 pmap_update_pde_teardown(void *arg)
1652 {
1653 struct pde_action *act = arg;
1654
1655 if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1656 pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
1657 }
1658
1659 /*
1660 * Change the page size for the specified virtual address in a way that
1661 * prevents any possibility of the TLB ever having two entries that map the
1662 * same virtual address using different page sizes. This is the recommended
1663 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a
1664 * machine check exception for a TLB state that is improperly diagnosed as a
1665 * hardware error.
1666 */
1667 static void
1668 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1669 {
1670 struct pde_action act;
1671 cpuset_t active, other_cpus;
1672 u_int cpuid;
1673
1674 sched_pin();
1675 cpuid = PCPU_GET(cpuid);
1676 other_cpus = all_cpus;
1677 CPU_CLR(cpuid, &other_cpus);
1678 if (pmap == kernel_pmap || pmap_type_guest(pmap))
1679 active = all_cpus;
1680 else {
1681 active = pmap->pm_active;
1682 CPU_AND_ATOMIC(&pmap->pm_save, &active);
1683 }
1684 if (CPU_OVERLAP(&active, &other_cpus)) {
1685 act.store = cpuid;
1686 act.invalidate = active;
1687 act.va = va;
1688 act.pmap = pmap;
1689 act.pde = pde;
1690 act.newpde = newpde;
1691 CPU_SET(cpuid, &active);
1692 smp_rendezvous_cpus(active,
1693 smp_no_rendevous_barrier, pmap_update_pde_action,
1694 pmap_update_pde_teardown, &act);
1695 } else {
1696 pmap_update_pde_store(pmap, pde, newpde);
1697 if (CPU_ISSET(cpuid, &active))
1698 pmap_update_pde_invalidate(pmap, va, newpde);
1699 }
1700 sched_unpin();
1701 }
1702 #else /* !SMP */
1703 /*
1704 * Normal, non-SMP, invalidation functions.
1705 * We inline these within pmap.c for speed.
1706 */
1707 PMAP_INLINE void
1708 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1709 {
1710
1711 switch (pmap->pm_type) {
1712 case PT_X86:
1713 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1714 invlpg(va);
1715 break;
1716 case PT_RVI:
1717 case PT_EPT:
1718 pmap->pm_eptgen++;
1719 break;
1720 default:
1721 panic("pmap_invalidate_page: unknown type: %d", pmap->pm_type);
1722 }
1723 }
1724
1725 PMAP_INLINE void
1726 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1727 {
1728 vm_offset_t addr;
1729
1730 switch (pmap->pm_type) {
1731 case PT_X86:
1732 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1733 for (addr = sva; addr < eva; addr += PAGE_SIZE)
1734 invlpg(addr);
1735 break;
1736 case PT_RVI:
1737 case PT_EPT:
1738 pmap->pm_eptgen++;
1739 break;
1740 default:
1741 panic("pmap_invalidate_range: unknown type: %d", pmap->pm_type);
1742 }
1743 }
1744
1745 PMAP_INLINE void
1746 pmap_invalidate_all(pmap_t pmap)
1747 {
1748
1749 switch (pmap->pm_type) {
1750 case PT_X86:
1751 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1752 invltlb();
1753 break;
1754 case PT_RVI:
1755 case PT_EPT:
1756 pmap->pm_eptgen++;
1757 break;
1758 default:
1759 panic("pmap_invalidate_all: unknown type %d", pmap->pm_type);
1760 }
1761 }
1762
1763 PMAP_INLINE void
1764 pmap_invalidate_cache(void)
1765 {
1766
1767 wbinvd();
1768 }
1769
1770 static void
1771 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1772 {
1773
1774 pmap_update_pde_store(pmap, pde, newpde);
1775 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1776 pmap_update_pde_invalidate(pmap, va, newpde);
1777 else
1778 CPU_ZERO(&pmap->pm_save);
1779 }
1780 #endif /* !SMP */
1781
1782 static void
1783 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
1784 {
1785
1786 /*
1787 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created
1788 * by a promotion that did not invalidate the 512 4KB page mappings
1789 * that might exist in the TLB. Consequently, at this point, the TLB
1790 * may hold both 4KB and 2MB page mappings for the address range [va,
1791 * va + NBPDR). Therefore, the entire range must be invalidated here.
1792 * In contrast, when PG_PROMOTED is clear, the TLB will not hold any
1793 * 4KB page mappings for the address range [va, va + NBPDR), and so a
1794 * single INVLPG suffices to invalidate the 2MB page mapping from the
1795 * TLB.
1796 */
1797 if ((pde & PG_PROMOTED) != 0)
1798 pmap_invalidate_range(pmap, va, va + NBPDR - 1);
1799 else
1800 pmap_invalidate_page(pmap, va);
1801 }
1802
1803 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024)
1804
1805 void
1806 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force)
1807 {
1808
1809 if (force) {
1810 sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);
1811 } else {
1812 KASSERT((sva & PAGE_MASK) == 0,
1813 ("pmap_invalidate_cache_range: sva not page-aligned"));
1814 KASSERT((eva & PAGE_MASK) == 0,
1815 ("pmap_invalidate_cache_range: eva not page-aligned"));
1816 }
1817
1818 if ((cpu_feature & CPUID_SS) != 0 && !force)
1819 ; /* If "Self Snoop" is supported and allowed, do nothing. */
1820 else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 &&
1821 eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1822 /*
1823 * XXX: Some CPUs fault, hang, or trash the local APIC
1824 * registers if we use CLFLUSH on the local APIC
1825 * range. The local APIC is always uncached, so we
1826 * don't need to flush for that range anyway.
1827 */
1828 if (pmap_kextract(sva) == lapic_paddr)
1829 return;
1830
1831 /*
1832 * Otherwise, do per-cache line flush. Use the sfence
1833 * instruction to insure that previous stores are
1834 * included in the write-back. The processor
1835 * propagates flush to other processors in the cache
1836 * coherence domain.
1837 */
1838 sfence();
1839 for (; sva < eva; sva += cpu_clflush_line_size)
1840 clflushopt(sva);
1841 sfence();
1842 } else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1843 eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1844 if (pmap_kextract(sva) == lapic_paddr)
1845 return;
1846 /*
1847 * Writes are ordered by CLFLUSH on Intel CPUs.
1848 */
1849 if (cpu_vendor_id != CPU_VENDOR_INTEL)
1850 mfence();
1851 for (; sva < eva; sva += cpu_clflush_line_size)
1852 clflush(sva);
1853 if (cpu_vendor_id != CPU_VENDOR_INTEL)
1854 mfence();
1855 } else {
1856
1857 /*
1858 * No targeted cache flush methods are supported by CPU,
1859 * or the supplied range is bigger than 2MB.
1860 * Globally invalidate cache.
1861 */
1862 pmap_invalidate_cache();
1863 }
1864 }
1865
1866 /*
1867 * Remove the specified set of pages from the data and instruction caches.
1868 *
1869 * In contrast to pmap_invalidate_cache_range(), this function does not
1870 * rely on the CPU's self-snoop feature, because it is intended for use
1871 * when moving pages into a different cache domain.
1872 */
1873 void
1874 pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1875 {
1876 vm_offset_t daddr, eva;
1877 int i;
1878 bool useclflushopt;
1879
1880 useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
1881 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1882 ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt))
1883 pmap_invalidate_cache();
1884 else {
1885 if (useclflushopt)
1886 sfence();
1887 else if (cpu_vendor_id != CPU_VENDOR_INTEL)
1888 mfence();
1889 for (i = 0; i < count; i++) {
1890 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
1891 eva = daddr + PAGE_SIZE;
1892 for (; daddr < eva; daddr += cpu_clflush_line_size) {
1893 if (useclflushopt)
1894 clflushopt(daddr);
1895 else
1896 clflush(daddr);
1897 }
1898 }
1899 if (useclflushopt)
1900 sfence();
1901 else if (cpu_vendor_id != CPU_VENDOR_INTEL)
1902 mfence();
1903 }
1904 }
1905
1906 /*
1907 * Routine: pmap_extract
1908 * Function:
1909 * Extract the physical page address associated
1910 * with the given map/virtual_address pair.
1911 */
1912 vm_paddr_t
1913 pmap_extract(pmap_t pmap, vm_offset_t va)
1914 {
1915 pdp_entry_t *pdpe;
1916 pd_entry_t *pde;
1917 pt_entry_t *pte, PG_V;
1918 vm_paddr_t pa;
1919
1920 pa = 0;
1921 PG_V = pmap_valid_bit(pmap);
1922 PMAP_LOCK(pmap);
1923 pdpe = pmap_pdpe(pmap, va);
1924 if (pdpe != NULL && (*pdpe & PG_V) != 0) {
1925 if ((*pdpe & PG_PS) != 0)
1926 pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
1927 else {
1928 pde = pmap_pdpe_to_pde(pdpe, va);
1929 if ((*pde & PG_V) != 0) {
1930 if ((*pde & PG_PS) != 0) {
1931 pa = (*pde & PG_PS_FRAME) |
1932 (va & PDRMASK);
1933 } else {
1934 pte = pmap_pde_to_pte(pde, va);
1935 pa = (*pte & PG_FRAME) |
1936 (va & PAGE_MASK);
1937 }
1938 }
1939 }
1940 }
1941 PMAP_UNLOCK(pmap);
1942 return (pa);
1943 }
1944
1945 /*
1946 * Routine: pmap_extract_and_hold
1947 * Function:
1948 * Atomically extract and hold the physical page
1949 * with the given pmap and virtual address pair
1950 * if that mapping permits the given protection.
1951 */
1952 vm_page_t
1953 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1954 {
1955 pd_entry_t pde, *pdep;
1956 pt_entry_t pte, PG_RW, PG_V;
1957 vm_paddr_t pa;
1958 vm_page_t m;
1959
1960 pa = 0;
1961 m = NULL;
1962 PG_RW = pmap_rw_bit(pmap);
1963 PG_V = pmap_valid_bit(pmap);
1964 PMAP_LOCK(pmap);
1965 retry:
1966 pdep = pmap_pde(pmap, va);
1967 if (pdep != NULL && (pde = *pdep)) {
1968 if (pde & PG_PS) {
1969 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1970 if (vm_page_pa_tryrelock(pmap, (pde &
1971 PG_PS_FRAME) | (va & PDRMASK), &pa))
1972 goto retry;
1973 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1974 (va & PDRMASK));
1975 vm_page_hold(m);
1976 }
1977 } else {
1978 pte = *pmap_pde_to_pte(pdep, va);
1979 if ((pte & PG_V) &&
1980 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1981 if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
1982 &pa))
1983 goto retry;
1984 m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1985 vm_page_hold(m);
1986 }
1987 }
1988 }
1989 PA_UNLOCK_COND(pa);
1990 PMAP_UNLOCK(pmap);
1991 return (m);
1992 }
1993
1994 vm_paddr_t
1995 pmap_kextract(vm_offset_t va)
1996 {
1997 pd_entry_t pde;
1998 vm_paddr_t pa;
1999
2000 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
2001 pa = DMAP_TO_PHYS(va);
2002 } else {
2003 pde = *vtopde(va);
2004 if (pde & PG_PS) {
2005 pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
2006 } else {
2007 /*
2008 * Beware of a concurrent promotion that changes the
2009 * PDE at this point! For example, vtopte() must not
2010 * be used to access the PTE because it would use the
2011 * new PDE. It is, however, safe to use the old PDE
2012 * because the page table page is preserved by the
2013 * promotion.
2014 */
2015 pa = *pmap_pde_to_pte(&pde, va);
2016 pa = (pa & PG_FRAME) | (va & PAGE_MASK);
2017 }
2018 }
2019 return (pa);
2020 }
2021
2022 /***************************************************
2023 * Low level mapping routines.....
2024 ***************************************************/
2025
2026 /*
2027 * Add a wired page to the kva.
2028 * Note: not SMP coherent.
2029 */
2030 PMAP_INLINE void
2031 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
2032 {
2033 pt_entry_t *pte;
2034
2035 pte = vtopte(va);
2036 pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G);
2037 }
2038
2039 static __inline void
2040 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
2041 {
2042 pt_entry_t *pte;
2043 int cache_bits;
2044
2045 pte = vtopte(va);
2046 cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
2047 pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits);
2048 }
2049
2050 /*
2051 * Remove a page from the kernel pagetables.
2052 * Note: not SMP coherent.
2053 */
2054 PMAP_INLINE void
2055 pmap_kremove(vm_offset_t va)
2056 {
2057 pt_entry_t *pte;
2058
2059 pte = vtopte(va);
2060 pte_clear(pte);
2061 }
2062
2063 /*
2064 * Used to map a range of physical addresses into kernel
2065 * virtual address space.
2066 *
2067 * The value passed in '*virt' is a suggested virtual address for
2068 * the mapping. Architectures which can support a direct-mapped
2069 * physical to virtual region can return the appropriate address
2070 * within that region, leaving '*virt' unchanged. Other
2071 * architectures should map the pages starting at '*virt' and
2072 * update '*virt' with the first usable address after the mapped
2073 * region.
2074 */
2075 vm_offset_t
2076 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
2077 {
2078 return PHYS_TO_DMAP(start);
2079 }
2080
2081
2082 /*
2083 * Add a list of wired pages to the kva
2084 * this routine is only used for temporary
2085 * kernel mappings that do not need to have
2086 * page modification or references recorded.
2087 * Note that old mappings are simply written
2088 * over. The page *must* be wired.
2089 * Note: SMP coherent. Uses a ranged shootdown IPI.
2090 */
2091 void
2092 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
2093 {
2094 pt_entry_t *endpte, oldpte, pa, *pte;
2095 vm_page_t m;
2096 int cache_bits;
2097
2098 oldpte = 0;
2099 pte = vtopte(sva);
2100 endpte = pte + count;
2101 while (pte < endpte) {
2102 m = *ma++;
2103 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
2104 pa = VM_PAGE_TO_PHYS(m) | cache_bits;
2105 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
2106 oldpte |= *pte;
2107 pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V);
2108 }
2109 pte++;
2110 }
2111 if (__predict_false((oldpte & X86_PG_V) != 0))
2112 pmap_invalidate_range(kernel_pmap, sva, sva + count *
2113 PAGE_SIZE);
2114 }
2115
2116 /*
2117 * This routine tears out page mappings from the
2118 * kernel -- it is meant only for temporary mappings.
2119 * Note: SMP coherent. Uses a ranged shootdown IPI.
2120 */
2121 void
2122 pmap_qremove(vm_offset_t sva, int count)
2123 {
2124 vm_offset_t va;
2125
2126 va = sva;
2127 while (count-- > 0) {
2128 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
2129 pmap_kremove(va);
2130 va += PAGE_SIZE;
2131 }
2132 pmap_invalidate_range(kernel_pmap, sva, va);
2133 }
2134
2135 /***************************************************
2136 * Page table page management routines.....
2137 ***************************************************/
2138 static __inline void
2139 pmap_free_zero_pages(struct spglist *free)
2140 {
2141 vm_page_t m;
2142
2143 while ((m = SLIST_FIRST(free)) != NULL) {
2144 SLIST_REMOVE_HEAD(free, plinks.s.ss);
2145 /* Preserve the page's PG_ZERO setting. */
2146 vm_page_free_toq(m);
2147 }
2148 }
2149
2150 /*
2151 * Schedule the specified unused page table page to be freed. Specifically,
2152 * add the page to the specified list of pages that will be released to the
2153 * physical memory manager after the TLB has been updated.
2154 */
2155 static __inline void
2156 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
2157 boolean_t set_PG_ZERO)
2158 {
2159
2160 if (set_PG_ZERO)
2161 m->flags |= PG_ZERO;
2162 else
2163 m->flags &= ~PG_ZERO;
2164 SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2165 }
2166
2167 /*
2168 * Inserts the specified page table page into the specified pmap's collection
2169 * of idle page table pages. Each of a pmap's page table pages is responsible
2170 * for mapping a distinct range of virtual addresses. The pmap's collection is
2171 * ordered by this virtual address range.
2172 */
2173 static __inline int
2174 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
2175 {
2176
2177 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2178 return (vm_radix_insert(&pmap->pm_root, mpte));
2179 }
2180
2181 /*
2182 * Looks for a page table page mapping the specified virtual address in the
2183 * specified pmap's collection of idle page table pages. Returns NULL if there
2184 * is no page table page corresponding to the specified virtual address.
2185 */
2186 static __inline vm_page_t
2187 pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
2188 {
2189
2190 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2191 return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va)));
2192 }
2193
2194 /*
2195 * Removes the specified page table page from the specified pmap's collection
2196 * of idle page table pages. The specified page table page must be a member of
2197 * the pmap's collection.
2198 */
2199 static __inline void
2200 pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
2201 {
2202
2203 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2204 vm_radix_remove(&pmap->pm_root, mpte->pindex);
2205 }
2206
2207 /*
2208 * Decrements a page table page's wire count, which is used to record the
2209 * number of valid page table entries within the page. If the wire count
2210 * drops to zero, then the page table page is unmapped. Returns TRUE if the
2211 * page table page was unmapped and FALSE otherwise.
2212 */
2213 static inline boolean_t
2214 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2215 {
2216
2217 --m->wire_count;
2218 if (m->wire_count == 0) {
2219 _pmap_unwire_ptp(pmap, va, m, free);
2220 return (TRUE);
2221 } else
2222 return (FALSE);
2223 }
2224
2225 static void
2226 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2227 {
2228
2229 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2230 /*
2231 * unmap the page table page
2232 */
2233 if (m->pindex >= (NUPDE + NUPDPE)) {
2234 /* PDP page */
2235 pml4_entry_t *pml4;
2236 pml4 = pmap_pml4e(pmap, va);
2237 *pml4 = 0;
2238 } else if (m->pindex >= NUPDE) {
2239 /* PD page */
2240 pdp_entry_t *pdp;
2241 pdp = pmap_pdpe(pmap, va);
2242 *pdp = 0;
2243 } else {
2244 /* PTE page */
2245 pd_entry_t *pd;
2246 pd = pmap_pde(pmap, va);
2247 *pd = 0;
2248 }
2249 pmap_resident_count_dec(pmap, 1);
2250 if (m->pindex < NUPDE) {
2251 /* We just released a PT, unhold the matching PD */
2252 vm_page_t pdpg;
2253
2254 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
2255 pmap_unwire_ptp(pmap, va, pdpg, free);
2256 }
2257 if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
2258 /* We just released a PD, unhold the matching PDP */
2259 vm_page_t pdppg;
2260
2261 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
2262 pmap_unwire_ptp(pmap, va, pdppg, free);
2263 }
2264
2265 /*
2266 * This is a release store so that the ordinary store unmapping
2267 * the page table page is globally performed before TLB shoot-
2268 * down is begun.
2269 */
2270 atomic_subtract_rel_int(&cnt.v_wire_count, 1);
2271
2272 /*
2273 * Put page on a list so that it is released after
2274 * *ALL* TLB shootdown is done
2275 */
2276 pmap_add_delayed_free_list(m, free, TRUE);
2277 }
2278
2279 /*
2280 * After removing a page table entry, this routine is used to
2281 * conditionally free the page, and manage the hold/wire counts.
2282 */
2283 static int
2284 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2285 struct spglist *free)
2286 {
2287 vm_page_t mpte;
2288
2289 if (va >= VM_MAXUSER_ADDRESS)
2290 return (0);
2291 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2292 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
2293 return (pmap_unwire_ptp(pmap, va, mpte, free));
2294 }
2295
2296 void
2297 pmap_pinit0(pmap_t pmap)
2298 {
2299
2300 PMAP_LOCK_INIT(pmap);
2301 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
2302 pmap->pm_cr3 = KPML4phys;
2303 pmap->pm_root.rt_root = 0;
2304 CPU_ZERO(&pmap->pm_active);
2305 CPU_ZERO(&pmap->pm_save);
2306 PCPU_SET(curpmap, pmap);
2307 TAILQ_INIT(&pmap->pm_pvchunk);
2308 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2309 pmap->pm_pcid = pmap_pcid_enabled ? 0 : -1;
2310 pmap->pm_flags = pmap_flags;
2311 }
2312
2313 /*
2314 * Initialize a preallocated and zeroed pmap structure,
2315 * such as one in a vmspace structure.
2316 */
2317 int
2318 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
2319 {
2320 vm_page_t pml4pg;
2321 vm_paddr_t pml4phys;
2322 int i;
2323
2324 /*
2325 * allocate the page directory page
2326 */
2327 while ((pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2328 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
2329 VM_WAIT;
2330
2331 pml4phys = VM_PAGE_TO_PHYS(pml4pg);
2332 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
2333 pmap->pm_pcid = -1;
2334 pmap->pm_cr3 = ~0; /* initialize to an invalid value */
2335
2336 if ((pml4pg->flags & PG_ZERO) == 0)
2337 pagezero(pmap->pm_pml4);
2338
2339 /*
2340 * Do not install the host kernel mappings in the nested page
2341 * tables. These mappings are meaningless in the guest physical
2342 * address space.
2343 */
2344 if ((pmap->pm_type = pm_type) == PT_X86) {
2345 pmap->pm_cr3 = pml4phys;
2346
2347 /* Wire in kernel global address entries. */
2348 for (i = 0; i < NKPML4E; i++) {
2349 pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) |
2350 X86_PG_RW | X86_PG_V | PG_U;
2351 }
2352 for (i = 0; i < ndmpdpphys; i++) {
2353 pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) |
2354 X86_PG_RW | X86_PG_V | PG_U;
2355 }
2356
2357 /* install self-referential address mapping entry(s) */
2358 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) |
2359 X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
2360
2361 if (pmap_pcid_enabled) {
2362 pmap->pm_pcid = alloc_unr(&pcid_unr);
2363 if (pmap->pm_pcid != -1)
2364 pmap->pm_cr3 |= pmap->pm_pcid;
2365 }
2366 }
2367
2368 pmap->pm_root.rt_root = 0;
2369 CPU_ZERO(&pmap->pm_active);
2370 TAILQ_INIT(&pmap->pm_pvchunk);
2371 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2372 pmap->pm_flags = flags;
2373 pmap->pm_eptgen = 0;
2374 CPU_ZERO(&pmap->pm_save);
2375
2376 return (1);
2377 }
2378
2379 int
2380 pmap_pinit(pmap_t pmap)
2381 {
2382
2383 return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
2384 }
2385
2386 /*
2387 * This routine is called if the desired page table page does not exist.
2388 *
2389 * If page table page allocation fails, this routine may sleep before
2390 * returning NULL. It sleeps only if a lock pointer was given.
2391 *
2392 * Note: If a page allocation fails at page table level two or three,
2393 * one or two pages may be held during the wait, only to be released
2394 * afterwards. This conservative approach is easily argued to avoid
2395 * race conditions.
2396 */
2397 static vm_page_t
2398 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2399 {
2400 vm_page_t m, pdppg, pdpg;
2401 pt_entry_t PG_A, PG_M, PG_RW, PG_V;
2402
2403 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2404
2405 PG_A = pmap_accessed_bit(pmap);
2406 PG_M = pmap_modified_bit(pmap);
2407 PG_V = pmap_valid_bit(pmap);
2408 PG_RW = pmap_rw_bit(pmap);
2409
2410 /*
2411 * Allocate a page table page.
2412 */
2413 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
2414 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2415 if (lockp != NULL) {
2416 RELEASE_PV_LIST_LOCK(lockp);
2417 PMAP_UNLOCK(pmap);
2418 rw_runlock(&pvh_global_lock);
2419 VM_WAIT;
2420 rw_rlock(&pvh_global_lock);
2421 PMAP_LOCK(pmap);
2422 }
2423
2424 /*
2425 * Indicate the need to retry. While waiting, the page table
2426 * page may have been allocated.
2427 */
2428 return (NULL);
2429 }
2430 if ((m->flags & PG_ZERO) == 0)
2431 pmap_zero_page(m);
2432
2433 /*
2434 * Map the pagetable page into the process address space, if
2435 * it isn't already there.
2436 */
2437
2438 if (ptepindex >= (NUPDE + NUPDPE)) {
2439 pml4_entry_t *pml4;
2440 vm_pindex_t pml4index;
2441
2442 /* Wire up a new PDPE page */
2443 pml4index = ptepindex - (NUPDE + NUPDPE);
2444 pml4 = &pmap->pm_pml4[pml4index];
2445 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2446
2447 } else if (ptepindex >= NUPDE) {
2448 vm_pindex_t pml4index;
2449 vm_pindex_t pdpindex;
2450 pml4_entry_t *pml4;
2451 pdp_entry_t *pdp;
2452
2453 /* Wire up a new PDE page */
2454 pdpindex = ptepindex - NUPDE;
2455 pml4index = pdpindex >> NPML4EPGSHIFT;
2456
2457 pml4 = &pmap->pm_pml4[pml4index];
2458 if ((*pml4 & PG_V) == 0) {
2459 /* Have to allocate a new pdp, recurse */
2460 if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
2461 lockp) == NULL) {
2462 --m->wire_count;
2463 atomic_subtract_int(&cnt.v_wire_count, 1);
2464 vm_page_free_zero(m);
2465 return (NULL);
2466 }
2467 } else {
2468 /* Add reference to pdp page */
2469 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
2470 pdppg->wire_count++;
2471 }
2472 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2473
2474 /* Now find the pdp page */
2475 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2476 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2477
2478 } else {
2479 vm_pindex_t pml4index;
2480 vm_pindex_t pdpindex;
2481 pml4_entry_t *pml4;
2482 pdp_entry_t *pdp;
2483 pd_entry_t *pd;
2484
2485 /* Wire up a new PTE page */
2486 pdpindex = ptepindex >> NPDPEPGSHIFT;
2487 pml4index = pdpindex >> NPML4EPGSHIFT;
2488
2489 /* First, find the pdp and check that its valid. */
2490 pml4 = &pmap->pm_pml4[pml4index];
2491 if ((*pml4 & PG_V) == 0) {
2492 /* Have to allocate a new pd, recurse */
2493 if (_pmap_allocpte(pmap, NUPDE + pdpindex,
2494 lockp) == NULL) {
2495 --m->wire_count;
2496 atomic_subtract_int(&cnt.v_wire_count, 1);
2497 vm_page_free_zero(m);
2498 return (NULL);
2499 }
2500 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2501 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2502 } else {
2503 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2504 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2505 if ((*pdp & PG_V) == 0) {
2506 /* Have to allocate a new pd, recurse */
2507 if (_pmap_allocpte(pmap, NUPDE + pdpindex,
2508 lockp) == NULL) {
2509 --m->wire_count;
2510 atomic_subtract_int(&cnt.v_wire_count,
2511 1);
2512 vm_page_free_zero(m);
2513 return (NULL);
2514 }
2515 } else {
2516 /* Add reference to the pd page */
2517 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
2518 pdpg->wire_count++;
2519 }
2520 }
2521 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
2522
2523 /* Now we know where the page directory page is */
2524 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
2525 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2526 }
2527
2528 pmap_resident_count_inc(pmap, 1);
2529
2530 return (m);
2531 }
2532
2533 static vm_page_t
2534 pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2535 {
2536 vm_pindex_t pdpindex, ptepindex;
2537 pdp_entry_t *pdpe, PG_V;
2538 vm_page_t pdpg;
2539
2540 PG_V = pmap_valid_bit(pmap);
2541
2542 retry:
2543 pdpe = pmap_pdpe(pmap, va);
2544 if (pdpe != NULL && (*pdpe & PG_V) != 0) {
2545 /* Add a reference to the pd page. */
2546 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
2547 pdpg->wire_count++;
2548 } else {
2549 /* Allocate a pd page. */
2550 ptepindex = pmap_pde_pindex(va);
2551 pdpindex = ptepindex >> NPDPEPGSHIFT;
2552 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
2553 if (pdpg == NULL && lockp != NULL)
2554 goto retry;
2555 }
2556 return (pdpg);
2557 }
2558
2559 static vm_page_t
2560 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2561 {
2562 vm_pindex_t ptepindex;
2563 pd_entry_t *pd, PG_V;
2564 vm_page_t m;
2565
2566 PG_V = pmap_valid_bit(pmap);
2567
2568 /*
2569 * Calculate pagetable page index
2570 */
2571 ptepindex = pmap_pde_pindex(va);
2572 retry:
2573 /*
2574 * Get the page directory entry
2575 */
2576 pd = pmap_pde(pmap, va);
2577
2578 /*
2579 * This supports switching from a 2MB page to a
2580 * normal 4K page.
2581 */
2582 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
2583 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
2584 /*
2585 * Invalidation of the 2MB page mapping may have caused
2586 * the deallocation of the underlying PD page.
2587 */
2588 pd = NULL;
2589 }
2590 }
2591
2592 /*
2593 * If the page table page is mapped, we just increment the
2594 * hold count, and activate it.
2595 */
2596 if (pd != NULL && (*pd & PG_V) != 0) {
2597 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
2598 m->wire_count++;
2599 } else {
2600 /*
2601 * Here if the pte page isn't mapped, or if it has been
2602 * deallocated.
2603 */
2604 m = _pmap_allocpte(pmap, ptepindex, lockp);
2605 if (m == NULL && lockp != NULL)
2606 goto retry;
2607 }
2608 return (m);
2609 }
2610
2611
2612 /***************************************************
2613 * Pmap allocation/deallocation routines.
2614 ***************************************************/
2615
2616 /*
2617 * Release any resources held by the given physical map.
2618 * Called when a pmap initialized by pmap_pinit is being released.
2619 * Should only be called if the map contains no valid mappings.
2620 */
2621 void
2622 pmap_release(pmap_t pmap)
2623 {
2624 vm_page_t m;
2625 int i;
2626
2627 KASSERT(pmap->pm_stats.resident_count == 0,
2628 ("pmap_release: pmap resident count %ld != 0",
2629 pmap->pm_stats.resident_count));
2630 KASSERT(vm_radix_is_empty(&pmap->pm_root),
2631 ("pmap_release: pmap has reserved page table page(s)"));
2632
2633 if (pmap_pcid_enabled) {
2634 /*
2635 * Invalidate any left TLB entries, to allow the reuse
2636 * of the pcid.
2637 */
2638 pmap_invalidate_all(pmap);
2639 }
2640
2641 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
2642
2643 for (i = 0; i < NKPML4E; i++) /* KVA */
2644 pmap->pm_pml4[KPML4BASE + i] = 0;
2645 for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
2646 pmap->pm_pml4[DMPML4I + i] = 0;
2647 pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */
2648
2649 m->wire_count--;
2650 atomic_subtract_int(&cnt.v_wire_count, 1);
2651 vm_page_free_zero(m);
2652 if (pmap->pm_pcid != -1)
2653 free_unr(&pcid_unr, pmap->pm_pcid);
2654 }
2655
2656 static int
2657 kvm_size(SYSCTL_HANDLER_ARGS)
2658 {
2659 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
2660
2661 return sysctl_handle_long(oidp, &ksize, 0, req);
2662 }
2663 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2664 0, 0, kvm_size, "LU", "Size of KVM");
2665
2666 static int
2667 kvm_free(SYSCTL_HANDLER_ARGS)
2668 {
2669 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2670
2671 return sysctl_handle_long(oidp, &kfree, 0, req);
2672 }
2673 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2674 0, 0, kvm_free, "LU", "Amount of KVM free");
2675
2676 /*
2677 * grow the number of kernel page table entries, if needed
2678 */
2679 void
2680 pmap_growkernel(vm_offset_t addr)
2681 {
2682 vm_paddr_t paddr;
2683 vm_page_t nkpg;
2684 pd_entry_t *pde, newpdir;
2685 pdp_entry_t *pdpe;
2686
2687 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2688
2689 /*
2690 * Return if "addr" is within the range of kernel page table pages
2691 * that were preallocated during pmap bootstrap. Moreover, leave
2692 * "kernel_vm_end" and the kernel page table as they were.
2693 *
2694 * The correctness of this action is based on the following
2695 * argument: vm_map_insert() allocates contiguous ranges of the
2696 * kernel virtual address space. It calls this function if a range
2697 * ends after "kernel_vm_end". If the kernel is mapped between
2698 * "kernel_vm_end" and "addr", then the range cannot begin at
2699 * "kernel_vm_end". In fact, its beginning address cannot be less
2700 * than the kernel. Thus, there is no immediate need to allocate
2701 * any new kernel page table pages between "kernel_vm_end" and
2702 * "KERNBASE".
2703 */
2704 if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
2705 return;
2706
2707 addr = roundup2(addr, NBPDR);
2708 if (addr - 1 >= kernel_map->max_offset)
2709 addr = kernel_map->max_offset;
2710 while (kernel_vm_end < addr) {
2711 pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
2712 if ((*pdpe & X86_PG_V) == 0) {
2713 /* We need a new PDP entry */
2714 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
2715 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
2716 VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2717 if (nkpg == NULL)
2718 panic("pmap_growkernel: no memory to grow kernel");
2719 if ((nkpg->flags & PG_ZERO) == 0)
2720 pmap_zero_page(nkpg);
2721 paddr = VM_PAGE_TO_PHYS(nkpg);
2722 *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
2723 X86_PG_A | X86_PG_M);
2724 continue; /* try again */
2725 }
2726 pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
2727 if ((*pde & X86_PG_V) != 0) {
2728 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2729 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2730 kernel_vm_end = kernel_map->max_offset;
2731 break;
2732 }
2733 continue;
2734 }
2735
2736 nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
2737 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2738 VM_ALLOC_ZERO);
2739 if (nkpg == NULL)
2740 panic("pmap_growkernel: no memory to grow kernel");
2741 if ((nkpg->flags & PG_ZERO) == 0)
2742 pmap_zero_page(nkpg);
2743 paddr = VM_PAGE_TO_PHYS(nkpg);
2744 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
2745 pde_store(pde, newpdir);
2746
2747 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2748 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2749 kernel_vm_end = kernel_map->max_offset;
2750 break;
2751 }
2752 }
2753 }
2754
2755
2756 /***************************************************
2757 * page management routines.
2758 ***************************************************/
2759
2760 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2761 CTASSERT(_NPCM == 3);
2762 CTASSERT(_NPCPV == 168);
2763
2764 static __inline struct pv_chunk *
2765 pv_to_chunk(pv_entry_t pv)
2766 {
2767
2768 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2769 }
2770
2771 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2772
2773 #define PC_FREE0 0xfffffffffffffffful
2774 #define PC_FREE1 0xfffffffffffffffful
2775 #define PC_FREE2 0x000000fffffffffful
2776
2777 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
2778
2779 #ifdef PV_STATS
2780 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2781
2782 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2783 "Current number of pv entry chunks");
2784 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2785 "Current number of pv entry chunks allocated");
2786 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2787 "Current number of pv entry chunks frees");
2788 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2789 "Number of times tried to get a chunk page but failed.");
2790
2791 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
2792 static int pv_entry_spare;
2793
2794 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2795 "Current number of pv entry frees");
2796 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2797 "Current number of pv entry allocs");
2798 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2799 "Current number of pv entries");
2800 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2801 "Current number of spare pv entries");
2802 #endif
2803
2804 /*
2805 * We are in a serious low memory condition. Resort to
2806 * drastic measures to free some pages so we can allocate
2807 * another pv entry chunk.
2808 *
2809 * Returns NULL if PV entries were reclaimed from the specified pmap.
2810 *
2811 * We do not, however, unmap 2mpages because subsequent accesses will
2812 * allocate per-page pv entries until repromotion occurs, thereby
2813 * exacerbating the shortage of free pv entries.
2814 */
2815 static vm_page_t
2816 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
2817 {
2818 struct pch new_tail;
2819 struct pv_chunk *pc;
2820 struct md_page *pvh;
2821 pd_entry_t *pde;
2822 pmap_t pmap;
2823 pt_entry_t *pte, tpte;
2824 pt_entry_t PG_G, PG_A, PG_M, PG_RW;
2825 pv_entry_t pv;
2826 vm_offset_t va;
2827 vm_page_t m, m_pc;
2828 struct spglist free;
2829 uint64_t inuse;
2830 int bit, field, freed;
2831
2832 rw_assert(&pvh_global_lock, RA_LOCKED);
2833 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2834 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
2835 pmap = NULL;
2836 m_pc = NULL;
2837 PG_G = PG_A = PG_M = PG_RW = 0;
2838 SLIST_INIT(&free);
2839 TAILQ_INIT(&new_tail);
2840 mtx_lock(&pv_chunks_mutex);
2841 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) {
2842 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2843 mtx_unlock(&pv_chunks_mutex);
2844 if (pmap != pc->pc_pmap) {
2845 if (pmap != NULL) {
2846 pmap_invalidate_all(pmap);
2847 if (pmap != locked_pmap)
2848 PMAP_UNLOCK(pmap);
2849 }
2850 pmap = pc->pc_pmap;
2851 /* Avoid deadlock and lock recursion. */
2852 if (pmap > locked_pmap) {
2853 RELEASE_PV_LIST_LOCK(lockp);
2854 PMAP_LOCK(pmap);
2855 } else if (pmap != locked_pmap &&
2856 !PMAP_TRYLOCK(pmap)) {
2857 pmap = NULL;
2858 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2859 mtx_lock(&pv_chunks_mutex);
2860 continue;
2861 }
2862 PG_G = pmap_global_bit(pmap);
2863 PG_A = pmap_accessed_bit(pmap);
2864 PG_M = pmap_modified_bit(pmap);
2865 PG_RW = pmap_rw_bit(pmap);
2866 }
2867
2868 /*
2869 * Destroy every non-wired, 4 KB page mapping in the chunk.
2870 */
2871 freed = 0;
2872 for (field = 0; field < _NPCM; field++) {
2873 for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2874 inuse != 0; inuse &= ~(1UL << bit)) {
2875 bit = bsfq(inuse);
2876 pv = &pc->pc_pventry[field * 64 + bit];
2877 va = pv->pv_va;
2878 pde = pmap_pde(pmap, va);
2879 if ((*pde & PG_PS) != 0)
2880 continue;
2881 pte = pmap_pde_to_pte(pde, va);
2882 if ((*pte & PG_W) != 0)
2883 continue;
2884 tpte = pte_load_clear(pte);
2885 if ((tpte & PG_G) != 0)
2886 pmap_invalidate_page(pmap, va);
2887 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2888 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2889 vm_page_dirty(m);
2890 if ((tpte & PG_A) != 0)
2891 vm_page_aflag_set(m, PGA_REFERENCED);
2892 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2893 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2894 m->md.pv_gen++;
2895 if (TAILQ_EMPTY(&m->md.pv_list) &&
2896 (m->flags & PG_FICTITIOUS) == 0) {
2897 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2898 if (TAILQ_EMPTY(&pvh->pv_list)) {
2899 vm_page_aflag_clear(m,
2900 PGA_WRITEABLE);
2901 }
2902 }
2903 pc->pc_map[field] |= 1UL << bit;
2904 pmap_unuse_pt(pmap, va, *pde, &free);
2905 freed++;
2906 }
2907 }
2908 if (freed == 0) {
2909 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2910 mtx_lock(&pv_chunks_mutex);
2911 continue;
2912 }
2913 /* Every freed mapping is for a 4 KB page. */
2914 pmap_resident_count_dec(pmap, freed);
2915 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
2916 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
2917 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
2918 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2919 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
2920 pc->pc_map[2] == PC_FREE2) {
2921 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2922 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2923 PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2924 /* Entire chunk is free; return it. */
2925 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2926 dump_drop_page(m_pc->phys_addr);
2927 mtx_lock(&pv_chunks_mutex);
2928 break;
2929 }
2930 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2931 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2932 mtx_lock(&pv_chunks_mutex);
2933 /* One freed pv entry in locked_pmap is sufficient. */
2934 if (pmap == locked_pmap)
2935 break;
2936 }
2937 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
2938 mtx_unlock(&pv_chunks_mutex);
2939 if (pmap != NULL) {
2940 pmap_invalidate_all(pmap);
2941 if (pmap != locked_pmap)
2942 PMAP_UNLOCK(pmap);
2943 }
2944 if (m_pc == NULL && !SLIST_EMPTY(&free)) {
2945 m_pc = SLIST_FIRST(&free);
2946 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2947 /* Recycle a freed page table page. */
2948 m_pc->wire_count = 1;
2949 atomic_add_int(&cnt.v_wire_count, 1);
2950 }
2951 pmap_free_zero_pages(&free);
2952 return (m_pc);
2953 }
2954
2955 /*
2956 * free the pv_entry back to the free list
2957 */
2958 static void
2959 free_pv_entry(pmap_t pmap, pv_entry_t pv)
2960 {
2961 struct pv_chunk *pc;
2962 int idx, field, bit;
2963
2964 rw_assert(&pvh_global_lock, RA_LOCKED);
2965 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2966 PV_STAT(atomic_add_long(&pv_entry_frees, 1));
2967 PV_STAT(atomic_add_int(&pv_entry_spare, 1));
2968 PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
2969 pc = pv_to_chunk(pv);
2970 idx = pv - &pc->pc_pventry[0];
2971 field = idx / 64;
2972 bit = idx % 64;
2973 pc->pc_map[field] |= 1ul << bit;
2974 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
2975 pc->pc_map[2] != PC_FREE2) {
2976 /* 98% of the time, pc is already at the head of the list. */
2977 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
2978 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2979 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2980 }
2981 return;
2982 }
2983 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2984 free_pv_chunk(pc);
2985 }
2986
2987 static void
2988 free_pv_chunk(struct pv_chunk *pc)
2989 {
2990 vm_page_t m;
2991
2992 mtx_lock(&pv_chunks_mutex);
2993 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2994 mtx_unlock(&pv_chunks_mutex);
2995 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2996 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2997 PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2998 /* entire chunk is free, return it */
2999 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3000 dump_drop_page(m->phys_addr);
3001 vm_page_unwire(m, 0);
3002 vm_page_free(m);
3003 }
3004
3005 /*
3006 * Returns a new PV entry, allocating a new PV chunk from the system when
3007 * needed. If this PV chunk allocation fails and a PV list lock pointer was
3008 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is
3009 * returned.
3010 *
3011 * The given PV list lock may be released.
3012 */
3013 static pv_entry_t
3014 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
3015 {
3016 int bit, field;
3017 pv_entry_t pv;
3018 struct pv_chunk *pc;
3019 vm_page_t m;
3020
3021 rw_assert(&pvh_global_lock, RA_LOCKED);
3022 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3023 PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
3024 retry:
3025 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3026 if (pc != NULL) {
3027 for (field = 0; field < _NPCM; field++) {
3028 if (pc->pc_map[field]) {
3029 bit = bsfq(pc->pc_map[field]);
3030 break;
3031 }
3032 }
3033 if (field < _NPCM) {
3034 pv = &pc->pc_pventry[field * 64 + bit];
3035 pc->pc_map[field] &= ~(1ul << bit);
3036 /* If this was the last item, move it to tail */
3037 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
3038 pc->pc_map[2] == 0) {
3039 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3040 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
3041 pc_list);
3042 }
3043 PV_STAT(atomic_add_long(&pv_entry_count, 1));
3044 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
3045 return (pv);
3046 }
3047 }
3048 /* No free items, allocate another chunk */
3049 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
3050 VM_ALLOC_WIRED);
3051 if (m == NULL) {
3052 if (lockp == NULL) {
3053 PV_STAT(pc_chunk_tryfail++);
3054 return (NULL);
3055 }
3056 m = reclaim_pv_chunk(pmap, lockp);
3057 if (m == NULL)
3058 goto retry;
3059 }
3060 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3061 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3062 dump_add_page(m->phys_addr);
3063 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3064 pc->pc_pmap = pmap;
3065 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */
3066 pc->pc_map[1] = PC_FREE1;
3067 pc->pc_map[2] = PC_FREE2;
3068 mtx_lock(&pv_chunks_mutex);
3069 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
3070 mtx_unlock(&pv_chunks_mutex);
3071 pv = &pc->pc_pventry[0];
3072 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3073 PV_STAT(atomic_add_long(&pv_entry_count, 1));
3074 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
3075 return (pv);
3076 }
3077
3078 /*
3079 * Returns the number of one bits within the given PV chunk map element.
3080 */
3081 static int
3082 popcnt_pc_map_elem(uint64_t elem)
3083 {
3084 int count;
3085
3086 /*
3087 * This simple method of counting the one bits performs well because
3088 * the given element typically contains more zero bits than one bits.
3089 */
3090 count = 0;
3091 for (; elem != 0; elem &= elem - 1)
3092 count++;
3093 return (count);
3094 }
3095
3096 /*
3097 * Ensure that the number of spare PV entries in the specified pmap meets or
3098 * exceeds the given count, "needed".
3099 *
3100 * The given PV list lock may be released.
3101 */
3102 static void
3103 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
3104 {
3105 struct pch new_tail;
3106 struct pv_chunk *pc;
3107 int avail, free;
3108 vm_page_t m;
3109
3110 rw_assert(&pvh_global_lock, RA_LOCKED);
3111 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3112 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
3113
3114 /*
3115 * Newly allocated PV chunks must be stored in a private list until
3116 * the required number of PV chunks have been allocated. Otherwise,
3117 * reclaim_pv_chunk() could recycle one of these chunks. In
3118 * contrast, these chunks must be added to the pmap upon allocation.
3119 */
3120 TAILQ_INIT(&new_tail);
3121 retry:
3122 avail = 0;
3123 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
3124 if ((cpu_feature2 & CPUID2_POPCNT) == 0) {
3125 free = popcnt_pc_map_elem(pc->pc_map[0]);
3126 free += popcnt_pc_map_elem(pc->pc_map[1]);
3127 free += popcnt_pc_map_elem(pc->pc_map[2]);
3128 } else {
3129 free = popcntq(pc->pc_map[0]);
3130 free += popcntq(pc->pc_map[1]);
3131 free += popcntq(pc->pc_map[2]);
3132 }
3133 if (free == 0)
3134 break;
3135 avail += free;
3136 if (avail >= needed)
3137 break;
3138 }
3139 for (; avail < needed; avail += _NPCPV) {
3140 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
3141 VM_ALLOC_WIRED);
3142 if (m == NULL) {
3143 m = reclaim_pv_chunk(pmap, lockp);
3144 if (m == NULL)
3145 goto retry;
3146 }
3147 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3148 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3149 dump_add_page(m->phys_addr);
3150 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3151 pc->pc_pmap = pmap;
3152 pc->pc_map[0] = PC_FREE0;
3153 pc->pc_map[1] = PC_FREE1;
3154 pc->pc_map[2] = PC_FREE2;
3155 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3156 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
3157 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3158 }
3159 if (!TAILQ_EMPTY(&new_tail)) {
3160 mtx_lock(&pv_chunks_mutex);
3161 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
3162 mtx_unlock(&pv_chunks_mutex);
3163 }
3164 }
3165
3166 /*
3167 * First find and then remove the pv entry for the specified pmap and virtual
3168 * address from the specified pv list. Returns the pv entry if found and NULL
3169 * otherwise. This operation can be performed on pv lists for either 4KB or
3170 * 2MB page mappings.
3171 */
3172 static __inline pv_entry_t
3173 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3174 {
3175 pv_entry_t pv;
3176
3177 rw_assert(&pvh_global_lock, RA_LOCKED);
3178 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3179 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3180 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3181 pvh->pv_gen++;
3182 break;
3183 }
3184 }
3185 return (pv);
3186 }
3187
3188 /*
3189 * After demotion from a 2MB page mapping to 512 4KB page mappings,
3190 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3191 * entries for each of the 4KB page mappings.
3192 */
3193 static void
3194 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3195 struct rwlock **lockp)
3196 {
3197 struct md_page *pvh;
3198 struct pv_chunk *pc;
3199 pv_entry_t pv;
3200 vm_offset_t va_last;
3201 vm_page_t m;
3202 int bit, field;
3203
3204 rw_assert(&pvh_global_lock, RA_LOCKED);
3205 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3206 KASSERT((pa & PDRMASK) == 0,
3207 ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
3208 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3209
3210 /*
3211 * Transfer the 2mpage's pv entry for this mapping to the first
3212 * page's pv list. Once this transfer begins, the pv list lock
3213 * must not be released until the last pv entry is reinstantiated.
3214 */
3215 pvh = pa_to_pvh(pa);
3216 va = trunc_2mpage(va);
3217 pv = pmap_pvh_remove(pvh, pmap, va);
3218 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
3219 m = PHYS_TO_VM_PAGE(pa);
3220 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3221 m->md.pv_gen++;
3222 /* Instantiate the remaining NPTEPG - 1 pv entries. */
3223 PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
3224 va_last = va + NBPDR - PAGE_SIZE;
3225 for (;;) {
3226 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3227 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
3228 pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
3229 for (field = 0; field < _NPCM; field++) {
3230 while (pc->pc_map[field]) {
3231 bit = bsfq(pc->pc_map[field]);
3232 pc->pc_map[field] &= ~(1ul << bit);
3233 pv = &pc->pc_pventry[field * 64 + bit];
3234 va += PAGE_SIZE;
3235 pv->pv_va = va;
3236 m++;
3237 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3238 ("pmap_pv_demote_pde: page %p is not managed", m));
3239 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3240 m->md.pv_gen++;
3241 if (va == va_last)
3242 goto out;
3243 }
3244 }
3245 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3246 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3247 }
3248 out:
3249 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
3250 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3251 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3252 }
3253 PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
3254 PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
3255 }
3256
3257 /*
3258 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
3259 * replace the many pv entries for the 4KB page mappings by a single pv entry
3260 * for the 2MB page mapping.
3261 */
3262 static void
3263 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3264 struct rwlock **lockp)
3265 {
3266 struct md_page *pvh;
3267 pv_entry_t pv;
3268 vm_offset_t va_last;
3269 vm_page_t m;
3270
3271 rw_assert(&pvh_global_lock, RA_LOCKED);
3272 KASSERT((pa & PDRMASK) == 0,
3273 ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
3274 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3275
3276 /*
3277 * Transfer the first page's pv entry for this mapping to the 2mpage's
3278 * pv list. Aside from avoiding the cost of a call to get_pv_entry(),
3279 * a transfer avoids the possibility that get_pv_entry() calls
3280 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
3281 * mappings that is being promoted.
3282 */
3283 m = PHYS_TO_VM_PAGE(pa);
3284 va = trunc_2mpage(va);
3285 pv = pmap_pvh_remove(&m->md, pmap, va);
3286 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
3287 pvh = pa_to_pvh(pa);
3288 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3289 pvh->pv_gen++;
3290 /* Free the remaining NPTEPG - 1 pv entries. */
3291 va_last = va + NBPDR - PAGE_SIZE;
3292 do {
3293 m++;
3294 va += PAGE_SIZE;
3295 pmap_pvh_free(&m->md, pmap, va);
3296 } while (va < va_last);
3297 }
3298
3299 /*
3300 * First find and then destroy the pv entry for the specified pmap and virtual
3301 * address. This operation can be performed on pv lists for either 4KB or 2MB
3302 * page mappings.
3303 */
3304 static void
3305 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3306 {
3307 pv_entry_t pv;
3308
3309 pv = pmap_pvh_remove(pvh, pmap, va);
3310 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3311 free_pv_entry(pmap, pv);
3312 }
3313
3314 /*
3315 * Conditionally create the PV entry for a 4KB page mapping if the required
3316 * memory can be allocated without resorting to reclamation.
3317 */
3318 static boolean_t
3319 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3320 struct rwlock **lockp)
3321 {
3322 pv_entry_t pv;
3323
3324 rw_assert(&pvh_global_lock, RA_LOCKED);
3325 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3326 /* Pass NULL instead of the lock pointer to disable reclamation. */
3327 if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3328 pv->pv_va = va;
3329 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3330 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3331 m->md.pv_gen++;
3332 return (TRUE);
3333 } else
3334 return (FALSE);
3335 }
3336
3337 /*
3338 * Conditionally create the PV entry for a 2MB page mapping if the required
3339 * memory can be allocated without resorting to reclamation.
3340 */
3341 static boolean_t
3342 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3343 struct rwlock **lockp)
3344 {
3345 struct md_page *pvh;
3346 pv_entry_t pv;
3347
3348 rw_assert(&pvh_global_lock, RA_LOCKED);
3349 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3350 /* Pass NULL instead of the lock pointer to disable reclamation. */
3351 if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3352 pv->pv_va = va;
3353 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3354 pvh = pa_to_pvh(pa);
3355 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3356 pvh->pv_gen++;
3357 return (TRUE);
3358 } else
3359 return (FALSE);
3360 }
3361
3362 /*
3363 * Fills a page table page with mappings to consecutive physical pages.
3364 */
3365 static void
3366 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
3367 {
3368 pt_entry_t *pte;
3369
3370 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
3371 *pte = newpte;
3372 newpte += PAGE_SIZE;
3373 }
3374 }
3375
3376 /*
3377 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page
3378 * mapping is invalidated.
3379 */
3380 static boolean_t
3381 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3382 {
3383 struct rwlock *lock;
3384 boolean_t rv;
3385
3386 lock = NULL;
3387 rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
3388 if (lock != NULL)
3389 rw_wunlock(lock);
3390 return (rv);
3391 }
3392
3393 static boolean_t
3394 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
3395 struct rwlock **lockp)
3396 {
3397 pd_entry_t newpde, oldpde;
3398 pt_entry_t *firstpte, newpte;
3399 pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V;
3400 vm_paddr_t mptepa;
3401 vm_page_t mpte;
3402 struct spglist free;
3403 vm_offset_t sva;
3404 int PG_PTE_CACHE;
3405
3406 PG_G = pmap_global_bit(pmap);
3407 PG_A = pmap_accessed_bit(pmap);
3408 PG_M = pmap_modified_bit(pmap);
3409 PG_RW = pmap_rw_bit(pmap);
3410 PG_V = pmap_valid_bit(pmap);
3411 PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
3412
3413 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3414 oldpde = *pde;
3415 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
3416 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
3417 if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) !=
3418 NULL)
3419 pmap_remove_pt_page(pmap, mpte);
3420 else {
3421 KASSERT((oldpde & PG_W) == 0,
3422 ("pmap_demote_pde: page table page for a wired mapping"
3423 " is missing"));
3424
3425 /*
3426 * Invalidate the 2MB page mapping and return "failure" if the
3427 * mapping was never accessed or the allocation of the new
3428 * page table page fails. If the 2MB page mapping belongs to
3429 * the direct map region of the kernel's address space, then
3430 * the page allocation request specifies the highest possible
3431 * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is
3432 * normal. Page table pages are preallocated for every other
3433 * part of the kernel address space, so the direct map region
3434 * is the only part of the kernel address space that must be
3435 * handled here.
3436 */
3437 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
3438 pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
3439 DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
3440 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
3441 SLIST_INIT(&free);
3442 sva = trunc_2mpage(va);
3443 pmap_remove_pde(pmap, pde, sva, &free, lockp);
3444 if ((oldpde & PG_G) == 0)
3445 pmap_invalidate_pde_page(pmap, sva, oldpde);
3446 pmap_free_zero_pages(&free);
3447 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
3448 " in pmap %p", va, pmap);
3449 return (FALSE);
3450 }
3451 if (va < VM_MAXUSER_ADDRESS)
3452 pmap_resident_count_inc(pmap, 1);
3453 }
3454 mptepa = VM_PAGE_TO_PHYS(mpte);
3455 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
3456 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
3457 KASSERT((oldpde & PG_A) != 0,
3458 ("pmap_demote_pde: oldpde is missing PG_A"));
3459 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
3460 ("pmap_demote_pde: oldpde is missing PG_M"));
3461 newpte = oldpde & ~PG_PS;
3462 newpte = pmap_swap_pat(pmap, newpte);
3463
3464 /*
3465 * If the page table page is new, initialize it.
3466 */
3467 if (mpte->wire_count == 1) {
3468 mpte->wire_count = NPTEPG;
3469 pmap_fill_ptp(firstpte, newpte);
3470 }
3471 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
3472 ("pmap_demote_pde: firstpte and newpte map different physical"
3473 " addresses"));
3474
3475 /*
3476 * If the mapping has changed attributes, update the page table
3477 * entries.
3478 */
3479 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
3480 pmap_fill_ptp(firstpte, newpte);
3481
3482 /*
3483 * The spare PV entries must be reserved prior to demoting the
3484 * mapping, that is, prior to changing the PDE. Otherwise, the state
3485 * of the PDE and the PV lists will be inconsistent, which can result
3486 * in reclaim_pv_chunk() attempting to remove a PV entry from the
3487 * wrong PV list and pmap_pv_demote_pde() failing to find the expected
3488 * PV entry for the 2MB page mapping that is being demoted.
3489 */
3490 if ((oldpde & PG_MANAGED) != 0)
3491 reserve_pv_entries(pmap, NPTEPG - 1, lockp);
3492
3493 /*
3494 * Demote the mapping. This pmap is locked. The old PDE has
3495 * PG_A set. If the old PDE has PG_RW set, it also has PG_M
3496 * set. Thus, there is no danger of a race with another
3497 * processor changing the setting of PG_A and/or PG_M between
3498 * the read above and the store below.
3499 */
3500 if (workaround_erratum383)
3501 pmap_update_pde(pmap, va, pde, newpde);
3502 else
3503 pde_store(pde, newpde);
3504
3505 /*
3506 * Invalidate a stale recursive mapping of the page table page.
3507 */
3508 if (va >= VM_MAXUSER_ADDRESS)
3509 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
3510
3511 /*
3512 * Demote the PV entry.
3513 */
3514 if ((oldpde & PG_MANAGED) != 0)
3515 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
3516
3517 atomic_add_long(&pmap_pde_demotions, 1);
3518 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
3519 " in pmap %p", va, pmap);
3520 return (TRUE);
3521 }
3522
3523 /*
3524 * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
3525 */
3526 static void
3527 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3528 {
3529 pd_entry_t newpde;
3530 vm_paddr_t mptepa;
3531 vm_page_t mpte;
3532
3533 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3534 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3535 mpte = pmap_lookup_pt_page(pmap, va);
3536 if (mpte == NULL)
3537 panic("pmap_remove_kernel_pde: Missing pt page.");
3538
3539 pmap_remove_pt_page(pmap, mpte);
3540 mptepa = VM_PAGE_TO_PHYS(mpte);
3541 newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
3542
3543 /*
3544 * Initialize the page table page.
3545 */
3546 pagezero((void *)PHYS_TO_DMAP(mptepa));
3547
3548 /*
3549 * Demote the mapping.
3550 */
3551 if (workaround_erratum383)
3552 pmap_update_pde(pmap, va, pde, newpde);
3553 else
3554 pde_store(pde, newpde);
3555
3556 /*
3557 * Invalidate a stale recursive mapping of the page table page.
3558 */
3559 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
3560 }
3561
3562 /*
3563 * pmap_remove_pde: do the things to unmap a superpage in a process
3564 */
3565 static int
3566 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
3567 struct spglist *free, struct rwlock **lockp)
3568 {
3569 struct md_page *pvh;
3570 pd_entry_t oldpde;
3571 vm_offset_t eva, va;
3572 vm_page_t m, mpte;
3573 pt_entry_t PG_G, PG_A, PG_M, PG_RW;
3574
3575 PG_G = pmap_global_bit(pmap);
3576 PG_A = pmap_accessed_bit(pmap);
3577 PG_M = pmap_modified_bit(pmap);
3578 PG_RW = pmap_rw_bit(pmap);
3579
3580 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3581 KASSERT((sva & PDRMASK) == 0,
3582 ("pmap_remove_pde: sva is not 2mpage aligned"));
3583 oldpde = pte_load_clear(pdq);
3584 if (oldpde & PG_W)
3585 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
3586 if ((oldpde & PG_G) != 0)
3587 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
3588 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
3589 if (oldpde & PG_MANAGED) {
3590 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
3591 pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
3592 pmap_pvh_free(pvh, pmap, sva);
3593 eva = sva + NBPDR;
3594 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3595 va < eva; va += PAGE_SIZE, m++) {
3596 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3597 vm_page_dirty(m);
3598 if (oldpde & PG_A)
3599 vm_page_aflag_set(m, PGA_REFERENCED);
3600 if (TAILQ_EMPTY(&m->md.pv_list) &&
3601 TAILQ_EMPTY(&pvh->pv_list))
3602 vm_page_aflag_clear(m, PGA_WRITEABLE);
3603 }
3604 }
3605 if (pmap == kernel_pmap) {
3606 pmap_remove_kernel_pde(pmap, pdq, sva);
3607 } else {
3608 mpte = pmap_lookup_pt_page(pmap, sva);
3609 if (mpte != NULL) {
3610 pmap_remove_pt_page(pmap, mpte);
3611 pmap_resident_count_dec(pmap, 1);
3612 KASSERT(mpte->wire_count == NPTEPG,
3613 ("pmap_remove_pde: pte page wire count error"));
3614 mpte->wire_count = 0;
3615 pmap_add_delayed_free_list(mpte, free, FALSE);
3616 atomic_subtract_int(&cnt.v_wire_count, 1);
3617 }
3618 }
3619 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
3620 }
3621
3622 /*
3623 * pmap_remove_pte: do the things to unmap a page in a process
3624 */
3625 static int
3626 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
3627 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
3628 {
3629 struct md_page *pvh;
3630 pt_entry_t oldpte, PG_A, PG_M, PG_RW;
3631 vm_page_t m;
3632
3633 PG_A = pmap_accessed_bit(pmap);
3634 PG_M = pmap_modified_bit(pmap);
3635 PG_RW = pmap_rw_bit(pmap);
3636
3637 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3638 oldpte = pte_load_clear(ptq);
3639 if (oldpte & PG_W)
3640 pmap->pm_stats.wired_count -= 1;
3641 pmap_resident_count_dec(pmap, 1);
3642 if (oldpte & PG_MANAGED) {
3643 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
3644 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3645 vm_page_dirty(m);
3646 if (oldpte & PG_A)
3647 vm_page_aflag_set(m, PGA_REFERENCED);
3648 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3649 pmap_pvh_free(&m->md, pmap, va);
3650 if (TAILQ_EMPTY(&m->md.pv_list) &&
3651 (m->flags & PG_FICTITIOUS) == 0) {
3652 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3653 if (TAILQ_EMPTY(&pvh->pv_list))
3654 vm_page_aflag_clear(m, PGA_WRITEABLE);
3655 }
3656 }
3657 return (pmap_unuse_pt(pmap, va, ptepde, free));
3658 }
3659
3660 /*
3661 * Remove a single page from a process address space
3662 */
3663 static void
3664 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
3665 struct spglist *free)
3666 {
3667 struct rwlock *lock;
3668 pt_entry_t *pte, PG_V;
3669
3670 PG_V = pmap_valid_bit(pmap);
3671 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3672 if ((*pde & PG_V) == 0)
3673 return;
3674 pte = pmap_pde_to_pte(pde, va);
3675 if ((*pte & PG_V) == 0)
3676 return;
3677 lock = NULL;
3678 pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
3679 if (lock != NULL)
3680 rw_wunlock(lock);
3681 pmap_invalidate_page(pmap, va);
3682 }
3683
3684 /*
3685 * Remove the given range of addresses from the specified map.
3686 *
3687 * It is assumed that the start and end are properly
3688 * rounded to the page size.
3689 */
3690 void
3691 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3692 {
3693 struct rwlock *lock;
3694 vm_offset_t va, va_next;
3695 pml4_entry_t *pml4e;
3696 pdp_entry_t *pdpe;
3697 pd_entry_t ptpaddr, *pde;
3698 pt_entry_t *pte, PG_G, PG_V;
3699 struct spglist free;
3700 int anyvalid;
3701
3702 PG_G = pmap_global_bit(pmap);
3703 PG_V = pmap_valid_bit(pmap);
3704
3705 /*
3706 * Perform an unsynchronized read. This is, however, safe.
3707 */
3708 if (pmap->pm_stats.resident_count == 0)
3709 return;
3710
3711 anyvalid = 0;
3712 SLIST_INIT(&free);
3713
3714 rw_rlock(&pvh_global_lock);
3715 PMAP_LOCK(pmap);
3716
3717 /*
3718 * special handling of removing one page. a very
3719 * common operation and easy to short circuit some
3720 * code.
3721 */
3722 if (sva + PAGE_SIZE == eva) {
3723 pde = pmap_pde(pmap, sva);
3724 if (pde && (*pde & PG_PS) == 0) {
3725 pmap_remove_page(pmap, sva, pde, &free);
3726 goto out;
3727 }
3728 }
3729
3730 lock = NULL;
3731 for (; sva < eva; sva = va_next) {
3732
3733 if (pmap->pm_stats.resident_count == 0)
3734 break;
3735
3736 pml4e = pmap_pml4e(pmap, sva);
3737 if ((*pml4e & PG_V) == 0) {
3738 va_next = (sva + NBPML4) & ~PML4MASK;
3739 if (va_next < sva)
3740 va_next = eva;
3741 continue;
3742 }
3743
3744 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
3745 if ((*pdpe & PG_V) == 0) {
3746 va_next = (sva + NBPDP) & ~PDPMASK;
3747 if (va_next < sva)
3748 va_next = eva;
3749 continue;
3750 }
3751
3752 /*
3753 * Calculate index for next page table.
3754 */
3755 va_next = (sva + NBPDR) & ~PDRMASK;
3756 if (va_next < sva)
3757 va_next = eva;
3758
3759 pde = pmap_pdpe_to_pde(pdpe, sva);
3760 ptpaddr = *pde;
3761
3762 /*
3763 * Weed out invalid mappings.
3764 */
3765 if (ptpaddr == 0)
3766 continue;
3767
3768 /*
3769 * Check for large page.
3770 */
3771 if ((ptpaddr & PG_PS) != 0) {
3772 /*
3773 * Are we removing the entire large page? If not,
3774 * demote the mapping and fall through.
3775 */
3776 if (sva + NBPDR == va_next && eva >= va_next) {
3777 /*
3778 * The TLB entry for a PG_G mapping is
3779 * invalidated by pmap_remove_pde().
3780 */
3781 if ((ptpaddr & PG_G) == 0)
3782 anyvalid = 1;
3783 pmap_remove_pde(pmap, pde, sva, &free, &lock);
3784 continue;
3785 } else if (!pmap_demote_pde_locked(pmap, pde, sva,
3786 &lock)) {
3787 /* The large page mapping was destroyed. */
3788 continue;
3789 } else
3790 ptpaddr = *pde;
3791 }
3792
3793 /*
3794 * Limit our scan to either the end of the va represented
3795 * by the current page table page, or to the end of the
3796 * range being removed.
3797 */
3798 if (va_next > eva)
3799 va_next = eva;
3800
3801 va = va_next;
3802 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
3803 sva += PAGE_SIZE) {
3804 if (*pte == 0) {
3805 if (va != va_next) {
3806 pmap_invalidate_range(pmap, va, sva);
3807 va = va_next;
3808 }
3809 continue;
3810 }
3811 if ((*pte & PG_G) == 0)
3812 anyvalid = 1;
3813 else if (va == va_next)
3814 va = sva;
3815 if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free,
3816 &lock)) {
3817 sva += PAGE_SIZE;
3818 break;
3819 }
3820 }
3821 if (va != va_next)
3822 pmap_invalidate_range(pmap, va, sva);
3823 }
3824 if (lock != NULL)
3825 rw_wunlock(lock);
3826 out:
3827 if (anyvalid)
3828 pmap_invalidate_all(pmap);
3829 rw_runlock(&pvh_global_lock);
3830 PMAP_UNLOCK(pmap);
3831 pmap_free_zero_pages(&free);
3832 }
3833
3834 /*
3835 * Routine: pmap_remove_all
3836 * Function:
3837 * Removes this physical page from
3838 * all physical maps in which it resides.
3839 * Reflects back modify bits to the pager.
3840 *
3841 * Notes:
3842 * Original versions of this routine were very
3843 * inefficient because they iteratively called
3844 * pmap_remove (slow...)
3845 */
3846
3847 void
3848 pmap_remove_all(vm_page_t m)
3849 {
3850 struct md_page *pvh;
3851 pv_entry_t pv;
3852 pmap_t pmap;
3853 pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
3854 pd_entry_t *pde;
3855 vm_offset_t va;
3856 struct spglist free;
3857
3858 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3859 ("pmap_remove_all: page %p is not managed", m));
3860 SLIST_INIT(&free);
3861 rw_wlock(&pvh_global_lock);
3862 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
3863 pa_to_pvh(VM_PAGE_TO_PHYS(m));
3864 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3865 pmap = PV_PMAP(pv);
3866 PMAP_LOCK(pmap);
3867 va = pv->pv_va;
3868 pde = pmap_pde(pmap, va);
3869 (void)pmap_demote_pde(pmap, pde, va);
3870 PMAP_UNLOCK(pmap);
3871 }
3872 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3873 pmap = PV_PMAP(pv);
3874 PMAP_LOCK(pmap);
3875 PG_A = pmap_accessed_bit(pmap);
3876 PG_M = pmap_modified_bit(pmap);
3877 PG_RW = pmap_rw_bit(pmap);
3878 pmap_resident_count_dec(pmap, 1);
3879 pde = pmap_pde(pmap, pv->pv_va);
3880 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
3881 " a 2mpage in page %p's pv list", m));
3882 pte = pmap_pde_to_pte(pde, pv->pv_va);
3883 tpte = pte_load_clear(pte);
3884 if (tpte & PG_W)
3885 pmap->pm_stats.wired_count--;
3886 if (tpte & PG_A)
3887 vm_page_aflag_set(m, PGA_REFERENCED);
3888
3889 /*
3890 * Update the vm_page_t clean and reference bits.
3891 */
3892 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3893 vm_page_dirty(m);
3894 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
3895 pmap_invalidate_page(pmap, pv->pv_va);
3896 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3897 m->md.pv_gen++;
3898 free_pv_entry(pmap, pv);
3899 PMAP_UNLOCK(pmap);
3900 }
3901 vm_page_aflag_clear(m, PGA_WRITEABLE);
3902 rw_wunlock(&pvh_global_lock);
3903 pmap_free_zero_pages(&free);
3904 }
3905
3906 /*
3907 * pmap_protect_pde: do the things to protect a 2mpage in a process
3908 */
3909 static boolean_t
3910 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
3911 {
3912 pd_entry_t newpde, oldpde;
3913 vm_offset_t eva, va;
3914 vm_page_t m;
3915 boolean_t anychanged;
3916 pt_entry_t PG_G, PG_M, PG_RW;
3917
3918 PG_G = pmap_global_bit(pmap);
3919 PG_M = pmap_modified_bit(pmap);
3920 PG_RW = pmap_rw_bit(pmap);
3921
3922 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3923 KASSERT((sva & PDRMASK) == 0,
3924 ("pmap_protect_pde: sva is not 2mpage aligned"));
3925 anychanged = FALSE;
3926 retry:
3927 oldpde = newpde = *pde;
3928 if (oldpde & PG_MANAGED) {
3929 eva = sva + NBPDR;
3930 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3931 va < eva; va += PAGE_SIZE, m++)
3932 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3933 vm_page_dirty(m);
3934 }
3935 if ((prot & VM_PROT_WRITE) == 0)
3936 newpde &= ~(PG_RW | PG_M);
3937 if ((prot & VM_PROT_EXECUTE) == 0)
3938 newpde |= pg_nx;
3939 if (newpde != oldpde) {
3940 /*
3941 * As an optimization to future operations on this PDE, clear
3942 * PG_PROMOTED. The impending invalidation will remove any
3943 * lingering 4KB page mappings from the TLB.
3944 */
3945 if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED))
3946 goto retry;
3947 if ((oldpde & PG_G) != 0)
3948 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
3949 else
3950 anychanged = TRUE;
3951 }
3952 return (anychanged);
3953 }
3954
3955 /*
3956 * Set the physical protection on the
3957 * specified range of this map as requested.
3958 */
3959 void
3960 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3961 {
3962 vm_offset_t va_next;
3963 pml4_entry_t *pml4e;
3964 pdp_entry_t *pdpe;
3965 pd_entry_t ptpaddr, *pde;
3966 pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
3967 boolean_t anychanged, pv_lists_locked;
3968
3969 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
3970 if (prot == VM_PROT_NONE) {
3971 pmap_remove(pmap, sva, eva);
3972 return;
3973 }
3974
3975 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3976 (VM_PROT_WRITE|VM_PROT_EXECUTE))
3977 return;
3978
3979 PG_G = pmap_global_bit(pmap);
3980 PG_M = pmap_modified_bit(pmap);
3981 PG_V = pmap_valid_bit(pmap);
3982 PG_RW = pmap_rw_bit(pmap);
3983 pv_lists_locked = FALSE;
3984 resume:
3985 anychanged = FALSE;
3986
3987 PMAP_LOCK(pmap);
3988 for (; sva < eva; sva = va_next) {
3989
3990 pml4e = pmap_pml4e(pmap, sva);
3991 if ((*pml4e & PG_V) == 0) {
3992 va_next = (sva + NBPML4) & ~PML4MASK;
3993 if (va_next < sva)
3994 va_next = eva;
3995 continue;
3996 }
3997
3998 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
3999 if ((*pdpe & PG_V) == 0) {
4000 va_next = (sva + NBPDP) & ~PDPMASK;
4001 if (va_next < sva)
4002 va_next = eva;
4003 continue;
4004 }
4005
4006 va_next = (sva + NBPDR) & ~PDRMASK;
4007 if (va_next < sva)
4008 va_next = eva;
4009
4010 pde = pmap_pdpe_to_pde(pdpe, sva);
4011 ptpaddr = *pde;
4012
4013 /*
4014 * Weed out invalid mappings.
4015 */
4016 if (ptpaddr == 0)
4017 continue;
4018
4019 /*
4020 * Check for large page.
4021 */
4022 if ((ptpaddr & PG_PS) != 0) {
4023 /*
4024 * Are we protecting the entire large page? If not,
4025 * demote the mapping and fall through.
4026 */
4027 if (sva + NBPDR == va_next && eva >= va_next) {
4028 /*
4029 * The TLB entry for a PG_G mapping is
4030 * invalidated by pmap_protect_pde().
4031 */
4032 if (pmap_protect_pde(pmap, pde, sva, prot))
4033 anychanged = TRUE;
4034 continue;
4035 } else {
4036 if (!pv_lists_locked) {
4037 pv_lists_locked = TRUE;
4038 if (!rw_try_rlock(&pvh_global_lock)) {
4039 if (anychanged)
4040 pmap_invalidate_all(
4041 pmap);
4042 PMAP_UNLOCK(pmap);
4043 rw_rlock(&pvh_global_lock);
4044 goto resume;
4045 }
4046 }
4047 if (!pmap_demote_pde(pmap, pde, sva)) {
4048 /*
4049 * The large page mapping was
4050 * destroyed.
4051 */
4052 continue;
4053 }
4054 }
4055 }
4056
4057 if (va_next > eva)
4058 va_next = eva;
4059
4060 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
4061 sva += PAGE_SIZE) {
4062 pt_entry_t obits, pbits;
4063 vm_page_t m;
4064
4065 retry:
4066 obits = pbits = *pte;
4067 if ((pbits & PG_V) == 0)
4068 continue;
4069
4070 if ((prot & VM_PROT_WRITE) == 0) {
4071 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
4072 (PG_MANAGED | PG_M | PG_RW)) {
4073 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
4074 vm_page_dirty(m);
4075 }
4076 pbits &= ~(PG_RW | PG_M);
4077 }
4078 if ((prot & VM_PROT_EXECUTE) == 0)
4079 pbits |= pg_nx;
4080
4081 if (pbits != obits) {
4082 if (!atomic_cmpset_long(pte, obits, pbits))
4083 goto retry;
4084 if (obits & PG_G)
4085 pmap_invalidate_page(pmap, sva);
4086 else
4087 anychanged = TRUE;
4088 }
4089 }
4090 }
4091 if (anychanged)
4092 pmap_invalidate_all(pmap);
4093 if (pv_lists_locked)
4094 rw_runlock(&pvh_global_lock);
4095 PMAP_UNLOCK(pmap);
4096 }
4097
4098 /*
4099 * Tries to promote the 512, contiguous 4KB page mappings that are within a
4100 * single page table page (PTP) to a single 2MB page mapping. For promotion
4101 * to occur, two conditions must be met: (1) the 4KB page mappings must map
4102 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
4103 * identical characteristics.
4104 */
4105 static void
4106 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
4107 struct rwlock **lockp)
4108 {
4109 pd_entry_t newpde;
4110 pt_entry_t *firstpte, oldpte, pa, *pte;
4111 pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V;
4112 vm_page_t mpte;
4113 int PG_PTE_CACHE;
4114
4115 PG_A = pmap_accessed_bit(pmap);
4116 PG_G = pmap_global_bit(pmap);
4117 PG_M = pmap_modified_bit(pmap);
4118 PG_V = pmap_valid_bit(pmap);
4119 PG_RW = pmap_rw_bit(pmap);
4120 PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
4121
4122 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4123
4124 /*
4125 * Examine the first PTE in the specified PTP. Abort if this PTE is
4126 * either invalid, unused, or does not map the first 4KB physical page
4127 * within a 2MB page.
4128 */
4129 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
4130 setpde:
4131 newpde = *firstpte;
4132 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
4133 atomic_add_long(&pmap_pde_p_failures, 1);
4134 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4135 " in pmap %p", va, pmap);
4136 return;
4137 }
4138 if ((newpde & (PG_M | PG_RW)) == PG_RW) {
4139 /*
4140 * When PG_M is already clear, PG_RW can be cleared without
4141 * a TLB invalidation.
4142 */
4143 if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
4144 goto setpde;
4145 newpde &= ~PG_RW;
4146 }
4147
4148 /*
4149 * Examine each of the other PTEs in the specified PTP. Abort if this
4150 * PTE maps an unexpected 4KB physical page or does not have identical
4151 * characteristics to the first PTE.
4152 */
4153 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
4154 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
4155 setpte:
4156 oldpte = *pte;
4157 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
4158 atomic_add_long(&pmap_pde_p_failures, 1);
4159 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4160 " in pmap %p", va, pmap);
4161 return;
4162 }
4163 if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
4164 /*
4165 * When PG_M is already clear, PG_RW can be cleared
4166 * without a TLB invalidation.
4167 */
4168 if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
4169 goto setpte;
4170 oldpte &= ~PG_RW;
4171 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
4172 " in pmap %p", (oldpte & PG_FRAME & PDRMASK) |
4173 (va & ~PDRMASK), pmap);
4174 }
4175 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
4176 atomic_add_long(&pmap_pde_p_failures, 1);
4177 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4178 " in pmap %p", va, pmap);
4179 return;
4180 }
4181 pa -= PAGE_SIZE;
4182 }
4183
4184 /*
4185 * Save the page table page in its current state until the PDE
4186 * mapping the superpage is demoted by pmap_demote_pde() or
4187 * destroyed by pmap_remove_pde().
4188 */
4189 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4190 KASSERT(mpte >= vm_page_array &&
4191 mpte < &vm_page_array[vm_page_array_size],
4192 ("pmap_promote_pde: page table page is out of range"));
4193 KASSERT(mpte->pindex == pmap_pde_pindex(va),
4194 ("pmap_promote_pde: page table page's pindex is wrong"));
4195 if (pmap_insert_pt_page(pmap, mpte)) {
4196 atomic_add_long(&pmap_pde_p_failures, 1);
4197 CTR2(KTR_PMAP,
4198 "pmap_promote_pde: failure for va %#lx in pmap %p", va,
4199 pmap);
4200 return;
4201 }
4202
4203 /*
4204 * Promote the pv entries.
4205 */
4206 if ((newpde & PG_MANAGED) != 0)
4207 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
4208
4209 /*
4210 * Propagate the PAT index to its proper position.
4211 */
4212 newpde = pmap_swap_pat(pmap, newpde);
4213
4214 /*
4215 * Map the superpage.
4216 */
4217 if (workaround_erratum383)
4218 pmap_update_pde(pmap, va, pde, PG_PS | newpde);
4219 else
4220 pde_store(pde, PG_PROMOTED | PG_PS | newpde);
4221
4222 atomic_add_long(&pmap_pde_promotions, 1);
4223 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
4224 " in pmap %p", va, pmap);
4225 }
4226
4227 /*
4228 * Insert the given physical page (p) at
4229 * the specified virtual address (v) in the
4230 * target physical map with the protection requested.
4231 *
4232 * If specified, the page will be wired down, meaning
4233 * that the related pte can not be reclaimed.
4234 *
4235 * NB: This is the only routine which MAY NOT lazy-evaluate
4236 * or lose information. That is, this routine must actually
4237 * insert this page into the given map NOW.
4238 */
4239 int
4240 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4241 u_int flags, int8_t psind __unused)
4242 {
4243 struct rwlock *lock;
4244 pd_entry_t *pde;
4245 pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
4246 pt_entry_t newpte, origpte;
4247 pv_entry_t pv;
4248 vm_paddr_t opa, pa;
4249 vm_page_t mpte, om;
4250 boolean_t nosleep;
4251
4252 PG_A = pmap_accessed_bit(pmap);
4253 PG_G = pmap_global_bit(pmap);
4254 PG_M = pmap_modified_bit(pmap);
4255 PG_V = pmap_valid_bit(pmap);
4256 PG_RW = pmap_rw_bit(pmap);
4257
4258 va = trunc_page(va);
4259 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
4260 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
4261 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
4262 va));
4263 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
4264 va >= kmi.clean_eva,
4265 ("pmap_enter: managed mapping within the clean submap"));
4266 if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
4267 VM_OBJECT_ASSERT_LOCKED(m->object);
4268 pa = VM_PAGE_TO_PHYS(m);
4269 newpte = (pt_entry_t)(pa | PG_A | PG_V);
4270 if ((flags & VM_PROT_WRITE) != 0)
4271 newpte |= PG_M;
4272 if ((prot & VM_PROT_WRITE) != 0)
4273 newpte |= PG_RW;
4274 KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
4275 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
4276 if ((prot & VM_PROT_EXECUTE) == 0)
4277 newpte |= pg_nx;
4278 if ((flags & PMAP_ENTER_WIRED) != 0)
4279 newpte |= PG_W;
4280 if (va < VM_MAXUSER_ADDRESS)
4281 newpte |= PG_U;
4282 if (pmap == kernel_pmap)
4283 newpte |= PG_G;
4284 newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0);
4285
4286 /*
4287 * Set modified bit gratuitously for writeable mappings if
4288 * the page is unmanaged. We do not want to take a fault
4289 * to do the dirty bit accounting for these mappings.
4290 */
4291 if ((m->oflags & VPO_UNMANAGED) != 0) {
4292 if ((newpte & PG_RW) != 0)
4293 newpte |= PG_M;
4294 }
4295
4296 mpte = NULL;
4297
4298 lock = NULL;
4299 rw_rlock(&pvh_global_lock);
4300 PMAP_LOCK(pmap);
4301
4302 /*
4303 * In the case that a page table page is not
4304 * resident, we are creating it here.
4305 */
4306 retry:
4307 pde = pmap_pde(pmap, va);
4308 if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
4309 pmap_demote_pde_locked(pmap, pde, va, &lock))) {
4310 pte = pmap_pde_to_pte(pde, va);
4311 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
4312 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4313 mpte->wire_count++;
4314 }
4315 } else if (va < VM_MAXUSER_ADDRESS) {
4316 /*
4317 * Here if the pte page isn't mapped, or if it has been
4318 * deallocated.
4319 */
4320 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
4321 mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va),
4322 nosleep ? NULL : &lock);
4323 if (mpte == NULL && nosleep) {
4324 if (lock != NULL)
4325 rw_wunlock(lock);
4326 rw_runlock(&pvh_global_lock);
4327 PMAP_UNLOCK(pmap);
4328 return (KERN_RESOURCE_SHORTAGE);
4329 }
4330 goto retry;
4331 } else
4332 panic("pmap_enter: invalid page directory va=%#lx", va);
4333
4334 origpte = *pte;
4335
4336 /*
4337 * Is the specified virtual address already mapped?
4338 */
4339 if ((origpte & PG_V) != 0) {
4340 /*
4341 * Wiring change, just update stats. We don't worry about
4342 * wiring PT pages as they remain resident as long as there
4343 * are valid mappings in them. Hence, if a user page is wired,
4344 * the PT page will be also.
4345 */
4346 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
4347 pmap->pm_stats.wired_count++;
4348 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
4349 pmap->pm_stats.wired_count--;
4350
4351 /*
4352 * Remove the extra PT page reference.
4353 */
4354 if (mpte != NULL) {
4355 mpte->wire_count--;
4356 KASSERT(mpte->wire_count > 0,
4357 ("pmap_enter: missing reference to page table page,"
4358 " va: 0x%lx", va));
4359 }
4360
4361 /*
4362 * Has the physical page changed?
4363 */
4364 opa = origpte & PG_FRAME;
4365 if (opa == pa) {
4366 /*
4367 * No, might be a protection or wiring change.
4368 */
4369 if ((origpte & PG_MANAGED) != 0) {
4370 newpte |= PG_MANAGED;
4371 if ((newpte & PG_RW) != 0)
4372 vm_page_aflag_set(m, PGA_WRITEABLE);
4373 }
4374 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
4375 goto unchanged;
4376 goto validate;
4377 }
4378 } else {
4379 /*
4380 * Increment the counters.
4381 */
4382 if ((newpte & PG_W) != 0)
4383 pmap->pm_stats.wired_count++;
4384 pmap_resident_count_inc(pmap, 1);
4385 }
4386
4387 /*
4388 * Enter on the PV list if part of our managed memory.
4389 */
4390 if ((m->oflags & VPO_UNMANAGED) == 0) {
4391 newpte |= PG_MANAGED;
4392 pv = get_pv_entry(pmap, &lock);
4393 pv->pv_va = va;
4394 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
4395 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4396 m->md.pv_gen++;
4397 if ((newpte & PG_RW) != 0)
4398 vm_page_aflag_set(m, PGA_WRITEABLE);
4399 }
4400
4401 /*
4402 * Update the PTE.
4403 */
4404 if ((origpte & PG_V) != 0) {
4405 validate:
4406 origpte = pte_load_store(pte, newpte);
4407 opa = origpte & PG_FRAME;
4408 if (opa != pa) {
4409 if ((origpte & PG_MANAGED) != 0) {
4410 om = PHYS_TO_VM_PAGE(opa);
4411 if ((origpte & (PG_M | PG_RW)) == (PG_M |
4412 PG_RW))
4413 vm_page_dirty(om);
4414 if ((origpte & PG_A) != 0)
4415 vm_page_aflag_set(om, PGA_REFERENCED);
4416 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
4417 pmap_pvh_free(&om->md, pmap, va);
4418 if ((om->aflags & PGA_WRITEABLE) != 0 &&
4419 TAILQ_EMPTY(&om->md.pv_list) &&
4420 ((om->flags & PG_FICTITIOUS) != 0 ||
4421 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
4422 vm_page_aflag_clear(om, PGA_WRITEABLE);
4423 }
4424 } else if ((newpte & PG_M) == 0 && (origpte & (PG_M |
4425 PG_RW)) == (PG_M | PG_RW)) {
4426 if ((origpte & PG_MANAGED) != 0)
4427 vm_page_dirty(m);
4428
4429 /*
4430 * Although the PTE may still have PG_RW set, TLB
4431 * invalidation may nonetheless be required because
4432 * the PTE no longer has PG_M set.
4433 */
4434 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
4435 /*
4436 * This PTE change does not require TLB invalidation.
4437 */
4438 goto unchanged;
4439 }
4440 if ((origpte & PG_A) != 0)
4441 pmap_invalidate_page(pmap, va);
4442 } else
4443 pte_store(pte, newpte);
4444
4445 unchanged:
4446
4447 /*
4448 * If both the page table page and the reservation are fully
4449 * populated, then attempt promotion.
4450 */
4451 if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
4452 pmap_ps_enabled(pmap) &&
4453 (m->flags & PG_FICTITIOUS) == 0 &&
4454 vm_reserv_level_iffullpop(m) == 0)
4455 pmap_promote_pde(pmap, pde, va, &lock);
4456
4457 if (lock != NULL)
4458 rw_wunlock(lock);
4459 rw_runlock(&pvh_global_lock);
4460 PMAP_UNLOCK(pmap);
4461 return (KERN_SUCCESS);
4462 }
4463
4464 /*
4465 * Tries to create a 2MB page mapping. Returns TRUE if successful and FALSE
4466 * otherwise. Fails if (1) a page table page cannot be allocated without
4467 * blocking, (2) a mapping already exists at the specified virtual address, or
4468 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
4469 */
4470 static boolean_t
4471 pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4472 struct rwlock **lockp)
4473 {
4474 pd_entry_t *pde, newpde;
4475 pt_entry_t PG_V;
4476 vm_page_t mpde;
4477 struct spglist free;
4478
4479 PG_V = pmap_valid_bit(pmap);
4480 rw_assert(&pvh_global_lock, RA_LOCKED);
4481 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4482
4483 if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) {
4484 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4485 " in pmap %p", va, pmap);
4486 return (FALSE);
4487 }
4488 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde));
4489 pde = &pde[pmap_pde_index(va)];
4490 if ((*pde & PG_V) != 0) {
4491 KASSERT(mpde->wire_count > 1,
4492 ("pmap_enter_pde: mpde's wire count is too low"));
4493 mpde->wire_count--;
4494 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4495 " in pmap %p", va, pmap);
4496 return (FALSE);
4497 }
4498 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
4499 PG_PS | PG_V;
4500 if ((m->oflags & VPO_UNMANAGED) == 0) {
4501 newpde |= PG_MANAGED;
4502
4503 /*
4504 * Abort this mapping if its PV entry could not be created.
4505 */
4506 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m),
4507 lockp)) {
4508 SLIST_INIT(&free);
4509 if (pmap_unwire_ptp(pmap, va, mpde, &free)) {
4510 pmap_invalidate_page(pmap, va);
4511 pmap_free_zero_pages(&free);
4512 }
4513 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
4514 " in pmap %p", va, pmap);
4515 return (FALSE);
4516 }
4517 }
4518 if ((prot & VM_PROT_EXECUTE) == 0)
4519 newpde |= pg_nx;
4520 if (va < VM_MAXUSER_ADDRESS)
4521 newpde |= PG_U;
4522
4523 /*
4524 * Increment counters.
4525 */
4526 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
4527
4528 /*
4529 * Map the superpage. (This is not a promoted mapping; there will not
4530 * be any lingering 4KB page mappings in the TLB.)
4531 */
4532 pde_store(pde, newpde);
4533
4534 atomic_add_long(&pmap_pde_mappings, 1);
4535 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
4536 " in pmap %p", va, pmap);
4537 return (TRUE);
4538 }
4539
4540 /*
4541 * Maps a sequence of resident pages belonging to the same object.
4542 * The sequence begins with the given page m_start. This page is
4543 * mapped at the given virtual address start. Each subsequent page is
4544 * mapped at a virtual address that is offset from start by the same
4545 * amount as the page is offset from m_start within the object. The
4546 * last page in the sequence is the page with the largest offset from
4547 * m_start that can be mapped at a virtual address less than the given
4548 * virtual address end. Not every virtual page between start and end
4549 * is mapped; only those for which a resident page exists with the
4550 * corresponding offset from m_start are mapped.
4551 */
4552 void
4553 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
4554 vm_page_t m_start, vm_prot_t prot)
4555 {
4556 struct rwlock *lock;
4557 vm_offset_t va;
4558 vm_page_t m, mpte;
4559 vm_pindex_t diff, psize;
4560
4561 VM_OBJECT_ASSERT_LOCKED(m_start->object);
4562
4563 psize = atop(end - start);
4564 mpte = NULL;
4565 m = m_start;
4566 lock = NULL;
4567 rw_rlock(&pvh_global_lock);
4568 PMAP_LOCK(pmap);
4569 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
4570 va = start + ptoa(diff);
4571 if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
4572 m->psind == 1 && pmap_ps_enabled(pmap) &&
4573 pmap_enter_pde(pmap, va, m, prot, &lock))
4574 m = &m[NBPDR / PAGE_SIZE - 1];
4575 else
4576 mpte = pmap_enter_quick_locked(pmap, va, m, prot,
4577 mpte, &lock);
4578 m = TAILQ_NEXT(m, listq);
4579 }
4580 if (lock != NULL)
4581 rw_wunlock(lock);
4582 rw_runlock(&pvh_global_lock);
4583 PMAP_UNLOCK(pmap);
4584 }
4585
4586 /*
4587 * this code makes some *MAJOR* assumptions:
4588 * 1. Current pmap & pmap exists.
4589 * 2. Not wired.
4590 * 3. Read access.
4591 * 4. No page table pages.
4592 * but is *MUCH* faster than pmap_enter...
4593 */
4594
4595 void
4596 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
4597 {
4598 struct rwlock *lock;
4599
4600 lock = NULL;
4601 rw_rlock(&pvh_global_lock);
4602 PMAP_LOCK(pmap);
4603 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
4604 if (lock != NULL)
4605 rw_wunlock(lock);
4606 rw_runlock(&pvh_global_lock);
4607 PMAP_UNLOCK(pmap);
4608 }
4609
4610 static vm_page_t
4611 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
4612 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
4613 {
4614 struct spglist free;
4615 pt_entry_t *pte, PG_V;
4616 vm_paddr_t pa;
4617
4618 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
4619 (m->oflags & VPO_UNMANAGED) != 0,
4620 ("pmap_enter_quick_locked: managed mapping within the clean submap"));
4621 PG_V = pmap_valid_bit(pmap);
4622 rw_assert(&pvh_global_lock, RA_LOCKED);
4623 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4624
4625 /*
4626 * In the case that a page table page is not
4627 * resident, we are creating it here.
4628 */
4629 if (va < VM_MAXUSER_ADDRESS) {
4630 vm_pindex_t ptepindex;
4631 pd_entry_t *ptepa;
4632
4633 /*
4634 * Calculate pagetable page index
4635 */
4636 ptepindex = pmap_pde_pindex(va);
4637 if (mpte && (mpte->pindex == ptepindex)) {
4638 mpte->wire_count++;
4639 } else {
4640 /*
4641 * Get the page directory entry
4642 */
4643 ptepa = pmap_pde(pmap, va);
4644
4645 /*
4646 * If the page table page is mapped, we just increment
4647 * the hold count, and activate it. Otherwise, we
4648 * attempt to allocate a page table page. If this
4649 * attempt fails, we don't retry. Instead, we give up.
4650 */
4651 if (ptepa && (*ptepa & PG_V) != 0) {
4652 if (*ptepa & PG_PS)
4653 return (NULL);
4654 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
4655 mpte->wire_count++;
4656 } else {
4657 /*
4658 * Pass NULL instead of the PV list lock
4659 * pointer, because we don't intend to sleep.
4660 */
4661 mpte = _pmap_allocpte(pmap, ptepindex, NULL);
4662 if (mpte == NULL)
4663 return (mpte);
4664 }
4665 }
4666 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
4667 pte = &pte[pmap_pte_index(va)];
4668 } else {
4669 mpte = NULL;
4670 pte = vtopte(va);
4671 }
4672 if (*pte) {
4673 if (mpte != NULL) {
4674 mpte->wire_count--;
4675 mpte = NULL;
4676 }
4677 return (mpte);
4678 }
4679
4680 /*
4681 * Enter on the PV list if part of our managed memory.
4682 */
4683 if ((m->oflags & VPO_UNMANAGED) == 0 &&
4684 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
4685 if (mpte != NULL) {
4686 SLIST_INIT(&free);
4687 if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
4688 pmap_invalidate_page(pmap, va);
4689 pmap_free_zero_pages(&free);
4690 }
4691 mpte = NULL;
4692 }
4693 return (mpte);
4694 }
4695
4696 /*
4697 * Increment counters
4698 */
4699 pmap_resident_count_inc(pmap, 1);
4700
4701 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0);
4702 if ((prot & VM_PROT_EXECUTE) == 0)
4703 pa |= pg_nx;
4704
4705 /*
4706 * Now validate mapping with RO protection
4707 */
4708 if ((m->oflags & VPO_UNMANAGED) != 0)
4709 pte_store(pte, pa | PG_V | PG_U);
4710 else
4711 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
4712 return (mpte);
4713 }
4714
4715 /*
4716 * Make a temporary mapping for a physical address. This is only intended
4717 * to be used for panic dumps.
4718 */
4719 void *
4720 pmap_kenter_temporary(vm_paddr_t pa, int i)
4721 {
4722 vm_offset_t va;
4723
4724 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
4725 pmap_kenter(va, pa);
4726 invlpg(va);
4727 return ((void *)crashdumpmap);
4728 }
4729
4730 /*
4731 * This code maps large physical mmap regions into the
4732 * processor address space. Note that some shortcuts
4733 * are taken, but the code works.
4734 */
4735 void
4736 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
4737 vm_pindex_t pindex, vm_size_t size)
4738 {
4739 pd_entry_t *pde;
4740 pt_entry_t PG_A, PG_M, PG_RW, PG_V;
4741 vm_paddr_t pa, ptepa;
4742 vm_page_t p, pdpg;
4743 int pat_mode;
4744
4745 PG_A = pmap_accessed_bit(pmap);
4746 PG_M = pmap_modified_bit(pmap);
4747 PG_V = pmap_valid_bit(pmap);
4748 PG_RW = pmap_rw_bit(pmap);
4749
4750 VM_OBJECT_ASSERT_WLOCKED(object);
4751 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
4752 ("pmap_object_init_pt: non-device object"));
4753 if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
4754 if (!pmap_ps_enabled(pmap))
4755 return;
4756 if (!vm_object_populate(object, pindex, pindex + atop(size)))
4757 return;
4758 p = vm_page_lookup(object, pindex);
4759 KASSERT(p->valid == VM_PAGE_BITS_ALL,
4760 ("pmap_object_init_pt: invalid page %p", p));
4761 pat_mode = p->md.pat_mode;
4762
4763 /*
4764 * Abort the mapping if the first page is not physically
4765 * aligned to a 2MB page boundary.
4766 */
4767 ptepa = VM_PAGE_TO_PHYS(p);
4768 if (ptepa & (NBPDR - 1))
4769 return;
4770
4771 /*
4772 * Skip the first page. Abort the mapping if the rest of
4773 * the pages are not physically contiguous or have differing
4774 * memory attributes.
4775 */
4776 p = TAILQ_NEXT(p, listq);
4777 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
4778 pa += PAGE_SIZE) {
4779 KASSERT(p->valid == VM_PAGE_BITS_ALL,
4780 ("pmap_object_init_pt: invalid page %p", p));
4781 if (pa != VM_PAGE_TO_PHYS(p) ||
4782 pat_mode != p->md.pat_mode)
4783 return;
4784 p = TAILQ_NEXT(p, listq);
4785 }
4786
4787 /*
4788 * Map using 2MB pages. Since "ptepa" is 2M aligned and
4789 * "size" is a multiple of 2M, adding the PAT setting to "pa"
4790 * will not affect the termination of this loop.
4791 */
4792 PMAP_LOCK(pmap);
4793 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
4794 pa < ptepa + size; pa += NBPDR) {
4795 pdpg = pmap_allocpde(pmap, addr, NULL);
4796 if (pdpg == NULL) {
4797 /*
4798 * The creation of mappings below is only an
4799 * optimization. If a page directory page
4800 * cannot be allocated without blocking,
4801 * continue on to the next mapping rather than
4802 * blocking.
4803 */
4804 addr += NBPDR;
4805 continue;
4806 }
4807 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
4808 pde = &pde[pmap_pde_index(addr)];
4809 if ((*pde & PG_V) == 0) {
4810 pde_store(pde, pa | PG_PS | PG_M | PG_A |
4811 PG_U | PG_RW | PG_V);
4812 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
4813 atomic_add_long(&pmap_pde_mappings, 1);
4814 } else {
4815 /* Continue on if the PDE is already valid. */
4816 pdpg->wire_count--;
4817 KASSERT(pdpg->wire_count > 0,
4818 ("pmap_object_init_pt: missing reference "
4819 "to page directory page, va: 0x%lx", addr));
4820 }
4821 addr += NBPDR;
4822 }
4823 PMAP_UNLOCK(pmap);
4824 }
4825 }
4826
4827 /*
4828 * Clear the wired attribute from the mappings for the specified range of
4829 * addresses in the given pmap. Every valid mapping within that range
4830 * must have the wired attribute set. In contrast, invalid mappings
4831 * cannot have the wired attribute set, so they are ignored.
4832 *
4833 * The wired attribute of the page table entry is not a hardware feature,
4834 * so there is no need to invalidate any TLB entries.
4835 */
4836 void
4837 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4838 {
4839 vm_offset_t va_next;
4840 pml4_entry_t *pml4e;
4841 pdp_entry_t *pdpe;
4842 pd_entry_t *pde;
4843 pt_entry_t *pte, PG_V;
4844 boolean_t pv_lists_locked;
4845
4846 PG_V = pmap_valid_bit(pmap);
4847 pv_lists_locked = FALSE;
4848 resume:
4849 PMAP_LOCK(pmap);
4850 for (; sva < eva; sva = va_next) {
4851 pml4e = pmap_pml4e(pmap, sva);
4852 if ((*pml4e & PG_V) == 0) {
4853 va_next = (sva + NBPML4) & ~PML4MASK;
4854 if (va_next < sva)
4855 va_next = eva;
4856 continue;
4857 }
4858 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
4859 if ((*pdpe & PG_V) == 0) {
4860 va_next = (sva + NBPDP) & ~PDPMASK;
4861 if (va_next < sva)
4862 va_next = eva;
4863 continue;
4864 }
4865 va_next = (sva + NBPDR) & ~PDRMASK;
4866 if (va_next < sva)
4867 va_next = eva;
4868 pde = pmap_pdpe_to_pde(pdpe, sva);
4869 if ((*pde & PG_V) == 0)
4870 continue;
4871 if ((*pde & PG_PS) != 0) {
4872 if ((*pde & PG_W) == 0)
4873 panic("pmap_unwire: pde %#jx is missing PG_W",
4874 (uintmax_t)*pde);
4875
4876 /*
4877 * Are we unwiring the entire large page? If not,
4878 * demote the mapping and fall through.
4879 */
4880 if (sva + NBPDR == va_next && eva >= va_next) {
4881 atomic_clear_long(pde, PG_W);
4882 pmap->pm_stats.wired_count -= NBPDR /
4883 PAGE_SIZE;
4884 continue;
4885 } else {
4886 if (!pv_lists_locked) {
4887 pv_lists_locked = TRUE;
4888 if (!rw_try_rlock(&pvh_global_lock)) {
4889 PMAP_UNLOCK(pmap);
4890 rw_rlock(&pvh_global_lock);
4891 /* Repeat sva. */
4892 goto resume;
4893 }
4894 }
4895 if (!pmap_demote_pde(pmap, pde, sva))
4896 panic("pmap_unwire: demotion failed");
4897 }
4898 }
4899 if (va_next > eva)
4900 va_next = eva;
4901 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
4902 sva += PAGE_SIZE) {
4903 if ((*pte & PG_V) == 0)
4904 continue;
4905 if ((*pte & PG_W) == 0)
4906 panic("pmap_unwire: pte %#jx is missing PG_W",
4907 (uintmax_t)*pte);
4908
4909 /*
4910 * PG_W must be cleared atomically. Although the pmap
4911 * lock synchronizes access to PG_W, another processor
4912 * could be setting PG_M and/or PG_A concurrently.
4913 */
4914 atomic_clear_long(pte, PG_W);
4915 pmap->pm_stats.wired_count--;
4916 }
4917 }
4918 if (pv_lists_locked)
4919 rw_runlock(&pvh_global_lock);
4920 PMAP_UNLOCK(pmap);
4921 }
4922
4923 /*
4924 * Copy the range specified by src_addr/len
4925 * from the source map to the range dst_addr/len
4926 * in the destination map.
4927 *
4928 * This routine is only advisory and need not do anything.
4929 */
4930
4931 void
4932 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4933 vm_offset_t src_addr)
4934 {
4935 struct rwlock *lock;
4936 struct spglist free;
4937 vm_offset_t addr;
4938 vm_offset_t end_addr = src_addr + len;
4939 vm_offset_t va_next;
4940 pt_entry_t PG_A, PG_M, PG_V;
4941
4942 if (dst_addr != src_addr)
4943 return;
4944
4945 if (dst_pmap->pm_type != src_pmap->pm_type)
4946 return;
4947
4948 /*
4949 * EPT page table entries that require emulation of A/D bits are
4950 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
4951 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
4952 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
4953 * implementations flag an EPT misconfiguration for exec-only
4954 * mappings we skip this function entirely for emulated pmaps.
4955 */
4956 if (pmap_emulate_ad_bits(dst_pmap))
4957 return;
4958
4959 lock = NULL;
4960 rw_rlock(&pvh_global_lock);
4961 if (dst_pmap < src_pmap) {
4962 PMAP_LOCK(dst_pmap);
4963 PMAP_LOCK(src_pmap);
4964 } else {
4965 PMAP_LOCK(src_pmap);
4966 PMAP_LOCK(dst_pmap);
4967 }
4968
4969 PG_A = pmap_accessed_bit(dst_pmap);
4970 PG_M = pmap_modified_bit(dst_pmap);
4971 PG_V = pmap_valid_bit(dst_pmap);
4972
4973 for (addr = src_addr; addr < end_addr; addr = va_next) {
4974 pt_entry_t *src_pte, *dst_pte;
4975 vm_page_t dstmpde, dstmpte, srcmpte;
4976 pml4_entry_t *pml4e;
4977 pdp_entry_t *pdpe;
4978 pd_entry_t srcptepaddr, *pde;
4979
4980 KASSERT(addr < UPT_MIN_ADDRESS,
4981 ("pmap_copy: invalid to pmap_copy page tables"));
4982
4983 pml4e = pmap_pml4e(src_pmap, addr);
4984 if ((*pml4e & PG_V) == 0) {
4985 va_next = (addr + NBPML4) & ~PML4MASK;
4986 if (va_next < addr)
4987 va_next = end_addr;
4988 continue;
4989 }
4990
4991 pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
4992 if ((*pdpe & PG_V) == 0) {
4993 va_next = (addr + NBPDP) & ~PDPMASK;
4994 if (va_next < addr)
4995 va_next = end_addr;
4996 continue;
4997 }
4998
4999 va_next = (addr + NBPDR) & ~PDRMASK;
5000 if (va_next < addr)
5001 va_next = end_addr;
5002
5003 pde = pmap_pdpe_to_pde(pdpe, addr);
5004 srcptepaddr = *pde;
5005 if (srcptepaddr == 0)
5006 continue;
5007
5008 if (srcptepaddr & PG_PS) {
5009 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
5010 continue;
5011 dstmpde = pmap_allocpde(dst_pmap, addr, NULL);
5012 if (dstmpde == NULL)
5013 break;
5014 pde = (pd_entry_t *)
5015 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
5016 pde = &pde[pmap_pde_index(addr)];
5017 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
5018 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
5019 PG_PS_FRAME, &lock))) {
5020 *pde = srcptepaddr & ~PG_W;
5021 pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE);
5022 } else
5023 dstmpde->wire_count--;
5024 continue;
5025 }
5026
5027 srcptepaddr &= PG_FRAME;
5028 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
5029 KASSERT(srcmpte->wire_count > 0,
5030 ("pmap_copy: source page table page is unused"));
5031
5032 if (va_next > end_addr)
5033 va_next = end_addr;
5034
5035 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
5036 src_pte = &src_pte[pmap_pte_index(addr)];
5037 dstmpte = NULL;
5038 while (addr < va_next) {
5039 pt_entry_t ptetemp;
5040 ptetemp = *src_pte;
5041 /*
5042 * we only virtual copy managed pages
5043 */
5044 if ((ptetemp & PG_MANAGED) != 0) {
5045 if (dstmpte != NULL &&
5046 dstmpte->pindex == pmap_pde_pindex(addr))
5047 dstmpte->wire_count++;
5048 else if ((dstmpte = pmap_allocpte(dst_pmap,
5049 addr, NULL)) == NULL)
5050 goto out;
5051 dst_pte = (pt_entry_t *)
5052 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
5053 dst_pte = &dst_pte[pmap_pte_index(addr)];
5054 if (*dst_pte == 0 &&
5055 pmap_try_insert_pv_entry(dst_pmap, addr,
5056 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
5057 &lock)) {
5058 /*
5059 * Clear the wired, modified, and
5060 * accessed (referenced) bits
5061 * during the copy.
5062 */
5063 *dst_pte = ptetemp & ~(PG_W | PG_M |
5064 PG_A);
5065 pmap_resident_count_inc(dst_pmap, 1);
5066 } else {
5067 SLIST_INIT(&free);
5068 if (pmap_unwire_ptp(dst_pmap, addr,
5069 dstmpte, &free)) {
5070 pmap_invalidate_page(dst_pmap,
5071 addr);
5072 pmap_free_zero_pages(&free);
5073 }
5074 goto out;
5075 }
5076 if (dstmpte->wire_count >= srcmpte->wire_count)
5077 break;
5078 }
5079 addr += PAGE_SIZE;
5080 src_pte++;
5081 }
5082 }
5083 out:
5084 if (lock != NULL)
5085 rw_wunlock(lock);
5086 rw_runlock(&pvh_global_lock);
5087 PMAP_UNLOCK(src_pmap);
5088 PMAP_UNLOCK(dst_pmap);
5089 }
5090
5091 /*
5092 * pmap_zero_page zeros the specified hardware page by mapping
5093 * the page into KVM and using bzero to clear its contents.
5094 */
5095 void
5096 pmap_zero_page(vm_page_t m)
5097 {
5098 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5099
5100 pagezero((void *)va);
5101 }
5102
5103 /*
5104 * pmap_zero_page_area zeros the specified hardware page by mapping
5105 * the page into KVM and using bzero to clear its contents.
5106 *
5107 * off and size may not cover an area beyond a single hardware page.
5108 */
5109 void
5110 pmap_zero_page_area(vm_page_t m, int off, int size)
5111 {
5112 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5113
5114 if (off == 0 && size == PAGE_SIZE)
5115 pagezero((void *)va);
5116 else
5117 bzero((char *)va + off, size);
5118 }
5119
5120 /*
5121 * pmap_zero_page_idle zeros the specified hardware page by mapping
5122 * the page into KVM and using bzero to clear its contents. This
5123 * is intended to be called from the vm_pagezero process only and
5124 * outside of Giant.
5125 */
5126 void
5127 pmap_zero_page_idle(vm_page_t m)
5128 {
5129 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5130
5131 pagezero((void *)va);
5132 }
5133
5134 /*
5135 * pmap_copy_page copies the specified (machine independent)
5136 * page by mapping the page into virtual memory and using
5137 * bcopy to copy the page, one machine dependent page at a
5138 * time.
5139 */
5140 void
5141 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
5142 {
5143 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
5144 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
5145
5146 pagecopy((void *)src, (void *)dst);
5147 }
5148
5149 int unmapped_buf_allowed = 1;
5150
5151 void
5152 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
5153 vm_offset_t b_offset, int xfersize)
5154 {
5155 void *a_cp, *b_cp;
5156 vm_page_t m_a, m_b;
5157 vm_paddr_t p_a, p_b;
5158 pt_entry_t *pte;
5159 vm_offset_t a_pg_offset, b_pg_offset;
5160 int cnt;
5161 boolean_t pinned;
5162
5163 /*
5164 * NB: The sequence of updating a page table followed by accesses
5165 * to the corresponding pages used in the !DMAP case is subject to
5166 * the situation described in the "AMD64 Architecture Programmer's
5167 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
5168 * Coherency Considerations". Therefore, issuing the INVLPG right
5169 * after modifying the PTE bits is crucial.
5170 */
5171 pinned = FALSE;
5172 while (xfersize > 0) {
5173 a_pg_offset = a_offset & PAGE_MASK;
5174 m_a = ma[a_offset >> PAGE_SHIFT];
5175 p_a = m_a->phys_addr;
5176 b_pg_offset = b_offset & PAGE_MASK;
5177 m_b = mb[b_offset >> PAGE_SHIFT];
5178 p_b = m_b->phys_addr;
5179 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
5180 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
5181 if (__predict_false(p_a < DMAP_MIN_ADDRESS ||
5182 p_a > DMAP_MIN_ADDRESS + dmaplimit)) {
5183 mtx_lock(&cpage_lock);
5184 sched_pin();
5185 pinned = TRUE;
5186 pte = vtopte(cpage_a);
5187 *pte = p_a | X86_PG_A | X86_PG_V |
5188 pmap_cache_bits(kernel_pmap, m_a->md.pat_mode, 0);
5189 invlpg(cpage_a);
5190 a_cp = (char *)cpage_a + a_pg_offset;
5191 } else {
5192 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
5193 }
5194 if (__predict_false(p_b < DMAP_MIN_ADDRESS ||
5195 p_b > DMAP_MIN_ADDRESS + dmaplimit)) {
5196 if (!pinned) {
5197 mtx_lock(&cpage_lock);
5198 sched_pin();
5199 pinned = TRUE;
5200 }
5201 pte = vtopte(cpage_b);
5202 *pte = p_b | X86_PG_A | X86_PG_M | X86_PG_RW |
5203 X86_PG_V | pmap_cache_bits(kernel_pmap,
5204 m_b->md.pat_mode, 0);
5205 invlpg(cpage_b);
5206 b_cp = (char *)cpage_b + b_pg_offset;
5207 } else {
5208 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
5209 }
5210 bcopy(a_cp, b_cp, cnt);
5211 if (__predict_false(pinned)) {
5212 sched_unpin();
5213 mtx_unlock(&cpage_lock);
5214 pinned = FALSE;
5215 }
5216 a_offset += cnt;
5217 b_offset += cnt;
5218 xfersize -= cnt;
5219 }
5220 }
5221
5222 /*
5223 * Returns true if the pmap's pv is one of the first
5224 * 16 pvs linked to from this page. This count may
5225 * be changed upwards or downwards in the future; it
5226 * is only necessary that true be returned for a small
5227 * subset of pmaps for proper page aging.
5228 */
5229 boolean_t
5230 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
5231 {
5232 struct md_page *pvh;
5233 struct rwlock *lock;
5234 pv_entry_t pv;
5235 int loops = 0;
5236 boolean_t rv;
5237
5238 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5239 ("pmap_page_exists_quick: page %p is not managed", m));
5240 rv = FALSE;
5241 rw_rlock(&pvh_global_lock);
5242 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5243 rw_rlock(lock);
5244 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5245 if (PV_PMAP(pv) == pmap) {
5246 rv = TRUE;
5247 break;
5248 }
5249 loops++;
5250 if (loops >= 16)
5251 break;
5252 }
5253 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
5254 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5255 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5256 if (PV_PMAP(pv) == pmap) {
5257 rv = TRUE;
5258 break;
5259 }
5260 loops++;
5261 if (loops >= 16)
5262 break;
5263 }
5264 }
5265 rw_runlock(lock);
5266 rw_runlock(&pvh_global_lock);
5267 return (rv);
5268 }
5269
5270 /*
5271 * pmap_page_wired_mappings:
5272 *
5273 * Return the number of managed mappings to the given physical page
5274 * that are wired.
5275 */
5276 int
5277 pmap_page_wired_mappings(vm_page_t m)
5278 {
5279 struct rwlock *lock;
5280 struct md_page *pvh;
5281 pmap_t pmap;
5282 pt_entry_t *pte;
5283 pv_entry_t pv;
5284 int count, md_gen, pvh_gen;
5285
5286 if ((m->oflags & VPO_UNMANAGED) != 0)
5287 return (0);
5288 rw_rlock(&pvh_global_lock);
5289 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5290 rw_rlock(lock);
5291 restart:
5292 count = 0;
5293 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5294 pmap = PV_PMAP(pv);
5295 if (!PMAP_TRYLOCK(pmap)) {
5296 md_gen = m->md.pv_gen;
5297 rw_runlock(lock);
5298 PMAP_LOCK(pmap);
5299 rw_rlock(lock);
5300 if (md_gen != m->md.pv_gen) {
5301 PMAP_UNLOCK(pmap);
5302 goto restart;
5303 }
5304 }
5305 pte = pmap_pte(pmap, pv->pv_va);
5306 if ((*pte & PG_W) != 0)
5307 count++;
5308 PMAP_UNLOCK(pmap);
5309 }
5310 if ((m->flags & PG_FICTITIOUS) == 0) {
5311 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5312 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5313 pmap = PV_PMAP(pv);
5314 if (!PMAP_TRYLOCK(pmap)) {
5315 md_gen = m->md.pv_gen;
5316 pvh_gen = pvh->pv_gen;
5317 rw_runlock(lock);
5318 PMAP_LOCK(pmap);
5319 rw_rlock(lock);
5320 if (md_gen != m->md.pv_gen ||
5321 pvh_gen != pvh->pv_gen) {
5322 PMAP_UNLOCK(pmap);
5323 goto restart;
5324 }
5325 }
5326 pte = pmap_pde(pmap, pv->pv_va);
5327 if ((*pte & PG_W) != 0)
5328 count++;
5329 PMAP_UNLOCK(pmap);
5330 }
5331 }
5332 rw_runlock(lock);
5333 rw_runlock(&pvh_global_lock);
5334 return (count);
5335 }
5336
5337 /*
5338 * Returns TRUE if the given page is mapped individually or as part of
5339 * a 2mpage. Otherwise, returns FALSE.
5340 */
5341 boolean_t
5342 pmap_page_is_mapped(vm_page_t m)
5343 {
5344 struct rwlock *lock;
5345 boolean_t rv;
5346
5347 if ((m->oflags & VPO_UNMANAGED) != 0)
5348 return (FALSE);
5349 rw_rlock(&pvh_global_lock);
5350 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5351 rw_rlock(lock);
5352 rv = !TAILQ_EMPTY(&m->md.pv_list) ||
5353 ((m->flags & PG_FICTITIOUS) == 0 &&
5354 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
5355 rw_runlock(lock);
5356 rw_runlock(&pvh_global_lock);
5357 return (rv);
5358 }
5359
5360 /*
5361 * Destroy all managed, non-wired mappings in the given user-space
5362 * pmap. This pmap cannot be active on any processor besides the
5363 * caller.
5364 *
5365 * This function cannot be applied to the kernel pmap. Moreover, it
5366 * is not intended for general use. It is only to be used during
5367 * process termination. Consequently, it can be implemented in ways
5368 * that make it faster than pmap_remove(). First, it can more quickly
5369 * destroy mappings by iterating over the pmap's collection of PV
5370 * entries, rather than searching the page table. Second, it doesn't
5371 * have to test and clear the page table entries atomically, because
5372 * no processor is currently accessing the user address space. In
5373 * particular, a page table entry's dirty bit won't change state once
5374 * this function starts.
5375 */
5376 void
5377 pmap_remove_pages(pmap_t pmap)
5378 {
5379 pd_entry_t ptepde;
5380 pt_entry_t *pte, tpte;
5381 pt_entry_t PG_M, PG_RW, PG_V;
5382 struct spglist free;
5383 vm_page_t m, mpte, mt;
5384 pv_entry_t pv;
5385 struct md_page *pvh;
5386 struct pv_chunk *pc, *npc;
5387 struct rwlock *lock;
5388 int64_t bit;
5389 uint64_t inuse, bitmask;
5390 int allfree, field, freed, idx;
5391 boolean_t superpage;
5392 vm_paddr_t pa;
5393
5394 /*
5395 * Assert that the given pmap is only active on the current
5396 * CPU. Unfortunately, we cannot block another CPU from
5397 * activating the pmap while this function is executing.
5398 */
5399 KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
5400 #ifdef INVARIANTS
5401 {
5402 cpuset_t other_cpus;
5403
5404 other_cpus = all_cpus;
5405 critical_enter();
5406 CPU_CLR(PCPU_GET(cpuid), &other_cpus);
5407 CPU_AND(&other_cpus, &pmap->pm_active);
5408 critical_exit();
5409 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
5410 }
5411 #endif
5412
5413 lock = NULL;
5414 PG_M = pmap_modified_bit(pmap);
5415 PG_V = pmap_valid_bit(pmap);
5416 PG_RW = pmap_rw_bit(pmap);
5417
5418 SLIST_INIT(&free);
5419 rw_rlock(&pvh_global_lock);
5420 PMAP_LOCK(pmap);
5421 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
5422 allfree = 1;
5423 freed = 0;
5424 for (field = 0; field < _NPCM; field++) {
5425 inuse = ~pc->pc_map[field] & pc_freemask[field];
5426 while (inuse != 0) {
5427 bit = bsfq(inuse);
5428 bitmask = 1UL << bit;
5429 idx = field * 64 + bit;
5430 pv = &pc->pc_pventry[idx];
5431 inuse &= ~bitmask;
5432
5433 pte = pmap_pdpe(pmap, pv->pv_va);
5434 ptepde = *pte;
5435 pte = pmap_pdpe_to_pde(pte, pv->pv_va);
5436 tpte = *pte;
5437 if ((tpte & (PG_PS | PG_V)) == PG_V) {
5438 superpage = FALSE;
5439 ptepde = tpte;
5440 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
5441 PG_FRAME);
5442 pte = &pte[pmap_pte_index(pv->pv_va)];
5443 tpte = *pte;
5444 } else {
5445 /*
5446 * Keep track whether 'tpte' is a
5447 * superpage explicitly instead of
5448 * relying on PG_PS being set.
5449 *
5450 * This is because PG_PS is numerically
5451 * identical to PG_PTE_PAT and thus a
5452 * regular page could be mistaken for
5453 * a superpage.
5454 */
5455 superpage = TRUE;
5456 }
5457
5458 if ((tpte & PG_V) == 0) {
5459 panic("bad pte va %lx pte %lx",
5460 pv->pv_va, tpte);
5461 }
5462
5463 /*
5464 * We cannot remove wired pages from a process' mapping at this time
5465 */
5466 if (tpte & PG_W) {
5467 allfree = 0;
5468 continue;
5469 }
5470
5471 if (superpage)
5472 pa = tpte & PG_PS_FRAME;
5473 else
5474 pa = tpte & PG_FRAME;
5475
5476 m = PHYS_TO_VM_PAGE(pa);
5477 KASSERT(m->phys_addr == pa,
5478 ("vm_page_t %p phys_addr mismatch %016jx %016jx",
5479 m, (uintmax_t)m->phys_addr,
5480 (uintmax_t)tpte));
5481
5482 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
5483 m < &vm_page_array[vm_page_array_size],
5484 ("pmap_remove_pages: bad tpte %#jx",
5485 (uintmax_t)tpte));
5486
5487 pte_clear(pte);
5488
5489 /*
5490 * Update the vm_page_t clean/reference bits.
5491 */
5492 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5493 if (superpage) {
5494 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
5495 vm_page_dirty(mt);
5496 } else
5497 vm_page_dirty(m);
5498 }
5499
5500 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5501
5502 /* Mark free */
5503 pc->pc_map[field] |= bitmask;
5504 if (superpage) {
5505 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
5506 pvh = pa_to_pvh(tpte & PG_PS_FRAME);
5507 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
5508 pvh->pv_gen++;
5509 if (TAILQ_EMPTY(&pvh->pv_list)) {
5510 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
5511 if ((mt->aflags & PGA_WRITEABLE) != 0 &&
5512 TAILQ_EMPTY(&mt->md.pv_list))
5513 vm_page_aflag_clear(mt, PGA_WRITEABLE);
5514 }
5515 mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
5516 if (mpte != NULL) {
5517 pmap_remove_pt_page(pmap, mpte);
5518 pmap_resident_count_dec(pmap, 1);
5519 KASSERT(mpte->wire_count == NPTEPG,
5520 ("pmap_remove_pages: pte page wire count error"));
5521 mpte->wire_count = 0;
5522 pmap_add_delayed_free_list(mpte, &free, FALSE);
5523 atomic_subtract_int(&cnt.v_wire_count, 1);
5524 }
5525 } else {
5526 pmap_resident_count_dec(pmap, 1);
5527 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
5528 m->md.pv_gen++;
5529 if ((m->aflags & PGA_WRITEABLE) != 0 &&
5530 TAILQ_EMPTY(&m->md.pv_list) &&
5531 (m->flags & PG_FICTITIOUS) == 0) {
5532 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5533 if (TAILQ_EMPTY(&pvh->pv_list))
5534 vm_page_aflag_clear(m, PGA_WRITEABLE);
5535 }
5536 }
5537 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
5538 freed++;
5539 }
5540 }
5541 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
5542 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
5543 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
5544 if (allfree) {
5545 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5546 free_pv_chunk(pc);
5547 }
5548 }
5549 if (lock != NULL)
5550 rw_wunlock(lock);
5551 pmap_invalidate_all(pmap);
5552 rw_runlock(&pvh_global_lock);
5553 PMAP_UNLOCK(pmap);
5554 pmap_free_zero_pages(&free);
5555 }
5556
5557 static boolean_t
5558 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
5559 {
5560 struct rwlock *lock;
5561 pv_entry_t pv;
5562 struct md_page *pvh;
5563 pt_entry_t *pte, mask;
5564 pt_entry_t PG_A, PG_M, PG_RW, PG_V;
5565 pmap_t pmap;
5566 int md_gen, pvh_gen;
5567 boolean_t rv;
5568
5569 rv = FALSE;
5570 rw_rlock(&pvh_global_lock);
5571 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5572 rw_rlock(lock);
5573 restart:
5574 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5575 pmap = PV_PMAP(pv);
5576 if (!PMAP_TRYLOCK(pmap)) {
5577 md_gen = m->md.pv_gen;
5578 rw_runlock(lock);
5579 PMAP_LOCK(pmap);
5580 rw_rlock(lock);
5581 if (md_gen != m->md.pv_gen) {
5582 PMAP_UNLOCK(pmap);
5583 goto restart;
5584 }
5585 }
5586 pte = pmap_pte(pmap, pv->pv_va);
5587 mask = 0;
5588 if (modified) {
5589 PG_M = pmap_modified_bit(pmap);
5590 PG_RW = pmap_rw_bit(pmap);
5591 mask |= PG_RW | PG_M;
5592 }
5593 if (accessed) {
5594 PG_A = pmap_accessed_bit(pmap);
5595 PG_V = pmap_valid_bit(pmap);
5596 mask |= PG_V | PG_A;
5597 }
5598 rv = (*pte & mask) == mask;
5599 PMAP_UNLOCK(pmap);
5600 if (rv)
5601 goto out;
5602 }
5603 if ((m->flags & PG_FICTITIOUS) == 0) {
5604 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5605 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5606 pmap = PV_PMAP(pv);
5607 if (!PMAP_TRYLOCK(pmap)) {
5608 md_gen = m->md.pv_gen;
5609 pvh_gen = pvh->pv_gen;
5610 rw_runlock(lock);
5611 PMAP_LOCK(pmap);
5612 rw_rlock(lock);
5613 if (md_gen != m->md.pv_gen ||
5614 pvh_gen != pvh->pv_gen) {
5615 PMAP_UNLOCK(pmap);
5616 goto restart;
5617 }
5618 }
5619 pte = pmap_pde(pmap, pv->pv_va);
5620 mask = 0;
5621 if (modified) {
5622 PG_M = pmap_modified_bit(pmap);
5623 PG_RW = pmap_rw_bit(pmap);
5624 mask |= PG_RW | PG_M;
5625 }
5626 if (accessed) {
5627 PG_A = pmap_accessed_bit(pmap);
5628 PG_V = pmap_valid_bit(pmap);
5629 mask |= PG_V | PG_A;
5630 }
5631 rv = (*pte & mask) == mask;
5632 PMAP_UNLOCK(pmap);
5633 if (rv)
5634 goto out;
5635 }
5636 }
5637 out:
5638 rw_runlock(lock);
5639 rw_runlock(&pvh_global_lock);
5640 return (rv);
5641 }
5642
5643 /*
5644 * pmap_is_modified:
5645 *
5646 * Return whether or not the specified physical page was modified
5647 * in any physical maps.
5648 */
5649 boolean_t
5650 pmap_is_modified(vm_page_t m)
5651 {
5652
5653 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5654 ("pmap_is_modified: page %p is not managed", m));
5655
5656 /*
5657 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
5658 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE
5659 * is clear, no PTEs can have PG_M set.
5660 */
5661 VM_OBJECT_ASSERT_WLOCKED(m->object);
5662 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
5663 return (FALSE);
5664 return (pmap_page_test_mappings(m, FALSE, TRUE));
5665 }
5666
5667 /*
5668 * pmap_is_prefaultable:
5669 *
5670 * Return whether or not the specified virtual address is eligible
5671 * for prefault.
5672 */
5673 boolean_t
5674 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
5675 {
5676 pd_entry_t *pde;
5677 pt_entry_t *pte, PG_V;
5678 boolean_t rv;
5679
5680 PG_V = pmap_valid_bit(pmap);
5681 rv = FALSE;
5682 PMAP_LOCK(pmap);
5683 pde = pmap_pde(pmap, addr);
5684 if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
5685 pte = pmap_pde_to_pte(pde, addr);
5686 rv = (*pte & PG_V) == 0;
5687 }
5688 PMAP_UNLOCK(pmap);
5689 return (rv);
5690 }
5691
5692 /*
5693 * pmap_is_referenced:
5694 *
5695 * Return whether or not the specified physical page was referenced
5696 * in any physical maps.
5697 */
5698 boolean_t
5699 pmap_is_referenced(vm_page_t m)
5700 {
5701
5702 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5703 ("pmap_is_referenced: page %p is not managed", m));
5704 return (pmap_page_test_mappings(m, TRUE, FALSE));
5705 }
5706
5707 /*
5708 * Clear the write and modified bits in each of the given page's mappings.
5709 */
5710 void
5711 pmap_remove_write(vm_page_t m)
5712 {
5713 struct md_page *pvh;
5714 pmap_t pmap;
5715 struct rwlock *lock;
5716 pv_entry_t next_pv, pv;
5717 pd_entry_t *pde;
5718 pt_entry_t oldpte, *pte, PG_M, PG_RW;
5719 vm_offset_t va;
5720 int pvh_gen, md_gen;
5721
5722 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5723 ("pmap_remove_write: page %p is not managed", m));
5724
5725 /*
5726 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
5727 * set by another thread while the object is locked. Thus,
5728 * if PGA_WRITEABLE is clear, no page table entries need updating.
5729 */
5730 VM_OBJECT_ASSERT_WLOCKED(m->object);
5731 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
5732 return;
5733 rw_rlock(&pvh_global_lock);
5734 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5735 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
5736 pa_to_pvh(VM_PAGE_TO_PHYS(m));
5737 retry_pv_loop:
5738 rw_wlock(lock);
5739 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5740 pmap = PV_PMAP(pv);
5741 if (!PMAP_TRYLOCK(pmap)) {
5742 pvh_gen = pvh->pv_gen;
5743 rw_wunlock(lock);
5744 PMAP_LOCK(pmap);
5745 rw_wlock(lock);
5746 if (pvh_gen != pvh->pv_gen) {
5747 PMAP_UNLOCK(pmap);
5748 rw_wunlock(lock);
5749 goto retry_pv_loop;
5750 }
5751 }
5752 PG_RW = pmap_rw_bit(pmap);
5753 va = pv->pv_va;
5754 pde = pmap_pde(pmap, va);
5755 if ((*pde & PG_RW) != 0)
5756 (void)pmap_demote_pde_locked(pmap, pde, va, &lock);
5757 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5758 ("inconsistent pv lock %p %p for page %p",
5759 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5760 PMAP_UNLOCK(pmap);
5761 }
5762 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5763 pmap = PV_PMAP(pv);
5764 if (!PMAP_TRYLOCK(pmap)) {
5765 pvh_gen = pvh->pv_gen;
5766 md_gen = m->md.pv_gen;
5767 rw_wunlock(lock);
5768 PMAP_LOCK(pmap);
5769 rw_wlock(lock);
5770 if (pvh_gen != pvh->pv_gen ||
5771 md_gen != m->md.pv_gen) {
5772 PMAP_UNLOCK(pmap);
5773 rw_wunlock(lock);
5774 goto retry_pv_loop;
5775 }
5776 }
5777 PG_M = pmap_modified_bit(pmap);
5778 PG_RW = pmap_rw_bit(pmap);
5779 pde = pmap_pde(pmap, pv->pv_va);
5780 KASSERT((*pde & PG_PS) == 0,
5781 ("pmap_remove_write: found a 2mpage in page %p's pv list",
5782 m));
5783 pte = pmap_pde_to_pte(pde, pv->pv_va);
5784 retry:
5785 oldpte = *pte;
5786 if (oldpte & PG_RW) {
5787 if (!atomic_cmpset_long(pte, oldpte, oldpte &
5788 ~(PG_RW | PG_M)))
5789 goto retry;
5790 if ((oldpte & PG_M) != 0)
5791 vm_page_dirty(m);
5792 pmap_invalidate_page(pmap, pv->pv_va);
5793 }
5794 PMAP_UNLOCK(pmap);
5795 }
5796 rw_wunlock(lock);
5797 vm_page_aflag_clear(m, PGA_WRITEABLE);
5798 rw_runlock(&pvh_global_lock);
5799 }
5800
5801 static __inline boolean_t
5802 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
5803 {
5804
5805 if (!pmap_emulate_ad_bits(pmap))
5806 return (TRUE);
5807
5808 KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
5809
5810 /*
5811 * RWX = 010 or 110 will cause an unconditional EPT misconfiguration
5812 * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared
5813 * if the EPT_PG_WRITE bit is set.
5814 */
5815 if ((pte & EPT_PG_WRITE) != 0)
5816 return (FALSE);
5817
5818 /*
5819 * RWX = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set.
5820 */
5821 if ((pte & EPT_PG_EXECUTE) == 0 ||
5822 ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
5823 return (TRUE);
5824 else
5825 return (FALSE);
5826 }
5827
5828 #define PMAP_TS_REFERENCED_MAX 5
5829
5830 /*
5831 * pmap_ts_referenced:
5832 *
5833 * Return a count of reference bits for a page, clearing those bits.
5834 * It is not necessary for every reference bit to be cleared, but it
5835 * is necessary that 0 only be returned when there are truly no
5836 * reference bits set.
5837 *
5838 * XXX: The exact number of bits to check and clear is a matter that
5839 * should be tested and standardized at some point in the future for
5840 * optimal aging of shared pages.
5841 */
5842 int
5843 pmap_ts_referenced(vm_page_t m)
5844 {
5845 struct md_page *pvh;
5846 pv_entry_t pv, pvf;
5847 pmap_t pmap;
5848 struct rwlock *lock;
5849 pd_entry_t oldpde, *pde;
5850 pt_entry_t *pte, PG_A;
5851 vm_offset_t va;
5852 vm_paddr_t pa;
5853 int cleared, md_gen, not_cleared, pvh_gen;
5854 struct spglist free;
5855 boolean_t demoted;
5856
5857 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5858 ("pmap_ts_referenced: page %p is not managed", m));
5859 SLIST_INIT(&free);
5860 cleared = 0;
5861 pa = VM_PAGE_TO_PHYS(m);
5862 lock = PHYS_TO_PV_LIST_LOCK(pa);
5863 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
5864 rw_rlock(&pvh_global_lock);
5865 rw_wlock(lock);
5866 retry:
5867 not_cleared = 0;
5868 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
5869 goto small_mappings;
5870 pv = pvf;
5871 do {
5872 if (pvf == NULL)
5873 pvf = pv;
5874 pmap = PV_PMAP(pv);
5875 if (!PMAP_TRYLOCK(pmap)) {
5876 pvh_gen = pvh->pv_gen;
5877 rw_wunlock(lock);
5878 PMAP_LOCK(pmap);
5879 rw_wlock(lock);
5880 if (pvh_gen != pvh->pv_gen) {
5881 PMAP_UNLOCK(pmap);
5882 goto retry;
5883 }
5884 }
5885 PG_A = pmap_accessed_bit(pmap);
5886 va = pv->pv_va;
5887 pde = pmap_pde(pmap, pv->pv_va);
5888 oldpde = *pde;
5889 if ((*pde & PG_A) != 0) {
5890 /*
5891 * Since this reference bit is shared by 512 4KB
5892 * pages, it should not be cleared every time it is
5893 * tested. Apply a simple "hash" function on the
5894 * physical page number, the virtual superpage number,
5895 * and the pmap address to select one 4KB page out of
5896 * the 512 on which testing the reference bit will
5897 * result in clearing that reference bit. This
5898 * function is designed to avoid the selection of the
5899 * same 4KB page for every 2MB page mapping.
5900 *
5901 * On demotion, a mapping that hasn't been referenced
5902 * is simply destroyed. To avoid the possibility of a
5903 * subsequent page fault on a demoted wired mapping,
5904 * always leave its reference bit set. Moreover,
5905 * since the superpage is wired, the current state of
5906 * its reference bit won't affect page replacement.
5907 */
5908 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
5909 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
5910 (*pde & PG_W) == 0) {
5911 if (safe_to_clear_referenced(pmap, oldpde)) {
5912 atomic_clear_long(pde, PG_A);
5913 pmap_invalidate_page(pmap, pv->pv_va);
5914 demoted = FALSE;
5915 } else if (pmap_demote_pde_locked(pmap, pde,
5916 pv->pv_va, &lock)) {
5917 /*
5918 * Remove the mapping to a single page
5919 * so that a subsequent access may
5920 * repromote. Since the underlying
5921 * page table page is fully populated,
5922 * this removal never frees a page
5923 * table page.
5924 */
5925 demoted = TRUE;
5926 va += VM_PAGE_TO_PHYS(m) - (oldpde &
5927 PG_PS_FRAME);
5928 pte = pmap_pde_to_pte(pde, va);
5929 pmap_remove_pte(pmap, pte, va, *pde,
5930 NULL, &lock);
5931 pmap_invalidate_page(pmap, va);
5932 } else
5933 demoted = TRUE;
5934
5935 if (demoted) {
5936 /*
5937 * The superpage mapping was removed
5938 * entirely and therefore 'pv' is no
5939 * longer valid.
5940 */
5941 if (pvf == pv)
5942 pvf = NULL;
5943 pv = NULL;
5944 }
5945 cleared++;
5946 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5947 ("inconsistent pv lock %p %p for page %p",
5948 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5949 } else
5950 not_cleared++;
5951 }
5952 PMAP_UNLOCK(pmap);
5953 /* Rotate the PV list if it has more than one entry. */
5954 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
5955 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
5956 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
5957 pvh->pv_gen++;
5958 }
5959 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
5960 goto out;
5961 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
5962 small_mappings:
5963 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
5964 goto out;
5965 pv = pvf;
5966 do {
5967 if (pvf == NULL)
5968 pvf = pv;
5969 pmap = PV_PMAP(pv);
5970 if (!PMAP_TRYLOCK(pmap)) {
5971 pvh_gen = pvh->pv_gen;
5972 md_gen = m->md.pv_gen;
5973 rw_wunlock(lock);
5974 PMAP_LOCK(pmap);
5975 rw_wlock(lock);
5976 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
5977 PMAP_UNLOCK(pmap);
5978 goto retry;
5979 }
5980 }
5981 PG_A = pmap_accessed_bit(pmap);
5982 pde = pmap_pde(pmap, pv->pv_va);
5983 KASSERT((*pde & PG_PS) == 0,
5984 ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
5985 m));
5986 pte = pmap_pde_to_pte(pde, pv->pv_va);
5987 if ((*pte & PG_A) != 0) {
5988 if (safe_to_clear_referenced(pmap, *pte)) {
5989 atomic_clear_long(pte, PG_A);
5990 pmap_invalidate_page(pmap, pv->pv_va);
5991 cleared++;
5992 } else if ((*pte & PG_W) == 0) {
5993 /*
5994 * Wired pages cannot be paged out so
5995 * doing accessed bit emulation for
5996 * them is wasted effort. We do the
5997 * hard work for unwired pages only.
5998 */
5999 pmap_remove_pte(pmap, pte, pv->pv_va,
6000 *pde, &free, &lock);
6001 pmap_invalidate_page(pmap, pv->pv_va);
6002 cleared++;
6003 if (pvf == pv)
6004 pvf = NULL;
6005 pv = NULL;
6006 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
6007 ("inconsistent pv lock %p %p for page %p",
6008 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
6009 } else
6010 not_cleared++;
6011 }
6012 PMAP_UNLOCK(pmap);
6013 /* Rotate the PV list if it has more than one entry. */
6014 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
6015 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
6016 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
6017 m->md.pv_gen++;
6018 }
6019 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
6020 not_cleared < PMAP_TS_REFERENCED_MAX);
6021 out:
6022 rw_wunlock(lock);
6023 rw_runlock(&pvh_global_lock);
6024 pmap_free_zero_pages(&free);
6025 return (cleared + not_cleared);
6026 }
6027
6028 /*
6029 * Apply the given advice to the specified range of addresses within the
6030 * given pmap. Depending on the advice, clear the referenced and/or
6031 * modified flags in each mapping and set the mapped page's dirty field.
6032 */
6033 void
6034 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
6035 {
6036 struct rwlock *lock;
6037 pml4_entry_t *pml4e;
6038 pdp_entry_t *pdpe;
6039 pd_entry_t oldpde, *pde;
6040 pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
6041 vm_offset_t va_next;
6042 vm_page_t m;
6043 boolean_t anychanged, pv_lists_locked;
6044
6045 if (advice != MADV_DONTNEED && advice != MADV_FREE)
6046 return;
6047
6048 /*
6049 * A/D bit emulation requires an alternate code path when clearing
6050 * the modified and accessed bits below. Since this function is
6051 * advisory in nature we skip it entirely for pmaps that require
6052 * A/D bit emulation.
6053 */
6054 if (pmap_emulate_ad_bits(pmap))
6055 return;
6056
6057 PG_A = pmap_accessed_bit(pmap);
6058 PG_G = pmap_global_bit(pmap);
6059 PG_M = pmap_modified_bit(pmap);
6060 PG_V = pmap_valid_bit(pmap);
6061 PG_RW = pmap_rw_bit(pmap);
6062
6063 pv_lists_locked = FALSE;
6064 resume:
6065 anychanged = FALSE;
6066 PMAP_LOCK(pmap);
6067 for (; sva < eva; sva = va_next) {
6068 pml4e = pmap_pml4e(pmap, sva);
6069 if ((*pml4e & PG_V) == 0) {
6070 va_next = (sva + NBPML4) & ~PML4MASK;
6071 if (va_next < sva)
6072 va_next = eva;
6073 continue;
6074 }
6075 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
6076 if ((*pdpe & PG_V) == 0) {
6077 va_next = (sva + NBPDP) & ~PDPMASK;
6078 if (va_next < sva)
6079 va_next = eva;
6080 continue;
6081 }
6082 va_next = (sva + NBPDR) & ~PDRMASK;
6083 if (va_next < sva)
6084 va_next = eva;
6085 pde = pmap_pdpe_to_pde(pdpe, sva);
6086 oldpde = *pde;
6087 if ((oldpde & PG_V) == 0)
6088 continue;
6089 else if ((oldpde & PG_PS) != 0) {
6090 if ((oldpde & PG_MANAGED) == 0)
6091 continue;
6092 if (!pv_lists_locked) {
6093 pv_lists_locked = TRUE;
6094 if (!rw_try_rlock(&pvh_global_lock)) {
6095 if (anychanged)
6096 pmap_invalidate_all(pmap);
6097 PMAP_UNLOCK(pmap);
6098 rw_rlock(&pvh_global_lock);
6099 goto resume;
6100 }
6101 }
6102 lock = NULL;
6103 if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
6104 if (lock != NULL)
6105 rw_wunlock(lock);
6106
6107 /*
6108 * The large page mapping was destroyed.
6109 */
6110 continue;
6111 }
6112
6113 /*
6114 * Unless the page mappings are wired, remove the
6115 * mapping to a single page so that a subsequent
6116 * access may repromote. Since the underlying page
6117 * table page is fully populated, this removal never
6118 * frees a page table page.
6119 */
6120 if ((oldpde & PG_W) == 0) {
6121 pte = pmap_pde_to_pte(pde, sva);
6122 KASSERT((*pte & PG_V) != 0,
6123 ("pmap_advise: invalid PTE"));
6124 pmap_remove_pte(pmap, pte, sva, *pde, NULL,
6125 &lock);
6126 anychanged = TRUE;
6127 }
6128 if (lock != NULL)
6129 rw_wunlock(lock);
6130 }
6131 if (va_next > eva)
6132 va_next = eva;
6133 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
6134 sva += PAGE_SIZE) {
6135 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED |
6136 PG_V))
6137 continue;
6138 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6139 if (advice == MADV_DONTNEED) {
6140 /*
6141 * Future calls to pmap_is_modified()
6142 * can be avoided by making the page
6143 * dirty now.
6144 */
6145 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
6146 vm_page_dirty(m);
6147 }
6148 atomic_clear_long(pte, PG_M | PG_A);
6149 } else if ((*pte & PG_A) != 0)
6150 atomic_clear_long(pte, PG_A);
6151 else
6152 continue;
6153 if ((*pte & PG_G) != 0)
6154 pmap_invalidate_page(pmap, sva);
6155 else
6156 anychanged = TRUE;
6157 }
6158 }
6159 if (anychanged)
6160 pmap_invalidate_all(pmap);
6161 if (pv_lists_locked)
6162 rw_runlock(&pvh_global_lock);
6163 PMAP_UNLOCK(pmap);
6164 }
6165
6166 /*
6167 * Clear the modify bits on the specified physical page.
6168 */
6169 void
6170 pmap_clear_modify(vm_page_t m)
6171 {
6172 struct md_page *pvh;
6173 pmap_t pmap;
6174 pv_entry_t next_pv, pv;
6175 pd_entry_t oldpde, *pde;
6176 pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V;
6177 struct rwlock *lock;
6178 vm_offset_t va;
6179 int md_gen, pvh_gen;
6180
6181 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6182 ("pmap_clear_modify: page %p is not managed", m));
6183 VM_OBJECT_ASSERT_WLOCKED(m->object);
6184 KASSERT(!vm_page_xbusied(m),
6185 ("pmap_clear_modify: page %p is exclusive busied", m));
6186
6187 /*
6188 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
6189 * If the object containing the page is locked and the page is not
6190 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
6191 */
6192 if ((m->aflags & PGA_WRITEABLE) == 0)
6193 return;
6194 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
6195 pa_to_pvh(VM_PAGE_TO_PHYS(m));
6196 rw_rlock(&pvh_global_lock);
6197 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6198 rw_wlock(lock);
6199 restart:
6200 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
6201 pmap = PV_PMAP(pv);
6202 if (!PMAP_TRYLOCK(pmap)) {
6203 pvh_gen = pvh->pv_gen;
6204 rw_wunlock(lock);
6205 PMAP_LOCK(pmap);
6206 rw_wlock(lock);
6207 if (pvh_gen != pvh->pv_gen) {
6208 PMAP_UNLOCK(pmap);
6209 goto restart;
6210 }
6211 }
6212 PG_M = pmap_modified_bit(pmap);
6213 PG_V = pmap_valid_bit(pmap);
6214 PG_RW = pmap_rw_bit(pmap);
6215 va = pv->pv_va;
6216 pde = pmap_pde(pmap, va);
6217 oldpde = *pde;
6218 if ((oldpde & PG_RW) != 0) {
6219 if (pmap_demote_pde_locked(pmap, pde, va, &lock)) {
6220 if ((oldpde & PG_W) == 0) {
6221 /*
6222 * Write protect the mapping to a
6223 * single page so that a subsequent
6224 * write access may repromote.
6225 */
6226 va += VM_PAGE_TO_PHYS(m) - (oldpde &
6227 PG_PS_FRAME);
6228 pte = pmap_pde_to_pte(pde, va);
6229 oldpte = *pte;
6230 if ((oldpte & PG_V) != 0) {
6231 while (!atomic_cmpset_long(pte,
6232 oldpte,
6233 oldpte & ~(PG_M | PG_RW)))
6234 oldpte = *pte;
6235 vm_page_dirty(m);
6236 pmap_invalidate_page(pmap, va);
6237 }
6238 }
6239 }
6240 }
6241 PMAP_UNLOCK(pmap);
6242 }
6243 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6244 pmap = PV_PMAP(pv);
6245 if (!PMAP_TRYLOCK(pmap)) {
6246 md_gen = m->md.pv_gen;
6247 pvh_gen = pvh->pv_gen;
6248 rw_wunlock(lock);
6249 PMAP_LOCK(pmap);
6250 rw_wlock(lock);
6251 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
6252 PMAP_UNLOCK(pmap);
6253 goto restart;
6254 }
6255 }
6256 PG_M = pmap_modified_bit(pmap);
6257 PG_RW = pmap_rw_bit(pmap);
6258 pde = pmap_pde(pmap, pv->pv_va);
6259 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
6260 " a 2mpage in page %p's pv list", m));
6261 pte = pmap_pde_to_pte(pde, pv->pv_va);
6262 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6263 atomic_clear_long(pte, PG_M);
6264 pmap_invalidate_page(pmap, pv->pv_va);
6265 }
6266 PMAP_UNLOCK(pmap);
6267 }
6268 rw_wunlock(lock);
6269 rw_runlock(&pvh_global_lock);
6270 }
6271
6272 /*
6273 * Miscellaneous support routines follow
6274 */
6275
6276 /* Adjust the cache mode for a 4KB page mapped via a PTE. */
6277 static __inline void
6278 pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask)
6279 {
6280 u_int opte, npte;
6281
6282 /*
6283 * The cache mode bits are all in the low 32-bits of the
6284 * PTE, so we can just spin on updating the low 32-bits.
6285 */
6286 do {
6287 opte = *(u_int *)pte;
6288 npte = opte & ~mask;
6289 npte |= cache_bits;
6290 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
6291 }
6292
6293 /* Adjust the cache mode for a 2MB page mapped via a PDE. */
6294 static __inline void
6295 pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask)
6296 {
6297 u_int opde, npde;
6298
6299 /*
6300 * The cache mode bits are all in the low 32-bits of the
6301 * PDE, so we can just spin on updating the low 32-bits.
6302 */
6303 do {
6304 opde = *(u_int *)pde;
6305 npde = opde & ~mask;
6306 npde |= cache_bits;
6307 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
6308 }
6309
6310 /*
6311 * Map a set of physical memory pages into the kernel virtual
6312 * address space. Return a pointer to where it is mapped. This
6313 * routine is intended to be used for mapping device memory,
6314 * NOT real memory.
6315 */
6316 void *
6317 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
6318 {
6319 struct pmap_preinit_mapping *ppim;
6320 vm_offset_t va, offset;
6321 vm_size_t tmpsize;
6322 int i;
6323
6324 offset = pa & PAGE_MASK;
6325 size = round_page(offset + size);
6326 pa = trunc_page(pa);
6327
6328 if (!pmap_initialized) {
6329 va = 0;
6330 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
6331 ppim = pmap_preinit_mapping + i;
6332 if (ppim->va == 0) {
6333 ppim->pa = pa;
6334 ppim->sz = size;
6335 ppim->mode = mode;
6336 ppim->va = virtual_avail;
6337 virtual_avail += size;
6338 va = ppim->va;
6339 break;
6340 }
6341 }
6342 if (va == 0)
6343 panic("%s: too many preinit mappings", __func__);
6344 } else {
6345 /*
6346 * If we have a preinit mapping, re-use it.
6347 */
6348 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
6349 ppim = pmap_preinit_mapping + i;
6350 if (ppim->pa == pa && ppim->sz == size &&
6351 ppim->mode == mode)
6352 return ((void *)(ppim->va + offset));
6353 }
6354 /*
6355 * If the specified range of physical addresses fits within
6356 * the direct map window, use the direct map.
6357 */
6358 if (pa < dmaplimit && pa + size < dmaplimit) {
6359 va = PHYS_TO_DMAP(pa);
6360 if (!pmap_change_attr(va, size, mode))
6361 return ((void *)(va + offset));
6362 }
6363 va = kva_alloc(size);
6364 if (va == 0)
6365 panic("%s: Couldn't allocate KVA", __func__);
6366 }
6367 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
6368 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
6369 pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
6370 pmap_invalidate_cache_range(va, va + tmpsize, FALSE);
6371 return ((void *)(va + offset));
6372 }
6373
6374 void *
6375 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
6376 {
6377
6378 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
6379 }
6380
6381 void *
6382 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
6383 {
6384
6385 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
6386 }
6387
6388 void
6389 pmap_unmapdev(vm_offset_t va, vm_size_t size)
6390 {
6391 struct pmap_preinit_mapping *ppim;
6392 vm_offset_t offset;
6393 int i;
6394
6395 /* If we gave a direct map region in pmap_mapdev, do nothing */
6396 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
6397 return;
6398 offset = va & PAGE_MASK;
6399 size = round_page(offset + size);
6400 va = trunc_page(va);
6401 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
6402 ppim = pmap_preinit_mapping + i;
6403 if (ppim->va == va && ppim->sz == size) {
6404 if (pmap_initialized)
6405 return;
6406 ppim->pa = 0;
6407 ppim->va = 0;
6408 ppim->sz = 0;
6409 ppim->mode = 0;
6410 if (va + size == virtual_avail)
6411 virtual_avail = va;
6412 return;
6413 }
6414 }
6415 if (pmap_initialized)
6416 kva_free(va, size);
6417 }
6418
6419 /*
6420 * Tries to demote a 1GB page mapping.
6421 */
6422 static boolean_t
6423 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
6424 {
6425 pdp_entry_t newpdpe, oldpdpe;
6426 pd_entry_t *firstpde, newpde, *pde;
6427 pt_entry_t PG_A, PG_M, PG_RW, PG_V;
6428 vm_paddr_t mpdepa;
6429 vm_page_t mpde;
6430
6431 PG_A = pmap_accessed_bit(pmap);
6432 PG_M = pmap_modified_bit(pmap);
6433 PG_V = pmap_valid_bit(pmap);
6434 PG_RW = pmap_rw_bit(pmap);
6435
6436 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6437 oldpdpe = *pdpe;
6438 KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
6439 ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
6440 if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
6441 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
6442 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
6443 " in pmap %p", va, pmap);
6444 return (FALSE);
6445 }
6446 mpdepa = VM_PAGE_TO_PHYS(mpde);
6447 firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa);
6448 newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
6449 KASSERT((oldpdpe & PG_A) != 0,
6450 ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
6451 KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
6452 ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
6453 newpde = oldpdpe;
6454
6455 /*
6456 * Initialize the page directory page.
6457 */
6458 for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
6459 *pde = newpde;
6460 newpde += NBPDR;
6461 }
6462
6463 /*
6464 * Demote the mapping.
6465 */
6466 *pdpe = newpdpe;
6467
6468 /*
6469 * Invalidate a stale recursive mapping of the page directory page.
6470 */
6471 pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
6472
6473 pmap_pdpe_demotions++;
6474 CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
6475 " in pmap %p", va, pmap);
6476 return (TRUE);
6477 }
6478
6479 /*
6480 * Sets the memory attribute for the specified page.
6481 */
6482 void
6483 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
6484 {
6485
6486 m->md.pat_mode = ma;
6487
6488 /*
6489 * If "m" is a normal page, update its direct mapping. This update
6490 * can be relied upon to perform any cache operations that are
6491 * required for data coherence.
6492 */
6493 if ((m->flags & PG_FICTITIOUS) == 0 &&
6494 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
6495 m->md.pat_mode))
6496 panic("memory attribute change on the direct map failed");
6497 }
6498
6499 /*
6500 * Changes the specified virtual address range's memory type to that given by
6501 * the parameter "mode". The specified virtual address range must be
6502 * completely contained within either the direct map or the kernel map. If
6503 * the virtual address range is contained within the kernel map, then the
6504 * memory type for each of the corresponding ranges of the direct map is also
6505 * changed. (The corresponding ranges of the direct map are those ranges that
6506 * map the same physical pages as the specified virtual address range.) These
6507 * changes to the direct map are necessary because Intel describes the
6508 * behavior of their processors as "undefined" if two or more mappings to the
6509 * same physical page have different memory types.
6510 *
6511 * Returns zero if the change completed successfully, and either EINVAL or
6512 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part
6513 * of the virtual address range was not mapped, and ENOMEM is returned if
6514 * there was insufficient memory available to complete the change. In the
6515 * latter case, the memory type may have been changed on some part of the
6516 * virtual address range or the direct map.
6517 */
6518 int
6519 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
6520 {
6521 int error;
6522
6523 PMAP_LOCK(kernel_pmap);
6524 error = pmap_change_attr_locked(va, size, mode);
6525 PMAP_UNLOCK(kernel_pmap);
6526 return (error);
6527 }
6528
6529 static int
6530 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
6531 {
6532 vm_offset_t base, offset, tmpva;
6533 vm_paddr_t pa_start, pa_end, pa_end1;
6534 pdp_entry_t *pdpe;
6535 pd_entry_t *pde;
6536 pt_entry_t *pte;
6537 int cache_bits_pte, cache_bits_pde, error;
6538 boolean_t changed;
6539
6540 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
6541 base = trunc_page(va);
6542 offset = va & PAGE_MASK;
6543 size = round_page(offset + size);
6544
6545 /*
6546 * Only supported on kernel virtual addresses, including the direct
6547 * map but excluding the recursive map.
6548 */
6549 if (base < DMAP_MIN_ADDRESS)
6550 return (EINVAL);
6551
6552 cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1);
6553 cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0);
6554 changed = FALSE;
6555
6556 /*
6557 * Pages that aren't mapped aren't supported. Also break down 2MB pages
6558 * into 4KB pages if required.
6559 */
6560 for (tmpva = base; tmpva < base + size; ) {
6561 pdpe = pmap_pdpe(kernel_pmap, tmpva);
6562 if (pdpe == NULL || *pdpe == 0)
6563 return (EINVAL);
6564 if (*pdpe & PG_PS) {
6565 /*
6566 * If the current 1GB page already has the required
6567 * memory type, then we need not demote this page. Just
6568 * increment tmpva to the next 1GB page frame.
6569 */
6570 if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) {
6571 tmpva = trunc_1gpage(tmpva) + NBPDP;
6572 continue;
6573 }
6574
6575 /*
6576 * If the current offset aligns with a 1GB page frame
6577 * and there is at least 1GB left within the range, then
6578 * we need not break down this page into 2MB pages.
6579 */
6580 if ((tmpva & PDPMASK) == 0 &&
6581 tmpva + PDPMASK < base + size) {
6582 tmpva += NBPDP;
6583 continue;
6584 }
6585 if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
6586 return (ENOMEM);
6587 }
6588 pde = pmap_pdpe_to_pde(pdpe, tmpva);
6589 if (*pde == 0)
6590 return (EINVAL);
6591 if (*pde & PG_PS) {
6592 /*
6593 * If the current 2MB page already has the required
6594 * memory type, then we need not demote this page. Just
6595 * increment tmpva to the next 2MB page frame.
6596 */
6597 if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) {
6598 tmpva = trunc_2mpage(tmpva) + NBPDR;
6599 continue;
6600 }
6601
6602 /*
6603 * If the current offset aligns with a 2MB page frame
6604 * and there is at least 2MB left within the range, then
6605 * we need not break down this page into 4KB pages.
6606 */
6607 if ((tmpva & PDRMASK) == 0 &&
6608 tmpva + PDRMASK < base + size) {
6609 tmpva += NBPDR;
6610 continue;
6611 }
6612 if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
6613 return (ENOMEM);
6614 }
6615 pte = pmap_pde_to_pte(pde, tmpva);
6616 if (*pte == 0)
6617 return (EINVAL);
6618 tmpva += PAGE_SIZE;
6619 }
6620 error = 0;
6621
6622 /*
6623 * Ok, all the pages exist, so run through them updating their
6624 * cache mode if required.
6625 */
6626 pa_start = pa_end = 0;
6627 for (tmpva = base; tmpva < base + size; ) {
6628 pdpe = pmap_pdpe(kernel_pmap, tmpva);
6629 if (*pdpe & PG_PS) {
6630 if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) {
6631 pmap_pde_attr(pdpe, cache_bits_pde,
6632 X86_PG_PDE_CACHE);
6633 changed = TRUE;
6634 }
6635 if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
6636 (*pdpe & PG_PS_FRAME) < dmaplimit) {
6637 if (pa_start == pa_end) {
6638 /* Start physical address run. */
6639 pa_start = *pdpe & PG_PS_FRAME;
6640 pa_end = pa_start + NBPDP;
6641 } else if (pa_end == (*pdpe & PG_PS_FRAME))
6642 pa_end += NBPDP;
6643 else {
6644 /* Run ended, update direct map. */
6645 error = pmap_change_attr_locked(
6646 PHYS_TO_DMAP(pa_start),
6647 pa_end - pa_start, mode);
6648 if (error != 0)
6649 break;
6650 /* Start physical address run. */
6651 pa_start = *pdpe & PG_PS_FRAME;
6652 pa_end = pa_start + NBPDP;
6653 }
6654 }
6655 tmpva = trunc_1gpage(tmpva) + NBPDP;
6656 continue;
6657 }
6658 pde = pmap_pdpe_to_pde(pdpe, tmpva);
6659 if (*pde & PG_PS) {
6660 if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) {
6661 pmap_pde_attr(pde, cache_bits_pde,
6662 X86_PG_PDE_CACHE);
6663 changed = TRUE;
6664 }
6665 if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
6666 (*pde & PG_PS_FRAME) < dmaplimit) {
6667 if (pa_start == pa_end) {
6668 /* Start physical address run. */
6669 pa_start = *pde & PG_PS_FRAME;
6670 pa_end = pa_start + NBPDR;
6671 } else if (pa_end == (*pde & PG_PS_FRAME))
6672 pa_end += NBPDR;
6673 else {
6674 /* Run ended, update direct map. */
6675 error = pmap_change_attr_locked(
6676 PHYS_TO_DMAP(pa_start),
6677 pa_end - pa_start, mode);
6678 if (error != 0)
6679 break;
6680 /* Start physical address run. */
6681 pa_start = *pde & PG_PS_FRAME;
6682 pa_end = pa_start + NBPDR;
6683 }
6684 }
6685 tmpva = trunc_2mpage(tmpva) + NBPDR;
6686 } else {
6687 pte = pmap_pde_to_pte(pde, tmpva);
6688 if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) {
6689 pmap_pte_attr(pte, cache_bits_pte,
6690 X86_PG_PTE_CACHE);
6691 changed = TRUE;
6692 }
6693 if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
6694 (*pte & PG_FRAME) < dmaplimit) {
6695 if (pa_start == pa_end) {
6696 /* Start physical address run. */
6697 pa_start = *pte & PG_FRAME;
6698 pa_end = pa_start + PAGE_SIZE;
6699 } else if (pa_end == (*pte & PG_FRAME))
6700 pa_end += PAGE_SIZE;
6701 else {
6702 /* Run ended, update direct map. */
6703 error = pmap_change_attr_locked(
6704 PHYS_TO_DMAP(pa_start),
6705 pa_end - pa_start, mode);
6706 if (error != 0)
6707 break;
6708 /* Start physical address run. */
6709 pa_start = *pte & PG_FRAME;
6710 pa_end = pa_start + PAGE_SIZE;
6711 }
6712 }
6713 tmpva += PAGE_SIZE;
6714 }
6715 }
6716 if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
6717 pa_end1 = MIN(pa_end, dmaplimit);
6718 if (pa_start != pa_end1)
6719 error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
6720 pa_end1 - pa_start, mode);
6721 }
6722
6723 /*
6724 * Flush CPU caches if required to make sure any data isn't cached that
6725 * shouldn't be, etc.
6726 */
6727 if (changed) {
6728 pmap_invalidate_range(kernel_pmap, base, tmpva);
6729 pmap_invalidate_cache_range(base, tmpva, FALSE);
6730 }
6731 return (error);
6732 }
6733
6734 /*
6735 * Demotes any mapping within the direct map region that covers more than the
6736 * specified range of physical addresses. This range's size must be a power
6737 * of two and its starting address must be a multiple of its size. Since the
6738 * demotion does not change any attributes of the mapping, a TLB invalidation
6739 * is not mandatory. The caller may, however, request a TLB invalidation.
6740 */
6741 void
6742 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
6743 {
6744 pdp_entry_t *pdpe;
6745 pd_entry_t *pde;
6746 vm_offset_t va;
6747 boolean_t changed;
6748
6749 if (len == 0)
6750 return;
6751 KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
6752 KASSERT((base & (len - 1)) == 0,
6753 ("pmap_demote_DMAP: base is not a multiple of len"));
6754 if (len < NBPDP && base < dmaplimit) {
6755 va = PHYS_TO_DMAP(base);
6756 changed = FALSE;
6757 PMAP_LOCK(kernel_pmap);
6758 pdpe = pmap_pdpe(kernel_pmap, va);
6759 if ((*pdpe & X86_PG_V) == 0)
6760 panic("pmap_demote_DMAP: invalid PDPE");
6761 if ((*pdpe & PG_PS) != 0) {
6762 if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
6763 panic("pmap_demote_DMAP: PDPE failed");
6764 changed = TRUE;
6765 }
6766 if (len < NBPDR) {
6767 pde = pmap_pdpe_to_pde(pdpe, va);
6768 if ((*pde & X86_PG_V) == 0)
6769 panic("pmap_demote_DMAP: invalid PDE");
6770 if ((*pde & PG_PS) != 0) {
6771 if (!pmap_demote_pde(kernel_pmap, pde, va))
6772 panic("pmap_demote_DMAP: PDE failed");
6773 changed = TRUE;
6774 }
6775 }
6776 if (changed && invalidate)
6777 pmap_invalidate_page(kernel_pmap, va);
6778 PMAP_UNLOCK(kernel_pmap);
6779 }
6780 }
6781
6782 /*
6783 * perform the pmap work for mincore
6784 */
6785 int
6786 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
6787 {
6788 pd_entry_t *pdep;
6789 pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
6790 vm_paddr_t pa;
6791 int val;
6792
6793 PG_A = pmap_accessed_bit(pmap);
6794 PG_M = pmap_modified_bit(pmap);
6795 PG_V = pmap_valid_bit(pmap);
6796 PG_RW = pmap_rw_bit(pmap);
6797
6798 PMAP_LOCK(pmap);
6799 retry:
6800 pdep = pmap_pde(pmap, addr);
6801 if (pdep != NULL && (*pdep & PG_V)) {
6802 if (*pdep & PG_PS) {
6803 pte = *pdep;
6804 /* Compute the physical address of the 4KB page. */
6805 pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
6806 PG_FRAME;
6807 val = MINCORE_SUPER;
6808 } else {
6809 pte = *pmap_pde_to_pte(pdep, addr);
6810 pa = pte & PG_FRAME;
6811 val = 0;
6812 }
6813 } else {
6814 pte = 0;
6815 pa = 0;
6816 val = 0;
6817 }
6818 if ((pte & PG_V) != 0) {
6819 val |= MINCORE_INCORE;
6820 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
6821 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
6822 if ((pte & PG_A) != 0)
6823 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
6824 }
6825 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
6826 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
6827 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
6828 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
6829 if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
6830 goto retry;
6831 } else
6832 PA_UNLOCK_COND(*locked_pa);
6833 PMAP_UNLOCK(pmap);
6834 return (val);
6835 }
6836
6837 void
6838 pmap_activate(struct thread *td)
6839 {
6840 pmap_t pmap, oldpmap;
6841 u_int cpuid;
6842
6843 critical_enter();
6844 pmap = vmspace_pmap(td->td_proc->p_vmspace);
6845 oldpmap = PCPU_GET(curpmap);
6846 cpuid = PCPU_GET(cpuid);
6847 #ifdef SMP
6848 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
6849 CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
6850 CPU_SET_ATOMIC(cpuid, &pmap->pm_save);
6851 #else
6852 CPU_CLR(cpuid, &oldpmap->pm_active);
6853 CPU_SET(cpuid, &pmap->pm_active);
6854 CPU_SET(cpuid, &pmap->pm_save);
6855 #endif
6856 td->td_pcb->pcb_cr3 = pmap->pm_cr3;
6857 load_cr3(pmap->pm_cr3);
6858 PCPU_SET(curpmap, pmap);
6859 critical_exit();
6860 }
6861
6862 void
6863 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
6864 {
6865 }
6866
6867 /*
6868 * Increase the starting virtual address of the given mapping if a
6869 * different alignment might result in more superpage mappings.
6870 */
6871 void
6872 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
6873 vm_offset_t *addr, vm_size_t size)
6874 {
6875 vm_offset_t superpage_offset;
6876
6877 if (size < NBPDR)
6878 return;
6879 if (object != NULL && (object->flags & OBJ_COLORED) != 0)
6880 offset += ptoa(object->pg_color);
6881 superpage_offset = offset & PDRMASK;
6882 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
6883 (*addr & PDRMASK) == superpage_offset)
6884 return;
6885 if ((*addr & PDRMASK) < superpage_offset)
6886 *addr = (*addr & ~PDRMASK) + superpage_offset;
6887 else
6888 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
6889 }
6890
6891 #ifdef INVARIANTS
6892 static unsigned long num_dirty_emulations;
6893 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
6894 &num_dirty_emulations, 0, NULL);
6895
6896 static unsigned long num_accessed_emulations;
6897 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
6898 &num_accessed_emulations, 0, NULL);
6899
6900 static unsigned long num_superpage_accessed_emulations;
6901 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW,
6902 &num_superpage_accessed_emulations, 0, NULL);
6903
6904 static unsigned long ad_emulation_superpage_promotions;
6905 SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW,
6906 &ad_emulation_superpage_promotions, 0, NULL);
6907 #endif /* INVARIANTS */
6908
6909 int
6910 pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
6911 {
6912 int rv;
6913 struct rwlock *lock;
6914 vm_page_t m, mpte;
6915 pd_entry_t *pde;
6916 pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
6917 boolean_t pv_lists_locked;
6918
6919 KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
6920 ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));
6921
6922 if (!pmap_emulate_ad_bits(pmap))
6923 return (-1);
6924
6925 PG_A = pmap_accessed_bit(pmap);
6926 PG_M = pmap_modified_bit(pmap);
6927 PG_V = pmap_valid_bit(pmap);
6928 PG_RW = pmap_rw_bit(pmap);
6929
6930 rv = -1;
6931 lock = NULL;
6932 pv_lists_locked = FALSE;
6933 retry:
6934 PMAP_LOCK(pmap);
6935
6936 pde = pmap_pde(pmap, va);
6937 if (pde == NULL || (*pde & PG_V) == 0)
6938 goto done;
6939
6940 if ((*pde & PG_PS) != 0) {
6941 if (ftype == VM_PROT_READ) {
6942 #ifdef INVARIANTS
6943 atomic_add_long(&num_superpage_accessed_emulations, 1);
6944 #endif
6945 *pde |= PG_A;
6946 rv = 0;
6947 }
6948 goto done;
6949 }
6950
6951 pte = pmap_pde_to_pte(pde, va);
6952 if ((*pte & PG_V) == 0)
6953 goto done;
6954
6955 if (ftype == VM_PROT_WRITE) {
6956 if ((*pte & PG_RW) == 0)
6957 goto done;
6958 /*
6959 * Set the modified and accessed bits simultaneously.
6960 *
6961 * Intel EPT PTEs that do software emulation of A/D bits map
6962 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively.
6963 * An EPT misconfiguration is triggered if the PTE is writable
6964 * but not readable (WR=10). This is avoided by setting PG_A
6965 * and PG_M simultaneously.
6966 */
6967 *pte |= PG_M | PG_A;
6968 } else {
6969 *pte |= PG_A;
6970 }
6971
6972 /* try to promote the mapping */
6973 if (va < VM_MAXUSER_ADDRESS)
6974 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
6975 else
6976 mpte = NULL;
6977
6978 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
6979
6980 if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
6981 pmap_ps_enabled(pmap) &&
6982 (m->flags & PG_FICTITIOUS) == 0 &&
6983 vm_reserv_level_iffullpop(m) == 0) {
6984 if (!pv_lists_locked) {
6985 pv_lists_locked = TRUE;
6986 if (!rw_try_rlock(&pvh_global_lock)) {
6987 PMAP_UNLOCK(pmap);
6988 rw_rlock(&pvh_global_lock);
6989 goto retry;
6990 }
6991 }
6992 pmap_promote_pde(pmap, pde, va, &lock);
6993 #ifdef INVARIANTS
6994 atomic_add_long(&ad_emulation_superpage_promotions, 1);
6995 #endif
6996 }
6997 #ifdef INVARIANTS
6998 if (ftype == VM_PROT_WRITE)
6999 atomic_add_long(&num_dirty_emulations, 1);
7000 else
7001 atomic_add_long(&num_accessed_emulations, 1);
7002 #endif
7003 rv = 0; /* success */
7004 done:
7005 if (lock != NULL)
7006 rw_wunlock(lock);
7007 if (pv_lists_locked)
7008 rw_runlock(&pvh_global_lock);
7009 PMAP_UNLOCK(pmap);
7010 return (rv);
7011 }
7012
7013 void
7014 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
7015 {
7016 pml4_entry_t *pml4;
7017 pdp_entry_t *pdp;
7018 pd_entry_t *pde;
7019 pt_entry_t *pte, PG_V;
7020 int idx;
7021
7022 idx = 0;
7023 PG_V = pmap_valid_bit(pmap);
7024 PMAP_LOCK(pmap);
7025
7026 pml4 = pmap_pml4e(pmap, va);
7027 ptr[idx++] = *pml4;
7028 if ((*pml4 & PG_V) == 0)
7029 goto done;
7030
7031 pdp = pmap_pml4e_to_pdpe(pml4, va);
7032 ptr[idx++] = *pdp;
7033 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0)
7034 goto done;
7035
7036 pde = pmap_pdpe_to_pde(pdp, va);
7037 ptr[idx++] = *pde;
7038 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
7039 goto done;
7040
7041 pte = pmap_pde_to_pte(pde, va);
7042 ptr[idx++] = *pte;
7043
7044 done:
7045 PMAP_UNLOCK(pmap);
7046 *num = idx;
7047 }
7048
7049 #include "opt_ddb.h"
7050 #ifdef DDB
7051 #include <ddb/ddb.h>
7052
7053 DB_SHOW_COMMAND(pte, pmap_print_pte)
7054 {
7055 pmap_t pmap;
7056 pml4_entry_t *pml4;
7057 pdp_entry_t *pdp;
7058 pd_entry_t *pde;
7059 pt_entry_t *pte, PG_V;
7060 vm_offset_t va;
7061
7062 if (have_addr) {
7063 va = (vm_offset_t)addr;
7064 pmap = PCPU_GET(curpmap); /* XXX */
7065 } else {
7066 db_printf("show pte addr\n");
7067 return;
7068 }
7069 PG_V = pmap_valid_bit(pmap);
7070 pml4 = pmap_pml4e(pmap, va);
7071 db_printf("VA %#016lx pml4e %#016lx", va, *pml4);
7072 if ((*pml4 & PG_V) == 0) {
7073 db_printf("\n");
7074 return;
7075 }
7076 pdp = pmap_pml4e_to_pdpe(pml4, va);
7077 db_printf(" pdpe %#016lx", *pdp);
7078 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
7079 db_printf("\n");
7080 return;
7081 }
7082 pde = pmap_pdpe_to_pde(pdp, va);
7083 db_printf(" pde %#016lx", *pde);
7084 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
7085 db_printf("\n");
7086 return;
7087 }
7088 pte = pmap_pde_to_pte(pde, va);
7089 db_printf(" pte %#016lx\n", *pte);
7090 }
7091
7092 DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
7093 {
7094 vm_paddr_t a;
7095
7096 if (have_addr) {
7097 a = (vm_paddr_t)addr;
7098 db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
7099 } else {
7100 db_printf("show phys2dmap addr\n");
7101 }
7102 }
7103 #endif
Cache object: 931fbf718e454108a28824bf2f0c5c26
|