FreeBSD/Linux Kernel Cross Reference
sys/amd64/amd64/pmap.c
1 /*-
2 * SPDX-License-Identifier: BSD-4-Clause
3 *
4 * Copyright (c) 1991 Regents of the University of California.
5 * All rights reserved.
6 * Copyright (c) 1994 John S. Dyson
7 * All rights reserved.
8 * Copyright (c) 1994 David Greenman
9 * All rights reserved.
10 * Copyright (c) 2003 Peter Wemm
11 * All rights reserved.
12 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
13 * All rights reserved.
14 *
15 * This code is derived from software contributed to Berkeley by
16 * the Systems Programming Group of the University of Utah Computer
17 * Science Department and William Jolitz of UUNET Technologies Inc.
18 *
19 * Redistribution and use in source and binary forms, with or without
20 * modification, are permitted provided that the following conditions
21 * are met:
22 * 1. Redistributions of source code must retain the above copyright
23 * notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 * notice, this list of conditions and the following disclaimer in the
26 * documentation and/or other materials provided with the distribution.
27 * 3. All advertising materials mentioning features or use of this software
28 * must display the following acknowledgement:
29 * This product includes software developed by the University of
30 * California, Berkeley and its contributors.
31 * 4. Neither the name of the University nor the names of its contributors
32 * may be used to endorse or promote products derived from this software
33 * without specific prior written permission.
34 *
35 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
36 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
38 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
39 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
40 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
41 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
42 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
43 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
44 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
45 * SUCH DAMAGE.
46 *
47 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
48 */
49 /*-
50 * Copyright (c) 2003 Networks Associates Technology, Inc.
51 * Copyright (c) 2014-2019 The FreeBSD Foundation
52 * All rights reserved.
53 *
54 * This software was developed for the FreeBSD Project by Jake Burkholder,
55 * Safeport Network Services, and Network Associates Laboratories, the
56 * Security Research Division of Network Associates, Inc. under
57 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
58 * CHATS research program.
59 *
60 * Portions of this software were developed by
61 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
62 * the FreeBSD Foundation.
63 *
64 * Redistribution and use in source and binary forms, with or without
65 * modification, are permitted provided that the following conditions
66 * are met:
67 * 1. Redistributions of source code must retain the above copyright
68 * notice, this list of conditions and the following disclaimer.
69 * 2. Redistributions in binary form must reproduce the above copyright
70 * notice, this list of conditions and the following disclaimer in the
71 * documentation and/or other materials provided with the distribution.
72 *
73 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
74 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
75 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
76 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
77 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
78 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
79 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
80 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
81 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
82 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
83 * SUCH DAMAGE.
84 */
85
86 #define AMD64_NPT_AWARE
87
88 #include <sys/cdefs.h>
89 __FBSDID("$FreeBSD$");
90
91 /*
92 * Manages physical address maps.
93 *
94 * Since the information managed by this module is
95 * also stored by the logical address mapping module,
96 * this module may throw away valid virtual-to-physical
97 * mappings at almost any time. However, invalidations
98 * of virtual-to-physical mappings must be done as
99 * requested.
100 *
101 * In order to cope with hardware architectures which
102 * make virtual-to-physical map invalidations expensive,
103 * this module may delay invalidation or reduced-protection
104 * operations until such time as they are actually
105 * necessary. This module is given full information as
106 * to which processors are currently using which maps,
107 * and as to when physical maps must be made correct.
108 */
109
110 #include "opt_ddb.h"
111 #include "opt_pmap.h"
112 #include "opt_vm.h"
113
114 #include <sys/param.h>
115 #include <sys/bitstring.h>
116 #include <sys/bus.h>
117 #include <sys/systm.h>
118 #include <sys/kernel.h>
119 #include <sys/ktr.h>
120 #include <sys/lock.h>
121 #include <sys/malloc.h>
122 #include <sys/mman.h>
123 #include <sys/mutex.h>
124 #include <sys/proc.h>
125 #include <sys/rangeset.h>
126 #include <sys/rwlock.h>
127 #include <sys/sbuf.h>
128 #include <sys/sx.h>
129 #include <sys/turnstile.h>
130 #include <sys/vmem.h>
131 #include <sys/vmmeter.h>
132 #include <sys/sched.h>
133 #include <sys/sysctl.h>
134 #include <sys/smp.h>
135 #ifdef DDB
136 #include <sys/kdb.h>
137 #include <ddb/ddb.h>
138 #endif
139
140 #include <vm/vm.h>
141 #include <vm/vm_param.h>
142 #include <vm/vm_kern.h>
143 #include <vm/vm_page.h>
144 #include <vm/vm_map.h>
145 #include <vm/vm_object.h>
146 #include <vm/vm_extern.h>
147 #include <vm/vm_pageout.h>
148 #include <vm/vm_pager.h>
149 #include <vm/vm_phys.h>
150 #include <vm/vm_radix.h>
151 #include <vm/vm_reserv.h>
152 #include <vm/uma.h>
153
154 #include <machine/intr_machdep.h>
155 #include <x86/apicvar.h>
156 #include <x86/ifunc.h>
157 #include <machine/cpu.h>
158 #include <machine/cputypes.h>
159 #include <machine/intr_machdep.h>
160 #include <machine/md_var.h>
161 #include <machine/pcb.h>
162 #include <machine/specialreg.h>
163 #ifdef SMP
164 #include <machine/smp.h>
165 #endif
166 #include <machine/sysarch.h>
167 #include <machine/tss.h>
168
169 static __inline boolean_t
170 pmap_type_guest(pmap_t pmap)
171 {
172
173 return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
174 }
175
176 static __inline boolean_t
177 pmap_emulate_ad_bits(pmap_t pmap)
178 {
179
180 return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
181 }
182
183 static __inline pt_entry_t
184 pmap_valid_bit(pmap_t pmap)
185 {
186 pt_entry_t mask;
187
188 switch (pmap->pm_type) {
189 case PT_X86:
190 case PT_RVI:
191 mask = X86_PG_V;
192 break;
193 case PT_EPT:
194 if (pmap_emulate_ad_bits(pmap))
195 mask = EPT_PG_EMUL_V;
196 else
197 mask = EPT_PG_READ;
198 break;
199 default:
200 panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
201 }
202
203 return (mask);
204 }
205
206 static __inline pt_entry_t
207 pmap_rw_bit(pmap_t pmap)
208 {
209 pt_entry_t mask;
210
211 switch (pmap->pm_type) {
212 case PT_X86:
213 case PT_RVI:
214 mask = X86_PG_RW;
215 break;
216 case PT_EPT:
217 if (pmap_emulate_ad_bits(pmap))
218 mask = EPT_PG_EMUL_RW;
219 else
220 mask = EPT_PG_WRITE;
221 break;
222 default:
223 panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
224 }
225
226 return (mask);
227 }
228
229 static pt_entry_t pg_g;
230
231 static __inline pt_entry_t
232 pmap_global_bit(pmap_t pmap)
233 {
234 pt_entry_t mask;
235
236 switch (pmap->pm_type) {
237 case PT_X86:
238 mask = pg_g;
239 break;
240 case PT_RVI:
241 case PT_EPT:
242 mask = 0;
243 break;
244 default:
245 panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
246 }
247
248 return (mask);
249 }
250
251 static __inline pt_entry_t
252 pmap_accessed_bit(pmap_t pmap)
253 {
254 pt_entry_t mask;
255
256 switch (pmap->pm_type) {
257 case PT_X86:
258 case PT_RVI:
259 mask = X86_PG_A;
260 break;
261 case PT_EPT:
262 if (pmap_emulate_ad_bits(pmap))
263 mask = EPT_PG_READ;
264 else
265 mask = EPT_PG_A;
266 break;
267 default:
268 panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
269 }
270
271 return (mask);
272 }
273
274 static __inline pt_entry_t
275 pmap_modified_bit(pmap_t pmap)
276 {
277 pt_entry_t mask;
278
279 switch (pmap->pm_type) {
280 case PT_X86:
281 case PT_RVI:
282 mask = X86_PG_M;
283 break;
284 case PT_EPT:
285 if (pmap_emulate_ad_bits(pmap))
286 mask = EPT_PG_WRITE;
287 else
288 mask = EPT_PG_M;
289 break;
290 default:
291 panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
292 }
293
294 return (mask);
295 }
296
297 static __inline pt_entry_t
298 pmap_pku_mask_bit(pmap_t pmap)
299 {
300
301 return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0);
302 }
303
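/*
 * Illustrative sketch (not part of the original file): code throughout this
 * file caches the per-pmap PTE bits in locals before inspecting page-table
 * entries, so the same logic serves the native, RVI, and EPT formats.  The
 * function name below is hypothetical.
 */
#if 0
static void
pmap_dirty_check_example(pmap_t pmap, pt_entry_t *pte, vm_page_t m)
{
	pt_entry_t PG_M, PG_RW;

	PG_M = pmap_modified_bit(pmap);
	PG_RW = pmap_rw_bit(pmap);
	/* A mapping is dirty only if it is writeable and was modified. */
	if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
		vm_page_dirty(m);
}
#endif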
304 #if !defined(DIAGNOSTIC)
305 #ifdef __GNUC_GNU_INLINE__
306 #define PMAP_INLINE __attribute__((__gnu_inline__)) inline
307 #else
308 #define PMAP_INLINE extern inline
309 #endif
310 #else
311 #define PMAP_INLINE
312 #endif
313
314 #ifdef PV_STATS
315 #define PV_STAT(x) do { x ; } while (0)
316 #else
317 #define PV_STAT(x) do { } while (0)
318 #endif
319
320 #define pa_index(pa) ((pa) >> PDRSHIFT)
321 #define pa_to_pvh(pa) (&pv_table[pa_index(pa)])
322
323 #define NPV_LIST_LOCKS MAXCPU
324
325 #define PHYS_TO_PV_LIST_LOCK(pa) \
326 (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
327
328 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \
329 struct rwlock **_lockp = (lockp); \
330 struct rwlock *_new_lock; \
331 \
332 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \
333 if (_new_lock != *_lockp) { \
334 if (*_lockp != NULL) \
335 rw_wunlock(*_lockp); \
336 *_lockp = _new_lock; \
337 rw_wlock(*_lockp); \
338 } \
339 } while (0)
340
341 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \
342 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
343
344 #define RELEASE_PV_LIST_LOCK(lockp) do { \
345 struct rwlock **_lockp = (lockp); \
346 \
347 if (*_lockp != NULL) { \
348 rw_wunlock(*_lockp); \
349 *_lockp = NULL; \
350 } \
351 } while (0)
352
353 #define VM_PAGE_TO_PV_LIST_LOCK(m) \
354 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
355
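/*
 * Illustrative sketch (not part of the original file): a typical consumer of
 * the macros above starts with a NULL lock pointer, switches it to the
 * bucket covering each page it touches, and drops whatever lock is still
 * held when it is done.  The function name is hypothetical.
 */
#if 0
static void
pmap_pv_list_lock_example(vm_page_t m)
{
	struct rwlock *lock;

	lock = NULL;
	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
	/* ... manipulate m's PV list while the bucket lock is held ... */
	if (lock != NULL)
		rw_wunlock(lock);
}
#endif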
356 struct pmap kernel_pmap_store;
357
358 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
359 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
360
361 int nkpt;
362 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
363 "Number of kernel page table pages allocated on bootup");
364
365 static int ndmpdp;
366 vm_paddr_t dmaplimit;
367 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
368 pt_entry_t pg_nx;
369
370 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
371
372 /* Unused, kept for ABI stability on the stable branch. */
373 static int pat_works = 1;
374 SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
375 "Is page attribute table fully functional?");
376
377 static int pg_ps_enabled = 1;
378 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
379 &pg_ps_enabled, 0, "Are large page mappings enabled?");
380
381 #define PAT_INDEX_SIZE 8
382 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */
383
384 static u_int64_t KPTphys; /* phys addr of kernel level 1 */
385 static u_int64_t KPDphys; /* phys addr of kernel level 2 */
386 static u_int64_t KPDPphys; /* phys addr of kernel level 3 */
387 u_int64_t KPML4phys; /* phys addr of kernel level 4 */
388
389 static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */
390 static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */
391 static int ndmpdpphys; /* number of DMPDPphys pages */
392
393 static vm_paddr_t KERNend; /* phys addr of end of bootstrap data */
394
395 /*
396 * pmap_mapdev() support prior to VM initialization (e.g., the console)
397 */
398 #define PMAP_PREINIT_MAPPING_COUNT 8
399 static struct pmap_preinit_mapping {
400 vm_paddr_t pa;
401 vm_offset_t va;
402 vm_size_t sz;
403 int mode;
404 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
405 static int pmap_initialized;
406
407 /*
408 * Data for the pv entry allocation mechanism.
409 * Updates to pv_invl_gen are protected by the pv_list_locks[]
410 * elements, but reads are not.
411 */
412 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
413 static struct mtx __exclusive_cache_line pv_chunks_mutex;
414 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
415 static u_long pv_invl_gen[NPV_LIST_LOCKS];
416 static struct md_page *pv_table;
417 static struct md_page pv_dummy;
418
419 /*
420 * All those kernel PT submaps that BSD is so fond of
421 */
422 pt_entry_t *CMAP1 = NULL;
423 caddr_t CADDR1 = 0;
424 static vm_offset_t qframe = 0;
425 static struct mtx qframe_mtx;
426
427 static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */
428
429 static vmem_t *large_vmem;
430 static u_int lm_ents;
431 #define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= LARGEMAP_MIN_ADDRESS && \
432 (va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents)
433
434 int pmap_pcid_enabled = 1;
435 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
436 &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?");
437 int invpcid_works = 0;
438 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
439 "Is the invpcid instruction available ?");
440
441 int __read_frequently pti = 0;
442 SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
443 &pti, 0,
444 "Page Table Isolation enabled");
445 static vm_object_t pti_obj;
446 static pml4_entry_t *pti_pml4;
447 static vm_pindex_t pti_pg_idx;
448 static bool pti_finalized;
449
450 struct pmap_pkru_range {
451 struct rs_el pkru_rs_el;
452 u_int pkru_keyidx;
453 int pkru_flags;
454 };
455
456 static uma_zone_t pmap_pkru_ranges_zone;
457 static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
458 static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va);
459 static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
460 static void *pkru_dup_range(void *ctx, void *data);
461 static void pkru_free_range(void *ctx, void *node);
462 static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap);
463 static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
464 static void pmap_pkru_deassign_all(pmap_t pmap);
465
466 static int
467 pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
468 {
469 int i;
470 uint64_t res;
471
472 res = 0;
473 CPU_FOREACH(i) {
474 res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
475 }
476 return (sysctl_handle_64(oidp, &res, 0, req));
477 }
478 SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RD |
479 CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
480 "Count of saved TLB context on switch");
481
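/*
 * Usage note (not part of the original file): the aggregate counter above is
 * exported read-only and can be inspected at runtime with, e.g.,
 * "sysctl vm.pmap.pcid_save_cnt".
 */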
482 static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
483 LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
484 static struct mtx invl_gen_mtx;
485 /* Fake lock object to satisfy turnstiles interface. */
486 static struct lock_object invl_gen_ts = {
487 .lo_name = "invlts",
488 };
489 static struct pmap_invl_gen pmap_invl_gen_head = {
490 .gen = 1,
491 .next = NULL,
492 };
493 static u_long pmap_invl_gen = 1;
494 static int pmap_invl_waiters;
495 static struct callout pmap_invl_callout;
496 static bool pmap_invl_callout_inited;
497
498 #define PMAP_ASSERT_NOT_IN_DI() \
499 KASSERT(pmap_not_in_di(), ("DI already started"))
500
501 static bool
502 pmap_di_locked(void)
503 {
504 int tun;
505
506 if ((cpu_feature2 & CPUID2_CX16) == 0)
507 return (true);
508 tun = 0;
509 TUNABLE_INT_FETCH("vm.pmap.di_locked", &tun);
510 return (tun != 0);
511 }
512
513 static int
514 sysctl_pmap_di_locked(SYSCTL_HANDLER_ARGS)
515 {
516 int locked;
517
518 locked = pmap_di_locked();
519 return (sysctl_handle_int(oidp, &locked, 0, req));
520 }
521 SYSCTL_PROC(_vm_pmap, OID_AUTO, di_locked, CTLTYPE_INT | CTLFLAG_RDTUN |
522 CTLFLAG_MPSAFE, 0, 0, sysctl_pmap_di_locked, "",
523 "Locked delayed invalidation");
524
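/*
 * Usage note (not part of the original file): the DI implementation is
 * selected once at boot.  pmap_di_locked() forces the locked variant when
 * the CPU lacks cmpxchg16b (CPUID2_CX16); otherwise the loader tunable may
 * request it explicitly, e.g. by setting vm.pmap.di_locked=1 in loader.conf.
 */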
525 static bool pmap_not_in_di_l(void);
526 static bool pmap_not_in_di_u(void);
527 DEFINE_IFUNC(, bool, pmap_not_in_di, (void), static)
528 {
529
530 return (pmap_di_locked() ? pmap_not_in_di_l : pmap_not_in_di_u);
531 }
532
533 static bool
534 pmap_not_in_di_l(void)
535 {
536 struct pmap_invl_gen *invl_gen;
537
538 invl_gen = &curthread->td_md.md_invl_gen;
539 return (invl_gen->gen == 0);
540 }
541
542 static void
543 pmap_thread_init_invl_gen_l(struct thread *td)
544 {
545 struct pmap_invl_gen *invl_gen;
546
547 invl_gen = &td->td_md.md_invl_gen;
548 invl_gen->gen = 0;
549 }
550
551 static void
552 pmap_delayed_invl_wait_block(u_long *m_gen, u_long *invl_gen)
553 {
554 struct turnstile *ts;
555
556 ts = turnstile_trywait(&invl_gen_ts);
557 if (*m_gen > atomic_load_long(invl_gen))
558 turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
559 else
560 turnstile_cancel(ts);
561 }
562
563 static void
564 pmap_delayed_invl_finish_unblock(u_long new_gen)
565 {
566 struct turnstile *ts;
567
568 turnstile_chain_lock(&invl_gen_ts);
569 ts = turnstile_lookup(&invl_gen_ts);
570 if (new_gen != 0)
571 pmap_invl_gen = new_gen;
572 if (ts != NULL) {
573 turnstile_broadcast(ts, TS_SHARED_QUEUE);
574 turnstile_unpend(ts);
575 }
576 turnstile_chain_unlock(&invl_gen_ts);
577 }
578
579 /*
580 * Start a new Delayed Invalidation (DI) block of code, executed by
581 * the current thread. Within a DI block, the current thread may
582 * destroy both the page table and PV list entries for a mapping and
583 * then release the corresponding PV list lock before ensuring that
584 * the mapping is flushed from the TLBs of any processors with the
585 * pmap active.
586 */
587 static void
588 pmap_delayed_invl_start_l(void)
589 {
590 struct pmap_invl_gen *invl_gen;
591 u_long currgen;
592
593 invl_gen = &curthread->td_md.md_invl_gen;
594 PMAP_ASSERT_NOT_IN_DI();
595 mtx_lock(&invl_gen_mtx);
596 if (LIST_EMPTY(&pmap_invl_gen_tracker))
597 currgen = pmap_invl_gen;
598 else
599 currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
600 invl_gen->gen = currgen + 1;
601 LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
602 mtx_unlock(&invl_gen_mtx);
603 }
604
605 /*
606 * Finish the DI block, previously started by the current thread. All
607 * required TLB flushes for the pages marked by
608 * pmap_delayed_invl_page() must be finished before this function is
609 * called.
610 *
611 * This function works by bumping the global DI generation number to
612 * the generation number of the current thread's DI, unless there is a
613 * pending DI that started earlier. In the latter case, bumping the
614 * global DI generation number would incorrectly signal that the
615 * earlier DI had finished. Instead, this function bumps the earlier
616 * DI's generation number to match the generation number of the
617 * current thread's DI.
618 */
619 static void
620 pmap_delayed_invl_finish_l(void)
621 {
622 struct pmap_invl_gen *invl_gen, *next;
623
624 invl_gen = &curthread->td_md.md_invl_gen;
625 KASSERT(invl_gen->gen != 0, ("missed invl_start"));
626 mtx_lock(&invl_gen_mtx);
627 next = LIST_NEXT(invl_gen, link);
628 if (next == NULL)
629 pmap_delayed_invl_finish_unblock(invl_gen->gen);
630 else
631 next->gen = invl_gen->gen;
632 LIST_REMOVE(invl_gen, link);
633 mtx_unlock(&invl_gen_mtx);
634 invl_gen->gen = 0;
635 }
636
637 static bool
638 pmap_not_in_di_u(void)
639 {
640 struct pmap_invl_gen *invl_gen;
641
642 invl_gen = &curthread->td_md.md_invl_gen;
643 return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0);
644 }
645
646 static void
647 pmap_thread_init_invl_gen_u(struct thread *td)
648 {
649 struct pmap_invl_gen *invl_gen;
650
651 invl_gen = &td->td_md.md_invl_gen;
652 invl_gen->gen = 0;
653 invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID;
654 }
655
656 static bool
657 pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out)
658 {
659 uint64_t new_high, new_low, old_high, old_low;
660 char res;
661
662 old_low = new_low = 0;
663 old_high = new_high = (uintptr_t)0;
664
665 __asm volatile("lock;cmpxchg16b\t%1;sete\t%0"
666 : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
667 : "b"(new_low), "c" (new_high)
668 : "memory", "cc");
669 if (res == 0) {
670 if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0)
671 return (false);
672 out->gen = old_low;
673 out->next = (void *)old_high;
674 } else {
675 out->gen = new_low;
676 out->next = (void *)new_high;
677 }
678 return (true);
679 }
680
681 static bool
682 pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val,
683 struct pmap_invl_gen *new_val)
684 {
685 uint64_t new_high, new_low, old_high, old_low;
686 char res;
687
688 new_low = new_val->gen;
689 new_high = (uintptr_t)new_val->next;
690 old_low = old_val->gen;
691 old_high = (uintptr_t)old_val->next;
692
693 __asm volatile("lock;cmpxchg16b\t%1;sete\t%0"
694 : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
695 : "b"(new_low), "c" (new_high)
696 : "memory", "cc");
697 return (res);
698 }
699
700 #ifdef PV_STATS
701 static long invl_start_restart;
702 SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_start_restart, CTLFLAG_RD,
703 &invl_start_restart, 0,
704 "");
705 static long invl_finish_restart;
706 SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD,
707 &invl_finish_restart, 0,
708 "");
709 static int invl_max_qlen;
710 SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD,
711 &invl_max_qlen, 0,
712 "");
713 #endif
714
715 static struct lock_delay_config __read_frequently di_delay;
716 LOCK_DELAY_SYSINIT_DEFAULT(di_delay);
717
718 static void
719 pmap_delayed_invl_start_u(void)
720 {
721 struct pmap_invl_gen *invl_gen, *p, prev, new_prev;
722 struct thread *td;
723 struct lock_delay_arg lda;
724 uintptr_t prevl;
725 u_char pri;
726 #ifdef PV_STATS
727 int i, ii;
728 #endif
729
730 td = curthread;
731 invl_gen = &td->td_md.md_invl_gen;
732 PMAP_ASSERT_NOT_IN_DI();
733 lock_delay_arg_init(&lda, &di_delay);
734 invl_gen->saved_pri = 0;
735 pri = td->td_base_pri;
736 if (pri > PVM) {
737 thread_lock(td);
738 pri = td->td_base_pri;
739 if (pri > PVM) {
740 invl_gen->saved_pri = pri;
741 sched_prio(td, PVM);
742 }
743 thread_unlock(td);
744 }
745 again:
746 PV_STAT(i = 0);
747 for (p = &pmap_invl_gen_head;; p = prev.next) {
748 PV_STAT(i++);
749 prevl = (uintptr_t)atomic_load_ptr(&p->next);
750 if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
751 PV_STAT(atomic_add_long(&invl_start_restart, 1));
752 lock_delay(&lda);
753 goto again;
754 }
755 if (prevl == 0)
756 break;
757 prev.next = (void *)prevl;
758 }
759 #ifdef PV_STATS
760 if ((ii = invl_max_qlen) < i)
761 atomic_cmpset_int(&invl_max_qlen, ii, i);
762 #endif
763
764 if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) {
765 PV_STAT(atomic_add_long(&invl_start_restart, 1));
766 lock_delay(&lda);
767 goto again;
768 }
769
770 new_prev.gen = prev.gen;
771 new_prev.next = invl_gen;
772 invl_gen->gen = prev.gen + 1;
773
774 /* Formal fence between store to invl->gen and updating *p. */
775 atomic_thread_fence_rel();
776
777 /*
778 * After inserting an invl_gen element with the invalid bit set,
779 * this thread blocks any other thread trying to enter the
780 * delayed invalidation block. Do not allow ourselves to be removed
781 * from the CPU, because that would cause starvation for other threads.
782 */
783 critical_enter();
784
785 /*
786 * ABA for *p is not possible here, since p->gen can only
787 * increase. So if the *p thread finished its DI, then
788 * started a new one and got inserted into the list at the
789 * same place, its gen will appear greater than the previously
790 * read gen.
791 */
792 if (!pmap_di_store_invl(p, &prev, &new_prev)) {
793 critical_exit();
794 PV_STAT(atomic_add_long(&invl_start_restart, 1));
795 lock_delay(&lda);
796 goto again;
797 }
798
799 /*
800 * Here we clear PMAP_INVL_GEN_NEXT_INVALID in
801 * invl_gen->next, allowing other threads to iterate past us.
802 * pmap_di_store_invl() provides a fence between the generation
803 * write and the update of next.
804 */
805 invl_gen->next = NULL;
806 critical_exit();
807 }
808
809 static bool
810 pmap_delayed_invl_finish_u_crit(struct pmap_invl_gen *invl_gen,
811 struct pmap_invl_gen *p)
812 {
813 struct pmap_invl_gen prev, new_prev;
814 u_long mygen;
815
816 /*
817 * Load invl_gen->gen after setting PMAP_INVL_GEN_NEXT_INVALID
818 * in invl_gen->next. This prevents larger generations from
819 * propagating to our invl_gen->gen. The lock prefix in
820 * atomic_set_ptr() works as a seq_cst fence.
821 */
822 mygen = atomic_load_long(&invl_gen->gen);
823
824 if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen)
825 return (false);
826
827 KASSERT(prev.gen < mygen,
828 ("invalid di gen sequence %lu %lu", prev.gen, mygen));
829 new_prev.gen = mygen;
830 new_prev.next = (void *)((uintptr_t)invl_gen->next &
831 ~PMAP_INVL_GEN_NEXT_INVALID);
832
833 /* Formal fence between load of prev and storing update to it. */
834 atomic_thread_fence_rel();
835
836 return (pmap_di_store_invl(p, &prev, &new_prev));
837 }
838
839 static void
840 pmap_delayed_invl_finish_u(void)
841 {
842 struct pmap_invl_gen *invl_gen, *p;
843 struct thread *td;
844 struct lock_delay_arg lda;
845 uintptr_t prevl;
846
847 td = curthread;
848 invl_gen = &td->td_md.md_invl_gen;
849 KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0"));
850 KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0,
851 ("missed invl_start: INVALID"));
852 lock_delay_arg_init(&lda, &di_delay);
853
854 again:
855 for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) {
856 prevl = (uintptr_t)atomic_load_ptr(&p->next);
857 if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
858 PV_STAT(atomic_add_long(&invl_finish_restart, 1));
859 lock_delay(&lda);
860 goto again;
861 }
862 if ((void *)prevl == invl_gen)
863 break;
864 }
865
866 /*
867 * It is legitimate not to find ourselves on the list if a
868 * thread before us finished its DI and started it again.
869 */
870 if (__predict_false(p == NULL)) {
871 PV_STAT(atomic_add_long(&invl_finish_restart, 1));
872 lock_delay(&lda);
873 goto again;
874 }
875
876 critical_enter();
877 atomic_set_ptr((uintptr_t *)&invl_gen->next,
878 PMAP_INVL_GEN_NEXT_INVALID);
879 if (!pmap_delayed_invl_finish_u_crit(invl_gen, p)) {
880 atomic_clear_ptr((uintptr_t *)&invl_gen->next,
881 PMAP_INVL_GEN_NEXT_INVALID);
882 critical_exit();
883 PV_STAT(atomic_add_long(&invl_finish_restart, 1));
884 lock_delay(&lda);
885 goto again;
886 }
887 critical_exit();
888 if (atomic_load_int(&pmap_invl_waiters) > 0)
889 pmap_delayed_invl_finish_unblock(0);
890 if (invl_gen->saved_pri != 0) {
891 thread_lock(td);
892 sched_prio(td, invl_gen->saved_pri);
893 thread_unlock(td);
894 }
895 }
896
897 #ifdef DDB
898 DB_SHOW_COMMAND(di_queue, pmap_di_queue)
899 {
900 struct pmap_invl_gen *p, *pn;
901 struct thread *td;
902 uintptr_t nextl;
903 bool first;
904
905 for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn,
906 first = false) {
907 nextl = (uintptr_t)atomic_load_ptr(&p->next);
908 pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID);
909 td = first ? NULL : __containerof(p, struct thread,
910 td_md.md_invl_gen);
911 db_printf("gen %lu inv %d td %p tid %d\n", p->gen,
912 (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td,
913 td != NULL ? td->td_tid : -1);
914 }
915 }
916 #endif
917
918 #ifdef PV_STATS
919 static long invl_wait;
920 SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0,
921 "Number of times DI invalidation blocked pmap_remove_all/write");
922 static long invl_wait_slow;
923 SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD, &invl_wait_slow, 0,
924 "Number of slow invalidation waits for lockless DI");
925 #endif
926
927 static u_long *
928 pmap_delayed_invl_genp(vm_page_t m)
929 {
930
931 return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]);
932 }
933
934 static void
935 pmap_delayed_invl_callout_func(void *arg __unused)
936 {
937
938 if (atomic_load_int(&pmap_invl_waiters) == 0)
939 return;
940 pmap_delayed_invl_finish_unblock(0);
941 }
942
943 static void
944 pmap_delayed_invl_callout_init(void *arg __unused)
945 {
946
947 if (pmap_di_locked())
948 return;
949 callout_init(&pmap_invl_callout, 1);
950 pmap_invl_callout_inited = true;
951 }
952 SYSINIT(pmap_di_callout, SI_SUB_CPU + 1, SI_ORDER_ANY,
953 pmap_delayed_invl_callout_init, NULL);
954
955 /*
956 * Ensure that all currently executing DI blocks that need to flush the
957 * TLB for the given page m have actually flushed the TLB by the time
958 * this function returns. If the page m has an empty PV list and we call
959 * pmap_delayed_invl_wait(), upon its return we know that no CPU has a
960 * valid mapping for the page m in either its page table or TLB.
961 *
962 * This function works by blocking until the global DI generation
963 * number catches up with the generation number associated with the
964 * given page m and its PV list. Since this function's callers
965 * typically own an object lock and sometimes own a page lock, it
966 * cannot sleep. Instead, it blocks on a turnstile to relinquish the
967 * processor.
968 */
969 static void
970 pmap_delayed_invl_wait_l(vm_page_t m)
971 {
972 u_long *m_gen;
973 #ifdef PV_STATS
974 bool accounted = false;
975 #endif
976
977 m_gen = pmap_delayed_invl_genp(m);
978 while (*m_gen > pmap_invl_gen) {
979 #ifdef PV_STATS
980 if (!accounted) {
981 atomic_add_long(&invl_wait, 1);
982 accounted = true;
983 }
984 #endif
985 pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen);
986 }
987 }
988
989 static void
990 pmap_delayed_invl_wait_u(vm_page_t m)
991 {
992 u_long *m_gen;
993 struct lock_delay_arg lda;
994 bool fast;
995
996 fast = true;
997 m_gen = pmap_delayed_invl_genp(m);
998 lock_delay_arg_init(&lda, &di_delay);
999 while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) {
1000 if (fast || !pmap_invl_callout_inited) {
1001 PV_STAT(atomic_add_long(&invl_wait, 1));
1002 lock_delay(&lda);
1003 fast = false;
1004 } else {
1005 /*
1006 * The page's invalidation generation number
1007 * is still below the current thread's number.
1008 * Prepare to block so that we do not waste
1009 * CPU cycles or worse, suffer livelock.
1010 *
1011 * Since it is impossible to block without
1012 * racing with pmap_delayed_invl_finish_u(),
1013 * prepare for the race by incrementing
1014 * pmap_invl_waiters and arming a 1-tick
1015 * callout which will unblock us if we lose
1016 * the race.
1017 */
1018 atomic_add_int(&pmap_invl_waiters, 1);
1019
1020 /*
1021 * Re-check the current thread's invalidation
1022 * generation after incrementing
1023 * pmap_invl_waiters, so that there is no race
1024 * with pmap_delayed_invl_finish_u() setting
1025 * the page generation and checking
1026 * pmap_invl_waiters. The only race allowed
1027 * is for a missed unblock, which is handled
1028 * by the callout.
1029 */
1030 if (*m_gen >
1031 atomic_load_long(&pmap_invl_gen_head.gen)) {
1032 callout_reset(&pmap_invl_callout, 1,
1033 pmap_delayed_invl_callout_func, NULL);
1034 PV_STAT(atomic_add_long(&invl_wait_slow, 1));
1035 pmap_delayed_invl_wait_block(m_gen,
1036 &pmap_invl_gen_head.gen);
1037 }
1038 atomic_add_int(&pmap_invl_waiters, -1);
1039 }
1040 }
1041 }
1042
1043 DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *), static)
1044 {
1045
1046 return (pmap_di_locked() ? pmap_thread_init_invl_gen_l :
1047 pmap_thread_init_invl_gen_u);
1048 }
1049
1050 DEFINE_IFUNC(static, void, pmap_delayed_invl_start, (void), static)
1051 {
1052
1053 return (pmap_di_locked() ? pmap_delayed_invl_start_l :
1054 pmap_delayed_invl_start_u);
1055 }
1056
1057 DEFINE_IFUNC(static, void, pmap_delayed_invl_finish, (void), static)
1058 {
1059
1060 return (pmap_di_locked() ? pmap_delayed_invl_finish_l :
1061 pmap_delayed_invl_finish_u);
1062 }
1063
1064 DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t), static)
1065 {
1066
1067 return (pmap_di_locked() ? pmap_delayed_invl_wait_l :
1068 pmap_delayed_invl_wait_u);
1069 }
1070
1071 /*
1072 * Mark the page m's PV list as participating in the current thread's
1073 * DI block. Any threads concurrently using m's PV list to remove or
1074 * restrict all mappings to m will wait for the current thread's DI
1075 * block to complete before proceeding.
1076 *
1077 * The function works by setting the DI generation number for m's PV
1078 * list to at least the DI generation number of the current thread.
1079 * This forces a caller of pmap_delayed_invl_wait() to block until
1080 * the current thread calls pmap_delayed_invl_finish().
1081 */
1082 static void
1083 pmap_delayed_invl_page(vm_page_t m)
1084 {
1085 u_long gen, *m_gen;
1086
1087 rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED);
1088 gen = curthread->td_md.md_invl_gen.gen;
1089 if (gen == 0)
1090 return;
1091 m_gen = pmap_delayed_invl_genp(m);
1092 if (*m_gen < gen)
1093 *m_gen = gen;
1094 }
1095
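/*
 * Illustrative sketch (not part of the original file) of the DI protocol
 * described above, with the page-table manipulation elided.  The function
 * name is hypothetical; the helpers called are the ones defined in this
 * file.
 */
#if 0
static void
pmap_di_protocol_example(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct rwlock *lock)
{

	pmap_delayed_invl_start();
	/* ... clear the PTE and free m's PV entry while holding 'lock' ... */
	pmap_delayed_invl_page(m);	/* mark m's PV list with our DI gen */
	rw_wunlock(lock);		/* the PV list lock may be dropped early */
	pmap_invalidate_page(pmap, va);	/* flush the stale TLB entries */
	pmap_delayed_invl_finish();
	/*
	 * A thread that finds m's PV list empty calls
	 * pmap_delayed_invl_wait(m) to be certain the flush above is done.
	 */
}
#endif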
1096 /*
1097 * Crashdump maps.
1098 */
1099 static caddr_t crashdumpmap;
1100
1101 /*
1102 * Internal flags for pmap_enter()'s helper functions.
1103 */
1104 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */
1105 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */
1106
1107 /*
1108 * Internal flags for pmap_mapdev_internal() and
1109 * pmap_change_props_locked().
1110 */
1111 #define MAPDEV_FLUSHCACHE 0x00000001 /* Flush cache after mapping. */
1112 #define MAPDEV_SETATTR 0x00000002 /* Modify existing attrs. */
1113 #define MAPDEV_ASSERTVALID 0x00000004 /* Assert mapping validity. */
1114
1115 TAILQ_HEAD(pv_chunklist, pv_chunk);
1116
1117 static void free_pv_chunk(struct pv_chunk *pc);
1118 static void free_pv_chunk_batch(struct pv_chunklist *batch);
1119 static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
1120 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
1121 static int popcnt_pc_map_pq(uint64_t *map);
1122 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
1123 static void reserve_pv_entries(pmap_t pmap, int needed,
1124 struct rwlock **lockp);
1125 static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
1126 struct rwlock **lockp);
1127 static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde,
1128 u_int flags, struct rwlock **lockp);
1129 #if VM_NRESERVLEVEL > 0
1130 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
1131 struct rwlock **lockp);
1132 #endif
1133 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
1134 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
1135 vm_offset_t va);
1136
1137 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
1138 vm_prot_t prot, int mode, int flags);
1139 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
1140 static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
1141 vm_offset_t va, struct rwlock **lockp);
1142 static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
1143 vm_offset_t va);
1144 static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
1145 vm_prot_t prot, struct rwlock **lockp);
1146 static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde,
1147 u_int flags, vm_page_t m, struct rwlock **lockp);
1148 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
1149 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
1150 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
1151 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted);
1152 static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva,
1153 vm_offset_t eva);
1154 static void pmap_invalidate_cache_range_all(vm_offset_t sva,
1155 vm_offset_t eva);
1156 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
1157 pd_entry_t pde);
1158 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
1159 static vm_page_t pmap_large_map_getptp_unlocked(void);
1160 static vm_paddr_t pmap_large_map_kextract(vm_offset_t va);
1161 #if VM_NRESERVLEVEL > 0
1162 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
1163 struct rwlock **lockp);
1164 #endif
1165 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
1166 vm_prot_t prot);
1167 static void pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask);
1168 static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva,
1169 bool exec);
1170 static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va);
1171 static pd_entry_t *pmap_pti_pde(vm_offset_t va);
1172 static void pmap_pti_wire_pte(void *pte);
1173 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
1174 struct spglist *free, struct rwlock **lockp);
1175 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
1176 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
1177 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
1178 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
1179 struct spglist *free);
1180 static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1181 pd_entry_t *pde, struct spglist *free,
1182 struct rwlock **lockp);
1183 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
1184 vm_page_t m, struct rwlock **lockp);
1185 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
1186 pd_entry_t newpde);
1187 static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
1188
1189 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
1190 struct rwlock **lockp);
1191 static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
1192 struct rwlock **lockp);
1193 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
1194 struct rwlock **lockp);
1195
1196 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
1197 struct spglist *free);
1198 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
1199
1200 /********************/
1201 /* Inline functions */
1202 /********************/
1203
1204 /* Return a non-clipped PD index for a given VA */
1205 static __inline vm_pindex_t
1206 pmap_pde_pindex(vm_offset_t va)
1207 {
1208 return (va >> PDRSHIFT);
1209 }
1210
1211
1212 /* Return a pointer to the PML4 slot that corresponds to a VA */
1213 static __inline pml4_entry_t *
1214 pmap_pml4e(pmap_t pmap, vm_offset_t va)
1215 {
1216
1217 return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
1218 }
1219
1220 /* Return a pointer to the PDP slot that corresponds to a VA */
1221 static __inline pdp_entry_t *
1222 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
1223 {
1224 pdp_entry_t *pdpe;
1225
1226 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
1227 return (&pdpe[pmap_pdpe_index(va)]);
1228 }
1229
1230 /* Return a pointer to the PDP slot that corresponds to a VA */
1231 static __inline pdp_entry_t *
1232 pmap_pdpe(pmap_t pmap, vm_offset_t va)
1233 {
1234 pml4_entry_t *pml4e;
1235 pt_entry_t PG_V;
1236
1237 PG_V = pmap_valid_bit(pmap);
1238 pml4e = pmap_pml4e(pmap, va);
1239 if ((*pml4e & PG_V) == 0)
1240 return (NULL);
1241 return (pmap_pml4e_to_pdpe(pml4e, va));
1242 }
1243
1244 /* Return a pointer to the PD slot that corresponds to a VA */
1245 static __inline pd_entry_t *
1246 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
1247 {
1248 pd_entry_t *pde;
1249
1250 KASSERT((*pdpe & PG_PS) == 0,
1251 ("%s: pdpe %#lx is a leaf", __func__, *pdpe));
1252 pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
1253 return (&pde[pmap_pde_index(va)]);
1254 }
1255
1256 /* Return a pointer to the PD slot that corresponds to a VA */
1257 static __inline pd_entry_t *
1258 pmap_pde(pmap_t pmap, vm_offset_t va)
1259 {
1260 pdp_entry_t *pdpe;
1261 pt_entry_t PG_V;
1262
1263 PG_V = pmap_valid_bit(pmap);
1264 pdpe = pmap_pdpe(pmap, va);
1265 if (pdpe == NULL || (*pdpe & PG_V) == 0)
1266 return (NULL);
1267 return (pmap_pdpe_to_pde(pdpe, va));
1268 }
1269
1270 /* Return a pointer to the PT slot that corresponds to a VA */
1271 static __inline pt_entry_t *
1272 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
1273 {
1274 pt_entry_t *pte;
1275
1276 KASSERT((*pde & PG_PS) == 0,
1277 ("%s: pde %#lx is a leaf", __func__, *pde));
1278 pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
1279 return (&pte[pmap_pte_index(va)]);
1280 }
1281
1282 /* Return a pointer to the PT slot that corresponds to a VA */
1283 static __inline pt_entry_t *
1284 pmap_pte(pmap_t pmap, vm_offset_t va)
1285 {
1286 pd_entry_t *pde;
1287 pt_entry_t PG_V;
1288
1289 PG_V = pmap_valid_bit(pmap);
1290 pde = pmap_pde(pmap, va);
1291 if (pde == NULL || (*pde & PG_V) == 0)
1292 return (NULL);
1293 if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */
1294 return ((pt_entry_t *)pde);
1295 return (pmap_pde_to_pte(pde, va));
1296 }
1297
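/*
 * Illustrative sketch (not part of the original file): translating a VA to
 * a PA with the walkers above, assuming a 4KB mapping; a 2MB mapping would
 * have to be recognized via PG_PS in the PDE, as pmap_extract() does.  The
 * function name is hypothetical.
 */
#if 0
static vm_paddr_t
pmap_extract_4k_example(pmap_t pmap, vm_offset_t va)
{
	pt_entry_t *pte;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pte = pmap_pte(pmap, va);
	if (pte == NULL || (*pte & pmap_valid_bit(pmap)) == 0)
		return (0);
	return ((*pte & PG_FRAME) | (va & PAGE_MASK));
}
#endif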
1298 static __inline void
1299 pmap_resident_count_inc(pmap_t pmap, int count)
1300 {
1301
1302 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1303 pmap->pm_stats.resident_count += count;
1304 }
1305
1306 static __inline void
1307 pmap_resident_count_dec(pmap_t pmap, int count)
1308 {
1309
1310 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1311 KASSERT(pmap->pm_stats.resident_count >= count,
1312 ("pmap %p resident count underflow %ld %d", pmap,
1313 pmap->pm_stats.resident_count, count));
1314 pmap->pm_stats.resident_count -= count;
1315 }
1316
1317 PMAP_INLINE pt_entry_t *
1318 vtopte(vm_offset_t va)
1319 {
1320 u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
1321
1322 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
1323
1324 return (PTmap + ((va >> PAGE_SHIFT) & mask));
1325 }
1326
1327 static __inline pd_entry_t *
1328 vtopde(vm_offset_t va)
1329 {
1330 u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
1331
1332 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
1333
1334 return (PDmap + ((va >> PDRSHIFT) & mask));
1335 }
1336
1337 static u_int64_t
1338 allocpages(vm_paddr_t *firstaddr, int n)
1339 {
1340 u_int64_t ret;
1341
1342 ret = *firstaddr;
1343 bzero((void *)ret, n * PAGE_SIZE);
1344 *firstaddr += n * PAGE_SIZE;
1345 return (ret);
1346 }
1347
1348 CTASSERT(powerof2(NDMPML4E));
1349
1350 /* number of kernel PDP slots */
1351 #define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG)
1352
1353 static void
1354 nkpt_init(vm_paddr_t addr)
1355 {
1356 int pt_pages;
1357
1358 #ifdef NKPT
1359 pt_pages = NKPT;
1360 #else
1361 pt_pages = howmany(addr, 1 << PDRSHIFT);
1362 pt_pages += NKPDPE(pt_pages);
1363
1364 /*
1365 * Add some slop beyond the bare minimum required for bootstrapping
1366 * the kernel.
1367 *
1368 * This is quite important when allocating KVA for kernel modules.
1369 * The modules are required to be linked in the negative 2GB of
1370 * the address space. If we run out of KVA in this region then
1371 * pmap_growkernel() will need to allocate page table pages to map
1372 * the entire 512GB of KVA space which is an unnecessary tax on
1373 * physical memory.
1374 *
1375 * Secondly, device memory mapped as part of setting up the low-
1376 * level console(s) is taken from KVA, starting at virtual_avail.
1377 * This is because cninit() is called after pmap_bootstrap() but
1378 * before vm_init() and pmap_init(). 20MB for a frame buffer is
1379 * not uncommon.
1380 */
1381 pt_pages += 32; /* 64MB additional slop. */
1382 #endif
1383 nkpt = pt_pages;
1384 }
1385
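/*
 * Worked example (illustrative, not part of the original file): if the boot
 * allocations end at addr = 128MB, then pt_pages = howmany(128MB, 2MB) = 64,
 * plus NKPDPE(64) = 1 page directory page, plus the 32-page (64MB) slop,
 * giving nkpt = 97.
 */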
1386 /*
1387 * Returns the proper write/execute permission for a physical page that is
1388 * part of the initial boot allocations.
1389 *
1390 * If the page has kernel text, it is marked as read-only. If the page has
1391 * kernel read-only data, it is marked as read-only/not-executable. If the
1392 * page has only read-write data, it is marked as read-write/not-executable.
1393 * If the page is below/above the kernel range, it is marked as read-write.
1394 *
1395 * This function operates on 2M pages, since we map the kernel space that
1396 * way.
1397 *
1398 * Note that this doesn't currently provide any protection for modules.
1399 */
1400 static inline pt_entry_t
1401 bootaddr_rwx(vm_paddr_t pa)
1402 {
1403
1404 /*
1405 * Everything in the same 2M page as the start of the kernel
1406 * should be static. On the other hand, things in the same 2M
1407 * page as the end of the kernel could be read-write/executable,
1408 * as the kernel image is not guaranteed to end on a 2M boundary.
1409 */
1410 if (pa < trunc_2mpage(btext - KERNBASE) ||
1411 pa >= trunc_2mpage(_end - KERNBASE))
1412 return (X86_PG_RW);
1413 /*
1414 * The linker should ensure that the read-only and read-write
1415 * portions don't share the same 2M page, so this shouldn't
1416 * impact read-only data. However, in any case, any page with
1417 * read-write data needs to be read-write.
1418 */
1419 if (pa >= trunc_2mpage(brwsection - KERNBASE))
1420 return (X86_PG_RW | pg_nx);
1421 /*
1422 * Mark any 2M page containing kernel text as read-only. Mark
1423 * other pages with read-only data as read-only and not executable.
1424 * (It is likely a small portion of the read-only data section will
1425 * be marked as read-only, but executable. This should be acceptable
1426 * since the read-only protection will keep the data from changing.)
1427 * Note that fixups to the .text section will still work until we
1428 * set CR0.WP.
1429 */
1430 if (pa < round_2mpage(etext - KERNBASE))
1431 return (0);
1432 return (pg_nx);
1433 }
1434
1435 static void
1436 create_pagetables(vm_paddr_t *firstaddr)
1437 {
1438 int i, j, ndm1g, nkpdpe, nkdmpde;
1439 pd_entry_t *pd_p;
1440 pdp_entry_t *pdp_p;
1441 pml4_entry_t *p4_p;
1442 uint64_t DMPDkernphys;
1443
1444 /* Allocate page table pages for the direct map */
1445 ndmpdp = howmany(ptoa(Maxmem), NBPDP);
1446 	if (ndmpdp < 4)		/* Minimum 4GB of direct map */
1447 ndmpdp = 4;
1448 ndmpdpphys = howmany(ndmpdp, NPDPEPG);
1449 if (ndmpdpphys > NDMPML4E) {
1450 /*
1451 * Each NDMPML4E allows 512 GB, so limit to that,
1452 * and then readjust ndmpdp and ndmpdpphys.
1453 */
1454 printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
1455 Maxmem = atop(NDMPML4E * NBPML4);
1456 ndmpdpphys = NDMPML4E;
1457 ndmpdp = NDMPML4E * NPDEPG;
1458 }
1459 DMPDPphys = allocpages(firstaddr, ndmpdpphys);
1460 ndm1g = 0;
1461 if ((amd_feature & AMDID_PAGE1GB) != 0) {
1462 /*
1463 * Calculate the number of 1G pages that will fully fit in
1464 * Maxmem.
1465 */
1466 ndm1g = ptoa(Maxmem) >> PDPSHIFT;
1467
1468 /*
1469 * Allocate 2M pages for the kernel. These will be used in
1470 * place of the first one or more 1G pages from ndm1g.
1471 */
1472 nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP);
1473 DMPDkernphys = allocpages(firstaddr, nkdmpde);
1474 }
1475 if (ndm1g < ndmpdp)
1476 DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
1477 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
1478
1479 /* Allocate pages */
1480 KPML4phys = allocpages(firstaddr, 1);
1481 KPDPphys = allocpages(firstaddr, NKPML4E);
1482
1483 /*
1484 * Allocate the initial number of kernel page table pages required to
1485 * bootstrap. We defer this until after all memory-size dependent
1486 * allocations are done (e.g. direct map), so that we don't have to
1487 * build in too much slop in our estimate.
1488 *
1489 * Note that when NKPML4E > 1, we have an empty page underneath
1490 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
1491 * pages. (pmap_enter requires a PD page to exist for each KPML4E.)
1492 */
1493 nkpt_init(*firstaddr);
1494 nkpdpe = NKPDPE(nkpt);
1495
1496 KPTphys = allocpages(firstaddr, nkpt);
1497 KPDphys = allocpages(firstaddr, nkpdpe);
1498
1499 /*
1500 * Connect the zero-filled PT pages to their PD entries. This
1501 * implicitly maps the PT pages at their correct locations within
1502 * the PTmap.
1503 */
1504 pd_p = (pd_entry_t *)KPDphys;
1505 for (i = 0; i < nkpt; i++)
1506 pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
1507
1508 /*
1509 * Map from physical address zero to the end of loader preallocated
1510 * memory using 2MB pages. This replaces some of the PD entries
1511 * created above.
1512 */
1513 for (i = 0; (i << PDRSHIFT) < KERNend; i++)
1514 /* Preset PG_M and PG_A because demotion expects it. */
1515 pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
1516 X86_PG_M | X86_PG_A | bootaddr_rwx(i << PDRSHIFT);
1517
1518 /*
1519 * Because we map the physical blocks in 2M pages, adjust firstaddr
1520 * to record the physical blocks we've actually mapped into kernel
1521 * virtual address space.
1522 */
1523 if (*firstaddr < round_2mpage(KERNend))
1524 *firstaddr = round_2mpage(KERNend);
1525
1526 /* And connect up the PD to the PDP (leaving room for L4 pages) */
1527 pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
1528 for (i = 0; i < nkpdpe; i++)
1529 pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
1530
1531 /*
1532 * Now, set up the direct map region using 2MB and/or 1GB pages. If
1533 * the end of physical memory is not aligned to a 1GB page boundary,
1534 * then the residual physical memory is mapped with 2MB pages. Later,
1535 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
1536 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
1537 * that are partially used.
1538 */
1539 pd_p = (pd_entry_t *)DMPDphys;
1540 for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
1541 pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
1542 /* Preset PG_M and PG_A because demotion expects it. */
1543 pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
1544 X86_PG_M | X86_PG_A | pg_nx;
1545 }
1546 pdp_p = (pdp_entry_t *)DMPDPphys;
1547 for (i = 0; i < ndm1g; i++) {
1548 pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
1549 /* Preset PG_M and PG_A because demotion expects it. */
1550 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
1551 X86_PG_M | X86_PG_A | pg_nx;
1552 }
1553 for (j = 0; i < ndmpdp; i++, j++) {
1554 pdp_p[i] = DMPDphys + ptoa(j);
1555 pdp_p[i] |= X86_PG_RW | X86_PG_V | pg_nx;
1556 }
1557
1558 /*
1559 * Instead of using a 1G page for the memory containing the kernel,
1560 * use 2M pages with read-only and no-execute permissions. (If using 1G
1561 * pages, this will partially overwrite the PDPEs above.)
1562 */
1563 if (ndm1g) {
1564 pd_p = (pd_entry_t *)DMPDkernphys;
1565 for (i = 0; i < (NPDEPG * nkdmpde); i++)
1566 pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
1567 X86_PG_M | X86_PG_A | pg_nx |
1568 bootaddr_rwx(i << PDRSHIFT);
1569 for (i = 0; i < nkdmpde; i++)
1570 pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW |
1571 X86_PG_V | pg_nx;
1572 }
1573
1574 /* And recursively map PML4 to itself in order to get PTmap */
1575 p4_p = (pml4_entry_t *)KPML4phys;
1576 p4_p[PML4PML4I] = KPML4phys;
1577 p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx;
1578
1579 /* Connect the Direct Map slot(s) up to the PML4. */
1580 for (i = 0; i < ndmpdpphys; i++) {
1581 p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
1582 p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx;
1583 }
1584
1585 /* Connect the KVA slots up to the PML4 */
1586 for (i = 0; i < NKPML4E; i++) {
1587 p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
1588 p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V;
1589 }
1590 }
1591
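/*
 * Worked example (illustrative, not part of the original file): with
 * Maxmem = 16GB, the direct map needs ndmpdp = howmany(16GB, NBPDP = 1GB) =
 * 16 PDP entries, which fit into ndmpdpphys = howmany(16, NPDPEPG = 512) = 1
 * PDP page, and dmaplimit becomes 16GB.  On CPUs with AMDID_PAGE1GB those 16
 * entries are 1GB leaf mappings; otherwise 16 PD pages of 2MB mappings are
 * allocated and used instead.
 */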
1592 /*
1593 * Bootstrap the system enough to run with virtual memory.
1594 *
1595 * On amd64 this is called after mapping has already been enabled
1596 * and just syncs the pmap module with what has already been done.
1597 * [We cannot easily call it with paging disabled, since the kernel is
1598 * not mapped with PA == VA; we would have to relocate every address
1599 * from the linked (virtual) base address "KERNBASE" to the actual
1600 * (physical) address, starting relative to 0.]
1601 */
1602 void
1603 pmap_bootstrap(vm_paddr_t *firstaddr)
1604 {
1605 vm_offset_t va;
1606 pt_entry_t *pte, *pcpu_pte;
1607 uint64_t cr4, pcpu_phys;
1608 u_long res;
1609 int i;
1610
1611 KERNend = *firstaddr;
1612 res = atop(KERNend - (vm_paddr_t)kernphys);
1613
1614 if (!pti)
1615 pg_g = X86_PG_G;
1616
1617 /*
1618 * Create an initial set of page tables to run the kernel in.
1619 */
1620 create_pagetables(firstaddr);
1621
1622 pcpu_phys = allocpages(firstaddr, MAXCPU);
1623
1624 /*
1625 * Add a physical memory segment (vm_phys_seg) corresponding to the
1626 * preallocated kernel page table pages so that vm_page structures
1627 * representing these pages will be created. The vm_page structures
1628 * are required for promotion of the corresponding kernel virtual
1629 * addresses to superpage mappings.
1630 */
1631 vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
1632
1633 /*
1634 * Account for the virtual addresses mapped by create_pagetables().
1635 */
1636 virtual_avail = (vm_offset_t)KERNBASE + round_2mpage(KERNend);
1637 virtual_end = VM_MAX_KERNEL_ADDRESS;
1638
1639 /*
1640 * Enable PG_G global pages, then switch to the kernel page
1641 * table from the bootstrap page table. After the switch, it
1642 * is possible to enable SMEP and SMAP since PG_U bits are
1643 * correct now.
1644 */
1645 cr4 = rcr4();
1646 cr4 |= CR4_PGE;
1647 load_cr4(cr4);
1648 load_cr3(KPML4phys);
1649 if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
1650 cr4 |= CR4_SMEP;
1651 if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
1652 cr4 |= CR4_SMAP;
1653 load_cr4(cr4);
1654
1655 /*
1656 * Initialize the kernel pmap (which is statically allocated).
1657 * Count bootstrap data as being resident in case any of this data is
1658 * later unmapped (using pmap_remove()) and freed.
1659 */
1660 PMAP_LOCK_INIT(kernel_pmap);
1661 kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
1662 kernel_pmap->pm_cr3 = KPML4phys;
1663 kernel_pmap->pm_ucr3 = PMAP_NO_CR3;
1664 CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */
1665 TAILQ_INIT(&kernel_pmap->pm_pvchunk);
1666 kernel_pmap->pm_stats.resident_count = res;
1667 kernel_pmap->pm_flags = pmap_flags;
1668
1669 /*
1670 * Initialize the TLB invalidations generation number lock.
1671 */
1672 mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF);
1673
1674 /*
1675 * Reserve some special page table entries/VA space for temporary
1676 * mapping of pages.
1677 */
1678 #define SYSMAP(c, p, v, n) \
1679 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
1680
1681 va = virtual_avail;
1682 pte = vtopte(va);
1683
1684 /*
1685 * Crashdump maps. The first page is reused as CMAP1 for the
1686 * memory test.
1687 */
1688 SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
1689 CADDR1 = crashdumpmap;
1690
1691 SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU);
1692 virtual_avail = va;
1693
1694 for (i = 0; i < MAXCPU; i++) {
1695 pcpu_pte[i] = (pcpu_phys + ptoa(i)) | X86_PG_V | X86_PG_RW |
1696 pg_g | pg_nx | X86_PG_M | X86_PG_A;
1697 }
1698 STAILQ_INIT(&cpuhead);
1699 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
1700 pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu));
1701 amd64_bsp_pcpu_init1(&__pcpu[0]);
1702 amd64_bsp_ist_init(&__pcpu[0]);
1703 __pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic;
1704 __pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id;
1705
1706 /*
1707 * Initialize the PAT MSR.
1708 * pmap_init_pat() clears and sets CR4_PGE, which, as a
1709 * side-effect, invalidates stale PG_G TLB entries that might
1710 * have been created in our pre-boot environment.
1711 */
1712 pmap_init_pat();
1713
1714 /* Initialize TLB Context Id. */
1715 if (pmap_pcid_enabled) {
1716 for (i = 0; i < MAXCPU; i++) {
1717 kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN;
1718 kernel_pmap->pm_pcids[i].pm_gen = 1;
1719 }
1720
1721 /*
1722 * PMAP_PCID_KERN + 1 is used for initialization of
1723 * the proc0 pmap. The pmap's PCID state might be used by
1724 * EFIRT entry before the first context switch, so it
1725 * needs to be valid.
1726 */
1727 PCPU_SET(pcid_next, PMAP_PCID_KERN + 2);
1728 PCPU_SET(pcid_gen, 1);
1729
1730 /*
1731 * pcpu area for APs is zeroed during AP startup.
1732 * pc_pcid_next and pc_pcid_gen are initialized by AP
1733 * during pcpu setup.
1734 */
1735 load_cr4(rcr4() | CR4_PCIDE);
1736 }
1737 }
1738
1739 /*
1740 * Setup the PAT MSR.
1741 */
1742 void
1743 pmap_init_pat(void)
1744 {
1745 uint64_t pat_msr;
1746 u_long cr0, cr4;
1747 int i;
1748
1749 /* Bail if this CPU doesn't implement PAT. */
1750 if ((cpu_feature & CPUID_PAT) == 0)
1751 panic("no PAT??");
1752
1753 /* Set default PAT index table. */
1754 for (i = 0; i < PAT_INDEX_SIZE; i++)
1755 pat_index[i] = -1;
1756 pat_index[PAT_WRITE_BACK] = 0;
1757 pat_index[PAT_WRITE_THROUGH] = 1;
1758 pat_index[PAT_UNCACHEABLE] = 3;
1759 pat_index[PAT_WRITE_COMBINING] = 6;
1760 pat_index[PAT_WRITE_PROTECTED] = 5;
1761 pat_index[PAT_UNCACHED] = 2;
1762
1763 /*
1764 * Initialize default PAT entries.
1765 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
1766 * Program 5 and 6 as WP and WC.
1767 *
1768 * Leave 4 and 7 as WB and UC. Note that a recursive page table
1769 * mapping for a 2M page uses a PAT value with the bit 3 set due
1770 * to its overload with PG_PS.
1771 */
1772 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
1773 PAT_VALUE(1, PAT_WRITE_THROUGH) |
1774 PAT_VALUE(2, PAT_UNCACHED) |
1775 PAT_VALUE(3, PAT_UNCACHEABLE) |
1776 PAT_VALUE(4, PAT_WRITE_BACK) |
1777 PAT_VALUE(5, PAT_WRITE_PROTECTED) |
1778 PAT_VALUE(6, PAT_WRITE_COMBINING) |
1779 PAT_VALUE(7, PAT_UNCACHEABLE);
1780
1781 /* Disable PGE. */
1782 cr4 = rcr4();
1783 load_cr4(cr4 & ~CR4_PGE);
1784
1785 /* Disable caches (CD = 1, NW = 0). */
1786 cr0 = rcr0();
1787 load_cr0((cr0 & ~CR0_NW) | CR0_CD);
1788
1789 /* Flushes caches and TLBs. */
1790 wbinvd();
1791 invltlb();
1792
1793 /* Update PAT and index table. */
1794 wrmsr(MSR_PAT, pat_msr);
1795
1796 /* Flush caches and TLBs again. */
1797 wbinvd();
1798 invltlb();
1799
1800 /* Restore caches and PGE. */
1801 load_cr0(cr0);
1802 load_cr4(cr4);
1803 }
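/*
 * With the layout programmed above, the effective PAT is:
 *
 *	index	memory type		4KB PTE encoding
 *	  0	WB  write-back		-
 *	  1	WT  write-through	PWT
 *	  2	UC- uncached		PCD
 *	  3	UC  uncacheable		PCD | PWT
 *	  4	WB  (alias of 0)	unused, see pat_index[]
 *	  5	WP  write-protected	PAT | PWT
 *	  6	WC  write-combining	PAT | PCD
 *	  7	UC  (alias of 3)	unused, see pat_index[]
 *
 * The right-hand column is what pmap_cache_bits() emits for a 4KB
 * mapping; 2MB mappings use X86_PG_PDE_PAT in place of the PTE PAT bit.
 */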
1804
1805 /*
1806 * Initialize a vm_page's machine-dependent fields.
1807 */
1808 void
1809 pmap_page_init(vm_page_t m)
1810 {
1811
1812 TAILQ_INIT(&m->md.pv_list);
1813 m->md.pat_mode = PAT_WRITE_BACK;
1814 }
1815
1816 static int pmap_allow_2m_x_ept;
1817 SYSCTL_INT(_vm_pmap, OID_AUTO, allow_2m_x_ept, CTLFLAG_RWTUN | CTLFLAG_NOFETCH,
1818 &pmap_allow_2m_x_ept, 0,
1819 "Allow executable superpage mappings in EPT");
1820
1821 void
1822 pmap_allow_2m_x_ept_recalculate(void)
1823 {
1824 /*
1825 * SKL002, SKL012S. Since the EPT format is only used by
1826 * Intel CPUs, the vendor check is merely a formality.
1827 */
1828 if (!(cpu_vendor_id != CPU_VENDOR_INTEL ||
1829 (cpu_ia32_arch_caps & IA32_ARCH_CAP_IF_PSCHANGE_MC_NO) != 0 ||
1830 (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
1831 (CPUID_TO_MODEL(cpu_id) == 0x26 || /* Atoms */
1832 CPUID_TO_MODEL(cpu_id) == 0x27 ||
1833 CPUID_TO_MODEL(cpu_id) == 0x35 ||
1834 CPUID_TO_MODEL(cpu_id) == 0x36 ||
1835 CPUID_TO_MODEL(cpu_id) == 0x37 ||
1836 CPUID_TO_MODEL(cpu_id) == 0x86 ||
1837 CPUID_TO_MODEL(cpu_id) == 0x1c ||
1838 CPUID_TO_MODEL(cpu_id) == 0x4a ||
1839 CPUID_TO_MODEL(cpu_id) == 0x4c ||
1840 CPUID_TO_MODEL(cpu_id) == 0x4d ||
1841 CPUID_TO_MODEL(cpu_id) == 0x5a ||
1842 CPUID_TO_MODEL(cpu_id) == 0x5c ||
1843 CPUID_TO_MODEL(cpu_id) == 0x5d ||
1844 CPUID_TO_MODEL(cpu_id) == 0x5f ||
1845 CPUID_TO_MODEL(cpu_id) == 0x6e ||
1846 CPUID_TO_MODEL(cpu_id) == 0x7a ||
1847 CPUID_TO_MODEL(cpu_id) == 0x57 || /* Knights */
1848 CPUID_TO_MODEL(cpu_id) == 0x85))))
1849 pmap_allow_2m_x_ept = 1;
1850 TUNABLE_INT_FETCH("hw.allow_2m_x_ept", &pmap_allow_2m_x_ept);
1851 }
1852
1853 static bool
1854 pmap_allow_2m_x_page(pmap_t pmap, bool executable)
1855 {
1856
1857 return (pmap->pm_type != PT_EPT || !executable ||
1858 !pmap_allow_2m_x_ept);
1859 }
1860
1861 /*
1862 * Initialize the pmap module.
1863 * Called by vm_init, to initialize any structures that the pmap
1864 * system needs to map virtual memory.
1865 */
1866 void
1867 pmap_init(void)
1868 {
1869 struct pmap_preinit_mapping *ppim;
1870 vm_page_t m, mpte;
1871 vm_size_t s;
1872 int error, i, pv_npg, ret, skz63;
1873
1874 /* L1TF, reserve page @0 unconditionally */
1875 vm_page_blacklist_add(0, bootverbose);
1876
1877 /* Detect bare-metal Skylake Server and Skylake-X. */
1878 if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL &&
1879 CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) {
1880 /*
1881 * Skylake-X errata SKZ63. Processor May Hang When
1882 * Executing Code In an HLE Transaction Region between
1883 * 40000000H and 403FFFFFH.
1884 *
1885 * Mark the pages in the range as preallocated. It
1886 * seems to be impossible to distinguish between
1887 		 * Skylake Server and Skylake-X.
1888 */
1889 skz63 = 1;
1890 TUNABLE_INT_FETCH("hw.skz63_enable", &skz63);
1891 if (skz63 != 0) {
1892 if (bootverbose)
1893 printf("SKZ63: skipping 4M RAM starting "
1894 "at physical 1G\n");
1895 for (i = 0; i < atop(0x400000); i++) {
1896 ret = vm_page_blacklist_add(0x40000000 +
1897 ptoa(i), FALSE);
1898 if (!ret && bootverbose)
1899 printf("page at %#lx already used\n",
1900 0x40000000 + ptoa(i));
1901 }
1902 }
1903 }
1904
1905 /* IFU */
1906 pmap_allow_2m_x_ept_recalculate();
1907
1908 /*
1909 * Initialize the vm page array entries for the kernel pmap's
1910 * page table pages.
1911 */
1912 PMAP_LOCK(kernel_pmap);
1913 for (i = 0; i < nkpt; i++) {
1914 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
1915 KASSERT(mpte >= vm_page_array &&
1916 mpte < &vm_page_array[vm_page_array_size],
1917 ("pmap_init: page table page is out of range"));
1918 mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
1919 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
1920 mpte->wire_count = 1;
1921
1922 /*
1923 * Collect the page table pages that were replaced by a 2MB
1924 * page in create_pagetables(). They are zero filled.
1925 */
1926 if (i << PDRSHIFT < KERNend &&
1927 pmap_insert_pt_page(kernel_pmap, mpte, false))
1928 panic("pmap_init: pmap_insert_pt_page failed");
1929 }
1930 PMAP_UNLOCK(kernel_pmap);
1931 vm_wire_add(nkpt);
1932
1933 /*
1934 * If the kernel is running on a virtual machine, then it must assume
1935 * that MCA is enabled by the hypervisor. Moreover, the kernel must
1936 * be prepared for the hypervisor changing the vendor and family that
1937 * are reported by CPUID. Consequently, the workaround for AMD Family
1938 * 10h Erratum 383 is enabled if the processor's feature set does not
1939 * include at least one feature that is only supported by older Intel
1940 * or newer AMD processors.
1941 */
1942 if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
1943 (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
1944 CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
1945 AMDID2_FMA4)) == 0)
1946 workaround_erratum383 = 1;
1947
1948 /*
1949 * Are large page mappings enabled?
1950 */
1951 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
1952 if (pg_ps_enabled) {
1953 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1954 ("pmap_init: can't assign to pagesizes[1]"));
1955 pagesizes[1] = NBPDR;
1956 }
1957
1958 /*
1959 * Initialize the pv chunk list mutex.
1960 */
1961 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
1962
1963 /*
1964 * Initialize the pool of pv list locks.
1965 */
1966 for (i = 0; i < NPV_LIST_LOCKS; i++)
1967 rw_init(&pv_list_locks[i], "pmap pv list");
1968
1969 /*
1970 * Calculate the size of the pv head table for superpages.
1971 */
1972 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
1973
1974 /*
1975 * Allocate memory for the pv head table for superpages.
1976 */
1977 s = (vm_size_t)(pv_npg * sizeof(struct md_page));
1978 s = round_page(s);
1979 pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
1980 for (i = 0; i < pv_npg; i++)
1981 TAILQ_INIT(&pv_table[i].pv_list);
1982 TAILQ_INIT(&pv_dummy.pv_list);
1983
1984 pmap_initialized = 1;
1985 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
1986 ppim = pmap_preinit_mapping + i;
1987 if (ppim->va == 0)
1988 continue;
1989 /* Make the direct map consistent */
1990 if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) {
1991 (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa),
1992 ppim->sz, ppim->mode);
1993 }
1994 if (!bootverbose)
1995 continue;
1996 printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i,
1997 ppim->pa, ppim->va, ppim->sz, ppim->mode);
1998 }
1999
2000 mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
2001 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
2002 (vmem_addr_t *)&qframe);
2003 if (error != 0)
2004 panic("qframe allocation failed");
2005
2006 lm_ents = 8;
2007 TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents);
2008 if (lm_ents > LMEPML4I - LMSPML4I + 1)
2009 lm_ents = LMEPML4I - LMSPML4I + 1;
2010 if (bootverbose)
2011 		printf("pmap: large map %u PML4 slots (%lu GB)\n",
2012 lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024));
2013 if (lm_ents != 0) {
2014 large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS,
2015 (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK);
2016 if (large_vmem == NULL) {
2017 printf("pmap: cannot create large map\n");
2018 lm_ents = 0;
2019 }
2020 for (i = 0; i < lm_ents; i++) {
2021 m = pmap_large_map_getptp_unlocked();
2022 kernel_pmap->pm_pml4[LMSPML4I + i] = X86_PG_V |
2023 X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx |
2024 VM_PAGE_TO_PHYS(m);
2025 }
2026 }
2027 }
2028
2029 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
2030 "2MB page mapping counters");
2031
2032 static u_long pmap_pde_demotions;
2033 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
2034 &pmap_pde_demotions, 0, "2MB page demotions");
2035
2036 static u_long pmap_pde_mappings;
2037 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
2038 &pmap_pde_mappings, 0, "2MB page mappings");
2039
2040 static u_long pmap_pde_p_failures;
2041 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
2042 &pmap_pde_p_failures, 0, "2MB page promotion failures");
2043
2044 static u_long pmap_pde_promotions;
2045 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
2046 &pmap_pde_promotions, 0, "2MB page promotions");
2047
2048 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
2049 "1GB page mapping counters");
2050
2051 static u_long pmap_pdpe_demotions;
2052 SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
2053 &pmap_pdpe_demotions, 0, "1GB page demotions");
2054
2055 /***************************************************
2056 * Low level helper routines.....
2057 ***************************************************/
2058
2059 static pt_entry_t
2060 pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
2061 {
2062 int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
2063
2064 switch (pmap->pm_type) {
2065 case PT_X86:
2066 case PT_RVI:
2067 /* Verify that both PAT bits are not set at the same time */
2068 KASSERT((entry & x86_pat_bits) != x86_pat_bits,
2069 ("Invalid PAT bits in entry %#lx", entry));
2070
2071 /* Swap the PAT bits if one of them is set */
2072 if ((entry & x86_pat_bits) != 0)
2073 entry ^= x86_pat_bits;
2074 break;
2075 case PT_EPT:
2076 /*
2077 * Nothing to do - the memory attributes are represented
2078 * the same way for regular pages and superpages.
2079 */
2080 break;
2081 default:
2082 		panic("pmap_swap_pat: bad pm_type %d", pmap->pm_type);
2083 }
2084
2085 return (entry);
2086 }
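/*
 * For example, a 4KB write-combining PTE encodes its PAT index with
 * X86_PG_PTE_PAT (bit 7) and PG_NC_PCD set.  The equivalent 2MB PDE must
 * use X86_PG_PDE_PAT (bit 12) instead, because bit 7 doubles as PG_PS
 * there.  Toggling both PAT bit positions when exactly one of them is
 * set, as done above, converts the encoding in either direction.
 */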
2087
2088 boolean_t
2089 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
2090 {
2091
2092 return (mode >= 0 && mode < PAT_INDEX_SIZE &&
2093 pat_index[(int)mode] >= 0);
2094 }
2095
2096 /*
2097 * Determine the appropriate bits to set in a PTE or PDE for a specified
2098 * caching mode.
2099 */
2100 int
2101 pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
2102 {
2103 int cache_bits, pat_flag, pat_idx;
2104
2105 if (!pmap_is_valid_memattr(pmap, mode))
2106 panic("Unknown caching mode %d\n", mode);
2107
2108 switch (pmap->pm_type) {
2109 case PT_X86:
2110 case PT_RVI:
2111 		/* The PAT bit is different for PTEs and PDEs. */
2112 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
2113
2114 /* Map the caching mode to a PAT index. */
2115 pat_idx = pat_index[mode];
2116
2117 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
2118 cache_bits = 0;
2119 if (pat_idx & 0x4)
2120 cache_bits |= pat_flag;
2121 if (pat_idx & 0x2)
2122 cache_bits |= PG_NC_PCD;
2123 if (pat_idx & 0x1)
2124 cache_bits |= PG_NC_PWT;
2125 break;
2126
2127 case PT_EPT:
2128 cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
2129 break;
2130
2131 default:
2132 panic("unsupported pmap type %d", pmap->pm_type);
2133 }
2134
2135 return (cache_bits);
2136 }
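/*
 * A worked example: for mode == PAT_WRITE_COMBINING, the table built in
 * pmap_init_pat() yields pat_idx == 6 (binary 110), so a 4KB PTE gets
 * X86_PG_PTE_PAT | PG_NC_PCD and a 2MB PDE gets X86_PG_PDE_PAT |
 * PG_NC_PCD, with PG_NC_PWT clear in both cases.
 */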
2137
2138 static int
2139 pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
2140 {
2141 int mask;
2142
2143 switch (pmap->pm_type) {
2144 case PT_X86:
2145 case PT_RVI:
2146 mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
2147 break;
2148 case PT_EPT:
2149 mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
2150 break;
2151 default:
2152 panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
2153 }
2154
2155 return (mask);
2156 }
2157
2158 static int
2159 pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde)
2160 {
2161 int pat_flag, pat_idx;
2162
2163 pat_idx = 0;
2164 switch (pmap->pm_type) {
2165 case PT_X86:
2166 case PT_RVI:
2167 		/* The PAT bit is different for PTEs and PDEs. */
2168 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
2169
2170 if ((pte & pat_flag) != 0)
2171 pat_idx |= 0x4;
2172 if ((pte & PG_NC_PCD) != 0)
2173 pat_idx |= 0x2;
2174 if ((pte & PG_NC_PWT) != 0)
2175 pat_idx |= 0x1;
2176 break;
2177 case PT_EPT:
2178 if ((pte & EPT_PG_IGNORE_PAT) != 0)
2179 panic("EPT PTE %#lx has no PAT memory type", pte);
2180 pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3;
2181 break;
2182 }
2183
2184 /* See pmap_init_pat(). */
2185 if (pat_idx == 4)
2186 pat_idx = 0;
2187 if (pat_idx == 7)
2188 pat_idx = 3;
2189
2190 return (pat_idx);
2191 }
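/*
 * This is the inverse of pmap_cache_bits(): a PTE carrying
 * X86_PG_PTE_PAT | PG_NC_PCD maps back to pat_idx 6, i.e.
 * PAT_WRITE_COMBINING.  Indices 4 and 7 are folded down to 0 and 3
 * because the corresponding PAT entries merely alias WB and UC.
 */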
2192
2193 bool
2194 pmap_ps_enabled(pmap_t pmap)
2195 {
2196
2197 return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
2198 }
2199
2200 static void
2201 pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
2202 {
2203
2204 switch (pmap->pm_type) {
2205 case PT_X86:
2206 break;
2207 case PT_RVI:
2208 case PT_EPT:
2209 /*
2210 * XXX
2211 * This is a little bogus since the generation number is
2212 * supposed to be bumped up when a region of the address
2213 * space is invalidated in the page tables.
2214 *
2215 * In this case the old PDE entry is valid but yet we want
2216 * to make sure that any mappings using the old entry are
2217 * invalidated in the TLB.
2218 *
2219 * The reason this works as expected is because we rendezvous
2220 * "all" host cpus and force any vcpu context to exit as a
2221 * side-effect.
2222 */
2223 atomic_add_acq_long(&pmap->pm_eptgen, 1);
2224 break;
2225 default:
2226 panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
2227 }
2228 pde_store(pde, newpde);
2229 }
2230
2231 /*
2232 * After changing the page size for the specified virtual address in the page
2233 * table, flush the corresponding entries from the processor's TLB. Only the
2234 * calling processor's TLB is affected.
2235 *
2236 * The calling thread must be pinned to a processor.
2237 */
2238 static void
2239 pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
2240 {
2241 pt_entry_t PG_G;
2242
2243 if (pmap_type_guest(pmap))
2244 return;
2245
2246 KASSERT(pmap->pm_type == PT_X86,
2247 ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
2248
2249 PG_G = pmap_global_bit(pmap);
2250
2251 if ((newpde & PG_PS) == 0)
2252 /* Demotion: flush a specific 2MB page mapping. */
2253 invlpg(va);
2254 else if ((newpde & PG_G) == 0)
2255 /*
2256 * Promotion: flush every 4KB page mapping from the TLB
2257 * because there are too many to flush individually.
2258 */
2259 invltlb();
2260 else {
2261 /*
2262 * Promotion: flush every 4KB page mapping from the TLB,
2263 * including any global (PG_G) mappings.
2264 */
2265 invltlb_glob();
2266 }
2267 }
2268 #ifdef SMP
2269
2270 /*
2271 * For SMP, these functions have to use the IPI mechanism for coherence.
2272 *
2273 * N.B.: Before calling any of the following TLB invalidation functions,
2274 * the calling processor must ensure that all stores updating a non-
2275 * kernel page table are globally performed. Otherwise, another
2276 * processor could cache an old, pre-update entry without being
2277 * invalidated. This can happen one of two ways: (1) The pmap becomes
2278 * active on another processor after its pm_active field is checked by
2279 * one of the following functions but before a store updating the page
2280 * table is globally performed. (2) The pmap becomes active on another
2281 * processor before its pm_active field is checked but due to
2282  * speculative loads, one of the following functions still reads the
2283 * pmap as inactive on the other processor.
2284 *
2285 * The kernel page table is exempt because its pm_active field is
2286 * immutable. The kernel page table is always active on every
2287 * processor.
2288 */
2289
2290 /*
2291 * Interrupt the cpus that are executing in the guest context.
2292 * This will force the vcpu to exit and the cached EPT mappings
2293 * will be invalidated by the host before the next vmresume.
2294 */
2295 static __inline void
2296 pmap_invalidate_ept(pmap_t pmap)
2297 {
2298 int ipinum;
2299
2300 sched_pin();
2301 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
2302 ("pmap_invalidate_ept: absurd pm_active"));
2303
2304 /*
2305 * The TLB mappings associated with a vcpu context are not
2306 * flushed each time a different vcpu is chosen to execute.
2307 *
2308 * This is in contrast with a process's vtop mappings that
2309 * are flushed from the TLB on each context switch.
2310 *
2311 * Therefore we need to do more than just a TLB shootdown on
2312 * the active cpus in 'pmap->pm_active'. To do this we keep
2313 * track of the number of invalidations performed on this pmap.
2314 *
2315 * Each vcpu keeps a cache of this counter and compares it
2316 * just before a vmresume. If the counter is out-of-date an
2317 * invept will be done to flush stale mappings from the TLB.
2318 */
2319 atomic_add_acq_long(&pmap->pm_eptgen, 1);
2320
2321 /*
2322 * Force the vcpu to exit and trap back into the hypervisor.
2323 */
2324 ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
2325 ipi_selected(pmap->pm_active, ipinum);
2326 sched_unpin();
2327 }
2328
2329 static cpuset_t
2330 pmap_invalidate_cpu_mask(pmap_t pmap)
2331 {
2332 return (pmap == kernel_pmap ? all_cpus : pmap->pm_active);
2333 }
2334
2335 static inline void
2336 pmap_invalidate_preipi_pcid(pmap_t pmap)
2337 {
2338 u_int cpuid, i;
2339
2340 sched_pin();
2341
2342 cpuid = PCPU_GET(cpuid);
2343 if (pmap != PCPU_GET(curpmap))
2344 cpuid = 0xffffffff; /* An impossible value */
2345
2346 CPU_FOREACH(i) {
2347 if (cpuid != i)
2348 pmap->pm_pcids[i].pm_gen = 0;
2349 }
2350
2351 /*
2352 * The fence is between stores to pm_gen and the read of the
2353 * pm_active mask. We need to ensure that it is impossible
2354 * for us to miss the bit update in pm_active and
2355 * simultaneously observe a non-zero pm_gen in
2356 * pmap_activate_sw(), otherwise TLB update is missed.
2357 * Without the fence, IA32 allows such an outcome. Note that
2358 * pm_active is updated by a locked operation, which provides
2359 * the reciprocal fence.
2360 */
2361 atomic_thread_fence_seq_cst();
2362 }
2363
2364 static void
2365 pmap_invalidate_preipi_nopcid(pmap_t pmap __unused)
2366 {
2367 sched_pin();
2368 }
2369
2370 DEFINE_IFUNC(static, void, pmap_invalidate_preipi, (pmap_t), static)
2371 {
2372 return (pmap_pcid_enabled ? pmap_invalidate_preipi_pcid :
2373 pmap_invalidate_preipi_nopcid);
2374 }
2375
2376 static inline void
2377 pmap_invalidate_page_pcid_cb(pmap_t pmap, vm_offset_t va,
2378 const bool invpcid_works1)
2379 {
2380 struct invpcid_descr d;
2381 uint64_t kcr3, ucr3;
2382 uint32_t pcid;
2383 u_int cpuid;
2384
2385 /*
2386 * Because pm_pcid is recalculated on a context switch, we
2387 * must ensure there is no preemption, not just pinning.
2388 * Otherwise, we might use a stale value below.
2389 */
2390 CRITICAL_ASSERT(curthread);
2391
2392 /*
2393 * No need to do anything with user page tables invalidation
2394 * if there is no user page table.
2395 */
2396 if (pmap->pm_ucr3 == PMAP_NO_CR3)
2397 return;
2398
2399 cpuid = PCPU_GET(cpuid);
2400
2401 pcid = pmap->pm_pcids[cpuid].pm_pcid;
2402 if (invpcid_works1) {
2403 d.pcid = pcid | PMAP_PCID_USER_PT;
2404 d.pad = 0;
2405 d.addr = va;
2406 invpcid(&d, INVPCID_ADDR);
2407 } else {
2408 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
2409 ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
2410 pmap_pti_pcid_invlpg(ucr3, kcr3, va);
2411 }
2412 }
2413
2414 static void
2415 pmap_invalidate_page_pcid_invpcid_cb(pmap_t pmap, vm_offset_t va)
2416 {
2417 pmap_invalidate_page_pcid_cb(pmap, va, true);
2418 }
2419
2420 static void
2421 pmap_invalidate_page_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t va)
2422 {
2423 pmap_invalidate_page_pcid_cb(pmap, va, false);
2424 }
2425
2426 static void
2427 pmap_invalidate_page_nopcid_cb(pmap_t pmap __unused, vm_offset_t va __unused)
2428 {
2429 }
2430
2431 DEFINE_IFUNC(static, void, pmap_invalidate_page_cb, (pmap_t, vm_offset_t),
2432 static)
2433 {
2434 if (pmap_pcid_enabled)
2435 return (invpcid_works ? pmap_invalidate_page_pcid_invpcid_cb :
2436 pmap_invalidate_page_pcid_noinvpcid_cb);
2437 return (pmap_invalidate_page_nopcid_cb);
2438 }
2439
2440 static void
2441 pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va,
2442 vm_offset_t addr2 __unused)
2443 {
2444 if (pmap == kernel_pmap) {
2445 invlpg(va);
2446 } else if (pmap == PCPU_GET(curpmap)) {
2447 invlpg(va);
2448 pmap_invalidate_page_cb(pmap, va);
2449 }
2450 }
2451
2452 void
2453 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
2454 {
2455 if (pmap_type_guest(pmap)) {
2456 pmap_invalidate_ept(pmap);
2457 return;
2458 }
2459
2460 KASSERT(pmap->pm_type == PT_X86,
2461 ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
2462
2463 pmap_invalidate_preipi(pmap);
2464 smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap,
2465 pmap_invalidate_page_curcpu_cb);
2466 }
2467
2468 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
2469 #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE)
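/*
 * With 4KB pages the threshold above works out to 16MB of virtual
 * address space; pmap_invalidate_range() falls back to a full TLB
 * flush for any larger range rather than issuing thousands of INVLPG
 * instructions.
 */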
2470
2471 static void
2472 pmap_invalidate_range_pcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
2473 const bool invpcid_works1)
2474 {
2475 struct invpcid_descr d;
2476 uint64_t kcr3, ucr3;
2477 uint32_t pcid;
2478 u_int cpuid;
2479
2480 CRITICAL_ASSERT(curthread);
2481
2482 if (pmap != PCPU_GET(curpmap) ||
2483 pmap->pm_ucr3 == PMAP_NO_CR3)
2484 return;
2485
2486 cpuid = PCPU_GET(cpuid);
2487
2488 pcid = pmap->pm_pcids[cpuid].pm_pcid;
2489 if (invpcid_works1) {
2490 d.pcid = pcid | PMAP_PCID_USER_PT;
2491 d.pad = 0;
2492 for (d.addr = sva; d.addr < eva; d.addr += PAGE_SIZE)
2493 invpcid(&d, INVPCID_ADDR);
2494 } else {
2495 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
2496 ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
2497 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
2498 }
2499 }
2500
2501 static void
2502 pmap_invalidate_range_pcid_invpcid_cb(pmap_t pmap, vm_offset_t sva,
2503 vm_offset_t eva)
2504 {
2505 pmap_invalidate_range_pcid_cb(pmap, sva, eva, true);
2506 }
2507
2508 static void
2509 pmap_invalidate_range_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t sva,
2510 vm_offset_t eva)
2511 {
2512 pmap_invalidate_range_pcid_cb(pmap, sva, eva, false);
2513 }
2514
2515 static void
2516 pmap_invalidate_range_nopcid_cb(pmap_t pmap __unused, vm_offset_t sva __unused,
2517 vm_offset_t eva __unused)
2518 {
2519 }
2520
2521 DEFINE_IFUNC(static, void, pmap_invalidate_range_cb, (pmap_t, vm_offset_t,
2522 vm_offset_t), static)
2523 {
2524 if (pmap_pcid_enabled)
2525 return (invpcid_works ? pmap_invalidate_range_pcid_invpcid_cb :
2526 pmap_invalidate_range_pcid_noinvpcid_cb);
2527 return (pmap_invalidate_range_nopcid_cb);
2528 }
2529
2530 static void
2531 pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2532 {
2533 vm_offset_t addr;
2534
2535 if (pmap == kernel_pmap) {
2536 for (addr = sva; addr < eva; addr += PAGE_SIZE)
2537 invlpg(addr);
2538 } else if (pmap == PCPU_GET(curpmap)) {
2539 for (addr = sva; addr < eva; addr += PAGE_SIZE)
2540 invlpg(addr);
2541 pmap_invalidate_range_cb(pmap, sva, eva);
2542 }
2543 }
2544
2545 void
2546 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2547 {
2548 if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
2549 pmap_invalidate_all(pmap);
2550 return;
2551 }
2552
2553 if (pmap_type_guest(pmap)) {
2554 pmap_invalidate_ept(pmap);
2555 return;
2556 }
2557
2558 KASSERT(pmap->pm_type == PT_X86,
2559 ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
2560
2561 pmap_invalidate_preipi(pmap);
2562 smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap,
2563 pmap_invalidate_range_curcpu_cb);
2564 }
2565
2566 static inline void
2567 pmap_invalidate_all_pcid_cb(pmap_t pmap, bool invpcid_works1)
2568 {
2569 struct invpcid_descr d;
2570 uint64_t kcr3, ucr3;
2571 uint32_t pcid;
2572 u_int cpuid;
2573
2574 if (pmap == kernel_pmap) {
2575 if (invpcid_works1) {
2576 bzero(&d, sizeof(d));
2577 invpcid(&d, INVPCID_CTXGLOB);
2578 } else {
2579 invltlb_glob();
2580 }
2581 } else if (pmap == PCPU_GET(curpmap)) {
2582 CRITICAL_ASSERT(curthread);
2583 cpuid = PCPU_GET(cpuid);
2584
2585 pcid = pmap->pm_pcids[cpuid].pm_pcid;
2586 if (invpcid_works1) {
2587 d.pcid = pcid;
2588 d.pad = 0;
2589 d.addr = 0;
2590 invpcid(&d, INVPCID_CTX);
2591 if (pmap->pm_ucr3 != PMAP_NO_CR3) {
2592 d.pcid |= PMAP_PCID_USER_PT;
2593 invpcid(&d, INVPCID_CTX);
2594 }
2595 } else {
2596 kcr3 = pmap->pm_cr3 | pcid;
2597 ucr3 = pmap->pm_ucr3;
2598 if (ucr3 != PMAP_NO_CR3) {
2599 ucr3 |= pcid | PMAP_PCID_USER_PT;
2600 pmap_pti_pcid_invalidate(ucr3, kcr3);
2601 } else {
2602 load_cr3(kcr3);
2603 }
2604 }
2605 }
2606 }
2607
2608 static void
2609 pmap_invalidate_all_pcid_invpcid_cb(pmap_t pmap)
2610 {
2611 pmap_invalidate_all_pcid_cb(pmap, true);
2612 }
2613
2614 static void
2615 pmap_invalidate_all_pcid_noinvpcid_cb(pmap_t pmap)
2616 {
2617 pmap_invalidate_all_pcid_cb(pmap, false);
2618 }
2619
2620 static void
2621 pmap_invalidate_all_nopcid_cb(pmap_t pmap)
2622 {
2623 if (pmap == kernel_pmap)
2624 invltlb_glob();
2625 else if (pmap == PCPU_GET(curpmap))
2626 invltlb();
2627 }
2628
2629 DEFINE_IFUNC(static, void, pmap_invalidate_all_cb, (pmap_t), static)
2630 {
2631 if (pmap_pcid_enabled)
2632 return (invpcid_works ? pmap_invalidate_all_pcid_invpcid_cb :
2633 pmap_invalidate_all_pcid_noinvpcid_cb);
2634 return (pmap_invalidate_all_nopcid_cb);
2635 }
2636
2637 static void
2638 pmap_invalidate_all_curcpu_cb(pmap_t pmap, vm_offset_t addr1 __unused,
2639 vm_offset_t addr2 __unused)
2640 {
2641 pmap_invalidate_all_cb(pmap);
2642 }
2643
2644 void
2645 pmap_invalidate_all(pmap_t pmap)
2646 {
2647 if (pmap_type_guest(pmap)) {
2648 pmap_invalidate_ept(pmap);
2649 return;
2650 }
2651
2652 KASSERT(pmap->pm_type == PT_X86,
2653 ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
2654
2655 pmap_invalidate_preipi(pmap);
2656 smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap,
2657 pmap_invalidate_all_curcpu_cb);
2658 }
2659
2660 static void
2661 pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, vm_offset_t va __unused,
2662 vm_offset_t addr2 __unused)
2663 {
2664 wbinvd();
2665 }
2666
2667 void
2668 pmap_invalidate_cache(void)
2669 {
2670 sched_pin();
2671 smp_cache_flush(pmap_invalidate_cache_curcpu_cb);
2672 }
2673
2674 struct pde_action {
2675 cpuset_t invalidate; /* processors that invalidate their TLB */
2676 pmap_t pmap;
2677 vm_offset_t va;
2678 pd_entry_t *pde;
2679 pd_entry_t newpde;
2680 u_int store; /* processor that updates the PDE */
2681 };
2682
2683 static void
2684 pmap_update_pde_action(void *arg)
2685 {
2686 struct pde_action *act = arg;
2687
2688 if (act->store == PCPU_GET(cpuid))
2689 pmap_update_pde_store(act->pmap, act->pde, act->newpde);
2690 }
2691
2692 static void
2693 pmap_update_pde_teardown(void *arg)
2694 {
2695 struct pde_action *act = arg;
2696
2697 if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
2698 pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
2699 }
2700
2701 /*
2702 * Change the page size for the specified virtual address in a way that
2703 * prevents any possibility of the TLB ever having two entries that map the
2704 * same virtual address using different page sizes. This is the recommended
2705 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a
2706 * machine check exception for a TLB state that is improperly diagnosed as a
2707 * hardware error.
2708 */
2709 static void
2710 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
2711 {
2712 struct pde_action act;
2713 cpuset_t active, other_cpus;
2714 u_int cpuid;
2715
2716 sched_pin();
2717 cpuid = PCPU_GET(cpuid);
2718 other_cpus = all_cpus;
2719 CPU_CLR(cpuid, &other_cpus);
2720 if (pmap == kernel_pmap || pmap_type_guest(pmap))
2721 active = all_cpus;
2722 else {
2723 active = pmap->pm_active;
2724 }
2725 if (CPU_OVERLAP(&active, &other_cpus)) {
2726 act.store = cpuid;
2727 act.invalidate = active;
2728 act.va = va;
2729 act.pmap = pmap;
2730 act.pde = pde;
2731 act.newpde = newpde;
2732 CPU_SET(cpuid, &active);
2733 smp_rendezvous_cpus(active,
2734 smp_no_rendezvous_barrier, pmap_update_pde_action,
2735 pmap_update_pde_teardown, &act);
2736 } else {
2737 pmap_update_pde_store(pmap, pde, newpde);
2738 if (CPU_ISSET(cpuid, &active))
2739 pmap_update_pde_invalidate(pmap, va, newpde);
2740 }
2741 sched_unpin();
2742 }
2743 #else /* !SMP */
2744 /*
2745 * Normal, non-SMP, invalidation functions.
2746 */
2747 void
2748 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
2749 {
2750 struct invpcid_descr d;
2751 uint64_t kcr3, ucr3;
2752 uint32_t pcid;
2753
2754 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
2755 pmap->pm_eptgen++;
2756 return;
2757 }
2758 KASSERT(pmap->pm_type == PT_X86,
2759 	    ("pmap_invalidate_page: unknown type %d", pmap->pm_type));
2760
2761 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
2762 invlpg(va);
2763 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
2764 pmap->pm_ucr3 != PMAP_NO_CR3) {
2765 critical_enter();
2766 pcid = pmap->pm_pcids[0].pm_pcid;
2767 if (invpcid_works) {
2768 d.pcid = pcid | PMAP_PCID_USER_PT;
2769 d.pad = 0;
2770 d.addr = va;
2771 invpcid(&d, INVPCID_ADDR);
2772 } else {
2773 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
2774 ucr3 = pmap->pm_ucr3 | pcid |
2775 PMAP_PCID_USER_PT | CR3_PCID_SAVE;
2776 pmap_pti_pcid_invlpg(ucr3, kcr3, va);
2777 }
2778 critical_exit();
2779 }
2780 } else if (pmap_pcid_enabled)
2781 pmap->pm_pcids[0].pm_gen = 0;
2782 }
2783
2784 void
2785 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2786 {
2787 struct invpcid_descr d;
2788 vm_offset_t addr;
2789 uint64_t kcr3, ucr3;
2790
2791 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
2792 pmap->pm_eptgen++;
2793 return;
2794 }
2795 KASSERT(pmap->pm_type == PT_X86,
2796 ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
2797
2798 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
2799 for (addr = sva; addr < eva; addr += PAGE_SIZE)
2800 invlpg(addr);
2801 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
2802 pmap->pm_ucr3 != PMAP_NO_CR3) {
2803 critical_enter();
2804 if (invpcid_works) {
2805 d.pcid = pmap->pm_pcids[0].pm_pcid |
2806 PMAP_PCID_USER_PT;
2807 d.pad = 0;
2808 d.addr = sva;
2809 for (; d.addr < eva; d.addr += PAGE_SIZE)
2810 invpcid(&d, INVPCID_ADDR);
2811 } else {
2812 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].
2813 pm_pcid | CR3_PCID_SAVE;
2814 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0].
2815 pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
2816 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
2817 }
2818 critical_exit();
2819 }
2820 } else if (pmap_pcid_enabled) {
2821 pmap->pm_pcids[0].pm_gen = 0;
2822 }
2823 }
2824
2825 void
2826 pmap_invalidate_all(pmap_t pmap)
2827 {
2828 struct invpcid_descr d;
2829 uint64_t kcr3, ucr3;
2830
2831 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
2832 pmap->pm_eptgen++;
2833 return;
2834 }
2835 KASSERT(pmap->pm_type == PT_X86,
2836 ("pmap_invalidate_all: unknown type %d", pmap->pm_type));
2837
2838 if (pmap == kernel_pmap) {
2839 if (pmap_pcid_enabled && invpcid_works) {
2840 bzero(&d, sizeof(d));
2841 invpcid(&d, INVPCID_CTXGLOB);
2842 } else {
2843 invltlb_glob();
2844 }
2845 } else if (pmap == PCPU_GET(curpmap)) {
2846 if (pmap_pcid_enabled) {
2847 critical_enter();
2848 if (invpcid_works) {
2849 d.pcid = pmap->pm_pcids[0].pm_pcid;
2850 d.pad = 0;
2851 d.addr = 0;
2852 invpcid(&d, INVPCID_CTX);
2853 if (pmap->pm_ucr3 != PMAP_NO_CR3) {
2854 d.pcid |= PMAP_PCID_USER_PT;
2855 invpcid(&d, INVPCID_CTX);
2856 }
2857 } else {
2858 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid;
2859 if (pmap->pm_ucr3 != PMAP_NO_CR3) {
2860 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[
2861 0].pm_pcid | PMAP_PCID_USER_PT;
2862 pmap_pti_pcid_invalidate(ucr3, kcr3);
2863 } else
2864 load_cr3(kcr3);
2865 }
2866 critical_exit();
2867 } else {
2868 invltlb();
2869 }
2870 } else if (pmap_pcid_enabled) {
2871 pmap->pm_pcids[0].pm_gen = 0;
2872 }
2873 }
2874
2875 PMAP_INLINE void
2876 pmap_invalidate_cache(void)
2877 {
2878
2879 wbinvd();
2880 }
2881
2882 static void
2883 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
2884 {
2885
2886 pmap_update_pde_store(pmap, pde, newpde);
2887 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
2888 pmap_update_pde_invalidate(pmap, va, newpde);
2889 else
2890 pmap->pm_pcids[0].pm_gen = 0;
2891 }
2892 #endif /* !SMP */
2893
2894 static void
2895 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
2896 {
2897
2898 /*
2899 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created
2900 * by a promotion that did not invalidate the 512 4KB page mappings
2901 * that might exist in the TLB. Consequently, at this point, the TLB
2902 * may hold both 4KB and 2MB page mappings for the address range [va,
2903 * va + NBPDR). Therefore, the entire range must be invalidated here.
2904 * In contrast, when PG_PROMOTED is clear, the TLB will not hold any
2905 * 4KB page mappings for the address range [va, va + NBPDR), and so a
2906 * single INVLPG suffices to invalidate the 2MB page mapping from the
2907 * TLB.
2908 */
2909 if ((pde & PG_PROMOTED) != 0)
2910 pmap_invalidate_range(pmap, va, va + NBPDR - 1);
2911 else
2912 pmap_invalidate_page(pmap, va);
2913 }
2914
2915 DEFINE_IFUNC(, void, pmap_invalidate_cache_range,
2916 (vm_offset_t sva, vm_offset_t eva), static)
2917 {
2918
2919 if ((cpu_feature & CPUID_SS) != 0)
2920 return (pmap_invalidate_cache_range_selfsnoop);
2921 if ((cpu_feature & CPUID_CLFSH) != 0)
2922 return (pmap_force_invalidate_cache_range);
2923 return (pmap_invalidate_cache_range_all);
2924 }
2925
2926 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024)
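/*
 * The threshold above is 2MB, i.e. 512 base pages; at or beyond that
 * size pmap_invalidate_cache_pages() gives up on per-line flushes and
 * performs a full cache flush instead.
 */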
2927
2928 static void
2929 pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva)
2930 {
2931
2932 KASSERT((sva & PAGE_MASK) == 0,
2933 ("pmap_invalidate_cache_range: sva not page-aligned"));
2934 KASSERT((eva & PAGE_MASK) == 0,
2935 ("pmap_invalidate_cache_range: eva not page-aligned"));
2936 }
2937
2938 static void
2939 pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva)
2940 {
2941
2942 pmap_invalidate_cache_range_check_align(sva, eva);
2943 }
2944
2945 void
2946 pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
2947 {
2948
2949 sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);
2950
2951 /*
2952 * XXX: Some CPUs fault, hang, or trash the local APIC
2953 * registers if we use CLFLUSH on the local APIC range. The
2954 * local APIC is always uncached, so we don't need to flush
2955 * for that range anyway.
2956 */
2957 if (pmap_kextract(sva) == lapic_paddr)
2958 return;
2959
2960 if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) {
2961 /*
2962 * Do per-cache line flush. Use a locked
2963 		 * instruction to ensure that previous stores are
2964 * included in the write-back. The processor
2965 * propagates flush to other processors in the cache
2966 * coherence domain.
2967 */
2968 atomic_thread_fence_seq_cst();
2969 for (; sva < eva; sva += cpu_clflush_line_size)
2970 clflushopt(sva);
2971 atomic_thread_fence_seq_cst();
2972 } else {
2973 /*
2974 * Writes are ordered by CLFLUSH on Intel CPUs.
2975 */
2976 if (cpu_vendor_id != CPU_VENDOR_INTEL)
2977 mfence();
2978 for (; sva < eva; sva += cpu_clflush_line_size)
2979 clflush(sva);
2980 if (cpu_vendor_id != CPU_VENDOR_INTEL)
2981 mfence();
2982 }
2983 }
2984
2985 static void
2986 pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva)
2987 {
2988
2989 pmap_invalidate_cache_range_check_align(sva, eva);
2990 pmap_invalidate_cache();
2991 }
2992
2993 /*
2994 * Remove the specified set of pages from the data and instruction caches.
2995 *
2996 * In contrast to pmap_invalidate_cache_range(), this function does not
2997 * rely on the CPU's self-snoop feature, because it is intended for use
2998 * when moving pages into a different cache domain.
2999 */
3000 void
3001 pmap_invalidate_cache_pages(vm_page_t *pages, int count)
3002 {
3003 vm_offset_t daddr, eva;
3004 int i;
3005 bool useclflushopt;
3006
3007 useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
3008 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
3009 ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt))
3010 pmap_invalidate_cache();
3011 else {
3012 if (useclflushopt)
3013 atomic_thread_fence_seq_cst();
3014 else if (cpu_vendor_id != CPU_VENDOR_INTEL)
3015 mfence();
3016 for (i = 0; i < count; i++) {
3017 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
3018 eva = daddr + PAGE_SIZE;
3019 for (; daddr < eva; daddr += cpu_clflush_line_size) {
3020 if (useclflushopt)
3021 clflushopt(daddr);
3022 else
3023 clflush(daddr);
3024 }
3025 }
3026 if (useclflushopt)
3027 atomic_thread_fence_seq_cst();
3028 else if (cpu_vendor_id != CPU_VENDOR_INTEL)
3029 mfence();
3030 }
3031 }
3032
3033 void
3034 pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva)
3035 {
3036
3037 pmap_invalidate_cache_range_check_align(sva, eva);
3038
3039 if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) {
3040 pmap_force_invalidate_cache_range(sva, eva);
3041 return;
3042 }
3043
3044 /* See comment in pmap_force_invalidate_cache_range(). */
3045 if (pmap_kextract(sva) == lapic_paddr)
3046 return;
3047
3048 atomic_thread_fence_seq_cst();
3049 for (; sva < eva; sva += cpu_clflush_line_size)
3050 clwb(sva);
3051 atomic_thread_fence_seq_cst();
3052 }
3053
3054 void
3055 pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr)
3056 {
3057 pt_entry_t *pte;
3058 vm_offset_t vaddr;
3059 int error, pte_bits;
3060
3061 KASSERT((spa & PAGE_MASK) == 0,
3062 ("pmap_flush_cache_phys_range: spa not page-aligned"));
3063 KASSERT((epa & PAGE_MASK) == 0,
3064 ("pmap_flush_cache_phys_range: epa not page-aligned"));
3065
3066 if (spa < dmaplimit) {
3067 pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN(
3068 dmaplimit, epa)));
3069 if (dmaplimit >= epa)
3070 return;
3071 spa = dmaplimit;
3072 }
3073
3074 pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW |
3075 X86_PG_V;
3076 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
3077 &vaddr);
3078 KASSERT(error == 0, ("vmem_alloc failed: %d", error));
3079 pte = vtopte(vaddr);
3080 for (; spa < epa; spa += PAGE_SIZE) {
3081 sched_pin();
3082 pte_store(pte, spa | pte_bits);
3083 invlpg(vaddr);
3084 /* XXXKIB atomic inside flush_cache_range are excessive */
3085 pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE);
3086 sched_unpin();
3087 }
3088 vmem_free(kernel_arena, vaddr, PAGE_SIZE);
3089 }
3090
3091 /*
3092 * Routine: pmap_extract
3093 * Function:
3094 * Extract the physical page address associated
3095 * with the given map/virtual_address pair.
3096 */
3097 vm_paddr_t
3098 pmap_extract(pmap_t pmap, vm_offset_t va)
3099 {
3100 pdp_entry_t *pdpe;
3101 pd_entry_t *pde;
3102 pt_entry_t *pte, PG_V;
3103 vm_paddr_t pa;
3104
3105 pa = 0;
3106 PG_V = pmap_valid_bit(pmap);
3107 PMAP_LOCK(pmap);
3108 pdpe = pmap_pdpe(pmap, va);
3109 if (pdpe != NULL && (*pdpe & PG_V) != 0) {
3110 if ((*pdpe & PG_PS) != 0)
3111 pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
3112 else {
3113 pde = pmap_pdpe_to_pde(pdpe, va);
3114 if ((*pde & PG_V) != 0) {
3115 if ((*pde & PG_PS) != 0) {
3116 pa = (*pde & PG_PS_FRAME) |
3117 (va & PDRMASK);
3118 } else {
3119 pte = pmap_pde_to_pte(pde, va);
3120 pa = (*pte & PG_FRAME) |
3121 (va & PAGE_MASK);
3122 }
3123 }
3124 }
3125 }
3126 PMAP_UNLOCK(pmap);
3127 return (pa);
3128 }
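/*
 * A minimal usage sketch (assuming "va" names some kernel virtual
 * address of interest):
 *
 *	vm_paddr_t pa;
 *
 *	pa = pmap_extract(kernel_pmap, va);
 *	if (pa == 0)
 *		panic("%#lx is not mapped", va);
 *
 * A return value of zero means that no valid mapping was found; callers
 * that must distinguish an actual mapping of physical address zero need
 * a different interface.
 */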
3129
3130 /*
3131 * Routine: pmap_extract_and_hold
3132 * Function:
3133 * Atomically extract and hold the physical page
3134 * with the given pmap and virtual address pair
3135 * if that mapping permits the given protection.
3136 */
3137 vm_page_t
3138 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
3139 {
3140 pd_entry_t pde, *pdep;
3141 pt_entry_t pte, PG_RW, PG_V;
3142 vm_paddr_t pa;
3143 vm_page_t m;
3144
3145 pa = 0;
3146 m = NULL;
3147 PG_RW = pmap_rw_bit(pmap);
3148 PG_V = pmap_valid_bit(pmap);
3149 PMAP_LOCK(pmap);
3150 retry:
3151 pdep = pmap_pde(pmap, va);
3152 if (pdep != NULL && (pde = *pdep)) {
3153 if (pde & PG_PS) {
3154 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
3155 if (vm_page_pa_tryrelock(pmap, (pde &
3156 PG_PS_FRAME) | (va & PDRMASK), &pa))
3157 goto retry;
3158 m = PHYS_TO_VM_PAGE(pa);
3159 }
3160 } else {
3161 pte = *pmap_pde_to_pte(pdep, va);
3162 if ((pte & PG_V) &&
3163 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
3164 if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
3165 &pa))
3166 goto retry;
3167 m = PHYS_TO_VM_PAGE(pa);
3168 }
3169 }
3170 if (m != NULL)
3171 vm_page_hold(m);
3172 }
3173 PA_UNLOCK_COND(pa);
3174 PMAP_UNLOCK(pmap);
3175 return (m);
3176 }
3177
3178 vm_paddr_t
3179 pmap_kextract(vm_offset_t va)
3180 {
3181 pd_entry_t pde;
3182 vm_paddr_t pa;
3183
3184 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
3185 pa = DMAP_TO_PHYS(va);
3186 } else if (PMAP_ADDRESS_IN_LARGEMAP(va)) {
3187 pa = pmap_large_map_kextract(va);
3188 } else {
3189 pde = *vtopde(va);
3190 if (pde & PG_PS) {
3191 pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
3192 } else {
3193 /*
3194 * Beware of a concurrent promotion that changes the
3195 * PDE at this point! For example, vtopte() must not
3196 * be used to access the PTE because it would use the
3197 * new PDE. It is, however, safe to use the old PDE
3198 * because the page table page is preserved by the
3199 * promotion.
3200 */
3201 pa = *pmap_pde_to_pte(&pde, va);
3202 pa = (pa & PG_FRAME) | (va & PAGE_MASK);
3203 }
3204 }
3205 return (pa);
3206 }
3207
3208 /***************************************************
3209 * Low level mapping routines.....
3210 ***************************************************/
3211
3212 /*
3213 * Add a wired page to the kva.
3214 * Note: not SMP coherent.
3215 */
3216 PMAP_INLINE void
3217 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
3218 {
3219 pt_entry_t *pte;
3220
3221 pte = vtopte(va);
3222 pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | pg_nx);
3223 }
3224
3225 static __inline void
3226 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
3227 {
3228 pt_entry_t *pte;
3229 int cache_bits;
3230
3231 pte = vtopte(va);
3232 cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
3233 pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | pg_nx | cache_bits);
3234 }
3235
3236 /*
3237 * Remove a page from the kernel pagetables.
3238 * Note: not SMP coherent.
3239 */
3240 PMAP_INLINE void
3241 pmap_kremove(vm_offset_t va)
3242 {
3243 pt_entry_t *pte;
3244
3245 pte = vtopte(va);
3246 pte_clear(pte);
3247 }
3248
3249 /*
3250 * Used to map a range of physical addresses into kernel
3251 * virtual address space.
3252 *
3253 * The value passed in '*virt' is a suggested virtual address for
3254 * the mapping. Architectures which can support a direct-mapped
3255 * physical to virtual region can return the appropriate address
3256 * within that region, leaving '*virt' unchanged. Other
3257 * architectures should map the pages starting at '*virt' and
3258 * update '*virt' with the first usable address after the mapped
3259 * region.
3260 */
3261 vm_offset_t
3262 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
3263 {
3264 return PHYS_TO_DMAP(start);
3265 }
3266
3267
3268 /*
3269  * Add a list of wired pages to the kva.
3270  * This routine is only used for temporary
3271 * kernel mappings that do not need to have
3272 * page modification or references recorded.
3273 * Note that old mappings are simply written
3274 * over. The page *must* be wired.
3275 * Note: SMP coherent. Uses a ranged shootdown IPI.
3276 */
3277 void
3278 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
3279 {
3280 pt_entry_t *endpte, oldpte, pa, *pte;
3281 vm_page_t m;
3282 int cache_bits;
3283
3284 oldpte = 0;
3285 pte = vtopte(sva);
3286 endpte = pte + count;
3287 while (pte < endpte) {
3288 m = *ma++;
3289 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
3290 pa = VM_PAGE_TO_PHYS(m) | cache_bits;
3291 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
3292 oldpte |= *pte;
3293 pte_store(pte, pa | pg_g | pg_nx | X86_PG_RW | X86_PG_V);
3294 }
3295 pte++;
3296 }
3297 if (__predict_false((oldpte & X86_PG_V) != 0))
3298 pmap_invalidate_range(kernel_pmap, sva, sva + count *
3299 PAGE_SIZE);
3300 }
3301
3302 /*
3303 * This routine tears out page mappings from the
3304 * kernel -- it is meant only for temporary mappings.
3305 * Note: SMP coherent. Uses a ranged shootdown IPI.
3306 */
3307 void
3308 pmap_qremove(vm_offset_t sva, int count)
3309 {
3310 vm_offset_t va;
3311
3312 va = sva;
3313 while (count-- > 0) {
3314 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
3315 pmap_kremove(va);
3316 va += PAGE_SIZE;
3317 }
3318 pmap_invalidate_range(kernel_pmap, sva, va);
3319 }
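/*
 * A typical pairing of the two routines above (an illustrative sketch,
 * with "kva", "ma", and "npages" standing in for a caller's KVA chunk,
 * wired page array, and page count):
 *
 *	pmap_qenter(kva, ma, npages);
 *	... access the pages through kva ...
 *	pmap_qremove(kva, npages);
 */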
3320
3321 /***************************************************
3322 * Page table page management routines.....
3323 ***************************************************/
3324 /*
3325 * Schedule the specified unused page table page to be freed. Specifically,
3326 * add the page to the specified list of pages that will be released to the
3327 * physical memory manager after the TLB has been updated.
3328 */
3329 static __inline void
3330 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
3331 boolean_t set_PG_ZERO)
3332 {
3333
3334 if (set_PG_ZERO)
3335 m->flags |= PG_ZERO;
3336 else
3337 m->flags &= ~PG_ZERO;
3338 SLIST_INSERT_HEAD(free, m, plinks.s.ss);
3339 }
3340
3341 /*
3342 * Inserts the specified page table page into the specified pmap's collection
3343 * of idle page table pages. Each of a pmap's page table pages is responsible
3344 * for mapping a distinct range of virtual addresses. The pmap's collection is
3345 * ordered by this virtual address range.
3346 *
3347 * If "promoted" is false, then the page table page "mpte" must be zero filled.
3348 */
3349 static __inline int
3350 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted)
3351 {
3352
3353 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3354 mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0;
3355 return (vm_radix_insert(&pmap->pm_root, mpte));
3356 }
3357
3358 /*
3359 * Removes the page table page mapping the specified virtual address from the
3360 * specified pmap's collection of idle page table pages, and returns it.
3361 * Otherwise, returns NULL if there is no page table page corresponding to the
3362 * specified virtual address.
3363 */
3364 static __inline vm_page_t
3365 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
3366 {
3367
3368 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3369 return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va)));
3370 }
3371
3372 /*
3373 * Decrements a page table page's wire count, which is used to record the
3374 * number of valid page table entries within the page. If the wire count
3375 * drops to zero, then the page table page is unmapped. Returns TRUE if the
3376 * page table page was unmapped and FALSE otherwise.
3377 */
3378 static inline boolean_t
3379 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
3380 {
3381
3382 --m->wire_count;
3383 if (m->wire_count == 0) {
3384 _pmap_unwire_ptp(pmap, va, m, free);
3385 return (TRUE);
3386 } else
3387 return (FALSE);
3388 }
3389
3390 static void
3391 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
3392 {
3393
3394 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3395 /*
3396 * unmap the page table page
3397 */
3398 if (m->pindex >= (NUPDE + NUPDPE)) {
3399 /* PDP page */
3400 pml4_entry_t *pml4;
3401 pml4 = pmap_pml4e(pmap, va);
3402 *pml4 = 0;
3403 if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) {
3404 pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)];
3405 *pml4 = 0;
3406 }
3407 } else if (m->pindex >= NUPDE) {
3408 /* PD page */
3409 pdp_entry_t *pdp;
3410 pdp = pmap_pdpe(pmap, va);
3411 *pdp = 0;
3412 } else {
3413 /* PTE page */
3414 pd_entry_t *pd;
3415 pd = pmap_pde(pmap, va);
3416 *pd = 0;
3417 }
3418 pmap_resident_count_dec(pmap, 1);
3419 if (m->pindex < NUPDE) {
3420 /* We just released a PT, unhold the matching PD */
3421 vm_page_t pdpg;
3422
3423 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
3424 pmap_unwire_ptp(pmap, va, pdpg, free);
3425 }
3426 if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
3427 /* We just released a PD, unhold the matching PDP */
3428 vm_page_t pdppg;
3429
3430 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
3431 pmap_unwire_ptp(pmap, va, pdppg, free);
3432 }
3433
3434 /*
3435 * Put page on a list so that it is released after
3436 * *ALL* TLB shootdown is done
3437 */
3438 pmap_add_delayed_free_list(m, free, TRUE);
3439 }
3440
3441 /*
3442 * After removing a page table entry, this routine is used to
3443 * conditionally free the page, and manage the hold/wire counts.
3444 */
3445 static int
3446 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
3447 struct spglist *free)
3448 {
3449 vm_page_t mpte;
3450
3451 if (va >= VM_MAXUSER_ADDRESS)
3452 return (0);
3453 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
3454 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
3455 return (pmap_unwire_ptp(pmap, va, mpte, free));
3456 }
3457
3458 void
3459 pmap_pinit0(pmap_t pmap)
3460 {
3461 struct proc *p;
3462 struct thread *td;
3463 int i;
3464
3465 PMAP_LOCK_INIT(pmap);
3466 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
3467 pmap->pm_pml4u = NULL;
3468 pmap->pm_cr3 = KPML4phys;
3469 /* hack to keep pmap_pti_pcid_invalidate() alive */
3470 pmap->pm_ucr3 = PMAP_NO_CR3;
3471 pmap->pm_root.rt_root = 0;
3472 CPU_ZERO(&pmap->pm_active);
3473 TAILQ_INIT(&pmap->pm_pvchunk);
3474 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
3475 pmap->pm_flags = pmap_flags;
3476 CPU_FOREACH(i) {
3477 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1;
3478 pmap->pm_pcids[i].pm_gen = 1;
3479 }
3480 pmap_activate_boot(pmap);
3481 td = curthread;
3482 if (pti) {
3483 p = td->td_proc;
3484 PROC_LOCK(p);
3485 p->p_amd64_md_flags |= P_MD_KPTI;
3486 PROC_UNLOCK(p);
3487 }
3488 pmap_thread_init_invl_gen(td);
3489
3490 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
3491 pmap_pkru_ranges_zone = uma_zcreate("pkru ranges",
3492 sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL,
3493 UMA_ALIGN_PTR, 0);
3494 }
3495 }
3496
3497 void
3498 pmap_pinit_pml4(vm_page_t pml4pg)
3499 {
3500 pml4_entry_t *pm_pml4;
3501 int i;
3502
3503 pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
3504
3505 /* Wire in kernel global address entries. */
3506 for (i = 0; i < NKPML4E; i++) {
3507 pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW |
3508 X86_PG_V;
3509 }
3510 for (i = 0; i < ndmpdpphys; i++) {
3511 pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW |
3512 X86_PG_V;
3513 }
3514
3515 /* install self-referential address mapping entry(s) */
3516 pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW |
3517 X86_PG_A | X86_PG_M;
3518
3519 /* install large map entries if configured */
3520 for (i = 0; i < lm_ents; i++)
3521 pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pml4[LMSPML4I + i];
3522 }
3523
3524 static void
3525 pmap_pinit_pml4_pti(vm_page_t pml4pg)
3526 {
3527 pml4_entry_t *pm_pml4;
3528 int i;
3529
3530 pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
3531 for (i = 0; i < NPML4EPG; i++)
3532 pm_pml4[i] = pti_pml4[i];
3533 }
3534
3535 /*
3536 * Initialize a preallocated and zeroed pmap structure,
3537 * such as one in a vmspace structure.
3538 */
3539 int
3540 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
3541 {
3542 vm_page_t pml4pg, pml4pgu;
3543 vm_paddr_t pml4phys;
3544 int i;
3545
3546 /*
3547 * allocate the page directory page
3548 */
3549 pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
3550 VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK);
3551
3552 pml4phys = VM_PAGE_TO_PHYS(pml4pg);
3553 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
3554 CPU_FOREACH(i) {
3555 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
3556 pmap->pm_pcids[i].pm_gen = 0;
3557 }
3558 pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */
3559 pmap->pm_ucr3 = PMAP_NO_CR3;
3560 pmap->pm_pml4u = NULL;
3561
3562 pmap->pm_type = pm_type;
3563 if ((pml4pg->flags & PG_ZERO) == 0)
3564 pagezero(pmap->pm_pml4);
3565
3566 /*
3567 * Do not install the host kernel mappings in the nested page
3568 * tables. These mappings are meaningless in the guest physical
3569 * address space.
3570 * Install minimal kernel mappings in PTI case.
3571 */
3572 if (pm_type == PT_X86) {
3573 pmap->pm_cr3 = pml4phys;
3574 pmap_pinit_pml4(pml4pg);
3575 if ((curproc->p_amd64_md_flags & P_MD_KPTI) != 0) {
3576 pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
3577 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
3578 pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(
3579 VM_PAGE_TO_PHYS(pml4pgu));
3580 pmap_pinit_pml4_pti(pml4pgu);
3581 pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu);
3582 }
3583 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
3584 rangeset_init(&pmap->pm_pkru, pkru_dup_range,
3585 pkru_free_range, pmap, M_NOWAIT);
3586 }
3587 }
3588
3589 pmap->pm_root.rt_root = 0;
3590 CPU_ZERO(&pmap->pm_active);
3591 TAILQ_INIT(&pmap->pm_pvchunk);
3592 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
3593 pmap->pm_flags = flags;
3594 pmap->pm_eptgen = 0;
3595
3596 return (1);
3597 }
3598
3599 int
3600 pmap_pinit(pmap_t pmap)
3601 {
3602
3603 return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
3604 }
3605
3606 /*
3607 * This routine is called if the desired page table page does not exist.
3608 *
3609 * If page table page allocation fails, this routine may sleep before
3610 * returning NULL. It sleeps only if a lock pointer was given.
3611 *
3612 * Note: If a page allocation fails at page table level two or three,
3613 * one or two pages may be held during the wait, only to be released
3614 * afterwards. This conservative approach is easily argued to avoid
3615 * race conditions.
3616 *
3617 * The ptepindexes, i.e. page indices, of the page table pages encountered
3618 * while translating virtual address va are defined as follows:
3619 * - for the page table page (last level),
3620 * ptepindex = pmap_pde_pindex(va) = va >> PDRSHIFT,
3621 * in other words, it is just the index of the PDE that maps the page
3622 * table page.
3623 * - for the page directory page,
3624 * ptepindex = NUPDE (number of userland PD entries) +
3625 * (pmap_pde_index(va) >> NPDEPGSHIFT)
3626 * i.e. index of PDPE is put after the last index of PDE,
3627 * - for the page directory pointer page,
3628 * ptepindex = NUPDE + NUPDPE + (pmap_pde_index(va) >> (NPDEPGSHIFT +
3629 * NPML4EPGSHIFT),
3630 * i.e. index of pml4e is put after the last index of PDPE.
3631 *
3632 * Define an order on the paging entries, where all entries of the
3633 * same height are put together, then heights are put from deepest to
3634  * root.  Then ptepindex is the sequential number of the
3635 * corresponding paging entry in this order.
3636 *
3637 * The root page at PML4 does not participate in this indexing scheme, since
3638 * it is statically allocated by pmap_pinit() and not by _pmap_allocpte().
3639 */
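/*
 * For instance, the 4KB page at user address 1GB lies under the PDE with
 * index 512 (1GB >> PDRSHIFT), so its page table page has ptepindex 512.
 * The page directory page covering it has ptepindex NUPDE + 1, and the
 * page directory pointer page has ptepindex NUPDE + NUPDPE, since the
 * address falls within PML4 slot 0.
 */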
3640 static vm_page_t
3641 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
3642 {
3643 vm_page_t m, pdppg, pdpg;
3644 pt_entry_t PG_A, PG_M, PG_RW, PG_V;
3645
3646 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3647
3648 PG_A = pmap_accessed_bit(pmap);
3649 PG_M = pmap_modified_bit(pmap);
3650 PG_V = pmap_valid_bit(pmap);
3651 PG_RW = pmap_rw_bit(pmap);
3652
3653 /*
3654 * Allocate a page table page.
3655 */
3656 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
3657 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
3658 if (lockp != NULL) {
3659 RELEASE_PV_LIST_LOCK(lockp);
3660 PMAP_UNLOCK(pmap);
3661 PMAP_ASSERT_NOT_IN_DI();
3662 vm_wait(NULL);
3663 PMAP_LOCK(pmap);
3664 }
3665
3666 /*
3667 * Indicate the need to retry. While waiting, the page table
3668 * page may have been allocated.
3669 */
3670 return (NULL);
3671 }
3672 if ((m->flags & PG_ZERO) == 0)
3673 pmap_zero_page(m);
3674
3675 /*
3676 * Map the pagetable page into the process address space, if
3677 * it isn't already there.
3678 */
3679
3680 if (ptepindex >= (NUPDE + NUPDPE)) {
3681 pml4_entry_t *pml4, *pml4u;
3682 vm_pindex_t pml4index;
3683
3684 /* Wire up a new PDPE page */
3685 pml4index = ptepindex - (NUPDE + NUPDPE);
3686 pml4 = &pmap->pm_pml4[pml4index];
3687 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
3688 if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) {
3689 /*
3690 * PTI: Make all user-space mappings in the
3691 * kernel-mode page table no-execute so that
3692 * we detect any programming errors that leave
3693 * the kernel-mode page table active on return
3694 * to user space.
3695 */
3696 if (pmap->pm_ucr3 != PMAP_NO_CR3)
3697 *pml4 |= pg_nx;
3698
3699 pml4u = &pmap->pm_pml4u[pml4index];
3700 *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
3701 PG_A | PG_M;
3702 }
3703
3704 } else if (ptepindex >= NUPDE) {
3705 vm_pindex_t pml4index;
3706 vm_pindex_t pdpindex;
3707 pml4_entry_t *pml4;
3708 pdp_entry_t *pdp;
3709
3710 /* Wire up a new PDE page */
3711 pdpindex = ptepindex - NUPDE;
3712 pml4index = pdpindex >> NPML4EPGSHIFT;
3713
3714 pml4 = &pmap->pm_pml4[pml4index];
3715 if ((*pml4 & PG_V) == 0) {
3716 /* Have to allocate a new pdp, recurse */
3717 if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
3718 lockp) == NULL) {
3719 vm_page_unwire_noq(m);
3720 vm_page_free_zero(m);
3721 return (NULL);
3722 }
3723 } else {
3724 /* Add reference to pdp page */
3725 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
3726 pdppg->wire_count++;
3727 }
3728 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
3729
3730 /* Now find the pdp page */
3731 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
3732 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
3733
3734 } else {
3735 vm_pindex_t pml4index;
3736 vm_pindex_t pdpindex;
3737 pml4_entry_t *pml4;
3738 pdp_entry_t *pdp;
3739 pd_entry_t *pd;
3740
3741 /* Wire up a new PTE page */
3742 pdpindex = ptepindex >> NPDPEPGSHIFT;
3743 pml4index = pdpindex >> NPML4EPGSHIFT;
3744
3745 		/* First, find the pdp and check that it's valid. */
3746 pml4 = &pmap->pm_pml4[pml4index];
3747 if ((*pml4 & PG_V) == 0) {
3748 /* Have to allocate a new pd, recurse */
3749 if (_pmap_allocpte(pmap, NUPDE + pdpindex,
3750 lockp) == NULL) {
3751 vm_page_unwire_noq(m);
3752 vm_page_free_zero(m);
3753 return (NULL);
3754 }
3755 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
3756 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
3757 } else {
3758 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
3759 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
3760 if ((*pdp & PG_V) == 0) {
3761 /* Have to allocate a new pd, recurse */
3762 if (_pmap_allocpte(pmap, NUPDE + pdpindex,
3763 lockp) == NULL) {
3764 vm_page_unwire_noq(m);
3765 vm_page_free_zero(m);
3766 return (NULL);
3767 }
3768 } else {
3769 /* Add reference to the pd page */
3770 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
3771 pdpg->wire_count++;
3772 }
3773 }
3774 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
3775
3776 /* Now we know where the page directory page is */
3777 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
3778 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
3779 }
3780
3781 pmap_resident_count_inc(pmap, 1);
3782
3783 return (m);
3784 }
3785
3786 static vm_page_t
3787 pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
3788 {
3789 vm_pindex_t pdpindex, ptepindex;
3790 pdp_entry_t *pdpe, PG_V;
3791 vm_page_t pdpg;
3792
3793 PG_V = pmap_valid_bit(pmap);
3794
3795 retry:
3796 pdpe = pmap_pdpe(pmap, va);
3797 if (pdpe != NULL && (*pdpe & PG_V) != 0) {
3798 /* Add a reference to the pd page. */
3799 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
3800 pdpg->wire_count++;
3801 } else {
3802 /* Allocate a pd page. */
3803 ptepindex = pmap_pde_pindex(va);
3804 pdpindex = ptepindex >> NPDPEPGSHIFT;
3805 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
3806 if (pdpg == NULL && lockp != NULL)
3807 goto retry;
3808 }
3809 return (pdpg);
3810 }
3811
3812 static vm_page_t
3813 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
3814 {
3815 vm_pindex_t ptepindex;
3816 pd_entry_t *pd, PG_V;
3817 vm_page_t m;
3818
3819 PG_V = pmap_valid_bit(pmap);
3820
3821 /*
3822 * Calculate pagetable page index
3823 */
3824 ptepindex = pmap_pde_pindex(va);
3825 retry:
3826 /*
3827 * Get the page directory entry
3828 */
3829 pd = pmap_pde(pmap, va);
3830
3831 /*
3832 * This supports switching from a 2MB page to a
3833 * normal 4K page.
3834 */
3835 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
3836 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
3837 /*
3838 * Invalidation of the 2MB page mapping may have caused
3839 * the deallocation of the underlying PD page.
3840 */
3841 pd = NULL;
3842 }
3843 }
3844
3845 /*
3846 * If the page table page is mapped, we just increment the
3847 * hold count, and activate it.
3848 */
3849 if (pd != NULL && (*pd & PG_V) != 0) {
3850 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
3851 m->wire_count++;
3852 } else {
3853 /*
3854 * Here if the pte page isn't mapped, or if it has been
3855 * deallocated.
3856 */
3857 m = _pmap_allocpte(pmap, ptepindex, lockp);
3858 if (m == NULL && lockp != NULL)
3859 goto retry;
3860 }
3861 return (m);
3862 }
3863
3864
3865 /***************************************************
3866 * Pmap allocation/deallocation routines.
3867 ***************************************************/
3868
3869 /*
3870 * Release any resources held by the given physical map.
3871 * Called when a pmap initialized by pmap_pinit is being released.
3872 * Should only be called if the map contains no valid mappings.
3873 */
3874 void
3875 pmap_release(pmap_t pmap)
3876 {
3877 vm_page_t m;
3878 int i;
3879
3880 KASSERT(pmap->pm_stats.resident_count == 0,
3881 ("pmap_release: pmap resident count %ld != 0",
3882 pmap->pm_stats.resident_count));
3883 KASSERT(vm_radix_is_empty(&pmap->pm_root),
3884 ("pmap_release: pmap has reserved page table page(s)"));
3885 KASSERT(CPU_EMPTY(&pmap->pm_active),
3886 ("releasing active pmap %p", pmap));
3887
3888 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
3889
3890 for (i = 0; i < NKPML4E; i++) /* KVA */
3891 pmap->pm_pml4[KPML4BASE + i] = 0;
3892 for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
3893 pmap->pm_pml4[DMPML4I + i] = 0;
3894 pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */
3895 for (i = 0; i < lm_ents; i++) /* Large Map */
3896 pmap->pm_pml4[LMSPML4I + i] = 0;
3897
3898 vm_page_unwire_noq(m);
3899 vm_page_free_zero(m);
3900
3901 if (pmap->pm_pml4u != NULL) {
3902 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u));
3903 vm_page_unwire_noq(m);
3904 vm_page_free(m);
3905 }
3906 if (pmap->pm_type == PT_X86 &&
3907 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
3908 rangeset_fini(&pmap->pm_pkru);
3909 }
3910
3911 static int
3912 kvm_size(SYSCTL_HANDLER_ARGS)
3913 {
3914 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
3915
3916 return sysctl_handle_long(oidp, &ksize, 0, req);
3917 }
3918 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
3919 0, 0, kvm_size, "LU", "Size of KVM");
3920
3921 static int
3922 kvm_free(SYSCTL_HANDLER_ARGS)
3923 {
3924 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
3925
3926 return sysctl_handle_long(oidp, &kfree, 0, req);
3927 }
3928 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
3929 0, 0, kvm_free, "LU", "Amount of KVM free");
3930
3931 /*
3932 * grow the number of kernel page table entries, if needed
3933 */
3934 void
3935 pmap_growkernel(vm_offset_t addr)
3936 {
3937 vm_paddr_t paddr;
3938 vm_page_t nkpg;
3939 pd_entry_t *pde, newpdir;
3940 pdp_entry_t *pdpe;
3941
3942 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
3943
3944 /*
3945 * Return if "addr" is within the range of kernel page table pages
3946 * that were preallocated during pmap bootstrap. Moreover, leave
3947 * "kernel_vm_end" and the kernel page table as they were.
3948 *
3949 * The correctness of this action is based on the following
3950 * argument: vm_map_insert() allocates contiguous ranges of the
3951 * kernel virtual address space. It calls this function if a range
3952 * ends after "kernel_vm_end". If the kernel is mapped between
3953 * "kernel_vm_end" and "addr", then the range cannot begin at
3954 * "kernel_vm_end". In fact, its beginning address cannot be less
3955 * than the kernel. Thus, there is no immediate need to allocate
3956 * any new kernel page table pages between "kernel_vm_end" and
3957 * "KERNBASE".
3958 */
3959 if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
3960 return;
3961
3962 addr = roundup2(addr, NBPDR);
3963 if (addr - 1 >= vm_map_max(kernel_map))
3964 addr = vm_map_max(kernel_map);
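	/*
	 * Extend the kernel page table one PDE (NBPDR bytes of KVA) at a
	 * time, allocating a new page directory page whenever the
	 * enclosing PDP entry is invalid and a new page table page for
	 * each newly valid PDE.
	 */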
3965 while (kernel_vm_end < addr) {
3966 pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
3967 if ((*pdpe & X86_PG_V) == 0) {
3968 /* We need a new PDP entry */
3969 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
3970 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
3971 VM_ALLOC_WIRED | VM_ALLOC_ZERO);
3972 if (nkpg == NULL)
3973 panic("pmap_growkernel: no memory to grow kernel");
3974 if ((nkpg->flags & PG_ZERO) == 0)
3975 pmap_zero_page(nkpg);
3976 paddr = VM_PAGE_TO_PHYS(nkpg);
3977 *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
3978 X86_PG_A | X86_PG_M);
3979 continue; /* try again */
3980 }
3981 pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
3982 if ((*pde & X86_PG_V) != 0) {
3983 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
3984 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3985 kernel_vm_end = vm_map_max(kernel_map);
3986 break;
3987 }
3988 continue;
3989 }
3990
3991 nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
3992 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
3993 VM_ALLOC_ZERO);
3994 if (nkpg == NULL)
3995 panic("pmap_growkernel: no memory to grow kernel");
3996 if ((nkpg->flags & PG_ZERO) == 0)
3997 pmap_zero_page(nkpg);
3998 paddr = VM_PAGE_TO_PHYS(nkpg);
3999 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
4000 pde_store(pde, newpdir);
4001
4002 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
4003 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
4004 kernel_vm_end = vm_map_max(kernel_map);
4005 break;
4006 }
4007 }
4008 }
4009
4010
4011 /***************************************************
4012 * page management routines.
4013 ***************************************************/
4014
4015 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
4016 CTASSERT(_NPCM == 3);
4017 CTASSERT(_NPCPV == 168);
4018
4019 static __inline struct pv_chunk *
4020 pv_to_chunk(pv_entry_t pv)
4021 {
4022
4023 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
4024 }
4025
4026 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
4027
4028 #define PC_FREE0 0xfffffffffffffffful
4029 #define PC_FREE1 0xfffffffffffffffful
4030 #define PC_FREE2 0x000000fffffffffful
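/* The three map words together provide 64 + 64 + 40 = 168 (_NPCPV) bits. */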
4031
4032 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
4033
4034 #ifdef PV_STATS
4035 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
4036
4037 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
4038 "Current number of pv entry chunks");
4039 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
4040 "Current number of pv entry chunks allocated");
4041 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
4042 	"Current number of pv entry chunk frees");
4043 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
4044 "Number of times tried to get a chunk page but failed.");
4045
4046 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
4047 static int pv_entry_spare;
4048
4049 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
4050 "Current number of pv entry frees");
4051 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
4052 "Current number of pv entry allocs");
4053 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
4054 "Current number of pv entries");
4055 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
4056 "Current number of spare pv entries");
4057 #endif
4058
4059 static void
4060 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di)
4061 {
4062
4063 if (pmap == NULL)
4064 return;
4065 pmap_invalidate_all(pmap);
4066 if (pmap != locked_pmap)
4067 PMAP_UNLOCK(pmap);
4068 if (start_di)
4069 pmap_delayed_invl_finish();
4070 }
4071
4072 /*
4073 * We are in a serious low memory condition. Resort to
4074 * drastic measures to free some pages so we can allocate
4075 * another pv entry chunk.
4076 *
4077 * Returns NULL if PV entries were reclaimed from the specified pmap.
4078 *
4079 * We do not, however, unmap 2mpages because subsequent accesses will
4080 * allocate per-page pv entries until repromotion occurs, thereby
4081 * exacerbating the shortage of free pv entries.
4082 */
4083 static vm_page_t
4084 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
4085 {
4086 struct pv_chunk *pc, *pc_marker, *pc_marker_end;
4087 struct pv_chunk_header pc_marker_b, pc_marker_end_b;
4088 struct md_page *pvh;
4089 pd_entry_t *pde;
4090 pmap_t next_pmap, pmap;
4091 pt_entry_t *pte, tpte;
4092 pt_entry_t PG_G, PG_A, PG_M, PG_RW;
4093 pv_entry_t pv;
4094 vm_offset_t va;
4095 vm_page_t m, m_pc;
4096 struct spglist free;
4097 uint64_t inuse;
4098 int bit, field, freed;
4099 bool start_di;
4100 static int active_reclaims = 0;
4101
4102 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
4103 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
4104 pmap = NULL;
4105 m_pc = NULL;
4106 PG_G = PG_A = PG_M = PG_RW = 0;
4107 SLIST_INIT(&free);
4108 bzero(&pc_marker_b, sizeof(pc_marker_b));
4109 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
4110 pc_marker = (struct pv_chunk *)&pc_marker_b;
4111 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
4112
4113 /*
4114 * A delayed invalidation block should already be active if
4115 * pmap_advise() or pmap_remove() called this function by way
4116 * of pmap_demote_pde_locked().
4117 */
4118 start_di = pmap_not_in_di();
4119
4120 mtx_lock(&pv_chunks_mutex);
4121 active_reclaims++;
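	/*
	 * The two markers bracket this call's traversal of the pv_chunks
	 * LRU list, so that pv_chunks_mutex can be dropped and reacquired
	 * without losing our place in the list.
	 */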
4122 TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
4123 TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
4124 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
4125 SLIST_EMPTY(&free)) {
4126 next_pmap = pc->pc_pmap;
4127 if (next_pmap == NULL) {
4128 /*
4129 * The next chunk is a marker. However, it is
4130 * not our marker, so active_reclaims must be
4131 * > 1. Consequently, the next_chunk code
4132 * will not rotate the pv_chunks list.
4133 */
4134 goto next_chunk;
4135 }
4136 mtx_unlock(&pv_chunks_mutex);
4137
4138 /*
4139 * A pv_chunk can only be removed from the pc_lru list
4140 		 * when both pv_chunks_mutex is owned and the
4141 * corresponding pmap is locked.
4142 */
4143 if (pmap != next_pmap) {
4144 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap,
4145 start_di);
4146 pmap = next_pmap;
4147 /* Avoid deadlock and lock recursion. */
4148 if (pmap > locked_pmap) {
4149 RELEASE_PV_LIST_LOCK(lockp);
4150 PMAP_LOCK(pmap);
4151 if (start_di)
4152 pmap_delayed_invl_start();
4153 mtx_lock(&pv_chunks_mutex);
4154 continue;
4155 } else if (pmap != locked_pmap) {
4156 if (PMAP_TRYLOCK(pmap)) {
4157 if (start_di)
4158 pmap_delayed_invl_start();
4159 mtx_lock(&pv_chunks_mutex);
4160 continue;
4161 } else {
4162 pmap = NULL; /* pmap is not locked */
4163 mtx_lock(&pv_chunks_mutex);
4164 pc = TAILQ_NEXT(pc_marker, pc_lru);
4165 if (pc == NULL ||
4166 pc->pc_pmap != next_pmap)
4167 continue;
4168 goto next_chunk;
4169 }
4170 } else if (start_di)
4171 pmap_delayed_invl_start();
4172 PG_G = pmap_global_bit(pmap);
4173 PG_A = pmap_accessed_bit(pmap);
4174 PG_M = pmap_modified_bit(pmap);
4175 PG_RW = pmap_rw_bit(pmap);
4176 }
4177
4178 /*
4179 * Destroy every non-wired, 4 KB page mapping in the chunk.
4180 */
4181 freed = 0;
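		/*
		 * A clear bit in pc_map denotes an allocated pv entry, so
		 * "inuse" selects the allocated entries of each map word.
		 */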
4182 for (field = 0; field < _NPCM; field++) {
4183 for (inuse = ~pc->pc_map[field] & pc_freemask[field];
4184 inuse != 0; inuse &= ~(1UL << bit)) {
4185 bit = bsfq(inuse);
4186 pv = &pc->pc_pventry[field * 64 + bit];
4187 va = pv->pv_va;
4188 pde = pmap_pde(pmap, va);
4189 if ((*pde & PG_PS) != 0)
4190 continue;
4191 pte = pmap_pde_to_pte(pde, va);
4192 if ((*pte & PG_W) != 0)
4193 continue;
4194 tpte = pte_load_clear(pte);
4195 if ((tpte & PG_G) != 0)
4196 pmap_invalidate_page(pmap, va);
4197 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
4198 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
4199 vm_page_dirty(m);
4200 if ((tpte & PG_A) != 0)
4201 vm_page_aflag_set(m, PGA_REFERENCED);
4202 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
4203 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4204 m->md.pv_gen++;
4205 if (TAILQ_EMPTY(&m->md.pv_list) &&
4206 (m->flags & PG_FICTITIOUS) == 0) {
4207 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4208 if (TAILQ_EMPTY(&pvh->pv_list)) {
4209 vm_page_aflag_clear(m,
4210 PGA_WRITEABLE);
4211 }
4212 }
4213 pmap_delayed_invl_page(m);
4214 pc->pc_map[field] |= 1UL << bit;
4215 pmap_unuse_pt(pmap, va, *pde, &free);
4216 freed++;
4217 }
4218 }
4219 if (freed == 0) {
4220 mtx_lock(&pv_chunks_mutex);
4221 goto next_chunk;
4222 }
4223 /* Every freed mapping is for a 4 KB page. */
4224 pmap_resident_count_dec(pmap, freed);
4225 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
4226 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
4227 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
4228 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4229 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
4230 pc->pc_map[2] == PC_FREE2) {
4231 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
4232 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
4233 PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
4234 /* Entire chunk is free; return it. */
4235 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
4236 dump_drop_page(m_pc->phys_addr);
4237 mtx_lock(&pv_chunks_mutex);
4238 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
4239 break;
4240 }
4241 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
4242 mtx_lock(&pv_chunks_mutex);
4243 /* One freed pv entry in locked_pmap is sufficient. */
4244 if (pmap == locked_pmap)
4245 break;
4246 next_chunk:
4247 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
4248 TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
4249 if (active_reclaims == 1 && pmap != NULL) {
4250 /*
4251 * Rotate the pv chunks list so that we do not
4252 * scan the same pv chunks that could not be
4253 * freed (because they contained a wired
4254 * and/or superpage mapping) on every
4255 * invocation of reclaim_pv_chunk().
4256 */
4257 while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
4258 MPASS(pc->pc_pmap != NULL);
4259 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
4260 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
4261 }
4262 }
4263 }
4264 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
4265 TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
4266 active_reclaims--;
4267 mtx_unlock(&pv_chunks_mutex);
4268 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di);
4269 if (m_pc == NULL && !SLIST_EMPTY(&free)) {
4270 m_pc = SLIST_FIRST(&free);
4271 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
4272 /* Recycle a freed page table page. */
4273 m_pc->wire_count = 1;
4274 }
4275 vm_page_free_pages_toq(&free, true);
4276 return (m_pc);
4277 }
4278
4279 /*
4280 * free the pv_entry back to the free list
4281 */
4282 static void
4283 free_pv_entry(pmap_t pmap, pv_entry_t pv)
4284 {
4285 struct pv_chunk *pc;
4286 int idx, field, bit;
4287
4288 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4289 PV_STAT(atomic_add_long(&pv_entry_frees, 1));
4290 PV_STAT(atomic_add_int(&pv_entry_spare, 1));
4291 PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
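	/* Locate the entry's bit within the chunk's free map and set it. */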
4292 pc = pv_to_chunk(pv);
4293 idx = pv - &pc->pc_pventry[0];
4294 field = idx / 64;
4295 bit = idx % 64;
4296 pc->pc_map[field] |= 1ul << bit;
4297 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
4298 pc->pc_map[2] != PC_FREE2) {
4299 /* 98% of the time, pc is already at the head of the list. */
4300 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
4301 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4302 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
4303 }
4304 return;
4305 }
4306 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4307 free_pv_chunk(pc);
4308 }
4309
4310 static void
4311 free_pv_chunk_dequeued(struct pv_chunk *pc)
4312 {
4313 vm_page_t m;
4314
4315 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
4316 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
4317 PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
4318 /* entire chunk is free, return it */
4319 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
4320 dump_drop_page(m->phys_addr);
4321 vm_page_unwire_noq(m);
4322 vm_page_free(m);
4323 }
4324
4325 static void
4326 free_pv_chunk(struct pv_chunk *pc)
4327 {
4328
4329 mtx_lock(&pv_chunks_mutex);
4330 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
4331 mtx_unlock(&pv_chunks_mutex);
4332 free_pv_chunk_dequeued(pc);
4333 }
4334
4335 static void
4336 free_pv_chunk_batch(struct pv_chunklist *batch)
4337 {
4338 struct pv_chunk *pc, *npc;
4339
4340 if (TAILQ_EMPTY(batch))
4341 return;
4342
4343 mtx_lock(&pv_chunks_mutex);
4344 TAILQ_FOREACH(pc, batch, pc_list) {
4345 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
4346 }
4347 mtx_unlock(&pv_chunks_mutex);
4348
4349 TAILQ_FOREACH_SAFE(pc, batch, pc_list, npc) {
4350 free_pv_chunk_dequeued(pc);
4351 }
4352 }
4353
4354 /*
4355 * Returns a new PV entry, allocating a new PV chunk from the system when
4356 * needed. If this PV chunk allocation fails and a PV list lock pointer was
4357 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is
4358 * returned.
4359 *
4360 * The given PV list lock may be released.
4361 */
4362 static pv_entry_t
4363 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
4364 {
4365 int bit, field;
4366 pv_entry_t pv;
4367 struct pv_chunk *pc;
4368 vm_page_t m;
4369
4370 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4371 PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
4372 retry:
4373 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
4374 if (pc != NULL) {
4375 for (field = 0; field < _NPCM; field++) {
4376 if (pc->pc_map[field]) {
4377 bit = bsfq(pc->pc_map[field]);
4378 break;
4379 }
4380 }
4381 if (field < _NPCM) {
4382 pv = &pc->pc_pventry[field * 64 + bit];
4383 pc->pc_map[field] &= ~(1ul << bit);
4384 /* If this was the last item, move it to tail */
4385 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
4386 pc->pc_map[2] == 0) {
4387 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4388 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
4389 pc_list);
4390 }
4391 PV_STAT(atomic_add_long(&pv_entry_count, 1));
4392 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
4393 return (pv);
4394 }
4395 }
4396 /* No free items, allocate another chunk */
4397 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
4398 VM_ALLOC_WIRED);
4399 if (m == NULL) {
4400 if (lockp == NULL) {
4401 PV_STAT(pc_chunk_tryfail++);
4402 return (NULL);
4403 }
4404 m = reclaim_pv_chunk(pmap, lockp);
4405 if (m == NULL)
4406 goto retry;
4407 }
4408 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
4409 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
4410 dump_add_page(m->phys_addr);
4411 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
4412 pc->pc_pmap = pmap;
4413 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */
4414 pc->pc_map[1] = PC_FREE1;
4415 pc->pc_map[2] = PC_FREE2;
4416 mtx_lock(&pv_chunks_mutex);
4417 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
4418 mtx_unlock(&pv_chunks_mutex);
4419 pv = &pc->pc_pventry[0];
4420 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
4421 PV_STAT(atomic_add_long(&pv_entry_count, 1));
4422 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
4423 return (pv);
4424 }
4425
4426 /*
4427 * Returns the number of one bits within the given PV chunk map.
4428 *
4429  * The errata for Intel processors state that "POPCNT Instruction May
4430 * Take Longer to Execute Than Expected". It is believed that the
4431 * issue is the spurious dependency on the destination register.
4432 * Provide a hint to the register rename logic that the destination
4433 * value is overwritten, by clearing it, as suggested in the
4434 * optimization manual. It should be cheap for unaffected processors
4435 * as well.
4436 *
4437  * Reference numbers for the errata are
4438 * 4th Gen Core: HSD146
4439 * 5th Gen Core: BDM85
4440 * 6th Gen Core: SKL029
4441 */
4442 static int
4443 popcnt_pc_map_pq(uint64_t *map)
4444 {
4445 u_long result, tmp;
4446
4447 __asm __volatile("xorl %k0,%k0;popcntq %2,%0;"
4448 "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;"
4449 "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0"
4450 : "=&r" (result), "=&r" (tmp)
4451 : "m" (map[0]), "m" (map[1]), "m" (map[2]));
4452 return (result);
4453 }
4454
4455 /*
4456 * Ensure that the number of spare PV entries in the specified pmap meets or
4457 * exceeds the given count, "needed".
4458 *
4459 * The given PV list lock may be released.
4460 */
4461 static void
4462 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
4463 {
4464 struct pch new_tail;
4465 struct pv_chunk *pc;
4466 vm_page_t m;
4467 int avail, free;
4468 bool reclaimed;
4469
4470 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4471 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
4472
4473 /*
4474 * Newly allocated PV chunks must be stored in a private list until
4475 * the required number of PV chunks have been allocated. Otherwise,
4476 * reclaim_pv_chunk() could recycle one of these chunks. In
4477 * contrast, these chunks must be added to the pmap upon allocation.
4478 */
4479 TAILQ_INIT(&new_tail);
4480 retry:
4481 avail = 0;
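	/* Count the pv entries still free in the pmap's existing chunks. */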
4482 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
4483 #ifndef __POPCNT__
4484 if ((cpu_feature2 & CPUID2_POPCNT) == 0)
4485 bit_count((bitstr_t *)pc->pc_map, 0,
4486 sizeof(pc->pc_map) * NBBY, &free);
4487 else
4488 #endif
4489 free = popcnt_pc_map_pq(pc->pc_map);
4490 if (free == 0)
4491 break;
4492 avail += free;
4493 if (avail >= needed)
4494 break;
4495 }
4496 for (reclaimed = false; avail < needed; avail += _NPCPV) {
4497 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
4498 VM_ALLOC_WIRED);
4499 if (m == NULL) {
4500 m = reclaim_pv_chunk(pmap, lockp);
4501 if (m == NULL)
4502 goto retry;
4503 reclaimed = true;
4504 }
4505 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
4506 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
4507 dump_add_page(m->phys_addr);
4508 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
4509 pc->pc_pmap = pmap;
4510 pc->pc_map[0] = PC_FREE0;
4511 pc->pc_map[1] = PC_FREE1;
4512 pc->pc_map[2] = PC_FREE2;
4513 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
4514 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
4515 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
4516
4517 /*
4518 * The reclaim might have freed a chunk from the current pmap.
4519 * If that chunk contained available entries, we need to
4520 * re-count the number of available entries.
4521 */
4522 if (reclaimed)
4523 goto retry;
4524 }
4525 if (!TAILQ_EMPTY(&new_tail)) {
4526 mtx_lock(&pv_chunks_mutex);
4527 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
4528 mtx_unlock(&pv_chunks_mutex);
4529 }
4530 }
4531
4532 /*
4533 * First find and then remove the pv entry for the specified pmap and virtual
4534 * address from the specified pv list. Returns the pv entry if found and NULL
4535 * otherwise. This operation can be performed on pv lists for either 4KB or
4536 * 2MB page mappings.
4537 */
4538 static __inline pv_entry_t
4539 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
4540 {
4541 pv_entry_t pv;
4542
4543 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4544 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
4545 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4546 pvh->pv_gen++;
4547 break;
4548 }
4549 }
4550 return (pv);
4551 }
4552
4553 /*
4554 * After demotion from a 2MB page mapping to 512 4KB page mappings,
4555 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
4556 * entries for each of the 4KB page mappings.
4557 */
4558 static void
4559 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
4560 struct rwlock **lockp)
4561 {
4562 struct md_page *pvh;
4563 struct pv_chunk *pc;
4564 pv_entry_t pv;
4565 vm_offset_t va_last;
4566 vm_page_t m;
4567 int bit, field;
4568
4569 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4570 KASSERT((pa & PDRMASK) == 0,
4571 ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
4572 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4573
4574 /*
4575 * Transfer the 2mpage's pv entry for this mapping to the first
4576 * page's pv list. Once this transfer begins, the pv list lock
4577 * must not be released until the last pv entry is reinstantiated.
4578 */
4579 pvh = pa_to_pvh(pa);
4580 va = trunc_2mpage(va);
4581 pv = pmap_pvh_remove(pvh, pmap, va);
4582 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
4583 m = PHYS_TO_VM_PAGE(pa);
4584 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4585 m->md.pv_gen++;
4586 /* Instantiate the remaining NPTEPG - 1 pv entries. */
4587 PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
4588 va_last = va + NBPDR - PAGE_SIZE;
4589 for (;;) {
4590 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
4591 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
4592 pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
4593 for (field = 0; field < _NPCM; field++) {
4594 while (pc->pc_map[field]) {
4595 bit = bsfq(pc->pc_map[field]);
4596 pc->pc_map[field] &= ~(1ul << bit);
4597 pv = &pc->pc_pventry[field * 64 + bit];
4598 va += PAGE_SIZE;
4599 pv->pv_va = va;
4600 m++;
4601 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4602 ("pmap_pv_demote_pde: page %p is not managed", m));
4603 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4604 m->md.pv_gen++;
4605 if (va == va_last)
4606 goto out;
4607 }
4608 }
4609 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4610 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
4611 }
4612 out:
4613 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
4614 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4615 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
4616 }
4617 PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
4618 PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
4619 }
4620
4621 #if VM_NRESERVLEVEL > 0
4622 /*
4623 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
4624 * replace the many pv entries for the 4KB page mappings by a single pv entry
4625 * for the 2MB page mapping.
4626 */
4627 static void
4628 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
4629 struct rwlock **lockp)
4630 {
4631 struct md_page *pvh;
4632 pv_entry_t pv;
4633 vm_offset_t va_last;
4634 vm_page_t m;
4635
4636 KASSERT((pa & PDRMASK) == 0,
4637 ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
4638 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4639
4640 /*
4641 * Transfer the first page's pv entry for this mapping to the 2mpage's
4642 * pv list. Aside from avoiding the cost of a call to get_pv_entry(),
4643 * a transfer avoids the possibility that get_pv_entry() calls
4644 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
4645 * mappings that is being promoted.
4646 */
4647 m = PHYS_TO_VM_PAGE(pa);
4648 va = trunc_2mpage(va);
4649 pv = pmap_pvh_remove(&m->md, pmap, va);
4650 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
4651 pvh = pa_to_pvh(pa);
4652 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4653 pvh->pv_gen++;
4654 /* Free the remaining NPTEPG - 1 pv entries. */
4655 va_last = va + NBPDR - PAGE_SIZE;
4656 do {
4657 m++;
4658 va += PAGE_SIZE;
4659 pmap_pvh_free(&m->md, pmap, va);
4660 } while (va < va_last);
4661 }
4662 #endif /* VM_NRESERVLEVEL > 0 */
4663
4664 /*
4665 * First find and then destroy the pv entry for the specified pmap and virtual
4666 * address. This operation can be performed on pv lists for either 4KB or 2MB
4667 * page mappings.
4668 */
4669 static void
4670 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
4671 {
4672 pv_entry_t pv;
4673
4674 pv = pmap_pvh_remove(pvh, pmap, va);
4675 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
4676 free_pv_entry(pmap, pv);
4677 }
4678
4679 /*
4680 * Conditionally create the PV entry for a 4KB page mapping if the required
4681 * memory can be allocated without resorting to reclamation.
4682 */
4683 static boolean_t
4684 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
4685 struct rwlock **lockp)
4686 {
4687 pv_entry_t pv;
4688
4689 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4690 /* Pass NULL instead of the lock pointer to disable reclamation. */
4691 if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
4692 pv->pv_va = va;
4693 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
4694 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4695 m->md.pv_gen++;
4696 return (TRUE);
4697 } else
4698 return (FALSE);
4699 }
4700
4701 /*
4702 * Create the PV entry for a 2MB page mapping. Always returns true unless the
4703 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns
4704 * false if the PV entry cannot be allocated without resorting to reclamation.
4705 */
4706 static bool
4707 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags,
4708 struct rwlock **lockp)
4709 {
4710 struct md_page *pvh;
4711 pv_entry_t pv;
4712 vm_paddr_t pa;
4713
4714 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4715 /* Pass NULL instead of the lock pointer to disable reclamation. */
4716 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
4717 NULL : lockp)) == NULL)
4718 return (false);
4719 pv->pv_va = va;
4720 pa = pde & PG_PS_FRAME;
4721 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4722 pvh = pa_to_pvh(pa);
4723 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4724 pvh->pv_gen++;
4725 return (true);
4726 }
4727
4728 /*
4729 * Fills a page table page with mappings to consecutive physical pages.
4730 */
4731 static void
4732 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
4733 {
4734 pt_entry_t *pte;
4735
4736 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
4737 *pte = newpte;
4738 newpte += PAGE_SIZE;
4739 }
4740 }
4741
4742 /*
4743 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page
4744 * mapping is invalidated.
4745 */
4746 static boolean_t
4747 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
4748 {
4749 struct rwlock *lock;
4750 boolean_t rv;
4751
4752 lock = NULL;
4753 rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
4754 if (lock != NULL)
4755 rw_wunlock(lock);
4756 return (rv);
4757 }
4758
4759 static void
4760 pmap_demote_pde_check(pt_entry_t *firstpte __unused, pt_entry_t newpte __unused)
4761 {
4762 #ifdef INVARIANTS
4763 #ifdef DIAGNOSTIC
4764 pt_entry_t *xpte, *ypte;
4765
4766 for (xpte = firstpte; xpte < firstpte + NPTEPG;
4767 xpte++, newpte += PAGE_SIZE) {
4768 if ((*xpte & PG_FRAME) != (newpte & PG_FRAME)) {
4769 printf("pmap_demote_pde: xpte %zd and newpte map "
4770 "different pages: found %#lx, expected %#lx\n",
4771 xpte - firstpte, *xpte, newpte);
4772 printf("page table dump\n");
4773 for (ypte = firstpte; ypte < firstpte + NPTEPG; ypte++)
4774 printf("%zd %#lx\n", ypte - firstpte, *ypte);
4775 panic("firstpte");
4776 }
4777 }
4778 #else
4779 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
4780 ("pmap_demote_pde: firstpte and newpte map different physical"
4781 " addresses"));
4782 #endif
4783 #endif
4784 }
4785
4786 static void
4787 pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
4788 pd_entry_t oldpde, struct rwlock **lockp)
4789 {
4790 struct spglist free;
4791 vm_offset_t sva;
4792
4793 SLIST_INIT(&free);
4794 sva = trunc_2mpage(va);
4795 pmap_remove_pde(pmap, pde, sva, &free, lockp);
4796 if ((oldpde & pmap_global_bit(pmap)) == 0)
4797 pmap_invalidate_pde_page(pmap, sva, oldpde);
4798 vm_page_free_pages_toq(&free, true);
4799 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx in pmap %p",
4800 va, pmap);
4801 }
4802
4803 static boolean_t
4804 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
4805 struct rwlock **lockp)
4806 {
4807 pd_entry_t newpde, oldpde;
4808 pt_entry_t *firstpte, newpte;
4809 pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V;
4810 vm_paddr_t mptepa;
4811 vm_page_t mpte;
4812 int PG_PTE_CACHE;
4813 bool in_kernel;
4814
4815 PG_A = pmap_accessed_bit(pmap);
4816 PG_G = pmap_global_bit(pmap);
4817 PG_M = pmap_modified_bit(pmap);
4818 PG_RW = pmap_rw_bit(pmap);
4819 PG_V = pmap_valid_bit(pmap);
4820 PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
4821 PG_PKU_MASK = pmap_pku_mask_bit(pmap);
4822
4823 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4824 in_kernel = va >= VM_MAXUSER_ADDRESS;
4825 oldpde = *pde;
4826 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
4827 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
4828
4829 /*
4830 * Invalidate the 2MB page mapping and return "failure" if the
4831 * mapping was never accessed.
4832 */
4833 if ((oldpde & PG_A) == 0) {
4834 KASSERT((oldpde & PG_W) == 0,
4835 ("pmap_demote_pde: a wired mapping is missing PG_A"));
4836 pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp);
4837 return (FALSE);
4838 }
4839
4840 mpte = pmap_remove_pt_page(pmap, va);
4841 if (mpte == NULL) {
4842 KASSERT((oldpde & PG_W) == 0,
4843 ("pmap_demote_pde: page table page for a wired mapping"
4844 " is missing"));
4845
4846 /*
4847 * If the page table page is missing and the mapping
4848 * is for a kernel address, the mapping must belong to
4849 * the direct map. Page table pages are preallocated
4850 * for every other part of the kernel address space,
4851 * so the direct map region is the only part of the
4852 * kernel address space that must be handled here.
4853 */
4854 KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS &&
4855 va < DMAP_MAX_ADDRESS),
4856 ("pmap_demote_pde: No saved mpte for va %#lx", va));
4857
4858 /*
4859 * If the 2MB page mapping belongs to the direct map
4860 * region of the kernel's address space, then the page
4861 * allocation request specifies the highest possible
4862 * priority (VM_ALLOC_INTERRUPT). Otherwise, the
4863 * priority is normal.
4864 */
4865 mpte = vm_page_alloc(NULL, pmap_pde_pindex(va),
4866 (in_kernel ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
4867 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
4868
4869 /*
4870 * If the allocation of the new page table page fails,
4871 * invalidate the 2MB page mapping and return "failure".
4872 */
4873 if (mpte == NULL) {
4874 pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp);
4875 return (FALSE);
4876 }
4877
4878 if (!in_kernel) {
4879 mpte->wire_count = NPTEPG;
4880 pmap_resident_count_inc(pmap, 1);
4881 }
4882 }
4883 mptepa = VM_PAGE_TO_PHYS(mpte);
4884 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
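	/*
	 * "newpde" will map the page table page itself, while "newpte" is
	 * the template for its 512 4KB entries, derived from the old 2MB
	 * mapping with PG_PS cleared.
	 */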
4885 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
4886 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
4887 ("pmap_demote_pde: oldpde is missing PG_M"));
4888 newpte = oldpde & ~PG_PS;
4889 newpte = pmap_swap_pat(pmap, newpte);
4890
4891 /*
4892 * If the page table page is not leftover from an earlier promotion,
4893 * initialize it.
4894 */
4895 if (mpte->valid == 0)
4896 pmap_fill_ptp(firstpte, newpte);
4897
4898 pmap_demote_pde_check(firstpte, newpte);
4899
4900 /*
4901 * If the mapping has changed attributes, update the page table
4902 * entries.
4903 */
4904 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
4905 pmap_fill_ptp(firstpte, newpte);
4906
4907 /*
4908 * The spare PV entries must be reserved prior to demoting the
4909 * mapping, that is, prior to changing the PDE. Otherwise, the state
4910 * of the PDE and the PV lists will be inconsistent, which can result
4911 * in reclaim_pv_chunk() attempting to remove a PV entry from the
4912 * wrong PV list and pmap_pv_demote_pde() failing to find the expected
4913 * PV entry for the 2MB page mapping that is being demoted.
4914 */
4915 if ((oldpde & PG_MANAGED) != 0)
4916 reserve_pv_entries(pmap, NPTEPG - 1, lockp);
4917
4918 /*
4919 * Demote the mapping. This pmap is locked. The old PDE has
4920 * PG_A set. If the old PDE has PG_RW set, it also has PG_M
4921 * set. Thus, there is no danger of a race with another
4922 * processor changing the setting of PG_A and/or PG_M between
4923 * the read above and the store below.
4924 */
4925 if (workaround_erratum383)
4926 pmap_update_pde(pmap, va, pde, newpde);
4927 else
4928 pde_store(pde, newpde);
4929
4930 /*
4931 * Invalidate a stale recursive mapping of the page table page.
4932 */
4933 if (in_kernel)
4934 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
4935
4936 /*
4937 * Demote the PV entry.
4938 */
4939 if ((oldpde & PG_MANAGED) != 0)
4940 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
4941
4942 atomic_add_long(&pmap_pde_demotions, 1);
4943 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p",
4944 va, pmap);
4945 return (TRUE);
4946 }
4947
4948 /*
4949 * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
4950 */
4951 static void
4952 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
4953 {
4954 pd_entry_t newpde;
4955 vm_paddr_t mptepa;
4956 vm_page_t mpte;
4957
4958 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
4959 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4960 mpte = pmap_remove_pt_page(pmap, va);
4961 if (mpte == NULL)
4962 panic("pmap_remove_kernel_pde: Missing pt page.");
4963
4964 mptepa = VM_PAGE_TO_PHYS(mpte);
4965 newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
4966
4967 /*
4968 * If this page table page was unmapped by a promotion, then it
4969 * contains valid mappings. Zero it to invalidate those mappings.
4970 */
4971 if (mpte->valid != 0)
4972 pagezero((void *)PHYS_TO_DMAP(mptepa));
4973
4974 /*
4975 * Demote the mapping.
4976 */
4977 if (workaround_erratum383)
4978 pmap_update_pde(pmap, va, pde, newpde);
4979 else
4980 pde_store(pde, newpde);
4981
4982 /*
4983 * Invalidate a stale recursive mapping of the page table page.
4984 */
4985 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
4986 }
4987
4988 /*
4989 * pmap_remove_pde: do the things to unmap a superpage in a process
4990 */
4991 static int
4992 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
4993 struct spglist *free, struct rwlock **lockp)
4994 {
4995 struct md_page *pvh;
4996 pd_entry_t oldpde;
4997 vm_offset_t eva, va;
4998 vm_page_t m, mpte;
4999 pt_entry_t PG_G, PG_A, PG_M, PG_RW;
5000
5001 PG_G = pmap_global_bit(pmap);
5002 PG_A = pmap_accessed_bit(pmap);
5003 PG_M = pmap_modified_bit(pmap);
5004 PG_RW = pmap_rw_bit(pmap);
5005
5006 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5007 KASSERT((sva & PDRMASK) == 0,
5008 ("pmap_remove_pde: sva is not 2mpage aligned"));
5009 oldpde = pte_load_clear(pdq);
5010 if (oldpde & PG_W)
5011 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
5012 if ((oldpde & PG_G) != 0)
5013 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
5014 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
5015 if (oldpde & PG_MANAGED) {
5016 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
5017 pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
5018 pmap_pvh_free(pvh, pmap, sva);
5019 eva = sva + NBPDR;
5020 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
5021 va < eva; va += PAGE_SIZE, m++) {
5022 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
5023 vm_page_dirty(m);
5024 if (oldpde & PG_A)
5025 vm_page_aflag_set(m, PGA_REFERENCED);
5026 if (TAILQ_EMPTY(&m->md.pv_list) &&
5027 TAILQ_EMPTY(&pvh->pv_list))
5028 vm_page_aflag_clear(m, PGA_WRITEABLE);
5029 pmap_delayed_invl_page(m);
5030 }
5031 }
5032 if (pmap == kernel_pmap) {
5033 pmap_remove_kernel_pde(pmap, pdq, sva);
5034 } else {
5035 mpte = pmap_remove_pt_page(pmap, sva);
5036 if (mpte != NULL) {
5037 KASSERT(mpte->valid == VM_PAGE_BITS_ALL,
5038 ("pmap_remove_pde: pte page not promoted"));
5039 pmap_resident_count_dec(pmap, 1);
5040 KASSERT(mpte->wire_count == NPTEPG,
5041 ("pmap_remove_pde: pte page wire count error"));
5042 mpte->wire_count = 0;
5043 pmap_add_delayed_free_list(mpte, free, FALSE);
5044 }
5045 }
5046 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
5047 }
5048
5049 /*
5050 * pmap_remove_pte: do the things to unmap a page in a process
5051 */
5052 static int
5053 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
5054 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
5055 {
5056 struct md_page *pvh;
5057 pt_entry_t oldpte, PG_A, PG_M, PG_RW;
5058 vm_page_t m;
5059
5060 PG_A = pmap_accessed_bit(pmap);
5061 PG_M = pmap_modified_bit(pmap);
5062 PG_RW = pmap_rw_bit(pmap);
5063
5064 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5065 oldpte = pte_load_clear(ptq);
5066 if (oldpte & PG_W)
5067 pmap->pm_stats.wired_count -= 1;
5068 pmap_resident_count_dec(pmap, 1);
5069 if (oldpte & PG_MANAGED) {
5070 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
5071 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5072 vm_page_dirty(m);
5073 if (oldpte & PG_A)
5074 vm_page_aflag_set(m, PGA_REFERENCED);
5075 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
5076 pmap_pvh_free(&m->md, pmap, va);
5077 if (TAILQ_EMPTY(&m->md.pv_list) &&
5078 (m->flags & PG_FICTITIOUS) == 0) {
5079 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5080 if (TAILQ_EMPTY(&pvh->pv_list))
5081 vm_page_aflag_clear(m, PGA_WRITEABLE);
5082 }
5083 pmap_delayed_invl_page(m);
5084 }
5085 return (pmap_unuse_pt(pmap, va, ptepde, free));
5086 }
5087
5088 /*
5089 * Remove a single page from a process address space
5090 */
5091 static void
5092 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
5093 struct spglist *free)
5094 {
5095 struct rwlock *lock;
5096 pt_entry_t *pte, PG_V;
5097
5098 PG_V = pmap_valid_bit(pmap);
5099 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5100 if ((*pde & PG_V) == 0)
5101 return;
5102 pte = pmap_pde_to_pte(pde, va);
5103 if ((*pte & PG_V) == 0)
5104 return;
5105 lock = NULL;
5106 pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
5107 if (lock != NULL)
5108 rw_wunlock(lock);
5109 pmap_invalidate_page(pmap, va);
5110 }
5111
5112 /*
5113 * Removes the specified range of addresses from the page table page.
5114 */
5115 static bool
5116 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
5117 pd_entry_t *pde, struct spglist *free, struct rwlock **lockp)
5118 {
5119 pt_entry_t PG_G, *pte;
5120 vm_offset_t va;
5121 bool anyvalid;
5122
5123 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5124 PG_G = pmap_global_bit(pmap);
5125 anyvalid = false;
5126 va = eva;
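	/*
	 * "va" marks the start of a run of removed mappings, beginning
	 * with a global (PG_G) one, whose TLB entries are invalidated
	 * eagerly; va == eva means that no such run is pending.
	 */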
5127 for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++,
5128 sva += PAGE_SIZE) {
5129 if (*pte == 0) {
5130 if (va != eva) {
5131 pmap_invalidate_range(pmap, va, sva);
5132 va = eva;
5133 }
5134 continue;
5135 }
5136 if ((*pte & PG_G) == 0)
5137 anyvalid = true;
5138 else if (va == eva)
5139 va = sva;
5140 if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) {
5141 sva += PAGE_SIZE;
5142 break;
5143 }
5144 }
5145 if (va != eva)
5146 pmap_invalidate_range(pmap, va, sva);
5147 return (anyvalid);
5148 }
5149
5150 /*
5151 * Remove the given range of addresses from the specified map.
5152 *
5153 * It is assumed that the start and end are properly
5154 * rounded to the page size.
5155 */
5156 void
5157 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
5158 {
5159 struct rwlock *lock;
5160 vm_offset_t va_next;
5161 pml4_entry_t *pml4e;
5162 pdp_entry_t *pdpe;
5163 pd_entry_t ptpaddr, *pde;
5164 pt_entry_t PG_G, PG_V;
5165 struct spglist free;
5166 int anyvalid;
5167
5168 PG_G = pmap_global_bit(pmap);
5169 PG_V = pmap_valid_bit(pmap);
5170
5171 /*
5172 * Perform an unsynchronized read. This is, however, safe.
5173 */
5174 if (pmap->pm_stats.resident_count == 0)
5175 return;
5176
5177 anyvalid = 0;
5178 SLIST_INIT(&free);
5179
5180 pmap_delayed_invl_start();
5181 PMAP_LOCK(pmap);
5182 pmap_pkru_on_remove(pmap, sva, eva);
5183
5184 /*
5185 	 * Special handling for removing a single page: this is a very
5186 	 * common operation, and it is easy to short-circuit some code
5187 	 * for it.
5188 */
5189 if (sva + PAGE_SIZE == eva) {
5190 pde = pmap_pde(pmap, sva);
5191 if (pde && (*pde & PG_PS) == 0) {
5192 pmap_remove_page(pmap, sva, pde, &free);
5193 goto out;
5194 }
5195 }
5196
5197 lock = NULL;
5198 for (; sva < eva; sva = va_next) {
5199
5200 if (pmap->pm_stats.resident_count == 0)
5201 break;
5202
5203 pml4e = pmap_pml4e(pmap, sva);
5204 if ((*pml4e & PG_V) == 0) {
5205 va_next = (sva + NBPML4) & ~PML4MASK;
5206 if (va_next < sva)
5207 va_next = eva;
5208 continue;
5209 }
5210
5211 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
5212 if ((*pdpe & PG_V) == 0) {
5213 va_next = (sva + NBPDP) & ~PDPMASK;
5214 if (va_next < sva)
5215 va_next = eva;
5216 continue;
5217 }
5218
5219 /*
5220 * Calculate index for next page table.
5221 */
5222 va_next = (sva + NBPDR) & ~PDRMASK;
5223 if (va_next < sva)
5224 va_next = eva;
5225
5226 pde = pmap_pdpe_to_pde(pdpe, sva);
5227 ptpaddr = *pde;
5228
5229 /*
5230 * Weed out invalid mappings.
5231 */
5232 if (ptpaddr == 0)
5233 continue;
5234
5235 /*
5236 * Check for large page.
5237 */
5238 if ((ptpaddr & PG_PS) != 0) {
5239 /*
5240 * Are we removing the entire large page? If not,
5241 * demote the mapping and fall through.
5242 */
5243 if (sva + NBPDR == va_next && eva >= va_next) {
5244 /*
5245 * The TLB entry for a PG_G mapping is
5246 * invalidated by pmap_remove_pde().
5247 */
5248 if ((ptpaddr & PG_G) == 0)
5249 anyvalid = 1;
5250 pmap_remove_pde(pmap, pde, sva, &free, &lock);
5251 continue;
5252 } else if (!pmap_demote_pde_locked(pmap, pde, sva,
5253 &lock)) {
5254 /* The large page mapping was destroyed. */
5255 continue;
5256 } else
5257 ptpaddr = *pde;
5258 }
5259
5260 /*
5261 * Limit our scan to either the end of the va represented
5262 * by the current page table page, or to the end of the
5263 * range being removed.
5264 */
5265 if (va_next > eva)
5266 va_next = eva;
5267
5268 if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock))
5269 anyvalid = 1;
5270 }
5271 if (lock != NULL)
5272 rw_wunlock(lock);
5273 out:
5274 if (anyvalid)
5275 pmap_invalidate_all(pmap);
5276 PMAP_UNLOCK(pmap);
5277 pmap_delayed_invl_finish();
5278 vm_page_free_pages_toq(&free, true);
5279 }
5280
5281 /*
5282 * Routine: pmap_remove_all
5283 * Function:
5284 * Removes this physical page from
5285 * all physical maps in which it resides.
5286 * Reflects back modify bits to the pager.
5287 *
5288 * Notes:
5289 * Original versions of this routine were very
5290 * inefficient because they iteratively called
5291 * pmap_remove (slow...)
5292 */
5293
5294 void
5295 pmap_remove_all(vm_page_t m)
5296 {
5297 struct md_page *pvh;
5298 pv_entry_t pv;
5299 pmap_t pmap;
5300 struct rwlock *lock;
5301 pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
5302 pd_entry_t *pde;
5303 vm_offset_t va;
5304 struct spglist free;
5305 int pvh_gen, md_gen;
5306
5307 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5308 ("pmap_remove_all: page %p is not managed", m));
5309 SLIST_INIT(&free);
5310 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5311 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
5312 pa_to_pvh(VM_PAGE_TO_PHYS(m));
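	/*
	 * Acquiring the pmap lock may require dropping the pv list lock,
	 * during which the pv lists can change; the pv_gen generation
	 * counters detect such changes and force a restart.
	 */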
5313 retry:
5314 rw_wlock(lock);
5315 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
5316 pmap = PV_PMAP(pv);
5317 if (!PMAP_TRYLOCK(pmap)) {
5318 pvh_gen = pvh->pv_gen;
5319 rw_wunlock(lock);
5320 PMAP_LOCK(pmap);
5321 rw_wlock(lock);
5322 if (pvh_gen != pvh->pv_gen) {
5323 rw_wunlock(lock);
5324 PMAP_UNLOCK(pmap);
5325 goto retry;
5326 }
5327 }
5328 va = pv->pv_va;
5329 pde = pmap_pde(pmap, va);
5330 (void)pmap_demote_pde_locked(pmap, pde, va, &lock);
5331 PMAP_UNLOCK(pmap);
5332 }
5333 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
5334 pmap = PV_PMAP(pv);
5335 if (!PMAP_TRYLOCK(pmap)) {
5336 pvh_gen = pvh->pv_gen;
5337 md_gen = m->md.pv_gen;
5338 rw_wunlock(lock);
5339 PMAP_LOCK(pmap);
5340 rw_wlock(lock);
5341 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
5342 rw_wunlock(lock);
5343 PMAP_UNLOCK(pmap);
5344 goto retry;
5345 }
5346 }
5347 PG_A = pmap_accessed_bit(pmap);
5348 PG_M = pmap_modified_bit(pmap);
5349 PG_RW = pmap_rw_bit(pmap);
5350 pmap_resident_count_dec(pmap, 1);
5351 pde = pmap_pde(pmap, pv->pv_va);
5352 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
5353 " a 2mpage in page %p's pv list", m));
5354 pte = pmap_pde_to_pte(pde, pv->pv_va);
5355 tpte = pte_load_clear(pte);
5356 if (tpte & PG_W)
5357 pmap->pm_stats.wired_count--;
5358 if (tpte & PG_A)
5359 vm_page_aflag_set(m, PGA_REFERENCED);
5360
5361 /*
5362 * Update the vm_page_t clean and reference bits.
5363 */
5364 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5365 vm_page_dirty(m);
5366 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
5367 pmap_invalidate_page(pmap, pv->pv_va);
5368 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
5369 m->md.pv_gen++;
5370 free_pv_entry(pmap, pv);
5371 PMAP_UNLOCK(pmap);
5372 }
5373 vm_page_aflag_clear(m, PGA_WRITEABLE);
5374 rw_wunlock(lock);
5375 pmap_delayed_invl_wait(m);
5376 vm_page_free_pages_toq(&free, true);
5377 }
5378
5379 /*
5380 * pmap_protect_pde: do the things to protect a 2mpage in a process
5381 */
5382 static boolean_t
5383 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
5384 {
5385 pd_entry_t newpde, oldpde;
5386 vm_page_t m, mt;
5387 boolean_t anychanged;
5388 pt_entry_t PG_G, PG_M, PG_RW;
5389
5390 PG_G = pmap_global_bit(pmap);
5391 PG_M = pmap_modified_bit(pmap);
5392 PG_RW = pmap_rw_bit(pmap);
5393
5394 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5395 KASSERT((sva & PDRMASK) == 0,
5396 ("pmap_protect_pde: sva is not 2mpage aligned"));
5397 anychanged = FALSE;
5398 retry:
5399 oldpde = newpde = *pde;
5400 if ((prot & VM_PROT_WRITE) == 0) {
5401 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
5402 (PG_MANAGED | PG_M | PG_RW)) {
5403 m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
5404 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
5405 vm_page_dirty(mt);
5406 }
5407 newpde &= ~(PG_RW | PG_M);
5408 }
5409 if ((prot & VM_PROT_EXECUTE) == 0)
5410 newpde |= pg_nx;
5411 if (newpde != oldpde) {
5412 /*
5413 * As an optimization to future operations on this PDE, clear
5414 * PG_PROMOTED. The impending invalidation will remove any
5415 * lingering 4KB page mappings from the TLB.
5416 */
5417 if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED))
5418 goto retry;
5419 if ((oldpde & PG_G) != 0)
5420 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
5421 else
5422 anychanged = TRUE;
5423 }
5424 return (anychanged);
5425 }
5426
5427 /*
5428 * Set the physical protection on the
5429 * specified range of this map as requested.
5430 */
5431 void
5432 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
5433 {
5434 vm_offset_t va_next;
5435 pml4_entry_t *pml4e;
5436 pdp_entry_t *pdpe;
5437 pd_entry_t ptpaddr, *pde;
5438 pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
5439 boolean_t anychanged;
5440
5441 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
5442 if (prot == VM_PROT_NONE) {
5443 pmap_remove(pmap, sva, eva);
5444 return;
5445 }
5446
5447 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
5448 (VM_PROT_WRITE|VM_PROT_EXECUTE))
5449 return;
5450
5451 PG_G = pmap_global_bit(pmap);
5452 PG_M = pmap_modified_bit(pmap);
5453 PG_V = pmap_valid_bit(pmap);
5454 PG_RW = pmap_rw_bit(pmap);
5455 anychanged = FALSE;
5456
5457 /*
5458 * Although this function delays and batches the invalidation
5459 * of stale TLB entries, it does not need to call
5460 * pmap_delayed_invl_start() and
5461 * pmap_delayed_invl_finish(), because it does not
5462 * ordinarily destroy mappings. Stale TLB entries from
5463 * protection-only changes need only be invalidated before the
5464 * pmap lock is released, because protection-only changes do
5465 * not destroy PV entries. Even operations that iterate over
5466 * a physical page's PV list of mappings, like
5467 * pmap_remove_write(), acquire the pmap lock for each
5468 * mapping. Consequently, for protection-only changes, the
5469 * pmap lock suffices to synchronize both page table and TLB
5470 * updates.
5471 *
5472 * This function only destroys a mapping if pmap_demote_pde()
5473 * fails. In that case, stale TLB entries are immediately
5474 * invalidated.
5475 */
5476
5477 PMAP_LOCK(pmap);
5478 for (; sva < eva; sva = va_next) {
5479
5480 pml4e = pmap_pml4e(pmap, sva);
5481 if ((*pml4e & PG_V) == 0) {
5482 va_next = (sva + NBPML4) & ~PML4MASK;
5483 if (va_next < sva)
5484 va_next = eva;
5485 continue;
5486 }
5487
5488 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
5489 if ((*pdpe & PG_V) == 0) {
5490 va_next = (sva + NBPDP) & ~PDPMASK;
5491 if (va_next < sva)
5492 va_next = eva;
5493 continue;
5494 }
5495
5496 va_next = (sva + NBPDR) & ~PDRMASK;
5497 if (va_next < sva)
5498 va_next = eva;
5499
5500 pde = pmap_pdpe_to_pde(pdpe, sva);
5501 ptpaddr = *pde;
5502
5503 /*
5504 * Weed out invalid mappings.
5505 */
5506 if (ptpaddr == 0)
5507 continue;
5508
5509 /*
5510 * Check for large page.
5511 */
5512 if ((ptpaddr & PG_PS) != 0) {
5513 /*
5514 * Are we protecting the entire large page? If not,
5515 * demote the mapping and fall through.
5516 */
5517 if (sva + NBPDR == va_next && eva >= va_next) {
5518 /*
5519 * The TLB entry for a PG_G mapping is
5520 * invalidated by pmap_protect_pde().
5521 */
5522 if (pmap_protect_pde(pmap, pde, sva, prot))
5523 anychanged = TRUE;
5524 continue;
5525 } else if (!pmap_demote_pde(pmap, pde, sva)) {
5526 /*
5527 * The large page mapping was destroyed.
5528 */
5529 continue;
5530 }
5531 }
5532
5533 if (va_next > eva)
5534 va_next = eva;
5535
5536 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
5537 sva += PAGE_SIZE) {
5538 pt_entry_t obits, pbits;
5539 vm_page_t m;
5540
5541 retry:
5542 obits = pbits = *pte;
5543 if ((pbits & PG_V) == 0)
5544 continue;
5545
5546 if ((prot & VM_PROT_WRITE) == 0) {
5547 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
5548 (PG_MANAGED | PG_M | PG_RW)) {
5549 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
5550 vm_page_dirty(m);
5551 }
5552 pbits &= ~(PG_RW | PG_M);
5553 }
5554 if ((prot & VM_PROT_EXECUTE) == 0)
5555 pbits |= pg_nx;
5556
5557 if (pbits != obits) {
5558 if (!atomic_cmpset_long(pte, obits, pbits))
5559 goto retry;
5560 if (obits & PG_G)
5561 pmap_invalidate_page(pmap, sva);
5562 else
5563 anychanged = TRUE;
5564 }
5565 }
5566 }
5567 if (anychanged)
5568 pmap_invalidate_all(pmap);
5569 PMAP_UNLOCK(pmap);
5570 }
5571
5572 #if VM_NRESERVLEVEL > 0
5573 static bool
5574 pmap_pde_ept_executable(pmap_t pmap, pd_entry_t pde)
5575 {
5576
5577 if (pmap->pm_type != PT_EPT)
5578 return (false);
5579 return ((pde & EPT_PG_EXECUTE) != 0);
5580 }
5581
5582 /*
5583 * Tries to promote the 512, contiguous 4KB page mappings that are within a
5584 * single page table page (PTP) to a single 2MB page mapping. For promotion
5585 * to occur, two conditions must be met: (1) the 4KB page mappings must map
5586 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
5587 * identical characteristics.
5588 */
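/*
 * As a concrete illustration: suppose the PTP backing the 2MB region
 * containing "va" has a first PTE mapping physical address 0x200000
 * (2MB aligned) with PG_V, PG_A, PG_RW, and PG_M set.  Promotion can
 * succeed only if pte[i] maps 0x200000 + i * PAGE_SIZE for every i in
 * [0, NPTEPG) and every entry agrees in the bits covered by
 * PG_PTE_PROMOTE (protection, NX, cache attributes, and so on); a
 * single differing entry aborts the attempt and bumps
 * pmap_pde_p_failures.
 */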
5589 static void
5590 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
5591 struct rwlock **lockp)
5592 {
5593 pd_entry_t newpde;
5594 pt_entry_t *firstpte, oldpte, pa, *pte;
5595 pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK;
5596 vm_page_t mpte;
5597 int PG_PTE_CACHE;
5598
5599 PG_A = pmap_accessed_bit(pmap);
5600 PG_G = pmap_global_bit(pmap);
5601 PG_M = pmap_modified_bit(pmap);
5602 PG_V = pmap_valid_bit(pmap);
5603 PG_RW = pmap_rw_bit(pmap);
5604 PG_PKU_MASK = pmap_pku_mask_bit(pmap);
5605 PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
5606
5607 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5608
5609 /*
5610 * Examine the first PTE in the specified PTP. Abort if this PTE is
5611 * either invalid, unused, or does not map the first 4KB physical page
5612 * within a 2MB page.
5613 */
5614 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
5615 setpde:
5616 newpde = *firstpte;
5617 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V) ||
5618 !pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap,
5619 newpde))) {
5620 atomic_add_long(&pmap_pde_p_failures, 1);
5621 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
5622 " in pmap %p", va, pmap);
5623 return;
5624 }
5625 if ((newpde & (PG_M | PG_RW)) == PG_RW) {
5626 /*
5627 * When PG_M is already clear, PG_RW can be cleared without
5628 * a TLB invalidation.
5629 */
5630 if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
5631 goto setpde;
5632 newpde &= ~PG_RW;
5633 }
5634
5635 /*
5636 * Examine each of the other PTEs in the specified PTP. Abort if this
5637 * PTE maps an unexpected 4KB physical page or does not have identical
5638 * characteristics to the first PTE.
5639 */
5640 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
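/*
 * "pa" now holds the expected contents (frame | PG_A | PG_V) of the
 * last PTE in the PTP; the loop below walks from pte[NPTEPG - 1] down
 * to pte[1], decrementing the expected frame by PAGE_SIZE at each
 * step.
 */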
5641 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
5642 setpte:
5643 oldpte = *pte;
5644 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
5645 atomic_add_long(&pmap_pde_p_failures, 1);
5646 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
5647 " in pmap %p", va, pmap);
5648 return;
5649 }
5650 if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
5651 /*
5652 * When PG_M is already clear, PG_RW can be cleared
5653 * without a TLB invalidation.
5654 */
5655 if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
5656 goto setpte;
5657 oldpte &= ~PG_RW;
5658 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
5659 " in pmap %p", (oldpte & PG_FRAME & PDRMASK) |
5660 (va & ~PDRMASK), pmap);
5661 }
5662 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
5663 atomic_add_long(&pmap_pde_p_failures, 1);
5664 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
5665 " in pmap %p", va, pmap);
5666 return;
5667 }
5668 pa -= PAGE_SIZE;
5669 }
5670
5671 /*
5672 * Save the page table page in its current state until the PDE
5673 * mapping the superpage is demoted by pmap_demote_pde() or
5674 * destroyed by pmap_remove_pde().
5675 */
5676 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
5677 KASSERT(mpte >= vm_page_array &&
5678 mpte < &vm_page_array[vm_page_array_size],
5679 ("pmap_promote_pde: page table page is out of range"));
5680 KASSERT(mpte->pindex == pmap_pde_pindex(va),
5681 ("pmap_promote_pde: page table page's pindex is wrong"));
5682 if (pmap_insert_pt_page(pmap, mpte, true)) {
5683 atomic_add_long(&pmap_pde_p_failures, 1);
5684 CTR2(KTR_PMAP,
5685 "pmap_promote_pde: failure for va %#lx in pmap %p", va,
5686 pmap);
5687 return;
5688 }
5689
5690 /*
5691 * Promote the pv entries.
5692 */
5693 if ((newpde & PG_MANAGED) != 0)
5694 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
5695
5696 /*
5697 * Propagate the PAT index to its proper position.
5698 */
5699 newpde = pmap_swap_pat(pmap, newpde);
5700
5701 /*
5702 * Map the superpage.
5703 */
5704 if (workaround_erratum383)
5705 pmap_update_pde(pmap, va, pde, PG_PS | newpde);
5706 else
5707 pde_store(pde, PG_PROMOTED | PG_PS | newpde);
5708
5709 atomic_add_long(&pmap_pde_promotions, 1);
5710 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
5711 " in pmap %p", va, pmap);
5712 }
5713 #endif /* VM_NRESERVLEVEL > 0 */
5714
5715 /*
5716 * Insert the given physical page (p) at
5717 * the specified virtual address (v) in the
5718 * target physical map with the protection requested.
5719 *
5720 * If specified, the page will be wired down, meaning
5721 * that the related pte can not be reclaimed.
5722 *
5723 * NB: This is the only routine which MAY NOT lazy-evaluate
5724 * or lose information. That is, this routine must actually
5725 * insert this page into the given map NOW.
5726 *
5727 * When destroying both a page table and PV entry, this function
5728 * performs the TLB invalidation before releasing the PV list
5729 * lock, so we do not need pmap_delayed_invl_page() calls here.
5730 */
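/*
 * As a sketch of the interface (the caller details here are assumed,
 * not taken from this file): a fault handler holding a busied,
 * resident page "m" might establish a wired, writable 4KB mapping with
 *
 *	rv = pmap_enter(pmap, va, m, VM_PROT_READ | VM_PROT_WRITE,
 *	    VM_PROT_WRITE | PMAP_ENTER_WIRED, 0);
 *
 * "prot" selects the permissions of the new PTE, while the access type
 * carried in "flags" lets PG_M be preset (avoiding a later dirty
 * fault), PMAP_ENTER_WIRED sets PG_W, and psind == 0 requests a 4KB
 * rather than a 2MB mapping.  In this 4KB path KERN_RESOURCE_SHORTAGE
 * is returned only if PMAP_ENTER_NOSLEEP was given and a page table
 * page could not be allocated.
 */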
5731 int
5732 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
5733 u_int flags, int8_t psind)
5734 {
5735 struct rwlock *lock;
5736 pd_entry_t *pde;
5737 pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
5738 pt_entry_t newpte, origpte;
5739 pv_entry_t pv;
5740 vm_paddr_t opa, pa;
5741 vm_page_t mpte, om;
5742 int rv;
5743 boolean_t nosleep;
5744
5745 PG_A = pmap_accessed_bit(pmap);
5746 PG_G = pmap_global_bit(pmap);
5747 PG_M = pmap_modified_bit(pmap);
5748 PG_V = pmap_valid_bit(pmap);
5749 PG_RW = pmap_rw_bit(pmap);
5750
5751 va = trunc_page(va);
5752 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
5753 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
5754 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
5755 va));
5756 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
5757 va >= kmi.clean_eva,
5758 ("pmap_enter: managed mapping within the clean submap"));
5759 if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
5760 VM_OBJECT_ASSERT_LOCKED(m->object);
5761 KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
5762 ("pmap_enter: flags %u has reserved bits set", flags));
5763 pa = VM_PAGE_TO_PHYS(m);
5764 newpte = (pt_entry_t)(pa | PG_A | PG_V);
5765 if ((flags & VM_PROT_WRITE) != 0)
5766 newpte |= PG_M;
5767 if ((prot & VM_PROT_WRITE) != 0)
5768 newpte |= PG_RW;
5769 KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
5770 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
5771 if ((prot & VM_PROT_EXECUTE) == 0)
5772 newpte |= pg_nx;
5773 if ((flags & PMAP_ENTER_WIRED) != 0)
5774 newpte |= PG_W;
5775 if (va < VM_MAXUSER_ADDRESS)
5776 newpte |= PG_U;
5777 if (pmap == kernel_pmap)
5778 newpte |= PG_G;
5779 newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0);
5780
5781 /*
5782 * Set modified bit gratuitously for writeable mappings if
5783 * the page is unmanaged. We do not want to take a fault
5784 * to do the dirty bit accounting for these mappings.
5785 */
5786 if ((m->oflags & VPO_UNMANAGED) != 0) {
5787 if ((newpte & PG_RW) != 0)
5788 newpte |= PG_M;
5789 } else
5790 newpte |= PG_MANAGED;
5791
5792 lock = NULL;
5793 PMAP_LOCK(pmap);
5794 if (psind == 1) {
5795 /* Assert the required virtual and physical alignment. */
5796 KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned"));
5797 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
5798 rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock);
5799 goto out;
5800 }
5801 mpte = NULL;
5802
5803 /*
5804 * In the case that a page table page is not
5805 * resident, we are creating it here.
5806 */
5807 retry:
5808 pde = pmap_pde(pmap, va);
5809 if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
5810 pmap_demote_pde_locked(pmap, pde, va, &lock))) {
5811 pte = pmap_pde_to_pte(pde, va);
5812 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
5813 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
5814 mpte->wire_count++;
5815 }
5816 } else if (va < VM_MAXUSER_ADDRESS) {
5817 /*
5818 * Here if the pte page isn't mapped, or if it has been
5819 * deallocated.
5820 */
5821 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
5822 mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va),
5823 nosleep ? NULL : &lock);
5824 if (mpte == NULL && nosleep) {
5825 rv = KERN_RESOURCE_SHORTAGE;
5826 goto out;
5827 }
5828 goto retry;
5829 } else
5830 panic("pmap_enter: invalid page directory va=%#lx", va);
5831
5832 origpte = *pte;
5833 pv = NULL;
5834 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86)
5835 newpte |= pmap_pkru_get(pmap, va);
5836
5837 /*
5838 * Is the specified virtual address already mapped?
5839 */
5840 if ((origpte & PG_V) != 0) {
5841 /*
5842 * Wiring change, just update stats. We don't worry about
5843 * wiring PT pages as they remain resident as long as there
5844 * are valid mappings in them. Hence, if a user page is wired,
5845 * the PT page will be also.
5846 */
5847 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
5848 pmap->pm_stats.wired_count++;
5849 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
5850 pmap->pm_stats.wired_count--;
5851
5852 /*
5853 * Remove the extra PT page reference.
5854 */
5855 if (mpte != NULL) {
5856 mpte->wire_count--;
5857 KASSERT(mpte->wire_count > 0,
5858 ("pmap_enter: missing reference to page table page,"
5859 " va: 0x%lx", va));
5860 }
5861
5862 /*
5863 * Has the physical page changed?
5864 */
5865 opa = origpte & PG_FRAME;
5866 if (opa == pa) {
5867 /*
5868 * No, might be a protection or wiring change.
5869 */
5870 if ((origpte & PG_MANAGED) != 0 &&
5871 (newpte & PG_RW) != 0)
5872 vm_page_aflag_set(m, PGA_WRITEABLE);
5873 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
5874 goto unchanged;
5875 goto validate;
5876 }
5877
5878 /*
5879 * The physical page has changed. Temporarily invalidate
5880 * the mapping. This ensures that all threads sharing the
5881 * pmap keep a consistent view of the mapping, which is
5882 * necessary for the correct handling of COW faults. It
5883 * also permits reuse of the old mapping's PV entry,
5884 * avoiding an allocation.
5885 *
5886 * For consistency, handle unmanaged mappings the same way.
5887 */
5888 origpte = pte_load_clear(pte);
5889 KASSERT((origpte & PG_FRAME) == opa,
5890 ("pmap_enter: unexpected pa update for %#lx", va));
5891 if ((origpte & PG_MANAGED) != 0) {
5892 om = PHYS_TO_VM_PAGE(opa);
5893
5894 /*
5895 * The pmap lock is sufficient to synchronize with
5896 * concurrent calls to pmap_page_test_mappings() and
5897 * pmap_ts_referenced().
5898 */
5899 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5900 vm_page_dirty(om);
5901 if ((origpte & PG_A) != 0)
5902 vm_page_aflag_set(om, PGA_REFERENCED);
5903 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
5904 pv = pmap_pvh_remove(&om->md, pmap, va);
5905 KASSERT(pv != NULL,
5906 ("pmap_enter: no PV entry for %#lx", va));
5907 if ((newpte & PG_MANAGED) == 0)
5908 free_pv_entry(pmap, pv);
5909 if ((om->aflags & PGA_WRITEABLE) != 0 &&
5910 TAILQ_EMPTY(&om->md.pv_list) &&
5911 ((om->flags & PG_FICTITIOUS) != 0 ||
5912 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
5913 vm_page_aflag_clear(om, PGA_WRITEABLE);
5914 }
5915 if ((origpte & PG_A) != 0)
5916 pmap_invalidate_page(pmap, va);
5917 origpte = 0;
5918 } else {
5919 /*
5920 * Increment the counters.
5921 */
5922 if ((newpte & PG_W) != 0)
5923 pmap->pm_stats.wired_count++;
5924 pmap_resident_count_inc(pmap, 1);
5925 }
5926
5927 /*
5928 * Enter on the PV list if part of our managed memory.
5929 */
5930 if ((newpte & PG_MANAGED) != 0) {
5931 if (pv == NULL) {
5932 pv = get_pv_entry(pmap, &lock);
5933 pv->pv_va = va;
5934 }
5935 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
5936 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5937 m->md.pv_gen++;
5938 if ((newpte & PG_RW) != 0)
5939 vm_page_aflag_set(m, PGA_WRITEABLE);
5940 }
5941
5942 /*
5943 * Update the PTE.
5944 */
5945 if ((origpte & PG_V) != 0) {
5946 validate:
5947 origpte = pte_load_store(pte, newpte);
5948 KASSERT((origpte & PG_FRAME) == pa,
5949 ("pmap_enter: unexpected pa update for %#lx", va));
5950 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) ==
5951 (PG_M | PG_RW)) {
5952 if ((origpte & PG_MANAGED) != 0)
5953 vm_page_dirty(m);
5954
5955 /*
5956 * Although the PTE may still have PG_RW set, TLB
5957 * invalidation may nonetheless be required because
5958 * the PTE no longer has PG_M set.
5959 */
5960 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
5961 /*
5962 * This PTE change does not require TLB invalidation.
5963 */
5964 goto unchanged;
5965 }
5966 if ((origpte & PG_A) != 0)
5967 pmap_invalidate_page(pmap, va);
5968 } else
5969 pte_store(pte, newpte);
5970
5971 unchanged:
5972
5973 #if VM_NRESERVLEVEL > 0
5974 /*
5975 * If both the page table page and the reservation are fully
5976 * populated, then attempt promotion.
5977 */
5978 if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
5979 pmap_ps_enabled(pmap) &&
5980 (m->flags & PG_FICTITIOUS) == 0 &&
5981 vm_reserv_level_iffullpop(m) == 0)
5982 pmap_promote_pde(pmap, pde, va, &lock);
5983 #endif
5984
5985 rv = KERN_SUCCESS;
5986 out:
5987 if (lock != NULL)
5988 rw_wunlock(lock);
5989 PMAP_UNLOCK(pmap);
5990 return (rv);
5991 }
5992
5993 /*
5994 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true
5995 * if successful. Returns false if (1) a page table page cannot be allocated
5996 * without sleeping, (2) a mapping already exists at the specified virtual
5997 * address, or (3) a PV entry cannot be allocated without reclaiming another
5998 * PV entry.
5999 */
6000 static bool
6001 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
6002 struct rwlock **lockp)
6003 {
6004 pd_entry_t newpde;
6005 pt_entry_t PG_V;
6006
6007 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6008 PG_V = pmap_valid_bit(pmap);
6009 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
6010 PG_PS | PG_V;
6011 if ((m->oflags & VPO_UNMANAGED) == 0)
6012 newpde |= PG_MANAGED;
6013 if ((prot & VM_PROT_EXECUTE) == 0)
6014 newpde |= pg_nx;
6015 if (va < VM_MAXUSER_ADDRESS)
6016 newpde |= PG_U;
6017 return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP |
6018 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) ==
6019 KERN_SUCCESS);
6020 }
6021
6022 /*
6023 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if
6024 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
6025 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
6026 * a mapping already exists at the specified virtual address. Returns
6027 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
6028 * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if
6029 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
6030 *
6031 * The parameter "m" is only used when creating a managed, writeable mapping.
6032 */
6033 static int
6034 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
6035 vm_page_t m, struct rwlock **lockp)
6036 {
6037 struct spglist free;
6038 pd_entry_t oldpde, *pde;
6039 pt_entry_t PG_G, PG_RW, PG_V;
6040 vm_page_t mt, pdpg;
6041
6042 PG_G = pmap_global_bit(pmap);
6043 PG_RW = pmap_rw_bit(pmap);
6044 KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW,
6045 ("pmap_enter_pde: newpde is missing PG_M"));
6046 PG_V = pmap_valid_bit(pmap);
6047 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6048
6049 if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap,
6050 newpde))) {
6051 CTR2(KTR_PMAP, "pmap_enter_pde: 2m x blocked for va %#lx"
6052 " in pmap %p", va, pmap);
6053 return (KERN_FAILURE);
6054 }
6055 if ((pdpg = pmap_allocpde(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
6056 NULL : lockp)) == NULL) {
6057 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
6058 " in pmap %p", va, pmap);
6059 return (KERN_RESOURCE_SHORTAGE);
6060 }
6061
6062 /*
6063 	 * If pkru is not the same for the whole pde range, return failure
6064 * and let vm_fault() cope. Check after pde allocation, since
6065 * it could sleep.
6066 */
6067 if (!pmap_pkru_same(pmap, va, va + NBPDR)) {
6068 SLIST_INIT(&free);
6069 if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
6070 pmap_invalidate_page(pmap, va);
6071 vm_page_free_pages_toq(&free, true);
6072 }
6073 return (KERN_FAILURE);
6074 }
6075 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) {
6076 newpde &= ~X86_PG_PKU_MASK;
6077 newpde |= pmap_pkru_get(pmap, va);
6078 }
6079
6080 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
6081 pde = &pde[pmap_pde_index(va)];
6082 oldpde = *pde;
6083 if ((oldpde & PG_V) != 0) {
6084 KASSERT(pdpg->wire_count > 1,
6085 ("pmap_enter_pde: pdpg's wire count is too low"));
6086 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
6087 pdpg->wire_count--;
6088 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
6089 " in pmap %p", va, pmap);
6090 return (KERN_FAILURE);
6091 }
6092 /* Break the existing mapping(s). */
6093 SLIST_INIT(&free);
6094 if ((oldpde & PG_PS) != 0) {
6095 /*
6096 * The reference to the PD page that was acquired by
6097 * pmap_allocpde() ensures that it won't be freed.
6098 * However, if the PDE resulted from a promotion, then
6099 * a reserved PT page could be freed.
6100 */
6101 (void)pmap_remove_pde(pmap, pde, va, &free, lockp);
6102 if ((oldpde & PG_G) == 0)
6103 pmap_invalidate_pde_page(pmap, va, oldpde);
6104 } else {
6105 pmap_delayed_invl_start();
6106 if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free,
6107 lockp))
6108 pmap_invalidate_all(pmap);
6109 pmap_delayed_invl_finish();
6110 }
6111 vm_page_free_pages_toq(&free, true);
6112 if (va >= VM_MAXUSER_ADDRESS) {
6113 /*
6114 * Both pmap_remove_pde() and pmap_remove_ptes() will
6115 * leave the kernel page table page zero filled.
6116 */
6117 mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
6118 if (pmap_insert_pt_page(pmap, mt, false))
6119 panic("pmap_enter_pde: trie insert failed");
6120 } else
6121 KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p",
6122 pde));
6123 }
6124 if ((newpde & PG_MANAGED) != 0) {
6125 /*
6126 * Abort this mapping if its PV entry could not be created.
6127 */
6128 if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) {
6129 SLIST_INIT(&free);
6130 if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
6131 /*
6132 * Although "va" is not mapped, paging-
6133 * structure caches could nonetheless have
6134 * entries that refer to the freed page table
6135 * pages. Invalidate those entries.
6136 */
6137 pmap_invalidate_page(pmap, va);
6138 vm_page_free_pages_toq(&free, true);
6139 }
6140 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
6141 " in pmap %p", va, pmap);
6142 return (KERN_RESOURCE_SHORTAGE);
6143 }
6144 if ((newpde & PG_RW) != 0) {
6145 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
6146 vm_page_aflag_set(mt, PGA_WRITEABLE);
6147 }
6148 }
6149
6150 /*
6151 * Increment counters.
6152 */
6153 if ((newpde & PG_W) != 0)
6154 pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE;
6155 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
6156
6157 /*
6158 * Map the superpage. (This is not a promoted mapping; there will not
6159 * be any lingering 4KB page mappings in the TLB.)
6160 */
6161 pde_store(pde, newpde);
6162
6163 atomic_add_long(&pmap_pde_mappings, 1);
6164 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
6165 " in pmap %p", va, pmap);
6166 return (KERN_SUCCESS);
6167 }
6168
6169 /*
6170 * Maps a sequence of resident pages belonging to the same object.
6171 * The sequence begins with the given page m_start. This page is
6172 * mapped at the given virtual address start. Each subsequent page is
6173 * mapped at a virtual address that is offset from start by the same
6174 * amount as the page is offset from m_start within the object. The
6175 * last page in the sequence is the page with the largest offset from
6176 * m_start that can be mapped at a virtual address less than the given
6177 * virtual address end. Not every virtual page between start and end
6178 * is mapped; only those for which a resident page exists with the
6179 * corresponding offset from m_start are mapped.
6180 */
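/*
 * For example, if m_start has pindex 10 and is entered at "start",
 * then a resident page with pindex 13 is entered at start + ptoa(3),
 * while pages whose computed address would reach "end" are left
 * unmapped.  When a 2MB-aligned address, a page with psind == 1
 * (backed by a full 2MB run), and a pmap with superpages enabled line
 * up, a single pmap_enter_2mpage() call replaces 512 individual 4KB
 * entries.
 */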
6181 void
6182 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
6183 vm_page_t m_start, vm_prot_t prot)
6184 {
6185 struct rwlock *lock;
6186 vm_offset_t va;
6187 vm_page_t m, mpte;
6188 vm_pindex_t diff, psize;
6189
6190 VM_OBJECT_ASSERT_LOCKED(m_start->object);
6191
6192 psize = atop(end - start);
6193 mpte = NULL;
6194 m = m_start;
6195 lock = NULL;
6196 PMAP_LOCK(pmap);
6197 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
6198 va = start + ptoa(diff);
6199 if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
6200 m->psind == 1 && pmap_ps_enabled(pmap) &&
6201 pmap_allow_2m_x_page(pmap, (prot & VM_PROT_EXECUTE) != 0) &&
6202 pmap_enter_2mpage(pmap, va, m, prot, &lock))
6203 m = &m[NBPDR / PAGE_SIZE - 1];
6204 else
6205 mpte = pmap_enter_quick_locked(pmap, va, m, prot,
6206 mpte, &lock);
6207 m = TAILQ_NEXT(m, listq);
6208 }
6209 if (lock != NULL)
6210 rw_wunlock(lock);
6211 PMAP_UNLOCK(pmap);
6212 }
6213
6214 /*
6215 * this code makes some *MAJOR* assumptions:
6216  * 1. Current pmap & target pmap exist.
6217 * 2. Not wired.
6218 * 3. Read access.
6219 * 4. No page table pages.
6220 * but is *MUCH* faster than pmap_enter...
6221 */
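/*
 * In other words, pmap_enter_quick() opportunistically installs an
 * unwired, read-only (and possibly NX) 4KB mapping and never sleeps;
 * if a page table page or PV entry cannot be obtained, or a mapping
 * already exists at "va", it simply gives up.  That behavior makes it
 * suitable for speculative prefaulting of neighboring resident pages.
 */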
6222
6223 void
6224 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
6225 {
6226 struct rwlock *lock;
6227
6228 lock = NULL;
6229 PMAP_LOCK(pmap);
6230 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
6231 if (lock != NULL)
6232 rw_wunlock(lock);
6233 PMAP_UNLOCK(pmap);
6234 }
6235
6236 static vm_page_t
6237 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
6238 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
6239 {
6240 struct spglist free;
6241 pt_entry_t newpte, *pte, PG_V;
6242
6243 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
6244 (m->oflags & VPO_UNMANAGED) != 0,
6245 ("pmap_enter_quick_locked: managed mapping within the clean submap"));
6246 PG_V = pmap_valid_bit(pmap);
6247 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6248
6249 /*
6250 * In the case that a page table page is not
6251 * resident, we are creating it here.
6252 */
6253 if (va < VM_MAXUSER_ADDRESS) {
6254 vm_pindex_t ptepindex;
6255 pd_entry_t *ptepa;
6256
6257 /*
6258 * Calculate pagetable page index
6259 */
6260 ptepindex = pmap_pde_pindex(va);
6261 if (mpte && (mpte->pindex == ptepindex)) {
6262 mpte->wire_count++;
6263 } else {
6264 /*
6265 * Get the page directory entry
6266 */
6267 ptepa = pmap_pde(pmap, va);
6268
6269 /*
6270 * If the page table page is mapped, we just increment
6271 * the hold count, and activate it. Otherwise, we
6272 * attempt to allocate a page table page. If this
6273 * attempt fails, we don't retry. Instead, we give up.
6274 */
6275 if (ptepa && (*ptepa & PG_V) != 0) {
6276 if (*ptepa & PG_PS)
6277 return (NULL);
6278 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
6279 mpte->wire_count++;
6280 } else {
6281 /*
6282 * Pass NULL instead of the PV list lock
6283 * pointer, because we don't intend to sleep.
6284 */
6285 mpte = _pmap_allocpte(pmap, ptepindex, NULL);
6286 if (mpte == NULL)
6287 return (mpte);
6288 }
6289 }
6290 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
6291 pte = &pte[pmap_pte_index(va)];
6292 } else {
6293 mpte = NULL;
6294 pte = vtopte(va);
6295 }
6296 if (*pte) {
6297 if (mpte != NULL) {
6298 mpte->wire_count--;
6299 mpte = NULL;
6300 }
6301 return (mpte);
6302 }
6303
6304 /*
6305 * Enter on the PV list if part of our managed memory.
6306 */
6307 if ((m->oflags & VPO_UNMANAGED) == 0 &&
6308 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
6309 if (mpte != NULL) {
6310 SLIST_INIT(&free);
6311 if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
6312 /*
6313 * Although "va" is not mapped, paging-
6314 * structure caches could nonetheless have
6315 * entries that refer to the freed page table
6316 * pages. Invalidate those entries.
6317 */
6318 pmap_invalidate_page(pmap, va);
6319 vm_page_free_pages_toq(&free, true);
6320 }
6321 mpte = NULL;
6322 }
6323 return (mpte);
6324 }
6325
6326 /*
6327 * Increment counters
6328 */
6329 pmap_resident_count_inc(pmap, 1);
6330
6331 newpte = VM_PAGE_TO_PHYS(m) | PG_V |
6332 pmap_cache_bits(pmap, m->md.pat_mode, 0);
6333 if ((m->oflags & VPO_UNMANAGED) == 0)
6334 newpte |= PG_MANAGED;
6335 if ((prot & VM_PROT_EXECUTE) == 0)
6336 newpte |= pg_nx;
6337 if (va < VM_MAXUSER_ADDRESS)
6338 newpte |= PG_U | pmap_pkru_get(pmap, va);
6339 pte_store(pte, newpte);
6340 return (mpte);
6341 }
6342
6343 /*
6344 * Make a temporary mapping for a physical address. This is only intended
6345 * to be used for panic dumps.
6346 */
6347 void *
6348 pmap_kenter_temporary(vm_paddr_t pa, int i)
6349 {
6350 vm_offset_t va;
6351
6352 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
6353 pmap_kenter(va, pa);
6354 invlpg(va);
6355 return ((void *)crashdumpmap);
6356 }
6357
6358 /*
6359 * This code maps large physical mmap regions into the
6360 * processor address space. Note that some shortcuts
6361 * are taken, but the code works.
6362 */
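/*
 * Illustratively: pre-mapping a device object this way succeeds only
 * when "addr" and "size" are both 2MB multiples, the backing pages are
 * physically contiguous starting on a 2MB boundary, and they share one
 * PAT mode; each 2MB chunk then receives a single read/write, user,
 * already-dirty PDE.  Any failed precondition simply means no mappings
 * are created here, since this is only an optimization and normal
 * faulting will establish them later.
 */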
6363 void
6364 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
6365 vm_pindex_t pindex, vm_size_t size)
6366 {
6367 pd_entry_t *pde;
6368 pt_entry_t PG_A, PG_M, PG_RW, PG_V;
6369 vm_paddr_t pa, ptepa;
6370 vm_page_t p, pdpg;
6371 int pat_mode;
6372
6373 PG_A = pmap_accessed_bit(pmap);
6374 PG_M = pmap_modified_bit(pmap);
6375 PG_V = pmap_valid_bit(pmap);
6376 PG_RW = pmap_rw_bit(pmap);
6377
6378 VM_OBJECT_ASSERT_WLOCKED(object);
6379 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
6380 ("pmap_object_init_pt: non-device object"));
6381 if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
6382 if (!pmap_ps_enabled(pmap))
6383 return;
6384 if (!vm_object_populate(object, pindex, pindex + atop(size)))
6385 return;
6386 p = vm_page_lookup(object, pindex);
6387 KASSERT(p->valid == VM_PAGE_BITS_ALL,
6388 ("pmap_object_init_pt: invalid page %p", p));
6389 pat_mode = p->md.pat_mode;
6390
6391 /*
6392 * Abort the mapping if the first page is not physically
6393 * aligned to a 2MB page boundary.
6394 */
6395 ptepa = VM_PAGE_TO_PHYS(p);
6396 if (ptepa & (NBPDR - 1))
6397 return;
6398
6399 /*
6400 * Skip the first page. Abort the mapping if the rest of
6401 * the pages are not physically contiguous or have differing
6402 * memory attributes.
6403 */
6404 p = TAILQ_NEXT(p, listq);
6405 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
6406 pa += PAGE_SIZE) {
6407 KASSERT(p->valid == VM_PAGE_BITS_ALL,
6408 ("pmap_object_init_pt: invalid page %p", p));
6409 if (pa != VM_PAGE_TO_PHYS(p) ||
6410 pat_mode != p->md.pat_mode)
6411 return;
6412 p = TAILQ_NEXT(p, listq);
6413 }
6414
6415 /*
6416 * Map using 2MB pages. Since "ptepa" is 2M aligned and
6417 * "size" is a multiple of 2M, adding the PAT setting to "pa"
6418 * will not affect the termination of this loop.
6419 */
6420 PMAP_LOCK(pmap);
6421 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
6422 pa < ptepa + size; pa += NBPDR) {
6423 pdpg = pmap_allocpde(pmap, addr, NULL);
6424 if (pdpg == NULL) {
6425 /*
6426 * The creation of mappings below is only an
6427 * optimization. If a page directory page
6428 * cannot be allocated without blocking,
6429 * continue on to the next mapping rather than
6430 * blocking.
6431 */
6432 addr += NBPDR;
6433 continue;
6434 }
6435 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
6436 pde = &pde[pmap_pde_index(addr)];
6437 if ((*pde & PG_V) == 0) {
6438 pde_store(pde, pa | PG_PS | PG_M | PG_A |
6439 PG_U | PG_RW | PG_V);
6440 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
6441 atomic_add_long(&pmap_pde_mappings, 1);
6442 } else {
6443 /* Continue on if the PDE is already valid. */
6444 pdpg->wire_count--;
6445 KASSERT(pdpg->wire_count > 0,
6446 ("pmap_object_init_pt: missing reference "
6447 "to page directory page, va: 0x%lx", addr));
6448 }
6449 addr += NBPDR;
6450 }
6451 PMAP_UNLOCK(pmap);
6452 }
6453 }
6454
6455 /*
6456 * Clear the wired attribute from the mappings for the specified range of
6457 * addresses in the given pmap. Every valid mapping within that range
6458 * must have the wired attribute set. In contrast, invalid mappings
6459 * cannot have the wired attribute set, so they are ignored.
6460 *
6461 * The wired attribute of the page table entry is not a hardware
6462 * feature, so there is no need to invalidate any TLB entries.
6463 * Since pmap_demote_pde() for the wired entry must never fail,
6464 * pmap_delayed_invl_start()/finish() calls around the
6465 * function are not needed.
6466 */
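/*
 * For example, unwiring an entire wired 2MB mapping just clears PG_W
 * in its PDE and drops wired_count by NBPDR / PAGE_SIZE (512); if only
 * part of the 2MB range is being unwired, the mapping is first demoted
 * to 512 4KB PTEs and the covered PTEs are then unwired one at a time.
 */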
6467 void
6468 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
6469 {
6470 vm_offset_t va_next;
6471 pml4_entry_t *pml4e;
6472 pdp_entry_t *pdpe;
6473 pd_entry_t *pde;
6474 pt_entry_t *pte, PG_V;
6475
6476 PG_V = pmap_valid_bit(pmap);
6477 PMAP_LOCK(pmap);
6478 for (; sva < eva; sva = va_next) {
6479 pml4e = pmap_pml4e(pmap, sva);
6480 if ((*pml4e & PG_V) == 0) {
6481 va_next = (sva + NBPML4) & ~PML4MASK;
6482 if (va_next < sva)
6483 va_next = eva;
6484 continue;
6485 }
6486 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
6487 if ((*pdpe & PG_V) == 0) {
6488 va_next = (sva + NBPDP) & ~PDPMASK;
6489 if (va_next < sva)
6490 va_next = eva;
6491 continue;
6492 }
6493 va_next = (sva + NBPDR) & ~PDRMASK;
6494 if (va_next < sva)
6495 va_next = eva;
6496 pde = pmap_pdpe_to_pde(pdpe, sva);
6497 if ((*pde & PG_V) == 0)
6498 continue;
6499 if ((*pde & PG_PS) != 0) {
6500 if ((*pde & PG_W) == 0)
6501 panic("pmap_unwire: pde %#jx is missing PG_W",
6502 (uintmax_t)*pde);
6503
6504 /*
6505 * Are we unwiring the entire large page? If not,
6506 * demote the mapping and fall through.
6507 */
6508 if (sva + NBPDR == va_next && eva >= va_next) {
6509 atomic_clear_long(pde, PG_W);
6510 pmap->pm_stats.wired_count -= NBPDR /
6511 PAGE_SIZE;
6512 continue;
6513 } else if (!pmap_demote_pde(pmap, pde, sva))
6514 panic("pmap_unwire: demotion failed");
6515 }
6516 if (va_next > eva)
6517 va_next = eva;
6518 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
6519 sva += PAGE_SIZE) {
6520 if ((*pte & PG_V) == 0)
6521 continue;
6522 if ((*pte & PG_W) == 0)
6523 panic("pmap_unwire: pte %#jx is missing PG_W",
6524 (uintmax_t)*pte);
6525
6526 /*
6527 * PG_W must be cleared atomically. Although the pmap
6528 * lock synchronizes access to PG_W, another processor
6529 * could be setting PG_M and/or PG_A concurrently.
6530 */
6531 atomic_clear_long(pte, PG_W);
6532 pmap->pm_stats.wired_count--;
6533 }
6534 }
6535 PMAP_UNLOCK(pmap);
6536 }
6537
6538 /*
6539 * Copy the range specified by src_addr/len
6540 * from the source map to the range dst_addr/len
6541 * in the destination map.
6542 *
6543 * This routine is only advisory and need not do anything.
6544 */
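/*
 * A typical (assumed) caller is the fork() path, where the child's
 * address space mirrors the parent's, hence the dst_addr == src_addr
 * requirement above.  Copied 4KB entries lose their PG_W, PG_M, and
 * PG_A bits, and copied 2MB entries lose PG_W, so the child starts
 * with unwired (and, for 4KB entries, clean) mappings.
 */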
6545 void
6546 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
6547 vm_offset_t src_addr)
6548 {
6549 struct rwlock *lock;
6550 struct spglist free;
6551 pml4_entry_t *pml4e;
6552 pdp_entry_t *pdpe;
6553 pd_entry_t *pde, srcptepaddr;
6554 pt_entry_t *dst_pte, PG_A, PG_M, PG_V, ptetemp, *src_pte;
6555 vm_offset_t addr, end_addr, va_next;
6556 vm_page_t dst_pdpg, dstmpte, srcmpte;
6557
6558 if (dst_addr != src_addr)
6559 return;
6560
6561 if (dst_pmap->pm_type != src_pmap->pm_type)
6562 return;
6563
6564 /*
6565 * EPT page table entries that require emulation of A/D bits are
6566 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
6567 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
6568 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
6569 * implementations flag an EPT misconfiguration for exec-only
6570 * mappings we skip this function entirely for emulated pmaps.
6571 */
6572 if (pmap_emulate_ad_bits(dst_pmap))
6573 return;
6574
6575 end_addr = src_addr + len;
6576 lock = NULL;
6577 if (dst_pmap < src_pmap) {
6578 PMAP_LOCK(dst_pmap);
6579 PMAP_LOCK(src_pmap);
6580 } else {
6581 PMAP_LOCK(src_pmap);
6582 PMAP_LOCK(dst_pmap);
6583 }
6584
6585 PG_A = pmap_accessed_bit(dst_pmap);
6586 PG_M = pmap_modified_bit(dst_pmap);
6587 PG_V = pmap_valid_bit(dst_pmap);
6588
6589 for (addr = src_addr; addr < end_addr; addr = va_next) {
6590 KASSERT(addr < UPT_MIN_ADDRESS,
6591 ("pmap_copy: invalid to pmap_copy page tables"));
6592
6593 pml4e = pmap_pml4e(src_pmap, addr);
6594 if ((*pml4e & PG_V) == 0) {
6595 va_next = (addr + NBPML4) & ~PML4MASK;
6596 if (va_next < addr)
6597 va_next = end_addr;
6598 continue;
6599 }
6600
6601 pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
6602 if ((*pdpe & PG_V) == 0) {
6603 va_next = (addr + NBPDP) & ~PDPMASK;
6604 if (va_next < addr)
6605 va_next = end_addr;
6606 continue;
6607 }
6608
6609 va_next = (addr + NBPDR) & ~PDRMASK;
6610 if (va_next < addr)
6611 va_next = end_addr;
6612
6613 pde = pmap_pdpe_to_pde(pdpe, addr);
6614 srcptepaddr = *pde;
6615 if (srcptepaddr == 0)
6616 continue;
6617
6618 if (srcptepaddr & PG_PS) {
6619 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
6620 continue;
6621 dst_pdpg = pmap_allocpde(dst_pmap, addr, NULL);
6622 if (dst_pdpg == NULL)
6623 break;
6624 pde = (pd_entry_t *)
6625 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg));
6626 pde = &pde[pmap_pde_index(addr)];
6627 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
6628 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr,
6629 PMAP_ENTER_NORECLAIM, &lock))) {
6630 *pde = srcptepaddr & ~PG_W;
6631 pmap_resident_count_inc(dst_pmap, NBPDR /
6632 PAGE_SIZE);
6633 atomic_add_long(&pmap_pde_mappings, 1);
6634 } else
6635 dst_pdpg->wire_count--;
6636 continue;
6637 }
6638
6639 srcptepaddr &= PG_FRAME;
6640 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
6641 KASSERT(srcmpte->wire_count > 0,
6642 ("pmap_copy: source page table page is unused"));
6643
6644 if (va_next > end_addr)
6645 va_next = end_addr;
6646
6647 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
6648 src_pte = &src_pte[pmap_pte_index(addr)];
6649 dstmpte = NULL;
6650 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
6651 ptetemp = *src_pte;
6652
6653 /*
6654 * We only virtual copy managed pages.
6655 */
6656 if ((ptetemp & PG_MANAGED) == 0)
6657 continue;
6658
6659 if (dstmpte != NULL) {
6660 KASSERT(dstmpte->pindex ==
6661 pmap_pde_pindex(addr),
6662 ("dstmpte pindex/addr mismatch"));
6663 dstmpte->wire_count++;
6664 } else if ((dstmpte = pmap_allocpte(dst_pmap, addr,
6665 NULL)) == NULL)
6666 goto out;
6667 dst_pte = (pt_entry_t *)
6668 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
6669 dst_pte = &dst_pte[pmap_pte_index(addr)];
6670 if (*dst_pte == 0 &&
6671 pmap_try_insert_pv_entry(dst_pmap, addr,
6672 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) {
6673 /*
6674 * Clear the wired, modified, and accessed
6675 * (referenced) bits during the copy.
6676 */
6677 *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A);
6678 pmap_resident_count_inc(dst_pmap, 1);
6679 } else {
6680 SLIST_INIT(&free);
6681 if (pmap_unwire_ptp(dst_pmap, addr, dstmpte,
6682 &free)) {
6683 /*
6684 * Although "addr" is not mapped,
6685 * paging-structure caches could
6686 * nonetheless have entries that refer
6687 * to the freed page table pages.
6688 * Invalidate those entries.
6689 */
6690 pmap_invalidate_page(dst_pmap, addr);
6691 vm_page_free_pages_toq(&free, true);
6692 }
6693 goto out;
6694 }
6695 /* Have we copied all of the valid mappings? */
6696 if (dstmpte->wire_count >= srcmpte->wire_count)
6697 break;
6698 }
6699 }
6700 out:
6701 if (lock != NULL)
6702 rw_wunlock(lock);
6703 PMAP_UNLOCK(src_pmap);
6704 PMAP_UNLOCK(dst_pmap);
6705 }
6706
6707 int
6708 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
6709 {
6710 int error;
6711
6712 if (dst_pmap->pm_type != src_pmap->pm_type ||
6713 dst_pmap->pm_type != PT_X86 ||
6714 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
6715 return (0);
6716 for (;;) {
6717 if (dst_pmap < src_pmap) {
6718 PMAP_LOCK(dst_pmap);
6719 PMAP_LOCK(src_pmap);
6720 } else {
6721 PMAP_LOCK(src_pmap);
6722 PMAP_LOCK(dst_pmap);
6723 }
6724 error = pmap_pkru_copy(dst_pmap, src_pmap);
6725 /* Clean up partial copy on failure due to no memory. */
6726 if (error == ENOMEM)
6727 pmap_pkru_deassign_all(dst_pmap);
6728 PMAP_UNLOCK(src_pmap);
6729 PMAP_UNLOCK(dst_pmap);
6730 if (error != ENOMEM)
6731 break;
6732 vm_wait(NULL);
6733 }
6734 return (error);
6735 }
6736
6737 /*
6738 * Zero the specified hardware page.
6739 */
6740 void
6741 pmap_zero_page(vm_page_t m)
6742 {
6743 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
6744
6745 pagezero((void *)va);
6746 }
6747
6748 /*
6749 * Zero an area within a single hardware page. off and size must not
6750 * cover an area beyond a single hardware page.
6751 */
6752 void
6753 pmap_zero_page_area(vm_page_t m, int off, int size)
6754 {
6755 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
6756
6757 if (off == 0 && size == PAGE_SIZE)
6758 pagezero((void *)va);
6759 else
6760 bzero((char *)va + off, size);
6761 }
6762
6763 /*
6764 * Copy 1 specified hardware page to another.
6765 */
6766 void
6767 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
6768 {
6769 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
6770 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
6771
6772 pagecopy((void *)src, (void *)dst);
6773 }
6774
6775 int unmapped_buf_allowed = 1;
6776
6777 void
6778 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
6779 vm_offset_t b_offset, int xfersize)
6780 {
6781 void *a_cp, *b_cp;
6782 vm_page_t pages[2];
6783 vm_offset_t vaddr[2], a_pg_offset, b_pg_offset;
6784 int cnt;
6785 boolean_t mapped;
6786
6787 while (xfersize > 0) {
6788 a_pg_offset = a_offset & PAGE_MASK;
6789 pages[0] = ma[a_offset >> PAGE_SHIFT];
6790 b_pg_offset = b_offset & PAGE_MASK;
6791 pages[1] = mb[b_offset >> PAGE_SHIFT];
6792 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
6793 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
6794 mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE);
6795 a_cp = (char *)vaddr[0] + a_pg_offset;
6796 b_cp = (char *)vaddr[1] + b_pg_offset;
6797 bcopy(a_cp, b_cp, cnt);
6798 if (__predict_false(mapped))
6799 pmap_unmap_io_transient(pages, vaddr, 2, FALSE);
6800 a_offset += cnt;
6801 b_offset += cnt;
6802 xfersize -= cnt;
6803 }
6804 }
6805
6806 /*
6807 * Returns true if the pmap's pv is one of the first
6808 * 16 pvs linked to from this page. This count may
6809 * be changed upwards or downwards in the future; it
6810 * is only necessary that true be returned for a small
6811 * subset of pmaps for proper page aging.
6812 */
6813 boolean_t
6814 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
6815 {
6816 struct md_page *pvh;
6817 struct rwlock *lock;
6818 pv_entry_t pv;
6819 int loops = 0;
6820 boolean_t rv;
6821
6822 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6823 ("pmap_page_exists_quick: page %p is not managed", m));
6824 rv = FALSE;
6825 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6826 rw_rlock(lock);
6827 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6828 if (PV_PMAP(pv) == pmap) {
6829 rv = TRUE;
6830 break;
6831 }
6832 loops++;
6833 if (loops >= 16)
6834 break;
6835 }
6836 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
6837 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
6838 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6839 if (PV_PMAP(pv) == pmap) {
6840 rv = TRUE;
6841 break;
6842 }
6843 loops++;
6844 if (loops >= 16)
6845 break;
6846 }
6847 }
6848 rw_runlock(lock);
6849 return (rv);
6850 }
6851
6852 /*
6853 * pmap_page_wired_mappings:
6854 *
6855 * Return the number of managed mappings to the given physical page
6856 * that are wired.
6857 */
6858 int
6859 pmap_page_wired_mappings(vm_page_t m)
6860 {
6861 struct rwlock *lock;
6862 struct md_page *pvh;
6863 pmap_t pmap;
6864 pt_entry_t *pte;
6865 pv_entry_t pv;
6866 int count, md_gen, pvh_gen;
6867
6868 if ((m->oflags & VPO_UNMANAGED) != 0)
6869 return (0);
6870 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6871 rw_rlock(lock);
6872 restart:
6873 count = 0;
6874 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6875 pmap = PV_PMAP(pv);
6876 if (!PMAP_TRYLOCK(pmap)) {
6877 md_gen = m->md.pv_gen;
6878 rw_runlock(lock);
6879 PMAP_LOCK(pmap);
6880 rw_rlock(lock);
6881 if (md_gen != m->md.pv_gen) {
6882 PMAP_UNLOCK(pmap);
6883 goto restart;
6884 }
6885 }
6886 pte = pmap_pte(pmap, pv->pv_va);
6887 if ((*pte & PG_W) != 0)
6888 count++;
6889 PMAP_UNLOCK(pmap);
6890 }
6891 if ((m->flags & PG_FICTITIOUS) == 0) {
6892 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
6893 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6894 pmap = PV_PMAP(pv);
6895 if (!PMAP_TRYLOCK(pmap)) {
6896 md_gen = m->md.pv_gen;
6897 pvh_gen = pvh->pv_gen;
6898 rw_runlock(lock);
6899 PMAP_LOCK(pmap);
6900 rw_rlock(lock);
6901 if (md_gen != m->md.pv_gen ||
6902 pvh_gen != pvh->pv_gen) {
6903 PMAP_UNLOCK(pmap);
6904 goto restart;
6905 }
6906 }
6907 pte = pmap_pde(pmap, pv->pv_va);
6908 if ((*pte & PG_W) != 0)
6909 count++;
6910 PMAP_UNLOCK(pmap);
6911 }
6912 }
6913 rw_runlock(lock);
6914 return (count);
6915 }
6916
6917 /*
6918 * Returns TRUE if the given page is mapped individually or as part of
6919 * a 2mpage. Otherwise, returns FALSE.
6920 */
6921 boolean_t
6922 pmap_page_is_mapped(vm_page_t m)
6923 {
6924 struct rwlock *lock;
6925 boolean_t rv;
6926
6927 if ((m->oflags & VPO_UNMANAGED) != 0)
6928 return (FALSE);
6929 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6930 rw_rlock(lock);
6931 rv = !TAILQ_EMPTY(&m->md.pv_list) ||
6932 ((m->flags & PG_FICTITIOUS) == 0 &&
6933 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
6934 rw_runlock(lock);
6935 return (rv);
6936 }
6937
6938 /*
6939 * Destroy all managed, non-wired mappings in the given user-space
6940  * pmap.  This pmap cannot be active on any processor other than
6941  * the caller's.
6942 *
6943 * This function cannot be applied to the kernel pmap. Moreover, it
6944 * is not intended for general use. It is only to be used during
6945 * process termination. Consequently, it can be implemented in ways
6946 * that make it faster than pmap_remove(). First, it can more quickly
6947 * destroy mappings by iterating over the pmap's collection of PV
6948 * entries, rather than searching the page table. Second, it doesn't
6949 * have to test and clear the page table entries atomically, because
6950 * no processor is currently accessing the user address space. In
6951 * particular, a page table entry's dirty bit won't change state once
6952 * this function starts.
6953 *
6954 * Although this function destroys all of the pmap's managed,
6955 * non-wired mappings, it can delay and batch the invalidation of TLB
6956 * entries without calling pmap_delayed_invl_start() and
6957 * pmap_delayed_invl_finish(). Because the pmap is not active on
6958 * any other processor, none of these TLB entries will ever be used
6959 * before their eventual invalidation. Consequently, there is no need
6960 * for either pmap_remove_all() or pmap_remove_write() to wait for
6961 * that eventual TLB invalidation.
6962 */
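/*
 * Illustratively, tearing down a process with a sparse, multi-gigabyte
 * address space but only a few thousand resident managed pages visits
 * just those few thousand PV entries (grouped in pv chunks), clears
 * their PTEs non-atomically, and issues one pmap_invalidate_all() at
 * the end, rather than walking every page table page in the address
 * space.
 */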
6963 void
6964 pmap_remove_pages(pmap_t pmap)
6965 {
6966 pd_entry_t ptepde;
6967 pt_entry_t *pte, tpte;
6968 pt_entry_t PG_M, PG_RW, PG_V;
6969 struct spglist free;
6970 struct pv_chunklist free_chunks;
6971 vm_page_t m, mpte, mt;
6972 pv_entry_t pv;
6973 struct md_page *pvh;
6974 struct pv_chunk *pc, *npc;
6975 struct rwlock *lock;
6976 int64_t bit;
6977 uint64_t inuse, bitmask;
6978 int allfree, field, idx;
6979 #ifdef PV_STATS
6980 int freed;
6981 #endif
6982 boolean_t superpage;
6983 vm_paddr_t pa;
6984
6985 /*
6986 * Assert that the given pmap is only active on the current
6987 * CPU. Unfortunately, we cannot block another CPU from
6988 * activating the pmap while this function is executing.
6989 */
6990 KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
6991 #ifdef INVARIANTS
6992 {
6993 cpuset_t other_cpus;
6994
6995 other_cpus = all_cpus;
6996 critical_enter();
6997 CPU_CLR(PCPU_GET(cpuid), &other_cpus);
6998 CPU_AND(&other_cpus, &pmap->pm_active);
6999 critical_exit();
7000 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
7001 }
7002 #endif
7003
7004 lock = NULL;
7005 PG_M = pmap_modified_bit(pmap);
7006 PG_V = pmap_valid_bit(pmap);
7007 PG_RW = pmap_rw_bit(pmap);
7008
7009 TAILQ_INIT(&free_chunks);
7010 SLIST_INIT(&free);
7011 PMAP_LOCK(pmap);
7012 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
7013 allfree = 1;
7014 #ifdef PV_STATS
7015 freed = 0;
7016 #endif
7017 for (field = 0; field < _NPCM; field++) {
7018 inuse = ~pc->pc_map[field] & pc_freemask[field];
7019 while (inuse != 0) {
7020 bit = bsfq(inuse);
7021 bitmask = 1UL << bit;
7022 idx = field * 64 + bit;
7023 pv = &pc->pc_pventry[idx];
7024 inuse &= ~bitmask;
7025
7026 pte = pmap_pdpe(pmap, pv->pv_va);
7027 ptepde = *pte;
7028 pte = pmap_pdpe_to_pde(pte, pv->pv_va);
7029 tpte = *pte;
7030 if ((tpte & (PG_PS | PG_V)) == PG_V) {
7031 superpage = FALSE;
7032 ptepde = tpte;
7033 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
7034 PG_FRAME);
7035 pte = &pte[pmap_pte_index(pv->pv_va)];
7036 tpte = *pte;
7037 } else {
7038 /*
7039 					 * Keep track of whether 'tpte' is a
7040 * superpage explicitly instead of
7041 * relying on PG_PS being set.
7042 *
7043 * This is because PG_PS is numerically
7044 * identical to PG_PTE_PAT and thus a
7045 * regular page could be mistaken for
7046 * a superpage.
7047 */
7048 superpage = TRUE;
7049 }
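/*
 * (Both PG_PS in a PDE and PG_PTE_PAT in a 4KB PTE are bit 7, 0x80,
 * which is why the PG_PS test alone cannot distinguish the two cases.)
 */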
7050
7051 if ((tpte & PG_V) == 0) {
7052 panic("bad pte va %lx pte %lx",
7053 pv->pv_va, tpte);
7054 }
7055
7056 /*
7057 * We cannot remove wired pages from a process' mapping at this time
7058 */
7059 if (tpte & PG_W) {
7060 allfree = 0;
7061 continue;
7062 }
7063
7064 if (superpage)
7065 pa = tpte & PG_PS_FRAME;
7066 else
7067 pa = tpte & PG_FRAME;
7068
7069 m = PHYS_TO_VM_PAGE(pa);
7070 KASSERT(m->phys_addr == pa,
7071 ("vm_page_t %p phys_addr mismatch %016jx %016jx",
7072 m, (uintmax_t)m->phys_addr,
7073 (uintmax_t)tpte));
7074
7075 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
7076 m < &vm_page_array[vm_page_array_size],
7077 ("pmap_remove_pages: bad tpte %#jx",
7078 (uintmax_t)tpte));
7079
7080 pte_clear(pte);
7081
7082 /*
7083 * Update the vm_page_t clean/reference bits.
7084 */
7085 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
7086 if (superpage) {
7087 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
7088 vm_page_dirty(mt);
7089 } else
7090 vm_page_dirty(m);
7091 }
7092
7093 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
7094
7095 /* Mark free */
7096 pc->pc_map[field] |= bitmask;
7097 if (superpage) {
7098 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
7099 pvh = pa_to_pvh(tpte & PG_PS_FRAME);
7100 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
7101 pvh->pv_gen++;
7102 if (TAILQ_EMPTY(&pvh->pv_list)) {
7103 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
7104 if ((mt->aflags & PGA_WRITEABLE) != 0 &&
7105 TAILQ_EMPTY(&mt->md.pv_list))
7106 vm_page_aflag_clear(mt, PGA_WRITEABLE);
7107 }
7108 mpte = pmap_remove_pt_page(pmap, pv->pv_va);
7109 if (mpte != NULL) {
7110 KASSERT(mpte->valid == VM_PAGE_BITS_ALL,
7111 ("pmap_remove_pages: pte page not promoted"));
7112 pmap_resident_count_dec(pmap, 1);
7113 KASSERT(mpte->wire_count == NPTEPG,
7114 ("pmap_remove_pages: pte page wire count error"));
7115 mpte->wire_count = 0;
7116 pmap_add_delayed_free_list(mpte, &free, FALSE);
7117 }
7118 } else {
7119 pmap_resident_count_dec(pmap, 1);
7120 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
7121 m->md.pv_gen++;
7122 if ((m->aflags & PGA_WRITEABLE) != 0 &&
7123 TAILQ_EMPTY(&m->md.pv_list) &&
7124 (m->flags & PG_FICTITIOUS) == 0) {
7125 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
7126 if (TAILQ_EMPTY(&pvh->pv_list))
7127 vm_page_aflag_clear(m, PGA_WRITEABLE);
7128 }
7129 }
7130 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
7131 #ifdef PV_STATS
7132 freed++;
7133 #endif
7134 }
7135 }
7136 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
7137 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
7138 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
7139 if (allfree) {
7140 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
7141 TAILQ_INSERT_TAIL(&free_chunks, pc, pc_list);
7142 }
7143 }
7144 if (lock != NULL)
7145 rw_wunlock(lock);
7146 pmap_invalidate_all(pmap);
7147 pmap_pkru_deassign_all(pmap);
7148 free_pv_chunk_batch(&free_chunks);
7149 PMAP_UNLOCK(pmap);
7150 vm_page_free_pages_toq(&free, true);
7151 }
7152
7153 static boolean_t
7154 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
7155 {
7156 struct rwlock *lock;
7157 pv_entry_t pv;
7158 struct md_page *pvh;
7159 pt_entry_t *pte, mask;
7160 pt_entry_t PG_A, PG_M, PG_RW, PG_V;
7161 pmap_t pmap;
7162 int md_gen, pvh_gen;
7163 boolean_t rv;
7164
7165 rv = FALSE;
7166 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7167 rw_rlock(lock);
7168 restart:
7169 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7170 pmap = PV_PMAP(pv);
7171 if (!PMAP_TRYLOCK(pmap)) {
7172 md_gen = m->md.pv_gen;
7173 rw_runlock(lock);
7174 PMAP_LOCK(pmap);
7175 rw_rlock(lock);
7176 if (md_gen != m->md.pv_gen) {
7177 PMAP_UNLOCK(pmap);
7178 goto restart;
7179 }
7180 }
7181 pte = pmap_pte(pmap, pv->pv_va);
7182 mask = 0;
7183 if (modified) {
7184 PG_M = pmap_modified_bit(pmap);
7185 PG_RW = pmap_rw_bit(pmap);
7186 mask |= PG_RW | PG_M;
7187 }
7188 if (accessed) {
7189 PG_A = pmap_accessed_bit(pmap);
7190 PG_V = pmap_valid_bit(pmap);
7191 mask |= PG_V | PG_A;
7192 }
7193 rv = (*pte & mask) == mask;
7194 PMAP_UNLOCK(pmap);
7195 if (rv)
7196 goto out;
7197 }
7198 if ((m->flags & PG_FICTITIOUS) == 0) {
7199 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
7200 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
7201 pmap = PV_PMAP(pv);
7202 if (!PMAP_TRYLOCK(pmap)) {
7203 md_gen = m->md.pv_gen;
7204 pvh_gen = pvh->pv_gen;
7205 rw_runlock(lock);
7206 PMAP_LOCK(pmap);
7207 rw_rlock(lock);
7208 if (md_gen != m->md.pv_gen ||
7209 pvh_gen != pvh->pv_gen) {
7210 PMAP_UNLOCK(pmap);
7211 goto restart;
7212 }
7213 }
7214 pte = pmap_pde(pmap, pv->pv_va);
7215 mask = 0;
7216 if (modified) {
7217 PG_M = pmap_modified_bit(pmap);
7218 PG_RW = pmap_rw_bit(pmap);
7219 mask |= PG_RW | PG_M;
7220 }
7221 if (accessed) {
7222 PG_A = pmap_accessed_bit(pmap);
7223 PG_V = pmap_valid_bit(pmap);
7224 mask |= PG_V | PG_A;
7225 }
7226 rv = (*pte & mask) == mask;
7227 PMAP_UNLOCK(pmap);
7228 if (rv)
7229 goto out;
7230 }
7231 }
7232 out:
7233 rw_runlock(lock);
7234 return (rv);
7235 }
7236
7237 /*
7238 * pmap_is_modified:
7239 *
7240 * Return whether or not the specified physical page was modified
7241 * in any physical maps.
7242 */
7243 boolean_t
7244 pmap_is_modified(vm_page_t m)
7245 {
7246
7247 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7248 ("pmap_is_modified: page %p is not managed", m));
7249
7250 /*
7251 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
7252 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE
7253 * is clear, no PTEs can have PG_M set.
7254 */
7255 VM_OBJECT_ASSERT_WLOCKED(m->object);
7256 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
7257 return (FALSE);
7258 return (pmap_page_test_mappings(m, FALSE, TRUE));
7259 }
7260
7261 /*
7262 * pmap_is_prefaultable:
7263 *
7264 * Return whether or not the specified virtual address is eligible
7265 * for prefault.
7266 */
7267 boolean_t
7268 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
7269 {
7270 pd_entry_t *pde;
7271 pt_entry_t *pte, PG_V;
7272 boolean_t rv;
7273
7274 PG_V = pmap_valid_bit(pmap);
7275 rv = FALSE;
7276 PMAP_LOCK(pmap);
7277 pde = pmap_pde(pmap, addr);
7278 if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
7279 pte = pmap_pde_to_pte(pde, addr);
7280 rv = (*pte & PG_V) == 0;
7281 }
7282 PMAP_UNLOCK(pmap);
7283 return (rv);
7284 }
7285
7286 /*
7287 * pmap_is_referenced:
7288 *
7289 * Return whether or not the specified physical page was referenced
7290 * in any physical maps.
7291 */
7292 boolean_t
7293 pmap_is_referenced(vm_page_t m)
7294 {
7295
7296 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7297 ("pmap_is_referenced: page %p is not managed", m));
7298 return (pmap_page_test_mappings(m, TRUE, FALSE));
7299 }
7300
7301 /*
7302 * Clear the write and modified bits in each of the given page's mappings.
7303 */
7304 void
7305 pmap_remove_write(vm_page_t m)
7306 {
7307 struct md_page *pvh;
7308 pmap_t pmap;
7309 struct rwlock *lock;
7310 pv_entry_t next_pv, pv;
7311 pd_entry_t *pde;
7312 pt_entry_t oldpte, *pte, PG_M, PG_RW;
7313 vm_offset_t va;
7314 int pvh_gen, md_gen;
7315
7316 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7317 ("pmap_remove_write: page %p is not managed", m));
7318
7319 /*
7320 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
7321 * set by another thread while the object is locked. Thus,
7322 * if PGA_WRITEABLE is clear, no page table entries need updating.
7323 */
7324 VM_OBJECT_ASSERT_WLOCKED(m->object);
7325 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
7326 return;
7327 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7328 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
7329 pa_to_pvh(VM_PAGE_TO_PHYS(m));
7330 retry_pv_loop:
7331 rw_wlock(lock);
7332 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
7333 pmap = PV_PMAP(pv);
7334 if (!PMAP_TRYLOCK(pmap)) {
7335 pvh_gen = pvh->pv_gen;
7336 rw_wunlock(lock);
7337 PMAP_LOCK(pmap);
7338 rw_wlock(lock);
7339 if (pvh_gen != pvh->pv_gen) {
7340 PMAP_UNLOCK(pmap);
7341 rw_wunlock(lock);
7342 goto retry_pv_loop;
7343 }
7344 }
7345 PG_RW = pmap_rw_bit(pmap);
7346 va = pv->pv_va;
7347 pde = pmap_pde(pmap, va);
7348 if ((*pde & PG_RW) != 0)
7349 (void)pmap_demote_pde_locked(pmap, pde, va, &lock);
7350 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
7351 ("inconsistent pv lock %p %p for page %p",
7352 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
7353 PMAP_UNLOCK(pmap);
7354 }
7355 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7356 pmap = PV_PMAP(pv);
7357 if (!PMAP_TRYLOCK(pmap)) {
7358 pvh_gen = pvh->pv_gen;
7359 md_gen = m->md.pv_gen;
7360 rw_wunlock(lock);
7361 PMAP_LOCK(pmap);
7362 rw_wlock(lock);
7363 if (pvh_gen != pvh->pv_gen ||
7364 md_gen != m->md.pv_gen) {
7365 PMAP_UNLOCK(pmap);
7366 rw_wunlock(lock);
7367 goto retry_pv_loop;
7368 }
7369 }
7370 PG_M = pmap_modified_bit(pmap);
7371 PG_RW = pmap_rw_bit(pmap);
7372 pde = pmap_pde(pmap, pv->pv_va);
7373 KASSERT((*pde & PG_PS) == 0,
7374 ("pmap_remove_write: found a 2mpage in page %p's pv list",
7375 m));
7376 pte = pmap_pde_to_pte(pde, pv->pv_va);
7377 retry:
7378 oldpte = *pte;
7379 if (oldpte & PG_RW) {
7380 if (!atomic_cmpset_long(pte, oldpte, oldpte &
7381 ~(PG_RW | PG_M)))
7382 goto retry;
7383 if ((oldpte & PG_M) != 0)
7384 vm_page_dirty(m);
7385 pmap_invalidate_page(pmap, pv->pv_va);
7386 }
7387 PMAP_UNLOCK(pmap);
7388 }
7389 rw_wunlock(lock);
7390 vm_page_aflag_clear(m, PGA_WRITEABLE);
7391 pmap_delayed_invl_wait(m);
7392 }
7393
7394 static __inline boolean_t
7395 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
7396 {
7397
7398 if (!pmap_emulate_ad_bits(pmap))
7399 return (TRUE);
7400
7401 KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
7402
7403 /*
7404 * XWR = 010 or 110 will cause an unconditional EPT misconfiguration
7405 	 * so we don't allow the referenced (aka EPT_PG_READ) bit to be cleared
7406 * if the EPT_PG_WRITE bit is set.
7407 */
7408 if ((pte & EPT_PG_WRITE) != 0)
7409 return (FALSE);
7410
7411 /*
7412 	 * XWR = 100 is allowed only if PMAP_SUPPORTS_EXEC_ONLY is set.
7413 */
7414 if ((pte & EPT_PG_EXECUTE) == 0 ||
7415 ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
7416 return (TRUE);
7417 else
7418 return (FALSE);
7419 }
7420
7421 /*
7422 * pmap_ts_referenced:
7423 *
7424 * Return a count of reference bits for a page, clearing those bits.
7425 * It is not necessary for every reference bit to be cleared, but it
7426 * is necessary that 0 only be returned when there are truly no
7427 * reference bits set.
7428 *
7429 * As an optimization, update the page's dirty field if a modified bit is
7430 * found while counting reference bits. This opportunistic update can be
7431 * performed at low cost and can eliminate the need for some future calls
7432 * to pmap_is_modified(). However, since this function stops after
7433 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
7434 * dirty pages. Those dirty pages will only be detected by a future call
7435 * to pmap_is_modified().
7436 *
7437 * A DI block is not needed within this function, because
7438 * invalidations are performed before the PV list lock is
7439 * released.
7440 */
7441 int
7442 pmap_ts_referenced(vm_page_t m)
7443 {
7444 struct md_page *pvh;
7445 pv_entry_t pv, pvf;
7446 pmap_t pmap;
7447 struct rwlock *lock;
7448 pd_entry_t oldpde, *pde;
7449 pt_entry_t *pte, PG_A, PG_M, PG_RW;
7450 vm_offset_t va;
7451 vm_paddr_t pa;
7452 int cleared, md_gen, not_cleared, pvh_gen;
7453 struct spglist free;
7454 boolean_t demoted;
7455
7456 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7457 ("pmap_ts_referenced: page %p is not managed", m));
7458 SLIST_INIT(&free);
7459 cleared = 0;
7460 pa = VM_PAGE_TO_PHYS(m);
7461 lock = PHYS_TO_PV_LIST_LOCK(pa);
7462 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
7463 rw_wlock(lock);
7464 retry:
7465 not_cleared = 0;
7466 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
7467 goto small_mappings;
7468 pv = pvf;
7469 do {
7470 if (pvf == NULL)
7471 pvf = pv;
7472 pmap = PV_PMAP(pv);
7473 if (!PMAP_TRYLOCK(pmap)) {
7474 pvh_gen = pvh->pv_gen;
7475 rw_wunlock(lock);
7476 PMAP_LOCK(pmap);
7477 rw_wlock(lock);
7478 if (pvh_gen != pvh->pv_gen) {
7479 PMAP_UNLOCK(pmap);
7480 goto retry;
7481 }
7482 }
7483 PG_A = pmap_accessed_bit(pmap);
7484 PG_M = pmap_modified_bit(pmap);
7485 PG_RW = pmap_rw_bit(pmap);
7486 va = pv->pv_va;
7487 pde = pmap_pde(pmap, pv->pv_va);
7488 oldpde = *pde;
7489 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
7490 /*
7491 * Although "oldpde" is mapping a 2MB page, because
7492 * this function is called at a 4KB page granularity,
7493 * we only update the 4KB page under test.
7494 */
7495 vm_page_dirty(m);
7496 }
7497 if ((oldpde & PG_A) != 0) {
7498 /*
7499 * Since this reference bit is shared by 512 4KB
7500 * pages, it should not be cleared every time it is
7501 * tested. Apply a simple "hash" function on the
7502 * physical page number, the virtual superpage number,
7503 * and the pmap address to select one 4KB page out of
7504 * the 512 on which testing the reference bit will
7505 * result in clearing that reference bit. This
7506 * function is designed to avoid the selection of the
7507 * same 4KB page for every 2MB page mapping.
7508 *
7509 * On demotion, a mapping that hasn't been referenced
7510 * is simply destroyed. To avoid the possibility of a
7511 * subsequent page fault on a demoted wired mapping,
7512 * always leave its reference bit set. Moreover,
7513 * since the superpage is wired, the current state of
7514 * its reference bit won't affect page replacement.
7515 */
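			/*
			 * In other words, for a given 2MB mapping the PDE's
			 * reference bit is cleared only on the one call out
			 * of NPTEPG (512) whose page under test satisfies
			 *
			 *	((pa >> PAGE_SHIFT) ^ (va >> PDRSHIFT) ^
			 *	    (uintptr_t)pmap) % NPTEPG == 0
			 *
			 * and only while the mapping is not wired.
			 */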
7516 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
7517 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
7518 (oldpde & PG_W) == 0) {
7519 if (safe_to_clear_referenced(pmap, oldpde)) {
7520 atomic_clear_long(pde, PG_A);
7521 pmap_invalidate_page(pmap, pv->pv_va);
7522 demoted = FALSE;
7523 } else if (pmap_demote_pde_locked(pmap, pde,
7524 pv->pv_va, &lock)) {
7525 /*
7526 * Remove the mapping to a single page
7527 * so that a subsequent access may
7528 * repromote. Since the underlying
7529 * page table page is fully populated,
7530 * this removal never frees a page
7531 * table page.
7532 */
7533 demoted = TRUE;
7534 va += VM_PAGE_TO_PHYS(m) - (oldpde &
7535 PG_PS_FRAME);
7536 pte = pmap_pde_to_pte(pde, va);
7537 pmap_remove_pte(pmap, pte, va, *pde,
7538 NULL, &lock);
7539 pmap_invalidate_page(pmap, va);
7540 } else
7541 demoted = TRUE;
7542
7543 if (demoted) {
7544 /*
7545 * The superpage mapping was removed
7546 * entirely and therefore 'pv' is no
7547 * longer valid.
7548 */
7549 if (pvf == pv)
7550 pvf = NULL;
7551 pv = NULL;
7552 }
7553 cleared++;
7554 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
7555 ("inconsistent pv lock %p %p for page %p",
7556 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
7557 } else
7558 not_cleared++;
7559 }
7560 PMAP_UNLOCK(pmap);
7561 /* Rotate the PV list if it has more than one entry. */
7562 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
7563 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
7564 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
7565 pvh->pv_gen++;
7566 }
7567 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
7568 goto out;
7569 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
7570 small_mappings:
7571 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
7572 goto out;
7573 pv = pvf;
7574 do {
7575 if (pvf == NULL)
7576 pvf = pv;
7577 pmap = PV_PMAP(pv);
7578 if (!PMAP_TRYLOCK(pmap)) {
7579 pvh_gen = pvh->pv_gen;
7580 md_gen = m->md.pv_gen;
7581 rw_wunlock(lock);
7582 PMAP_LOCK(pmap);
7583 rw_wlock(lock);
7584 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
7585 PMAP_UNLOCK(pmap);
7586 goto retry;
7587 }
7588 }
7589 PG_A = pmap_accessed_bit(pmap);
7590 PG_M = pmap_modified_bit(pmap);
7591 PG_RW = pmap_rw_bit(pmap);
7592 pde = pmap_pde(pmap, pv->pv_va);
7593 KASSERT((*pde & PG_PS) == 0,
7594 ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
7595 m));
7596 pte = pmap_pde_to_pte(pde, pv->pv_va);
7597 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
7598 vm_page_dirty(m);
7599 if ((*pte & PG_A) != 0) {
7600 if (safe_to_clear_referenced(pmap, *pte)) {
7601 atomic_clear_long(pte, PG_A);
7602 pmap_invalidate_page(pmap, pv->pv_va);
7603 cleared++;
7604 } else if ((*pte & PG_W) == 0) {
7605 /*
7606 * Wired pages cannot be paged out so
7607 * doing accessed bit emulation for
7608 * them is wasted effort. We do the
7609 * hard work for unwired pages only.
7610 */
7611 pmap_remove_pte(pmap, pte, pv->pv_va,
7612 *pde, &free, &lock);
7613 pmap_invalidate_page(pmap, pv->pv_va);
7614 cleared++;
7615 if (pvf == pv)
7616 pvf = NULL;
7617 pv = NULL;
7618 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
7619 ("inconsistent pv lock %p %p for page %p",
7620 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
7621 } else
7622 not_cleared++;
7623 }
7624 PMAP_UNLOCK(pmap);
7625 /* Rotate the PV list if it has more than one entry. */
7626 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
7627 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
7628 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
7629 m->md.pv_gen++;
7630 }
7631 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
7632 not_cleared < PMAP_TS_REFERENCED_MAX);
7633 out:
7634 rw_wunlock(lock);
7635 vm_page_free_pages_toq(&free, true);
7636 return (cleared + not_cleared);
7637 }
7638
7639 /*
7640 * Apply the given advice to the specified range of addresses within the
7641 * given pmap. Depending on the advice, clear the referenced and/or
7642 * modified flags in each mapping and set the mapped page's dirty field.
7643 */
7644 void
7645 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
7646 {
7647 struct rwlock *lock;
7648 pml4_entry_t *pml4e;
7649 pdp_entry_t *pdpe;
7650 pd_entry_t oldpde, *pde;
7651 pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
7652 vm_offset_t va, va_next;
7653 vm_page_t m;
7654 bool anychanged;
7655
7656 if (advice != MADV_DONTNEED && advice != MADV_FREE)
7657 return;
7658
7659 /*
7660 * A/D bit emulation requires an alternate code path when clearing
7661 * the modified and accessed bits below. Since this function is
7662 * advisory in nature we skip it entirely for pmaps that require
7663 * A/D bit emulation.
7664 */
7665 if (pmap_emulate_ad_bits(pmap))
7666 return;
7667
7668 PG_A = pmap_accessed_bit(pmap);
7669 PG_G = pmap_global_bit(pmap);
7670 PG_M = pmap_modified_bit(pmap);
7671 PG_V = pmap_valid_bit(pmap);
7672 PG_RW = pmap_rw_bit(pmap);
7673 anychanged = false;
7674 pmap_delayed_invl_start();
7675 PMAP_LOCK(pmap);
7676 for (; sva < eva; sva = va_next) {
7677 pml4e = pmap_pml4e(pmap, sva);
7678 if ((*pml4e & PG_V) == 0) {
7679 va_next = (sva + NBPML4) & ~PML4MASK;
7680 if (va_next < sva)
7681 va_next = eva;
7682 continue;
7683 }
7684 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
7685 if ((*pdpe & PG_V) == 0) {
7686 va_next = (sva + NBPDP) & ~PDPMASK;
7687 if (va_next < sva)
7688 va_next = eva;
7689 continue;
7690 }
7691 va_next = (sva + NBPDR) & ~PDRMASK;
7692 if (va_next < sva)
7693 va_next = eva;
7694 pde = pmap_pdpe_to_pde(pdpe, sva);
7695 oldpde = *pde;
7696 if ((oldpde & PG_V) == 0)
7697 continue;
7698 else if ((oldpde & PG_PS) != 0) {
7699 if ((oldpde & PG_MANAGED) == 0)
7700 continue;
7701 lock = NULL;
7702 if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
7703 if (lock != NULL)
7704 rw_wunlock(lock);
7705
7706 /*
7707 * The large page mapping was destroyed.
7708 */
7709 continue;
7710 }
7711
7712 /*
7713 * Unless the page mappings are wired, remove the
7714 * mapping to a single page so that a subsequent
7715 * access may repromote. Choosing the last page
7716 * within the address range [sva, min(va_next, eva))
7717 * generally results in more repromotions. Since the
7718 * underlying page table page is fully populated, this
7719 * removal never frees a page table page.
7720 */
7721 if ((oldpde & PG_W) == 0) {
7722 va = eva;
7723 if (va > va_next)
7724 va = va_next;
7725 va -= PAGE_SIZE;
7726 KASSERT(va >= sva,
7727 ("pmap_advise: no address gap"));
7728 pte = pmap_pde_to_pte(pde, va);
7729 KASSERT((*pte & PG_V) != 0,
7730 ("pmap_advise: invalid PTE"));
7731 pmap_remove_pte(pmap, pte, va, *pde, NULL,
7732 &lock);
7733 anychanged = true;
7734 }
7735 if (lock != NULL)
7736 rw_wunlock(lock);
7737 }
7738 if (va_next > eva)
7739 va_next = eva;
7740 va = va_next;
7741 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
7742 sva += PAGE_SIZE) {
7743 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
7744 goto maybe_invlrng;
7745 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
7746 if (advice == MADV_DONTNEED) {
7747 /*
7748 * Future calls to pmap_is_modified()
7749 * can be avoided by making the page
7750 * dirty now.
7751 */
7752 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
7753 vm_page_dirty(m);
7754 }
7755 atomic_clear_long(pte, PG_M | PG_A);
7756 } else if ((*pte & PG_A) != 0)
7757 atomic_clear_long(pte, PG_A);
7758 else
7759 goto maybe_invlrng;
7760
7761 if ((*pte & PG_G) != 0) {
7762 if (va == va_next)
7763 va = sva;
7764 } else
7765 anychanged = true;
7766 continue;
7767 maybe_invlrng:
7768 if (va != va_next) {
7769 pmap_invalidate_range(pmap, va, sva);
7770 va = va_next;
7771 }
7772 }
7773 if (va != va_next)
7774 pmap_invalidate_range(pmap, va, sva);
7775 }
7776 if (anychanged)
7777 pmap_invalidate_all(pmap);
7778 PMAP_UNLOCK(pmap);
7779 pmap_delayed_invl_finish();
7780 }
7781
7782 /*
7783 * Clear the modify bits on the specified physical page.
7784 */
7785 void
7786 pmap_clear_modify(vm_page_t m)
7787 {
7788 struct md_page *pvh;
7789 pmap_t pmap;
7790 pv_entry_t next_pv, pv;
7791 pd_entry_t oldpde, *pde;
7792 pt_entry_t *pte, PG_M, PG_RW;
7793 struct rwlock *lock;
7794 vm_offset_t va;
7795 int md_gen, pvh_gen;
7796
7797 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7798 ("pmap_clear_modify: page %p is not managed", m));
7799 VM_OBJECT_ASSERT_WLOCKED(m->object);
7800 KASSERT(!vm_page_xbusied(m),
7801 ("pmap_clear_modify: page %p is exclusive busied", m));
7802
7803 /*
7804 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
7805 * If the object containing the page is locked and the page is not
7806 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
7807 */
7808 if ((m->aflags & PGA_WRITEABLE) == 0)
7809 return;
7810 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
7811 pa_to_pvh(VM_PAGE_TO_PHYS(m));
7812 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7813 rw_wlock(lock);
7814 restart:
7815 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
7816 pmap = PV_PMAP(pv);
7817 if (!PMAP_TRYLOCK(pmap)) {
7818 pvh_gen = pvh->pv_gen;
7819 rw_wunlock(lock);
7820 PMAP_LOCK(pmap);
7821 rw_wlock(lock);
7822 if (pvh_gen != pvh->pv_gen) {
7823 PMAP_UNLOCK(pmap);
7824 goto restart;
7825 }
7826 }
7827 PG_M = pmap_modified_bit(pmap);
7828 PG_RW = pmap_rw_bit(pmap);
7829 va = pv->pv_va;
7830 pde = pmap_pde(pmap, va);
7831 oldpde = *pde;
7832 /* If oldpde has PG_RW set, then it also has PG_M set. */
7833 if ((oldpde & PG_RW) != 0 &&
7834 pmap_demote_pde_locked(pmap, pde, va, &lock) &&
7835 (oldpde & PG_W) == 0) {
7836 /*
7837 * Write protect the mapping to a single page so that
7838 * a subsequent write access may repromote.
7839 */
7840 va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME);
7841 pte = pmap_pde_to_pte(pde, va);
7842 atomic_clear_long(pte, PG_M | PG_RW);
7843 vm_page_dirty(m);
7844 pmap_invalidate_page(pmap, va);
7845 }
7846 PMAP_UNLOCK(pmap);
7847 }
7848 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7849 pmap = PV_PMAP(pv);
7850 if (!PMAP_TRYLOCK(pmap)) {
7851 md_gen = m->md.pv_gen;
7852 pvh_gen = pvh->pv_gen;
7853 rw_wunlock(lock);
7854 PMAP_LOCK(pmap);
7855 rw_wlock(lock);
7856 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
7857 PMAP_UNLOCK(pmap);
7858 goto restart;
7859 }
7860 }
7861 PG_M = pmap_modified_bit(pmap);
7862 PG_RW = pmap_rw_bit(pmap);
7863 pde = pmap_pde(pmap, pv->pv_va);
7864 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
7865 " a 2mpage in page %p's pv list", m));
7866 pte = pmap_pde_to_pte(pde, pv->pv_va);
7867 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
7868 atomic_clear_long(pte, PG_M);
7869 pmap_invalidate_page(pmap, pv->pv_va);
7870 }
7871 PMAP_UNLOCK(pmap);
7872 }
7873 rw_wunlock(lock);
7874 }
7875
7876 /*
7877 * Miscellaneous support routines follow
7878 */
7879
7880 /* Adjust the properties for a leaf page table entry. */
7881 static __inline void
7882 pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask)
7883 {
7884 u_long opte, npte;
7885
7886 opte = *(u_long *)pte;
7887 do {
7888 npte = opte & ~mask;
7889 npte |= bits;
7890 } while (npte != opte && !atomic_fcmpset_long((u_long *)pte, &opte,
7891 npte));
7892 }
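
/*
 * Example use of pmap_pte_props() (a sketch; see
 * pmap_change_props_locked() below for the real callers): switching a
 * 4KB kernel mapping to a new cache mode without disturbing its other
 * attributes,
 *
 *	pte_bits = pmap_cache_bits(kernel_pmap, mode, false);
 *	pmap_pte_props(pte, pte_bits, X86_PG_PTE_CACHE);
 *
 * The fcmpset loop keeps the read-modify-write atomic with respect to
 * concurrent hardware accessed/dirty bit updates of the same entry.
 */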
7893
7894 /*
7895 * Map a set of physical memory pages into the kernel virtual
7896 * address space. Return a pointer to where it is mapped. This
7897 * routine is intended to be used for mapping device memory,
7898 * NOT real memory.
7899 */
7900 static void *
7901 pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags)
7902 {
7903 struct pmap_preinit_mapping *ppim;
7904 vm_offset_t va, offset;
7905 vm_size_t tmpsize;
7906 int i;
7907
7908 offset = pa & PAGE_MASK;
7909 size = round_page(offset + size);
7910 pa = trunc_page(pa);
7911
7912 if (!pmap_initialized) {
7913 va = 0;
7914 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7915 ppim = pmap_preinit_mapping + i;
7916 if (ppim->va == 0) {
7917 ppim->pa = pa;
7918 ppim->sz = size;
7919 ppim->mode = mode;
7920 ppim->va = virtual_avail;
7921 virtual_avail += size;
7922 va = ppim->va;
7923 break;
7924 }
7925 }
7926 if (va == 0)
7927 panic("%s: too many preinit mappings", __func__);
7928 } else {
7929 /*
7930 * If we have a preinit mapping, re-use it.
7931 */
7932 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7933 ppim = pmap_preinit_mapping + i;
7934 if (ppim->pa == pa && ppim->sz == size &&
7935 (ppim->mode == mode ||
7936 (flags & MAPDEV_SETATTR) == 0))
7937 return ((void *)(ppim->va + offset));
7938 }
7939 /*
7940 * If the specified range of physical addresses fits within
7941 * the direct map window, use the direct map.
7942 */
7943 if (pa < dmaplimit && pa + size <= dmaplimit) {
7944 va = PHYS_TO_DMAP(pa);
7945 if ((flags & MAPDEV_SETATTR) != 0) {
7946 PMAP_LOCK(kernel_pmap);
7947 i = pmap_change_props_locked(va, size,
7948 PROT_NONE, mode, flags);
7949 PMAP_UNLOCK(kernel_pmap);
7950 } else
7951 i = 0;
7952 if (!i)
7953 return ((void *)(va + offset));
7954 }
7955 va = kva_alloc(size);
7956 if (va == 0)
7957 panic("%s: Couldn't allocate KVA", __func__);
7958 }
7959 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
7960 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
7961 pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
7962 if ((flags & MAPDEV_FLUSHCACHE) != 0)
7963 pmap_invalidate_cache_range(va, va + tmpsize);
7964 return ((void *)(va + offset));
7965 }
7966
7967 void *
7968 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
7969 {
7970
7971 return (pmap_mapdev_internal(pa, size, mode, MAPDEV_FLUSHCACHE |
7972 MAPDEV_SETATTR));
7973 }
7974
7975 void *
7976 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
7977 {
7978
7979 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
7980 }
7981
7982 void *
7983 pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size)
7984 {
7985
7986 return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE,
7987 MAPDEV_SETATTR));
7988 }
7989
7990 void *
7991 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
7992 {
7993
7994 return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK,
7995 MAPDEV_FLUSHCACHE));
7996 }
7997
7998 void
7999 pmap_unmapdev(vm_offset_t va, vm_size_t size)
8000 {
8001 struct pmap_preinit_mapping *ppim;
8002 vm_offset_t offset;
8003 int i;
8004
8005 /* If pmap_mapdev() gave out a direct map region, do nothing. */
8006 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
8007 return;
8008 offset = va & PAGE_MASK;
8009 size = round_page(offset + size);
8010 va = trunc_page(va);
8011 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
8012 ppim = pmap_preinit_mapping + i;
8013 if (ppim->va == va && ppim->sz == size) {
8014 if (pmap_initialized)
8015 return;
8016 ppim->pa = 0;
8017 ppim->va = 0;
8018 ppim->sz = 0;
8019 ppim->mode = 0;
8020 if (va + size == virtual_avail)
8021 virtual_avail = va;
8022 return;
8023 }
8024 }
8025 if (pmap_initialized) {
8026 pmap_qremove(va, atop(size));
8027 kva_free(va, size);
8028 }
8029 }
8030
8031 /*
8032 * Tries to demote a 1GB page mapping.
8033 */
8034 static boolean_t
8035 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
8036 {
8037 pdp_entry_t newpdpe, oldpdpe;
8038 pd_entry_t *firstpde, newpde, *pde;
8039 pt_entry_t PG_A, PG_M, PG_RW, PG_V;
8040 vm_paddr_t pdpgpa;
8041 vm_page_t pdpg;
8042
8043 PG_A = pmap_accessed_bit(pmap);
8044 PG_M = pmap_modified_bit(pmap);
8045 PG_V = pmap_valid_bit(pmap);
8046 PG_RW = pmap_rw_bit(pmap);
8047
8048 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8049 oldpdpe = *pdpe;
8050 KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
8051 ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
8052 if ((pdpg = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
8053 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
8054 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
8055 " in pmap %p", va, pmap);
8056 return (FALSE);
8057 }
8058 pdpgpa = VM_PAGE_TO_PHYS(pdpg);
8059 firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa);
8060 newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
8061 KASSERT((oldpdpe & PG_A) != 0,
8062 ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
8063 KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
8064 ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
8065 newpde = oldpdpe;
8066
8067 /*
8068 * Initialize the page directory page.
8069 */
8070 for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
8071 *pde = newpde;
8072 newpde += NBPDR;
8073 }
8074
8075 /*
8076 * Demote the mapping.
8077 */
8078 *pdpe = newpdpe;
8079
8080 /*
8081 * Invalidate a stale recursive mapping of the page directory page.
8082 */
8083 pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
8084
8085 pmap_pdpe_demotions++;
8086 CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
8087 " in pmap %p", va, pmap);
8088 return (TRUE);
8089 }
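
/*
 * Sketch of the transformation performed by pmap_demote_pdpe(): the
 * single 1GB mapping
 *
 *	*pdpe = pa | PG_PS | attrs
 *
 * is replaced by a freshly allocated page directory whose 512 entries
 * each map one 2MB slice of the same physical range with the same
 * attributes,
 *
 *	pde[i] = (pa + i * NBPDR) | PG_PS | attrs,	i = 0 .. NPDEPG - 1
 *
 * so the translations seen by consumers of the mapping do not change.
 */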
8090
8091 /*
8092 * Sets the memory attribute for the specified page.
8093 */
8094 void
8095 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
8096 {
8097
8098 m->md.pat_mode = ma;
8099
8100 /*
8101 * If "m" is a normal page, update its direct mapping. This update
8102 * can be relied upon to perform any cache operations that are
8103 * required for data coherence.
8104 */
8105 if ((m->flags & PG_FICTITIOUS) == 0 &&
8106 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
8107 m->md.pat_mode))
8108 panic("memory attribute change on the direct map failed");
8109 }
8110
8111 /*
8112 * Changes the specified virtual address range's memory type to that given by
8113 * the parameter "mode". The specified virtual address range must be
8114 * completely contained within either the direct map or the kernel map. If
8115 * the virtual address range is contained within the kernel map, then the
8116 * memory type for each of the corresponding ranges of the direct map is also
8117 * changed. (The corresponding ranges of the direct map are those ranges that
8118 * map the same physical pages as the specified virtual address range.) These
8119 * changes to the direct map are necessary because Intel describes the
8120 * behavior of their processors as "undefined" if two or more mappings to the
8121 * same physical page have different memory types.
8122 *
8123 * Returns zero if the change completed successfully, and either EINVAL or
8124 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part
8125 * of the virtual address range was not mapped, and ENOMEM is returned if
8126 * there was insufficient memory available to complete the change. In the
8127 * latter case, the memory type may have been changed on some part of the
8128 * virtual address range or the direct map.
8129 */
8130 int
8131 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
8132 {
8133 int error;
8134
8135 PMAP_LOCK(kernel_pmap);
8136 error = pmap_change_props_locked(va, size, PROT_NONE, mode,
8137 MAPDEV_FLUSHCACHE);
8138 PMAP_UNLOCK(kernel_pmap);
8139 return (error);
8140 }
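
/*
 * Example use (a sketch): a driver that wants a physically contiguous
 * buffer mapped write-combining through the direct map might do
 *
 *	va = PHYS_TO_DMAP(pa);
 *	error = pmap_change_attr(va, size, PAT_WRITE_COMBINING);
 *
 * Because the range lies within the direct map, no additional alias
 * updates are needed; for kernel map ranges the direct map aliases are
 * updated automatically, as described above.
 */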
8141
8142 /*
8143 * Changes the specified virtual address range's protections to those
8144 * specified by "prot". Like pmap_change_attr(), protections for aliases
8145 * in the direct map are updated as well. Protections on aliasing mappings may
8146 * be a subset of the requested protections; for example, mappings in the direct
8147 * map are never executable.
8148 */
8149 int
8150 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
8151 {
8152 int error;
8153
8154 /* Only supported within the kernel map. */
8155 if (va < VM_MIN_KERNEL_ADDRESS)
8156 return (EINVAL);
8157
8158 PMAP_LOCK(kernel_pmap);
8159 error = pmap_change_props_locked(va, size, prot, -1,
8160 MAPDEV_ASSERTVALID);
8161 PMAP_UNLOCK(kernel_pmap);
8162 return (error);
8163 }
8164
8165 static int
8166 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
8167 int mode, int flags)
8168 {
8169 vm_offset_t base, offset, tmpva;
8170 vm_paddr_t pa_start, pa_end, pa_end1;
8171 pdp_entry_t *pdpe;
8172 pd_entry_t *pde, pde_bits, pde_mask;
8173 pt_entry_t *pte, pte_bits, pte_mask;
8174 int error;
8175 bool changed;
8176
8177 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
8178 base = trunc_page(va);
8179 offset = va & PAGE_MASK;
8180 size = round_page(offset + size);
8181
8182 /*
8183 * Only supported on kernel virtual addresses, including the direct
8184 * map but excluding the recursive map.
8185 */
8186 if (base < DMAP_MIN_ADDRESS)
8187 return (EINVAL);
8188
8189 /*
8190 * Construct our flag sets and masks. "bits" is the subset of
8191 * "mask" that will be set in each modified PTE.
8192 *
8193 * Mappings in the direct map are never allowed to be executable.
8194 */
8195 pde_bits = pte_bits = 0;
8196 pde_mask = pte_mask = 0;
8197 if (mode != -1) {
8198 pde_bits |= pmap_cache_bits(kernel_pmap, mode, true);
8199 pde_mask |= X86_PG_PDE_CACHE;
8200 pte_bits |= pmap_cache_bits(kernel_pmap, mode, false);
8201 pte_mask |= X86_PG_PTE_CACHE;
8202 }
8203 if (prot != VM_PROT_NONE) {
8204 if ((prot & VM_PROT_WRITE) != 0) {
8205 pde_bits |= X86_PG_RW;
8206 pte_bits |= X86_PG_RW;
8207 }
8208 if ((prot & VM_PROT_EXECUTE) == 0 ||
8209 va < VM_MIN_KERNEL_ADDRESS) {
8210 pde_bits |= pg_nx;
8211 pte_bits |= pg_nx;
8212 }
8213 pde_mask |= X86_PG_RW | pg_nx;
8214 pte_mask |= X86_PG_RW | pg_nx;
8215 }
8216
8217 /*
8218 * Pages that aren't mapped aren't supported. Also break down 1GB and
8219 * 2MB pages into smaller pages if required.
8220 */
8221 for (tmpva = base; tmpva < base + size; ) {
8222 pdpe = pmap_pdpe(kernel_pmap, tmpva);
8223 if (pdpe == NULL || *pdpe == 0) {
8224 KASSERT((flags & MAPDEV_ASSERTVALID) == 0,
8225 ("%s: addr %#lx is not mapped", __func__, tmpva));
8226 return (EINVAL);
8227 }
8228 if (*pdpe & PG_PS) {
8229 /*
8230 * If the current 1GB page already has the required
8231 * properties, then we need not demote this page. Just
8232 * increment tmpva to the next 1GB page frame.
8233 */
8234 if ((*pdpe & pde_mask) == pde_bits) {
8235 tmpva = trunc_1gpage(tmpva) + NBPDP;
8236 continue;
8237 }
8238
8239 /*
8240 * If the current offset aligns with a 1GB page frame
8241 * and there is at least 1GB left within the range, then
8242 * we need not break down this page into 2MB pages.
8243 */
8244 if ((tmpva & PDPMASK) == 0 &&
8245 tmpva + PDPMASK < base + size) {
8246 tmpva += NBPDP;
8247 continue;
8248 }
8249 if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
8250 return (ENOMEM);
8251 }
8252 pde = pmap_pdpe_to_pde(pdpe, tmpva);
8253 if (*pde == 0) {
8254 KASSERT((flags & MAPDEV_ASSERTVALID) == 0,
8255 ("%s: addr %#lx is not mapped", __func__, tmpva));
8256 return (EINVAL);
8257 }
8258 if (*pde & PG_PS) {
8259 /*
8260 * If the current 2MB page already has the required
8261 * properties, then we need not demote this page. Just
8262 * increment tmpva to the next 2MB page frame.
8263 */
8264 if ((*pde & pde_mask) == pde_bits) {
8265 tmpva = trunc_2mpage(tmpva) + NBPDR;
8266 continue;
8267 }
8268
8269 /*
8270 * If the current offset aligns with a 2MB page frame
8271 * and there is at least 2MB left within the range, then
8272 * we need not break down this page into 4KB pages.
8273 */
8274 if ((tmpva & PDRMASK) == 0 &&
8275 tmpva + PDRMASK < base + size) {
8276 tmpva += NBPDR;
8277 continue;
8278 }
8279 if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
8280 return (ENOMEM);
8281 }
8282 pte = pmap_pde_to_pte(pde, tmpva);
8283 if (*pte == 0) {
8284 KASSERT((flags & MAPDEV_ASSERTVALID) == 0,
8285 ("%s: addr %#lx is not mapped", __func__, tmpva));
8286 return (EINVAL);
8287 }
8288 tmpva += PAGE_SIZE;
8289 }
8290 error = 0;
8291
8292 /*
8293 * Ok, all the pages exist, so run through them updating their
8294 * properties if required.
8295 */
8296 changed = false;
8297 pa_start = pa_end = 0;
8298 for (tmpva = base; tmpva < base + size; ) {
8299 pdpe = pmap_pdpe(kernel_pmap, tmpva);
8300 if (*pdpe & PG_PS) {
8301 if ((*pdpe & pde_mask) != pde_bits) {
8302 pmap_pte_props(pdpe, pde_bits, pde_mask);
8303 changed = true;
8304 }
8305 if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
8306 (*pdpe & PG_PS_FRAME) < dmaplimit) {
8307 if (pa_start == pa_end) {
8308 /* Start physical address run. */
8309 pa_start = *pdpe & PG_PS_FRAME;
8310 pa_end = pa_start + NBPDP;
8311 } else if (pa_end == (*pdpe & PG_PS_FRAME))
8312 pa_end += NBPDP;
8313 else {
8314 /* Run ended, update direct map. */
8315 error = pmap_change_props_locked(
8316 PHYS_TO_DMAP(pa_start),
8317 pa_end - pa_start, prot, mode,
8318 flags);
8319 if (error != 0)
8320 break;
8321 /* Start physical address run. */
8322 pa_start = *pdpe & PG_PS_FRAME;
8323 pa_end = pa_start + NBPDP;
8324 }
8325 }
8326 tmpva = trunc_1gpage(tmpva) + NBPDP;
8327 continue;
8328 }
8329 pde = pmap_pdpe_to_pde(pdpe, tmpva);
8330 if (*pde & PG_PS) {
8331 if ((*pde & pde_mask) != pde_bits) {
8332 pmap_pte_props(pde, pde_bits, pde_mask);
8333 changed = true;
8334 }
8335 if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
8336 (*pde & PG_PS_FRAME) < dmaplimit) {
8337 if (pa_start == pa_end) {
8338 /* Start physical address run. */
8339 pa_start = *pde & PG_PS_FRAME;
8340 pa_end = pa_start + NBPDR;
8341 } else if (pa_end == (*pde & PG_PS_FRAME))
8342 pa_end += NBPDR;
8343 else {
8344 /* Run ended, update direct map. */
8345 error = pmap_change_props_locked(
8346 PHYS_TO_DMAP(pa_start),
8347 pa_end - pa_start, prot, mode,
8348 flags);
8349 if (error != 0)
8350 break;
8351 /* Start physical address run. */
8352 pa_start = *pde & PG_PS_FRAME;
8353 pa_end = pa_start + NBPDR;
8354 }
8355 }
8356 tmpva = trunc_2mpage(tmpva) + NBPDR;
8357 } else {
8358 pte = pmap_pde_to_pte(pde, tmpva);
8359 if ((*pte & pte_mask) != pte_bits) {
8360 pmap_pte_props(pte, pte_bits, pte_mask);
8361 changed = true;
8362 }
8363 if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
8364 (*pte & PG_FRAME) < dmaplimit) {
8365 if (pa_start == pa_end) {
8366 /* Start physical address run. */
8367 pa_start = *pte & PG_FRAME;
8368 pa_end = pa_start + PAGE_SIZE;
8369 } else if (pa_end == (*pte & PG_FRAME))
8370 pa_end += PAGE_SIZE;
8371 else {
8372 /* Run ended, update direct map. */
8373 error = pmap_change_props_locked(
8374 PHYS_TO_DMAP(pa_start),
8375 pa_end - pa_start, prot, mode,
8376 flags);
8377 if (error != 0)
8378 break;
8379 /* Start physical address run. */
8380 pa_start = *pte & PG_FRAME;
8381 pa_end = pa_start + PAGE_SIZE;
8382 }
8383 }
8384 tmpva += PAGE_SIZE;
8385 }
8386 }
8387 if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
8388 pa_end1 = MIN(pa_end, dmaplimit);
8389 if (pa_start != pa_end1)
8390 error = pmap_change_props_locked(PHYS_TO_DMAP(pa_start),
8391 pa_end1 - pa_start, prot, mode, flags);
8392 }
8393
8394 /*
8395 * Flush the CPU caches if required, to make sure that no stale data
8396 * remains cached for the modified range.
8397 */
8398 if (changed) {
8399 pmap_invalidate_range(kernel_pmap, base, tmpva);
8400 if ((flags & MAPDEV_FLUSHCACHE) != 0)
8401 pmap_invalidate_cache_range(base, tmpva);
8402 }
8403 return (error);
8404 }
8405
8406 /*
8407 * Demotes any mapping within the direct map region that covers more than the
8408 * specified range of physical addresses. This range's size must be a power
8409 * of two and its starting address must be a multiple of its size. Since the
8410 * demotion does not change any attributes of the mapping, a TLB invalidation
8411 * is not mandatory. The caller may, however, request a TLB invalidation.
8412 */
8413 void
8414 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
8415 {
8416 pdp_entry_t *pdpe;
8417 pd_entry_t *pde;
8418 vm_offset_t va;
8419 boolean_t changed;
8420
8421 if (len == 0)
8422 return;
8423 KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
8424 KASSERT((base & (len - 1)) == 0,
8425 ("pmap_demote_DMAP: base is not a multiple of len"));
8426 if (len < NBPDP && base < dmaplimit) {
8427 va = PHYS_TO_DMAP(base);
8428 changed = FALSE;
8429 PMAP_LOCK(kernel_pmap);
8430 pdpe = pmap_pdpe(kernel_pmap, va);
8431 if ((*pdpe & X86_PG_V) == 0)
8432 panic("pmap_demote_DMAP: invalid PDPE");
8433 if ((*pdpe & PG_PS) != 0) {
8434 if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
8435 panic("pmap_demote_DMAP: PDPE failed");
8436 changed = TRUE;
8437 }
8438 if (len < NBPDR) {
8439 pde = pmap_pdpe_to_pde(pdpe, va);
8440 if ((*pde & X86_PG_V) == 0)
8441 panic("pmap_demote_DMAP: invalid PDE");
8442 if ((*pde & PG_PS) != 0) {
8443 if (!pmap_demote_pde(kernel_pmap, pde, va))
8444 panic("pmap_demote_DMAP: PDE failed");
8445 changed = TRUE;
8446 }
8447 }
8448 if (changed && invalidate)
8449 pmap_invalidate_page(kernel_pmap, va);
8450 PMAP_UNLOCK(kernel_pmap);
8451 }
8452 }
8453
8454 /*
8455 * Perform the pmap work for mincore().
8456 */
8457 int
8458 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
8459 {
8460 pd_entry_t *pdep;
8461 pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
8462 vm_paddr_t pa;
8463 int val;
8464
8465 PG_A = pmap_accessed_bit(pmap);
8466 PG_M = pmap_modified_bit(pmap);
8467 PG_V = pmap_valid_bit(pmap);
8468 PG_RW = pmap_rw_bit(pmap);
8469
8470 PMAP_LOCK(pmap);
8471 retry:
8472 pdep = pmap_pde(pmap, addr);
8473 if (pdep != NULL && (*pdep & PG_V)) {
8474 if (*pdep & PG_PS) {
8475 pte = *pdep;
8476 /* Compute the physical address of the 4KB page. */
8477 pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
8478 PG_FRAME;
8479 val = MINCORE_SUPER;
8480 } else {
8481 pte = *pmap_pde_to_pte(pdep, addr);
8482 pa = pte & PG_FRAME;
8483 val = 0;
8484 }
8485 } else {
8486 pte = 0;
8487 pa = 0;
8488 val = 0;
8489 }
8490 if ((pte & PG_V) != 0) {
8491 val |= MINCORE_INCORE;
8492 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
8493 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
8494 if ((pte & PG_A) != 0)
8495 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
8496 }
8497 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
8498 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
8499 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
8500 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
8501 if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
8502 goto retry;
8503 } else
8504 PA_UNLOCK_COND(*locked_pa);
8505 PMAP_UNLOCK(pmap);
8506 return (val);
8507 }
8508
8509 static uint64_t
8510 pmap_pcid_alloc(pmap_t pmap, u_int cpuid)
8511 {
8512 uint32_t gen, new_gen, pcid_next;
8513
8514 CRITICAL_ASSERT(curthread);
8515 gen = PCPU_GET(pcid_gen);
8516 if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN)
8517 return (pti ? 0 : CR3_PCID_SAVE);
8518 if (pmap->pm_pcids[cpuid].pm_gen == gen)
8519 return (CR3_PCID_SAVE);
8520 pcid_next = PCPU_GET(pcid_next);
8521 KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) ||
8522 (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN),
8523 ("cpu %d pcid_next %#x", cpuid, pcid_next));
8524 if ((!pti && pcid_next == PMAP_PCID_OVERMAX) ||
8525 (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) {
8526 new_gen = gen + 1;
8527 if (new_gen == 0)
8528 new_gen = 1;
8529 PCPU_SET(pcid_gen, new_gen);
8530 pcid_next = PMAP_PCID_KERN + 1;
8531 } else {
8532 new_gen = gen;
8533 }
8534 pmap->pm_pcids[cpuid].pm_pcid = pcid_next;
8535 pmap->pm_pcids[cpuid].pm_gen = new_gen;
8536 PCPU_SET(pcid_next, pcid_next + 1);
8537 return (0);
8538 }
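
/*
 * An illustrative timeline for the per-CPU allocator above: PCIDs are
 * handed out sequentially from pcid_next and each pmap is stamped with
 * the CPU's current pcid_gen.  A pmap whose stored generation still
 * matches may keep its cached TLB entries (CR3_PCID_SAVE is returned);
 * otherwise it receives a new PCID and a return value of 0, forcing a
 * flush of that PCID's entries on the CR3 load.  When pcid_next hits
 * the per-mode maximum (PMAP_PCID_OVERMAX, or PMAP_PCID_OVERMAX_KERN
 * with PTI), pcid_gen is bumped, which implicitly invalidates every
 * PCID handed out earlier on that CPU, and allocation restarts at
 * PMAP_PCID_KERN + 1.
 */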
8539
8540 static uint64_t
8541 pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid)
8542 {
8543 uint64_t cached;
8544
8545 cached = pmap_pcid_alloc(pmap, cpuid);
8546 KASSERT(pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX,
8547 ("pmap %p cpu %d pcid %#x", pmap, cpuid,
8548 pmap->pm_pcids[cpuid].pm_pcid));
8549 KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN ||
8550 pmap == kernel_pmap,
8551 ("non-kernel pmap pmap %p cpu %d pcid %#x",
8552 pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid));
8553 return (cached);
8554 }
8555
8556 static void
8557 pmap_activate_sw_pti_post(struct thread *td, pmap_t pmap)
8558 {
8559
8560 PCPU_GET(tssp)->tss_rsp0 = pmap->pm_ucr3 != PMAP_NO_CR3 ?
8561 PCPU_GET(pti_rsp0) : (uintptr_t)td->td_md.md_stack_base;
8562 }
8563
8564 static inline void
8565 pmap_activate_sw_pcid_pti(pmap_t pmap, u_int cpuid, const bool invpcid_works1)
8566 {
8567 struct invpcid_descr d;
8568 uint64_t cached, cr3, kcr3, ucr3;
8569
8570 cached = pmap_pcid_alloc_checked(pmap, cpuid);
8571 cr3 = rcr3();
8572 if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
8573 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid);
8574 PCPU_SET(curpmap, pmap);
8575 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid;
8576 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid |
8577 PMAP_PCID_USER_PT;
8578
8579 if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) {
8580 /*
8581 * Explicitly invalidate translations cached from the
8582 * user page table. They are not automatically
8583 * flushed by reload of cr3 with the kernel page table
8584 * pointer above.
8585 *
8586 * Note that the if() condition is resolved statically
8587 * by using the function argument instead of
8588 * runtime-evaluated invpcid_works value.
8589 */
8590 if (invpcid_works1) {
8591 d.pcid = PMAP_PCID_USER_PT |
8592 pmap->pm_pcids[cpuid].pm_pcid;
8593 d.pad = 0;
8594 d.addr = 0;
8595 invpcid(&d, INVPCID_CTX);
8596 } else {
8597 pmap_pti_pcid_invalidate(ucr3, kcr3);
8598 }
8599 }
8600
8601 PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE);
8602 PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE);
8603 if (cached)
8604 PCPU_INC(pm_save_cnt);
8605 }
8606
8607 static void
8608 pmap_activate_sw_pcid_invpcid_pti(struct thread *td, pmap_t pmap, u_int cpuid)
8609 {
8610
8611 pmap_activate_sw_pcid_pti(pmap, cpuid, true);
8612 pmap_activate_sw_pti_post(td, pmap);
8613 }
8614
8615 static void
8616 pmap_activate_sw_pcid_noinvpcid_pti(struct thread *td, pmap_t pmap,
8617 u_int cpuid)
8618 {
8619 register_t rflags;
8620
8621 /*
8622 * If the INVPCID instruction is not available,
8623 * invltlb_pcid_handler() is used to handle an invalidate_all
8624 * IPI, which checks for curpmap == smp_tlb_pmap. The below
8625 * sequence of operations has a window where %CR3 is loaded
8626 * with the new pmap's PML4 address, but the curpmap value has
8627 * not yet been updated. This causes the invltlb IPI handler,
8628 * which is called between the updates, to execute as a NOP,
8629 * which leaves stale TLB entries.
8630 *
8631 * Note that the most typical use of pmap_activate_sw(), from
8632 * the context switch, is immune to this race, because
8633 * interrupts are disabled (while the thread lock is owned),
8634 * and the IPI happens after curpmap is updated. Protect
8635 * other callers in a similar way, by disabling interrupts
8636 * around the %cr3 register reload and curpmap assignment.
8637 */
8638 rflags = intr_disable();
8639 pmap_activate_sw_pcid_pti(pmap, cpuid, false);
8640 intr_restore(rflags);
8641 pmap_activate_sw_pti_post(td, pmap);
8642 }
8643
8644 static void
8645 pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap,
8646 u_int cpuid)
8647 {
8648 uint64_t cached, cr3;
8649
8650 cached = pmap_pcid_alloc_checked(pmap, cpuid);
8651 cr3 = rcr3();
8652 if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
8653 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid |
8654 cached);
8655 PCPU_SET(curpmap, pmap);
8656 if (cached)
8657 PCPU_INC(pm_save_cnt);
8658 }
8659
8660 static void
8661 pmap_activate_sw_pcid_noinvpcid_nopti(struct thread *td __unused, pmap_t pmap,
8662 u_int cpuid)
8663 {
8664 register_t rflags;
8665
8666 rflags = intr_disable();
8667 pmap_activate_sw_pcid_nopti(td, pmap, cpuid);
8668 intr_restore(rflags);
8669 }
8670
8671 static void
8672 pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap,
8673 u_int cpuid __unused)
8674 {
8675
8676 load_cr3(pmap->pm_cr3);
8677 PCPU_SET(curpmap, pmap);
8678 }
8679
8680 static void
8681 pmap_activate_sw_nopcid_pti(struct thread *td, pmap_t pmap,
8682 u_int cpuid __unused)
8683 {
8684
8685 pmap_activate_sw_nopcid_nopti(td, pmap, cpuid);
8686 PCPU_SET(kcr3, pmap->pm_cr3);
8687 PCPU_SET(ucr3, pmap->pm_ucr3);
8688 pmap_activate_sw_pti_post(td, pmap);
8689 }
8690
8691 DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (struct thread *, pmap_t,
8692 u_int), static)
8693 {
8694
8695 if (pmap_pcid_enabled && pti && invpcid_works)
8696 return (pmap_activate_sw_pcid_invpcid_pti);
8697 else if (pmap_pcid_enabled && pti && !invpcid_works)
8698 return (pmap_activate_sw_pcid_noinvpcid_pti);
8699 else if (pmap_pcid_enabled && !pti && invpcid_works)
8700 return (pmap_activate_sw_pcid_nopti);
8701 else if (pmap_pcid_enabled && !pti && !invpcid_works)
8702 return (pmap_activate_sw_pcid_noinvpcid_nopti);
8703 else if (!pmap_pcid_enabled && pti)
8704 return (pmap_activate_sw_nopcid_pti);
8705 else /* if (!pmap_pcid_enabled && !pti) */
8706 return (pmap_activate_sw_nopcid_nopti);
8707 }
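
/*
 * The resolver above runs once, during ifunc resolution early at boot,
 * so pmap_activate_sw() pays no per-call branching for the
 * PCID/PTI/INVPCID combination.  The selection reduces to:
 *
 *	pcid	pti	invpcid		handler
 *	yes	yes	yes		pmap_activate_sw_pcid_invpcid_pti
 *	yes	yes	no		pmap_activate_sw_pcid_noinvpcid_pti
 *	yes	no	yes		pmap_activate_sw_pcid_nopti
 *	yes	no	no		pmap_activate_sw_pcid_noinvpcid_nopti
 *	no	yes	-		pmap_activate_sw_nopcid_pti
 *	no	no	-		pmap_activate_sw_nopcid_nopti
 */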
8708
8709 void
8710 pmap_activate_sw(struct thread *td)
8711 {
8712 pmap_t oldpmap, pmap;
8713 u_int cpuid;
8714
8715 oldpmap = PCPU_GET(curpmap);
8716 pmap = vmspace_pmap(td->td_proc->p_vmspace);
8717 if (oldpmap == pmap) {
8718 if (cpu_vendor_id != CPU_VENDOR_INTEL)
8719 mfence();
8720 return;
8721 }
8722 cpuid = PCPU_GET(cpuid);
8723 #ifdef SMP
8724 CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
8725 #else
8726 CPU_SET(cpuid, &pmap->pm_active);
8727 #endif
8728 pmap_activate_sw_mode(td, pmap, cpuid);
8729 #ifdef SMP
8730 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
8731 #else
8732 CPU_CLR(cpuid, &oldpmap->pm_active);
8733 #endif
8734 }
8735
8736 void
8737 pmap_activate(struct thread *td)
8738 {
8739
8740 critical_enter();
8741 pmap_activate_sw(td);
8742 critical_exit();
8743 }
8744
8745 void
8746 pmap_activate_boot(pmap_t pmap)
8747 {
8748 uint64_t kcr3;
8749 u_int cpuid;
8750
8751 /*
8752 * kernel_pmap must never be deactivated, and we ensure that
8753 * by never activating it at all.
8754 */
8755 MPASS(pmap != kernel_pmap);
8756
8757 cpuid = PCPU_GET(cpuid);
8758 #ifdef SMP
8759 CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
8760 #else
8761 CPU_SET(cpuid, &pmap->pm_active);
8762 #endif
8763 PCPU_SET(curpmap, pmap);
8764 if (pti) {
8765 kcr3 = pmap->pm_cr3;
8766 if (pmap_pcid_enabled)
8767 kcr3 |= pmap->pm_pcids[cpuid].pm_pcid | CR3_PCID_SAVE;
8768 } else {
8769 kcr3 = PMAP_NO_CR3;
8770 }
8771 PCPU_SET(kcr3, kcr3);
8772 PCPU_SET(ucr3, PMAP_NO_CR3);
8773 }
8774
8775 void
8776 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
8777 {
8778 }
8779
8780 /*
8781 * Increase the starting virtual address of the given mapping if a
8782 * different alignment might result in more superpage mappings.
8783 */
8784 void
8785 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
8786 vm_offset_t *addr, vm_size_t size)
8787 {
8788 vm_offset_t superpage_offset;
8789
8790 if (size < NBPDR)
8791 return;
8792 if (object != NULL && (object->flags & OBJ_COLORED) != 0)
8793 offset += ptoa(object->pg_color);
8794 superpage_offset = offset & PDRMASK;
8795 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
8796 (*addr & PDRMASK) == superpage_offset)
8797 return;
8798 if ((*addr & PDRMASK) < superpage_offset)
8799 *addr = (*addr & ~PDRMASK) + superpage_offset;
8800 else
8801 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
8802 }
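
/*
 * Worked example (illustrative values): a 4MB mapping of an object at
 * offset 0x1000 with a caller-proposed *addr of 0x800000.  Here
 * superpage_offset = 0x1000 and (*addr & PDRMASK) = 0, so *addr is
 * adjusted to 0x801000.  The virtual address and the object offset then
 * agree modulo NBPDR, allowing the interior of the mapping to be
 * promoted to 2MB pages.
 */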
8803
8804 #ifdef INVARIANTS
8805 static unsigned long num_dirty_emulations;
8806 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
8807 &num_dirty_emulations, 0, NULL);
8808
8809 static unsigned long num_accessed_emulations;
8810 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
8811 &num_accessed_emulations, 0, NULL);
8812
8813 static unsigned long num_superpage_accessed_emulations;
8814 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW,
8815 &num_superpage_accessed_emulations, 0, NULL);
8816
8817 static unsigned long ad_emulation_superpage_promotions;
8818 SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW,
8819 &ad_emulation_superpage_promotions, 0, NULL);
8820 #endif /* INVARIANTS */
8821
8822 int
8823 pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
8824 {
8825 int rv;
8826 struct rwlock *lock;
8827 #if VM_NRESERVLEVEL > 0
8828 vm_page_t m, mpte;
8829 #endif
8830 pd_entry_t *pde;
8831 pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
8832
8833 KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
8834 ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));
8835
8836 if (!pmap_emulate_ad_bits(pmap))
8837 return (-1);
8838
8839 PG_A = pmap_accessed_bit(pmap);
8840 PG_M = pmap_modified_bit(pmap);
8841 PG_V = pmap_valid_bit(pmap);
8842 PG_RW = pmap_rw_bit(pmap);
8843
8844 rv = -1;
8845 lock = NULL;
8846 PMAP_LOCK(pmap);
8847
8848 pde = pmap_pde(pmap, va);
8849 if (pde == NULL || (*pde & PG_V) == 0)
8850 goto done;
8851
8852 if ((*pde & PG_PS) != 0) {
8853 if (ftype == VM_PROT_READ) {
8854 #ifdef INVARIANTS
8855 atomic_add_long(&num_superpage_accessed_emulations, 1);
8856 #endif
8857 *pde |= PG_A;
8858 rv = 0;
8859 }
8860 goto done;
8861 }
8862
8863 pte = pmap_pde_to_pte(pde, va);
8864 if ((*pte & PG_V) == 0)
8865 goto done;
8866
8867 if (ftype == VM_PROT_WRITE) {
8868 if ((*pte & PG_RW) == 0)
8869 goto done;
8870 /*
8871 * Set the modified and accessed bits simultaneously.
8872 *
8873 * Intel EPT PTEs that do software emulation of A/D bits map
8874 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively.
8875 * An EPT misconfiguration is triggered if the PTE is writable
8876 * but not readable (WR=10). This is avoided by setting PG_A
8877 * and PG_M simultaneously.
8878 */
8879 *pte |= PG_M | PG_A;
8880 } else {
8881 *pte |= PG_A;
8882 }
8883
8884 #if VM_NRESERVLEVEL > 0
8885 /* try to promote the mapping */
8886 if (va < VM_MAXUSER_ADDRESS)
8887 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
8888 else
8889 mpte = NULL;
8890
8891 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
8892
8893 if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
8894 pmap_ps_enabled(pmap) &&
8895 (m->flags & PG_FICTITIOUS) == 0 &&
8896 vm_reserv_level_iffullpop(m) == 0) {
8897 pmap_promote_pde(pmap, pde, va, &lock);
8898 #ifdef INVARIANTS
8899 atomic_add_long(&ad_emulation_superpage_promotions, 1);
8900 #endif
8901 }
8902 #endif
8903
8904 #ifdef INVARIANTS
8905 if (ftype == VM_PROT_WRITE)
8906 atomic_add_long(&num_dirty_emulations, 1);
8907 else
8908 atomic_add_long(&num_accessed_emulations, 1);
8909 #endif
8910 rv = 0; /* success */
8911 done:
8912 if (lock != NULL)
8913 rw_wunlock(lock);
8914 PMAP_UNLOCK(pmap);
8915 return (rv);
8916 }
8917
8918 void
8919 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
8920 {
8921 pml4_entry_t *pml4;
8922 pdp_entry_t *pdp;
8923 pd_entry_t *pde;
8924 pt_entry_t *pte, PG_V;
8925 int idx;
8926
8927 idx = 0;
8928 PG_V = pmap_valid_bit(pmap);
8929 PMAP_LOCK(pmap);
8930
8931 pml4 = pmap_pml4e(pmap, va);
8932 ptr[idx++] = *pml4;
8933 if ((*pml4 & PG_V) == 0)
8934 goto done;
8935
8936 pdp = pmap_pml4e_to_pdpe(pml4, va);
8937 ptr[idx++] = *pdp;
8938 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0)
8939 goto done;
8940
8941 pde = pmap_pdpe_to_pde(pdp, va);
8942 ptr[idx++] = *pde;
8943 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
8944 goto done;
8945
8946 pte = pmap_pde_to_pte(pde, va);
8947 ptr[idx++] = *pte;
8948
8949 done:
8950 PMAP_UNLOCK(pmap);
8951 *num = idx;
8952 }
8953
8954 /**
8955 * Get the kernel virtual address of a set of physical pages. If there are
8956 * physical addresses not covered by the DMAP perform a transient mapping
8957 * that will be removed when calling pmap_unmap_io_transient.
8958 *
8959 * \param page The pages the caller wishes to obtain kernel virtual
8960 * addresses for.
8961 * \param vaddr On return contains the kernel virtual memory address
8962 * of the pages passed in the page parameter.
8963 * \param count Number of pages passed in.
8964 * \param can_fault TRUE if the thread using the mapped pages can take
8965 * page faults, FALSE otherwise.
8966 *
8967 * \returns TRUE if the caller must call pmap_unmap_io_transient when
8968 * finished or FALSE otherwise.
8969 *
8970 */
8971 boolean_t
8972 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
8973 boolean_t can_fault)
8974 {
8975 vm_paddr_t paddr;
8976 boolean_t needs_mapping;
8977 pt_entry_t *pte;
8978 int cache_bits, error __unused, i;
8979
8980 /*
8981 * Allocate any KVA space that we need; this is done in a separate
8982 * loop to prevent calling vmem_alloc while pinned.
8983 */
8984 needs_mapping = FALSE;
8985 for (i = 0; i < count; i++) {
8986 paddr = VM_PAGE_TO_PHYS(page[i]);
8987 if (__predict_false(paddr >= dmaplimit)) {
8988 error = vmem_alloc(kernel_arena, PAGE_SIZE,
8989 M_BESTFIT | M_WAITOK, &vaddr[i]);
8990 KASSERT(error == 0, ("vmem_alloc failed: %d", error));
8991 needs_mapping = TRUE;
8992 } else {
8993 vaddr[i] = PHYS_TO_DMAP(paddr);
8994 }
8995 }
8996
8997 /* Exit early if everything is covered by the DMAP */
8998 if (!needs_mapping)
8999 return (FALSE);
9000
9001 /*
9002 * NB: The sequence of updating a page table followed by accesses
9003 * to the corresponding pages used in the !DMAP case is subject to
9004 * the situation described in the "AMD64 Architecture Programmer's
9005 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
9006 * Coherency Considerations". Therefore, issuing the INVLPG right
9007 * after modifying the PTE bits is crucial.
9008 */
9009 if (!can_fault)
9010 sched_pin();
9011 for (i = 0; i < count; i++) {
9012 paddr = VM_PAGE_TO_PHYS(page[i]);
9013 if (paddr >= dmaplimit) {
9014 if (can_fault) {
9015 /*
9016 * Slow path: since we can get page faults
9017 * while mappings are active, don't pin the
9018 * thread to the CPU; instead add a global
9019 * mapping visible to all CPUs.
9020 */
9021 pmap_qenter(vaddr[i], &page[i], 1);
9022 } else {
9023 pte = vtopte(vaddr[i]);
9024 cache_bits = pmap_cache_bits(kernel_pmap,
9025 page[i]->md.pat_mode, 0);
9026 pte_store(pte, paddr | X86_PG_RW | X86_PG_V |
9027 cache_bits);
9028 invlpg(vaddr[i]);
9029 }
9030 }
9031 }
9032
9033 return (needs_mapping);
9034 }
9035
9036 void
9037 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
9038 boolean_t can_fault)
9039 {
9040 vm_paddr_t paddr;
9041 int i;
9042
9043 if (!can_fault)
9044 sched_unpin();
9045 for (i = 0; i < count; i++) {
9046 paddr = VM_PAGE_TO_PHYS(page[i]);
9047 if (paddr >= dmaplimit) {
9048 if (can_fault)
9049 pmap_qremove(vaddr[i], 1);
9050 vmem_free(kernel_arena, vaddr[i], PAGE_SIZE);
9051 }
9052 }
9053 }
9054
9055 vm_offset_t
9056 pmap_quick_enter_page(vm_page_t m)
9057 {
9058 vm_paddr_t paddr;
9059
9060 paddr = VM_PAGE_TO_PHYS(m);
9061 if (paddr < dmaplimit)
9062 return (PHYS_TO_DMAP(paddr));
9063 mtx_lock_spin(&qframe_mtx);
9064 KASSERT(*vtopte(qframe) == 0, ("qframe busy"));
9065 pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A |
9066 X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0));
9067 return (qframe);
9068 }
9069
9070 void
9071 pmap_quick_remove_page(vm_offset_t addr)
9072 {
9073
9074 if (addr != qframe)
9075 return;
9076 pte_store(vtopte(qframe), 0);
9077 invlpg(qframe);
9078 mtx_unlock_spin(&qframe_mtx);
9079 }
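
/*
 * Typical use of the quick-mapping pair above (a sketch), e.g. to zero
 * one page that may lie above dmaplimit:
 *
 *	vm_offset_t qva;
 *
 *	qva = pmap_quick_enter_page(m);
 *	bzero((void *)qva, PAGE_SIZE);
 *	pmap_quick_remove_page(qva);
 *
 * When the page is covered by the direct map, the enter and remove
 * calls degenerate to an address computation and a no-op, respectively;
 * otherwise the single qframe slot is used under qframe_mtx.
 */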
9080
9081 /*
9082 * Pdp pages from the large map are managed differently from either
9083 * kernel or user page table pages. They are permanently allocated at
9084 * initialization time, and their wire count is permanently set to
9085 * zero. The pml4 entries pointing to those pages are copied into
9086 * each allocated pmap.
9087 *
9088 * In contrast, pd and pt pages are managed like user page table
9089 * pages. They are dynamically allocated, and their wire count
9090 * represents the number of valid entries within the page.
9091 */
9092 static vm_page_t
9093 pmap_large_map_getptp_unlocked(void)
9094 {
9095 vm_page_t m;
9096
9097 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
9098 VM_ALLOC_ZERO);
9099 if (m != NULL && (m->flags & PG_ZERO) == 0)
9100 pmap_zero_page(m);
9101 return (m);
9102 }
9103
9104 static vm_page_t
9105 pmap_large_map_getptp(void)
9106 {
9107 vm_page_t m;
9108
9109 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
9110 m = pmap_large_map_getptp_unlocked();
9111 if (m == NULL) {
9112 PMAP_UNLOCK(kernel_pmap);
9113 vm_wait(NULL);
9114 PMAP_LOCK(kernel_pmap);
9115 /* Callers retry. */
9116 }
9117 return (m);
9118 }
9119
9120 static pdp_entry_t *
9121 pmap_large_map_pdpe(vm_offset_t va)
9122 {
9123 vm_pindex_t pml4_idx;
9124 vm_paddr_t mphys;
9125
9126 pml4_idx = pmap_pml4e_index(va);
9127 KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents,
9128 ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I "
9129 "%#jx lm_ents %d",
9130 (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
9131 KASSERT((kernel_pmap->pm_pml4[pml4_idx] & X86_PG_V) != 0,
9132 ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
9133 "LMSPML4I %#jx lm_ents %d",
9134 (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
9135 mphys = kernel_pmap->pm_pml4[pml4_idx] & PG_FRAME;
9136 return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va));
9137 }
9138
9139 static pd_entry_t *
9140 pmap_large_map_pde(vm_offset_t va)
9141 {
9142 pdp_entry_t *pdpe;
9143 vm_page_t m;
9144 vm_paddr_t mphys;
9145
9146 retry:
9147 pdpe = pmap_large_map_pdpe(va);
9148 if (*pdpe == 0) {
9149 m = pmap_large_map_getptp();
9150 if (m == NULL)
9151 goto retry;
9152 mphys = VM_PAGE_TO_PHYS(m);
9153 *pdpe = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx;
9154 } else {
9155 MPASS((*pdpe & X86_PG_PS) == 0);
9156 mphys = *pdpe & PG_FRAME;
9157 }
9158 return ((pd_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pde_index(va));
9159 }
9160
9161 static pt_entry_t *
9162 pmap_large_map_pte(vm_offset_t va)
9163 {
9164 pd_entry_t *pde;
9165 vm_page_t m;
9166 vm_paddr_t mphys;
9167
9168 retry:
9169 pde = pmap_large_map_pde(va);
9170 if (*pde == 0) {
9171 m = pmap_large_map_getptp();
9172 if (m == NULL)
9173 goto retry;
9174 mphys = VM_PAGE_TO_PHYS(m);
9175 *pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx;
9176 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->wire_count++;
9177 } else {
9178 MPASS((*pde & X86_PG_PS) == 0);
9179 mphys = *pde & PG_FRAME;
9180 }
9181 return ((pt_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pte_index(va));
9182 }
9183
9184 static vm_paddr_t
9185 pmap_large_map_kextract(vm_offset_t va)
9186 {
9187 pdp_entry_t *pdpe, pdp;
9188 pd_entry_t *pde, pd;
9189 pt_entry_t *pte, pt;
9190
9191 KASSERT(PMAP_ADDRESS_IN_LARGEMAP(va),
9192 ("not largemap range %#lx", (u_long)va));
9193 pdpe = pmap_large_map_pdpe(va);
9194 pdp = *pdpe;
9195 KASSERT((pdp & X86_PG_V) != 0,
9196 ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va,
9197 (u_long)pdpe, pdp));
9198 if ((pdp & X86_PG_PS) != 0) {
9199 KASSERT((amd_feature & AMDID_PAGE1GB) != 0,
9200 ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va,
9201 (u_long)pdpe, pdp));
9202 return ((pdp & PG_PS_PDP_FRAME) | (va & PDPMASK));
9203 }
9204 pde = pmap_pdpe_to_pde(pdpe, va);
9205 pd = *pde;
9206 KASSERT((pd & X86_PG_V) != 0,
9207 ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd));
9208 if ((pd & X86_PG_PS) != 0)
9209 return ((pd & PG_PS_FRAME) | (va & PDRMASK));
9210 pte = pmap_pde_to_pte(pde, va);
9211 pt = *pte;
9212 KASSERT((pt & X86_PG_V) != 0,
9213 ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, pt));
9214 return ((pt & PG_FRAME) | (va & PAGE_MASK));
9215 }
9216
9217 static int
9218 pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase,
9219 vmem_addr_t *vmem_res)
9220 {
9221
9222 /*
9223 * Large mappings are all but static. Consequently, there
9224 * is no point in waiting for an earlier allocation to be
9225 * freed.
9226 */
9227 return (vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN,
9228 VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res));
9229 }
9230
9231 int
9232 pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr,
9233 vm_memattr_t mattr)
9234 {
9235 pdp_entry_t *pdpe;
9236 pd_entry_t *pde;
9237 pt_entry_t *pte;
9238 vm_offset_t va, inc;
9239 vmem_addr_t vmem_res;
9240 vm_paddr_t pa;
9241 int error;
9242
9243 if (len == 0 || spa + len < spa)
9244 return (EINVAL);
9245
9246 /* See if DMAP can serve. */
9247 if (spa + len <= dmaplimit) {
9248 va = PHYS_TO_DMAP(spa);
9249 *addr = (void *)va;
9250 return (pmap_change_attr(va, len, mattr));
9251 }
9252
9253 /*
9254 * No, allocate KVA.  Fit the address with the best possible
9255 * alignment for superpages.  Fall back to a coarser alignment
9256 * if that fails.
9257 */
9258 error = ENOMEM;
9259 if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len,
9260 NBPDP) >= roundup2(spa, NBPDP) + NBPDP)
9261 error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK,
9262 &vmem_res);
9263 if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa,
9264 NBPDR) + NBPDR)
9265 error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK,
9266 &vmem_res);
9267 if (error != 0)
9268 error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res);
9269 if (error != 0)
9270 return (error);
9271
9272 /*
9273 * Fill the page table.  PG_M is not pre-set; we scan modified bits
9274 * in the page table to minimize flushing.  No need to
9275 * invalidate TLB, since we only update invalid entries.
9276 */
9277 PMAP_LOCK(kernel_pmap);
9278 for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc,
9279 len -= inc) {
9280 if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP &&
9281 (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) {
9282 pdpe = pmap_large_map_pdpe(va);
9283 MPASS(*pdpe == 0);
9284 *pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW |
9285 X86_PG_V | X86_PG_A | pg_nx |
9286 pmap_cache_bits(kernel_pmap, mattr, TRUE);
9287 inc = NBPDP;
9288 } else if (len >= NBPDR && (pa & PDRMASK) == 0 &&
9289 (va & PDRMASK) == 0) {
9290 pde = pmap_large_map_pde(va);
9291 MPASS(*pde == 0);
9292 *pde = pa | pg_g | X86_PG_PS | X86_PG_RW |
9293 X86_PG_V | X86_PG_A | pg_nx |
9294 pmap_cache_bits(kernel_pmap, mattr, TRUE);
9295 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->
9296 wire_count++;
9297 inc = NBPDR;
9298 } else {
9299 pte = pmap_large_map_pte(va);
9300 MPASS(*pte == 0);
9301 *pte = pa | pg_g | X86_PG_RW | X86_PG_V |
9302 X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap,
9303 mattr, FALSE);
9304 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))->
9305 wire_count++;
9306 inc = PAGE_SIZE;
9307 }
9308 }
9309 PMAP_UNLOCK(kernel_pmap);
9310 MPASS(len == 0);
9311
9312 *addr = (void *)vmem_res;
9313 return (0);
9314 }
9315
9316 void
9317 pmap_large_unmap(void *svaa, vm_size_t len)
9318 {
9319 vm_offset_t sva, va;
9320 vm_size_t inc;
9321 pdp_entry_t *pdpe, pdp;
9322 pd_entry_t *pde, pd;
9323 pt_entry_t *pte;
9324 vm_page_t m;
9325 struct spglist spgf;
9326
9327 sva = (vm_offset_t)svaa;
9328 if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS &&
9329 sva + len <= DMAP_MIN_ADDRESS + dmaplimit))
9330 return;
9331
9332 SLIST_INIT(&spgf);
9333 KASSERT(PMAP_ADDRESS_IN_LARGEMAP(sva) &&
9334 PMAP_ADDRESS_IN_LARGEMAP(sva + len - 1),
9335 ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len));
9336 PMAP_LOCK(kernel_pmap);
9337 for (va = sva; va < sva + len; va += inc) {
9338 pdpe = pmap_large_map_pdpe(va);
9339 pdp = *pdpe;
9340 KASSERT((pdp & X86_PG_V) != 0,
9341 ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va,
9342 (u_long)pdpe, pdp));
9343 if ((pdp & X86_PG_PS) != 0) {
9344 KASSERT((amd_feature & AMDID_PAGE1GB) != 0,
9345 ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va,
9346 (u_long)pdpe, pdp));
9347 KASSERT((va & PDPMASK) == 0,
9348 ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va,
9349 (u_long)pdpe, pdp));
9350 KASSERT(va + NBPDP <= sva + len,
9351 ("unmap covers partial 1GB page, sva %#lx va %#lx "
9352 "pdpe %#lx pdp %#lx len %#lx", sva, va,
9353 (u_long)pdpe, pdp, len));
9354 *pdpe = 0;
9355 inc = NBPDP;
9356 continue;
9357 }
9358 pde = pmap_pdpe_to_pde(pdpe, va);
9359 pd = *pde;
9360 KASSERT((pd & X86_PG_V) != 0,
9361 ("invalid pd va %#lx pde %#lx pd %#lx", va,
9362 (u_long)pde, pd));
9363 if ((pd & X86_PG_PS) != 0) {
9364 KASSERT((va & PDRMASK) == 0,
9365 ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va,
9366 (u_long)pde, pd));
9367 KASSERT(va + NBPDR <= sva + len,
9368 ("unmap covers partial 2MB page, sva %#lx va %#lx "
9369 "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde,
9370 pd, len));
9371 pde_store(pde, 0);
9372 inc = NBPDR;
9373 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde));
9374 m->wire_count--;
9375 if (m->wire_count == 0) {
9376 *pdpe = 0;
9377 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
9378 }
9379 continue;
9380 }
9381 pte = pmap_pde_to_pte(pde, va);
9382 KASSERT((*pte & X86_PG_V) != 0,
9383 ("invalid pte va %#lx pte %#lx pt %#lx", va,
9384 (u_long)pte, *pte));
9385 pte_clear(pte);
9386 inc = PAGE_SIZE;
9387 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte));
9388 m->wire_count--;
9389 if (m->wire_count == 0) {
9390 *pde = 0;
9391 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
9392 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde));
9393 m->wire_count--;
9394 if (m->wire_count == 0) {
9395 *pdpe = 0;
9396 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
9397 }
9398 }
9399 }
9400 pmap_invalidate_range(kernel_pmap, sva, sva + len);
9401 PMAP_UNLOCK(kernel_pmap);
9402 vm_page_free_pages_toq(&spgf, false);
9403 vmem_free(large_vmem, sva, len);
9404 }
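/*
 * Illustrative sketch, not part of pmap.c: tearing down a mapping created by
 * pmap_large_map().  The whole original range must be passed back; as the
 * assertions above show, partially unmapping a 1GB or 2MB page is not
 * permitted.  The names below are assumptions made for the example.
 */
#if 0	/* example only */
static void
example_unmap_region(void *va, vm_size_t size)
{

	/* Releases the KVA, the page table pages, and the vmem range. */
	pmap_large_unmap(va, size);
}
#endif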
9405
9406 static void
9407 pmap_large_map_wb_fence_mfence(void)
9408 {
9409
9410 mfence();
9411 }
9412
9413 static void
9414 pmap_large_map_wb_fence_atomic(void)
9415 {
9416
9417 atomic_thread_fence_seq_cst();
9418 }
9419
9420 static void
9421 pmap_large_map_wb_fence_nop(void)
9422 {
9423 }
9424
9425 DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void), static)
9426 {
9427
9428 if (cpu_vendor_id != CPU_VENDOR_INTEL)
9429 return (pmap_large_map_wb_fence_mfence);
9430 else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB |
9431 CPUID_STDEXT_CLFLUSHOPT)) == 0)
9432 return (pmap_large_map_wb_fence_atomic);
9433 else
9434 /* clflush is strongly enough ordered */
9435 return (pmap_large_map_wb_fence_nop);
9436 }
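/*
 * Illustrative sketch, not part of pmap.c: the DEFINE_IFUNC pattern used
 * above.  The resolver runs once, early during boot, and every later call to
 * example_fence() jumps straight to the selected implementation with no
 * per-call feature test.  The example names and the use of CPUID_SSE2
 * (mfence requires SSE2) are assumptions made for the example.
 */
#if 0	/* example only */
static void
example_fence_mfence(void)
{

	mfence();
}

static void
example_fence_nop(void)
{
}

DEFINE_IFUNC(static, void, example_fence, (void), static)
{

	return ((cpu_feature & CPUID_SSE2) != 0 ?
	    example_fence_mfence : example_fence_nop);
}
#endif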
9437
9438 static void
9439 pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len)
9440 {
9441
9442 for (; len > 0; len -= cpu_clflush_line_size,
9443 va += cpu_clflush_line_size)
9444 clwb(va);
9445 }
9446
9447 static void
9448 pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len)
9449 {
9450
9451 for (; len > 0; len -= cpu_clflush_line_size,
9452 va += cpu_clflush_line_size)
9453 clflushopt(va);
9454 }
9455
9456 static void
9457 pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len)
9458 {
9459
9460 for (; len > 0; len -= cpu_clflush_line_size,
9461 va += cpu_clflush_line_size)
9462 clflush(va);
9463 }
9464
9465 static void
9466 pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused)
9467 {
9468 }
9469
9470 DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t),
9471 static)
9472 {
9473
9474 if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0)
9475 return (pmap_large_map_flush_range_clwb);
9476 else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0)
9477 return (pmap_large_map_flush_range_clflushopt);
9478 else if ((cpu_feature & CPUID_CLFSH) != 0)
9479 return (pmap_large_map_flush_range_clflush);
9480 else
9481 return (pmap_large_map_flush_range_nop);
9482 }
9483
9484 static void
9485 pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva)
9486 {
9487 volatile u_long *pe;
9488 u_long p;
9489 vm_offset_t va;
9490 vm_size_t inc;
9491 bool seen_other;
9492
9493 for (va = sva; va < eva; va += inc) {
9494 inc = 0;
9495 if ((amd_feature & AMDID_PAGE1GB) != 0) {
9496 pe = (volatile u_long *)pmap_large_map_pdpe(va);
9497 p = *pe;
9498 if ((p & X86_PG_PS) != 0)
9499 inc = NBPDP;
9500 }
9501 if (inc == 0) {
9502 pe = (volatile u_long *)pmap_large_map_pde(va);
9503 p = *pe;
9504 if ((p & X86_PG_PS) != 0)
9505 inc = NBPDR;
9506 }
9507 if (inc == 0) {
9508 pe = (volatile u_long *)pmap_large_map_pte(va);
9509 p = *pe;
9510 inc = PAGE_SIZE;
9511 }
9512 seen_other = false;
9513 for (;;) {
9514 if ((p & X86_PG_AVAIL1) != 0) {
9515 /*
9516 * Spin-wait for the end of a parallel
9517 * write-back.
9518 */
9519 cpu_spinwait();
9520 p = *pe;
9521
9522 /*
9523 * If we saw another write-back
9524 * occurring, we cannot rely on PG_M to
9525 * indicate the state of the cache. The
9526 * PG_M bit is cleared before the
9527 * flush to avoid missing new writes,
9528 * and writes relevant to us might
9529 * happen afterward.
9530 */
9531 seen_other = true;
9532 continue;
9533 }
9534
9535 if ((p & X86_PG_M) != 0 || seen_other) {
9536 if (!atomic_fcmpset_long(pe, &p,
9537 (p & ~X86_PG_M) | X86_PG_AVAIL1))
9538 /*
9539 * If we saw PG_M without
9540 * PG_AVAIL1, and then on the
9541 * next attempt we do not
9542 * observe either PG_M or
9543 * PG_AVAIL1, the other
9544 * write-back started after us
9545 * and finished before us. We
9546 * can rely on it doing our
9547 * work.
9548 */
9549 continue;
9550 pmap_large_map_flush_range(va, inc);
9551 atomic_clear_long(pe, X86_PG_AVAIL1);
9552 }
9553 break;
9554 }
9555 maybe_yield();
9556 }
9557 }
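/*
 * Illustrative sketch, not part of pmap.c: the claim protocol used above,
 * restated on a plain word instead of a live page table entry.  EX_DIRTY
 * stands in for X86_PG_M and EX_BUSY for X86_PG_AVAIL1; both names and the
 * bit values are assumptions made for the example.
 */
#if 0	/* example only */
#define	EX_DIRTY	0x0040ul	/* analogous to X86_PG_M */
#define	EX_BUSY		0x0200ul	/* analogous to X86_PG_AVAIL1 */

static void
example_claim_and_flush(volatile u_long *pe, vm_offset_t va, vm_size_t len)
{
	u_long p;
	bool seen_other;

	seen_other = false;
	p = *pe;
	for (;;) {
		if ((p & EX_BUSY) != 0) {
			/* Another thread owns the flush; wait and re-read. */
			cpu_spinwait();
			p = *pe;
			seen_other = true;
			continue;
		}
		if ((p & EX_DIRTY) != 0 || seen_other) {
			/* Clear DIRTY and set BUSY atomically, then flush. */
			if (!atomic_fcmpset_long(pe, &p,
			    (p & ~EX_DIRTY) | EX_BUSY))
				continue;
			pmap_large_map_flush_range(va, len);
			atomic_clear_long(pe, EX_BUSY);
		}
		break;
	}
}
#endif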
9558
9559 /*
9560 * Write-back cache lines for the given address range.
9561 *
9562 * Must be called only on the range or sub-range returned from
9563 * pmap_large_map(). Must not be called on the coalesced ranges.
9564 *
9565 * Does nothing on CPUs without CLWB, CLFLUSHOPT, or CLFLUSH
9566 * instruction support.
9567 */
9568 void
9569 pmap_large_map_wb(void *svap, vm_size_t len)
9570 {
9571 vm_offset_t eva, sva;
9572
9573 sva = (vm_offset_t)svap;
9574 eva = sva + len;
9575 pmap_large_map_wb_fence();
9576 if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) {
9577 pmap_large_map_flush_range(sva, len);
9578 } else {
9579 KASSERT(sva >= LARGEMAP_MIN_ADDRESS &&
9580 eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4,
9581 ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len));
9582 pmap_large_map_wb_large(sva, eva);
9583 }
9584 pmap_large_map_wb_fence();
9585 }
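/*
 * Illustrative sketch, not part of pmap.c: a typical caller stores into a
 * region returned by pmap_large_map() and then forces the dirtied lines out
 * of the CPU caches.  The function and buffer names are assumptions made
 * for the example.
 */
#if 0	/* example only */
static void
example_persist_copy(void *dst, const void *src, size_t len)
{

	memcpy(dst, src, len);
	/* Write back only the sub-range that was actually dirtied. */
	pmap_large_map_wb(dst, len);
}
#endif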
9586
9587 static vm_page_t
9588 pmap_pti_alloc_page(void)
9589 {
9590 vm_page_t m;
9591
9592 VM_OBJECT_ASSERT_WLOCKED(pti_obj);
9593 m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_NOBUSY |
9594 VM_ALLOC_WIRED | VM_ALLOC_ZERO);
9595 return (m);
9596 }
9597
9598 static bool
9599 pmap_pti_free_page(vm_page_t m)
9600 {
9601
9602 KASSERT(m->wire_count > 0, ("page %p not wired", m));
9603 if (!vm_page_unwire_noq(m))
9604 return (false);
9605 vm_page_free_zero(m);
9606 return (true);
9607 }
9608
9609 static void
9610 pmap_pti_init(void)
9611 {
9612 vm_page_t pml4_pg;
9613 pdp_entry_t *pdpe;
9614 vm_offset_t va;
9615 int i;
9616
9617 if (!pti)
9618 return;
9619 pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL);
9620 VM_OBJECT_WLOCK(pti_obj);
9621 pml4_pg = pmap_pti_alloc_page();
9622 pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg));
9623 for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS &&
9624 va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) {
9625 pdpe = pmap_pti_pdpe(va);
9626 pmap_pti_wire_pte(pdpe);
9627 }
9628 pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0],
9629 (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false);
9630 pmap_pti_add_kva_locked((vm_offset_t)gdt, (vm_offset_t)gdt +
9631 sizeof(struct user_segment_descriptor) * NGDT * MAXCPU, false);
9632 pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt +
9633 sizeof(struct gate_descriptor) * NIDT, false);
9634 pmap_pti_add_kva_locked((vm_offset_t)common_tss,
9635 (vm_offset_t)common_tss + sizeof(struct amd64tss) * MAXCPU, false);
9636 CPU_FOREACH(i) {
9637 /* Doublefault stack IST 1 */
9638 va = common_tss[i].tss_ist1;
9639 pmap_pti_add_kva_locked(va - DBLFAULT_STACK_SIZE, va, false);
9640 /* NMI stack IST 2 */
9641 va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu);
9642 pmap_pti_add_kva_locked(va - NMI_STACK_SIZE, va, false);
9643 /* MC# stack IST 3 */
9644 va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu);
9645 pmap_pti_add_kva_locked(va - MCE_STACK_SIZE, va, false);
9646 /* DB# stack IST 4 */
9647 va = common_tss[i].tss_ist4 + sizeof(struct nmi_pcpu);
9648 pmap_pti_add_kva_locked(va - DBG_STACK_SIZE, va, false);
9649 }
9650 pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE,
9651 (vm_offset_t)etext, true);
9652 pti_finalized = true;
9653 VM_OBJECT_WUNLOCK(pti_obj);
9654 }
9655 SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL);
9656
9657 static pdp_entry_t *
9658 pmap_pti_pdpe(vm_offset_t va)
9659 {
9660 pml4_entry_t *pml4e;
9661 pdp_entry_t *pdpe;
9662 vm_page_t m;
9663 vm_pindex_t pml4_idx;
9664 vm_paddr_t mphys;
9665
9666 VM_OBJECT_ASSERT_WLOCKED(pti_obj);
9667
9668 pml4_idx = pmap_pml4e_index(va);
9669 pml4e = &pti_pml4[pml4_idx];
9670 m = NULL;
9671 if (*pml4e == 0) {
9672 if (pti_finalized)
9673 panic("pml4 alloc after finalization\n");
9674 m = pmap_pti_alloc_page();
9675 if (*pml4e != 0) {
9676 pmap_pti_free_page(m);
9677 mphys = *pml4e & ~PAGE_MASK;
9678 } else {
9679 mphys = VM_PAGE_TO_PHYS(m);
9680 *pml4e = mphys | X86_PG_RW | X86_PG_V;
9681 }
9682 } else {
9683 mphys = *pml4e & ~PAGE_MASK;
9684 }
9685 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va);
9686 return (pdpe);
9687 }
9688
9689 static void
9690 pmap_pti_wire_pte(void *pte)
9691 {
9692 vm_page_t m;
9693
9694 VM_OBJECT_ASSERT_WLOCKED(pti_obj);
9695 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
9696 m->wire_count++;
9697 }
9698
9699 static void
9700 pmap_pti_unwire_pde(void *pde, bool only_ref)
9701 {
9702 vm_page_t m;
9703
9704 VM_OBJECT_ASSERT_WLOCKED(pti_obj);
9705 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde));
9706 MPASS(m->wire_count > 0);
9707 MPASS(only_ref || m->wire_count > 1);
9708 pmap_pti_free_page(m);
9709 }
9710
9711 static void
9712 pmap_pti_unwire_pte(void *pte, vm_offset_t va)
9713 {
9714 vm_page_t m;
9715 pd_entry_t *pde;
9716
9717 VM_OBJECT_ASSERT_WLOCKED(pti_obj);
9718 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
9719 MPASS(m->wire_count > 0);
9720 if (pmap_pti_free_page(m)) {
9721 pde = pmap_pti_pde(va);
9722 MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V);
9723 *pde = 0;
9724 pmap_pti_unwire_pde(pde, false);
9725 }
9726 }
9727
9728 static pd_entry_t *
9729 pmap_pti_pde(vm_offset_t va)
9730 {
9731 pdp_entry_t *pdpe;
9732 pd_entry_t *pde;
9733 vm_page_t m;
9734 vm_pindex_t pd_idx;
9735 vm_paddr_t mphys;
9736
9737 VM_OBJECT_ASSERT_WLOCKED(pti_obj);
9738
9739 pdpe = pmap_pti_pdpe(va);
9740 if (*pdpe == 0) {
9741 m = pmap_pti_alloc_page();
9742 if (*pdpe != 0) {
9743 pmap_pti_free_page(m);
9744 MPASS((*pdpe & X86_PG_PS) == 0);
9745 mphys = *pdpe & ~PAGE_MASK;
9746 } else {
9747 mphys = VM_PAGE_TO_PHYS(m);
9748 *pdpe = mphys | X86_PG_RW | X86_PG_V;
9749 }
9750 } else {
9751 MPASS((*pdpe & X86_PG_PS) == 0);
9752 mphys = *pdpe & ~PAGE_MASK;
9753 }
9754
9755 pde = (pd_entry_t *)PHYS_TO_DMAP(mphys);
9756 pd_idx = pmap_pde_index(va);
9757 pde += pd_idx;
9758 return (pde);
9759 }
9760
9761 static pt_entry_t *
9762 pmap_pti_pte(vm_offset_t va, bool *unwire_pde)
9763 {
9764 pd_entry_t *pde;
9765 pt_entry_t *pte;
9766 vm_page_t m;
9767 vm_paddr_t mphys;
9768
9769 VM_OBJECT_ASSERT_WLOCKED(pti_obj);
9770
9771 pde = pmap_pti_pde(va);
9772 if (unwire_pde != NULL) {
9773 *unwire_pde = true;
9774 pmap_pti_wire_pte(pde);
9775 }
9776 if (*pde == 0) {
9777 m = pmap_pti_alloc_page();
9778 if (*pde != 0) {
9779 pmap_pti_free_page(m);
9780 MPASS((*pde & X86_PG_PS) == 0);
9781 mphys = *pde & ~(PAGE_MASK | pg_nx);
9782 } else {
9783 mphys = VM_PAGE_TO_PHYS(m);
9784 *pde = mphys | X86_PG_RW | X86_PG_V;
9785 if (unwire_pde != NULL)
9786 *unwire_pde = false;
9787 }
9788 } else {
9789 MPASS((*pde & X86_PG_PS) == 0);
9790 mphys = *pde & ~(PAGE_MASK | pg_nx);
9791 }
9792
9793 pte = (pt_entry_t *)PHYS_TO_DMAP(mphys);
9794 pte += pmap_pte_index(va);
9795
9796 return (pte);
9797 }
9798
9799 static void
9800 pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec)
9801 {
9802 vm_paddr_t pa;
9803 pd_entry_t *pde;
9804 pt_entry_t *pte, ptev;
9805 bool unwire_pde;
9806
9807 VM_OBJECT_ASSERT_WLOCKED(pti_obj);
9808
9809 sva = trunc_page(sva);
9810 MPASS(sva > VM_MAXUSER_ADDRESS);
9811 eva = round_page(eva);
9812 MPASS(sva < eva);
9813 for (; sva < eva; sva += PAGE_SIZE) {
9814 pte = pmap_pti_pte(sva, &unwire_pde);
9815 pa = pmap_kextract(sva);
9816 ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G |
9817 (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap,
9818 VM_MEMATTR_DEFAULT, FALSE);
9819 if (*pte == 0) {
9820 pte_store(pte, ptev);
9821 pmap_pti_wire_pte(pte);
9822 } else {
9823 KASSERT(!pti_finalized,
9824 ("pti overlap after fin %#lx %#lx %#lx",
9825 sva, *pte, ptev));
9826 KASSERT(*pte == ptev,
9827 ("pti non-identical pte after fin %#lx %#lx %#lx",
9828 sva, *pte, ptev));
9829 }
9830 if (unwire_pde) {
9831 pde = pmap_pti_pde(sva);
9832 pmap_pti_unwire_pde(pde, true);
9833 }
9834 }
9835 }
9836
9837 void
9838 pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec)
9839 {
9840
9841 if (!pti)
9842 return;
9843 VM_OBJECT_WLOCK(pti_obj);
9844 pmap_pti_add_kva_locked(sva, eva, exec);
9845 VM_OBJECT_WUNLOCK(pti_obj);
9846 }
9847
9848 void
9849 pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva)
9850 {
9851 pt_entry_t *pte;
9852 vm_offset_t va;
9853
9854 if (!pti)
9855 return;
9856 sva = rounddown2(sva, PAGE_SIZE);
9857 MPASS(sva > VM_MAXUSER_ADDRESS);
9858 eva = roundup2(eva, PAGE_SIZE);
9859 MPASS(sva < eva);
9860 VM_OBJECT_WLOCK(pti_obj);
9861 for (va = sva; va < eva; va += PAGE_SIZE) {
9862 pte = pmap_pti_pte(va, NULL);
9863 KASSERT((*pte & X86_PG_V) != 0,
9864 ("invalid pte va %#lx pte %#lx pt %#lx", va,
9865 (u_long)pte, *pte));
9866 pte_clear(pte);
9867 pmap_pti_unwire_pte(pte, va);
9868 }
9869 pmap_invalidate_range(kernel_pmap, sva, eva);
9870 VM_OBJECT_WUNLOCK(pti_obj);
9871 }
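/*
 * Illustrative sketch, not part of pmap.c: pairing pmap_pti_add_kva() with
 * pmap_pti_remove_kva() for a kernel object whose mapping must be visible
 * on the PTI user page tables for the lifetime of the object (for instance,
 * a per-process descriptor table).  The names below are assumptions made
 * for the example.
 */
#if 0	/* example only */
static void
example_pti_expose(vm_offset_t obj_va, vm_size_t obj_size)
{

	/* Data mapping: not executable from the user page tables. */
	pmap_pti_add_kva(obj_va, obj_va + obj_size, false);
}

static void
example_pti_hide(vm_offset_t obj_va, vm_size_t obj_size)
{

	pmap_pti_remove_kva(obj_va, obj_va + obj_size);
}
#endif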
9872
9873 static void *
9874 pkru_dup_range(void *ctx __unused, void *data)
9875 {
9876 struct pmap_pkru_range *node, *new_node;
9877
9878 new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
9879 if (new_node == NULL)
9880 return (NULL);
9881 node = data;
9882 memcpy(new_node, node, sizeof(*node));
9883 return (new_node);
9884 }
9885
9886 static void
9887 pkru_free_range(void *ctx __unused, void *node)
9888 {
9889
9890 uma_zfree(pmap_pkru_ranges_zone, node);
9891 }
9892
9893 static int
9894 pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
9895 int flags)
9896 {
9897 struct pmap_pkru_range *ppr;
9898 int error;
9899
9900 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9901 MPASS(pmap->pm_type == PT_X86);
9902 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
9903 if ((flags & AMD64_PKRU_EXCL) != 0 &&
9904 !rangeset_check_empty(&pmap->pm_pkru, sva, eva))
9905 return (EBUSY);
9906 ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
9907 if (ppr == NULL)
9908 return (ENOMEM);
9909 ppr->pkru_keyidx = keyidx;
9910 ppr->pkru_flags = flags & AMD64_PKRU_PERSIST;
9911 error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr);
9912 if (error != 0)
9913 uma_zfree(pmap_pkru_ranges_zone, ppr);
9914 return (error);
9915 }
9916
9917 static int
9918 pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9919 {
9920
9921 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9922 MPASS(pmap->pm_type == PT_X86);
9923 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
9924 return (rangeset_remove(&pmap->pm_pkru, sva, eva));
9925 }
9926
9927 static void
9928 pmap_pkru_deassign_all(pmap_t pmap)
9929 {
9930
9931 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9932 if (pmap->pm_type == PT_X86 &&
9933 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
9934 rangeset_remove_all(&pmap->pm_pkru);
9935 }
9936
9937 static bool
9938 pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9939 {
9940 struct pmap_pkru_range *ppr, *prev_ppr;
9941 vm_offset_t va;
9942
9943 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9944 if (pmap->pm_type != PT_X86 ||
9945 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
9946 sva >= VM_MAXUSER_ADDRESS)
9947 return (true);
9948 MPASS(eva <= VM_MAXUSER_ADDRESS);
9949 for (va = sva, prev_ppr = NULL; va < eva;) {
9950 ppr = rangeset_lookup(&pmap->pm_pkru, va);
9951 if ((ppr == NULL) ^ (prev_ppr == NULL))
9952 return (false);
9953 if (ppr == NULL) {
9954 va += PAGE_SIZE;
9955 continue;
9956 }
9957 if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx)
9958 return (false);
9959 va = ppr->pkru_rs_el.re_end;
9960 }
9961 return (true);
9962 }
9963
9964 static pt_entry_t
9965 pmap_pkru_get(pmap_t pmap, vm_offset_t va)
9966 {
9967 struct pmap_pkru_range *ppr;
9968
9969 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9970 if (pmap->pm_type != PT_X86 ||
9971 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
9972 va >= VM_MAXUSER_ADDRESS)
9973 return (0);
9974 ppr = rangeset_lookup(&pmap->pm_pkru, va);
9975 if (ppr != NULL)
9976 return (X86_PG_PKU(ppr->pkru_keyidx));
9977 return (0);
9978 }
9979
9980 static bool
9981 pred_pkru_on_remove(void *ctx __unused, void *r)
9982 {
9983 struct pmap_pkru_range *ppr;
9984
9985 ppr = r;
9986 return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0);
9987 }
9988
9989 static void
9990 pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9991 {
9992
9993 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9994 if (pmap->pm_type == PT_X86 &&
9995 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
9996 rangeset_remove_pred(&pmap->pm_pkru, sva, eva,
9997 pred_pkru_on_remove);
9998 }
9999 }
10000
10001 static int
10002 pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap)
10003 {
10004
10005 PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
10006 PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
10007 MPASS(dst_pmap->pm_type == PT_X86);
10008 MPASS(src_pmap->pm_type == PT_X86);
10009 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
10010 if (src_pmap->pm_pkru.rs_data_ctx == NULL)
10011 return (0);
10012 return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru));
10013 }
10014
10015 static void
10016 pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
10017 u_int keyidx)
10018 {
10019 pml4_entry_t *pml4e;
10020 pdp_entry_t *pdpe;
10021 pd_entry_t newpde, ptpaddr, *pde;
10022 pt_entry_t newpte, *ptep, pte;
10023 vm_offset_t va, va_next;
10024 bool changed;
10025
10026 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
10027 MPASS(pmap->pm_type == PT_X86);
10028 MPASS(keyidx <= PMAP_MAX_PKRU_IDX);
10029
10030 for (changed = false, va = sva; va < eva; va = va_next) {
10031 pml4e = pmap_pml4e(pmap, va);
10032 if ((*pml4e & X86_PG_V) == 0) {
10033 va_next = (va + NBPML4) & ~PML4MASK;
10034 if (va_next < va)
10035 va_next = eva;
10036 continue;
10037 }
10038
10039 pdpe = pmap_pml4e_to_pdpe(pml4e, va);
10040 if ((*pdpe & X86_PG_V) == 0) {
10041 va_next = (va + NBPDP) & ~PDPMASK;
10042 if (va_next < va)
10043 va_next = eva;
10044 continue;
10045 }
10046
10047 va_next = (va + NBPDR) & ~PDRMASK;
10048 if (va_next < va)
10049 va_next = eva;
10050
10051 pde = pmap_pdpe_to_pde(pdpe, va);
10052 ptpaddr = *pde;
10053 if (ptpaddr == 0)
10054 continue;
10055
10056 MPASS((ptpaddr & X86_PG_V) != 0);
10057 if ((ptpaddr & PG_PS) != 0) {
10058 if (va + NBPDR == va_next && eva >= va_next) {
10059 newpde = (ptpaddr & ~X86_PG_PKU_MASK) |
10060 X86_PG_PKU(keyidx);
10061 if (newpde != ptpaddr) {
10062 *pde = newpde;
10063 changed = true;
10064 }
10065 continue;
10066 } else if (!pmap_demote_pde(pmap, pde, va)) {
10067 continue;
10068 }
10069 }
10070
10071 if (va_next > eva)
10072 va_next = eva;
10073
10074 for (ptep = pmap_pde_to_pte(pde, va); va != va_next;
10075 ptep++, va += PAGE_SIZE) {
10076 pte = *ptep;
10077 if ((pte & X86_PG_V) == 0)
10078 continue;
10079 newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx);
10080 if (newpte != pte) {
10081 *ptep = newpte;
10082 changed = true;
10083 }
10084 }
10085 }
10086 if (changed)
10087 pmap_invalidate_range(pmap, sva, eva);
10088 }
10089
10090 static int
10091 pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
10092 u_int keyidx, int flags)
10093 {
10094
10095 if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX ||
10096 (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0)
10097 return (EINVAL);
10098 if (eva <= sva || eva > VM_MAXUSER_ADDRESS)
10099 return (EFAULT);
10100 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
10101 return (ENOTSUP);
10102 return (0);
10103 }
10104
10105 int
10106 pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
10107 int flags)
10108 {
10109 int error;
10110
10111 sva = trunc_page(sva);
10112 eva = round_page(eva);
10113 error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags);
10114 if (error != 0)
10115 return (error);
10116 for (;;) {
10117 PMAP_LOCK(pmap);
10118 error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags);
10119 if (error == 0)
10120 pmap_pkru_update_range(pmap, sva, eva, keyidx);
10121 PMAP_UNLOCK(pmap);
10122 if (error != ENOMEM)
10123 break;
10124 vm_wait(NULL);
10125 }
10126 return (error);
10127 }
10128
10129 int
10130 pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
10131 {
10132 int error;
10133
10134 sva = trunc_page(sva);
10135 eva = round_page(eva);
10136 error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0);
10137 if (error != 0)
10138 return (error);
10139 for (;;) {
10140 PMAP_LOCK(pmap);
10141 error = pmap_pkru_deassign(pmap, sva, eva);
10142 if (error == 0)
10143 pmap_pkru_update_range(pmap, sva, eva, 0);
10144 PMAP_UNLOCK(pmap);
10145 if (error != ENOMEM)
10146 break;
10147 vm_wait(NULL);
10148 }
10149 return (error);
10150 }
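/*
 * Illustrative sketch, not part of pmap.c: assigning protection key 1 to a
 * range of a process's address space and later clearing it.  The process
 * pointer, range, and choice of key are assumptions made for the example;
 * both calls retry internally via vm_wait() when they hit ENOMEM.
 */
#if 0	/* example only */
static int
example_pkru_protect(struct proc *p, vm_offset_t start, vm_size_t len)
{

	return (pmap_pkru_set(vmspace_pmap(p->p_vmspace), start, start + len,
	    1, AMD64_PKRU_PERSIST));
}

static int
example_pkru_unprotect(struct proc *p, vm_offset_t start, vm_size_t len)
{

	return (pmap_pkru_clear(vmspace_pmap(p->p_vmspace), start,
	    start + len));
}
#endif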
10151
10152 /*
10153 * Track a range of the kernel's virtual address space that is contiguous
10154 * in various mapping attributes.
10155 */
10156 struct pmap_kernel_map_range {
10157 vm_offset_t sva;
10158 pt_entry_t attrs;
10159 int ptes;
10160 int pdes;
10161 int pdpes;
10162 };
10163
10164 static void
10165 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
10166 vm_offset_t eva)
10167 {
10168 const char *mode;
10169 int i, pat_idx;
10170
10171 if (eva <= range->sva)
10172 return;
10173
10174 pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true);
10175 for (i = 0; i < PAT_INDEX_SIZE; i++)
10176 if (pat_index[i] == pat_idx)
10177 break;
10178
10179 switch (i) {
10180 case PAT_WRITE_BACK:
10181 mode = "WB";
10182 break;
10183 case PAT_WRITE_THROUGH:
10184 mode = "WT";
10185 break;
10186 case PAT_UNCACHEABLE:
10187 mode = "UC";
10188 break;
10189 case PAT_UNCACHED:
10190 mode = "U-";
10191 break;
10192 case PAT_WRITE_PROTECTED:
10193 mode = "WP";
10194 break;
10195 case PAT_WRITE_COMBINING:
10196 mode = "WC";
10197 break;
10198 default:
10199 printf("%s: unknown PAT mode %#x for range 0x%016lx-0x%016lx\n",
10200 __func__, pat_idx, range->sva, eva);
10201 mode = "??";
10202 break;
10203 }
10204
10205 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n",
10206 range->sva, eva,
10207 (range->attrs & X86_PG_RW) != 0 ? 'w' : '-',
10208 (range->attrs & pg_nx) != 0 ? '-' : 'x',
10209 (range->attrs & X86_PG_U) != 0 ? 'u' : 's',
10210 (range->attrs & X86_PG_G) != 0 ? 'g' : '-',
10211 mode, range->pdpes, range->pdes, range->ptes);
10212
10213 /* Reset to sentinel value. */
10214 range->sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1);
10215 }
10216
10217 /*
10218 * Determine whether the attributes specified by a page table entry match those
10219 * being tracked by the current range. This is not quite as simple as a direct
10220 * flag comparison since some PAT modes have multiple representations.
10221 */
10222 static bool
10223 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
10224 {
10225 pt_entry_t diff, mask;
10226
10227 mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx;
10228 diff = (range->attrs ^ attrs) & mask;
10229 if (diff == 0)
10230 return (true);
10231 if ((diff & ~X86_PG_PDE_PAT) == 0 &&
10232 pmap_pat_index(kernel_pmap, range->attrs, true) ==
10233 pmap_pat_index(kernel_pmap, attrs, true))
10234 return (true);
10235 return (false);
10236 }
10237
10238 static void
10239 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
10240 pt_entry_t attrs)
10241 {
10242
10243 memset(range, 0, sizeof(*range));
10244 range->sva = va;
10245 range->attrs = attrs;
10246 }
10247
10248 /*
10249 * Given a leaf PTE, derive the mapping's attributes. If they do not match
10250 * those of the current run, dump the address range and its attributes, and
10251 * begin a new run.
10252 */
10253 static void
10254 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
10255 vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde,
10256 pt_entry_t pte)
10257 {
10258 pt_entry_t attrs;
10259
10260 attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);
10261
10262 attrs |= pdpe & pg_nx;
10263 attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U));
10264 if ((pdpe & PG_PS) != 0) {
10265 attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE);
10266 } else if (pde != 0) {
10267 attrs |= pde & pg_nx;
10268 attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U));
10269 }
10270 if ((pde & PG_PS) != 0) {
10271 attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE);
10272 } else if (pte != 0) {
10273 attrs |= pte & pg_nx;
10274 attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U));
10275 attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE);
10276
10277 /* Canonicalize by always using the PDE PAT bit. */
10278 if ((attrs & X86_PG_PTE_PAT) != 0)
10279 attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT;
10280 }
10281
10282 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
10283 sysctl_kmaps_dump(sb, range, va);
10284 sysctl_kmaps_reinit(range, va, attrs);
10285 }
10286 }
10287
10288 static int
10289 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
10290 {
10291 struct pmap_kernel_map_range range;
10292 struct sbuf sbuf, *sb;
10293 pml4_entry_t pml4e;
10294 pdp_entry_t *pdp, pdpe;
10295 pd_entry_t *pd, pde;
10296 pt_entry_t *pt, pte;
10297 vm_offset_t sva;
10298 vm_paddr_t pa;
10299 int error, i, j, k, l;
10300
10301 error = sysctl_wire_old_buffer(req, 0);
10302 if (error != 0)
10303 return (error);
10304 sb = &sbuf;
10305 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
10306
10307 /* Sentinel value. */
10308 range.sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1);
10309
10310 /*
10311 * Iterate over the kernel page tables without holding the kernel pmap
10312 * lock. Outside of the large map, kernel page table pages are never
10313 * freed, so at worst we will observe inconsistencies in the output.
10314 * Within the large map, ensure that PDP and PD page addresses are
10315 * valid before descending.
10316 */
10317 for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) {
10318 switch (i) {
10319 case PML4PML4I:
10320 sbuf_printf(sb, "\nRecursive map:\n");
10321 break;
10322 case DMPML4I:
10323 sbuf_printf(sb, "\nDirect map:\n");
10324 break;
10325 case KPML4BASE:
10326 sbuf_printf(sb, "\nKernel map:\n");
10327 break;
10328 case LMSPML4I:
10329 sbuf_printf(sb, "\nLarge map:\n");
10330 break;
10331 }
10332
10333 /* Convert to canonical form. */
10334 if (sva == 1ul << 47)
10335 sva |= -1ul << 48;
10336
10337 restart:
10338 pml4e = kernel_pmap->pm_pml4[i];
10339 if ((pml4e & X86_PG_V) == 0) {
10340 sva = rounddown2(sva, NBPML4);
10341 sysctl_kmaps_dump(sb, &range, sva);
10342 sva += NBPML4;
10343 continue;
10344 }
10345 pa = pml4e & PG_FRAME;
10346 pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa);
10347
10348 for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) {
10349 pdpe = pdp[j];
10350 if ((pdpe & X86_PG_V) == 0) {
10351 sva = rounddown2(sva, NBPDP);
10352 sysctl_kmaps_dump(sb, &range, sva);
10353 sva += NBPDP;
10354 continue;
10355 }
10356 pa = pdpe & PG_FRAME;
10357 if ((pdpe & PG_PS) != 0) {
10358 sva = rounddown2(sva, NBPDP);
10359 sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe,
10360 0, 0);
10361 range.pdpes++;
10362 sva += NBPDP;
10363 continue;
10364 }
10365 if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
10366 vm_phys_paddr_to_vm_page(pa) == NULL) {
10367 /*
10368 * Page table pages for the large map may be
10369 * freed. Validate the next-level address
10370 * before descending.
10371 */
10372 goto restart;
10373 }
10374 pd = (pd_entry_t *)PHYS_TO_DMAP(pa);
10375
10376 for (k = pmap_pde_index(sva); k < NPDEPG; k++) {
10377 pde = pd[k];
10378 if ((pde & X86_PG_V) == 0) {
10379 sva = rounddown2(sva, NBPDR);
10380 sysctl_kmaps_dump(sb, &range, sva);
10381 sva += NBPDR;
10382 continue;
10383 }
10384 pa = pde & PG_FRAME;
10385 if ((pde & PG_PS) != 0) {
10386 sva = rounddown2(sva, NBPDR);
10387 sysctl_kmaps_check(sb, &range, sva,
10388 pml4e, pdpe, pde, 0);
10389 range.pdes++;
10390 sva += NBPDR;
10391 continue;
10392 }
10393 if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
10394 vm_phys_paddr_to_vm_page(pa) == NULL) {
10395 /*
10396 * Page table pages for the large map
10397 * may be freed. Validate the
10398 * next-level address before descending.
10399 */
10400 goto restart;
10401 }
10402 pt = (pt_entry_t *)PHYS_TO_DMAP(pa);
10403
10404 for (l = pmap_pte_index(sva); l < NPTEPG; l++,
10405 sva += PAGE_SIZE) {
10406 pte = pt[l];
10407 if ((pte & X86_PG_V) == 0) {
10408 sysctl_kmaps_dump(sb, &range,
10409 sva);
10410 continue;
10411 }
10412 sysctl_kmaps_check(sb, &range, sva,
10413 pml4e, pdpe, pde, pte);
10414 range.ptes++;
10415 }
10416 }
10417 }
10418 }
10419
10420 error = sbuf_finish(sb);
10421 sbuf_delete(sb);
10422 return (error);
10423 }
10424 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
10425 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
10426 NULL, 0, sysctl_kmaps, "A",
10427 "Dump kernel address layout");
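/*
 * Illustrative sketch, not part of pmap.c: reading vm.pmap.kernel_maps from
 * user space.  The node is marked CTLFLAG_SKIP, so it is omitted from
 * "sysctl -a" listings but can still be queried by name.  Buffer handling
 * below is simplified for the example.
 */
#if 0	/* example only (userland program) */
#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	char *buf;
	size_t len;

	if (sysctlbyname("vm.pmap.kernel_maps", NULL, &len, NULL, 0) != 0)
		return (1);
	if ((buf = malloc(len)) == NULL)
		return (1);
	if (sysctlbyname("vm.pmap.kernel_maps", buf, &len, NULL, 0) != 0)
		return (1);
	fwrite(buf, 1, len, stdout);
	free(buf);
	return (0);
}
#endif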
10428
10429 #ifdef DDB
10430 DB_SHOW_COMMAND(pte, pmap_print_pte)
10431 {
10432 pmap_t pmap;
10433 pml4_entry_t *pml4;
10434 pdp_entry_t *pdp;
10435 pd_entry_t *pde;
10436 pt_entry_t *pte, PG_V;
10437 vm_offset_t va;
10438
10439 if (!have_addr) {
10440 db_printf("show pte addr\n");
10441 return;
10442 }
10443 va = (vm_offset_t)addr;
10444
10445 if (kdb_thread != NULL)
10446 pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
10447 else
10448 pmap = PCPU_GET(curpmap);
10449
10450 PG_V = pmap_valid_bit(pmap);
10451 pml4 = pmap_pml4e(pmap, va);
10452 db_printf("VA 0x%016lx pml4e 0x%016lx", va, *pml4);
10453 if ((*pml4 & PG_V) == 0) {
10454 db_printf("\n");
10455 return;
10456 }
10457 pdp = pmap_pml4e_to_pdpe(pml4, va);
10458 db_printf(" pdpe 0x%016lx", *pdp);
10459 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
10460 db_printf("\n");
10461 return;
10462 }
10463 pde = pmap_pdpe_to_pde(pdp, va);
10464 db_printf(" pde 0x%016lx", *pde);
10465 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
10466 db_printf("\n");
10467 return;
10468 }
10469 pte = pmap_pde_to_pte(pde, va);
10470 db_printf(" pte 0x%016lx\n", *pte);
10471 }
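/*
 * Illustrative DDB session for the command above; the address and printed
 * values are made up:
 *
 *	db> show pte 0xffffffff80200000
 *	VA 0xffffffff80200000 pml4e 0x... pdpe 0x... pde 0x... pte 0x...
 *
 * The walk stops and prints a newline as soon as an entry is invalid or
 * maps a superpage.
 */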
10472
10473 DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
10474 {
10475 vm_paddr_t a;
10476
10477 if (have_addr) {
10478 a = (vm_paddr_t)addr;
10479 db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
10480 } else {
10481 db_printf("show phys2dmap addr\n");
10482 }
10483 }
10484 #endif