sys/i386/i386/pmap.c
1 /*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
9 * All rights reserved.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * the Systems Programming Group of the University of Utah Computer
13 * Science Department and William Jolitz of UUNET Technologies Inc.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. All advertising materials mentioning features or use of this software
24 * must display the following acknowledgement:
25 * This product includes software developed by the University of
26 * California, Berkeley and its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 * may be used to endorse or promote products derived from this software
29 * without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 *
43 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
44 */
45 /*-
46 * Copyright (c) 2003 Networks Associates Technology, Inc.
47 * All rights reserved.
48 *
49 * This software was developed for the FreeBSD Project by Jake Burkholder,
50 * Safeport Network Services, and Network Associates Laboratories, the
51 * Security Research Division of Network Associates, Inc. under
52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53 * CHATS research program.
54 *
55 * Redistribution and use in source and binary forms, with or without
56 * modification, are permitted provided that the following conditions
57 * are met:
58 * 1. Redistributions of source code must retain the above copyright
59 * notice, this list of conditions and the following disclaimer.
60 * 2. Redistributions in binary form must reproduce the above copyright
61 * notice, this list of conditions and the following disclaimer in the
62 * documentation and/or other materials provided with the distribution.
63 *
64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * SUCH DAMAGE.
75 */
76
77 #include <sys/cdefs.h>
78 __FBSDID("$FreeBSD: releng/11.1/sys/i386/i386/pmap.c 330908 2018-03-14 04:00:00Z gordon $");
79
80 /*
81 * Manages physical address maps.
82 *
83 * Since the information managed by this module is
84 * also stored by the logical address mapping module,
85 * this module may throw away valid virtual-to-physical
86 * mappings at almost any time. However, invalidations
87 * of virtual-to-physical mappings must be done as
88 * requested.
89 *
90 * In order to cope with hardware architectures that
91 * make virtual-to-physical map invalidations expensive,
92 * this module may delay invalidation or protection-
93 * reduction operations until they are actually needed.
94 * This module is given full information as to which
95 * processors are currently using which maps, and as to
96 * when physical maps must be made correct.
97 */
98
99 #include "opt_apic.h"
100 #include "opt_cpu.h"
101 #include "opt_pmap.h"
102 #include "opt_smp.h"
103 #include "opt_xbox.h"
104
105 #include <sys/param.h>
106 #include <sys/systm.h>
107 #include <sys/kernel.h>
108 #include <sys/ktr.h>
109 #include <sys/lock.h>
110 #include <sys/malloc.h>
111 #include <sys/mman.h>
112 #include <sys/msgbuf.h>
113 #include <sys/mutex.h>
114 #include <sys/proc.h>
115 #include <sys/rwlock.h>
116 #include <sys/sf_buf.h>
117 #include <sys/sx.h>
118 #include <sys/vmmeter.h>
119 #include <sys/sched.h>
120 #include <sys/sysctl.h>
121 #include <sys/smp.h>
122
123 #include <vm/vm.h>
124 #include <vm/vm_param.h>
125 #include <vm/vm_kern.h>
126 #include <vm/vm_page.h>
127 #include <vm/vm_map.h>
128 #include <vm/vm_object.h>
129 #include <vm/vm_extern.h>
130 #include <vm/vm_pageout.h>
131 #include <vm/vm_pager.h>
132 #include <vm/vm_phys.h>
133 #include <vm/vm_radix.h>
134 #include <vm/vm_reserv.h>
135 #include <vm/uma.h>
136
137 #ifdef DEV_APIC
138 #include <sys/bus.h>
139 #include <machine/intr_machdep.h>
140 #include <x86/apicvar.h>
141 #endif
142 #include <machine/cpu.h>
143 #include <machine/cputypes.h>
144 #include <machine/md_var.h>
145 #include <machine/pcb.h>
146 #include <machine/specialreg.h>
147 #ifdef SMP
148 #include <machine/smp.h>
149 #endif
150
151 #ifdef XBOX
152 #include <machine/xbox.h>
153 #endif
154
155 #ifndef PMAP_SHPGPERPROC
156 #define PMAP_SHPGPERPROC 200
157 #endif
158
159 #if !defined(DIAGNOSTIC)
160 #ifdef __GNUC_GNU_INLINE__
161 #define PMAP_INLINE __attribute__((__gnu_inline__)) inline
162 #else
163 #define PMAP_INLINE extern inline
164 #endif
165 #else
166 #define PMAP_INLINE
167 #endif
168
169 #ifdef PV_STATS
170 #define PV_STAT(x) do { x ; } while (0)
171 #else
172 #define PV_STAT(x) do { } while (0)
173 #endif
174
175 #define pa_index(pa) ((pa) >> PDRSHIFT)
176 #define pa_to_pvh(pa) (&pv_table[pa_index(pa)])
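/*
 * pa_index() yields a physical address's 2/4MB superpage frame number;
 * pa_to_pvh() maps that to the superpage's entry in pv_table, which
 * pmap_init() sizes at one struct md_page per physical superpage.
 */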
177
178 /*
179 * Get PDEs and PTEs for user/kernel address space
180 */
181 #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
182 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
183
184 #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0)
185 #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0)
186 #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0)
187 #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0)
188 #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0)
189
190 #define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
191 atomic_clear_int((u_int *)(pte), PG_W))
192 #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
193
194 struct pmap kernel_pmap_store;
195 LIST_HEAD(pmaplist, pmap);
196 static struct pmaplist allpmaps;
197 static struct mtx allpmaps_lock;
198
199 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
200 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
201 int pgeflag = 0; /* PG_G or-in */
202 int pseflag = 0; /* PG_PS or-in */
203
204 static int nkpt = NKPT;
205 vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR;
206 extern u_int32_t KERNend;
207 extern u_int32_t KPTphys;
208
209 #if defined(PAE) || defined(PAE_TABLES)
210 pt_entry_t pg_nx;
211 static uma_zone_t pdptzone;
212 #endif
213
214 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
215
216 static int pat_works = 1;
217 SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
218 "Is page attribute table fully functional?");
219
220 static int pg_ps_enabled = 1;
221 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
222 &pg_ps_enabled, 0, "Are large page mappings enabled?");
223
224 #define PAT_INDEX_SIZE 8
225 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */
226
227 /*
228 * pmap_mapdev() support before the pmap module is initialized (e.g., the console)
229 */
230 #define PMAP_PREINIT_MAPPING_COUNT 8
231 static struct pmap_preinit_mapping {
232 vm_paddr_t pa;
233 vm_offset_t va;
234 vm_size_t sz;
235 int mode;
236 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
237 static int pmap_initialized;
238
239 static struct rwlock_padalign pvh_global_lock;
240
241 /*
242 * Data for the pv entry allocation mechanism
243 */
244 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
245 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
246 static struct md_page *pv_table;
247 static int shpgperproc = PMAP_SHPGPERPROC;
248
249 struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */
250 int pv_maxchunks; /* How many chunks we have KVA for */
251 vm_offset_t pv_vafree; /* freelist stored in the PTE */
252
253 /*
254 * All those kernel PT submaps that BSD is so fond of
255 */
256 pt_entry_t *CMAP3;
257 static pd_entry_t *KPTD;
258 caddr_t ptvmmap = 0;
259 caddr_t CADDR3;
260 struct msgbuf *msgbufp = NULL;
261
262 /*
263 * Crashdump maps.
264 */
265 static caddr_t crashdumpmap;
266
267 static pt_entry_t *PMAP1 = NULL, *PMAP2;
268 static pt_entry_t *PADDR1 = NULL, *PADDR2;
269 #ifdef SMP
270 static int PMAP1cpu;
271 static int PMAP1changedcpu;
272 SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
273 &PMAP1changedcpu, 0,
274 "Number of times pmap_pte_quick changed CPU with same PMAP1");
275 #endif
276 static int PMAP1changed;
277 SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
278 &PMAP1changed, 0,
279 "Number of times pmap_pte_quick changed PMAP1");
280 static int PMAP1unchanged;
281 SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
282 &PMAP1unchanged, 0,
283 "Number of times pmap_pte_quick didn't change PMAP1");
284 static struct mtx PMAP2mutex;
285
286 int pti;
287
288 static void free_pv_chunk(struct pv_chunk *pc);
289 static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
290 static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
291 static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
292 static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
293 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
294 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
295 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
296 vm_offset_t va);
297 static int pmap_pvh_wired_mappings(struct md_page *pvh, int count);
298
299 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
300 static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
301 vm_prot_t prot);
302 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
303 vm_page_t m, vm_prot_t prot, vm_page_t mpte);
304 static void pmap_flush_page(vm_page_t m);
305 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
306 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
307 pd_entry_t pde);
308 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
309 static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
310 static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
311 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
312 static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
313 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
314 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
315 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
316 vm_prot_t prot);
317 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
318 static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
319 struct spglist *free);
320 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
321 struct spglist *free);
322 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
323 static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
324 struct spglist *free);
325 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
326 vm_offset_t va);
327 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
328 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
329 vm_page_t m);
330 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
331 pd_entry_t newpde);
332 static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
333
334 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags);
335
336 static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags);
337 static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free);
338 static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
339 static void pmap_pte_release(pt_entry_t *pte);
340 static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *);
341 #if defined(PAE) || defined(PAE_TABLES)
342 static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags,
343 int wait);
344 #endif
345 static void pmap_set_pg(void);
346
347 static __inline void pagezero(void *page);
348
349 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
350 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
351
352 /*
353 * If you get an error here, then you set KVA_PAGES wrong! See the
354 * description of KVA_PAGES in sys/i386/include/pmap.h. It must be a
355 * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE kernel.
356 */
357 CTASSERT(KERNBASE % (1 << 24) == 0);
358
359 /*
360 * Bootstrap the system enough to run with virtual memory.
361 *
362 * On the i386 this is called after mapping has already been enabled
363 * and just syncs the pmap module with what has already been done.
364 * [We can't call it easily with mapping off since the kernel is not
365 * mapped with PA == VA, hence we would have to relocate every address
366 * from the linked base (virtual) address "KERNBASE" to the actual
367 * (physical) address starting relative to 0]
368 */
369 void
370 pmap_bootstrap(vm_paddr_t firstaddr)
371 {
372 vm_offset_t va;
373 pt_entry_t *pte, *unused;
374 struct pcpu *pc;
375 int i;
376
377 /*
378 * Add a physical memory segment (vm_phys_seg) corresponding to the
379 * preallocated kernel page table pages so that vm_page structures
380 * representing these pages will be created. The vm_page structures
381 * are required for promotion of the corresponding kernel virtual
382 * addresses to superpage mappings.
383 */
384 vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
385
386 /*
387 * Initialize the first available kernel virtual address. However,
388 * using "firstaddr" may waste a few pages of the kernel virtual
389 * address space, because locore may not have mapped every physical
390 * page that it allocated. Preferably, locore would provide a first
391 * unused virtual address in addition to "firstaddr".
392 */
393 virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
394
395 virtual_end = VM_MAX_KERNEL_ADDRESS;
396
397 /*
398 * Initialize the kernel pmap (which is statically allocated).
399 */
400 PMAP_LOCK_INIT(kernel_pmap);
401 kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
402 #if defined(PAE) || defined(PAE_TABLES)
403 kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
404 #endif
405 CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */
406 TAILQ_INIT(&kernel_pmap->pm_pvchunk);
407
408 /*
409 * Initialize the global pv list lock.
410 */
411 rw_init(&pvh_global_lock, "pmap pv global");
412
413 LIST_INIT(&allpmaps);
414
415 /*
416 * Request a spin mutex so that changes to allpmaps cannot be
417 * preempted by smp_rendezvous_cpus(). Otherwise,
418 * pmap_update_pde_kernel() could access allpmaps while it is
419 * being changed.
420 */
421 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
422 mtx_lock_spin(&allpmaps_lock);
423 LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
424 mtx_unlock_spin(&allpmaps_lock);
425
426 /*
427 * Reserve some special page table entries/VA space for temporary
428 * mapping of pages.
429 */
430 #define SYSMAP(c, p, v, n) \
431 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
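/*
 * For illustration, SYSMAP(caddr_t, CMAP3, CADDR3, 1) expands roughly to
 *
 *	CADDR3 = (caddr_t)va; va += 1 * PAGE_SIZE; CMAP3 = pte; pte += 1;
 *
 * i.e. it reserves one page of KVA at "va" and records the PTE that maps
 * it, so the caller can later retarget that VA by storing into the PTE.
 */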
432
433 va = virtual_avail;
434 pte = vtopte(va);
435
436
437 /*
438 * Initialize temporary map objects on the current CPU for use
439 * during early boot.
440 * CMAP1/CMAP2 are used for zeroing and copying pages.
441 * CMAP3 is used for the idle process page zeroing.
442 */
443 pc = get_pcpu();
444 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
445 SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1)
446 SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1)
447 SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1)
448
449 SYSMAP(caddr_t, CMAP3, CADDR3, 1)
450
451 /*
452 * Crashdump maps.
453 */
454 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
455
456 /*
457 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
458 */
459 SYSMAP(caddr_t, unused, ptvmmap, 1)
460
461 /*
462 * msgbufp is used to map the system message buffer.
463 */
464 SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))
465
466 /*
467 * KPTmap is used by pmap_kextract().
468 *
469 * KPTmap is first initialized by locore. However, that initial
470 * KPTmap can only support NKPT page table pages. Here, a larger
471 * KPTmap is created that can support KVA_PAGES page table pages.
472 */
473 SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)
474
475 for (i = 0; i < NKPT; i++)
476 KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;
477
478 /*
479 * Adjust the start of the KPTD and KPTmap so that the implementation
480 * of pmap_kextract() and pmap_growkernel() can be made simpler.
481 */
482 KPTD -= KPTDI;
483 KPTmap -= i386_btop(KPTDI << PDRSHIFT);
484
485 /*
486 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(),
487 * respectively.
488 */
489 SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
490 SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)
491
492 mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
493
494 virtual_avail = va;
495
496 /*
497 * Leave in place an identity mapping (virt == phys) for the low 1 MB
498 * physical memory region that is used by the ACPI wakeup code. This
499 * mapping must not have PG_G set.
500 */
501 #ifdef XBOX
502 /* FIXME: This is gross, but needed for the XBOX. Since we are in such
503 * an early stage, we cannot yet neatly map video memory ... :-(
504 * Better fixes are very welcome! */
505 if (!arch_i386_is_xbox)
506 #endif
507 for (i = 1; i < NKPT; i++)
508 PTD[i] = 0;
509
510 /*
511 * Initialize the PAT MSR if present.
512 * pmap_init_pat() clears and sets CR4_PGE, which, as a
513 * side-effect, invalidates stale PG_G TLB entries that might
514 * have been created in our pre-boot environment. We assume
515 * that PAT support implies PGE and, conversely, that PGE presence
516 * comes with PAT. Both features were introduced with the Pentium Pro.
517 */
518 pmap_init_pat();
519
520 /* Turn on PG_G on kernel page(s) */
521 pmap_set_pg();
522 }
523
524 static void
525 pmap_init_reserved_pages(void)
526 {
527 struct pcpu *pc;
528 vm_offset_t pages;
529 int i;
530
531 CPU_FOREACH(i) {
532 pc = pcpu_find(i);
533 /*
534 * Skip if the mapping has already been initialized,
535 * i.e. this is the BSP.
536 */
537 if (pc->pc_cmap_addr1 != 0)
538 continue;
539 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
540 pages = kva_alloc(PAGE_SIZE * 3);
541 if (pages == 0)
542 panic("%s: unable to allocate KVA", __func__);
543 pc->pc_cmap_pte1 = vtopte(pages);
544 pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE);
545 pc->pc_cmap_addr1 = (caddr_t)pages;
546 pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE);
547 pc->pc_qmap_addr = pages + (PAGE_SIZE * 2);
548 }
549 }
550
551 SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL);
552
553 /*
554 * Setup the PAT MSR.
555 */
556 void
557 pmap_init_pat(void)
558 {
559 int pat_table[PAT_INDEX_SIZE];
560 uint64_t pat_msr;
561 u_long cr0, cr4;
562 int i;
563
564 /* Set default PAT index table. */
565 for (i = 0; i < PAT_INDEX_SIZE; i++)
566 pat_table[i] = -1;
567 pat_table[PAT_WRITE_BACK] = 0;
568 pat_table[PAT_WRITE_THROUGH] = 1;
569 pat_table[PAT_UNCACHEABLE] = 3;
570 pat_table[PAT_WRITE_COMBINING] = 3;
571 pat_table[PAT_WRITE_PROTECTED] = 3;
572 pat_table[PAT_UNCACHED] = 3;
573
574 /*
575 * Bail if this CPU doesn't implement PAT.
576 * We assume that PAT support implies PGE.
577 */
578 if ((cpu_feature & CPUID_PAT) == 0) {
579 for (i = 0; i < PAT_INDEX_SIZE; i++)
580 pat_index[i] = pat_table[i];
581 pat_works = 0;
582 return;
583 }
584
585 /*
586 * Due to some Intel errata, we can only safely use the lower 4
587 * PAT entries.
588 *
589 * Intel Pentium III Processor Specification Update
590 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
591 * or Mode C Paging)
592 *
593 * Intel Pentium IV Processor Specification Update
594 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
595 */
596 if (cpu_vendor_id == CPU_VENDOR_INTEL &&
597 !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
598 pat_works = 0;
599
600 /* Initialize default PAT entries. */
601 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
602 PAT_VALUE(1, PAT_WRITE_THROUGH) |
603 PAT_VALUE(2, PAT_UNCACHED) |
604 PAT_VALUE(3, PAT_UNCACHEABLE) |
605 PAT_VALUE(4, PAT_WRITE_BACK) |
606 PAT_VALUE(5, PAT_WRITE_THROUGH) |
607 PAT_VALUE(6, PAT_UNCACHED) |
608 PAT_VALUE(7, PAT_UNCACHEABLE);
609
610 if (pat_works) {
611 /*
612 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
613 * Program 5 and 6 as WP and WC.
614 * Leave 4 and 7 as WB and UC.
615 */
616 pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
617 pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
618 PAT_VALUE(6, PAT_WRITE_COMBINING);
619 pat_table[PAT_UNCACHED] = 2;
620 pat_table[PAT_WRITE_PROTECTED] = 5;
621 pat_table[PAT_WRITE_COMBINING] = 6;
622 } else {
623 /*
624 * Just replace PAT Index 2 with WC instead of UC-.
625 */
626 pat_msr &= ~PAT_MASK(2);
627 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
628 pat_table[PAT_WRITE_COMBINING] = 2;
629 }
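/*
 * A sketch of the PAT layout that results from the code above:
 *
 *	index:      0   1   2    3   4   5   6    7
 *	pat_works:  WB  WT  UC-  UC  WB  WP  WC   UC
 *	otherwise:  WB  WT  WC   UC  WB  WT  UC-  UC
 *
 * pat_table[] (and thus pat_index[]) then maps each PAT_* caching mode
 * to one of these slots.
 */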
630
631 /* Disable PGE. */
632 cr4 = rcr4();
633 load_cr4(cr4 & ~CR4_PGE);
634
635 /* Disable caches (CD = 1, NW = 0). */
636 cr0 = rcr0();
637 load_cr0((cr0 & ~CR0_NW) | CR0_CD);
638
639 /* Flushes caches and TLBs. */
640 wbinvd();
641 invltlb();
642
643 /* Update PAT and index table. */
644 wrmsr(MSR_PAT, pat_msr);
645 for (i = 0; i < PAT_INDEX_SIZE; i++)
646 pat_index[i] = pat_table[i];
647
648 /* Flush caches and TLBs again. */
649 wbinvd();
650 invltlb();
651
652 /* Restore caches and PGE. */
653 load_cr0(cr0);
654 load_cr4(cr4);
655 }
656
657 /*
658 * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on.
659 */
660 static void
661 pmap_set_pg(void)
662 {
663 pt_entry_t *pte;
664 vm_offset_t va, endva;
665
666 if (pgeflag == 0)
667 return;
668
669 endva = KERNBASE + KERNend;
670
671 if (pseflag) {
672 va = KERNBASE + KERNLOAD;
673 while (va < endva) {
674 pdir_pde(PTD, va) |= pgeflag;
675 invltlb(); /* Flush non-PG_G entries. */
676 va += NBPDR;
677 }
678 } else {
679 va = (vm_offset_t)btext;
680 while (va < endva) {
681 pte = vtopte(va);
682 if (*pte)
683 *pte |= pgeflag;
684 invltlb(); /* Flush non-PG_G entries. */
685 va += PAGE_SIZE;
686 }
687 }
688 }
689
690 /*
691 * Initialize a vm_page's machine-dependent fields.
692 */
693 void
694 pmap_page_init(vm_page_t m)
695 {
696
697 TAILQ_INIT(&m->md.pv_list);
698 m->md.pat_mode = PAT_WRITE_BACK;
699 }
700
701 #if defined(PAE) || defined(PAE_TABLES)
702 static void *
703 pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait)
704 {
705
706 /* Inform UMA that this allocator uses kernel_map/object. */
707 *flags = UMA_SLAB_KERNEL;
708 return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL,
709 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
710 }
711 #endif
712
713 /*
714 * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
715 * Requirements:
716 * - Must deal with pages in order to ensure that none of the PG_* bits
717 * are ever set, PG_V in particular.
718 * - Assumes we can write to ptes without pte_store() atomic ops, even
719 * on PAE systems. This should be ok.
720 * - Assumes nothing will ever test these addresses for 0 to indicate
721 * no mapping instead of correctly checking PG_V.
722 * - Assumes a vm_offset_t will fit in a pte (true for i386).
723 * Because PG_V is never set, there can be no mappings to invalidate.
724 */
725 static vm_offset_t
726 pmap_ptelist_alloc(vm_offset_t *head)
727 {
728 pt_entry_t *pte;
729 vm_offset_t va;
730
731 va = *head;
732 if (va == 0)
733 panic("pmap_ptelist_alloc: exhausted ptelist KVA");
734 pte = vtopte(va);
735 *head = *pte;
736 if (*head & PG_V)
737 panic("pmap_ptelist_alloc: va with PG_V set!");
738 *pte = 0;
739 return (va);
740 }
741
742 static void
743 pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
744 {
745 pt_entry_t *pte;
746
747 if (va & PG_V)
748 panic("pmap_ptelist_free: freeing va with PG_V set!");
749 pte = vtopte(va);
750 *pte = *head; /* virtual! PG_V is 0 though */
751 *head = va;
752 }
753
754 static void
755 pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
756 {
757 int i;
758 vm_offset_t va;
759
760 *head = 0;
761 for (i = npages - 1; i >= 0; i--) {
762 va = (vm_offset_t)base + i * PAGE_SIZE;
763 pmap_ptelist_free(head, va);
764 }
765 }
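/*
 * The resulting freelist, informally: *head holds the lowest free VA, and
 * for each free VA the otherwise unused PTE at vtopte(va) holds the next
 * free VA, e.g.
 *
 *	*head -> base -> base + PAGE_SIZE -> ... -> 0
 *
 * Because PG_V is never set, these "pointers" can never be mistaken for
 * real mappings.
 */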
766
767
768 /*
769 * Initialize the pmap module.
770 * Called by vm_init, to initialize any structures that the pmap
771 * system needs to map virtual memory.
772 */
773 void
774 pmap_init(void)
775 {
776 struct pmap_preinit_mapping *ppim;
777 vm_page_t mpte;
778 vm_size_t s;
779 int i, pv_npg;
780
781 /*
782 * Initialize the vm page array entries for the kernel pmap's
783 * page table pages.
784 */
785 for (i = 0; i < NKPT; i++) {
786 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
787 KASSERT(mpte >= vm_page_array &&
788 mpte < &vm_page_array[vm_page_array_size],
789 ("pmap_init: page table page is out of range"));
790 mpte->pindex = i + KPTDI;
791 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
792 }
793
794 /*
795 * Initialize the address space (zone) for the pv entries. Set a
796 * high water mark so that the system can recover from excessive
797 * numbers of pv entries.
798 */
799 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
800 pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count;
801 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
802 pv_entry_max = roundup(pv_entry_max, _NPCPV);
803 pv_entry_high_water = 9 * (pv_entry_max / 10);
804
805 /*
806 * If the kernel is running on a virtual machine, then it must assume
807 * that MCA is enabled by the hypervisor. Moreover, the kernel must
808 * be prepared for the hypervisor changing the vendor and family that
809 * are reported by CPUID. Consequently, the workaround for AMD Family
810 * 10h Erratum 383 is enabled if the processor's feature set does not
811 * include at least one feature that is only supported by older Intel
812 * or newer AMD processors.
813 */
814 if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
815 (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
816 CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
817 AMDID2_FMA4)) == 0)
818 workaround_erratum383 = 1;
819
820 /*
821 * Are large page mappings supported and enabled?
822 */
823 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
824 if (pseflag == 0)
825 pg_ps_enabled = 0;
826 else if (pg_ps_enabled) {
827 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
828 ("pmap_init: can't assign to pagesizes[1]"));
829 pagesizes[1] = NBPDR;
830 }
831
832 /*
833 * Calculate the size of the pv head table for superpages.
834 * Handle the possibility that "vm_phys_segs[...].end" is zero.
835 */
836 pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end -
837 PAGE_SIZE) / NBPDR + 1;
838
839 /*
840 * Allocate memory for the pv head table for superpages.
841 */
842 s = (vm_size_t)(pv_npg * sizeof(struct md_page));
843 s = round_page(s);
844 pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
845 M_WAITOK | M_ZERO);
846 for (i = 0; i < pv_npg; i++)
847 TAILQ_INIT(&pv_table[i].pv_list);
848
849 pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
850 pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks);
851 if (pv_chunkbase == NULL)
852 panic("pmap_init: not enough kvm for pv chunks");
853 pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
854 #if defined(PAE) || defined(PAE_TABLES)
855 pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
856 NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
857 UMA_ZONE_VM | UMA_ZONE_NOFREE);
858 uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
859 #endif
860
861 pmap_initialized = 1;
862 if (!bootverbose)
863 return;
864 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
865 ppim = pmap_preinit_mapping + i;
866 if (ppim->va == 0)
867 continue;
868 printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i,
869 (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode);
870 }
871 }
872
873
874 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
875 "Max number of PV entries");
876 SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
877 "Page share factor per proc");
878
879 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
880 "2/4MB page mapping counters");
881
882 static u_long pmap_pde_demotions;
883 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
884 &pmap_pde_demotions, 0, "2/4MB page demotions");
885
886 static u_long pmap_pde_mappings;
887 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
888 &pmap_pde_mappings, 0, "2/4MB page mappings");
889
890 static u_long pmap_pde_p_failures;
891 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
892 &pmap_pde_p_failures, 0, "2/4MB page promotion failures");
893
894 static u_long pmap_pde_promotions;
895 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
896 &pmap_pde_promotions, 0, "2/4MB page promotions");
897
898 /***************************************************
899 * Low level helper routines.....
900 ***************************************************/
901
902 /*
903 * Determine the appropriate bits to set in a PTE or PDE for a specified
904 * caching mode.
905 */
906 int
907 pmap_cache_bits(int mode, boolean_t is_pde)
908 {
909 int cache_bits, pat_flag, pat_idx;
910
911 if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
912 panic("Unknown caching mode %d\n", mode);
913
914 /* The PAT bit is different for PTE's and PDE's. */
915 pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
916
917 /* Map the caching mode to a PAT index. */
918 pat_idx = pat_index[mode];
919
920 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
921 cache_bits = 0;
922 if (pat_idx & 0x4)
923 cache_bits |= pat_flag;
924 if (pat_idx & 0x2)
925 cache_bits |= PG_NC_PCD;
926 if (pat_idx & 0x1)
927 cache_bits |= PG_NC_PWT;
928 return (cache_bits);
929 }
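/*
 * Worked example: with a fully functional PAT (pat_works != 0),
 * pat_index[PAT_WRITE_COMBINING] is 6 (binary 110), so for a PTE
 * pmap_cache_bits(PAT_WRITE_COMBINING, 0) returns PG_PTE_PAT | PG_NC_PCD,
 * selecting PAT entry 6, which pmap_init_pat() programmed as WC.
 */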
930
931 /*
932 * The caller is responsible for maintaining TLB consistency.
933 */
934 static void
935 pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
936 {
937 pd_entry_t *pde;
938 pmap_t pmap;
939 boolean_t PTD_updated;
940
941 PTD_updated = FALSE;
942 mtx_lock_spin(&allpmaps_lock);
943 LIST_FOREACH(pmap, &allpmaps, pm_list) {
944 if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
945 PG_FRAME))
946 PTD_updated = TRUE;
947 pde = pmap_pde(pmap, va);
948 pde_store(pde, newpde);
949 }
950 mtx_unlock_spin(&allpmaps_lock);
951 KASSERT(PTD_updated,
952 ("pmap_kenter_pde: current page table is not in allpmaps"));
953 }
954
955 /*
956 * After changing the page size for the specified virtual address in the page
957 * table, flush the corresponding entries from the processor's TLB. Only the
958 * calling processor's TLB is affected.
959 *
960 * The calling thread must be pinned to a processor.
961 */
962 static void
963 pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
964 {
965 u_long cr4;
966
967 if ((newpde & PG_PS) == 0)
968 /* Demotion: flush a specific 2- or 4MB page mapping. */
969 invlpg(va);
970 else if ((newpde & PG_G) == 0)
971 /*
972 * Promotion: flush every 4KB page mapping from the TLB
973 * because there are too many to flush individually.
974 */
975 invltlb();
976 else {
977 /*
978 * Promotion: flush every 4KB page mapping from the TLB,
979 * including any global (PG_G) mappings.
980 */
981 cr4 = rcr4();
982 load_cr4(cr4 & ~CR4_PGE);
983 /*
984 * Although preemption at this point could be detrimental to
985 * performance, it would not lead to an error. PG_G is simply
986 * ignored if CR4.PGE is clear. Moreover, in case this block
987 * is re-entered, the load_cr4() either above or below will
988 * modify CR4.PGE flushing the TLB.
989 */
990 load_cr4(cr4 | CR4_PGE);
991 }
992 }
993
994 void
995 invltlb_glob(void)
996 {
997 uint64_t cr4;
998
999 if (pgeflag == 0) {
1000 invltlb();
1001 } else {
1002 cr4 = rcr4();
1003 load_cr4(cr4 & ~CR4_PGE);
1004 load_cr4(cr4 | CR4_PGE);
1005 }
1006 }
1007
1008
1009 #ifdef SMP
1010 /*
1011 * For SMP, these functions have to use the IPI mechanism for coherence.
1012 *
1013 * N.B.: Before calling any of the following TLB invalidation functions,
1014 * the calling processor must ensure that all stores updating a non-
1015 * kernel page table are globally performed. Otherwise, another
1016 * processor could cache an old, pre-update entry without being
1017 * invalidated. This can happen one of two ways: (1) The pmap becomes
1018 * active on another processor after its pm_active field is checked by
1019 * one of the following functions but before a store updating the page
1020 * table is globally performed. (2) The pmap becomes active on another
1021 * processor before its pm_active field is checked but due to
1022 * speculative loads one of the following functions still reads the
1023 * pmap as inactive on the other processor.
1024 *
1025 * The kernel page table is exempt because its pm_active field is
1026 * immutable. The kernel page table is always active on every
1027 * processor.
1028 */
1029 void
1030 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1031 {
1032 cpuset_t *mask, other_cpus;
1033 u_int cpuid;
1034
1035 sched_pin();
1036 if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1037 invlpg(va);
1038 mask = &all_cpus;
1039 } else {
1040 cpuid = PCPU_GET(cpuid);
1041 other_cpus = all_cpus;
1042 CPU_CLR(cpuid, &other_cpus);
1043 if (CPU_ISSET(cpuid, &pmap->pm_active))
1044 invlpg(va);
1045 CPU_AND(&other_cpus, &pmap->pm_active);
1046 mask = &other_cpus;
1047 }
1048 smp_masked_invlpg(*mask, va, pmap);
1049 sched_unpin();
1050 }
1051
1052 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
1053 #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE)
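/*
 * That is, ranges covering 4096 PTEs * 4KB = 16MB or more are handled by
 * a full TLB flush below instead of per-page invlpg instructions.
 */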
1054
1055 void
1056 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1057 {
1058 cpuset_t *mask, other_cpus;
1059 vm_offset_t addr;
1060 u_int cpuid;
1061
1062 if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
1063 pmap_invalidate_all(pmap);
1064 return;
1065 }
1066
1067 sched_pin();
1068 if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1069 for (addr = sva; addr < eva; addr += PAGE_SIZE)
1070 invlpg(addr);
1071 mask = &all_cpus;
1072 } else {
1073 cpuid = PCPU_GET(cpuid);
1074 other_cpus = all_cpus;
1075 CPU_CLR(cpuid, &other_cpus);
1076 if (CPU_ISSET(cpuid, &pmap->pm_active))
1077 for (addr = sva; addr < eva; addr += PAGE_SIZE)
1078 invlpg(addr);
1079 CPU_AND(&other_cpus, &pmap->pm_active);
1080 mask = &other_cpus;
1081 }
1082 smp_masked_invlpg_range(*mask, sva, eva, pmap);
1083 sched_unpin();
1084 }
1085
1086 void
1087 pmap_invalidate_all(pmap_t pmap)
1088 {
1089 cpuset_t *mask, other_cpus;
1090 u_int cpuid;
1091
1092 sched_pin();
1093 if (pmap == kernel_pmap) {
1094 invltlb_glob();
1095 mask = &all_cpus;
1096 } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) {
1097 invltlb();
1098 mask = &all_cpus;
1099 } else {
1100 cpuid = PCPU_GET(cpuid);
1101 other_cpus = all_cpus;
1102 CPU_CLR(cpuid, &other_cpus);
1103 if (CPU_ISSET(cpuid, &pmap->pm_active))
1104 invltlb();
1105 CPU_AND(&other_cpus, &pmap->pm_active);
1106 mask = &other_cpus;
1107 }
1108 smp_masked_invltlb(*mask, pmap);
1109 sched_unpin();
1110 }
1111
1112 void
1113 pmap_invalidate_cache(void)
1114 {
1115
1116 sched_pin();
1117 wbinvd();
1118 smp_cache_flush();
1119 sched_unpin();
1120 }
1121
1122 struct pde_action {
1123 cpuset_t invalidate; /* processors that invalidate their TLB */
1124 vm_offset_t va;
1125 pd_entry_t *pde;
1126 pd_entry_t newpde;
1127 u_int store; /* processor that updates the PDE */
1128 };
1129
1130 static void
1131 pmap_update_pde_kernel(void *arg)
1132 {
1133 struct pde_action *act = arg;
1134 pd_entry_t *pde;
1135 pmap_t pmap;
1136
1137 if (act->store == PCPU_GET(cpuid)) {
1138
1139 /*
1140 * Elsewhere, this operation requires allpmaps_lock for
1141 * synchronization. Here, it does not because it is being
1142 * performed in the context of an all_cpus rendezvous.
1143 */
1144 LIST_FOREACH(pmap, &allpmaps, pm_list) {
1145 pde = pmap_pde(pmap, act->va);
1146 pde_store(pde, act->newpde);
1147 }
1148 }
1149 }
1150
1151 static void
1152 pmap_update_pde_user(void *arg)
1153 {
1154 struct pde_action *act = arg;
1155
1156 if (act->store == PCPU_GET(cpuid))
1157 pde_store(act->pde, act->newpde);
1158 }
1159
1160 static void
1161 pmap_update_pde_teardown(void *arg)
1162 {
1163 struct pde_action *act = arg;
1164
1165 if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1166 pmap_update_pde_invalidate(act->va, act->newpde);
1167 }
1168
1169 /*
1170 * Change the page size for the specified virtual address in a way that
1171 * prevents any possibility of the TLB ever having two entries that map the
1172 * same virtual address using different page sizes. This is the recommended
1173 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a
1174 * machine check exception for a TLB state that is improperly diagnosed as a
1175 * hardware error.
1176 */
1177 static void
1178 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1179 {
1180 struct pde_action act;
1181 cpuset_t active, other_cpus;
1182 u_int cpuid;
1183
1184 sched_pin();
1185 cpuid = PCPU_GET(cpuid);
1186 other_cpus = all_cpus;
1187 CPU_CLR(cpuid, &other_cpus);
1188 if (pmap == kernel_pmap)
1189 active = all_cpus;
1190 else
1191 active = pmap->pm_active;
1192 if (CPU_OVERLAP(&active, &other_cpus)) {
1193 act.store = cpuid;
1194 act.invalidate = active;
1195 act.va = va;
1196 act.pde = pde;
1197 act.newpde = newpde;
1198 CPU_SET(cpuid, &active);
1199 smp_rendezvous_cpus(active,
1200 smp_no_rendevous_barrier, pmap == kernel_pmap ?
1201 pmap_update_pde_kernel : pmap_update_pde_user,
1202 pmap_update_pde_teardown, &act);
1203 } else {
1204 if (pmap == kernel_pmap)
1205 pmap_kenter_pde(va, newpde);
1206 else
1207 pde_store(pde, newpde);
1208 if (CPU_ISSET(cpuid, &active))
1209 pmap_update_pde_invalidate(va, newpde);
1210 }
1211 sched_unpin();
1212 }
1213 #else /* !SMP */
1214 /*
1215 * Normal, non-SMP, 486+ invalidation functions.
1216 * We inline these within pmap.c for speed.
1217 */
1218 PMAP_INLINE void
1219 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1220 {
1221
1222 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1223 invlpg(va);
1224 }
1225
1226 PMAP_INLINE void
1227 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1228 {
1229 vm_offset_t addr;
1230
1231 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1232 for (addr = sva; addr < eva; addr += PAGE_SIZE)
1233 invlpg(addr);
1234 }
1235
1236 PMAP_INLINE void
1237 pmap_invalidate_all(pmap_t pmap)
1238 {
1239
1240 if (pmap == kernel_pmap)
1241 invltlb_glob();
1242 else if (!CPU_EMPTY(&pmap->pm_active))
1243 invltlb();
1244 }
1245
1246 PMAP_INLINE void
1247 pmap_invalidate_cache(void)
1248 {
1249
1250 wbinvd();
1251 }
1252
1253 static void
1254 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1255 {
1256
1257 if (pmap == kernel_pmap)
1258 pmap_kenter_pde(va, newpde);
1259 else
1260 pde_store(pde, newpde);
1261 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1262 pmap_update_pde_invalidate(va, newpde);
1263 }
1264 #endif /* !SMP */
1265
1266 static void
1267 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
1268 {
1269
1270 /*
1271 * When the PDE has PG_PROMOTED set, the 2- or 4MB page mapping was
1272 * created by a promotion that did not invalidate the 512 or 1024 4KB
1273 * page mappings that might exist in the TLB. Consequently, at this
1274 * point, the TLB may hold both 4KB and 2- or 4MB page mappings for
1275 * the address range [va, va + NBPDR). Therefore, the entire range
1276 * must be invalidated here. In contrast, when PG_PROMOTED is clear,
1277 * the TLB will not hold any 4KB page mappings for the address range
1278 * [va, va + NBPDR), and so a single INVLPG suffices to invalidate the
1279 * 2- or 4MB page mapping from the TLB.
1280 */
1281 if ((pde & PG_PROMOTED) != 0)
1282 pmap_invalidate_range(pmap, va, va + NBPDR - 1);
1283 else
1284 pmap_invalidate_page(pmap, va);
1285 }
1286
1287 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024)
1288
1289 void
1290 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force)
1291 {
1292
1293 if (force) {
1294 sva &= ~(vm_offset_t)cpu_clflush_line_size;
1295 } else {
1296 KASSERT((sva & PAGE_MASK) == 0,
1297 ("pmap_invalidate_cache_range: sva not page-aligned"));
1298 KASSERT((eva & PAGE_MASK) == 0,
1299 ("pmap_invalidate_cache_range: eva not page-aligned"));
1300 }
1301
1302 if ((cpu_feature & CPUID_SS) != 0 && !force)
1303 ; /* If "Self Snoop" is supported and allowed, do nothing. */
1304 else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 &&
1305 eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1306 #ifdef DEV_APIC
1307 /*
1308 * XXX: Some CPUs fault, hang, or trash the local APIC
1309 * registers if we use CLFLUSH on the local APIC
1310 * range. The local APIC is always uncached, so we
1311 * don't need to flush for that range anyway.
1312 */
1313 if (pmap_kextract(sva) == lapic_paddr)
1314 return;
1315 #endif
1316 /*
1317 * Otherwise, do per-cache line flush. Use the sfence
1318 * instruction to ensure that previous stores are
1319 * included in the write-back. The processor
1320 * propagates flush to other processors in the cache
1321 * coherence domain.
1322 */
1323 sfence();
1324 for (; sva < eva; sva += cpu_clflush_line_size)
1325 clflushopt(sva);
1326 sfence();
1327 } else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1328 eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1329 #ifdef DEV_APIC
1330 if (pmap_kextract(sva) == lapic_paddr)
1331 return;
1332 #endif
1333 /*
1334 * Writes are ordered by CLFLUSH on Intel CPUs.
1335 */
1336 if (cpu_vendor_id != CPU_VENDOR_INTEL)
1337 mfence();
1338 for (; sva < eva; sva += cpu_clflush_line_size)
1339 clflush(sva);
1340 if (cpu_vendor_id != CPU_VENDOR_INTEL)
1341 mfence();
1342 } else {
1343
1344 /*
1345 * No targeted cache flush methods are supported by the CPU,
1346 * or the supplied range is bigger than 2MB.
1347 * Globally invalidate cache.
1348 */
1349 pmap_invalidate_cache();
1350 }
1351 }
1352
1353 void
1354 pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1355 {
1356 int i;
1357
1358 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1359 (cpu_feature & CPUID_CLFSH) == 0) {
1360 pmap_invalidate_cache();
1361 } else {
1362 for (i = 0; i < count; i++)
1363 pmap_flush_page(pages[i]);
1364 }
1365 }
1366
1367 /*
1368 * Are we current address space or kernel?
1369 */
1370 static __inline int
1371 pmap_is_current(pmap_t pmap)
1372 {
1373
1374 return (pmap == kernel_pmap || pmap ==
1375 vmspace_pmap(curthread->td_proc->p_vmspace));
1376 }
1377
1378 /*
1379 * If the given pmap is not the current or kernel pmap, the returned pte must
1380 * be released by passing it to pmap_pte_release().
1381 */
1382 pt_entry_t *
1383 pmap_pte(pmap_t pmap, vm_offset_t va)
1384 {
1385 pd_entry_t newpf;
1386 pd_entry_t *pde;
1387
1388 pde = pmap_pde(pmap, va);
1389 if (*pde & PG_PS)
1390 return (pde);
1391 if (*pde != 0) {
1392 /* are we current address space or kernel? */
1393 if (pmap_is_current(pmap))
1394 return (vtopte(va));
1395 mtx_lock(&PMAP2mutex);
1396 newpf = *pde & PG_FRAME;
1397 if ((*PMAP2 & PG_FRAME) != newpf) {
1398 *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
1399 pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
1400 }
1401 return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
1402 }
1403 return (NULL);
1404 }
1405
1406 /*
1407 * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte
1408 * being NULL.
1409 */
1410 static __inline void
1411 pmap_pte_release(pt_entry_t *pte)
1412 {
1413
1414 if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
1415 mtx_unlock(&PMAP2mutex);
1416 }
1417
1418 /*
1419 * NB: The sequence of updating a page table followed by accesses to the
1420 * corresponding pages is subject to the situation described in the "AMD64
1421 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23,
1422 * "7.3.1 Special Coherency Considerations". Therefore, issuing the INVLPG
1423 * right after modifying the PTE bits is crucial.
1424 */
1425 static __inline void
1426 invlcaddr(void *caddr)
1427 {
1428
1429 invlpg((u_int)caddr);
1430 }
1431
1432 /*
1433 * Super fast pmap_pte routine best used when scanning
1434 * the pv lists. This eliminates many coarse-grained
1435 * invltlb calls. Note that many of the pv list
1436 * scans are across different pmaps. It is very wasteful
1437 * to do an entire invltlb for checking a single mapping.
1438 *
1439 * If the given pmap is not the current pmap, pvh_global_lock
1440 * must be held and curthread pinned to a CPU.
1441 */
1442 static pt_entry_t *
1443 pmap_pte_quick(pmap_t pmap, vm_offset_t va)
1444 {
1445 pd_entry_t newpf;
1446 pd_entry_t *pde;
1447
1448 pde = pmap_pde(pmap, va);
1449 if (*pde & PG_PS)
1450 return (pde);
1451 if (*pde != 0) {
1452 /* are we current address space or kernel? */
1453 if (pmap_is_current(pmap))
1454 return (vtopte(va));
1455 rw_assert(&pvh_global_lock, RA_WLOCKED);
1456 KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1457 newpf = *pde & PG_FRAME;
1458 if ((*PMAP1 & PG_FRAME) != newpf) {
1459 *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
1460 #ifdef SMP
1461 PMAP1cpu = PCPU_GET(cpuid);
1462 #endif
1463 invlcaddr(PADDR1);
1464 PMAP1changed++;
1465 } else
1466 #ifdef SMP
1467 if (PMAP1cpu != PCPU_GET(cpuid)) {
1468 PMAP1cpu = PCPU_GET(cpuid);
1469 invlcaddr(PADDR1);
1470 PMAP1changedcpu++;
1471 } else
1472 #endif
1473 PMAP1unchanged++;
1474 return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
1475 }
1476 return (0);
1477 }
1478
1479 /*
1480 * Routine: pmap_extract
1481 * Function:
1482 * Extract the physical page address associated
1483 * with the given map/virtual_address pair.
1484 */
1485 vm_paddr_t
1486 pmap_extract(pmap_t pmap, vm_offset_t va)
1487 {
1488 vm_paddr_t rtval;
1489 pt_entry_t *pte;
1490 pd_entry_t pde;
1491
1492 rtval = 0;
1493 PMAP_LOCK(pmap);
1494 pde = pmap->pm_pdir[va >> PDRSHIFT];
1495 if (pde != 0) {
1496 if ((pde & PG_PS) != 0)
1497 rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
1498 else {
1499 pte = pmap_pte(pmap, va);
1500 rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
1501 pmap_pte_release(pte);
1502 }
1503 }
1504 PMAP_UNLOCK(pmap);
1505 return (rtval);
1506 }
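/*
 * Example of the PG_PS case above, assuming a non-PAE 4MB superpage: if
 * (pde & PG_PS_FRAME) is 0x00400000 and (va & PDRMASK) is 0x00101000,
 * the returned physical address is 0x00400000 | 0x00101000 = 0x00501000.
 */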
1507
1508 /*
1509 * Routine: pmap_extract_and_hold
1510 * Function:
1511 * Atomically extract and hold the physical page
1512 * with the given pmap and virtual address pair
1513 * if that mapping permits the given protection.
1514 */
1515 vm_page_t
1516 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1517 {
1518 pd_entry_t pde;
1519 pt_entry_t pte, *ptep;
1520 vm_page_t m;
1521 vm_paddr_t pa;
1522
1523 pa = 0;
1524 m = NULL;
1525 PMAP_LOCK(pmap);
1526 retry:
1527 pde = *pmap_pde(pmap, va);
1528 if (pde != 0) {
1529 if (pde & PG_PS) {
1530 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1531 if (vm_page_pa_tryrelock(pmap, (pde &
1532 PG_PS_FRAME) | (va & PDRMASK), &pa))
1533 goto retry;
1534 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1535 (va & PDRMASK));
1536 vm_page_hold(m);
1537 }
1538 } else {
1539 ptep = pmap_pte(pmap, va);
1540 pte = *ptep;
1541 pmap_pte_release(ptep);
1542 if (pte != 0 &&
1543 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1544 if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
1545 &pa))
1546 goto retry;
1547 m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1548 vm_page_hold(m);
1549 }
1550 }
1551 }
1552 PA_UNLOCK_COND(pa);
1553 PMAP_UNLOCK(pmap);
1554 return (m);
1555 }
1556
1557 /***************************************************
1558 * Low level mapping routines.....
1559 ***************************************************/
1560
1561 /*
1562 * Add a wired page to the kva.
1563 * Note: not SMP coherent.
1564 *
1565 * This function may be used before pmap_bootstrap() is called.
1566 */
1567 PMAP_INLINE void
1568 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1569 {
1570 pt_entry_t *pte;
1571
1572 pte = vtopte(va);
1573 pte_store(pte, pa | PG_RW | PG_V | pgeflag);
1574 }
1575
1576 static __inline void
1577 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1578 {
1579 pt_entry_t *pte;
1580
1581 pte = vtopte(va);
1582 pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
1583 }
1584
1585 /*
1586 * Remove a page from the kernel pagetables.
1587 * Note: not SMP coherent.
1588 *
1589 * This function may be used before pmap_bootstrap() is called.
1590 */
1591 PMAP_INLINE void
1592 pmap_kremove(vm_offset_t va)
1593 {
1594 pt_entry_t *pte;
1595
1596 pte = vtopte(va);
1597 pte_clear(pte);
1598 }
1599
1600 /*
1601 * Used to map a range of physical addresses into kernel
1602 * virtual address space.
1603 *
1604 * The value passed in '*virt' is a suggested virtual address for
1605 * the mapping. Architectures which can support a direct-mapped
1606 * physical to virtual region can return the appropriate address
1607 * within that region, leaving '*virt' unchanged. Other
1608 * architectures should map the pages starting at '*virt' and
1609 * update '*virt' with the first usable address after the mapped
1610 * region.
1611 */
1612 vm_offset_t
1613 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1614 {
1615 vm_offset_t va, sva;
1616 vm_paddr_t superpage_offset;
1617 pd_entry_t newpde;
1618
1619 va = *virt;
1620 /*
1621 * Does the physical address range's size and alignment permit at
1622 * least one superpage mapping to be created?
1623 */
1624 superpage_offset = start & PDRMASK;
1625 if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) {
1626 /*
1627 * Increase the starting virtual address so that its alignment
1628 * does not preclude the use of superpage mappings.
1629 */
1630 if ((va & PDRMASK) < superpage_offset)
1631 va = (va & ~PDRMASK) + superpage_offset;
1632 else if ((va & PDRMASK) > superpage_offset)
1633 va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset;
1634 }
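/*
 * For example (non-PAE, 4MB superpages, PDRMASK == 0x3fffff): if "start"
 * has superpage_offset 0x3000 and the incoming "va" has offset 0x1000
 * within its superpage, "va" is advanced to offset 0x3000 so that "va"
 * and "start" stay congruent modulo NBPDR and whole superpages can be
 * mapped by the PG_PS branch below.
 */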
1635 sva = va;
1636 while (start < end) {
1637 if ((start & PDRMASK) == 0 && end - start >= NBPDR &&
1638 pseflag) {
1639 KASSERT((va & PDRMASK) == 0,
1640 ("pmap_map: misaligned va %#x", va));
1641 newpde = start | PG_PS | pgeflag | PG_RW | PG_V;
1642 pmap_kenter_pde(va, newpde);
1643 va += NBPDR;
1644 start += NBPDR;
1645 } else {
1646 pmap_kenter(va, start);
1647 va += PAGE_SIZE;
1648 start += PAGE_SIZE;
1649 }
1650 }
1651 pmap_invalidate_range(kernel_pmap, sva, va);
1652 *virt = va;
1653 return (sva);
1654 }
1655
1656
1657 /*
1658 * Add a list of wired pages to the kva.
1659 * This routine is only used for temporary
1660 * kernel mappings that do not need to have
1661 * page modification or references recorded.
1662 * Note that old mappings are simply written
1663 * over. The page *must* be wired.
1664 * Note: SMP coherent. Uses a ranged shootdown IPI.
1665 */
1666 void
1667 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1668 {
1669 pt_entry_t *endpte, oldpte, pa, *pte;
1670 vm_page_t m;
1671
1672 oldpte = 0;
1673 pte = vtopte(sva);
1674 endpte = pte + count;
1675 while (pte < endpte) {
1676 m = *ma++;
1677 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
1678 if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
1679 oldpte |= *pte;
1680 pte_store(pte, pa | pgeflag | PG_RW | PG_V);
1681 }
1682 pte++;
1683 }
1684 if (__predict_false((oldpte & PG_V) != 0))
1685 pmap_invalidate_range(kernel_pmap, sva, sva + count *
1686 PAGE_SIZE);
1687 }
1688
1689 /*
1690 * This routine tears out page mappings from the
1691 * kernel -- it is meant only for temporary mappings.
1692 * Note: SMP coherent. Uses a ranged shootdown IPI.
1693 */
1694 void
1695 pmap_qremove(vm_offset_t sva, int count)
1696 {
1697 vm_offset_t va;
1698
1699 va = sva;
1700 while (count-- > 0) {
1701 pmap_kremove(va);
1702 va += PAGE_SIZE;
1703 }
1704 pmap_invalidate_range(kernel_pmap, sva, va);
1705 }
1706
1707 /***************************************************
1708 * Page table page management routines.....
1709 ***************************************************/
1710 static __inline void
1711 pmap_free_zero_pages(struct spglist *free)
1712 {
1713 vm_page_t m;
1714
1715 while ((m = SLIST_FIRST(free)) != NULL) {
1716 SLIST_REMOVE_HEAD(free, plinks.s.ss);
1717 /* Preserve the page's PG_ZERO setting. */
1718 vm_page_free_toq(m);
1719 }
1720 }
1721
1722 /*
1723 * Schedule the specified unused page table page to be freed. Specifically,
1724 * add the page to the specified list of pages that will be released to the
1725 * physical memory manager after the TLB has been updated.
1726 */
1727 static __inline void
1728 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
1729 boolean_t set_PG_ZERO)
1730 {
1731
1732 if (set_PG_ZERO)
1733 m->flags |= PG_ZERO;
1734 else
1735 m->flags &= ~PG_ZERO;
1736 SLIST_INSERT_HEAD(free, m, plinks.s.ss);
1737 }
1738
1739 /*
1740 * Inserts the specified page table page into the specified pmap's collection
1741 * of idle page table pages. Each of a pmap's page table pages is responsible
1742 * for mapping a distinct range of virtual addresses. The pmap's collection is
1743 * ordered by this virtual address range.
1744 */
1745 static __inline int
1746 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1747 {
1748
1749 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1750 return (vm_radix_insert(&pmap->pm_root, mpte));
1751 }
1752
1753 /*
1754 * Removes the page table page mapping the specified virtual address from the
1755 * specified pmap's collection of idle page table pages, and returns it.
1756 * Otherwise, returns NULL if there is no page table page corresponding to the
1757 * specified virtual address.
1758 */
1759 static __inline vm_page_t
1760 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
1761 {
1762
1763 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1764 return (vm_radix_remove(&pmap->pm_root, va >> PDRSHIFT));
1765 }
1766
1767 /*
1768 * Decrements a page table page's wire count, which is used to record the
1769 * number of valid page table entries within the page. If the wire count
1770 * drops to zero, then the page table page is unmapped. Returns TRUE if the
1771 * page table page was unmapped and FALSE otherwise.
1772 */
1773 static inline boolean_t
1774 pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
1775 {
1776
1777 --m->wire_count;
1778 if (m->wire_count == 0) {
1779 _pmap_unwire_ptp(pmap, m, free);
1780 return (TRUE);
1781 } else
1782 return (FALSE);
1783 }
1784
1785 static void
1786 _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
1787 {
1788 vm_offset_t pteva;
1789
1790 /*
1791 * unmap the page table page
1792 */
1793 pmap->pm_pdir[m->pindex] = 0;
1794 --pmap->pm_stats.resident_count;
1795
1796 /*
1797 * This is a release store so that the ordinary store unmapping
1798 * the page table page is globally performed before TLB shoot-
1799 * down is begun.
1800 */
1801 atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1);
1802
1803 /*
1804 * Invalidate the TLB entry for the recursive mapping of the page
1805 * table page so that the unmapping takes effect immediately.
1806 */
1807 pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1808 pmap_invalidate_page(pmap, pteva);
1809
1810 /*
1811 * Put page on a list so that it is released after
1812 * *ALL* TLB shootdown is done
1813 */
1814 pmap_add_delayed_free_list(m, free, TRUE);
1815 }
1816
1817 /*
1818 * After removing a page table entry, this routine is used to
1819 * conditionally free the page, and manage the hold/wire counts.
1820 */
1821 static int
1822 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free)
1823 {
1824 pd_entry_t ptepde;
1825 vm_page_t mpte;
1826
1827 if (va >= VM_MAXUSER_ADDRESS)
1828 return (0);
1829 ptepde = *pmap_pde(pmap, va);
1830 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1831 return (pmap_unwire_ptp(pmap, mpte, free));
1832 }
1833
1834 /*
1835 * Initialize the pmap for the swapper process.
1836 */
1837 void
1838 pmap_pinit0(pmap_t pmap)
1839 {
1840
1841 PMAP_LOCK_INIT(pmap);
1842 /*
1843 * Since the page table directory is shared with the kernel pmap,
1844 * which is already included in the list "allpmaps", this pmap does
1845 * not need to be inserted into that list.
1846 */
1847 pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1848 #if defined(PAE) || defined(PAE_TABLES)
1849 pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1850 #endif
1851 pmap->pm_root.rt_root = 0;
1852 CPU_ZERO(&pmap->pm_active);
1853 PCPU_SET(curpmap, pmap);
1854 TAILQ_INIT(&pmap->pm_pvchunk);
1855 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1856 }
1857
1858 /*
1859 * Initialize a preallocated and zeroed pmap structure,
1860 * such as one in a vmspace structure.
1861 */
1862 int
1863 pmap_pinit(pmap_t pmap)
1864 {
1865 vm_page_t m, ptdpg[NPGPTD];
1866 vm_paddr_t pa;
1867 int i;
1868
1869 /*
1870 * No need to allocate page table space yet but we do need a valid
1871 * page directory table.
1872 */
1873 if (pmap->pm_pdir == NULL) {
1874 pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD);
1875 if (pmap->pm_pdir == NULL)
1876 return (0);
1877 #if defined(PAE) || defined(PAE_TABLES)
1878 pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1879 KASSERT(((vm_offset_t)pmap->pm_pdpt &
1880 ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1881 ("pmap_pinit: pdpt misaligned"));
1882 KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1883 ("pmap_pinit: pdpt above 4g"));
1884 #endif
1885 pmap->pm_root.rt_root = 0;
1886 }
1887 KASSERT(vm_radix_is_empty(&pmap->pm_root),
1888 ("pmap_pinit: pmap has reserved page table page(s)"));
1889
1890 /*
1891 * allocate the page directory page(s)
1892 */
1893 for (i = 0; i < NPGPTD;) {
1894 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
1895 VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1896 if (m == NULL)
1897 VM_WAIT;
1898 else {
1899 ptdpg[i++] = m;
1900 }
1901 }
1902
1903 pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1904
1905 for (i = 0; i < NPGPTD; i++)
1906 if ((ptdpg[i]->flags & PG_ZERO) == 0)
1907 pagezero(pmap->pm_pdir + (i * NPDEPG));
1908
1909 mtx_lock_spin(&allpmaps_lock);
1910 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1911 /* Copy the kernel page table directory entries. */
1912 bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1913 mtx_unlock_spin(&allpmaps_lock);
1914
1915 /* install self-referential address mapping entry(s) */
1916 for (i = 0; i < NPGPTD; i++) {
1917 pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1918 pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1919 #if defined(PAE) || defined(PAE_TABLES)
1920 pmap->pm_pdpt[i] = pa | PG_V;
1921 #endif
1922 }
1923
1924 CPU_ZERO(&pmap->pm_active);
1925 TAILQ_INIT(&pmap->pm_pvchunk);
1926 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1927
1928 return (1);
1929 }
1930
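/*
 * Editorial note (illustrative, not part of the original source): the
 * "self-referential" entries installed above make the page directory double
 * as a page table for the kernel virtual range starting at
 * PTDPTDI << PDRSHIFT.  Consequently, once this pmap is activated, all of
 * its page table entries appear as one linear array of PTEs at a fixed
 * virtual address, which is what vtopte()-style lookups rely on, e.g.:
 *
 *	pt_entry_t *pte = vtopte(va);	(PTE for va in the current pmap)
 */
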
1931 /*
1932 * This routine is called when the page table page for the given index
1933 * is not present; it allocates, zeroes, and maps a new page table page.
1934 */
1935 static vm_page_t
1936 _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags)
1937 {
1938 vm_paddr_t ptepa;
1939 vm_page_t m;
1940
1941 /*
1942 * Allocate a page table page.
1943 */
1944 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1945 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1946 if ((flags & PMAP_ENTER_NOSLEEP) == 0) {
1947 PMAP_UNLOCK(pmap);
1948 rw_wunlock(&pvh_global_lock);
1949 VM_WAIT;
1950 rw_wlock(&pvh_global_lock);
1951 PMAP_LOCK(pmap);
1952 }
1953
1954 /*
1955 * Indicate the need to retry. While waiting, the page table
1956 * page may have been allocated.
1957 */
1958 return (NULL);
1959 }
1960 if ((m->flags & PG_ZERO) == 0)
1961 pmap_zero_page(m);
1962
1963 /*
1964 * Map the pagetable page into the process address space, if
1965 * it isn't already there.
1966 */
1967
1968 pmap->pm_stats.resident_count++;
1969
1970 ptepa = VM_PAGE_TO_PHYS(m);
1971 pmap->pm_pdir[ptepindex] =
1972 (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1973
1974 return (m);
1975 }
1976
1977 static vm_page_t
1978 pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags)
1979 {
1980 u_int ptepindex;
1981 pd_entry_t ptepa;
1982 vm_page_t m;
1983
1984 /*
1985 * Calculate pagetable page index
1986 */
1987 ptepindex = va >> PDRSHIFT;
1988 retry:
1989 /*
1990 * Get the page directory entry
1991 */
1992 ptepa = pmap->pm_pdir[ptepindex];
1993
1994 /*
1995 * This supports switching from a 4MB page to a
1996 * normal 4K page.
1997 */
1998 if (ptepa & PG_PS) {
1999 (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
2000 ptepa = pmap->pm_pdir[ptepindex];
2001 }
2002
2003 /*
2004 * If the page table page is mapped, we just increment the
2005 * hold count, and activate it.
2006 */
2007 if (ptepa) {
2008 m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
2009 m->wire_count++;
2010 } else {
2011 /*
2012 * Here if the pte page isn't mapped, or if it has
2013 * been deallocated.
2014 */
2015 m = _pmap_allocpte(pmap, ptepindex, flags);
2016 if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0)
2017 goto retry;
2018 }
2019 return (m);
2020 }
2021
2022
2023 /***************************************************
2024 * Pmap allocation/deallocation routines.
2025 ***************************************************/
2026
2027 /*
2028 * Release any resources held by the given physical map.
2029 * Called when a pmap initialized by pmap_pinit is being released.
2030 * Should only be called if the map contains no valid mappings.
2031 */
2032 void
2033 pmap_release(pmap_t pmap)
2034 {
2035 vm_page_t m, ptdpg[NPGPTD];
2036 int i;
2037
2038 KASSERT(pmap->pm_stats.resident_count == 0,
2039 ("pmap_release: pmap resident count %ld != 0",
2040 pmap->pm_stats.resident_count));
2041 KASSERT(vm_radix_is_empty(&pmap->pm_root),
2042 ("pmap_release: pmap has reserved page table page(s)"));
2043 KASSERT(CPU_EMPTY(&pmap->pm_active),
2044 ("releasing active pmap %p", pmap));
2045
2046 mtx_lock_spin(&allpmaps_lock);
2047 LIST_REMOVE(pmap, pm_list);
2048 mtx_unlock_spin(&allpmaps_lock);
2049
2050 for (i = 0; i < NPGPTD; i++)
2051 ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
2052 PG_FRAME);
2053
2054 bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
2055 sizeof(*pmap->pm_pdir));
2056
2057 pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
2058
2059 for (i = 0; i < NPGPTD; i++) {
2060 m = ptdpg[i];
2061 #if defined(PAE) || defined(PAE_TABLES)
2062 KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
2063 ("pmap_release: got wrong ptd page"));
2064 #endif
2065 m->wire_count--;
2066 atomic_subtract_int(&vm_cnt.v_wire_count, 1);
2067 vm_page_free_zero(m);
2068 }
2069 }
2070
2071 static int
2072 kvm_size(SYSCTL_HANDLER_ARGS)
2073 {
2074 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
2075
2076 return (sysctl_handle_long(oidp, &ksize, 0, req));
2077 }
2078 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2079 0, 0, kvm_size, "IU", "Size of KVM");
2080
2081 static int
2082 kvm_free(SYSCTL_HANDLER_ARGS)
2083 {
2084 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2085
2086 return (sysctl_handle_long(oidp, &kfree, 0, req));
2087 }
2088 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2089 0, 0, kvm_free, "IU", "Amount of KVM free");
2090
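/*
 * Editorial note (illustrative, not part of the original source): the two
 * handlers above export read-only values that can be inspected from
 * userland, e.g. with "sysctl vm.kvm_size vm.kvm_free"; both report bytes
 * of kernel virtual address space.
 */
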
2091 /*
2092 * grow the number of kernel page table entries, if needed
2093 */
2094 void
2095 pmap_growkernel(vm_offset_t addr)
2096 {
2097 vm_paddr_t ptppaddr;
2098 vm_page_t nkpg;
2099 pd_entry_t newpdir;
2100
2101 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2102 addr = roundup2(addr, NBPDR);
2103 if (addr - 1 >= kernel_map->max_offset)
2104 addr = kernel_map->max_offset;
2105 while (kernel_vm_end < addr) {
2106 if (pdir_pde(PTD, kernel_vm_end)) {
2107 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2108 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2109 kernel_vm_end = kernel_map->max_offset;
2110 break;
2111 }
2112 continue;
2113 }
2114
2115 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
2116 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2117 VM_ALLOC_ZERO);
2118 if (nkpg == NULL)
2119 panic("pmap_growkernel: no memory to grow kernel");
2120
2121 nkpt++;
2122
2123 if ((nkpg->flags & PG_ZERO) == 0)
2124 pmap_zero_page(nkpg);
2125 ptppaddr = VM_PAGE_TO_PHYS(nkpg);
2126 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
2127 pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir;
2128
2129 pmap_kenter_pde(kernel_vm_end, newpdir);
2130 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2131 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2132 kernel_vm_end = kernel_map->max_offset;
2133 break;
2134 }
2135 }
2136 }
2137
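/*
 * Editorial example (not part of the original source): roundup2(addr, NBPDR)
 * rounds the requested end address up to the next superpage boundary.  With
 * a non-PAE NBPDR of 4MB, a hypothetical addr of 0xc1234567 becomes
 * 0xc1400000, and the loop above then installs one new page table page for
 * each missing 4MB slot between kernel_vm_end and that boundary.
 */
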
2138
2139 /***************************************************
2140 * page management routines.
2141 ***************************************************/
2142
2143 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2144 CTASSERT(_NPCM == 11);
2145 CTASSERT(_NPCPV == 336);
2146
2147 static __inline struct pv_chunk *
2148 pv_to_chunk(pv_entry_t pv)
2149 {
2150
2151 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2152 }
2153
2154 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2155
2156 #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */
2157 #define PC_FREE10 0x0000fffful /* Free values for index 10 */
2158
2159 static const uint32_t pc_freemask[_NPCM] = {
2160 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2161 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2162 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2163 PC_FREE0_9, PC_FREE10
2164 };
2165
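/*
 * Editorial note (illustrative, not part of the original source): each
 * struct pv_chunk occupies exactly one page and carries _NPCPV = 336 pv
 * entries, tracked by _NPCM = 11 32-bit bitmap words.  The first ten words
 * account for 10 * 32 = 320 entries, and PC_FREE10 (0x0000ffff) marks the
 * remaining 16, so 320 + 16 = 336; a set bit means that the corresponding
 * entry is free.
 */
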
2166 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2167 "Current number of pv entries");
2168
2169 #ifdef PV_STATS
2170 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2171
2172 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2173 "Current number of pv entry chunks");
2174 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2175 "Current number of pv entry chunks allocated");
2176 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2177 "Current number of pv entry chunks frees");
2178 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2179 "Number of times tried to get a chunk page but failed.");
2180
2181 static long pv_entry_frees, pv_entry_allocs;
2182 static int pv_entry_spare;
2183
2184 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2185 "Current number of pv entry frees");
2186 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2187 "Current number of pv entry allocs");
2188 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2189 "Current number of spare pv entries");
2190 #endif
2191
2192 /*
2193 * We are in a serious low memory condition. Resort to
2194 * drastic measures to free some pages so we can allocate
2195 * another pv entry chunk.
2196 */
2197 static vm_page_t
2198 pmap_pv_reclaim(pmap_t locked_pmap)
2199 {
2200 struct pch newtail;
2201 struct pv_chunk *pc;
2202 struct md_page *pvh;
2203 pd_entry_t *pde;
2204 pmap_t pmap;
2205 pt_entry_t *pte, tpte;
2206 pv_entry_t pv;
2207 vm_offset_t va;
2208 vm_page_t m, m_pc;
2209 struct spglist free;
2210 uint32_t inuse;
2211 int bit, field, freed;
2212
2213 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2214 pmap = NULL;
2215 m_pc = NULL;
2216 SLIST_INIT(&free);
2217 TAILQ_INIT(&newtail);
2218 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
2219 SLIST_EMPTY(&free))) {
2220 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2221 if (pmap != pc->pc_pmap) {
2222 if (pmap != NULL) {
2223 pmap_invalidate_all(pmap);
2224 if (pmap != locked_pmap)
2225 PMAP_UNLOCK(pmap);
2226 }
2227 pmap = pc->pc_pmap;
2228 /* Avoid deadlock and lock recursion. */
2229 if (pmap > locked_pmap)
2230 PMAP_LOCK(pmap);
2231 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
2232 pmap = NULL;
2233 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2234 continue;
2235 }
2236 }
2237
2238 /*
2239 * Destroy every non-wired, 4 KB page mapping in the chunk.
2240 */
2241 freed = 0;
2242 for (field = 0; field < _NPCM; field++) {
2243 for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2244 inuse != 0; inuse &= ~(1UL << bit)) {
2245 bit = bsfl(inuse);
2246 pv = &pc->pc_pventry[field * 32 + bit];
2247 va = pv->pv_va;
2248 pde = pmap_pde(pmap, va);
2249 if ((*pde & PG_PS) != 0)
2250 continue;
2251 pte = pmap_pte(pmap, va);
2252 tpte = *pte;
2253 if ((tpte & PG_W) == 0)
2254 tpte = pte_load_clear(pte);
2255 pmap_pte_release(pte);
2256 if ((tpte & PG_W) != 0)
2257 continue;
2258 KASSERT(tpte != 0,
2259 ("pmap_pv_reclaim: pmap %p va %x zero pte",
2260 pmap, va));
2261 if ((tpte & PG_G) != 0)
2262 pmap_invalidate_page(pmap, va);
2263 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2264 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2265 vm_page_dirty(m);
2266 if ((tpte & PG_A) != 0)
2267 vm_page_aflag_set(m, PGA_REFERENCED);
2268 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2269 if (TAILQ_EMPTY(&m->md.pv_list) &&
2270 (m->flags & PG_FICTITIOUS) == 0) {
2271 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2272 if (TAILQ_EMPTY(&pvh->pv_list)) {
2273 vm_page_aflag_clear(m,
2274 PGA_WRITEABLE);
2275 }
2276 }
2277 pc->pc_map[field] |= 1UL << bit;
2278 pmap_unuse_pt(pmap, va, &free);
2279 freed++;
2280 }
2281 }
2282 if (freed == 0) {
2283 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2284 continue;
2285 }
2286 /* Every freed mapping is for a 4 KB page. */
2287 pmap->pm_stats.resident_count -= freed;
2288 PV_STAT(pv_entry_frees += freed);
2289 PV_STAT(pv_entry_spare += freed);
2290 pv_entry_count -= freed;
2291 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2292 for (field = 0; field < _NPCM; field++)
2293 if (pc->pc_map[field] != pc_freemask[field]) {
2294 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2295 pc_list);
2296 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2297
2298 /*
2299 * One freed pv entry in locked_pmap is
2300 * sufficient.
2301 */
2302 if (pmap == locked_pmap)
2303 goto out;
2304 break;
2305 }
2306 if (field == _NPCM) {
2307 PV_STAT(pv_entry_spare -= _NPCPV);
2308 PV_STAT(pc_chunk_count--);
2309 PV_STAT(pc_chunk_frees++);
2310 /* Entire chunk is free; return it. */
2311 m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2312 pmap_qremove((vm_offset_t)pc, 1);
2313 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2314 break;
2315 }
2316 }
2317 out:
2318 TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
2319 if (pmap != NULL) {
2320 pmap_invalidate_all(pmap);
2321 if (pmap != locked_pmap)
2322 PMAP_UNLOCK(pmap);
2323 }
2324 	if (m_pc == NULL && pv_vafree != 0 && !SLIST_EMPTY(&free)) {
2325 m_pc = SLIST_FIRST(&free);
2326 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2327 /* Recycle a freed page table page. */
2328 m_pc->wire_count = 1;
2329 atomic_add_int(&vm_cnt.v_wire_count, 1);
2330 }
2331 pmap_free_zero_pages(&free);
2332 return (m_pc);
2333 }
2334
2335 /*
2336 * free the pv_entry back to the free list
2337 */
2338 static void
2339 free_pv_entry(pmap_t pmap, pv_entry_t pv)
2340 {
2341 struct pv_chunk *pc;
2342 int idx, field, bit;
2343
2344 rw_assert(&pvh_global_lock, RA_WLOCKED);
2345 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2346 PV_STAT(pv_entry_frees++);
2347 PV_STAT(pv_entry_spare++);
2348 pv_entry_count--;
2349 pc = pv_to_chunk(pv);
2350 idx = pv - &pc->pc_pventry[0];
2351 field = idx / 32;
2352 bit = idx % 32;
2353 pc->pc_map[field] |= 1ul << bit;
2354 for (idx = 0; idx < _NPCM; idx++)
2355 if (pc->pc_map[idx] != pc_freemask[idx]) {
2356 /*
2357 * 98% of the time, pc is already at the head of the
2358 * list. If it isn't already, move it to the head.
2359 */
2360 if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
2361 pc)) {
2362 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2363 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2364 pc_list);
2365 }
2366 return;
2367 }
2368 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2369 free_pv_chunk(pc);
2370 }
2371
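/*
 * Editorial example (not part of the original source): the index arithmetic
 * above maps a pv entry back to its bitmap position.  For a hypothetical
 * entry at pc_pventry[75], idx = 75, so field = 75 / 32 = 2 and
 * bit = 75 % 32 = 11; freeing it sets bit 11 of pc_map[2].
 */
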
2372 static void
2373 free_pv_chunk(struct pv_chunk *pc)
2374 {
2375 vm_page_t m;
2376
2377 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2378 PV_STAT(pv_entry_spare -= _NPCPV);
2379 PV_STAT(pc_chunk_count--);
2380 PV_STAT(pc_chunk_frees++);
2381 /* entire chunk is free, return it */
2382 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2383 pmap_qremove((vm_offset_t)pc, 1);
2384 vm_page_unwire(m, PQ_NONE);
2385 vm_page_free(m);
2386 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2387 }
2388
2389 /*
2390 * get a new pv_entry, allocating a block from the system
2391 * when needed.
2392 */
2393 static pv_entry_t
2394 get_pv_entry(pmap_t pmap, boolean_t try)
2395 {
2396 static const struct timeval printinterval = { 60, 0 };
2397 static struct timeval lastprint;
2398 int bit, field;
2399 pv_entry_t pv;
2400 struct pv_chunk *pc;
2401 vm_page_t m;
2402
2403 rw_assert(&pvh_global_lock, RA_WLOCKED);
2404 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2405 PV_STAT(pv_entry_allocs++);
2406 pv_entry_count++;
2407 if (pv_entry_count > pv_entry_high_water)
2408 if (ratecheck(&lastprint, &printinterval))
2409 printf("Approaching the limit on PV entries, consider "
2410 "increasing either the vm.pmap.shpgperproc or the "
2411 "vm.pmap.pv_entry_max tunable.\n");
2412 retry:
2413 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2414 if (pc != NULL) {
2415 for (field = 0; field < _NPCM; field++) {
2416 if (pc->pc_map[field]) {
2417 bit = bsfl(pc->pc_map[field]);
2418 break;
2419 }
2420 }
2421 if (field < _NPCM) {
2422 pv = &pc->pc_pventry[field * 32 + bit];
2423 pc->pc_map[field] &= ~(1ul << bit);
2424 			/* If this was the last free entry, move the chunk to the tail */
2425 for (field = 0; field < _NPCM; field++)
2426 if (pc->pc_map[field] != 0) {
2427 PV_STAT(pv_entry_spare--);
2428 return (pv); /* not full, return */
2429 }
2430 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2431 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2432 PV_STAT(pv_entry_spare--);
2433 return (pv);
2434 }
2435 }
2436 /*
2437 * Access to the ptelist "pv_vafree" is synchronized by the pvh
2438 * global lock. If "pv_vafree" is currently non-empty, it will
2439 * remain non-empty until pmap_ptelist_alloc() completes.
2440 */
2441 if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2442 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
2443 if (try) {
2444 pv_entry_count--;
2445 PV_STAT(pc_chunk_tryfail++);
2446 return (NULL);
2447 }
2448 m = pmap_pv_reclaim(pmap);
2449 if (m == NULL)
2450 goto retry;
2451 }
2452 PV_STAT(pc_chunk_count++);
2453 PV_STAT(pc_chunk_allocs++);
2454 pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
2455 pmap_qenter((vm_offset_t)pc, &m, 1);
2456 pc->pc_pmap = pmap;
2457 pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */
2458 for (field = 1; field < _NPCM; field++)
2459 pc->pc_map[field] = pc_freemask[field];
2460 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2461 pv = &pc->pc_pventry[0];
2462 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2463 PV_STAT(pv_entry_spare += _NPCPV - 1);
2464 return (pv);
2465 }
2466
2467 static __inline pv_entry_t
2468 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2469 {
2470 pv_entry_t pv;
2471
2472 rw_assert(&pvh_global_lock, RA_WLOCKED);
2473 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
2474 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2475 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
2476 break;
2477 }
2478 }
2479 return (pv);
2480 }
2481
2482 static void
2483 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2484 {
2485 struct md_page *pvh;
2486 pv_entry_t pv;
2487 vm_offset_t va_last;
2488 vm_page_t m;
2489
2490 rw_assert(&pvh_global_lock, RA_WLOCKED);
2491 KASSERT((pa & PDRMASK) == 0,
2492 ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
2493
2494 /*
2495 * Transfer the 4mpage's pv entry for this mapping to the first
2496 * page's pv list.
2497 */
2498 pvh = pa_to_pvh(pa);
2499 va = trunc_4mpage(va);
2500 pv = pmap_pvh_remove(pvh, pmap, va);
2501 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2502 m = PHYS_TO_VM_PAGE(pa);
2503 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2504 /* Instantiate the remaining NPTEPG - 1 pv entries. */
2505 va_last = va + NBPDR - PAGE_SIZE;
2506 do {
2507 m++;
2508 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2509 ("pmap_pv_demote_pde: page %p is not managed", m));
2510 va += PAGE_SIZE;
2511 pmap_insert_entry(pmap, va, m);
2512 } while (va < va_last);
2513 }
2514
2515 static void
2516 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2517 {
2518 struct md_page *pvh;
2519 pv_entry_t pv;
2520 vm_offset_t va_last;
2521 vm_page_t m;
2522
2523 rw_assert(&pvh_global_lock, RA_WLOCKED);
2524 KASSERT((pa & PDRMASK) == 0,
2525 ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
2526
2527 /*
2528 * Transfer the first page's pv entry for this mapping to the
2529 * 4mpage's pv list. Aside from avoiding the cost of a call
2530 * to get_pv_entry(), a transfer avoids the possibility that
2531 * get_pv_entry() calls pmap_pv_reclaim() and that pmap_pv_reclaim()
2532 * removes one of the mappings that is being promoted.
2533 */
2534 m = PHYS_TO_VM_PAGE(pa);
2535 va = trunc_4mpage(va);
2536 pv = pmap_pvh_remove(&m->md, pmap, va);
2537 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
2538 pvh = pa_to_pvh(pa);
2539 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2540 /* Free the remaining NPTEPG - 1 pv entries. */
2541 va_last = va + NBPDR - PAGE_SIZE;
2542 do {
2543 m++;
2544 va += PAGE_SIZE;
2545 pmap_pvh_free(&m->md, pmap, va);
2546 } while (va < va_last);
2547 }
2548
2549 static void
2550 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2551 {
2552 pv_entry_t pv;
2553
2554 pv = pmap_pvh_remove(pvh, pmap, va);
2555 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2556 free_pv_entry(pmap, pv);
2557 }
2558
2559 static void
2560 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2561 {
2562 struct md_page *pvh;
2563
2564 rw_assert(&pvh_global_lock, RA_WLOCKED);
2565 pmap_pvh_free(&m->md, pmap, va);
2566 if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
2567 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2568 if (TAILQ_EMPTY(&pvh->pv_list))
2569 vm_page_aflag_clear(m, PGA_WRITEABLE);
2570 }
2571 }
2572
2573 /*
2574 * Create a pv entry for page at pa for
2575 * (pmap, va).
2576 */
2577 static void
2578 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2579 {
2580 pv_entry_t pv;
2581
2582 rw_assert(&pvh_global_lock, RA_WLOCKED);
2583 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2584 pv = get_pv_entry(pmap, FALSE);
2585 pv->pv_va = va;
2586 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2587 }
2588
2589 /*
2590 * Conditionally create a pv entry.
2591 */
2592 static boolean_t
2593 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2594 {
2595 pv_entry_t pv;
2596
2597 rw_assert(&pvh_global_lock, RA_WLOCKED);
2598 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2599 if (pv_entry_count < pv_entry_high_water &&
2600 (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2601 pv->pv_va = va;
2602 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2603 return (TRUE);
2604 } else
2605 return (FALSE);
2606 }
2607
2608 /*
2609 * Create the pv entries for each of the pages within a superpage.
2610 */
2611 static boolean_t
2612 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2613 {
2614 struct md_page *pvh;
2615 pv_entry_t pv;
2616
2617 rw_assert(&pvh_global_lock, RA_WLOCKED);
2618 if (pv_entry_count < pv_entry_high_water &&
2619 (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2620 pv->pv_va = va;
2621 pvh = pa_to_pvh(pa);
2622 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2623 return (TRUE);
2624 } else
2625 return (FALSE);
2626 }
2627
2628 /*
2629 * Fills a page table page with mappings to consecutive physical pages.
2630 */
2631 static void
2632 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
2633 {
2634 pt_entry_t *pte;
2635
2636 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2637 *pte = newpte;
2638 newpte += PAGE_SIZE;
2639 }
2640 }
2641
2642 /*
2643 * Tries to demote a 2- or 4MB page mapping. If demotion fails, the
2644 * 2- or 4MB page mapping is invalidated.
2645 */
2646 static boolean_t
2647 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2648 {
2649 pd_entry_t newpde, oldpde;
2650 pt_entry_t *firstpte, newpte;
2651 vm_paddr_t mptepa;
2652 vm_page_t mpte;
2653 struct spglist free;
2654 vm_offset_t sva;
2655
2656 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2657 oldpde = *pde;
2658 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
2659 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
2660 if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
2661 NULL) {
2662 KASSERT((oldpde & PG_W) == 0,
2663 ("pmap_demote_pde: page table page for a wired mapping"
2664 " is missing"));
2665
2666 /*
2667 * Invalidate the 2- or 4MB page mapping and return
2668 * "failure" if the mapping was never accessed or the
2669 * allocation of the new page table page fails.
2670 */
2671 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
2672 va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
2673 VM_ALLOC_WIRED)) == NULL) {
2674 SLIST_INIT(&free);
2675 sva = trunc_4mpage(va);
2676 pmap_remove_pde(pmap, pde, sva, &free);
2677 if ((oldpde & PG_G) == 0)
2678 pmap_invalidate_pde_page(pmap, sva, oldpde);
2679 pmap_free_zero_pages(&free);
2680 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
2681 " in pmap %p", va, pmap);
2682 return (FALSE);
2683 }
2684 if (va < VM_MAXUSER_ADDRESS)
2685 pmap->pm_stats.resident_count++;
2686 }
2687 mptepa = VM_PAGE_TO_PHYS(mpte);
2688
2689 /*
2690 * If the page mapping is in the kernel's address space, then the
2691 * KPTmap can provide access to the page table page. Otherwise,
2692 * temporarily map the page table page (mpte) into the kernel's
2693 * address space at either PADDR1 or PADDR2.
2694 */
2695 if (va >= KERNBASE)
2696 firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
2697 else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) {
2698 if ((*PMAP1 & PG_FRAME) != mptepa) {
2699 *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2700 #ifdef SMP
2701 PMAP1cpu = PCPU_GET(cpuid);
2702 #endif
2703 invlcaddr(PADDR1);
2704 PMAP1changed++;
2705 } else
2706 #ifdef SMP
2707 if (PMAP1cpu != PCPU_GET(cpuid)) {
2708 PMAP1cpu = PCPU_GET(cpuid);
2709 invlcaddr(PADDR1);
2710 PMAP1changedcpu++;
2711 } else
2712 #endif
2713 PMAP1unchanged++;
2714 firstpte = PADDR1;
2715 } else {
2716 mtx_lock(&PMAP2mutex);
2717 if ((*PMAP2 & PG_FRAME) != mptepa) {
2718 *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2719 pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
2720 }
2721 firstpte = PADDR2;
2722 }
2723 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2724 KASSERT((oldpde & PG_A) != 0,
2725 ("pmap_demote_pde: oldpde is missing PG_A"));
2726 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2727 ("pmap_demote_pde: oldpde is missing PG_M"));
2728 newpte = oldpde & ~PG_PS;
2729 if ((newpte & PG_PDE_PAT) != 0)
2730 newpte ^= PG_PDE_PAT | PG_PTE_PAT;
2731
2732 /*
2733 * If the page table page is new, initialize it.
2734 */
2735 if (mpte->wire_count == 1) {
2736 mpte->wire_count = NPTEPG;
2737 pmap_fill_ptp(firstpte, newpte);
2738 }
2739 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2740 ("pmap_demote_pde: firstpte and newpte map different physical"
2741 " addresses"));
2742
2743 /*
2744 * If the mapping has changed attributes, update the page table
2745 * entries.
2746 */
2747 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2748 pmap_fill_ptp(firstpte, newpte);
2749
2750 /*
2751 * Demote the mapping. This pmap is locked. The old PDE has
2752 * PG_A set. If the old PDE has PG_RW set, it also has PG_M
2753 * set. Thus, there is no danger of a race with another
2754 * processor changing the setting of PG_A and/or PG_M between
2755 * the read above and the store below.
2756 */
2757 if (workaround_erratum383)
2758 pmap_update_pde(pmap, va, pde, newpde);
2759 else if (pmap == kernel_pmap)
2760 pmap_kenter_pde(va, newpde);
2761 else
2762 pde_store(pde, newpde);
2763 if (firstpte == PADDR2)
2764 mtx_unlock(&PMAP2mutex);
2765
2766 /*
2767 * Invalidate the recursive mapping of the page table page.
2768 */
2769 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2770
2771 /*
2772 * Demote the pv entry. This depends on the earlier demotion
2773 * of the mapping. Specifically, the (re)creation of a per-
2774 * page pv entry might trigger the execution of pmap_pv_reclaim(),
2775 * which might reclaim a newly (re)created per-page pv entry
2776 * and destroy the associated mapping. In order to destroy
2777 * the mapping, the PDE must have already changed from mapping
2778 * the 2mpage to referencing the page table page.
2779 */
2780 if ((oldpde & PG_MANAGED) != 0)
2781 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
2782
2783 pmap_pde_demotions++;
2784 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
2785 " in pmap %p", va, pmap);
2786 return (TRUE);
2787 }
2788
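/*
 * Editorial note (illustrative, not part of the original source): the PAT
 * index occupies a different bit in a PDE than in a PTE.  In a 2- or 4MB
 * PDE the PAT bit is bit 12 (PG_PDE_PAT), while in a 4KB PTE it is bit 7
 * (PG_PTE_PAT).  The XOR with (PG_PDE_PAT | PG_PTE_PAT) performed during
 * demotion therefore clears the PDE-position bit and sets the PTE-position
 * bit in one step, preserving the selected memory attribute; promotion in
 * pmap_promote_pde() applies the same XOR in the opposite direction.
 */
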
2789 /*
2790 * Removes a 2- or 4MB page mapping from the kernel pmap.
2791 */
2792 static void
2793 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2794 {
2795 pd_entry_t newpde;
2796 vm_paddr_t mptepa;
2797 vm_page_t mpte;
2798
2799 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2800 mpte = pmap_remove_pt_page(pmap, va);
2801 if (mpte == NULL)
2802 panic("pmap_remove_kernel_pde: Missing pt page.");
2803
2804 mptepa = VM_PAGE_TO_PHYS(mpte);
2805 newpde = mptepa | PG_M | PG_A | PG_RW | PG_V;
2806
2807 /*
2808 * Initialize the page table page.
2809 */
2810 pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]);
2811
2812 /*
2813 * Remove the mapping.
2814 */
2815 if (workaround_erratum383)
2816 pmap_update_pde(pmap, va, pde, newpde);
2817 else
2818 pmap_kenter_pde(va, newpde);
2819
2820 /*
2821 * Invalidate the recursive mapping of the page table page.
2822 */
2823 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2824 }
2825
2826 /*
2827 * pmap_remove_pde: unmap a 2- or 4MB superpage mapping from a process
2828 */
2829 static void
2830 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2831 struct spglist *free)
2832 {
2833 struct md_page *pvh;
2834 pd_entry_t oldpde;
2835 vm_offset_t eva, va;
2836 vm_page_t m, mpte;
2837
2838 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2839 KASSERT((sva & PDRMASK) == 0,
2840 ("pmap_remove_pde: sva is not 4mpage aligned"));
2841 oldpde = pte_load_clear(pdq);
2842 if (oldpde & PG_W)
2843 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2844
2845 /*
2846 * Machines that don't support invlpg also don't support
2847 * PG_G.
2848 */
2849 if ((oldpde & PG_G) != 0)
2850 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
2851
2852 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2853 if (oldpde & PG_MANAGED) {
2854 pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
2855 pmap_pvh_free(pvh, pmap, sva);
2856 eva = sva + NBPDR;
2857 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2858 va < eva; va += PAGE_SIZE, m++) {
2859 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2860 vm_page_dirty(m);
2861 if (oldpde & PG_A)
2862 vm_page_aflag_set(m, PGA_REFERENCED);
2863 if (TAILQ_EMPTY(&m->md.pv_list) &&
2864 TAILQ_EMPTY(&pvh->pv_list))
2865 vm_page_aflag_clear(m, PGA_WRITEABLE);
2866 }
2867 }
2868 if (pmap == kernel_pmap) {
2869 pmap_remove_kernel_pde(pmap, pdq, sva);
2870 } else {
2871 mpte = pmap_remove_pt_page(pmap, sva);
2872 if (mpte != NULL) {
2873 pmap->pm_stats.resident_count--;
2874 KASSERT(mpte->wire_count == NPTEPG,
2875 ("pmap_remove_pde: pte page wire count error"));
2876 mpte->wire_count = 0;
2877 pmap_add_delayed_free_list(mpte, free, FALSE);
2878 atomic_subtract_int(&vm_cnt.v_wire_count, 1);
2879 }
2880 }
2881 }
2882
2883 /*
2884 * pmap_remove_pte: unmap a single 4KB page from a process
2885 */
2886 static int
2887 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
2888 struct spglist *free)
2889 {
2890 pt_entry_t oldpte;
2891 vm_page_t m;
2892
2893 rw_assert(&pvh_global_lock, RA_WLOCKED);
2894 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2895 oldpte = pte_load_clear(ptq);
2896 KASSERT(oldpte != 0,
2897 ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va));
2898 if (oldpte & PG_W)
2899 pmap->pm_stats.wired_count -= 1;
2900 /*
2901 * Machines that don't support invlpg also don't support
2902 * PG_G.
2903 */
2904 if (oldpte & PG_G)
2905 pmap_invalidate_page(kernel_pmap, va);
2906 pmap->pm_stats.resident_count -= 1;
2907 if (oldpte & PG_MANAGED) {
2908 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2909 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2910 vm_page_dirty(m);
2911 if (oldpte & PG_A)
2912 vm_page_aflag_set(m, PGA_REFERENCED);
2913 pmap_remove_entry(pmap, m, va);
2914 }
2915 return (pmap_unuse_pt(pmap, va, free));
2916 }
2917
2918 /*
2919 * Remove a single page from a process address space
2920 */
2921 static void
2922 pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free)
2923 {
2924 pt_entry_t *pte;
2925
2926 rw_assert(&pvh_global_lock, RA_WLOCKED);
2927 KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
2928 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2929 if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
2930 return;
2931 pmap_remove_pte(pmap, pte, va, free);
2932 pmap_invalidate_page(pmap, va);
2933 }
2934
2935 /*
2936 * Remove the given range of addresses from the specified map.
2937 *
2938 * It is assumed that the start and end are properly
2939 * rounded to the page size.
2940 */
2941 void
2942 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2943 {
2944 vm_offset_t pdnxt;
2945 pd_entry_t ptpaddr;
2946 pt_entry_t *pte;
2947 struct spglist free;
2948 int anyvalid;
2949
2950 /*
2951 * Perform an unsynchronized read. This is, however, safe.
2952 */
2953 if (pmap->pm_stats.resident_count == 0)
2954 return;
2955
2956 anyvalid = 0;
2957 SLIST_INIT(&free);
2958
2959 rw_wlock(&pvh_global_lock);
2960 sched_pin();
2961 PMAP_LOCK(pmap);
2962
2963 /*
2964 * Special handling for removing a single page: this is a very
2965 * common operation, and short-circuiting the general loop below
2966 * saves a fair amount of work.
2967 */
2968 if ((sva + PAGE_SIZE == eva) &&
2969 ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
2970 pmap_remove_page(pmap, sva, &free);
2971 goto out;
2972 }
2973
2974 for (; sva < eva; sva = pdnxt) {
2975 u_int pdirindex;
2976
2977 /*
2978 * Calculate index for next page table.
2979 */
2980 pdnxt = (sva + NBPDR) & ~PDRMASK;
2981 if (pdnxt < sva)
2982 pdnxt = eva;
2983 if (pmap->pm_stats.resident_count == 0)
2984 break;
2985
2986 pdirindex = sva >> PDRSHIFT;
2987 ptpaddr = pmap->pm_pdir[pdirindex];
2988
2989 /*
2990 * Weed out invalid mappings. Note: we assume that the page
2991 * directory table is always allocated, and in kernel virtual.
2992 */
2993 if (ptpaddr == 0)
2994 continue;
2995
2996 /*
2997 * Check for large page.
2998 */
2999 if ((ptpaddr & PG_PS) != 0) {
3000 /*
3001 * Are we removing the entire large page? If not,
3002 * demote the mapping and fall through.
3003 */
3004 if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3005 /*
3006 * The TLB entry for a PG_G mapping is
3007 * invalidated by pmap_remove_pde().
3008 */
3009 if ((ptpaddr & PG_G) == 0)
3010 anyvalid = 1;
3011 pmap_remove_pde(pmap,
3012 &pmap->pm_pdir[pdirindex], sva, &free);
3013 continue;
3014 } else if (!pmap_demote_pde(pmap,
3015 &pmap->pm_pdir[pdirindex], sva)) {
3016 /* The large page mapping was destroyed. */
3017 continue;
3018 }
3019 }
3020
3021 /*
3022 * Limit our scan to either the end of the va represented
3023 * by the current page table page, or to the end of the
3024 * range being removed.
3025 */
3026 if (pdnxt > eva)
3027 pdnxt = eva;
3028
3029 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3030 sva += PAGE_SIZE) {
3031 if (*pte == 0)
3032 continue;
3033
3034 /*
3035 * The TLB entry for a PG_G mapping is invalidated
3036 * by pmap_remove_pte().
3037 */
3038 if ((*pte & PG_G) == 0)
3039 anyvalid = 1;
3040 if (pmap_remove_pte(pmap, pte, sva, &free))
3041 break;
3042 }
3043 }
3044 out:
3045 sched_unpin();
3046 if (anyvalid)
3047 pmap_invalidate_all(pmap);
3048 rw_wunlock(&pvh_global_lock);
3049 PMAP_UNLOCK(pmap);
3050 pmap_free_zero_pages(&free);
3051 }
3052
3053 /*
3054 * Routine: pmap_remove_all
3055 * Function:
3056 * Removes this physical page from
3057 * all physical maps in which it resides.
3058 * Reflects back modify bits to the pager.
3059 *
3060 * Notes:
3061 * Original versions of this routine were very
3062 * inefficient because they iteratively called
3063 * pmap_remove (slow...)
3064 */
3065
3066 void
3067 pmap_remove_all(vm_page_t m)
3068 {
3069 struct md_page *pvh;
3070 pv_entry_t pv;
3071 pmap_t pmap;
3072 pt_entry_t *pte, tpte;
3073 pd_entry_t *pde;
3074 vm_offset_t va;
3075 struct spglist free;
3076
3077 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3078 ("pmap_remove_all: page %p is not managed", m));
3079 SLIST_INIT(&free);
3080 rw_wlock(&pvh_global_lock);
3081 sched_pin();
3082 if ((m->flags & PG_FICTITIOUS) != 0)
3083 goto small_mappings;
3084 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3085 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3086 va = pv->pv_va;
3087 pmap = PV_PMAP(pv);
3088 PMAP_LOCK(pmap);
3089 pde = pmap_pde(pmap, va);
3090 (void)pmap_demote_pde(pmap, pde, va);
3091 PMAP_UNLOCK(pmap);
3092 }
3093 small_mappings:
3094 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3095 pmap = PV_PMAP(pv);
3096 PMAP_LOCK(pmap);
3097 pmap->pm_stats.resident_count--;
3098 pde = pmap_pde(pmap, pv->pv_va);
3099 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
3100 " a 4mpage in page %p's pv list", m));
3101 pte = pmap_pte_quick(pmap, pv->pv_va);
3102 tpte = pte_load_clear(pte);
3103 KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte",
3104 pmap, pv->pv_va));
3105 if (tpte & PG_W)
3106 pmap->pm_stats.wired_count--;
3107 if (tpte & PG_A)
3108 vm_page_aflag_set(m, PGA_REFERENCED);
3109
3110 /*
3111 * Update the vm_page_t clean and reference bits.
3112 */
3113 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3114 vm_page_dirty(m);
3115 pmap_unuse_pt(pmap, pv->pv_va, &free);
3116 pmap_invalidate_page(pmap, pv->pv_va);
3117 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3118 free_pv_entry(pmap, pv);
3119 PMAP_UNLOCK(pmap);
3120 }
3121 vm_page_aflag_clear(m, PGA_WRITEABLE);
3122 sched_unpin();
3123 rw_wunlock(&pvh_global_lock);
3124 pmap_free_zero_pages(&free);
3125 }
3126
3127 /*
3128 * pmap_protect_pde: change the protection of a 4MB superpage in a process
3129 */
3130 static boolean_t
3131 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
3132 {
3133 pd_entry_t newpde, oldpde;
3134 vm_offset_t eva, va;
3135 vm_page_t m;
3136 boolean_t anychanged;
3137
3138 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3139 KASSERT((sva & PDRMASK) == 0,
3140 ("pmap_protect_pde: sva is not 4mpage aligned"));
3141 anychanged = FALSE;
3142 retry:
3143 oldpde = newpde = *pde;
3144 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
3145 (PG_MANAGED | PG_M | PG_RW)) {
3146 eva = sva + NBPDR;
3147 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3148 va < eva; va += PAGE_SIZE, m++)
3149 vm_page_dirty(m);
3150 }
3151 if ((prot & VM_PROT_WRITE) == 0)
3152 newpde &= ~(PG_RW | PG_M);
3153 #if defined(PAE) || defined(PAE_TABLES)
3154 if ((prot & VM_PROT_EXECUTE) == 0)
3155 newpde |= pg_nx;
3156 #endif
3157 if (newpde != oldpde) {
3158 /*
3159 * As an optimization to future operations on this PDE, clear
3160 * PG_PROMOTED. The impending invalidation will remove any
3161 * lingering 4KB page mappings from the TLB.
3162 */
3163 if (!pde_cmpset(pde, oldpde, newpde & ~PG_PROMOTED))
3164 goto retry;
3165 if ((oldpde & PG_G) != 0)
3166 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
3167 else
3168 anychanged = TRUE;
3169 }
3170 return (anychanged);
3171 }
3172
3173 /*
3174 * Set the physical protection on the
3175 * specified range of this map as requested.
3176 */
3177 void
3178 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3179 {
3180 vm_offset_t pdnxt;
3181 pd_entry_t ptpaddr;
3182 pt_entry_t *pte;
3183 boolean_t anychanged, pv_lists_locked;
3184
3185 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
3186 if (prot == VM_PROT_NONE) {
3187 pmap_remove(pmap, sva, eva);
3188 return;
3189 }
3190
3191 #if defined(PAE) || defined(PAE_TABLES)
3192 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3193 (VM_PROT_WRITE|VM_PROT_EXECUTE))
3194 return;
3195 #else
3196 if (prot & VM_PROT_WRITE)
3197 return;
3198 #endif
3199
3200 if (pmap_is_current(pmap))
3201 pv_lists_locked = FALSE;
3202 else {
3203 pv_lists_locked = TRUE;
3204 resume:
3205 rw_wlock(&pvh_global_lock);
3206 sched_pin();
3207 }
3208 anychanged = FALSE;
3209
3210 PMAP_LOCK(pmap);
3211 for (; sva < eva; sva = pdnxt) {
3212 pt_entry_t obits, pbits;
3213 u_int pdirindex;
3214
3215 pdnxt = (sva + NBPDR) & ~PDRMASK;
3216 if (pdnxt < sva)
3217 pdnxt = eva;
3218
3219 pdirindex = sva >> PDRSHIFT;
3220 ptpaddr = pmap->pm_pdir[pdirindex];
3221
3222 /*
3223 * Weed out invalid mappings. Note: we assume that the page
3224 * directory table is always allocated, and in kernel virtual.
3225 */
3226 if (ptpaddr == 0)
3227 continue;
3228
3229 /*
3230 * Check for large page.
3231 */
3232 if ((ptpaddr & PG_PS) != 0) {
3233 /*
3234 * Are we protecting the entire large page? If not,
3235 * demote the mapping and fall through.
3236 */
3237 if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3238 /*
3239 * The TLB entry for a PG_G mapping is
3240 * invalidated by pmap_protect_pde().
3241 */
3242 if (pmap_protect_pde(pmap,
3243 &pmap->pm_pdir[pdirindex], sva, prot))
3244 anychanged = TRUE;
3245 continue;
3246 } else {
3247 if (!pv_lists_locked) {
3248 pv_lists_locked = TRUE;
3249 if (!rw_try_wlock(&pvh_global_lock)) {
3250 if (anychanged)
3251 pmap_invalidate_all(
3252 pmap);
3253 PMAP_UNLOCK(pmap);
3254 goto resume;
3255 }
3256 sched_pin();
3257 }
3258 if (!pmap_demote_pde(pmap,
3259 &pmap->pm_pdir[pdirindex], sva)) {
3260 /*
3261 * The large page mapping was
3262 * destroyed.
3263 */
3264 continue;
3265 }
3266 }
3267 }
3268
3269 if (pdnxt > eva)
3270 pdnxt = eva;
3271
3272 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3273 sva += PAGE_SIZE) {
3274 vm_page_t m;
3275
3276 retry:
3277 /*
3278 * Regardless of whether a pte is 32 or 64 bits in
3279 * size, PG_RW, PG_A, and PG_M are among the least
3280 * significant 32 bits.
3281 */
3282 obits = pbits = *pte;
3283 if ((pbits & PG_V) == 0)
3284 continue;
3285
3286 if ((prot & VM_PROT_WRITE) == 0) {
3287 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
3288 (PG_MANAGED | PG_M | PG_RW)) {
3289 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3290 vm_page_dirty(m);
3291 }
3292 pbits &= ~(PG_RW | PG_M);
3293 }
3294 #if defined(PAE) || defined(PAE_TABLES)
3295 if ((prot & VM_PROT_EXECUTE) == 0)
3296 pbits |= pg_nx;
3297 #endif
3298
3299 if (pbits != obits) {
3300 #if defined(PAE) || defined(PAE_TABLES)
3301 if (!atomic_cmpset_64(pte, obits, pbits))
3302 goto retry;
3303 #else
3304 if (!atomic_cmpset_int((u_int *)pte, obits,
3305 pbits))
3306 goto retry;
3307 #endif
3308 if (obits & PG_G)
3309 pmap_invalidate_page(pmap, sva);
3310 else
3311 anychanged = TRUE;
3312 }
3313 }
3314 }
3315 if (anychanged)
3316 pmap_invalidate_all(pmap);
3317 if (pv_lists_locked) {
3318 sched_unpin();
3319 rw_wunlock(&pvh_global_lock);
3320 }
3321 PMAP_UNLOCK(pmap);
3322 }
3323
3324 /*
3325 * Tries to promote the 512 or 1024 contiguous 4KB page mappings that are
3326 * within a single page table page (PTP) to a single 2- or 4MB page mapping.
3327 * For promotion to occur, two conditions must be met: (1) the 4KB page
3328 * mappings must map aligned, contiguous physical memory and (2) the 4KB page
3329 * mappings must have identical characteristics.
3330 *
3331 * Managed (PG_MANAGED) mappings within the kernel address space are not
3332 * promoted. The reason is that kernel PDEs are replicated in each pmap but
3333 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
3334 * pmap.
3335 */
3336 static void
3337 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3338 {
3339 pd_entry_t newpde;
3340 pt_entry_t *firstpte, oldpte, pa, *pte;
3341 vm_offset_t oldpteva;
3342 vm_page_t mpte;
3343
3344 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3345
3346 /*
3347 * Examine the first PTE in the specified PTP. Abort if this PTE is
3348 * either invalid, unused, or does not map the first 4KB physical page
3349 * within a 2- or 4MB page.
3350 */
3351 firstpte = pmap_pte_quick(pmap, trunc_4mpage(va));
3352 setpde:
3353 newpde = *firstpte;
3354 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
3355 pmap_pde_p_failures++;
3356 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3357 " in pmap %p", va, pmap);
3358 return;
3359 }
3360 if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
3361 pmap_pde_p_failures++;
3362 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3363 " in pmap %p", va, pmap);
3364 return;
3365 }
3366 if ((newpde & (PG_M | PG_RW)) == PG_RW) {
3367 /*
3368 * When PG_M is already clear, PG_RW can be cleared without
3369 * a TLB invalidation.
3370 */
3371 if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
3372 ~PG_RW))
3373 goto setpde;
3374 newpde &= ~PG_RW;
3375 }
3376
3377 /*
3378 * Examine each of the other PTEs in the specified PTP. Abort if this
3379 * PTE maps an unexpected 4KB physical page or does not have identical
3380 * characteristics to the first PTE.
3381 */
3382 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
3383 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
3384 setpte:
3385 oldpte = *pte;
3386 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
3387 pmap_pde_p_failures++;
3388 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3389 " in pmap %p", va, pmap);
3390 return;
3391 }
3392 if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
3393 /*
3394 * When PG_M is already clear, PG_RW can be cleared
3395 * without a TLB invalidation.
3396 */
3397 if (!atomic_cmpset_int((u_int *)pte, oldpte,
3398 oldpte & ~PG_RW))
3399 goto setpte;
3400 oldpte &= ~PG_RW;
3401 oldpteva = (oldpte & PG_FRAME & PDRMASK) |
3402 (va & ~PDRMASK);
3403 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
3404 " in pmap %p", oldpteva, pmap);
3405 }
3406 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
3407 pmap_pde_p_failures++;
3408 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3409 " in pmap %p", va, pmap);
3410 return;
3411 }
3412 pa -= PAGE_SIZE;
3413 }
3414
3415 /*
3416 * Save the page table page in its current state until the PDE
3417 * mapping the superpage is demoted by pmap_demote_pde() or
3418 * destroyed by pmap_remove_pde().
3419 */
3420 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
3421 KASSERT(mpte >= vm_page_array &&
3422 mpte < &vm_page_array[vm_page_array_size],
3423 ("pmap_promote_pde: page table page is out of range"));
3424 KASSERT(mpte->pindex == va >> PDRSHIFT,
3425 ("pmap_promote_pde: page table page's pindex is wrong"));
3426 if (pmap_insert_pt_page(pmap, mpte)) {
3427 pmap_pde_p_failures++;
3428 CTR2(KTR_PMAP,
3429 "pmap_promote_pde: failure for va %#x in pmap %p", va,
3430 pmap);
3431 return;
3432 }
3433
3434 /*
3435 * Promote the pv entries.
3436 */
3437 if ((newpde & PG_MANAGED) != 0)
3438 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
3439
3440 /*
3441 * Propagate the PAT index to its proper position.
3442 */
3443 if ((newpde & PG_PTE_PAT) != 0)
3444 newpde ^= PG_PDE_PAT | PG_PTE_PAT;
3445
3446 /*
3447 * Map the superpage.
3448 */
3449 if (workaround_erratum383)
3450 pmap_update_pde(pmap, va, pde, PG_PS | newpde);
3451 else if (pmap == kernel_pmap)
3452 pmap_kenter_pde(va, PG_PROMOTED | PG_PS | newpde);
3453 else
3454 pde_store(pde, PG_PROMOTED | PG_PS | newpde);
3455
3456 pmap_pde_promotions++;
3457 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
3458 " in pmap %p", va, pmap);
3459 }
3460
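/*
 * Editorial example (not part of the original source): for promotion of a
 * non-PAE page table page to succeed, all 1024 PTEs must describe one
 * physically contiguous, 4MB-aligned run with identical attribute bits.
 * For instance, if the first PTE maps PA 0x08000000 with PG_A | PG_V set,
 * then the PTE at index i must map PA 0x08000000 + i * PAGE_SIZE with the
 * same PG_PTE_PROMOTE bits; any mismatch aborts the promotion and bumps
 * pmap_pde_p_failures, while a clean (PG_M-less) but writable PTE is
 * instead downgraded to read-only so that it can match.
 */
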
3461 /*
3462 * Insert the given physical page (m) at
3463 * the specified virtual address (va) in the
3464 * target physical map with the protection requested.
3465 *
3466 * If specified, the page will be wired down, meaning
3467 * that the related pte can not be reclaimed.
3468 *
3469 * NB: This is the only routine which MAY NOT lazy-evaluate
3470 * or lose information. That is, this routine must actually
3471 * insert this page into the given map NOW.
3472 */
3473 int
3474 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3475 u_int flags, int8_t psind)
3476 {
3477 pd_entry_t *pde;
3478 pt_entry_t *pte;
3479 pt_entry_t newpte, origpte;
3480 pv_entry_t pv;
3481 vm_paddr_t opa, pa;
3482 vm_page_t mpte, om;
3483 boolean_t invlva, wired;
3484
3485 va = trunc_page(va);
3486 mpte = NULL;
3487 wired = (flags & PMAP_ENTER_WIRED) != 0;
3488
3489 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
3490 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
3491 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)",
3492 va));
3493 if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
3494 VM_OBJECT_ASSERT_LOCKED(m->object);
3495
3496 rw_wlock(&pvh_global_lock);
3497 PMAP_LOCK(pmap);
3498 sched_pin();
3499
3500 pde = pmap_pde(pmap, va);
3501 if (va < VM_MAXUSER_ADDRESS) {
3502 /*
3503 * va is for UVA.
3504 * In the case that a page table page is not resident,
3505 * we are creating it here. pmap_allocpte() handles
3506 * demotion.
3507 */
3508 mpte = pmap_allocpte(pmap, va, flags);
3509 if (mpte == NULL) {
3510 KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0,
3511 ("pmap_allocpte failed with sleep allowed"));
3512 sched_unpin();
3513 rw_wunlock(&pvh_global_lock);
3514 PMAP_UNLOCK(pmap);
3515 return (KERN_RESOURCE_SHORTAGE);
3516 }
3517 } else {
3518 /*
3519 * va is for KVA, so pmap_demote_pde() will never fail
3520 * to install a page table page. PG_V is also
3521 * asserted by pmap_demote_pde().
3522 */
3523 KASSERT(pde != NULL && (*pde & PG_V) != 0,
3524 ("KVA %#x invalid pde pdir %#jx", va,
3525 (uintmax_t)pmap->pm_pdir[PTDPTDI]));
3526 if ((*pde & PG_PS) != 0)
3527 pmap_demote_pde(pmap, pde, va);
3528 }
3529 pte = pmap_pte_quick(pmap, va);
3530
3531 /*
3532 * Page Directory table entry is not valid, which should not
3533 * happen. We should have either allocated the page table
3534 * page or demoted the existing mapping above.
3535 */
3536 if (pte == NULL) {
3537 panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
3538 (uintmax_t)pmap->pm_pdir[PTDPTDI], va);
3539 }
3540
3541 pa = VM_PAGE_TO_PHYS(m);
3542 om = NULL;
3543 origpte = *pte;
3544 opa = origpte & PG_FRAME;
3545
3546 /*
3547 * Mapping has not changed, must be protection or wiring change.
3548 */
3549 if (origpte && (opa == pa)) {
3550 /*
3551 * Wiring change, just update stats. We don't worry about
3552 * wiring PT pages as they remain resident as long as there
3553 * are valid mappings in them. Hence, if a user page is wired,
3554 * the PT page will be also.
3555 */
3556 if (wired && ((origpte & PG_W) == 0))
3557 pmap->pm_stats.wired_count++;
3558 else if (!wired && (origpte & PG_W))
3559 pmap->pm_stats.wired_count--;
3560
3561 /*
3562 * Remove extra pte reference
3563 */
3564 if (mpte)
3565 mpte->wire_count--;
3566
3567 if (origpte & PG_MANAGED) {
3568 om = m;
3569 pa |= PG_MANAGED;
3570 }
3571 goto validate;
3572 }
3573
3574 pv = NULL;
3575
3576 /*
3577 * Mapping has changed, invalidate old range and fall through to
3578 * handle validating new mapping.
3579 */
3580 if (opa) {
3581 if (origpte & PG_W)
3582 pmap->pm_stats.wired_count--;
3583 if (origpte & PG_MANAGED) {
3584 om = PHYS_TO_VM_PAGE(opa);
3585 pv = pmap_pvh_remove(&om->md, pmap, va);
3586 }
3587 if (mpte != NULL) {
3588 mpte->wire_count--;
3589 KASSERT(mpte->wire_count > 0,
3590 ("pmap_enter: missing reference to page table page,"
3591 " va: 0x%x", va));
3592 }
3593 } else
3594 pmap->pm_stats.resident_count++;
3595
3596 /*
3597 * Enter on the PV list if part of our managed memory.
3598 */
3599 if ((m->oflags & VPO_UNMANAGED) == 0) {
3600 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
3601 ("pmap_enter: managed mapping within the clean submap"));
3602 if (pv == NULL)
3603 pv = get_pv_entry(pmap, FALSE);
3604 pv->pv_va = va;
3605 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3606 pa |= PG_MANAGED;
3607 } else if (pv != NULL)
3608 free_pv_entry(pmap, pv);
3609
3610 /*
3611 * Increment counters
3612 */
3613 if (wired)
3614 pmap->pm_stats.wired_count++;
3615
3616 validate:
3617 /*
3618 * Now validate mapping with desired protection/wiring.
3619 */
3620 newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
3621 if ((prot & VM_PROT_WRITE) != 0) {
3622 newpte |= PG_RW;
3623 if ((newpte & PG_MANAGED) != 0)
3624 vm_page_aflag_set(m, PGA_WRITEABLE);
3625 }
3626 #if defined(PAE) || defined(PAE_TABLES)
3627 if ((prot & VM_PROT_EXECUTE) == 0)
3628 newpte |= pg_nx;
3629 #endif
3630 if (wired)
3631 newpte |= PG_W;
3632 if (va < VM_MAXUSER_ADDRESS)
3633 newpte |= PG_U;
3634 if (pmap == kernel_pmap)
3635 newpte |= pgeflag;
3636
3637 /*
3638 * if the mapping or permission bits are different, we need
3639 * to update the pte.
3640 */
3641 if ((origpte & ~(PG_M|PG_A)) != newpte) {
3642 newpte |= PG_A;
3643 if ((flags & VM_PROT_WRITE) != 0)
3644 newpte |= PG_M;
3645 if (origpte & PG_V) {
3646 invlva = FALSE;
3647 origpte = pte_load_store(pte, newpte);
3648 if (origpte & PG_A) {
3649 if (origpte & PG_MANAGED)
3650 vm_page_aflag_set(om, PGA_REFERENCED);
3651 if (opa != VM_PAGE_TO_PHYS(m))
3652 invlva = TRUE;
3653 #if defined(PAE) || defined(PAE_TABLES)
3654 if ((origpte & PG_NX) == 0 &&
3655 (newpte & PG_NX) != 0)
3656 invlva = TRUE;
3657 #endif
3658 }
3659 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3660 if ((origpte & PG_MANAGED) != 0)
3661 vm_page_dirty(om);
3662 if ((prot & VM_PROT_WRITE) == 0)
3663 invlva = TRUE;
3664 }
3665 if ((origpte & PG_MANAGED) != 0 &&
3666 TAILQ_EMPTY(&om->md.pv_list) &&
3667 ((om->flags & PG_FICTITIOUS) != 0 ||
3668 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3669 vm_page_aflag_clear(om, PGA_WRITEABLE);
3670 if (invlva)
3671 pmap_invalidate_page(pmap, va);
3672 } else
3673 pte_store(pte, newpte);
3674 }
3675
3676 /*
3677 * If both the page table page and the reservation are fully
3678 * populated, then attempt promotion.
3679 */
3680 if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3681 pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
3682 vm_reserv_level_iffullpop(m) == 0)
3683 pmap_promote_pde(pmap, pde, va);
3684
3685 sched_unpin();
3686 rw_wunlock(&pvh_global_lock);
3687 PMAP_UNLOCK(pmap);
3688 return (KERN_SUCCESS);
3689 }
3690
3691 /*
3692 * Tries to create a 2- or 4MB page mapping. Returns TRUE if successful and
3693 * FALSE otherwise. Fails if (1) a page table page cannot be allocated without
3694 * blocking, (2) a mapping already exists at the specified virtual address, or
3695 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3696 */
3697 static boolean_t
3698 pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3699 {
3700 pd_entry_t *pde, newpde;
3701
3702 rw_assert(&pvh_global_lock, RA_WLOCKED);
3703 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3704 pde = pmap_pde(pmap, va);
3705 if (*pde != 0) {
3706 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3707 " in pmap %p", va, pmap);
3708 return (FALSE);
3709 }
3710 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
3711 PG_PS | PG_V;
3712 if ((m->oflags & VPO_UNMANAGED) == 0) {
3713 newpde |= PG_MANAGED;
3714
3715 /*
3716 * Abort this mapping if its PV entry could not be created.
3717 */
3718 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
3719 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3720 " in pmap %p", va, pmap);
3721 return (FALSE);
3722 }
3723 }
3724 #if defined(PAE) || defined(PAE_TABLES)
3725 if ((prot & VM_PROT_EXECUTE) == 0)
3726 newpde |= pg_nx;
3727 #endif
3728 if (va < VM_MAXUSER_ADDRESS)
3729 newpde |= PG_U;
3730
3731 /*
3732 * Increment counters.
3733 */
3734 pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
3735
3736 /*
3737 * Map the superpage. (This is not a promoted mapping; there will not
3738 * be any lingering 4KB page mappings in the TLB.)
3739 */
3740 pde_store(pde, newpde);
3741
3742 pmap_pde_mappings++;
3743 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3744 " in pmap %p", va, pmap);
3745 return (TRUE);
3746 }
3747
3748 /*
3749 * Maps a sequence of resident pages belonging to the same object.
3750 * The sequence begins with the given page m_start. This page is
3751 * mapped at the given virtual address start. Each subsequent page is
3752 * mapped at a virtual address that is offset from start by the same
3753 * amount as the page is offset from m_start within the object. The
3754 * last page in the sequence is the page with the largest offset from
3755 * m_start that can be mapped at a virtual address less than the given
3756 * virtual address end. Not every virtual page between start and end
3757 * is mapped; only those for which a resident page exists with the
3758 * corresponding offset from m_start are mapped.
3759 */
3760 void
3761 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3762 vm_page_t m_start, vm_prot_t prot)
3763 {
3764 vm_offset_t va;
3765 vm_page_t m, mpte;
3766 vm_pindex_t diff, psize;
3767
3768 VM_OBJECT_ASSERT_LOCKED(m_start->object);
3769
3770 psize = atop(end - start);
3771 mpte = NULL;
3772 m = m_start;
3773 rw_wlock(&pvh_global_lock);
3774 PMAP_LOCK(pmap);
3775 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3776 va = start + ptoa(diff);
3777 if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3778 m->psind == 1 && pg_ps_enabled &&
3779 pmap_enter_pde(pmap, va, m, prot))
3780 m = &m[NBPDR / PAGE_SIZE - 1];
3781 else
3782 mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3783 mpte);
3784 m = TAILQ_NEXT(m, listq);
3785 }
3786 rw_wunlock(&pvh_global_lock);
3787 PMAP_UNLOCK(pmap);
3788 }
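/*
 * Editorial note (illustrative, not part of the original source): the
 * superpage test above only calls pmap_enter_pde() when the candidate
 * virtual address is 2/4MB aligned, the whole large page fits below "end",
 * the backing page begins a run that can be mapped as a superpage
 * (m->psind == 1), and PS mappings are enabled.  For example, on a
 * non-PAE kernel (NBPDR == 4MB, PDRMASK == 0x3fffff):
 *
 *	va  = 0x20400000;	(va & PDRMASK) == 0		-> aligned
 *	end = 0x20c00000;	va + NBPDR == 0x20800000 <= end	-> fits
 *
 * so a single PDE can stand in for 1024 individual PTEs.  When any of
 * the conditions fails, the loop falls back to pmap_enter_quick_locked()
 * for that page.
 */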
3789
3790 /*
3791 * this code makes some *MAJOR* assumptions:
3792 * 1. The pmap is the current pmap and it exists.
3793 * 2. Not wired.
3794 * 3. Read access.
3795 * 4. No page table pages.
3796 * but is *MUCH* faster than pmap_enter...
3797 */
3798
3799 void
3800 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3801 {
3802
3803 rw_wlock(&pvh_global_lock);
3804 PMAP_LOCK(pmap);
3805 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3806 rw_wunlock(&pvh_global_lock);
3807 PMAP_UNLOCK(pmap);
3808 }
3809
3810 static vm_page_t
3811 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3812 vm_prot_t prot, vm_page_t mpte)
3813 {
3814 pt_entry_t *pte;
3815 vm_paddr_t pa;
3816 struct spglist free;
3817
3818 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3819 (m->oflags & VPO_UNMANAGED) != 0,
3820 ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3821 rw_assert(&pvh_global_lock, RA_WLOCKED);
3822 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3823
3824 /*
3825 * In the case that a page table page is not
3826 * resident, we are creating it here.
3827 */
3828 if (va < VM_MAXUSER_ADDRESS) {
3829 u_int ptepindex;
3830 pd_entry_t ptepa;
3831
3832 /*
3833 * Calculate pagetable page index
3834 */
3835 ptepindex = va >> PDRSHIFT;
3836 if (mpte && (mpte->pindex == ptepindex)) {
3837 mpte->wire_count++;
3838 } else {
3839 /*
3840 * Get the page directory entry
3841 */
3842 ptepa = pmap->pm_pdir[ptepindex];
3843
3844 /*
3845 * If the page table page is mapped, we just increment
3846 * the hold count, and activate it.
3847 */
3848 if (ptepa) {
3849 if (ptepa & PG_PS)
3850 return (NULL);
3851 mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
3852 mpte->wire_count++;
3853 } else {
3854 mpte = _pmap_allocpte(pmap, ptepindex,
3855 PMAP_ENTER_NOSLEEP);
3856 if (mpte == NULL)
3857 return (mpte);
3858 }
3859 }
3860 } else {
3861 mpte = NULL;
3862 }
3863
3864 /*
3865 * This call to vtopte makes the assumption that we are
3866 * entering the page into the current pmap. In order to support
3867 * quick entry into any pmap, one would likely use pmap_pte_quick.
3868 * But that isn't as quick as vtopte.
3869 */
3870 pte = vtopte(va);
3871 if (*pte) {
3872 if (mpte != NULL) {
3873 mpte->wire_count--;
3874 mpte = NULL;
3875 }
3876 return (mpte);
3877 }
3878
3879 /*
3880 * Enter on the PV list if part of our managed memory.
3881 */
3882 if ((m->oflags & VPO_UNMANAGED) == 0 &&
3883 !pmap_try_insert_pv_entry(pmap, va, m)) {
3884 if (mpte != NULL) {
3885 SLIST_INIT(&free);
3886 if (pmap_unwire_ptp(pmap, mpte, &free)) {
3887 pmap_invalidate_page(pmap, va);
3888 pmap_free_zero_pages(&free);
3889 }
3890
3891 mpte = NULL;
3892 }
3893 return (mpte);
3894 }
3895
3896 /*
3897 * Increment counters
3898 */
3899 pmap->pm_stats.resident_count++;
3900
3901 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
3902 #if defined(PAE) || defined(PAE_TABLES)
3903 if ((prot & VM_PROT_EXECUTE) == 0)
3904 pa |= pg_nx;
3905 #endif
3906
3907 /*
3908 * Now validate mapping with RO protection
3909 */
3910 if ((m->oflags & VPO_UNMANAGED) != 0)
3911 pte_store(pte, pa | PG_V | PG_U);
3912 else
3913 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3914 return (mpte);
3915 }
3916
3917 /*
3918 * Make a temporary mapping for a physical address. This is only intended
3919 * to be used for panic dumps.
3920 */
3921 void *
3922 pmap_kenter_temporary(vm_paddr_t pa, int i)
3923 {
3924 vm_offset_t va;
3925
3926 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3927 pmap_kenter(va, pa);
3928 invlpg(va);
3929 return ((void *)crashdumpmap);
3930 }
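/*
 * Editorial note (hypothetical usage sketch, not part of the original
 * source): a crash-dump writer might walk physical memory one page at a
 * time through this temporary window, e.g.:
 *
 *	for (i = 0; i < npages; i++) {
 *		va = pmap_kenter_temporary(pa + ptoa(i), 0);
 *		write_dump_block(va, PAGE_SIZE);   // hypothetical helper
 *	}
 *
 * Only the preallocated crashdumpmap region is used, so no allocation is
 * needed at panic time; the "i" argument selects which page of that
 * region receives the mapping.
 */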
3931
3932 /*
3933 * This code maps large physical mmap regions into the
3934 * processor address space. Note that some shortcuts
3935 * are taken, but the code works.
3936 */
3937 void
3938 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3939 vm_pindex_t pindex, vm_size_t size)
3940 {
3941 pd_entry_t *pde;
3942 vm_paddr_t pa, ptepa;
3943 vm_page_t p;
3944 int pat_mode;
3945
3946 VM_OBJECT_ASSERT_WLOCKED(object);
3947 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3948 ("pmap_object_init_pt: non-device object"));
3949 if (pseflag &&
3950 (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
3951 if (!vm_object_populate(object, pindex, pindex + atop(size)))
3952 return;
3953 p = vm_page_lookup(object, pindex);
3954 KASSERT(p->valid == VM_PAGE_BITS_ALL,
3955 ("pmap_object_init_pt: invalid page %p", p));
3956 pat_mode = p->md.pat_mode;
3957
3958 /*
3959 * Abort the mapping if the first page is not physically
3960 * aligned to a 2/4MB page boundary.
3961 */
3962 ptepa = VM_PAGE_TO_PHYS(p);
3963 if (ptepa & (NBPDR - 1))
3964 return;
3965
3966 /*
3967 * Skip the first page. Abort the mapping if the rest of
3968 * the pages are not physically contiguous or have differing
3969 * memory attributes.
3970 */
3971 p = TAILQ_NEXT(p, listq);
3972 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3973 pa += PAGE_SIZE) {
3974 KASSERT(p->valid == VM_PAGE_BITS_ALL,
3975 ("pmap_object_init_pt: invalid page %p", p));
3976 if (pa != VM_PAGE_TO_PHYS(p) ||
3977 pat_mode != p->md.pat_mode)
3978 return;
3979 p = TAILQ_NEXT(p, listq);
3980 }
3981
3982 /*
3983 * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and
3984 * "size" is a multiple of 2/4M, adding the PAT setting to
3985 * "pa" will not affect the termination of this loop.
3986 */
3987 PMAP_LOCK(pmap);
3988 for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
3989 size; pa += NBPDR) {
3990 pde = pmap_pde(pmap, addr);
3991 if (*pde == 0) {
3992 pde_store(pde, pa | PG_PS | PG_M | PG_A |
3993 PG_U | PG_RW | PG_V);
3994 pmap->pm_stats.resident_count += NBPDR /
3995 PAGE_SIZE;
3996 pmap_pde_mappings++;
3997 }
3998 /* Else continue on if the PDE is already valid. */
3999 addr += NBPDR;
4000 }
4001 PMAP_UNLOCK(pmap);
4002 }
4003 }
4004
4005 /*
4006 * Clear the wired attribute from the mappings for the specified range of
4007 * addresses in the given pmap. Every valid mapping within that range
4008 * must have the wired attribute set. In contrast, invalid mappings
4009 * cannot have the wired attribute set, so they are ignored.
4010 *
4011 * The wired attribute of the page table entry is not a hardware feature,
4012 * so there is no need to invalidate any TLB entries.
4013 */
4014 void
4015 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4016 {
4017 vm_offset_t pdnxt;
4018 pd_entry_t *pde;
4019 pt_entry_t *pte;
4020 boolean_t pv_lists_locked;
4021
4022 if (pmap_is_current(pmap))
4023 pv_lists_locked = FALSE;
4024 else {
4025 pv_lists_locked = TRUE;
4026 resume:
4027 rw_wlock(&pvh_global_lock);
4028 sched_pin();
4029 }
4030 PMAP_LOCK(pmap);
4031 for (; sva < eva; sva = pdnxt) {
4032 pdnxt = (sva + NBPDR) & ~PDRMASK;
4033 if (pdnxt < sva)
4034 pdnxt = eva;
4035 pde = pmap_pde(pmap, sva);
4036 if ((*pde & PG_V) == 0)
4037 continue;
4038 if ((*pde & PG_PS) != 0) {
4039 if ((*pde & PG_W) == 0)
4040 panic("pmap_unwire: pde %#jx is missing PG_W",
4041 (uintmax_t)*pde);
4042
4043 /*
4044 * Are we unwiring the entire large page? If not,
4045 * demote the mapping and fall through.
4046 */
4047 if (sva + NBPDR == pdnxt && eva >= pdnxt) {
4048 /*
4049 * Regardless of whether a pde (or pte) is 32
4050 * or 64 bits in size, PG_W is among the least
4051 * significant 32 bits.
4052 */
4053 atomic_clear_int((u_int *)pde, PG_W);
4054 pmap->pm_stats.wired_count -= NBPDR /
4055 PAGE_SIZE;
4056 continue;
4057 } else {
4058 if (!pv_lists_locked) {
4059 pv_lists_locked = TRUE;
4060 if (!rw_try_wlock(&pvh_global_lock)) {
4061 PMAP_UNLOCK(pmap);
4062 /* Repeat sva. */
4063 goto resume;
4064 }
4065 sched_pin();
4066 }
4067 if (!pmap_demote_pde(pmap, pde, sva))
4068 panic("pmap_unwire: demotion failed");
4069 }
4070 }
4071 if (pdnxt > eva)
4072 pdnxt = eva;
4073 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
4074 sva += PAGE_SIZE) {
4075 if ((*pte & PG_V) == 0)
4076 continue;
4077 if ((*pte & PG_W) == 0)
4078 panic("pmap_unwire: pte %#jx is missing PG_W",
4079 (uintmax_t)*pte);
4080
4081 /*
4082 * PG_W must be cleared atomically. Although the pmap
4083 * lock synchronizes access to PG_W, another processor
4084 * could be setting PG_M and/or PG_A concurrently.
4085 *
4086 * PG_W is among the least significant 32 bits.
4087 */
4088 atomic_clear_int((u_int *)pte, PG_W);
4089 pmap->pm_stats.wired_count--;
4090 }
4091 }
4092 if (pv_lists_locked) {
4093 sched_unpin();
4094 rw_wunlock(&pvh_global_lock);
4095 }
4096 PMAP_UNLOCK(pmap);
4097 }
4098
4099
4100 /*
4101 * Copy the range specified by src_addr/len
4102 * from the source map to the range dst_addr/len
4103 * in the destination map.
4104 *
4105 * This routine is only advisory and need not do anything.
4106 */
4107
4108 void
4109 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4110 vm_offset_t src_addr)
4111 {
4112 struct spglist free;
4113 vm_offset_t addr;
4114 vm_offset_t end_addr = src_addr + len;
4115 vm_offset_t pdnxt;
4116
4117 if (dst_addr != src_addr)
4118 return;
4119
4120 if (!pmap_is_current(src_pmap))
4121 return;
4122
4123 rw_wlock(&pvh_global_lock);
4124 if (dst_pmap < src_pmap) {
4125 PMAP_LOCK(dst_pmap);
4126 PMAP_LOCK(src_pmap);
4127 } else {
4128 PMAP_LOCK(src_pmap);
4129 PMAP_LOCK(dst_pmap);
4130 }
4131 sched_pin();
4132 for (addr = src_addr; addr < end_addr; addr = pdnxt) {
4133 pt_entry_t *src_pte, *dst_pte;
4134 vm_page_t dstmpte, srcmpte;
4135 pd_entry_t srcptepaddr;
4136 u_int ptepindex;
4137
4138 KASSERT(addr < UPT_MIN_ADDRESS,
4139 ("pmap_copy: invalid to pmap_copy page tables"));
4140
4141 pdnxt = (addr + NBPDR) & ~PDRMASK;
4142 if (pdnxt < addr)
4143 pdnxt = end_addr;
4144 ptepindex = addr >> PDRSHIFT;
4145
4146 srcptepaddr = src_pmap->pm_pdir[ptepindex];
4147 if (srcptepaddr == 0)
4148 continue;
4149
4150 if (srcptepaddr & PG_PS) {
4151 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
4152 continue;
4153 if (dst_pmap->pm_pdir[ptepindex] == 0 &&
4154 ((srcptepaddr & PG_MANAGED) == 0 ||
4155 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
4156 PG_PS_FRAME))) {
4157 dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
4158 ~PG_W;
4159 dst_pmap->pm_stats.resident_count +=
4160 NBPDR / PAGE_SIZE;
4161 pmap_pde_mappings++;
4162 }
4163 continue;
4164 }
4165
4166 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
4167 KASSERT(srcmpte->wire_count > 0,
4168 ("pmap_copy: source page table page is unused"));
4169
4170 if (pdnxt > end_addr)
4171 pdnxt = end_addr;
4172
4173 src_pte = vtopte(addr);
4174 while (addr < pdnxt) {
4175 pt_entry_t ptetemp;
4176 ptetemp = *src_pte;
4177 /*
4178 * We only virtually copy managed pages.
4179 */
4180 if ((ptetemp & PG_MANAGED) != 0) {
4181 dstmpte = pmap_allocpte(dst_pmap, addr,
4182 PMAP_ENTER_NOSLEEP);
4183 if (dstmpte == NULL)
4184 goto out;
4185 dst_pte = pmap_pte_quick(dst_pmap, addr);
4186 if (*dst_pte == 0 &&
4187 pmap_try_insert_pv_entry(dst_pmap, addr,
4188 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
4189 /*
4190 * Clear the wired, modified, and
4191 * accessed (referenced) bits
4192 * during the copy.
4193 */
4194 *dst_pte = ptetemp & ~(PG_W | PG_M |
4195 PG_A);
4196 dst_pmap->pm_stats.resident_count++;
4197 } else {
4198 SLIST_INIT(&free);
4199 if (pmap_unwire_ptp(dst_pmap, dstmpte,
4200 &free)) {
4201 pmap_invalidate_page(dst_pmap,
4202 addr);
4203 pmap_free_zero_pages(&free);
4204 }
4205 goto out;
4206 }
4207 if (dstmpte->wire_count >= srcmpte->wire_count)
4208 break;
4209 }
4210 addr += PAGE_SIZE;
4211 src_pte++;
4212 }
4213 }
4214 out:
4215 sched_unpin();
4216 rw_wunlock(&pvh_global_lock);
4217 PMAP_UNLOCK(src_pmap);
4218 PMAP_UNLOCK(dst_pmap);
4219 }
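/*
 * Editorial note (not part of the original source): pmap_copy() bails out
 * early unless dst_addr == src_addr and the source pmap is the current
 * pmap.  The inner loop reads source PTEs through vtopte(addr), which
 * resolves addresses in the *current* address space via the recursive
 * page-table mapping, so both restrictions are needed for the walk to see
 * the right page tables.  Since the routine is documented as advisory,
 * simply returning is always safe; the pages are faulted in on demand.
 */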
4220
4221 static __inline void
4222 pagezero(void *page)
4223 {
4224 #if defined(I686_CPU)
4225 if (cpu_class == CPUCLASS_686) {
4226 if (cpu_feature & CPUID_SSE2)
4227 sse2_pagezero(page);
4228 else
4229 i686_pagezero(page);
4230 } else
4231 #endif
4232 bzero(page, PAGE_SIZE);
4233 }
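/*
 * Editorial note (not part of the original source): pagezero() picks the
 * cheapest zeroing routine available at run time: SSE2 stores when the
 * CPU advertises CPUID_SSE2, the i686 zeroing loop otherwise on 686-class
 * CPUs, and plain bzero() as the portable fallback.  All three produce
 * the same result, a PAGE_SIZE run of zero bytes.
 */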
4234
4235 /*
4236 * pmap_zero_page zeros the specified hardware page by mapping
4237 * the page into KVM and using bzero to clear its contents.
4238 */
4239 void
4240 pmap_zero_page(vm_page_t m)
4241 {
4242 pt_entry_t *cmap_pte2;
4243 struct pcpu *pc;
4244
4245 sched_pin();
4246 pc = get_pcpu();
4247 cmap_pte2 = pc->pc_cmap_pte2;
4248 mtx_lock(&pc->pc_cmap_lock);
4249 if (*cmap_pte2)
4250 panic("pmap_zero_page: CMAP2 busy");
4251 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4252 pmap_cache_bits(m->md.pat_mode, 0);
4253 invlcaddr(pc->pc_cmap_addr2);
4254 pagezero(pc->pc_cmap_addr2);
4255 *cmap_pte2 = 0;
4256
4257 /*
4258 * Unpin the thread before releasing the lock. Otherwise the thread
4259 * could be rescheduled while still bound to the current CPU, only
4260 * to unpin itself immediately upon resuming execution.
4261 */
4262 sched_unpin();
4263 mtx_unlock(&pc->pc_cmap_lock);
4264 }
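/*
 * Editorial note (not part of the original source): pmap_zero_page(),
 * pmap_zero_page_area(), pmap_copy_page(), pmap_copy_pages() and
 * pmap_flush_page() all follow the same per-CPU temporary-mapping
 * pattern:
 *
 *	sched_pin();			// stay on this CPU
 *	mtx_lock(&pc->pc_cmap_lock);	// claim this CPU's CMAP slot
 *	*cmap_pte = PG_V | ... | VM_PAGE_TO_PHYS(m);
 *	invlcaddr(cmap_addr);		// flush any stale TLB entry
 *	... access the page through cmap_addr ...
 *	*cmap_pte = 0;
 *	sched_unpin();
 *	mtx_unlock(&pc->pc_cmap_lock);
 *
 * Pinning before the access guarantees that the PTE, the virtual window
 * and the TLB entry all belong to the same CPU for the whole operation.
 */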
4265
4266 /*
4267 * pmap_zero_page_area zeros the specified hardware page by mapping
4268 * the page into KVM and using bzero to clear its contents.
4269 *
4270 * off and size may not cover an area beyond a single hardware page.
4271 */
4272 void
4273 pmap_zero_page_area(vm_page_t m, int off, int size)
4274 {
4275 pt_entry_t *cmap_pte2;
4276 struct pcpu *pc;
4277
4278 sched_pin();
4279 pc = get_pcpu();
4280 cmap_pte2 = pc->pc_cmap_pte2;
4281 mtx_lock(&pc->pc_cmap_lock);
4282 if (*cmap_pte2)
4283 panic("pmap_zero_page_area: CMAP2 busy");
4284 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4285 pmap_cache_bits(m->md.pat_mode, 0);
4286 invlcaddr(pc->pc_cmap_addr2);
4287 if (off == 0 && size == PAGE_SIZE)
4288 pagezero(pc->pc_cmap_addr2);
4289 else
4290 bzero(pc->pc_cmap_addr2 + off, size);
4291 *cmap_pte2 = 0;
4292 sched_unpin();
4293 mtx_unlock(&pc->pc_cmap_lock);
4294 }
4295
4296 /*
4297 * pmap_zero_page_idle zeros the specified hardware page by mapping
4298 * the page into KVM and using bzero to clear its contents. This
4299 * is intended to be called from the vm_pagezero process only and
4300 * outside of Giant.
4301 */
4302 void
4303 pmap_zero_page_idle(vm_page_t m)
4304 {
4305
4306 if (*CMAP3)
4307 panic("pmap_zero_page_idle: CMAP3 busy");
4308 sched_pin();
4309 *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4310 pmap_cache_bits(m->md.pat_mode, 0);
4311 invlcaddr(CADDR3);
4312 pagezero(CADDR3);
4313 *CMAP3 = 0;
4314 sched_unpin();
4315 }
4316
4317 /*
4318 * pmap_copy_page copies the specified (machine independent)
4319 * page by mapping the page into virtual memory and using
4320 * bcopy to copy the page, one machine dependent page at a
4321 * time.
4322 */
4323 void
4324 pmap_copy_page(vm_page_t src, vm_page_t dst)
4325 {
4326 pt_entry_t *cmap_pte1, *cmap_pte2;
4327 struct pcpu *pc;
4328
4329 sched_pin();
4330 pc = get_pcpu();
4331 cmap_pte1 = pc->pc_cmap_pte1;
4332 cmap_pte2 = pc->pc_cmap_pte2;
4333 mtx_lock(&pc->pc_cmap_lock);
4334 if (*cmap_pte1)
4335 panic("pmap_copy_page: CMAP1 busy");
4336 if (*cmap_pte2)
4337 panic("pmap_copy_page: CMAP2 busy");
4338 *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A |
4339 pmap_cache_bits(src->md.pat_mode, 0);
4340 invlcaddr(pc->pc_cmap_addr1);
4341 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M |
4342 pmap_cache_bits(dst->md.pat_mode, 0);
4343 invlcaddr(pc->pc_cmap_addr2);
4344 bcopy(pc->pc_cmap_addr1, pc->pc_cmap_addr2, PAGE_SIZE);
4345 *cmap_pte1 = 0;
4346 *cmap_pte2 = 0;
4347 sched_unpin();
4348 mtx_unlock(&pc->pc_cmap_lock);
4349 }
4350
4351 int unmapped_buf_allowed = 1;
4352
4353 void
4354 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
4355 vm_offset_t b_offset, int xfersize)
4356 {
4357 vm_page_t a_pg, b_pg;
4358 char *a_cp, *b_cp;
4359 vm_offset_t a_pg_offset, b_pg_offset;
4360 pt_entry_t *cmap_pte1, *cmap_pte2;
4361 struct pcpu *pc;
4362 int cnt;
4363
4364 sched_pin();
4365 pc = get_pcpu();
4366 cmap_pte1 = pc->pc_cmap_pte1;
4367 cmap_pte2 = pc->pc_cmap_pte2;
4368 mtx_lock(&pc->pc_cmap_lock);
4369 if (*cmap_pte1 != 0)
4370 panic("pmap_copy_pages: CMAP1 busy");
4371 if (*cmap_pte2 != 0)
4372 panic("pmap_copy_pages: CMAP2 busy");
4373 while (xfersize > 0) {
4374 a_pg = ma[a_offset >> PAGE_SHIFT];
4375 a_pg_offset = a_offset & PAGE_MASK;
4376 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
4377 b_pg = mb[b_offset >> PAGE_SHIFT];
4378 b_pg_offset = b_offset & PAGE_MASK;
4379 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
4380 *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A |
4381 pmap_cache_bits(a_pg->md.pat_mode, 0);
4382 invlcaddr(pc->pc_cmap_addr1);
4383 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A |
4384 PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0);
4385 invlcaddr(pc->pc_cmap_addr2);
4386 a_cp = pc->pc_cmap_addr1 + a_pg_offset;
4387 b_cp = pc->pc_cmap_addr2 + b_pg_offset;
4388 bcopy(a_cp, b_cp, cnt);
4389 a_offset += cnt;
4390 b_offset += cnt;
4391 xfersize -= cnt;
4392 }
4393 *cmap_pte1 = 0;
4394 *cmap_pte2 = 0;
4395 sched_unpin();
4396 mtx_unlock(&pc->pc_cmap_lock);
4397 }
4398
4399 /*
4400 * Returns true if the pmap's pv is one of the first
4401 * 16 pvs linked to from this page. This count may
4402 * be changed upwards or downwards in the future; it
4403 * is only necessary that true be returned for a small
4404 * subset of pmaps for proper page aging.
4405 */
4406 boolean_t
4407 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4408 {
4409 struct md_page *pvh;
4410 pv_entry_t pv;
4411 int loops = 0;
4412 boolean_t rv;
4413
4414 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4415 ("pmap_page_exists_quick: page %p is not managed", m));
4416 rv = FALSE;
4417 rw_wlock(&pvh_global_lock);
4418 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4419 if (PV_PMAP(pv) == pmap) {
4420 rv = TRUE;
4421 break;
4422 }
4423 loops++;
4424 if (loops >= 16)
4425 break;
4426 }
4427 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4428 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4429 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4430 if (PV_PMAP(pv) == pmap) {
4431 rv = TRUE;
4432 break;
4433 }
4434 loops++;
4435 if (loops >= 16)
4436 break;
4437 }
4438 }
4439 rw_wunlock(&pvh_global_lock);
4440 return (rv);
4441 }
4442
4443 /*
4444 * pmap_page_wired_mappings:
4445 *
4446 * Return the number of managed mappings to the given physical page
4447 * that are wired.
4448 */
4449 int
4450 pmap_page_wired_mappings(vm_page_t m)
4451 {
4452 int count;
4453
4454 count = 0;
4455 if ((m->oflags & VPO_UNMANAGED) != 0)
4456 return (count);
4457 rw_wlock(&pvh_global_lock);
4458 count = pmap_pvh_wired_mappings(&m->md, count);
4459 if ((m->flags & PG_FICTITIOUS) == 0) {
4460 count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
4461 count);
4462 }
4463 rw_wunlock(&pvh_global_lock);
4464 return (count);
4465 }
4466
4467 /*
4468 * pmap_pvh_wired_mappings:
4469 *
4470 * Return the updated number "count" of managed mappings that are wired.
4471 */
4472 static int
4473 pmap_pvh_wired_mappings(struct md_page *pvh, int count)
4474 {
4475 pmap_t pmap;
4476 pt_entry_t *pte;
4477 pv_entry_t pv;
4478
4479 rw_assert(&pvh_global_lock, RA_WLOCKED);
4480 sched_pin();
4481 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4482 pmap = PV_PMAP(pv);
4483 PMAP_LOCK(pmap);
4484 pte = pmap_pte_quick(pmap, pv->pv_va);
4485 if ((*pte & PG_W) != 0)
4486 count++;
4487 PMAP_UNLOCK(pmap);
4488 }
4489 sched_unpin();
4490 return (count);
4491 }
4492
4493 /*
4494 * Returns TRUE if the given page is mapped individually or as part of
4495 * a 4mpage. Otherwise, returns FALSE.
4496 */
4497 boolean_t
4498 pmap_page_is_mapped(vm_page_t m)
4499 {
4500 boolean_t rv;
4501
4502 if ((m->oflags & VPO_UNMANAGED) != 0)
4503 return (FALSE);
4504 rw_wlock(&pvh_global_lock);
4505 rv = !TAILQ_EMPTY(&m->md.pv_list) ||
4506 ((m->flags & PG_FICTITIOUS) == 0 &&
4507 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
4508 rw_wunlock(&pvh_global_lock);
4509 return (rv);
4510 }
4511
4512 /*
4513 * Remove all pages from specified address space
4514 * this aids process exit speeds. Also, this code
4515 * is special cased for current process only, but
4516 * can have the more generic (and slightly slower)
4517 * mode enabled. This is much faster than pmap_remove
4518 * in the case of running down an entire address space.
4519 */
4520 void
4521 pmap_remove_pages(pmap_t pmap)
4522 {
4523 pt_entry_t *pte, tpte;
4524 vm_page_t m, mpte, mt;
4525 pv_entry_t pv;
4526 struct md_page *pvh;
4527 struct pv_chunk *pc, *npc;
4528 struct spglist free;
4529 int field, idx;
4530 int32_t bit;
4531 uint32_t inuse, bitmask;
4532 int allfree;
4533
4534 if (pmap != PCPU_GET(curpmap)) {
4535 printf("warning: pmap_remove_pages called with non-current pmap\n");
4536 return;
4537 }
4538 SLIST_INIT(&free);
4539 rw_wlock(&pvh_global_lock);
4540 PMAP_LOCK(pmap);
4541 sched_pin();
4542 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4543 KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap,
4544 pc->pc_pmap));
4545 allfree = 1;
4546 for (field = 0; field < _NPCM; field++) {
4547 inuse = ~pc->pc_map[field] & pc_freemask[field];
4548 while (inuse != 0) {
4549 bit = bsfl(inuse);
4550 bitmask = 1UL << bit;
4551 idx = field * 32 + bit;
4552 pv = &pc->pc_pventry[idx];
4553 inuse &= ~bitmask;
4554
4555 pte = pmap_pde(pmap, pv->pv_va);
4556 tpte = *pte;
4557 if ((tpte & PG_PS) == 0) {
4558 pte = vtopte(pv->pv_va);
4559 tpte = *pte & ~PG_PTE_PAT;
4560 }
4561
4562 if (tpte == 0) {
4563 printf(
4564 "TPTE at %p IS ZERO @ VA %08x\n",
4565 pte, pv->pv_va);
4566 panic("bad pte");
4567 }
4568
4569 /*
4570 * We cannot remove wired pages from a process' mapping at this time
4571 */
4572 if (tpte & PG_W) {
4573 allfree = 0;
4574 continue;
4575 }
4576
4577 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
4578 KASSERT(m->phys_addr == (tpte & PG_FRAME),
4579 ("vm_page_t %p phys_addr mismatch %016jx %016jx",
4580 m, (uintmax_t)m->phys_addr,
4581 (uintmax_t)tpte));
4582
4583 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
4584 m < &vm_page_array[vm_page_array_size],
4585 ("pmap_remove_pages: bad tpte %#jx",
4586 (uintmax_t)tpte));
4587
4588 pte_clear(pte);
4589
4590 /*
4591 * Update the vm_page_t clean/reference bits.
4592 */
4593 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4594 if ((tpte & PG_PS) != 0) {
4595 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4596 vm_page_dirty(mt);
4597 } else
4598 vm_page_dirty(m);
4599 }
4600
4601 /* Mark free */
4602 PV_STAT(pv_entry_frees++);
4603 PV_STAT(pv_entry_spare++);
4604 pv_entry_count--;
4605 pc->pc_map[field] |= bitmask;
4606 if ((tpte & PG_PS) != 0) {
4607 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
4608 pvh = pa_to_pvh(tpte & PG_PS_FRAME);
4609 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4610 if (TAILQ_EMPTY(&pvh->pv_list)) {
4611 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4612 if (TAILQ_EMPTY(&mt->md.pv_list))
4613 vm_page_aflag_clear(mt, PGA_WRITEABLE);
4614 }
4615 mpte = pmap_remove_pt_page(pmap, pv->pv_va);
4616 if (mpte != NULL) {
4617 pmap->pm_stats.resident_count--;
4618 KASSERT(mpte->wire_count == NPTEPG,
4619 ("pmap_remove_pages: pte page wire count error"));
4620 mpte->wire_count = 0;
4621 pmap_add_delayed_free_list(mpte, &free, FALSE);
4622 atomic_subtract_int(&vm_cnt.v_wire_count, 1);
4623 }
4624 } else {
4625 pmap->pm_stats.resident_count--;
4626 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4627 if (TAILQ_EMPTY(&m->md.pv_list) &&
4628 (m->flags & PG_FICTITIOUS) == 0) {
4629 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4630 if (TAILQ_EMPTY(&pvh->pv_list))
4631 vm_page_aflag_clear(m, PGA_WRITEABLE);
4632 }
4633 pmap_unuse_pt(pmap, pv->pv_va, &free);
4634 }
4635 }
4636 }
4637 if (allfree) {
4638 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4639 free_pv_chunk(pc);
4640 }
4641 }
4642 sched_unpin();
4643 pmap_invalidate_all(pmap);
4644 rw_wunlock(&pvh_global_lock);
4645 PMAP_UNLOCK(pmap);
4646 pmap_free_zero_pages(&free);
4647 }
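/*
 * Editorial note (illustrative, not part of the original source): the pv
 * chunk scan above finds in-use entries with simple bit arithmetic.  For
 * each 32-bit field,
 *
 *	inuse = ~pc->pc_map[field] & pc_freemask[field];
 *
 * has a 1 bit for every allocated pv entry, and bsfl() returns the index
 * of the lowest set bit.  For example, if pc_map[0] == 0xfffffff5 then
 * inuse == 0x0000000a, bsfl() yields bit 1 first, the pv entry at index
 * field * 32 + 1 is processed, and bit 3 follows on the next pass.
 */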
4648
4649 /*
4650 * pmap_is_modified:
4651 *
4652 * Return whether or not the specified physical page was modified
4653 * in any physical maps.
4654 */
4655 boolean_t
4656 pmap_is_modified(vm_page_t m)
4657 {
4658 boolean_t rv;
4659
4660 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4661 ("pmap_is_modified: page %p is not managed", m));
4662
4663 /*
4664 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
4665 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE
4666 * is clear, no PTEs can have PG_M set.
4667 */
4668 VM_OBJECT_ASSERT_WLOCKED(m->object);
4669 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
4670 return (FALSE);
4671 rw_wlock(&pvh_global_lock);
4672 rv = pmap_is_modified_pvh(&m->md) ||
4673 ((m->flags & PG_FICTITIOUS) == 0 &&
4674 pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4675 rw_wunlock(&pvh_global_lock);
4676 return (rv);
4677 }
4678
4679 /*
4680 * Returns TRUE if any of the given mappings were used to modify
4681 * physical memory. Otherwise, returns FALSE. Both page and 4mpage
4682 * mappings are supported.
4683 */
4684 static boolean_t
4685 pmap_is_modified_pvh(struct md_page *pvh)
4686 {
4687 pv_entry_t pv;
4688 pt_entry_t *pte;
4689 pmap_t pmap;
4690 boolean_t rv;
4691
4692 rw_assert(&pvh_global_lock, RA_WLOCKED);
4693 rv = FALSE;
4694 sched_pin();
4695 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4696 pmap = PV_PMAP(pv);
4697 PMAP_LOCK(pmap);
4698 pte = pmap_pte_quick(pmap, pv->pv_va);
4699 rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
4700 PMAP_UNLOCK(pmap);
4701 if (rv)
4702 break;
4703 }
4704 sched_unpin();
4705 return (rv);
4706 }
4707
4708 /*
4709 * pmap_is_prefaultable:
4710 *
4711 * Return whether or not the specified virtual address is eligible
4712 * for prefault.
4713 */
4714 boolean_t
4715 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4716 {
4717 pd_entry_t *pde;
4718 pt_entry_t *pte;
4719 boolean_t rv;
4720
4721 rv = FALSE;
4722 PMAP_LOCK(pmap);
4723 pde = pmap_pde(pmap, addr);
4724 if (*pde != 0 && (*pde & PG_PS) == 0) {
4725 pte = vtopte(addr);
4726 rv = *pte == 0;
4727 }
4728 PMAP_UNLOCK(pmap);
4729 return (rv);
4730 }
4731
4732 /*
4733 * pmap_is_referenced:
4734 *
4735 * Return whether or not the specified physical page was referenced
4736 * in any physical maps.
4737 */
4738 boolean_t
4739 pmap_is_referenced(vm_page_t m)
4740 {
4741 boolean_t rv;
4742
4743 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4744 ("pmap_is_referenced: page %p is not managed", m));
4745 rw_wlock(&pvh_global_lock);
4746 rv = pmap_is_referenced_pvh(&m->md) ||
4747 ((m->flags & PG_FICTITIOUS) == 0 &&
4748 pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4749 rw_wunlock(&pvh_global_lock);
4750 return (rv);
4751 }
4752
4753 /*
4754 * Returns TRUE if any of the given mappings were referenced and FALSE
4755 * otherwise. Both page and 4mpage mappings are supported.
4756 */
4757 static boolean_t
4758 pmap_is_referenced_pvh(struct md_page *pvh)
4759 {
4760 pv_entry_t pv;
4761 pt_entry_t *pte;
4762 pmap_t pmap;
4763 boolean_t rv;
4764
4765 rw_assert(&pvh_global_lock, RA_WLOCKED);
4766 rv = FALSE;
4767 sched_pin();
4768 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4769 pmap = PV_PMAP(pv);
4770 PMAP_LOCK(pmap);
4771 pte = pmap_pte_quick(pmap, pv->pv_va);
4772 rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
4773 PMAP_UNLOCK(pmap);
4774 if (rv)
4775 break;
4776 }
4777 sched_unpin();
4778 return (rv);
4779 }
4780
4781 /*
4782 * Clear the write and modified bits in each of the given page's mappings.
4783 */
4784 void
4785 pmap_remove_write(vm_page_t m)
4786 {
4787 struct md_page *pvh;
4788 pv_entry_t next_pv, pv;
4789 pmap_t pmap;
4790 pd_entry_t *pde;
4791 pt_entry_t oldpte, *pte;
4792 vm_offset_t va;
4793
4794 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4795 ("pmap_remove_write: page %p is not managed", m));
4796
4797 /*
4798 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
4799 * set by another thread while the object is locked. Thus,
4800 * if PGA_WRITEABLE is clear, no page table entries need updating.
4801 */
4802 VM_OBJECT_ASSERT_WLOCKED(m->object);
4803 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
4804 return;
4805 rw_wlock(&pvh_global_lock);
4806 sched_pin();
4807 if ((m->flags & PG_FICTITIOUS) != 0)
4808 goto small_mappings;
4809 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4810 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4811 va = pv->pv_va;
4812 pmap = PV_PMAP(pv);
4813 PMAP_LOCK(pmap);
4814 pde = pmap_pde(pmap, va);
4815 if ((*pde & PG_RW) != 0)
4816 (void)pmap_demote_pde(pmap, pde, va);
4817 PMAP_UNLOCK(pmap);
4818 }
4819 small_mappings:
4820 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4821 pmap = PV_PMAP(pv);
4822 PMAP_LOCK(pmap);
4823 pde = pmap_pde(pmap, pv->pv_va);
4824 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
4825 " a 4mpage in page %p's pv list", m));
4826 pte = pmap_pte_quick(pmap, pv->pv_va);
4827 retry:
4828 oldpte = *pte;
4829 if ((oldpte & PG_RW) != 0) {
4830 /*
4831 * Regardless of whether a pte is 32 or 64 bits
4832 * in size, PG_RW and PG_M are among the least
4833 * significant 32 bits.
4834 */
4835 if (!atomic_cmpset_int((u_int *)pte, oldpte,
4836 oldpte & ~(PG_RW | PG_M)))
4837 goto retry;
4838 if ((oldpte & PG_M) != 0)
4839 vm_page_dirty(m);
4840 pmap_invalidate_page(pmap, pv->pv_va);
4841 }
4842 PMAP_UNLOCK(pmap);
4843 }
4844 vm_page_aflag_clear(m, PGA_WRITEABLE);
4845 sched_unpin();
4846 rw_wunlock(&pvh_global_lock);
4847 }
4848
4849 #define PMAP_TS_REFERENCED_MAX 5
4850
4851 /*
4852 * pmap_ts_referenced:
4853 *
4854 * Return a count of reference bits for a page, clearing those bits.
4855 * It is not necessary for every reference bit to be cleared, but it
4856 * is necessary that 0 only be returned when there are truly no
4857 * reference bits set.
4858 *
4859 * XXX: The exact number of bits to check and clear is a matter that
4860 * should be tested and standardized at some point in the future for
4861 * optimal aging of shared pages.
4862 *
4863 * As an optimization, update the page's dirty field if a modified bit is
4864 * found while counting reference bits. This opportunistic update can be
4865 * performed at low cost and can eliminate the need for some future calls
4866 * to pmap_is_modified(). However, since this function stops after
4867 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
4868 * dirty pages. Those dirty pages will only be detected by a future call
4869 * to pmap_is_modified().
4870 */
4871 int
4872 pmap_ts_referenced(vm_page_t m)
4873 {
4874 struct md_page *pvh;
4875 pv_entry_t pv, pvf;
4876 pmap_t pmap;
4877 pd_entry_t *pde;
4878 pt_entry_t *pte;
4879 vm_paddr_t pa;
4880 int rtval = 0;
4881
4882 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4883 ("pmap_ts_referenced: page %p is not managed", m));
4884 pa = VM_PAGE_TO_PHYS(m);
4885 pvh = pa_to_pvh(pa);
4886 rw_wlock(&pvh_global_lock);
4887 sched_pin();
4888 if ((m->flags & PG_FICTITIOUS) != 0 ||
4889 (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
4890 goto small_mappings;
4891 pv = pvf;
4892 do {
4893 pmap = PV_PMAP(pv);
4894 PMAP_LOCK(pmap);
4895 pde = pmap_pde(pmap, pv->pv_va);
4896 if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4897 /*
4898 * Although "*pde" is mapping a 2/4MB page, because
4899 * this function is called at a 4KB page granularity,
4900 * we only update the 4KB page under test.
4901 */
4902 vm_page_dirty(m);
4903 }
4904 if ((*pde & PG_A) != 0) {
4905 /*
4906 * Since this reference bit is shared by either 1024
4907 * or 512 4KB pages, it should not be cleared every
4908 * time it is tested. Apply a simple "hash" function
4909 * on the physical page number, the virtual superpage
4910 * number, and the pmap address to select one 4KB page
4911 * out of the 1024 or 512 on which testing the
4912 * reference bit will result in clearing that bit.
4913 * This function is designed to avoid the selection of
4914 * the same 4KB page for every 2- or 4MB page mapping.
4915 *
4916 * On demotion, a mapping that hasn't been referenced
4917 * is simply destroyed. To avoid the possibility of a
4918 * subsequent page fault on a demoted wired mapping,
4919 * always leave its reference bit set. Moreover,
4920 * since the superpage is wired, the current state of
4921 * its reference bit won't affect page replacement.
4922 */
4923 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
4924 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
4925 (*pde & PG_W) == 0) {
4926 atomic_clear_int((u_int *)pde, PG_A);
4927 pmap_invalidate_page(pmap, pv->pv_va);
4928 }
4929 rtval++;
4930 }
4931 PMAP_UNLOCK(pmap);
4932 /* Rotate the PV list if it has more than one entry. */
4933 if (TAILQ_NEXT(pv, pv_next) != NULL) {
4934 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4935 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4936 }
4937 if (rtval >= PMAP_TS_REFERENCED_MAX)
4938 goto out;
4939 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
4940 small_mappings:
4941 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
4942 goto out;
4943 pv = pvf;
4944 do {
4945 pmap = PV_PMAP(pv);
4946 PMAP_LOCK(pmap);
4947 pde = pmap_pde(pmap, pv->pv_va);
4948 KASSERT((*pde & PG_PS) == 0,
4949 ("pmap_ts_referenced: found a 4mpage in page %p's pv list",
4950 m));
4951 pte = pmap_pte_quick(pmap, pv->pv_va);
4952 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
4953 vm_page_dirty(m);
4954 if ((*pte & PG_A) != 0) {
4955 atomic_clear_int((u_int *)pte, PG_A);
4956 pmap_invalidate_page(pmap, pv->pv_va);
4957 rtval++;
4958 }
4959 PMAP_UNLOCK(pmap);
4960 /* Rotate the PV list if it has more than one entry. */
4961 if (TAILQ_NEXT(pv, pv_next) != NULL) {
4962 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4963 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4964 }
4965 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval <
4966 PMAP_TS_REFERENCED_MAX);
4967 out:
4968 sched_unpin();
4969 rw_wunlock(&pvh_global_lock);
4970 return (rtval);
4971 }
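/*
 * Editorial note (illustrative, not part of the original source): the
 * "hash" used for superpage mappings above,
 *
 *	(((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) &
 *	    (NPTEPG - 1)) == 0
 *
 * is true for exactly one of the NPTEPG (1024 non-PAE, 512 PAE) 4KB pages
 * that share a single PDE's reference bit.  Repeated calls on different
 * pages of the same 2/4MB mapping therefore clear PG_A only about once
 * per superpage rather than on every call, and the page chosen varies
 * with the virtual superpage number and the pmap.
 */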
4972
4973 /*
4974 * Apply the given advice to the specified range of addresses within the
4975 * given pmap. Depending on the advice, clear the referenced and/or
4976 * modified flags in each mapping and set the mapped page's dirty field.
4977 */
4978 void
4979 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
4980 {
4981 pd_entry_t oldpde, *pde;
4982 pt_entry_t *pte;
4983 vm_offset_t va, pdnxt;
4984 vm_page_t m;
4985 boolean_t anychanged, pv_lists_locked;
4986
4987 if (advice != MADV_DONTNEED && advice != MADV_FREE)
4988 return;
4989 if (pmap_is_current(pmap))
4990 pv_lists_locked = FALSE;
4991 else {
4992 pv_lists_locked = TRUE;
4993 resume:
4994 rw_wlock(&pvh_global_lock);
4995 sched_pin();
4996 }
4997 anychanged = FALSE;
4998 PMAP_LOCK(pmap);
4999 for (; sva < eva; sva = pdnxt) {
5000 pdnxt = (sva + NBPDR) & ~PDRMASK;
5001 if (pdnxt < sva)
5002 pdnxt = eva;
5003 pde = pmap_pde(pmap, sva);
5004 oldpde = *pde;
5005 if ((oldpde & PG_V) == 0)
5006 continue;
5007 else if ((oldpde & PG_PS) != 0) {
5008 if ((oldpde & PG_MANAGED) == 0)
5009 continue;
5010 if (!pv_lists_locked) {
5011 pv_lists_locked = TRUE;
5012 if (!rw_try_wlock(&pvh_global_lock)) {
5013 if (anychanged)
5014 pmap_invalidate_all(pmap);
5015 PMAP_UNLOCK(pmap);
5016 goto resume;
5017 }
5018 sched_pin();
5019 }
5020 if (!pmap_demote_pde(pmap, pde, sva)) {
5021 /*
5022 * The large page mapping was destroyed.
5023 */
5024 continue;
5025 }
5026
5027 /*
5028 * Unless the page mappings are wired, remove the
5029 * mapping to a single page so that a subsequent
5030 * access may repromote. Since the underlying page
5031 * table page is fully populated, this removal never
5032 * frees a page table page.
5033 */
5034 if ((oldpde & PG_W) == 0) {
5035 pte = pmap_pte_quick(pmap, sva);
5036 KASSERT((*pte & PG_V) != 0,
5037 ("pmap_advise: invalid PTE"));
5038 pmap_remove_pte(pmap, pte, sva, NULL);
5039 anychanged = TRUE;
5040 }
5041 }
5042 if (pdnxt > eva)
5043 pdnxt = eva;
5044 va = pdnxt;
5045 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
5046 sva += PAGE_SIZE) {
5047 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
5048 goto maybe_invlrng;
5049 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5050 if (advice == MADV_DONTNEED) {
5051 /*
5052 * Future calls to pmap_is_modified()
5053 * can be avoided by making the page
5054 * dirty now.
5055 */
5056 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
5057 vm_page_dirty(m);
5058 }
5059 atomic_clear_int((u_int *)pte, PG_M | PG_A);
5060 } else if ((*pte & PG_A) != 0)
5061 atomic_clear_int((u_int *)pte, PG_A);
5062 else
5063 goto maybe_invlrng;
5064 if ((*pte & PG_G) != 0) {
5065 if (va == pdnxt)
5066 va = sva;
5067 } else
5068 anychanged = TRUE;
5069 continue;
5070 maybe_invlrng:
5071 if (va != pdnxt) {
5072 pmap_invalidate_range(pmap, va, sva);
5073 va = pdnxt;
5074 }
5075 }
5076 if (va != pdnxt)
5077 pmap_invalidate_range(pmap, va, sva);
5078 }
5079 if (anychanged)
5080 pmap_invalidate_all(pmap);
5081 if (pv_lists_locked) {
5082 sched_unpin();
5083 rw_wunlock(&pvh_global_lock);
5084 }
5085 PMAP_UNLOCK(pmap);
5086 }
5087
5088 /*
5089 * Clear the modify bits on the specified physical page.
5090 */
5091 void
5092 pmap_clear_modify(vm_page_t m)
5093 {
5094 struct md_page *pvh;
5095 pv_entry_t next_pv, pv;
5096 pmap_t pmap;
5097 pd_entry_t oldpde, *pde;
5098 pt_entry_t oldpte, *pte;
5099 vm_offset_t va;
5100
5101 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5102 ("pmap_clear_modify: page %p is not managed", m));
5103 VM_OBJECT_ASSERT_WLOCKED(m->object);
5104 KASSERT(!vm_page_xbusied(m),
5105 ("pmap_clear_modify: page %p is exclusive busied", m));
5106
5107 /*
5108 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
5109 * If the object containing the page is locked and the page is not
5110 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
5111 */
5112 if ((m->aflags & PGA_WRITEABLE) == 0)
5113 return;
5114 rw_wlock(&pvh_global_lock);
5115 sched_pin();
5116 if ((m->flags & PG_FICTITIOUS) != 0)
5117 goto small_mappings;
5118 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5119 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5120 va = pv->pv_va;
5121 pmap = PV_PMAP(pv);
5122 PMAP_LOCK(pmap);
5123 pde = pmap_pde(pmap, va);
5124 oldpde = *pde;
5125 if ((oldpde & PG_RW) != 0) {
5126 if (pmap_demote_pde(pmap, pde, va)) {
5127 if ((oldpde & PG_W) == 0) {
5128 /*
5129 * Write protect the mapping to a
5130 * single page so that a subsequent
5131 * write access may repromote.
5132 */
5133 va += VM_PAGE_TO_PHYS(m) - (oldpde &
5134 PG_PS_FRAME);
5135 pte = pmap_pte_quick(pmap, va);
5136 oldpte = *pte;
5137 if ((oldpte & PG_V) != 0) {
5138 /*
5139 * Regardless of whether a pte is 32 or 64 bits
5140 * in size, PG_RW and PG_M are among the least
5141 * significant 32 bits.
5142 */
5143 while (!atomic_cmpset_int((u_int *)pte,
5144 oldpte,
5145 oldpte & ~(PG_M | PG_RW)))
5146 oldpte = *pte;
5147 vm_page_dirty(m);
5148 pmap_invalidate_page(pmap, va);
5149 }
5150 }
5151 }
5152 }
5153 PMAP_UNLOCK(pmap);
5154 }
5155 small_mappings:
5156 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5157 pmap = PV_PMAP(pv);
5158 PMAP_LOCK(pmap);
5159 pde = pmap_pde(pmap, pv->pv_va);
5160 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
5161 " a 4mpage in page %p's pv list", m));
5162 pte = pmap_pte_quick(pmap, pv->pv_va);
5163 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5164 /*
5165 * Regardless of whether a pte is 32 or 64 bits
5166 * in size, PG_M is among the least significant
5167 * 32 bits.
5168 */
5169 atomic_clear_int((u_int *)pte, PG_M);
5170 pmap_invalidate_page(pmap, pv->pv_va);
5171 }
5172 PMAP_UNLOCK(pmap);
5173 }
5174 sched_unpin();
5175 rw_wunlock(&pvh_global_lock);
5176 }
5177
5178 /*
5179 * Miscellaneous support routines follow
5180 */
5181
5182 /* Adjust the cache mode for a 4KB page mapped via a PTE. */
5183 static __inline void
5184 pmap_pte_attr(pt_entry_t *pte, int cache_bits)
5185 {
5186 u_int opte, npte;
5187
5188 /*
5189 * The cache mode bits are all in the low 32-bits of the
5190 * PTE, so we can just spin on updating the low 32-bits.
5191 */
5192 do {
5193 opte = *(u_int *)pte;
5194 npte = opte & ~PG_PTE_CACHE;
5195 npte |= cache_bits;
5196 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
5197 }
5198
5199 /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
5200 static __inline void
5201 pmap_pde_attr(pd_entry_t *pde, int cache_bits)
5202 {
5203 u_int opde, npde;
5204
5205 /*
5206 * The cache mode bits are all in the low 32-bits of the
5207 * PDE, so we can just spin on updating the low 32-bits.
5208 */
5209 do {
5210 opde = *(u_int *)pde;
5211 npde = opde & ~PG_PDE_CACHE;
5212 npde |= cache_bits;
5213 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
5214 }
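/*
 * Editorial note (not part of the original source): both attribute
 * helpers above use the same lock-free update loop: re-read the entry,
 * splice the new cache bits into the low 32 bits, and retry the
 * atomic_cmpset_int() until it succeeds or the entry already holds the
 * desired value.  Because the PAT/PCD/PWT bits all live in the low 32
 * bits even for 64-bit PAE entries, a 32-bit compare-and-swap suffices.
 */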
5215
5216 /*
5217 * Map a set of physical memory pages into the kernel virtual
5218 * address space. Return a pointer to where it is mapped. This
5219 * routine is intended to be used for mapping device memory,
5220 * NOT real memory.
5221 */
5222 void *
5223 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
5224 {
5225 struct pmap_preinit_mapping *ppim;
5226 vm_offset_t va, offset;
5227 vm_size_t tmpsize;
5228 int i;
5229
5230 offset = pa & PAGE_MASK;
5231 size = round_page(offset + size);
5232 pa = pa & PG_FRAME;
5233
5234 if (pa < KERNLOAD && pa + size <= KERNLOAD)
5235 va = KERNBASE + pa;
5236 else if (!pmap_initialized) {
5237 va = 0;
5238 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
5239 ppim = pmap_preinit_mapping + i;
5240 if (ppim->va == 0) {
5241 ppim->pa = pa;
5242 ppim->sz = size;
5243 ppim->mode = mode;
5244 ppim->va = virtual_avail;
5245 virtual_avail += size;
5246 va = ppim->va;
5247 break;
5248 }
5249 }
5250 if (va == 0)
5251 panic("%s: too many preinit mappings", __func__);
5252 } else {
5253 /*
5254 * If we have a preinit mapping, re-use it.
5255 */
5256 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
5257 ppim = pmap_preinit_mapping + i;
5258 if (ppim->pa == pa && ppim->sz == size &&
5259 ppim->mode == mode)
5260 return ((void *)(ppim->va + offset));
5261 }
5262 va = kva_alloc(size);
5263 if (va == 0)
5264 panic("%s: Couldn't allocate KVA", __func__);
5265 }
5266 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
5267 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
5268 pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
5269 pmap_invalidate_cache_range(va, va + size, FALSE);
5270 return ((void *)(va + offset));
5271 }
5272
5273 void *
5274 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
5275 {
5276
5277 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
5278 }
5279
5280 void *
5281 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
5282 {
5283
5284 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
5285 }
5286
5287 void
5288 pmap_unmapdev(vm_offset_t va, vm_size_t size)
5289 {
5290 struct pmap_preinit_mapping *ppim;
5291 vm_offset_t offset;
5292 int i;
5293
5294 if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
5295 return;
5296 offset = va & PAGE_MASK;
5297 size = round_page(offset + size);
5298 va = trunc_page(va);
5299 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
5300 ppim = pmap_preinit_mapping + i;
5301 if (ppim->va == va && ppim->sz == size) {
5302 if (pmap_initialized)
5303 return;
5304 ppim->pa = 0;
5305 ppim->va = 0;
5306 ppim->sz = 0;
5307 ppim->mode = 0;
5308 if (va + size == virtual_avail)
5309 virtual_avail = va;
5310 return;
5311 }
5312 }
5313 if (pmap_initialized)
5314 kva_free(va, size);
5315 }
5316
5317 /*
5318 * Sets the memory attribute for the specified page.
5319 */
5320 void
5321 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
5322 {
5323
5324 m->md.pat_mode = ma;
5325 if ((m->flags & PG_FICTITIOUS) != 0)
5326 return;
5327
5328 /*
5329 * If "m" is a normal page, flush it from the cache.
5330 * See pmap_invalidate_cache_range().
5331 *
5332 * First, try to find an existing mapping of the page by an sf
5333 * buffer. sf_buf_invalidate_cache() modifies the mapping and
5334 * flushes the cache.
5335 */
5336 if (sf_buf_invalidate_cache(m))
5337 return;
5338
5339 /*
5340 * If the page is not mapped by an sf buffer, but the CPU does
5341 * not support self-snoop, map the page transiently and do the
5342 * invalidation. In the worst case, the whole cache is flushed
5343 * by pmap_invalidate_cache_range().
5344 */
5345 if ((cpu_feature & CPUID_SS) == 0)
5346 pmap_flush_page(m);
5347 }
5348
5349 static void
5350 pmap_flush_page(vm_page_t m)
5351 {
5352 pt_entry_t *cmap_pte2;
5353 struct pcpu *pc;
5354 vm_offset_t sva, eva;
5355 bool useclflushopt;
5356
5357 useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
5358 if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) {
5359 sched_pin();
5360 pc = get_pcpu();
5361 cmap_pte2 = pc->pc_cmap_pte2;
5362 mtx_lock(&pc->pc_cmap_lock);
5363 if (*cmap_pte2)
5364 panic("pmap_flush_page: CMAP2 busy");
5365 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
5366 PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0);
5367 invlcaddr(pc->pc_cmap_addr2);
5368 sva = (vm_offset_t)pc->pc_cmap_addr2;
5369 eva = sva + PAGE_SIZE;
5370
5371 /*
5372 * Use mfence or sfence despite the ordering implied by
5373 * mtx_{un,}lock() because clflush on non-Intel CPUs
5374 * and clflushopt are not guaranteed to be ordered by
5375 * any other instruction.
5376 */
5377 if (useclflushopt)
5378 sfence();
5379 else if (cpu_vendor_id != CPU_VENDOR_INTEL)
5380 mfence();
5381 for (; sva < eva; sva += cpu_clflush_line_size) {
5382 if (useclflushopt)
5383 clflushopt(sva);
5384 else
5385 clflush(sva);
5386 }
5387 if (useclflushopt)
5388 sfence();
5389 else if (cpu_vendor_id != CPU_VENDOR_INTEL)
5390 mfence();
5391 *cmap_pte2 = 0;
5392 sched_unpin();
5393 mtx_unlock(&pc->pc_cmap_lock);
5394 } else
5395 pmap_invalidate_cache();
5396 }
5397
5398 /*
5399 * Changes the specified virtual address range's memory type to that given by
5400 * the parameter "mode". The specified virtual address range must be
5401 * completely contained within either the kernel map.
5402 *
5403 * Returns zero if the change completed successfully, and either EINVAL or
5404 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part
5405 * of the virtual address range was not mapped, and ENOMEM is returned if
5406 * there was insufficient memory available to complete the change.
5407 */
5408 int
5409 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
5410 {
5411 vm_offset_t base, offset, tmpva;
5412 pd_entry_t *pde;
5413 pt_entry_t *pte;
5414 int cache_bits_pte, cache_bits_pde;
5415 boolean_t changed;
5416
5417 base = trunc_page(va);
5418 offset = va & PAGE_MASK;
5419 size = round_page(offset + size);
5420
5421 /*
5422 * Only supported on kernel virtual addresses above the recursive map.
5423 */
5424 if (base < VM_MIN_KERNEL_ADDRESS)
5425 return (EINVAL);
5426
5427 cache_bits_pde = pmap_cache_bits(mode, 1);
5428 cache_bits_pte = pmap_cache_bits(mode, 0);
5429 changed = FALSE;
5430
5431 /*
5432 * Pages that aren't mapped aren't supported. Also break down
5433 * 2/4MB pages into 4KB pages if required.
5434 */
5435 PMAP_LOCK(kernel_pmap);
5436 for (tmpva = base; tmpva < base + size; ) {
5437 pde = pmap_pde(kernel_pmap, tmpva);
5438 if (*pde == 0) {
5439 PMAP_UNLOCK(kernel_pmap);
5440 return (EINVAL);
5441 }
5442 if (*pde & PG_PS) {
5443 /*
5444 * If the current 2/4MB page already has
5445 * the required memory type, then we need not
5446 * demote this page. Just increment tmpva to
5447 * the next 2/4MB page frame.
5448 */
5449 if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
5450 tmpva = trunc_4mpage(tmpva) + NBPDR;
5451 continue;
5452 }
5453
5454 /*
5455 * If the current offset aligns with a 2/4MB
5456 * page frame and there is at least 2/4MB left
5457 * within the range, then we need not break
5458 * down this page into 4KB pages.
5459 */
5460 if ((tmpva & PDRMASK) == 0 &&
5461 tmpva + PDRMASK < base + size) {
5462 tmpva += NBPDR;
5463 continue;
5464 }
5465 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
5466 PMAP_UNLOCK(kernel_pmap);
5467 return (ENOMEM);
5468 }
5469 }
5470 pte = vtopte(tmpva);
5471 if (*pte == 0) {
5472 PMAP_UNLOCK(kernel_pmap);
5473 return (EINVAL);
5474 }
5475 tmpva += PAGE_SIZE;
5476 }
5477 PMAP_UNLOCK(kernel_pmap);
5478
5479 /*
5480 * Ok, all the pages exist, so run through them updating their
5481 * cache mode if required.
5482 */
5483 for (tmpva = base; tmpva < base + size; ) {
5484 pde = pmap_pde(kernel_pmap, tmpva);
5485 if (*pde & PG_PS) {
5486 if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
5487 pmap_pde_attr(pde, cache_bits_pde);
5488 changed = TRUE;
5489 }
5490 tmpva = trunc_4mpage(tmpva) + NBPDR;
5491 } else {
5492 pte = vtopte(tmpva);
5493 if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
5494 pmap_pte_attr(pte, cache_bits_pte);
5495 changed = TRUE;
5496 }
5497 tmpva += PAGE_SIZE;
5498 }
5499 }
5500
5501 /*
5502 * Flush CPU caches to make sure any data isn't cached that
5503 * shouldn't be, etc.
5504 */
5505 if (changed) {
5506 pmap_invalidate_range(kernel_pmap, base, tmpva);
5507 pmap_invalidate_cache_range(base, tmpva, FALSE);
5508 }
5509 return (0);
5510 }
5511
5512 /*
5513 * perform the pmap work for mincore
5514 */
5515 int
5516 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
5517 {
5518 pd_entry_t *pdep;
5519 pt_entry_t *ptep, pte;
5520 vm_paddr_t pa;
5521 int val;
5522
5523 PMAP_LOCK(pmap);
5524 retry:
5525 pdep = pmap_pde(pmap, addr);
5526 if (*pdep != 0) {
5527 if (*pdep & PG_PS) {
5528 pte = *pdep;
5529 /* Compute the physical address of the 4KB page. */
5530 pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
5531 PG_FRAME;
5532 val = MINCORE_SUPER;
5533 } else {
5534 ptep = pmap_pte(pmap, addr);
5535 pte = *ptep;
5536 pmap_pte_release(ptep);
5537 pa = pte & PG_FRAME;
5538 val = 0;
5539 }
5540 } else {
5541 pte = 0;
5542 pa = 0;
5543 val = 0;
5544 }
5545 if ((pte & PG_V) != 0) {
5546 val |= MINCORE_INCORE;
5547 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5548 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5549 if ((pte & PG_A) != 0)
5550 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5551 }
5552 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5553 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
5554 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
5555 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
5556 if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
5557 goto retry;
5558 } else
5559 PA_UNLOCK_COND(*locked_pa);
5560 PMAP_UNLOCK(pmap);
5561 return (val);
5562 }
5563
5564 void
5565 pmap_activate(struct thread *td)
5566 {
5567 pmap_t pmap, oldpmap;
5568 u_int cpuid;
5569 u_int32_t cr3;
5570
5571 critical_enter();
5572 pmap = vmspace_pmap(td->td_proc->p_vmspace);
5573 oldpmap = PCPU_GET(curpmap);
5574 cpuid = PCPU_GET(cpuid);
5575 #if defined(SMP)
5576 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
5577 CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
5578 #else
5579 CPU_CLR(cpuid, &oldpmap->pm_active);
5580 CPU_SET(cpuid, &pmap->pm_active);
5581 #endif
5582 #if defined(PAE) || defined(PAE_TABLES)
5583 cr3 = vtophys(pmap->pm_pdpt);
5584 #else
5585 cr3 = vtophys(pmap->pm_pdir);
5586 #endif
5587 /*
5588 * pmap_activate is for the current thread on the current cpu
5589 */
5590 td->td_pcb->pcb_cr3 = cr3;
5591 load_cr3(cr3);
5592 PCPU_SET(curpmap, pmap);
5593 critical_exit();
5594 }
5595
5596 void
5597 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
5598 {
5599 }
5600
5601 /*
5602 * Increase the starting virtual address of the given mapping if a
5603 * different alignment might result in more superpage mappings.
5604 */
5605 void
5606 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5607 vm_offset_t *addr, vm_size_t size)
5608 {
5609 vm_offset_t superpage_offset;
5610
5611 if (size < NBPDR)
5612 return;
5613 if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5614 offset += ptoa(object->pg_color);
5615 superpage_offset = offset & PDRMASK;
5616 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
5617 (*addr & PDRMASK) == superpage_offset)
5618 return;
5619 if ((*addr & PDRMASK) < superpage_offset)
5620 *addr = (*addr & ~PDRMASK) + superpage_offset;
5621 else
5622 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
5623 }
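/*
 * Editorial note (illustrative, not part of the original source): on a
 * non-PAE kernel (NBPDR == 4MB, PDRMASK == 0x3fffff), suppose the backing
 * object offset works out to 0x00612000 and the caller proposed
 * *addr == 0x20000000 for an 8MB mapping.  Then
 *
 *	superpage_offset = 0x00612000 & PDRMASK = 0x212000;
 *	(*addr & PDRMASK) == 0 < superpage_offset, so
 *	*addr = (0x20000000 & ~PDRMASK) + 0x212000 = 0x20212000;
 *
 * after which the virtual address and the object offset are congruent
 * modulo NBPDR, so any fully populated, 4MB-aligned run of the object can
 * later be mapped (or promoted) with a single PDE.
 */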
5624
5625 vm_offset_t
5626 pmap_quick_enter_page(vm_page_t m)
5627 {
5628 vm_offset_t qaddr;
5629 pt_entry_t *pte;
5630
5631 critical_enter();
5632 qaddr = PCPU_GET(qmap_addr);
5633 pte = vtopte(qaddr);
5634
5635 KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy"));
5636 *pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
5637 pmap_cache_bits(pmap_page_get_memattr(m), 0);
5638 invlpg(qaddr);
5639
5640 return (qaddr);
5641 }
5642
5643 void
5644 pmap_quick_remove_page(vm_offset_t addr)
5645 {
5646 vm_offset_t qaddr;
5647 pt_entry_t *pte;
5648
5649 qaddr = PCPU_GET(qmap_addr);
5650 pte = vtopte(qaddr);
5651
5652 KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use"));
5653 KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address"));
5654
5655 *pte = 0;
5656 critical_exit();
5657 }
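/*
 * Editorial note (not part of the original source): pmap_quick_enter_page()
 * and pmap_quick_remove_page() bracket a short access to one page through
 * the per-CPU qmap_addr window.  critical_enter() in the enter routine
 * keeps the thread on this CPU, so the PTE, the address and the TLB entry
 * stay consistent, and the matching critical_exit() is only issued in the
 * remove routine; every enter must therefore be paired with a remove on
 * the same CPU before the thread can block or migrate again.
 */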
5658
5659 #if defined(PMAP_DEBUG)
5660 int pmap_pid_dump(int pid)
5661 {
5662 pmap_t pmap;
5663 struct proc *p;
5664 int npte = 0;
5665 int index;
5666
5667 sx_slock(&allproc_lock);
5668 FOREACH_PROC_IN_SYSTEM(p) {
5669 if (p->p_pid != pid)
5670 continue;
5671
5672 if (p->p_vmspace) {
5673 int i,j;
5674 index = 0;
5675 pmap = vmspace_pmap(p->p_vmspace);
5676 for (i = 0; i < NPDEPTD; i++) {
5677 pd_entry_t *pde;
5678 pt_entry_t *pte;
5679 vm_offset_t base = i << PDRSHIFT;
5680
5681 pde = &pmap->pm_pdir[i];
5682 if (pde && pmap_pde_v(pde)) {
5683 for (j = 0; j < NPTEPG; j++) {
5684 vm_offset_t va = base + (j << PAGE_SHIFT);
5685 if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
5686 if (index) {
5687 index = 0;
5688 printf("\n");
5689 }
5690 sx_sunlock(&allproc_lock);
5691 return (npte);
5692 }
5693 pte = pmap_pte(pmap, va);
5694 if (pte && pmap_pte_v(pte)) {
5695 pt_entry_t pa;
5696 vm_page_t m;
5697 pa = *pte;
5698 m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
5699 printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
5700 va, pa, m->hold_count, m->wire_count, m->flags);
5701 npte++;
5702 index++;
5703 if (index >= 2) {
5704 index = 0;
5705 printf("\n");
5706 } else {
5707 printf(" ");
5708 }
5709 }
5710 }
5711 }
5712 }
5713 }
5714 }
5715 sx_sunlock(&allproc_lock);
5716 return (npte);
5717 }
5718 #endif