FreeBSD/Linux Kernel Cross Reference
sys/amd64/amd64/pmap.c
1 /*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 *
13 * This code is derived from software contributed to Berkeley by
14 * the Systems Programming Group of the University of Utah Computer
15 * Science Department and William Jolitz of UUNET Technologies Inc.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 * notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 * notice, this list of conditions and the following disclaimer in the
24 * documentation and/or other materials provided with the distribution.
25 * 3. All advertising materials mentioning features or use of this software
26 * must display the following acknowledgement:
27 * This product includes software developed by the University of
28 * California, Berkeley and its contributors.
29 * 4. Neither the name of the University nor the names of its contributors
30 * may be used to endorse or promote products derived from this software
31 * without specific prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
43 * SUCH DAMAGE.
44 *
45 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
46 */
47 /*-
48 * Copyright (c) 2003 Networks Associates Technology, Inc.
49 * All rights reserved.
50 *
51 * This software was developed for the FreeBSD Project by Jake Burkholder,
52 * Safeport Network Services, and Network Associates Laboratories, the
53 * Security Research Division of Network Associates, Inc. under
54 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
55 * CHATS research program.
56 *
57 * Redistribution and use in source and binary forms, with or without
58 * modification, are permitted provided that the following conditions
59 * are met:
60 * 1. Redistributions of source code must retain the above copyright
61 * notice, this list of conditions and the following disclaimer.
62 * 2. Redistributions in binary form must reproduce the above copyright
63 * notice, this list of conditions and the following disclaimer in the
64 * documentation and/or other materials provided with the distribution.
65 *
66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
67 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
68 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
69 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
70 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
71 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
72 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
73 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
74 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
75 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
76 * SUCH DAMAGE.
77 */
78
79 #include <sys/cdefs.h>
80 __FBSDID("$FreeBSD$");
81
82 /*
83 * Manages physical address maps.
84 *
85 * In addition to hardware address maps, this
86 * module is called upon to provide software-use-only
87 * maps which may or may not be stored in the same
88 * form as hardware maps. These pseudo-maps are
89 * used to store intermediate results from copy
90 * operations to and from address spaces.
91 *
92 * Since the information managed by this module is
93 * also stored by the logical address mapping module,
94 * this module may throw away valid virtual-to-physical
95 * mappings at almost any time. However, invalidations
96 * of virtual-to-physical mappings must be done as
97 * requested.
98 *
99 * In order to cope with hardware architectures which
100 * make virtual-to-physical map invalidates expensive,
101 * this module may delay invalidation or protection-reduction
102 * operations until such time as they are actually
103 * necessary. This module is given full information as
104 * to which processors are currently using which maps,
105 * and to when physical maps must be made correct.
106 */
107
108 #include "opt_msgbuf.h"
109 #include "opt_vm.h"
110
111 #include <sys/param.h>
112 #include <sys/systm.h>
113 #include <sys/kernel.h>
114 #include <sys/ktr.h>
115 #include <sys/lock.h>
116 #include <sys/malloc.h>
117 #include <sys/mman.h>
118 #include <sys/msgbuf.h>
119 #include <sys/mutex.h>
120 #include <sys/proc.h>
121 #include <sys/sx.h>
122 #include <sys/vmmeter.h>
123 #include <sys/sched.h>
124 #include <sys/sysctl.h>
125 #ifdef SMP
126 #include <sys/smp.h>
127 #endif
128
129 #include <vm/vm.h>
130 #include <vm/vm_param.h>
131 #include <vm/vm_kern.h>
132 #include <vm/vm_page.h>
133 #include <vm/vm_map.h>
134 #include <vm/vm_object.h>
135 #include <vm/vm_extern.h>
136 #include <vm/vm_pageout.h>
137 #include <vm/vm_pager.h>
138 #include <vm/vm_reserv.h>
139 #include <vm/uma.h>
140
141 #include <machine/cpu.h>
142 #include <machine/cputypes.h>
143 #include <machine/md_var.h>
144 #include <machine/pcb.h>
145 #include <machine/specialreg.h>
146 #ifdef SMP
147 #include <machine/smp.h>
148 #endif
149
150 #if !defined(DIAGNOSTIC)
151 #define PMAP_INLINE __gnu89_inline
152 #else
153 #define PMAP_INLINE
154 #endif
155
156 #define PV_STATS
157 #ifdef PV_STATS
158 #define PV_STAT(x) do { x ; } while (0)
159 #else
160 #define PV_STAT(x) do { } while (0)
161 #endif
162
163 #define pa_index(pa) ((pa) >> PDRSHIFT)
164 #define pa_to_pvh(pa) (&pv_table[pa_index(pa)])
165
166 struct pmap kernel_pmap_store;
167
168 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
169 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
170
171 static int ndmpdp;
172 static vm_paddr_t dmaplimit;
173 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
174 pt_entry_t pg_nx;
175
176 SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
177
178 static int pg_ps_enabled = 1;
179 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
180 "Are large page mappings enabled?");
181
182 static u_int64_t KPTphys; /* phys addr of kernel level 1 */
183 static u_int64_t KPDphys; /* phys addr of kernel level 2 */
184 u_int64_t KPDPphys; /* phys addr of kernel level 3 */
185 u_int64_t KPML4phys; /* phys addr of kernel level 4 */
186
187 static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */
188 static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */
189
190 /*
191 * Data for the pv entry allocation mechanism
192 */
193 static int pv_entry_count;
194 static struct md_page *pv_table;
195
196 /*
197 * All those kernel PT submaps that BSD is so fond of
198 */
199 pt_entry_t *CMAP1 = 0;
200 caddr_t CADDR1 = 0;
201 struct msgbuf *msgbufp = 0;
202
203 /*
204 * Crashdump maps.
205 */
206 static caddr_t crashdumpmap;
207
208 static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
209 static pv_entry_t get_pv_entry(pmap_t locked_pmap, boolean_t try);
210 static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
211 static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
212 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
213 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
214 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
215 vm_offset_t va);
216
217 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
218 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
219 static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
220 vm_prot_t prot);
221 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
222 vm_page_t m, vm_prot_t prot, vm_page_t mpte);
223 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
224 static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
225 static void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva);
226 static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
227 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
228 static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
229 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
230 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
231 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
232 vm_prot_t prot);
233 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
234 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
235 vm_page_t *free);
236 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq,
237 vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free);
238 static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
239 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
240 vm_page_t *free);
241 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
242 vm_offset_t va);
243 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
244 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
245 vm_page_t m);
246 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
247 pd_entry_t newpde);
248 static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
249
250 static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags);
251 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
252
253 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags);
254 static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
255 vm_page_t* free);
256 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, vm_page_t *);
257 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
258
259 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
260 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
261
262 /*
263 * Move the kernel virtual free pointer to the next
264 * 2MB. This is used to help improve performance
265 * by using a large (2MB) page for much of the kernel
266 * (.text, .data, .bss)
267 */
268 static vm_offset_t
269 pmap_kmem_choose(vm_offset_t addr)
270 {
271 vm_offset_t newaddr = addr;
272
273 newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
274 return newaddr;
275 }
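/*
 * Worked example of the round-up above (the address is hypothetical):
 * with NBPDR == 2MB (0x200000), an address such as 0xffffffff8062b000
 * becomes (0xffffffff8062b000 + 0x1fffff) & ~0x1fffff ==
 * 0xffffffff80800000, the next 2MB boundary.
 */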
276
277 /********************/
278 /* Inline functions */
279 /********************/
280
281 /* Return a non-clipped PD index for a given VA */
282 static __inline vm_pindex_t
283 pmap_pde_pindex(vm_offset_t va)
284 {
285 return va >> PDRSHIFT;
286 }
287
288
289 /* Return various clipped indexes for a given VA */
290 static __inline vm_pindex_t
291 pmap_pte_index(vm_offset_t va)
292 {
293
294 return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
295 }
296
297 static __inline vm_pindex_t
298 pmap_pde_index(vm_offset_t va)
299 {
300
301 return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
302 }
303
304 static __inline vm_pindex_t
305 pmap_pdpe_index(vm_offset_t va)
306 {
307
308 return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
309 }
310
311 static __inline vm_pindex_t
312 pmap_pml4e_index(vm_offset_t va)
313 {
314
315 return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
316 }
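/*
 * A sketch for illustration only (the function name is made up here),
 * showing how the helpers above decompose a canonical 48-bit virtual
 * address: each paging level consumes nine index bits above the 12-bit
 * page offset.
 */
static __inline void
pmap_example_va_indexes(vm_offset_t va, vm_pindex_t idx[4])
{

	idx[0] = pmap_pml4e_index(va);	/* bits 47..39: PML4 slot */
	idx[1] = pmap_pdpe_index(va);	/* bits 38..30: PDP slot */
	idx[2] = pmap_pde_index(va);	/* bits 29..21: PD slot */
	idx[3] = pmap_pte_index(va);	/* bits 20..12: PT slot */
}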
317
318 /* Return a pointer to the PML4 slot that corresponds to a VA */
319 static __inline pml4_entry_t *
320 pmap_pml4e(pmap_t pmap, vm_offset_t va)
321 {
322
323 return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
324 }
325
326 /* Return a pointer to the PDP slot that corresponds to a VA */
327 static __inline pdp_entry_t *
328 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
329 {
330 pdp_entry_t *pdpe;
331
332 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
333 return (&pdpe[pmap_pdpe_index(va)]);
334 }
335
336 /* Return a pointer to the PDP slot that corresponds to a VA */
337 static __inline pdp_entry_t *
338 pmap_pdpe(pmap_t pmap, vm_offset_t va)
339 {
340 pml4_entry_t *pml4e;
341
342 pml4e = pmap_pml4e(pmap, va);
343 if ((*pml4e & PG_V) == 0)
344 return NULL;
345 return (pmap_pml4e_to_pdpe(pml4e, va));
346 }
347
348 /* Return a pointer to the PD slot that corresponds to a VA */
349 static __inline pd_entry_t *
350 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
351 {
352 pd_entry_t *pde;
353
354 pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
355 return (&pde[pmap_pde_index(va)]);
356 }
357
358 /* Return a pointer to the PD slot that corresponds to a VA */
359 static __inline pd_entry_t *
360 pmap_pde(pmap_t pmap, vm_offset_t va)
361 {
362 pdp_entry_t *pdpe;
363
364 pdpe = pmap_pdpe(pmap, va);
365 if (pdpe == NULL || (*pdpe & PG_V) == 0)
366 return NULL;
367 return (pmap_pdpe_to_pde(pdpe, va));
368 }
369
370 /* Return a pointer to the PT slot that corresponds to a VA */
371 static __inline pt_entry_t *
372 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
373 {
374 pt_entry_t *pte;
375
376 pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
377 return (&pte[pmap_pte_index(va)]);
378 }
379
380 /* Return a pointer to the PT slot that corresponds to a VA */
381 static __inline pt_entry_t *
382 pmap_pte(pmap_t pmap, vm_offset_t va)
383 {
384 pd_entry_t *pde;
385
386 pde = pmap_pde(pmap, va);
387 if (pde == NULL || (*pde & PG_V) == 0)
388 return NULL;
389 if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */
390 return ((pt_entry_t *)pde);
391 return (pmap_pde_to_pte(pde, va));
392 }
393
394
395 PMAP_INLINE pt_entry_t *
396 vtopte(vm_offset_t va)
397 {
398 u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
399
400 return (PTmap + ((va >> PAGE_SHIFT) & mask));
401 }
402
403 static __inline pd_entry_t *
404 vtopde(vm_offset_t va)
405 {
406 u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
407
408 return (PDmap + ((va >> PDRSHIFT) & mask));
409 }
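/*
 * Note on the masks above: vtopte() keeps the low 36 index bits of the
 * VA (four levels x nine bits) and vtopde() the low 27 (three levels),
 * so both walk the recursive PTmap/PDmap windows that exist because
 * create_pagetables() below installs a self-referential PML4 entry at
 * slot PML4PML4I.
 */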
410
411 static u_int64_t
412 allocpages(vm_paddr_t *firstaddr, int n)
413 {
414 u_int64_t ret;
415
416 ret = *firstaddr;
417 bzero((void *)ret, n * PAGE_SIZE);
418 *firstaddr += n * PAGE_SIZE;
419 return (ret);
420 }
421
422 static void
423 create_pagetables(vm_paddr_t *firstaddr)
424 {
425 int i;
426
427 /* Allocate pages */
428 KPTphys = allocpages(firstaddr, NKPT);
429 KPML4phys = allocpages(firstaddr, 1);
430 KPDPphys = allocpages(firstaddr, NKPML4E);
431 KPDphys = allocpages(firstaddr, NKPDPE);
432
433 ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
434 if (ndmpdp < 4) /* Minimum 4GB of dirmap */
435 ndmpdp = 4;
436 DMPDPphys = allocpages(firstaddr, NDMPML4E);
437 DMPDphys = allocpages(firstaddr, ndmpdp);
438 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
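	/*
	 * Example of the sizing above (the memory size is hypothetical):
	 * with 6GB of physical memory, ndmpdp is 6 and dmaplimit is 6GB;
	 * with less than 4GB the minimum of 4 applies, so the direct map
	 * always spans at least 4GB.
	 */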
439
440 /* Fill in the underlying page table pages */
441 /* Read-only from zero to physfree */
442 /* XXX not fully used, underneath 2M pages */
443 for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
444 ((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
445 ((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
446 }
447
448 /* Now map the page tables at their location within PTmap */
449 for (i = 0; i < NKPT; i++) {
450 ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
451 ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
452 }
453
454 /* Map from zero to end of allocations under 2M pages */
455 /* This replaces some of the KPTphys entries above */
456 for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
457 ((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
458 ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
459 }
460
461 /* And connect up the PD to the PDP */
462 for (i = 0; i < NKPDPE; i++) {
463 ((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys + (i << PAGE_SHIFT);
464 ((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
465 }
466
467 /* Now set up the direct map space using 2MB pages */
468 /* Preset PG_M and PG_A because demotion expects it */
469 for (i = 0; i < NPDEPG * ndmpdp; i++) {
470 ((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT;
471 ((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G |
472 PG_M | PG_A;
473 }
474
475 /* And the direct map space's PDP */
476 for (i = 0; i < ndmpdp; i++) {
477 ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (i << PAGE_SHIFT);
478 ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
479 }
480
481 /* And recursively map PML4 to itself in order to get PTmap */
482 ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
483 ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
484
485 /* Connect the Direct Map slot up to the PML4 */
486 ((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys;
487 ((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U;
488
489 /* Connect the KVA slot up to the PML4 */
490 ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
491 ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
492 }
493
494 /*
495 * Bootstrap the system enough to run with virtual memory.
496 *
497 * On amd64 this is called after mapping has already been enabled
498 * and just syncs the pmap module with what has already been done.
499 * [We can't call it easily with mapping off since the kernel is not
500 * mapped with PA == VA, hence we would have to relocate every address
501 * from the linked base (virtual) address "KERNBASE" to the actual
502 * (physical) address starting relative to 0]
503 */
504 void
505 pmap_bootstrap(vm_paddr_t *firstaddr)
506 {
507 vm_offset_t va;
508 pt_entry_t *pte, *unused;
509
510 /*
511 * Create an initial set of page tables to run the kernel in.
512 */
513 create_pagetables(firstaddr);
514
515 virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
516 virtual_avail = pmap_kmem_choose(virtual_avail);
517
518 virtual_end = VM_MAX_KERNEL_ADDRESS;
519
520
521 /* XXX do %cr0 as well */
522 load_cr4(rcr4() | CR4_PGE | CR4_PSE);
523 load_cr3(KPML4phys);
524
525 /*
526 * Initialize the kernel pmap (which is statically allocated).
527 */
528 PMAP_LOCK_INIT(kernel_pmap);
529 kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
530 kernel_pmap->pm_root = NULL;
531 kernel_pmap->pm_active = -1; /* don't allow deactivation */
532 TAILQ_INIT(&kernel_pmap->pm_pvchunk);
533
534 /*
535 * Reserve some special page table entries/VA space for temporary
536 * mapping of pages.
537 */
538 #define SYSMAP(c, p, v, n) \
539 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
540
541 va = virtual_avail;
542 pte = vtopte(va);
543
544 /*
545 * CMAP1 is only used for the memory test.
546 */
547 SYSMAP(caddr_t, CMAP1, CADDR1, 1)
548
549 /*
550 * Crashdump maps.
551 */
552 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
553
554 /*
555 * msgbufp is used to map the system message buffer.
556 */
557 SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))
558
559 virtual_avail = va;
560
561 *CMAP1 = 0;
562
563 invltlb();
564
565 /* Initialize the PAT MSR. */
566 pmap_init_pat();
567 }
568
569 /*
570 * Setup the PAT MSR.
571 */
572 void
573 pmap_init_pat(void)
574 {
575 uint64_t pat_msr;
576
577 /* Bail if this CPU doesn't implement PAT. */
578 if (!(cpu_feature & CPUID_PAT))
579 panic("no PAT??");
580
581 #ifdef PAT_WORKS
582 /*
583 * Leave the indices 0-3 at the default of WB, WT, UC, and UC-.
584 * Program 4 and 5 as WP and WC.
585 * Leave 6 and 7 as UC and UC-.
586 */
587 pat_msr = rdmsr(MSR_PAT);
588 pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5));
589 pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) |
590 PAT_VALUE(5, PAT_WRITE_COMBINING);
591 #else
592 /*
593 * Due to some Intel errata, we can only safely use the lower 4
594 * PAT entries. Thus, just replace PAT Index 2 with WC instead
595 * of UC-.
596 *
597 * Intel Pentium III Processor Specification Update
598 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
599 * or Mode C Paging)
600 *
601 * Intel Pentium IV Processor Specification Update
602 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
603 */
604 pat_msr = rdmsr(MSR_PAT);
605 pat_msr &= ~PAT_MASK(2);
606 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
607 #endif
608 wrmsr(MSR_PAT, pat_msr);
609 }
610
611 /*
612 * Initialize a vm_page's machine-dependent fields.
613 */
614 void
615 pmap_page_init(vm_page_t m)
616 {
617
618 TAILQ_INIT(&m->md.pv_list);
619 m->md.pat_mode = PAT_WRITE_BACK;
620 }
621
622 /*
623 * Initialize the pmap module.
624 * Called by vm_init, to initialize any structures that the pmap
625 * system needs to map virtual memory.
626 */
627 void
628 pmap_init(void)
629 {
630 vm_page_t mpte;
631 vm_size_t s;
632 int i, pv_npg;
633
634 /*
635 * Initialize the vm page array entries for the kernel pmap's
636 * page table pages.
637 */
638 for (i = 0; i < NKPT; i++) {
639 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
640 KASSERT(mpte >= vm_page_array &&
641 mpte < &vm_page_array[vm_page_array_size],
642 ("pmap_init: page table page is out of range"));
643 mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
644 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
645 }
646
647 /*
648 * If the kernel is running in a virtual machine on an AMD Family 10h
649 * processor, then it must assume that MCA is enabled by the virtual
650 * machine monitor.
651 */
652 if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD &&
653 CPUID_TO_FAMILY(cpu_id) == 0x10)
654 workaround_erratum383 = 1;
655
656 /*
657 * Are large page mappings enabled?
658 */
659 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
660 if (pg_ps_enabled) {
661 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
662 ("pmap_init: can't assign to pagesizes[1]"));
663 pagesizes[1] = NBPDR;
664 }
665
666 /*
667 * Calculate the size of the pv head table for superpages.
668 */
669 for (i = 0; phys_avail[i + 1]; i += 2);
670 pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR;
671
672 /*
673 * Allocate memory for the pv head table for superpages.
674 */
675 s = (vm_size_t)(pv_npg * sizeof(struct md_page));
676 s = round_page(s);
677 pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
678 for (i = 0; i < pv_npg; i++)
679 TAILQ_INIT(&pv_table[i].pv_list);
680 }
681
682 SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
683 "2MB page mapping counters");
684
685 static u_long pmap_pde_demotions;
686 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
687 &pmap_pde_demotions, 0, "2MB page demotions");
688
689 static u_long pmap_pde_mappings;
690 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
691 &pmap_pde_mappings, 0, "2MB page mappings");
692
693 static u_long pmap_pde_p_failures;
694 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
695 &pmap_pde_p_failures, 0, "2MB page promotion failures");
696
697 static u_long pmap_pde_promotions;
698 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
699 &pmap_pde_promotions, 0, "2MB page promotions");
700
701
702 /***************************************************
703 * Low level helper routines.....
704 ***************************************************/
705
706 /*
707 * Determine the appropriate bits to set in a PTE or PDE for a specified
708 * caching mode.
709 */
710 static int
711 pmap_cache_bits(int mode, boolean_t is_pde)
712 {
713 int pat_flag, pat_index, cache_bits;
714
715 /* The PAT bit is different for PTE's and PDE's. */
716 pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
717
718 /* Map the caching mode to a PAT index. */
719 switch (mode) {
720 #ifdef PAT_WORKS
721 case PAT_UNCACHEABLE:
722 pat_index = 3;
723 break;
724 case PAT_WRITE_THROUGH:
725 pat_index = 1;
726 break;
727 case PAT_WRITE_BACK:
728 pat_index = 0;
729 break;
730 case PAT_UNCACHED:
731 pat_index = 2;
732 break;
733 case PAT_WRITE_COMBINING:
734 pat_index = 5;
735 break;
736 case PAT_WRITE_PROTECTED:
737 pat_index = 4;
738 break;
739 #else
740 case PAT_UNCACHED:
741 case PAT_UNCACHEABLE:
742 case PAT_WRITE_PROTECTED:
743 pat_index = 3;
744 break;
745 case PAT_WRITE_THROUGH:
746 pat_index = 1;
747 break;
748 case PAT_WRITE_BACK:
749 pat_index = 0;
750 break;
751 case PAT_WRITE_COMBINING:
752 pat_index = 2;
753 break;
754 #endif
755 default:
756 panic("Unknown caching mode %d\n", mode);
757 }
758
759 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
760 cache_bits = 0;
761 if (pat_index & 0x4)
762 cache_bits |= pat_flag;
763 if (pat_index & 0x2)
764 cache_bits |= PG_NC_PCD;
765 if (pat_index & 0x1)
766 cache_bits |= PG_NC_PWT;
767 return (cache_bits);
768 }
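/*
 * Worked example for the non-PAT_WORKS case above: PAT_WRITE_COMBINING
 * maps to pat_index 2, so only the 0x2 bit is set and the returned
 * cache bits are PG_NC_PCD with PWT and the PAT flag clear.  Together
 * with pmap_init_pat() reprogramming PAT entry 2 to write-combining,
 * that encoding selects a WC mapping.
 */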
769
770 /*
771 * After changing the page size for the specified virtual address in the page
772 * table, flush the corresponding entries from the processor's TLB. Only the
773 * calling processor's TLB is affected.
774 *
775 * The calling thread must be pinned to a processor.
776 */
777 static void
778 pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
779 {
780 u_long cr4;
781
782 if ((newpde & PG_PS) == 0)
783 /* Demotion: flush a specific 2MB page mapping. */
784 invlpg(va);
785 else if ((newpde & PG_G) == 0)
786 /*
787 * Promotion: flush every 4KB page mapping from the TLB
788 * because there are too many to flush individually.
789 */
790 invltlb();
791 else {
792 /*
793 * Promotion: flush every 4KB page mapping from the TLB,
794 * including any global (PG_G) mappings.
795 */
796 cr4 = rcr4();
797 load_cr4(cr4 & ~CR4_PGE);
798 /*
799 * Although preemption at this point could be detrimental to
800 * performance, it would not lead to an error. PG_G is simply
801 * ignored if CR4.PGE is clear. Moreover, in case this block
802 * is re-entered, the load_cr4() either above or below will
803 * modify CR4.PGE flushing the TLB.
804 */
805 load_cr4(cr4 | CR4_PGE);
806 }
807 }
808 #ifdef SMP
809 /*
810 * For SMP, these functions have to use the IPI mechanism for coherence.
811 *
812 * N.B.: Before calling any of the following TLB invalidation functions,
813 * the calling processor must ensure that all stores updating a non-
814 * kernel page table are globally performed. Otherwise, another
815 * processor could cache an old, pre-update entry without being
816 * invalidated. This can happen one of two ways: (1) The pmap becomes
817 * active on another processor after its pm_active field is checked by
818 * one of the following functions but before a store updating the page
819 * table is globally performed. (2) The pmap becomes active on another
820 * processor before its pm_active field is checked but due to
821 * speculative loads one of the following functions still reads the
822 * pmap as inactive on the other processor.
823 *
824 * The kernel page table is exempt because its pm_active field is
825 * immutable. The kernel page table is always active on every
826 * processor.
827 */
828 void
829 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
830 {
831 u_int cpumask;
832 u_int other_cpus;
833
834 sched_pin();
835 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
836 invlpg(va);
837 smp_invlpg(va);
838 } else {
839 cpumask = PCPU_GET(cpumask);
840 other_cpus = PCPU_GET(other_cpus);
841 if (pmap->pm_active & cpumask)
842 invlpg(va);
843 if (pmap->pm_active & other_cpus)
844 smp_masked_invlpg(pmap->pm_active & other_cpus, va);
845 }
846 sched_unpin();
847 }
848
849 void
850 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
851 {
852 u_int cpumask;
853 u_int other_cpus;
854 vm_offset_t addr;
855
856 sched_pin();
857 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
858 for (addr = sva; addr < eva; addr += PAGE_SIZE)
859 invlpg(addr);
860 smp_invlpg_range(sva, eva);
861 } else {
862 cpumask = PCPU_GET(cpumask);
863 other_cpus = PCPU_GET(other_cpus);
864 if (pmap->pm_active & cpumask)
865 for (addr = sva; addr < eva; addr += PAGE_SIZE)
866 invlpg(addr);
867 if (pmap->pm_active & other_cpus)
868 smp_masked_invlpg_range(pmap->pm_active & other_cpus,
869 sva, eva);
870 }
871 sched_unpin();
872 }
873
874 void
875 pmap_invalidate_all(pmap_t pmap)
876 {
877 u_int cpumask;
878 u_int other_cpus;
879
880 sched_pin();
881 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
882 invltlb();
883 smp_invltlb();
884 } else {
885 cpumask = PCPU_GET(cpumask);
886 other_cpus = PCPU_GET(other_cpus);
887 if (pmap->pm_active & cpumask)
888 invltlb();
889 if (pmap->pm_active & other_cpus)
890 smp_masked_invltlb(pmap->pm_active & other_cpus);
891 }
892 sched_unpin();
893 }
894
895 void
896 pmap_invalidate_cache(void)
897 {
898
899 sched_pin();
900 wbinvd();
901 smp_cache_flush();
902 sched_unpin();
903 }
904
905 struct pde_action {
906 cpumask_t store; /* processor that updates the PDE */
907 cpumask_t invalidate; /* processors that invalidate their TLB */
908 vm_offset_t va;
909 pd_entry_t *pde;
910 pd_entry_t newpde;
911 };
912
913 static void
914 pmap_update_pde_action(void *arg)
915 {
916 struct pde_action *act = arg;
917
918 if (act->store == PCPU_GET(cpumask))
919 pde_store(act->pde, act->newpde);
920 }
921
922 static void
923 pmap_update_pde_teardown(void *arg)
924 {
925 struct pde_action *act = arg;
926
927 if ((act->invalidate & PCPU_GET(cpumask)) != 0)
928 pmap_update_pde_invalidate(act->va, act->newpde);
929 }
930
931 /*
932 * Change the page size for the specified virtual address in a way that
933 * prevents any possibility of the TLB ever having two entries that map the
934 * same virtual address using different page sizes. This is the recommended
935 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a
936 * machine check exception for a TLB state that is improperly diagnosed as a
937 * hardware error.
938 */
939 static void
940 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
941 {
942 struct pde_action act;
943 cpumask_t active, cpumask;
944
945 sched_pin();
946 cpumask = PCPU_GET(cpumask);
947 if (pmap == kernel_pmap)
948 active = all_cpus;
949 else
950 active = pmap->pm_active;
951 if ((active & PCPU_GET(other_cpus)) != 0) {
952 act.store = cpumask;
953 act.invalidate = active;
954 act.va = va;
955 act.pde = pde;
956 act.newpde = newpde;
957 smp_rendezvous_cpus(cpumask | active,
958 smp_no_rendevous_barrier, pmap_update_pde_action,
959 pmap_update_pde_teardown, &act);
960 } else {
961 pde_store(pde, newpde);
962 if ((active & cpumask) != 0)
963 pmap_update_pde_invalidate(va, newpde);
964 }
965 sched_unpin();
966 }
967 #else /* !SMP */
968 /*
969 * Normal, non-SMP, invalidation functions.
970 * We inline these within pmap.c for speed.
971 */
972 PMAP_INLINE void
973 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
974 {
975
976 if (pmap == kernel_pmap || pmap->pm_active)
977 invlpg(va);
978 }
979
980 PMAP_INLINE void
981 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
982 {
983 vm_offset_t addr;
984
985 if (pmap == kernel_pmap || pmap->pm_active)
986 for (addr = sva; addr < eva; addr += PAGE_SIZE)
987 invlpg(addr);
988 }
989
990 PMAP_INLINE void
991 pmap_invalidate_all(pmap_t pmap)
992 {
993
994 if (pmap == kernel_pmap || pmap->pm_active)
995 invltlb();
996 }
997
998 PMAP_INLINE void
999 pmap_invalidate_cache(void)
1000 {
1001
1002 wbinvd();
1003 }
1004
1005 static void
1006 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1007 {
1008
1009 pde_store(pde, newpde);
1010 if (pmap == kernel_pmap || pmap->pm_active)
1011 pmap_update_pde_invalidate(va, newpde);
1012 }
1013 #endif /* !SMP */
1014
1015 static void
1016 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
1017 {
1018
1019 KASSERT((sva & PAGE_MASK) == 0,
1020 ("pmap_invalidate_cache_range: sva not page-aligned"));
1021 KASSERT((eva & PAGE_MASK) == 0,
1022 ("pmap_invalidate_cache_range: eva not page-aligned"));
1023
1024 if (cpu_feature & CPUID_SS)
1025 ; /* If "Self Snoop" is supported, do nothing. */
1026 else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1027 eva - sva < 2 * 1024 * 1024) {
1028
1029 /*
1030 * Otherwise, do a per-cache-line flush. Use the mfence
1031 * instruction to ensure that previous stores are
1032 * included in the write-back. The processor
1033 * propagates flush to other processors in the cache
1034 * coherence domain.
1035 */
1036 mfence();
1037 for (; sva < eva; sva += cpu_clflush_line_size)
1038 clflush(sva);
1039 mfence();
1040 } else {
1041
1042 /*
1043 * No targeted cache flush methods are supported by the CPU,
1044 * or the supplied range is 2MB or larger.
1045 * Globally invalidate cache.
1046 */
1047 pmap_invalidate_cache();
1048 }
1049 }
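/*
 * Usage sketch (addresses are hypothetical): callers pass a
 * page-aligned range, e.g.
 *
 *	pmap_invalidate_cache_range(va, va + size);
 *
 * with both "va" and "va + size" multiples of PAGE_SIZE, which is what
 * the KASSERTs above check.
 */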
1050
1051 /*
1052 * Routine: pmap_extract
1053 * Function:
1054 * Extract the physical page address associated
1055 * with the given map/virtual_address pair.
1056 */
1057 vm_paddr_t
1058 pmap_extract(pmap_t pmap, vm_offset_t va)
1059 {
1060 vm_paddr_t rtval;
1061 pt_entry_t *pte;
1062 pd_entry_t pde, *pdep;
1063
1064 rtval = 0;
1065 PMAP_LOCK(pmap);
1066 pdep = pmap_pde(pmap, va);
1067 if (pdep != NULL) {
1068 pde = *pdep;
1069 if (pde) {
1070 if ((pde & PG_PS) != 0)
1071 rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
1072 else {
1073 pte = pmap_pde_to_pte(pdep, va);
1074 rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
1075 }
1076 }
1077 }
1078 PMAP_UNLOCK(pmap);
1079 return (rtval);
1080 }
1081
1082 /*
1083 * Routine: pmap_extract_and_hold
1084 * Function:
1085 * Atomically extract and hold the physical page
1086 * with the given pmap and virtual address pair
1087 * if that mapping permits the given protection.
1088 */
1089 vm_page_t
1090 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1091 {
1092 pd_entry_t pde, *pdep;
1093 pt_entry_t pte;
1094 vm_page_t m;
1095
1096 m = NULL;
1097 vm_page_lock_queues();
1098 PMAP_LOCK(pmap);
1099 pdep = pmap_pde(pmap, va);
1100 if (pdep != NULL && (pde = *pdep)) {
1101 if (pde & PG_PS) {
1102 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1103 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1104 (va & PDRMASK));
1105 vm_page_hold(m);
1106 }
1107 } else {
1108 pte = *pmap_pde_to_pte(pdep, va);
1109 if ((pte & PG_V) &&
1110 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1111 m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1112 vm_page_hold(m);
1113 }
1114 }
1115 }
1116 vm_page_unlock_queues();
1117 PMAP_UNLOCK(pmap);
1118 return (m);
1119 }
1120
1121 vm_paddr_t
1122 pmap_kextract(vm_offset_t va)
1123 {
1124 pd_entry_t pde;
1125 vm_paddr_t pa;
1126
1127 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
1128 pa = DMAP_TO_PHYS(va);
1129 } else {
1130 pde = *vtopde(va);
1131 if (pde & PG_PS) {
1132 pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
1133 } else {
1134 /*
1135 * Beware of a concurrent promotion that changes the
1136 * PDE at this point! For example, vtopte() must not
1137 * be used to access the PTE because it would use the
1138 * new PDE. It is, however, safe to use the old PDE
1139 * because the page table page is preserved by the
1140 * promotion.
1141 */
1142 pa = *pmap_pde_to_pte(&pde, va);
1143 pa = (pa & PG_FRAME) | (va & PAGE_MASK);
1144 }
1145 }
1146 return pa;
1147 }
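/*
 * Usage sketch ("buf" is hypothetical): for an address inside the
 * direct map the translation above is a simple DMAP_TO_PHYS()
 * subtraction, e.g.
 *
 *	pa = pmap_kextract((vm_offset_t)buf);
 *
 * while other kernel addresses are resolved through vtopde() and the
 * page tables as shown.
 */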
1148
1149 /***************************************************
1150 * Low level mapping routines.....
1151 ***************************************************/
1152
1153 /*
1154 * Add a wired page to the kva.
1155 * Note: not SMP coherent.
1156 */
1157 PMAP_INLINE void
1158 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1159 {
1160 pt_entry_t *pte;
1161
1162 pte = vtopte(va);
1163 pte_store(pte, pa | PG_RW | PG_V | PG_G);
1164 }
1165
1166 static __inline void
1167 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1168 {
1169 pt_entry_t *pte;
1170
1171 pte = vtopte(va);
1172 pte_store(pte, pa | PG_RW | PG_V | PG_G | pmap_cache_bits(mode, 0));
1173 }
1174
1175 /*
1176 * Remove a page from the kernel pagetables.
1177 * Note: not SMP coherent.
1178 */
1179 PMAP_INLINE void
1180 pmap_kremove(vm_offset_t va)
1181 {
1182 pt_entry_t *pte;
1183
1184 pte = vtopte(va);
1185 pte_clear(pte);
1186 }
1187
1188 /*
1189 * Used to map a range of physical addresses into kernel
1190 * virtual address space.
1191 *
1192 * The value passed in '*virt' is a suggested virtual address for
1193 * the mapping. Architectures which can support a direct-mapped
1194 * physical to virtual region can return the appropriate address
1195 * within that region, leaving '*virt' unchanged. Other
1196 * architectures should map the pages starting at '*virt' and
1197 * update '*virt' with the first usable address after the mapped
1198 * region.
1199 */
1200 vm_offset_t
1201 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1202 {
1203 return PHYS_TO_DMAP(start);
1204 }
1205
1206
1207 /*
1208 * Add a list of wired pages to the kva.
1209 * This routine is only used for temporary
1210 * kernel mappings that do not need to have
1211 * page modification or references recorded.
1212 * Note that old mappings are simply written
1213 * over. The page *must* be wired.
1214 * Note: SMP coherent. Uses a ranged shootdown IPI.
1215 */
1216 void
1217 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1218 {
1219 pt_entry_t *endpte, oldpte, *pte;
1220
1221 oldpte = 0;
1222 pte = vtopte(sva);
1223 endpte = pte + count;
1224 while (pte < endpte) {
1225 oldpte |= *pte;
1226 pte_store(pte, VM_PAGE_TO_PHYS(*ma) | PG_G |
1227 pmap_cache_bits((*ma)->md.pat_mode, 0) | PG_RW | PG_V);
1228 pte++;
1229 ma++;
1230 }
1231 if ((oldpte & PG_V) != 0)
1232 pmap_invalidate_range(kernel_pmap, sva, sva + count *
1233 PAGE_SIZE);
1234 }
1235
1236 /*
1237 * This routine tears out page mappings from the
1238 * kernel -- it is meant only for temporary mappings.
1239 * Note: SMP coherent. Uses a ranged shootdown IPI.
1240 */
1241 void
1242 pmap_qremove(vm_offset_t sva, int count)
1243 {
1244 vm_offset_t va;
1245
1246 va = sva;
1247 while (count-- > 0) {
1248 pmap_kremove(va);
1249 va += PAGE_SIZE;
1250 }
1251 pmap_invalidate_range(kernel_pmap, sva, va);
1252 }
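/*
 * Usage sketch ("sva", "ma" and "count" are illustrative): the two
 * routines above are used as a pair for temporary kernel mappings,
 *
 *	pmap_qenter(sva, ma, count);
 *	(access the pages through the KVA starting at sva)
 *	pmap_qremove(sva, count);
 *
 * with the shootdowns handled as described in their comments.
 */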
1253
1254 /***************************************************
1255 * Page table page management routines.....
1256 ***************************************************/
1257 static __inline void
1258 pmap_free_zero_pages(vm_page_t free)
1259 {
1260 vm_page_t m;
1261
1262 while (free != NULL) {
1263 m = free;
1264 free = m->right;
1265 /* Preserve the page's PG_ZERO setting. */
1266 vm_page_free_toq(m);
1267 }
1268 }
1269
1270 /*
1271 * Schedule the specified unused page table page to be freed. Specifically,
1272 * add the page to the specified list of pages that will be released to the
1273 * physical memory manager after the TLB has been updated.
1274 */
1275 static __inline void
1276 pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
1277 {
1278
1279 if (set_PG_ZERO)
1280 m->flags |= PG_ZERO;
1281 else
1282 m->flags &= ~PG_ZERO;
1283 m->right = *free;
1284 *free = m;
1285 }
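/*
 * A simplified usage sketch (the removal step shown stands in for the
 * real removal paths later in this file): callers start with an empty
 * list, queue any page table pages freed during removal, shoot down
 * the TLB, and only then release the pages:
 *
 *	vm_page_t free = NULL;
 *	(remove mappings, passing &free to the removal routines)
 *	pmap_invalidate_page(pmap, va);
 *	pmap_free_zero_pages(free);
 */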
1286
1287 /*
1288 * Inserts the specified page table page into the specified pmap's collection
1289 * of idle page table pages. Each of a pmap's page table pages is responsible
1290 * for mapping a distinct range of virtual addresses. The pmap's collection is
1291 * ordered by this virtual address range.
1292 */
1293 static void
1294 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1295 {
1296 vm_page_t root;
1297
1298 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1299 root = pmap->pm_root;
1300 if (root == NULL) {
1301 mpte->left = NULL;
1302 mpte->right = NULL;
1303 } else {
1304 root = vm_page_splay(mpte->pindex, root);
1305 if (mpte->pindex < root->pindex) {
1306 mpte->left = root->left;
1307 mpte->right = root;
1308 root->left = NULL;
1309 } else if (mpte->pindex == root->pindex)
1310 panic("pmap_insert_pt_page: pindex already inserted");
1311 else {
1312 mpte->right = root->right;
1313 mpte->left = root;
1314 root->right = NULL;
1315 }
1316 }
1317 pmap->pm_root = mpte;
1318 }
1319
1320 /*
1321 * Looks for a page table page mapping the specified virtual address in the
1322 * specified pmap's collection of idle page table pages. Returns NULL if there
1323 * is no page table page corresponding to the specified virtual address.
1324 */
1325 static vm_page_t
1326 pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
1327 {
1328 vm_page_t mpte;
1329 vm_pindex_t pindex = pmap_pde_pindex(va);
1330
1331 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1332 if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
1333 mpte = vm_page_splay(pindex, mpte);
1334 if ((pmap->pm_root = mpte)->pindex != pindex)
1335 mpte = NULL;
1336 }
1337 return (mpte);
1338 }
1339
1340 /*
1341 * Removes the specified page table page from the specified pmap's collection
1342 * of idle page table pages. The specified page table page must be a member of
1343 * the pmap's collection.
1344 */
1345 static void
1346 pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
1347 {
1348 vm_page_t root;
1349
1350 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1351 if (mpte != pmap->pm_root) {
1352 root = vm_page_splay(mpte->pindex, pmap->pm_root);
1353 KASSERT(mpte == root,
1354 ("pmap_remove_pt_page: mpte %p is missing from pmap %p",
1355 mpte, pmap));
1356 }
1357 if (mpte->left == NULL)
1358 root = mpte->right;
1359 else {
1360 root = vm_page_splay(mpte->pindex, mpte->left);
1361 root->right = mpte->right;
1362 }
1363 pmap->pm_root = root;
1364 }
1365
1366 /*
1367 * This routine unwires page table pages: when a page's wire count
1368 * drops to zero, the page table page is unmapped and freed.
1369 */
1370 static __inline int
1371 pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free)
1372 {
1373
1374 --m->wire_count;
1375 if (m->wire_count == 0)
1376 return _pmap_unwire_pte_hold(pmap, va, m, free);
1377 else
1378 return 0;
1379 }
1380
1381 static int
1382 _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
1383 vm_page_t *free)
1384 {
1385
1386 /*
1387 * unmap the page table page
1388 */
1389 if (m->pindex >= (NUPDE + NUPDPE)) {
1390 /* PDP page */
1391 pml4_entry_t *pml4;
1392 pml4 = pmap_pml4e(pmap, va);
1393 *pml4 = 0;
1394 } else if (m->pindex >= NUPDE) {
1395 /* PD page */
1396 pdp_entry_t *pdp;
1397 pdp = pmap_pdpe(pmap, va);
1398 *pdp = 0;
1399 } else {
1400 /* PTE page */
1401 pd_entry_t *pd;
1402 pd = pmap_pde(pmap, va);
1403 *pd = 0;
1404 }
1405 --pmap->pm_stats.resident_count;
1406 if (m->pindex < NUPDE) {
1407 /* We just released a PT, unhold the matching PD */
1408 vm_page_t pdpg;
1409
1410 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
1411 pmap_unwire_pte_hold(pmap, va, pdpg, free);
1412 }
1413 if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
1414 /* We just released a PD, unhold the matching PDP */
1415 vm_page_t pdppg;
1416
1417 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
1418 pmap_unwire_pte_hold(pmap, va, pdppg, free);
1419 }
1420
1421 /*
1422 * This is a release store so that the ordinary store unmapping
1423 * the page table page is globally performed before TLB shoot-
1424 * down is begun.
1425 */
1426 atomic_subtract_rel_int(&cnt.v_wire_count, 1);
1427
1428 /*
1429 * Put page on a list so that it is released after
1430 * *ALL* TLB shootdown is done
1431 */
1432 pmap_add_delayed_free_list(m, free, TRUE);
1433
1434 return 1;
1435 }
1436
1437 /*
1438 * After removing a page table entry, this routine is used to
1439 * conditionally free the page, and manage the hold/wire counts.
1440 */
1441 static int
1442 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, vm_page_t *free)
1443 {
1444 vm_page_t mpte;
1445
1446 if (va >= VM_MAXUSER_ADDRESS)
1447 return 0;
1448 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1449 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1450 return pmap_unwire_pte_hold(pmap, va, mpte, free);
1451 }
1452
1453 void
1454 pmap_pinit0(pmap_t pmap)
1455 {
1456
1457 PMAP_LOCK_INIT(pmap);
1458 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
1459 pmap->pm_root = NULL;
1460 pmap->pm_active = 0;
1461 TAILQ_INIT(&pmap->pm_pvchunk);
1462 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1463 }
1464
1465 /*
1466 * Initialize a preallocated and zeroed pmap structure,
1467 * such as one in a vmspace structure.
1468 */
1469 int
1470 pmap_pinit(pmap_t pmap)
1471 {
1472 vm_page_t pml4pg;
1473 static vm_pindex_t color;
1474
1475 PMAP_LOCK_INIT(pmap);
1476
1477 /*
1478 * allocate the page directory page
1479 */
1480 while ((pml4pg = vm_page_alloc(NULL, color++, VM_ALLOC_NOOBJ |
1481 VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
1482 VM_WAIT;
1483
1484 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
1485
1486 if ((pml4pg->flags & PG_ZERO) == 0)
1487 pagezero(pmap->pm_pml4);
1488
1489 /* Wire in kernel global address entries. */
1490 pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
1491 pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U;
1492
1493 /* install self-referential address mapping entry(s) */
1494 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
1495
1496 pmap->pm_root = NULL;
1497 pmap->pm_active = 0;
1498 TAILQ_INIT(&pmap->pm_pvchunk);
1499 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1500
1501 return (1);
1502 }
1503
1504 /*
1505 * this routine is called if the page table page is not
1506 * mapped correctly.
1507 *
1508 * Note: If a page allocation fails at page table level two or three,
1509 * one or two pages may be held during the wait, only to be released
1510 * afterwards. This conservative approach is easily argued to avoid
1511 * race conditions.
1512 */
1513 static vm_page_t
1514 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags)
1515 {
1516 vm_page_t m, pdppg, pdpg;
1517
1518 KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1519 (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1520 ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1521
1522 /*
1523 * Allocate a page table page.
1524 */
1525 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1526 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1527 if (flags & M_WAITOK) {
1528 PMAP_UNLOCK(pmap);
1529 vm_page_unlock_queues();
1530 VM_WAIT;
1531 vm_page_lock_queues();
1532 PMAP_LOCK(pmap);
1533 }
1534
1535 /*
1536 * Indicate the need to retry. While waiting, the page table
1537 * page may have been allocated.
1538 */
1539 return (NULL);
1540 }
1541 if ((m->flags & PG_ZERO) == 0)
1542 pmap_zero_page(m);
1543
1544 /*
1545 * Map the pagetable page into the process address space, if
1546 * it isn't already there.
1547 */
1548
1549 if (ptepindex >= (NUPDE + NUPDPE)) {
1550 pml4_entry_t *pml4;
1551 vm_pindex_t pml4index;
1552
1553 /* Wire up a new PDPE page */
1554 pml4index = ptepindex - (NUPDE + NUPDPE);
1555 pml4 = &pmap->pm_pml4[pml4index];
1556 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1557
1558 } else if (ptepindex >= NUPDE) {
1559 vm_pindex_t pml4index;
1560 vm_pindex_t pdpindex;
1561 pml4_entry_t *pml4;
1562 pdp_entry_t *pdp;
1563
1564 /* Wire up a new PDE page */
1565 pdpindex = ptepindex - NUPDE;
1566 pml4index = pdpindex >> NPML4EPGSHIFT;
1567
1568 pml4 = &pmap->pm_pml4[pml4index];
1569 if ((*pml4 & PG_V) == 0) {
1570 /* Have to allocate a new pdp, recurse */
1571 if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
1572 flags) == NULL) {
1573 --m->wire_count;
1574 atomic_subtract_int(&cnt.v_wire_count, 1);
1575 vm_page_free_zero(m);
1576 return (NULL);
1577 }
1578 } else {
1579 /* Add reference to pdp page */
1580 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
1581 pdppg->wire_count++;
1582 }
1583 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1584
1585 /* Now find the pdp page */
1586 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1587 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1588
1589 } else {
1590 vm_pindex_t pml4index;
1591 vm_pindex_t pdpindex;
1592 pml4_entry_t *pml4;
1593 pdp_entry_t *pdp;
1594 pd_entry_t *pd;
1595
1596 /* Wire up a new PTE page */
1597 pdpindex = ptepindex >> NPDPEPGSHIFT;
1598 pml4index = pdpindex >> NPML4EPGSHIFT;
1599
1600 /* First, find the pdp and check that it is valid. */
1601 pml4 = &pmap->pm_pml4[pml4index];
1602 if ((*pml4 & PG_V) == 0) {
1603 /* Have to allocate a new pd, recurse */
1604 if (_pmap_allocpte(pmap, NUPDE + pdpindex,
1605 flags) == NULL) {
1606 --m->wire_count;
1607 atomic_subtract_int(&cnt.v_wire_count, 1);
1608 vm_page_free_zero(m);
1609 return (NULL);
1610 }
1611 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1612 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1613 } else {
1614 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1615 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1616 if ((*pdp & PG_V) == 0) {
1617 /* Have to allocate a new pd, recurse */
1618 if (_pmap_allocpte(pmap, NUPDE + pdpindex,
1619 flags) == NULL) {
1620 --m->wire_count;
1621 atomic_subtract_int(&cnt.v_wire_count,
1622 1);
1623 vm_page_free_zero(m);
1624 return (NULL);
1625 }
1626 } else {
1627 /* Add reference to the pd page */
1628 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
1629 pdpg->wire_count++;
1630 }
1631 }
1632 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
1633
1634 /* Now we know where the page directory page is */
1635 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
1636 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1637 }
1638
1639 pmap->pm_stats.resident_count++;
1640
1641 return m;
1642 }
1643
1644 static vm_page_t
1645 pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags)
1646 {
1647 vm_pindex_t pdpindex, ptepindex;
1648 pdp_entry_t *pdpe;
1649 vm_page_t pdpg;
1650
1651 KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1652 (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1653 ("pmap_allocpde: flags is neither M_NOWAIT nor M_WAITOK"));
1654 retry:
1655 pdpe = pmap_pdpe(pmap, va);
1656 if (pdpe != NULL && (*pdpe & PG_V) != 0) {
1657 /* Add a reference to the pd page. */
1658 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
1659 pdpg->wire_count++;
1660 } else {
1661 /* Allocate a pd page. */
1662 ptepindex = pmap_pde_pindex(va);
1663 pdpindex = ptepindex >> NPDPEPGSHIFT;
1664 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, flags);
1665 if (pdpg == NULL && (flags & M_WAITOK))
1666 goto retry;
1667 }
1668 return (pdpg);
1669 }
1670
1671 static vm_page_t
1672 pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1673 {
1674 vm_pindex_t ptepindex;
1675 pd_entry_t *pd;
1676 vm_page_t m;
1677
1678 KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1679 (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1680 ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1681
1682 /*
1683 * Calculate pagetable page index
1684 */
1685 ptepindex = pmap_pde_pindex(va);
1686 retry:
1687 /*
1688 * Get the page directory entry
1689 */
1690 pd = pmap_pde(pmap, va);
1691
1692 /*
1693 * This supports switching from a 2MB page to a
1694 * normal 4K page.
1695 */
1696 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
1697 if (!pmap_demote_pde(pmap, pd, va)) {
1698 /*
1699 * Invalidation of the 2MB page mapping may have caused
1700 * the deallocation of the underlying PD page.
1701 */
1702 pd = NULL;
1703 }
1704 }
1705
1706 /*
1707 * If the page table page is mapped, we just increment its
1708 * wire count and activate it.
1709 */
1710 if (pd != NULL && (*pd & PG_V) != 0) {
1711 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
1712 m->wire_count++;
1713 } else {
1714 /*
1715 * Here if the pte page isn't mapped, or if it has been
1716 * deallocated.
1717 */
1718 m = _pmap_allocpte(pmap, ptepindex, flags);
1719 if (m == NULL && (flags & M_WAITOK))
1720 goto retry;
1721 }
1722 return (m);
1723 }
1724
1725
1726 /***************************************************
1727 * Pmap allocation/deallocation routines.
1728 ***************************************************/
1729
1730 /*
1731 * Release any resources held by the given physical map.
1732 * Called when a pmap initialized by pmap_pinit is being released.
1733 * Should only be called if the map contains no valid mappings.
1734 */
1735 void
1736 pmap_release(pmap_t pmap)
1737 {
1738 vm_page_t m;
1739
1740 KASSERT(pmap->pm_stats.resident_count == 0,
1741 ("pmap_release: pmap resident count %ld != 0",
1742 pmap->pm_stats.resident_count));
1743 KASSERT(pmap->pm_root == NULL,
1744 ("pmap_release: pmap has reserved page table page(s)"));
1745
1746 m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
1747
1748 pmap->pm_pml4[KPML4I] = 0; /* KVA */
1749 pmap->pm_pml4[DMPML4I] = 0; /* Direct Map */
1750 pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */
1751
1752 m->wire_count--;
1753 atomic_subtract_int(&cnt.v_wire_count, 1);
1754 vm_page_free_zero(m);
1755 PMAP_LOCK_DESTROY(pmap);
1756 }
1757
1758 static int
1759 kvm_size(SYSCTL_HANDLER_ARGS)
1760 {
1761 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
1762
1763 return sysctl_handle_long(oidp, &ksize, 0, req);
1764 }
1765 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1766 0, 0, kvm_size, "LU", "Size of KVM");
1767
1768 static int
1769 kvm_free(SYSCTL_HANDLER_ARGS)
1770 {
1771 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1772
1773 return sysctl_handle_long(oidp, &kfree, 0, req);
1774 }
1775 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1776 0, 0, kvm_free, "LU", "Amount of KVM free");
1777
1778 /*
1779 * grow the number of kernel page table entries, if needed
1780 */
1781 void
1782 pmap_growkernel(vm_offset_t addr)
1783 {
1784 vm_paddr_t paddr;
1785 vm_page_t nkpg;
1786 pd_entry_t *pde, newpdir;
1787 pdp_entry_t *pdpe;
1788
1789 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1790
1791 /*
1792 * Return if "addr" is within the range of kernel page table pages
1793 * that were preallocated during pmap bootstrap. Moreover, leave
1794 * "kernel_vm_end" and the kernel page table as they were.
1795 *
1796 * The correctness of this action is based on the following
1797 * argument: vm_map_findspace() allocates contiguous ranges of the
1798 * kernel virtual address space. It calls this function if a range
1799 * ends after "kernel_vm_end". If the kernel is mapped between
1800 * "kernel_vm_end" and "addr", then the range cannot begin at
1801 * "kernel_vm_end". In fact, its beginning address cannot be less
1802 * than the kernel. Thus, there is no immediate need to allocate
1803 * any new kernel page table pages between "kernel_vm_end" and
1804 * "KERNBASE".
1805 */
1806 if (KERNBASE < addr && addr <= KERNBASE + NKPT * NBPDR)
1807 return;
1808
1809 addr = roundup2(addr, NBPDR);
1810 if (addr - 1 >= kernel_map->max_offset)
1811 addr = kernel_map->max_offset;
1812 while (kernel_vm_end < addr) {
1813 pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
1814 if ((*pdpe & PG_V) == 0) {
1815 /* We need a new PDP entry */
1816 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
1817 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
1818 VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1819 if (nkpg == NULL)
1820 panic("pmap_growkernel: no memory to grow kernel");
1821 if ((nkpg->flags & PG_ZERO) == 0)
1822 pmap_zero_page(nkpg);
1823 paddr = VM_PAGE_TO_PHYS(nkpg);
1824 *pdpe = (pdp_entry_t)
1825 (paddr | PG_V | PG_RW | PG_A | PG_M);
1826 continue; /* try again */
1827 }
1828 pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
1829 if ((*pde & PG_V) != 0) {
1830 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
1831 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1832 kernel_vm_end = kernel_map->max_offset;
1833 break;
1834 }
1835 continue;
1836 }
1837
1838 nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
1839 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1840 VM_ALLOC_ZERO);
1841 if (nkpg == NULL)
1842 panic("pmap_growkernel: no memory to grow kernel");
1843 if ((nkpg->flags & PG_ZERO) == 0)
1844 pmap_zero_page(nkpg);
1845 paddr = VM_PAGE_TO_PHYS(nkpg);
1846 newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M);
1847 pde_store(pde, newpdir);
1848
1849 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
1850 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1851 kernel_vm_end = kernel_map->max_offset;
1852 break;
1853 }
1854 }
1855 }
1856
1857
1858 /***************************************************
1859 * page management routines.
1860 ***************************************************/
1861
1862 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
1863 CTASSERT(_NPCM == 3);
1864 CTASSERT(_NPCPV == 168);
1865
1866 static __inline struct pv_chunk *
1867 pv_to_chunk(pv_entry_t pv)
1868 {
1869
1870 return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK);
1871 }
1872
1873 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1874
1875 #define PC_FREE0 0xfffffffffffffffful
1876 #define PC_FREE1 0xfffffffffffffffful
1877 #define PC_FREE2 0x000000fffffffffful
1878
1879 static uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
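/*
 * Illustration of the chunk geometry asserted above: a pv_chunk occupies
 * exactly one 4KB page and provides _NPCPV (168) pv entries whose
 * allocation state is tracked by _NPCM (3) 64-bit bitmap words.  The first
 * two words cover entries 0-127, so PC_FREE0 and PC_FREE1 have all 64 bits
 * set; the third word covers the remaining 168 - 128 = 40 entries, which
 * is why PC_FREE2 has only its low 40 bits set.
 */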
1880
1881 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
1882 "Current number of pv entries");
1883
1884 #ifdef PV_STATS
1885 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
1886
1887 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
1888 "Current number of pv entry chunks");
1889 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
1890 "Total number of pv entry chunks allocated");
1891 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
1892 "Total number of pv entry chunk frees");
1893 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
1894 "Number of failed attempts to allocate a pv entry chunk page");
1895
1896 static long pv_entry_frees, pv_entry_allocs;
1897 static int pv_entry_spare;
1898
1899 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
1900 "Total number of pv entry frees");
1901 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
1902 "Total number of pv entry allocs");
1903 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
1904 "Current number of spare pv entries");
1905
1906 static int pmap_collect_inactive, pmap_collect_active;
1907
1908 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
1909 "Number of times pmap_collect has been called on the inactive queue");
1910 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
1911 "Number of times pmap_collect has been called on the active queue");
1912 #endif
1913
1914 /*
1915 * We are in a serious low memory condition. Resort to
1916 * drastic measures to free some pages so we can allocate
1917 * another pv entry chunk. This is normally called to
1918 * unmap inactive pages, and if necessary, active pages.
1919 *
1920 * We do not, however, unmap 2mpages because subsequent accesses will
1921 * allocate per-page pv entries until repromotion occurs, thereby
1922 * exacerbating the shortage of free pv entries.
1923 */
1924 static void
1925 pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
1926 {
1927 struct md_page *pvh;
1928 pd_entry_t *pde;
1929 pmap_t pmap;
1930 pt_entry_t *pte, tpte;
1931 pv_entry_t next_pv, pv;
1932 vm_offset_t va;
1933 vm_page_t m, free;
1934
1935 TAILQ_FOREACH(m, &vpq->pl, pageq) {
1936 if (m->hold_count || m->busy)
1937 continue;
1938 TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
1939 va = pv->pv_va;
1940 pmap = PV_PMAP(pv);
1941 /* Avoid deadlock and lock recursion. */
1942 if (pmap > locked_pmap)
1943 PMAP_LOCK(pmap);
1944 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
1945 continue;
1946 pmap->pm_stats.resident_count--;
1947 pde = pmap_pde(pmap, va);
1948 KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found"
1949 " a 2mpage in page %p's pv list", m));
1950 pte = pmap_pde_to_pte(pde, va);
1951 tpte = pte_load_clear(pte);
1952 KASSERT((tpte & PG_W) == 0,
1953 ("pmap_collect: wired pte %#lx", tpte));
1954 if (tpte & PG_A)
1955 vm_page_flag_set(m, PG_REFERENCED);
1956 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
1957 vm_page_dirty(m);
1958 free = NULL;
1959 pmap_unuse_pt(pmap, va, *pde, &free);
1960 pmap_invalidate_page(pmap, va);
1961 pmap_free_zero_pages(free);
1962 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1963 if (TAILQ_EMPTY(&m->md.pv_list)) {
1964 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
1965 if (TAILQ_EMPTY(&pvh->pv_list))
1966 vm_page_flag_clear(m, PG_WRITEABLE);
1967 }
1968 free_pv_entry(pmap, pv);
1969 if (pmap != locked_pmap)
1970 PMAP_UNLOCK(pmap);
1971 }
1972 }
1973 }
1974
1975
1976 /*
1977 * free the pv_entry back to the free list
1978 */
1979 static void
1980 free_pv_entry(pmap_t pmap, pv_entry_t pv)
1981 {
1982 vm_page_t m;
1983 struct pv_chunk *pc;
1984 int idx, field, bit;
1985
1986 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1987 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1988 PV_STAT(pv_entry_frees++);
1989 PV_STAT(pv_entry_spare++);
1990 pv_entry_count--;
1991 pc = pv_to_chunk(pv);
1992 idx = pv - &pc->pc_pventry[0];
1993 field = idx / 64;
1994 bit = idx % 64;
1995 pc->pc_map[field] |= 1ul << bit;
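	/*
	 * For illustration: a pv entry at index 100 within its chunk gives
	 * field = 100 / 64 = 1 and bit = 100 % 64 = 36, so the statement
	 * above sets bit 36 of pc_map[1], marking that slot free again.
	 */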
1996 /* move to head of list */
1997 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1998 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
1999 pc->pc_map[2] != PC_FREE2) {
2000 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2001 return;
2002 }
2003 PV_STAT(pv_entry_spare -= _NPCPV);
2004 PV_STAT(pc_chunk_count--);
2005 PV_STAT(pc_chunk_frees++);
2006 /* entire chunk is free, return it */
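	/*
	 * Chunks are carved out of the direct map (see the PHYS_TO_DMAP()
	 * call in get_pv_entry()), so the backing vm_page can be recovered
	 * from the chunk's address alone and no separate record of the
	 * chunk-to-page relationship is needed.
	 */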
2007 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2008 dump_drop_page(m->phys_addr);
2009 vm_page_unwire(m, 0);
2010 vm_page_free(m);
2011 }
2012
2013 /*
2014 * get a new pv_entry, allocating a block from the system
2015 * when needed.
2016 */
2017 static pv_entry_t
2018 get_pv_entry(pmap_t pmap, boolean_t try)
2019 {
2020 static vm_pindex_t colour;
2021 struct vpgqueues *pq;
2022 int bit, field;
2023 pv_entry_t pv;
2024 struct pv_chunk *pc;
2025 vm_page_t m;
2026
2027 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2028 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2029 PV_STAT(pv_entry_allocs++);
2030 pq = NULL;
2031 retry:
2032 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2033 if (pc != NULL) {
2034 for (field = 0; field < _NPCM; field++) {
2035 if (pc->pc_map[field]) {
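			/*
			 * bsfq() performs a 64-bit bit-scan-forward: it
			 * returns the index of the least significant set
			 * bit, i.e. the lowest-numbered free pv entry
			 * within this bitmap word.
			 */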
2036 bit = bsfq(pc->pc_map[field]);
2037 break;
2038 }
2039 }
2040 if (field < _NPCM) {
2041 pv = &pc->pc_pventry[field * 64 + bit];
2042 pc->pc_map[field] &= ~(1ul << bit);
2043 /* If this was the last free entry, move the chunk to the tail */
2044 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
2045 pc->pc_map[2] == 0) {
2046 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2047 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
2048 pc_list);
2049 }
2050 pv_entry_count++;
2051 PV_STAT(pv_entry_spare--);
2052 return (pv);
2053 }
2054 }
2055 /* No free items, allocate another chunk */
2056 m = vm_page_alloc(NULL, colour, (pq == &vm_page_queues[PQ_ACTIVE] ?
2057 VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ |
2058 VM_ALLOC_WIRED);
2059 if (m == NULL) {
2060 if (try) {
2061 PV_STAT(pc_chunk_tryfail++);
2062 return (NULL);
2063 }
2064 /*
2065 * Reclaim pv entries: At first, destroy mappings to inactive
2066 * pages. After that, if a pv chunk entry is still needed,
2067 * destroy mappings to active pages.
2068 */
2069 if (pq == NULL) {
2070 PV_STAT(pmap_collect_inactive++);
2071 pq = &vm_page_queues[PQ_INACTIVE];
2072 } else if (pq == &vm_page_queues[PQ_INACTIVE]) {
2073 PV_STAT(pmap_collect_active++);
2074 pq = &vm_page_queues[PQ_ACTIVE];
2075 } else
2076 panic("get_pv_entry: allocation failed");
2077 pmap_collect(pmap, pq);
2078 goto retry;
2079 }
2080 PV_STAT(pc_chunk_count++);
2081 PV_STAT(pc_chunk_allocs++);
2082 colour++;
2083 dump_add_page(m->phys_addr);
2084 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2085 pc->pc_pmap = pmap;
2086 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */
2087 pc->pc_map[1] = PC_FREE1;
2088 pc->pc_map[2] = PC_FREE2;
2089 pv = &pc->pc_pventry[0];
2090 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2091 pv_entry_count++;
2092 PV_STAT(pv_entry_spare += _NPCPV - 1);
2093 return (pv);
2094 }
2095
2096 /*
2097 * First find and then remove the pv entry for the specified pmap and virtual
2098 * address from the specified pv list. Returns the pv entry if found and NULL
2099 * otherwise. This operation can be performed on pv lists for either 4KB or
2100 * 2MB page mappings.
2101 */
2102 static __inline pv_entry_t
2103 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2104 {
2105 pv_entry_t pv;
2106
2107 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2108 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
2109 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2110 TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
2111 break;
2112 }
2113 }
2114 return (pv);
2115 }
2116
2117 /*
2118 * After demotion from a 2MB page mapping to 512 4KB page mappings,
2119 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
2120 * entries for each of the 4KB page mappings.
2121 */
2122 static void
2123 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2124 {
2125 struct md_page *pvh;
2126 pv_entry_t pv;
2127 vm_offset_t va_last;
2128 vm_page_t m;
2129
2130 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2131 KASSERT((pa & PDRMASK) == 0,
2132 ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
2133
2134 /*
2135 * Transfer the 2mpage's pv entry for this mapping to the first
2136 * page's pv list.
2137 */
2138 pvh = pa_to_pvh(pa);
2139 va = trunc_2mpage(va);
2140 pv = pmap_pvh_remove(pvh, pmap, va);
2141 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2142 m = PHYS_TO_VM_PAGE(pa);
2143 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2144 /* Instantiate the remaining NPTEPG - 1 pv entries. */
2145 va_last = va + NBPDR - PAGE_SIZE;
2146 do {
2147 m++;
2148 KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
2149 ("pmap_pv_demote_pde: page %p is not managed", m));
2150 va += PAGE_SIZE;
2151 pmap_insert_entry(pmap, va, m);
2152 } while (va < va_last);
2153 }
2154
2155 /*
2156 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
2157 * replace the many pv entries for the 4KB page mappings by a single pv entry
2158 * for the 2MB page mapping.
2159 */
2160 static void
2161 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2162 {
2163 struct md_page *pvh;
2164 pv_entry_t pv;
2165 vm_offset_t va_last;
2166 vm_page_t m;
2167
2168 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2169 KASSERT((pa & PDRMASK) == 0,
2170 ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
2171
2172 /*
2173 * Transfer the first page's pv entry for this mapping to the
2174 * 2mpage's pv list. Aside from avoiding the cost of a call
2175 * to get_pv_entry(), a transfer avoids the possibility that
2176 * get_pv_entry() calls pmap_collect() and that pmap_collect()
2177 * removes one of the mappings that is being promoted.
2178 */
2179 m = PHYS_TO_VM_PAGE(pa);
2180 va = trunc_2mpage(va);
2181 pv = pmap_pvh_remove(&m->md, pmap, va);
2182 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
2183 pvh = pa_to_pvh(pa);
2184 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2185 /* Free the remaining NPTEPG - 1 pv entries. */
2186 va_last = va + NBPDR - PAGE_SIZE;
2187 do {
2188 m++;
2189 va += PAGE_SIZE;
2190 pmap_pvh_free(&m->md, pmap, va);
2191 } while (va < va_last);
2192 }
2193
2194 /*
2195 * First find and then destroy the pv entry for the specified pmap and virtual
2196 * address. This operation can be performed on pv lists for either 4KB or 2MB
2197 * page mappings.
2198 */
2199 static void
2200 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2201 {
2202 pv_entry_t pv;
2203
2204 pv = pmap_pvh_remove(pvh, pmap, va);
2205 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2206 free_pv_entry(pmap, pv);
2207 }
2208
2209 static void
2210 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2211 {
2212 struct md_page *pvh;
2213
2214 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2215 pmap_pvh_free(&m->md, pmap, va);
2216 if (TAILQ_EMPTY(&m->md.pv_list)) {
2217 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2218 if (TAILQ_EMPTY(&pvh->pv_list))
2219 vm_page_flag_clear(m, PG_WRITEABLE);
2220 }
2221 }
2222
2223 /*
2224 * Create a pv entry for the page "m" mapped at
2225 * (pmap, va).
2226 */
2227 static void
2228 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2229 {
2230 pv_entry_t pv;
2231
2232 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2233 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2234 pv = get_pv_entry(pmap, FALSE);
2235 pv->pv_va = va;
2236 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2237 }
2238
2239 /*
2240 * Conditionally create a pv entry.
2241 */
2242 static boolean_t
2243 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2244 {
2245 pv_entry_t pv;
2246
2247 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2248 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2249 if ((pv = get_pv_entry(pmap, TRUE)) != NULL) {
2250 pv->pv_va = va;
2251 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2252 return (TRUE);
2253 } else
2254 return (FALSE);
2255 }
2256
2257 /*
2258 * Create the pv entry for a 2MB page mapping.
2259 */
2260 static boolean_t
2261 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2262 {
2263 struct md_page *pvh;
2264 pv_entry_t pv;
2265
2266 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2267 if ((pv = get_pv_entry(pmap, TRUE)) != NULL) {
2268 pv->pv_va = va;
2269 pvh = pa_to_pvh(pa);
2270 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2271 return (TRUE);
2272 } else
2273 return (FALSE);
2274 }
2275
2276 /*
2277 * Fills a page table page with mappings to consecutive physical pages.
2278 */
2279 static void
2280 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
2281 {
2282 pt_entry_t *pte;
2283
2284 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2285 *pte = newpte;
2286 newpte += PAGE_SIZE;
2287 }
2288 }
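/*
 * For illustration: starting from "newpte", the loop above writes NPTEPG
 * (512) entries whose physical addresses increase by PAGE_SIZE, so the
 * filled page table maps 512 consecutive 4KB physical pages (2MB in total)
 * with the attribute bits carried in "newpte".
 */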
2289
2290 /*
2291 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page
2292 * mapping is invalidated.
2293 */
2294 static boolean_t
2295 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2296 {
2297 pd_entry_t newpde, oldpde;
2298 pt_entry_t *firstpte, newpte;
2299 vm_paddr_t mptepa;
2300 vm_page_t free, mpte;
2301
2302 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2303 oldpde = *pde;
2304 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
2305 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
2306 mpte = pmap_lookup_pt_page(pmap, va);
2307 if (mpte != NULL)
2308 pmap_remove_pt_page(pmap, mpte);
2309 else {
2310 KASSERT((oldpde & PG_W) == 0,
2311 ("pmap_demote_pde: page table page for a wired mapping"
2312 " is missing"));
2313
2314 /*
2315 * Invalidate the 2MB page mapping and return "failure" if the
2316 * mapping was never accessed or the allocation of the new
2317 * page table page fails. If the 2MB page mapping belongs to
2318 * the direct map region of the kernel's address space, then
2319 * the page allocation request specifies the highest possible
2320 * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is
2321 * normal. Page table pages are preallocated for every other
2322 * part of the kernel address space, so the direct map region
2323 * is the only part of the kernel address space that must be
2324 * handled here.
2325 */
2326 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
2327 pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
2328 DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
2329 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
2330 free = NULL;
2331 pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free);
2332 pmap_invalidate_page(pmap, trunc_2mpage(va));
2333 pmap_free_zero_pages(free);
2334 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
2335 " in pmap %p", va, pmap);
2336 return (FALSE);
2337 }
2338 if (va < VM_MAXUSER_ADDRESS)
2339 pmap->pm_stats.resident_count++;
2340 }
2341 mptepa = VM_PAGE_TO_PHYS(mpte);
2342 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
2343 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2344 KASSERT((oldpde & PG_A) != 0,
2345 ("pmap_demote_pde: oldpde is missing PG_A"));
2346 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2347 ("pmap_demote_pde: oldpde is missing PG_M"));
2348 newpte = oldpde & ~PG_PS;
2349 if ((newpte & PG_PDE_PAT) != 0)
2350 newpte ^= PG_PDE_PAT | PG_PTE_PAT;
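	/*
	 * A note on the PAT handling above: the PAT selector occupies bit 12
	 * (PG_PDE_PAT) in a 2MB page directory entry but bit 7 (PG_PTE_PAT)
	 * in a 4KB page table entry.  Bit 7 was just cleared along with
	 * PG_PS, so the XOR moves a set PAT bit from the PDE position to the
	 * PTE position without disturbing the other attribute bits.
	 */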
2351
2352 /*
2353 * If the page table page is new, initialize it.
2354 */
2355 if (mpte->wire_count == 1) {
2356 mpte->wire_count = NPTEPG;
2357 pmap_fill_ptp(firstpte, newpte);
2358 }
2359 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2360 ("pmap_demote_pde: firstpte and newpte map different physical"
2361 " addresses"));
2362
2363 /*
2364 * If the mapping has changed attributes, update the page table
2365 * entries.
2366 */
2367 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2368 pmap_fill_ptp(firstpte, newpte);
2369
2370 /*
2371 * Demote the mapping. This pmap is locked. The old PDE has
2372 * PG_A set. If the old PDE has PG_RW set, it also has PG_M
2373 * set. Thus, there is no danger of a race with another
2374 * processor changing the setting of PG_A and/or PG_M between
2375 * the read above and the store below.
2376 */
2377 if (workaround_erratum383)
2378 pmap_update_pde(pmap, va, pde, newpde);
2379 else
2380 pde_store(pde, newpde);
2381
2382 /*
2383 * Invalidate a stale recursive mapping of the page table page.
2384 */
2385 if (va >= VM_MAXUSER_ADDRESS)
2386 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2387
2388 /*
2389 * Demote the pv entry. This depends on the earlier demotion
2390 * of the mapping. Specifically, the (re)creation of a per-
2391 * page pv entry might trigger the execution of pmap_collect(),
2392 * which might reclaim a newly (re)created per-page pv entry
2393 * and destroy the associated mapping. In order to destroy
2394 * the mapping, the PDE must have already changed from mapping
2395 * the 2mpage to referencing the page table page.
2396 */
2397 if ((oldpde & PG_MANAGED) != 0)
2398 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
2399
2400 pmap_pde_demotions++;
2401 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
2402 " in pmap %p", va, pmap);
2403 return (TRUE);
2404 }
2405
2406 /*
2407 * pmap_remove_pde: unmap a 2MB superpage from a process's address space
2408 */
2409 static int
2410 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2411 vm_page_t *free)
2412 {
2413 struct md_page *pvh;
2414 pd_entry_t oldpde;
2415 vm_offset_t eva, va;
2416 vm_page_t m, mpte;
2417
2418 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2419 KASSERT((sva & PDRMASK) == 0,
2420 ("pmap_remove_pde: sva is not 2mpage aligned"));
2421 oldpde = pte_load_clear(pdq);
2422 if (oldpde & PG_W)
2423 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2424
2425 /*
2426 * Machines that don't support invlpg also don't support
2427 * PG_G.
2428 */
2429 if (oldpde & PG_G)
2430 pmap_invalidate_page(kernel_pmap, sva);
2431 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2432 if (oldpde & PG_MANAGED) {
2433 pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
2434 pmap_pvh_free(pvh, pmap, sva);
2435 eva = sva + NBPDR;
2436 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2437 va < eva; va += PAGE_SIZE, m++) {
2438 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2439 vm_page_dirty(m);
2440 if (oldpde & PG_A)
2441 vm_page_flag_set(m, PG_REFERENCED);
2442 if (TAILQ_EMPTY(&m->md.pv_list) &&
2443 TAILQ_EMPTY(&pvh->pv_list))
2444 vm_page_flag_clear(m, PG_WRITEABLE);
2445 }
2446 }
2447 if (pmap == kernel_pmap) {
2448 if (!pmap_demote_pde(pmap, pdq, sva))
2449 panic("pmap_remove_pde: failed demotion");
2450 } else {
2451 mpte = pmap_lookup_pt_page(pmap, sva);
2452 if (mpte != NULL) {
2453 pmap_remove_pt_page(pmap, mpte);
2454 pmap->pm_stats.resident_count--;
2455 KASSERT(mpte->wire_count == NPTEPG,
2456 ("pmap_remove_pde: pte page wire count error"));
2457 mpte->wire_count = 0;
2458 pmap_add_delayed_free_list(mpte, free, FALSE);
2459 atomic_subtract_int(&cnt.v_wire_count, 1);
2460 }
2461 }
2462 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
2463 }
2464
2465 /*
2466 * pmap_remove_pte: unmap a single 4KB page from a process's address space
2467 */
2468 static int
2469 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
2470 pd_entry_t ptepde, vm_page_t *free)
2471 {
2472 pt_entry_t oldpte;
2473 vm_page_t m;
2474
2475 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2476 oldpte = pte_load_clear(ptq);
2477 if (oldpte & PG_W)
2478 pmap->pm_stats.wired_count -= 1;
2479 /*
2480 * Machines that don't support invlpg also don't support
2481 * PG_G.
2482 */
2483 if (oldpte & PG_G)
2484 pmap_invalidate_page(kernel_pmap, va);
2485 pmap->pm_stats.resident_count -= 1;
2486 if (oldpte & PG_MANAGED) {
2487 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2488 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2489 vm_page_dirty(m);
2490 if (oldpte & PG_A)
2491 vm_page_flag_set(m, PG_REFERENCED);
2492 pmap_remove_entry(pmap, m, va);
2493 }
2494 return (pmap_unuse_pt(pmap, va, ptepde, free));
2495 }
2496
2497 /*
2498 * Remove a single page from a process address space
2499 */
2500 static void
2501 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free)
2502 {
2503 pt_entry_t *pte;
2504
2505 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2506 if ((*pde & PG_V) == 0)
2507 return;
2508 pte = pmap_pde_to_pte(pde, va);
2509 if ((*pte & PG_V) == 0)
2510 return;
2511 pmap_remove_pte(pmap, pte, va, *pde, free);
2512 pmap_invalidate_page(pmap, va);
2513 }
2514
2515 /*
2516 * Remove the given range of addresses from the specified map.
2517 *
2518 * It is assumed that the start and end are properly
2519 * rounded to the page size.
2520 */
2521 void
2522 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2523 {
2524 vm_offset_t va_next;
2525 pml4_entry_t *pml4e;
2526 pdp_entry_t *pdpe;
2527 pd_entry_t ptpaddr, *pde;
2528 pt_entry_t *pte;
2529 vm_page_t free = NULL;
2530 int anyvalid;
2531
2532 /*
2533 * Perform an unsynchronized read. This is, however, safe.
2534 */
2535 if (pmap->pm_stats.resident_count == 0)
2536 return;
2537
2538 anyvalid = 0;
2539
2540 vm_page_lock_queues();
2541 PMAP_LOCK(pmap);
2542
2543 /*
2544 * Special handling for removing a single page: this is a very
2545 * common operation and one where we can easily short-circuit
2546 * the general code below.
2547 */
2548 if (sva + PAGE_SIZE == eva) {
2549 pde = pmap_pde(pmap, sva);
2550 if (pde && (*pde & PG_PS) == 0) {
2551 pmap_remove_page(pmap, sva, pde, &free);
2552 goto out;
2553 }
2554 }
2555
2556 for (; sva < eva; sva = va_next) {
2557
2558 if (pmap->pm_stats.resident_count == 0)
2559 break;
2560
2561 pml4e = pmap_pml4e(pmap, sva);
2562 if ((*pml4e & PG_V) == 0) {
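			/*
			 * This entire 512GB PML4 slot is unmapped; skip
			 * ahead to the next slot.  The rounded-up "va_next"
			 * can wrap past the end of the address space, so it
			 * is clamped to "eva" whenever it would move
			 * backwards.
			 */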
2563 va_next = (sva + NBPML4) & ~PML4MASK;
2564 if (va_next < sva)
2565 va_next = eva;
2566 continue;
2567 }
2568
2569 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
2570 if ((*pdpe & PG_V) == 0) {
2571 va_next = (sva + NBPDP) & ~PDPMASK;
2572 if (va_next < sva)
2573 va_next = eva;
2574 continue;
2575 }
2576
2577 /*
2578 * Calculate index for next page table.
2579 */
2580 va_next = (sva + NBPDR) & ~PDRMASK;
2581 if (va_next < sva)
2582 va_next = eva;
2583
2584 pde = pmap_pdpe_to_pde(pdpe, sva);
2585 ptpaddr = *pde;
2586
2587 /*
2588 * Weed out invalid mappings.
2589 */
2590 if (ptpaddr == 0)
2591 continue;
2592
2593 /*
2594 * Check for large page.
2595 */
2596 if ((ptpaddr & PG_PS) != 0) {
2597 /*
2598 * Are we removing the entire large page? If not,
2599 * demote the mapping and fall through.
2600 */
2601 if (sva + NBPDR == va_next && eva >= va_next) {
2602 /*
2603 * The TLB entry for a PG_G mapping is
2604 * invalidated by pmap_remove_pde().
2605 */
2606 if ((ptpaddr & PG_G) == 0)
2607 anyvalid = 1;
2608 pmap_remove_pde(pmap, pde, sva, &free);
2609 continue;
2610 } else if (!pmap_demote_pde(pmap, pde, sva)) {
2611 /* The large page mapping was destroyed. */
2612 continue;
2613 } else
2614 ptpaddr = *pde;
2615 }
2616
2617 /*
2618 * Limit our scan to either the end of the va represented
2619 * by the current page table page, or to the end of the
2620 * range being removed.
2621 */
2622 if (va_next > eva)
2623 va_next = eva;
2624
2625 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
2626 sva += PAGE_SIZE) {
2627 if (*pte == 0)
2628 continue;
2629
2630 /*
2631 * The TLB entry for a PG_G mapping is invalidated
2632 * by pmap_remove_pte().
2633 */
2634 if ((*pte & PG_G) == 0)
2635 anyvalid = 1;
2636 if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free))
2637 break;
2638 }
2639 }
2640 out:
2641 if (anyvalid)
2642 pmap_invalidate_all(pmap);
2643 vm_page_unlock_queues();
2644 PMAP_UNLOCK(pmap);
2645 pmap_free_zero_pages(free);
2646 }
2647
2648 /*
2649 * Routine: pmap_remove_all
2650 * Function:
2651 * Removes this physical page from
2652 * all physical maps in which it resides.
2653 * Reflects back modify bits to the pager.
2654 *
2655 * Notes:
2656 * Original versions of this routine were very
2657 * inefficient because they iteratively called
2658 * pmap_remove (slow...)
2659 */
2660
2661 void
2662 pmap_remove_all(vm_page_t m)
2663 {
2664 struct md_page *pvh;
2665 pv_entry_t pv;
2666 pmap_t pmap;
2667 pt_entry_t *pte, tpte;
2668 pd_entry_t *pde;
2669 vm_offset_t va;
2670 vm_page_t free;
2671
2672 KASSERT((m->flags & PG_FICTITIOUS) == 0,
2673 ("pmap_remove_all: page %p is fictitious", m));
2674 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2675 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2676 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
2677 va = pv->pv_va;
2678 pmap = PV_PMAP(pv);
2679 PMAP_LOCK(pmap);
2680 pde = pmap_pde(pmap, va);
2681 (void)pmap_demote_pde(pmap, pde, va);
2682 PMAP_UNLOCK(pmap);
2683 }
2684 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2685 pmap = PV_PMAP(pv);
2686 PMAP_LOCK(pmap);
2687 pmap->pm_stats.resident_count--;
2688 pde = pmap_pde(pmap, pv->pv_va);
2689 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
2690 " a 2mpage in page %p's pv list", m));
2691 pte = pmap_pde_to_pte(pde, pv->pv_va);
2692 tpte = pte_load_clear(pte);
2693 if (tpte & PG_W)
2694 pmap->pm_stats.wired_count--;
2695 if (tpte & PG_A)
2696 vm_page_flag_set(m, PG_REFERENCED);
2697
2698 /*
2699 * Update the vm_page_t clean and reference bits.
2700 */
2701 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2702 vm_page_dirty(m);
2703 free = NULL;
2704 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
2705 pmap_invalidate_page(pmap, pv->pv_va);
2706 pmap_free_zero_pages(free);
2707 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2708 free_pv_entry(pmap, pv);
2709 PMAP_UNLOCK(pmap);
2710 }
2711 vm_page_flag_clear(m, PG_WRITEABLE);
2712 }
2713
2714 /*
2715 * pmap_protect_pde: apply the given protection to a 2MB superpage in a process
2716 */
2717 static boolean_t
2718 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
2719 {
2720 pd_entry_t newpde, oldpde;
2721 vm_offset_t eva, va;
2722 vm_page_t m;
2723 boolean_t anychanged;
2724
2725 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2726 KASSERT((sva & PDRMASK) == 0,
2727 ("pmap_protect_pde: sva is not 2mpage aligned"));
2728 anychanged = FALSE;
2729 retry:
2730 oldpde = newpde = *pde;
2731 if (oldpde & PG_MANAGED) {
2732 eva = sva + NBPDR;
2733 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2734 va < eva; va += PAGE_SIZE, m++) {
2735 /*
2736 * In contrast to the analogous operation on a 4KB page
2737 * mapping, the mapping's PG_A flag is not cleared and
2738 * the page's PG_REFERENCED flag is not set. The
2739 * reason is that pmap_demote_pde() expects that a 2MB
2740 * page mapping with a stored page table page has PG_A
2741 * set.
2742 */
2743 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2744 vm_page_dirty(m);
2745 }
2746 }
2747 if ((prot & VM_PROT_WRITE) == 0)
2748 newpde &= ~(PG_RW | PG_M);
2749 if ((prot & VM_PROT_EXECUTE) == 0)
2750 newpde |= pg_nx;
2751 if (newpde != oldpde) {
2752 if (!atomic_cmpset_long(pde, oldpde, newpde))
2753 goto retry;
2754 if (oldpde & PG_G)
2755 pmap_invalidate_page(pmap, sva);
2756 else
2757 anychanged = TRUE;
2758 }
2759 return (anychanged);
2760 }
2761
2762 /*
2763 * Set the physical protection on the
2764 * specified range of this map as requested.
2765 */
2766 void
2767 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2768 {
2769 vm_offset_t va_next;
2770 pml4_entry_t *pml4e;
2771 pdp_entry_t *pdpe;
2772 pd_entry_t ptpaddr, *pde;
2773 pt_entry_t *pte;
2774 int anychanged;
2775
2776 if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2777 pmap_remove(pmap, sva, eva);
2778 return;
2779 }
2780
2781 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
2782 (VM_PROT_WRITE|VM_PROT_EXECUTE))
2783 return;
2784
2785 anychanged = 0;
2786
2787 vm_page_lock_queues();
2788 PMAP_LOCK(pmap);
2789 for (; sva < eva; sva = va_next) {
2790
2791 pml4e = pmap_pml4e(pmap, sva);
2792 if ((*pml4e & PG_V) == 0) {
2793 va_next = (sva + NBPML4) & ~PML4MASK;
2794 if (va_next < sva)
2795 va_next = eva;
2796 continue;
2797 }
2798
2799 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
2800 if ((*pdpe & PG_V) == 0) {
2801 va_next = (sva + NBPDP) & ~PDPMASK;
2802 if (va_next < sva)
2803 va_next = eva;
2804 continue;
2805 }
2806
2807 va_next = (sva + NBPDR) & ~PDRMASK;
2808 if (va_next < sva)
2809 va_next = eva;
2810
2811 pde = pmap_pdpe_to_pde(pdpe, sva);
2812 ptpaddr = *pde;
2813
2814 /*
2815 * Weed out invalid mappings.
2816 */
2817 if (ptpaddr == 0)
2818 continue;
2819
2820 /*
2821 * Check for large page.
2822 */
2823 if ((ptpaddr & PG_PS) != 0) {
2824 /*
2825 * Are we protecting the entire large page? If not,
2826 * demote the mapping and fall through.
2827 */
2828 if (sva + NBPDR == va_next && eva >= va_next) {
2829 /*
2830 * The TLB entry for a PG_G mapping is
2831 * invalidated by pmap_protect_pde().
2832 */
2833 if (pmap_protect_pde(pmap, pde, sva, prot))
2834 anychanged = 1;
2835 continue;
2836 } else if (!pmap_demote_pde(pmap, pde, sva)) {
2837 /* The large page mapping was destroyed. */
2838 continue;
2839 }
2840 }
2841
2842 if (va_next > eva)
2843 va_next = eva;
2844
2845 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
2846 sva += PAGE_SIZE) {
2847 pt_entry_t obits, pbits;
2848 vm_page_t m;
2849
2850 retry:
2851 obits = pbits = *pte;
2852 if ((pbits & PG_V) == 0)
2853 continue;
2854 if (pbits & PG_MANAGED) {
2855 m = NULL;
2856 if (pbits & PG_A) {
2857 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
2858 vm_page_flag_set(m, PG_REFERENCED);
2859 pbits &= ~PG_A;
2860 }
2861 if ((pbits & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
2862 if (m == NULL)
2863 m = PHYS_TO_VM_PAGE(pbits &
2864 PG_FRAME);
2865 vm_page_dirty(m);
2866 }
2867 }
2868
2869 if ((prot & VM_PROT_WRITE) == 0)
2870 pbits &= ~(PG_RW | PG_M);
2871 if ((prot & VM_PROT_EXECUTE) == 0)
2872 pbits |= pg_nx;
2873
2874 if (pbits != obits) {
2875 if (!atomic_cmpset_long(pte, obits, pbits))
2876 goto retry;
2877 if (obits & PG_G)
2878 pmap_invalidate_page(pmap, sva);
2879 else
2880 anychanged = 1;
2881 }
2882 }
2883 }
2884 if (anychanged)
2885 pmap_invalidate_all(pmap);
2886 vm_page_unlock_queues();
2887 PMAP_UNLOCK(pmap);
2888 }
2889
2890 /*
2891 * Tries to promote the 512, contiguous 4KB page mappings that are within a
2892 * single page table page (PTP) to a single 2MB page mapping. For promotion
2893 * to occur, two conditions must be met: (1) the 4KB page mappings must map
2894 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
2895 * identical characteristics.
2896 */
2897 static void
2898 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2899 {
2900 pd_entry_t newpde;
2901 pt_entry_t *firstpte, oldpte, pa, *pte;
2902 vm_offset_t oldpteva;
2903 vm_page_t mpte;
2904
2905 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2906
2907 /*
2908 * Examine the first PTE in the specified PTP. Abort if this PTE is
2909 * either invalid, unused, or does not map the first 4KB physical page
2910 * within a 2MB page.
2911 */
2912 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
2913 setpde:
2914 newpde = *firstpte;
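	/*
	 * A note on the test below: (PG_FRAME & PDRMASK) selects the
	 * frame's address bits below the 2MB boundary, so the condition
	 * fails unless the first PTE is valid (PG_V), has been accessed
	 * (PG_A), and maps a physical page that is 2MB aligned.
	 */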
2915 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
2916 pmap_pde_p_failures++;
2917 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
2918 " in pmap %p", va, pmap);
2919 return;
2920 }
2921 if ((newpde & (PG_M | PG_RW)) == PG_RW) {
2922 /*
2923 * When PG_M is already clear, PG_RW can be cleared without
2924 * a TLB invalidation.
2925 */
2926 if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
2927 goto setpde;
2928 newpde &= ~PG_RW;
2929 }
2930
2931 /*
2932 * Examine each of the other PTEs in the specified PTP. Abort if this
2933 * PTE maps an unexpected 4KB physical page or does not have identical
2934 * characteristics to the first PTE.
2935 */
2936 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
2937 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
2938 setpte:
2939 oldpte = *pte;
2940 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
2941 pmap_pde_p_failures++;
2942 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
2943 " in pmap %p", va, pmap);
2944 return;
2945 }
2946 if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
2947 /*
2948 * When PG_M is already clear, PG_RW can be cleared
2949 * without a TLB invalidation.
2950 */
2951 if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
2952 goto setpte;
2953 oldpte &= ~PG_RW;
2954 oldpteva = (oldpte & PG_FRAME & PDRMASK) |
2955 (va & ~PDRMASK);
2956 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
2957 " in pmap %p", oldpteva, pmap);
2958 }
2959 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
2960 pmap_pde_p_failures++;
2961 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
2962 " in pmap %p", va, pmap);
2963 return;
2964 }
2965 pa -= PAGE_SIZE;
2966 }
2967
2968 /*
2969 * Save the page table page in its current state until the PDE
2970 * mapping the superpage is demoted by pmap_demote_pde() or
2971 * destroyed by pmap_remove_pde().
2972 */
2973 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
2974 KASSERT(mpte >= vm_page_array &&
2975 mpte < &vm_page_array[vm_page_array_size],
2976 ("pmap_promote_pde: page table page is out of range"));
2977 KASSERT(mpte->pindex == pmap_pde_pindex(va),
2978 ("pmap_promote_pde: page table page's pindex is wrong"));
2979 pmap_insert_pt_page(pmap, mpte);
2980
2981 /*
2982 * Promote the pv entries.
2983 */
2984 if ((newpde & PG_MANAGED) != 0)
2985 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
2986
2987 /*
2988 * Propagate the PAT index to its proper position.
2989 */
2990 if ((newpde & PG_PTE_PAT) != 0)
2991 newpde ^= PG_PDE_PAT | PG_PTE_PAT;
2992
2993 /*
2994 * Map the superpage.
2995 */
2996 if (workaround_erratum383)
2997 pmap_update_pde(pmap, va, pde, PG_PS | newpde);
2998 else
2999 pde_store(pde, PG_PS | newpde);
3000
3001 pmap_pde_promotions++;
3002 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
3003 " in pmap %p", va, pmap);
3004 }
3005
3006 /*
3007 * Insert the given physical page (p) at
3008 * the specified virtual address (v) in the
3009 * target physical map with the protection requested.
3010 *
3011 * If specified, the page will be wired down, meaning
3012 * that the related pte can not be reclaimed.
3013 *
3014 * NB: This is the only routine which MAY NOT lazy-evaluate
3015 * or lose information. That is, this routine must actually
3016 * insert this page into the given map NOW.
3017 */
3018 void
3019 pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
3020 vm_prot_t prot, boolean_t wired)
3021 {
3022 vm_paddr_t pa;
3023 pd_entry_t *pde;
3024 pt_entry_t *pte;
3025 vm_paddr_t opa;
3026 pt_entry_t origpte, newpte;
3027 vm_page_t mpte, om;
3028 boolean_t invlva;
3029
3030 va = trunc_page(va);
3031 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
3032 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
3033 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va));
3034
3035 mpte = NULL;
3036
3037 vm_page_lock_queues();
3038 PMAP_LOCK(pmap);
3039
3040 /*
3041 * In the case that a page table page is not
3042 * resident, we are creating it here.
3043 */
3044 if (va < VM_MAXUSER_ADDRESS) {
3045 mpte = pmap_allocpte(pmap, va, M_WAITOK);
3046 }
3047
3048 pde = pmap_pde(pmap, va);
3049 if (pde != NULL && (*pde & PG_V) != 0) {
3050 if ((*pde & PG_PS) != 0)
3051 panic("pmap_enter: attempted pmap_enter on 2MB page");
3052 pte = pmap_pde_to_pte(pde, va);
3053 } else
3054 panic("pmap_enter: invalid page directory va=%#lx", va);
3055
3056 pa = VM_PAGE_TO_PHYS(m);
3057 om = NULL;
3058 origpte = *pte;
3059 opa = origpte & PG_FRAME;
3060
3061 /*
3062 * Mapping has not changed, must be protection or wiring change.
3063 */
3064 if (origpte && (opa == pa)) {
3065 /*
3066 * Wiring change, just update stats. We don't worry about
3067 * wiring PT pages as they remain resident as long as there
3068 * are valid mappings in them. Hence, if a user page is wired,
3069 * the PT page will be also.
3070 */
3071 if (wired && ((origpte & PG_W) == 0))
3072 pmap->pm_stats.wired_count++;
3073 else if (!wired && (origpte & PG_W))
3074 pmap->pm_stats.wired_count--;
3075
3076 /*
3077 * Remove extra pte reference
3078 */
3079 if (mpte)
3080 mpte->wire_count--;
3081
3082 /*
3083 * We might be turning off write access to the page,
3084 * so we go ahead and sense modify status.
3085 */
3086 if (origpte & PG_MANAGED) {
3087 om = m;
3088 pa |= PG_MANAGED;
3089 }
3090 goto validate;
3091 }
3092 /*
3093 * Mapping has changed, invalidate old range and fall through to
3094 * handle validating new mapping.
3095 */
3096 if (opa) {
3097 if (origpte & PG_W)
3098 pmap->pm_stats.wired_count--;
3099 if (origpte & PG_MANAGED) {
3100 om = PHYS_TO_VM_PAGE(opa);
3101 pmap_remove_entry(pmap, om, va);
3102 }
3103 if (mpte != NULL) {
3104 mpte->wire_count--;
3105 KASSERT(mpte->wire_count > 0,
3106 ("pmap_enter: missing reference to page table page,"
3107 " va: 0x%lx", va));
3108 }
3109 } else
3110 pmap->pm_stats.resident_count++;
3111
3112 /*
3113 * Enter on the PV list if part of our managed memory.
3114 */
3115 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
3116 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
3117 ("pmap_enter: managed mapping within the clean submap"));
3118 pmap_insert_entry(pmap, va, m);
3119 pa |= PG_MANAGED;
3120 }
3121
3122 /*
3123 * Increment counters
3124 */
3125 if (wired)
3126 pmap->pm_stats.wired_count++;
3127
3128 validate:
3129 /*
3130 * Now validate mapping with desired protection/wiring.
3131 */
3132 newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
3133 if ((prot & VM_PROT_WRITE) != 0) {
3134 newpte |= PG_RW;
3135 vm_page_flag_set(m, PG_WRITEABLE);
3136 }
3137 if ((prot & VM_PROT_EXECUTE) == 0)
3138 newpte |= pg_nx;
3139 if (wired)
3140 newpte |= PG_W;
3141 if (va < VM_MAXUSER_ADDRESS)
3142 newpte |= PG_U;
3143 if (pmap == kernel_pmap)
3144 newpte |= PG_G;
3145
3146 /*
3147 * if the mapping or permission bits are different, we need
3148 * to update the pte.
3149 */
3150 if ((origpte & ~(PG_M|PG_A)) != newpte) {
3151 newpte |= PG_A;
3152 if ((access & VM_PROT_WRITE) != 0)
3153 newpte |= PG_M;
3154 if (origpte & PG_V) {
3155 invlva = FALSE;
3156 origpte = pte_load_store(pte, newpte);
3157 if (origpte & PG_A) {
3158 if (origpte & PG_MANAGED)
3159 vm_page_flag_set(om, PG_REFERENCED);
3160 if (opa != VM_PAGE_TO_PHYS(m) || ((origpte &
3161 PG_NX) == 0 && (newpte & PG_NX)))
3162 invlva = TRUE;
3163 }
3164 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3165 if ((origpte & PG_MANAGED) != 0)
3166 vm_page_dirty(om);
3167 if ((newpte & PG_RW) == 0)
3168 invlva = TRUE;
3169 }
3170 if (invlva)
3171 pmap_invalidate_page(pmap, va);
3172 } else
3173 pte_store(pte, newpte);
3174 }
3175
3176 /*
3177 * If both the page table page and the reservation are fully
3178 * populated, then attempt promotion.
3179 */
3180 if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3181 pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0)
3182 pmap_promote_pde(pmap, pde, va);
3183
3184 vm_page_unlock_queues();
3185 PMAP_UNLOCK(pmap);
3186 }
3187
3188 /*
3189 * Tries to create a 2MB page mapping. Returns TRUE if successful and FALSE
3190 * otherwise. Fails if (1) a page table page cannot be allocated without
3191 * blocking, (2) a mapping already exists at the specified virtual address, or
3192 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3193 */
3194 static boolean_t
3195 pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3196 {
3197 pd_entry_t *pde, newpde;
3198 vm_page_t free, mpde;
3199
3200 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3201 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3202 if ((mpde = pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) {
3203 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3204 " in pmap %p", va, pmap);
3205 return (FALSE);
3206 }
3207 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde));
3208 pde = &pde[pmap_pde_index(va)];
3209 if ((*pde & PG_V) != 0) {
3210 KASSERT(mpde->wire_count > 1,
3211 ("pmap_enter_pde: mpde's wire count is too low"));
3212 mpde->wire_count--;
3213 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3214 " in pmap %p", va, pmap);
3215 return (FALSE);
3216 }
3217 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
3218 PG_PS | PG_V;
3219 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
3220 newpde |= PG_MANAGED;
3221
3222 /*
3223 * Abort this mapping if its PV entry could not be created.
3224 */
3225 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
3226 free = NULL;
3227 if (pmap_unwire_pte_hold(pmap, va, mpde, &free)) {
3228 pmap_invalidate_page(pmap, va);
3229 pmap_free_zero_pages(free);
3230 }
3231 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3232 " in pmap %p", va, pmap);
3233 return (FALSE);
3234 }
3235 }
3236 if ((prot & VM_PROT_EXECUTE) == 0)
3237 newpde |= pg_nx;
3238 if (va < VM_MAXUSER_ADDRESS)
3239 newpde |= PG_U;
3240
3241 /*
3242 * Increment counters.
3243 */
3244 pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
3245
3246 /*
3247 * Map the superpage.
3248 */
3249 pde_store(pde, newpde);
3250
3251 pmap_pde_mappings++;
3252 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3253 " in pmap %p", va, pmap);
3254 return (TRUE);
3255 }
3256
3257 /*
3258 * Maps a sequence of resident pages belonging to the same object.
3259 * The sequence begins with the given page m_start. This page is
3260 * mapped at the given virtual address start. Each subsequent page is
3261 * mapped at a virtual address that is offset from start by the same
3262 * amount as the page is offset from m_start within the object. The
3263 * last page in the sequence is the page with the largest offset from
3264 * m_start that can be mapped at a virtual address less than the given
3265 * virtual address end. Not every virtual page between start and end
3266 * is mapped; only those for which a resident page exists with the
3267 * corresponding offset from m_start are mapped.
3268 */
3269 void
3270 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3271 vm_page_t m_start, vm_prot_t prot)
3272 {
3273 vm_offset_t va;
3274 vm_page_t m, mpte;
3275 vm_pindex_t diff, psize;
3276
3277 VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
3278 psize = atop(end - start);
3279 mpte = NULL;
3280 m = m_start;
3281 PMAP_LOCK(pmap);
3282 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3283 va = start + ptoa(diff);
3284 if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3285 (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
3286 pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
3287 pmap_enter_pde(pmap, va, m, prot))
3288 m = &m[NBPDR / PAGE_SIZE - 1];
3289 else
3290 mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3291 mpte);
3292 m = TAILQ_NEXT(m, listq);
3293 }
3294 PMAP_UNLOCK(pmap);
3295 }
3296
3297 /*
3298 * This code makes some *MAJOR* assumptions:
3299 * 1. The current pmap and the target pmap exist.
3300 * 2. Not wired.
3301 * 3. Read access.
3302 * 4. No page table pages.
3303 * but is *MUCH* faster than pmap_enter...
3304 */
3305
3306 void
3307 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3308 {
3309
3310 PMAP_LOCK(pmap);
3311 (void) pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3312 PMAP_UNLOCK(pmap);
3313 }
3314
3315 static vm_page_t
3316 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3317 vm_prot_t prot, vm_page_t mpte)
3318 {
3319 vm_page_t free;
3320 pt_entry_t *pte;
3321 vm_paddr_t pa;
3322
3323 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3324 (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0,
3325 ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3326 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3327 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3328
3329 /*
3330 * In the case that a page table page is not
3331 * resident, we are creating it here.
3332 */
3333 if (va < VM_MAXUSER_ADDRESS) {
3334 vm_pindex_t ptepindex;
3335 pd_entry_t *ptepa;
3336
3337 /*
3338 * Calculate the page table page index
3339 */
3340 ptepindex = pmap_pde_pindex(va);
3341 if (mpte && (mpte->pindex == ptepindex)) {
3342 mpte->wire_count++;
3343 } else {
3344 /*
3345 * Get the page directory entry
3346 */
3347 ptepa = pmap_pde(pmap, va);
3348
3349 /*
3350 * If the page table page is already mapped, we just
3351 * increment its wire count.
3352 */
3353 if (ptepa && (*ptepa & PG_V) != 0) {
3354 if (*ptepa & PG_PS)
3355 return (NULL);
3356 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
3357 mpte->wire_count++;
3358 } else {
3359 mpte = _pmap_allocpte(pmap, ptepindex,
3360 M_NOWAIT);
3361 if (mpte == NULL)
3362 return (mpte);
3363 }
3364 }
3365 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
3366 pte = &pte[pmap_pte_index(va)];
3367 } else {
3368 mpte = NULL;
3369 pte = vtopte(va);
3370 }
3371 if (*pte) {
3372 if (mpte != NULL) {
3373 mpte->wire_count--;
3374 mpte = NULL;
3375 }
3376 return (mpte);
3377 }
3378
3379 /*
3380 * Enter on the PV list if part of our managed memory.
3381 */
3382 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 &&
3383 !pmap_try_insert_pv_entry(pmap, va, m)) {
3384 if (mpte != NULL) {
3385 free = NULL;
3386 if (pmap_unwire_pte_hold(pmap, va, mpte, &free)) {
3387 pmap_invalidate_page(pmap, va);
3388 pmap_free_zero_pages(free);
3389 }
3390 mpte = NULL;
3391 }
3392 return (mpte);
3393 }
3394
3395 /*
3396 * Increment counters
3397 */
3398 pmap->pm_stats.resident_count++;
3399
3400 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
3401 if ((prot & VM_PROT_EXECUTE) == 0)
3402 pa |= pg_nx;
3403
3404 /*
3405 * Now validate mapping with RO protection
3406 */
3407 if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
3408 pte_store(pte, pa | PG_V | PG_U);
3409 else
3410 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3411 return mpte;
3412 }
3413
3414 /*
3415 * Make a temporary mapping for a physical address. This is only intended
3416 * to be used for panic dumps.
3417 */
3418 void *
3419 pmap_kenter_temporary(vm_paddr_t pa, int i)
3420 {
3421 vm_offset_t va;
3422
3423 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3424 pmap_kenter(va, pa);
3425 invlpg(va);
3426 return ((void *)crashdumpmap);
3427 }
3428
3429 /*
3430 * This code maps large physical mmap regions into the
3431 * processor address space. Note that some shortcuts
3432 * are taken, but the code works.
3433 */
3434 void
3435 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3436 vm_pindex_t pindex, vm_size_t size)
3437 {
3438 pd_entry_t *pde;
3439 vm_paddr_t pa, ptepa;
3440 vm_page_t p, pdpg;
3441 int pat_mode;
3442
3443 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
3444 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3445 ("pmap_object_init_pt: non-device object"));
3446 if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
3447 if (!vm_object_populate(object, pindex, pindex + atop(size)))
3448 return;
3449 p = vm_page_lookup(object, pindex);
3450 KASSERT(p->valid == VM_PAGE_BITS_ALL,
3451 ("pmap_object_init_pt: invalid page %p", p));
3452 pat_mode = p->md.pat_mode;
3453
3454 /*
3455 * Abort the mapping if the first page is not physically
3456 * aligned to a 2MB page boundary.
3457 */
3458 ptepa = VM_PAGE_TO_PHYS(p);
3459 if (ptepa & (NBPDR - 1))
3460 return;
3461
3462 /*
3463 * Skip the first page. Abort the mapping if the rest of
3464 * the pages are not physically contiguous or have differing
3465 * memory attributes.
3466 */
3467 p = TAILQ_NEXT(p, listq);
3468 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3469 pa += PAGE_SIZE) {
3470 KASSERT(p->valid == VM_PAGE_BITS_ALL,
3471 ("pmap_object_init_pt: invalid page %p", p));
3472 if (pa != VM_PAGE_TO_PHYS(p) ||
3473 pat_mode != p->md.pat_mode)
3474 return;
3475 p = TAILQ_NEXT(p, listq);
3476 }
3477
3478 /*
3479 * Map using 2MB pages. Since "ptepa" is 2M aligned and
3480 * "size" is a multiple of 2M, adding the PAT setting to "pa"
3481 * will not affect the termination of this loop.
3482 */
3483 PMAP_LOCK(pmap);
3484 for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
3485 size; pa += NBPDR) {
3486 pdpg = pmap_allocpde(pmap, addr, M_NOWAIT);
3487 if (pdpg == NULL) {
3488 /*
3489 * The creation of mappings below is only an
3490 * optimization. If a page directory page
3491 * cannot be allocated without blocking,
3492 * continue on to the next mapping rather than
3493 * blocking.
3494 */
3495 addr += NBPDR;
3496 continue;
3497 }
3498 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
3499 pde = &pde[pmap_pde_index(addr)];
3500 if ((*pde & PG_V) == 0) {
3501 pde_store(pde, pa | PG_PS | PG_M | PG_A |
3502 PG_U | PG_RW | PG_V);
3503 pmap->pm_stats.resident_count += NBPDR /
3504 PAGE_SIZE;
3505 pmap_pde_mappings++;
3506 } else {
3507 /* Continue on if the PDE is already valid. */
3508 pdpg->wire_count--;
3509 KASSERT(pdpg->wire_count > 0,
3510 ("pmap_object_init_pt: missing reference "
3511 "to page directory page, va: 0x%lx", addr));
3512 }
3513 addr += NBPDR;
3514 }
3515 PMAP_UNLOCK(pmap);
3516 }
3517 }
3518
3519 /*
3520 * Routine: pmap_change_wiring
3521 * Function: Change the wiring attribute for a map/virtual-address
3522 * pair.
3523 * In/out conditions:
3524 * The mapping must already exist in the pmap.
3525 */
3526 void
3527 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
3528 {
3529 pd_entry_t *pde;
3530 pt_entry_t *pte;
3531 boolean_t are_queues_locked;
3532
3533 are_queues_locked = FALSE;
3534
3535 /*
3536 * Wiring is not a hardware characteristic so there is no need to
3537 * invalidate TLB.
3538 */
3539 retry:
3540 PMAP_LOCK(pmap);
3541 pde = pmap_pde(pmap, va);
3542 if ((*pde & PG_PS) != 0) {
3543 if (!wired != ((*pde & PG_W) == 0)) {
3544 if (!are_queues_locked) {
3545 are_queues_locked = TRUE;
3546 if (!mtx_trylock(&vm_page_queue_mtx)) {
3547 PMAP_UNLOCK(pmap);
3548 vm_page_lock_queues();
3549 goto retry;
3550 }
3551 }
3552 if (!pmap_demote_pde(pmap, pde, va))
3553 panic("pmap_change_wiring: demotion failed");
3554 } else
3555 goto out;
3556 }
3557 pte = pmap_pde_to_pte(pde, va);
3558 if (wired && (*pte & PG_W) == 0) {
3559 pmap->pm_stats.wired_count++;
3560 atomic_set_long(pte, PG_W);
3561 } else if (!wired && (*pte & PG_W) != 0) {
3562 pmap->pm_stats.wired_count--;
3563 atomic_clear_long(pte, PG_W);
3564 }
3565 out:
3566 if (are_queues_locked)
3567 vm_page_unlock_queues();
3568 PMAP_UNLOCK(pmap);
3569 }
3570
3571
3572
3573 /*
3574 * Copy the range specified by src_addr/len
3575 * from the source map to the range dst_addr/len
3576 * in the destination map.
3577 *
3578 * This routine is only advisory and need not do anything.
3579 */
3580
3581 void
3582 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
3583 vm_offset_t src_addr)
3584 {
3585 vm_page_t free;
3586 vm_offset_t addr;
3587 vm_offset_t end_addr = src_addr + len;
3588 vm_offset_t va_next;
3589
3590 if (dst_addr != src_addr)
3591 return;
3592
3593 vm_page_lock_queues();
3594 if (dst_pmap < src_pmap) {
3595 PMAP_LOCK(dst_pmap);
3596 PMAP_LOCK(src_pmap);
3597 } else {
3598 PMAP_LOCK(src_pmap);
3599 PMAP_LOCK(dst_pmap);
3600 }
3601 for (addr = src_addr; addr < end_addr; addr = va_next) {
3602 pt_entry_t *src_pte, *dst_pte;
3603 vm_page_t dstmpde, dstmpte, srcmpte;
3604 pml4_entry_t *pml4e;
3605 pdp_entry_t *pdpe;
3606 pd_entry_t srcptepaddr, *pde;
3607
3608 KASSERT(addr < UPT_MIN_ADDRESS,
3609 ("pmap_copy: invalid to pmap_copy page tables"));
3610
3611 pml4e = pmap_pml4e(src_pmap, addr);
3612 if ((*pml4e & PG_V) == 0) {
3613 va_next = (addr + NBPML4) & ~PML4MASK;
3614 if (va_next < addr)
3615 va_next = end_addr;
3616 continue;
3617 }
3618
3619 pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
3620 if ((*pdpe & PG_V) == 0) {
3621 va_next = (addr + NBPDP) & ~PDPMASK;
3622 if (va_next < addr)
3623 va_next = end_addr;
3624 continue;
3625 }
3626
3627 va_next = (addr + NBPDR) & ~PDRMASK;
3628 if (va_next < addr)
3629 va_next = end_addr;
3630
3631 pde = pmap_pdpe_to_pde(pdpe, addr);
3632 srcptepaddr = *pde;
3633 if (srcptepaddr == 0)
3634 continue;
3635
3636 if (srcptepaddr & PG_PS) {
3637 dstmpde = pmap_allocpde(dst_pmap, addr, M_NOWAIT);
3638 if (dstmpde == NULL)
3639 break;
3640 pde = (pd_entry_t *)
3641 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
3642 pde = &pde[pmap_pde_index(addr)];
3643 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
3644 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
3645 PG_PS_FRAME))) {
3646 *pde = srcptepaddr & ~PG_W;
3647 dst_pmap->pm_stats.resident_count +=
3648 NBPDR / PAGE_SIZE;
3649 } else
3650 dstmpde->wire_count--;
3651 continue;
3652 }
3653
3654 srcptepaddr &= PG_FRAME;
3655 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
3656 KASSERT(srcmpte->wire_count > 0,
3657 ("pmap_copy: source page table page is unused"));
3658
3659 if (va_next > end_addr)
3660 va_next = end_addr;
3661
3662 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
3663 src_pte = &src_pte[pmap_pte_index(addr)];
3664 dstmpte = NULL;
3665 while (addr < va_next) {
3666 pt_entry_t ptetemp;
3667 ptetemp = *src_pte;
3668 /*
3669 * We only virtual-copy managed pages.
3670 */
3671 if ((ptetemp & PG_MANAGED) != 0) {
3672 if (dstmpte != NULL &&
3673 dstmpte->pindex == pmap_pde_pindex(addr))
3674 dstmpte->wire_count++;
3675 else if ((dstmpte = pmap_allocpte(dst_pmap,
3676 addr, M_NOWAIT)) == NULL)
3677 goto out;
3678 dst_pte = (pt_entry_t *)
3679 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
3680 dst_pte = &dst_pte[pmap_pte_index(addr)];
3681 if (*dst_pte == 0 &&
3682 pmap_try_insert_pv_entry(dst_pmap, addr,
3683 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
3684 /*
3685 * Clear the wired, modified, and
3686 * accessed (referenced) bits
3687 * during the copy.
3688 */
3689 *dst_pte = ptetemp & ~(PG_W | PG_M |
3690 PG_A);
3691 dst_pmap->pm_stats.resident_count++;
3692 } else {
3693 free = NULL;
3694 if (pmap_unwire_pte_hold(dst_pmap,
3695 addr, dstmpte, &free)) {
3696 pmap_invalidate_page(dst_pmap,
3697 addr);
3698 pmap_free_zero_pages(free);
3699 }
3700 goto out;
3701 }
3702 if (dstmpte->wire_count >= srcmpte->wire_count)
3703 break;
3704 }
3705 addr += PAGE_SIZE;
3706 src_pte++;
3707 }
3708 }
3709 out:
3710 vm_page_unlock_queues();
3711 PMAP_UNLOCK(src_pmap);
3712 PMAP_UNLOCK(dst_pmap);
3713 }
3714
3715 /*
3716 * pmap_zero_page zeros the specified hardware page through the
3717 * direct map, using pagezero() to clear its contents.
3718 */
3719 void
3720 pmap_zero_page(vm_page_t m)
3721 {
3722 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3723
3724 pagezero((void *)va);
3725 }
3726
3727 /*
3728 * pmap_zero_page_area zeros part of the specified hardware page
3729 * through the direct map.
3730 *
3731 * off and size may not cover an area beyond a single hardware page.
3732 */
3733 void
3734 pmap_zero_page_area(vm_page_t m, int off, int size)
3735 {
3736 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3737
3738 if (off == 0 && size == PAGE_SIZE)
3739 pagezero((void *)va);
3740 else
3741 bzero((char *)va + off, size);
3742 }
3743
3744 /*
3745 * pmap_zero_page_idle zeros the specified hardware page through
3746 * the direct map. This
3747 * is intended to be called from the vm_pagezero process only and
3748 * outside of Giant.
3749 */
3750 void
3751 pmap_zero_page_idle(vm_page_t m)
3752 {
3753 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3754
3755 pagezero((void *)va);
3756 }
3757
3758 /*
3759 * pmap_copy_page copies the specified (machine independent)
3760 * page through the direct map, using pagecopy() to copy
3761 * its contents, one machine dependent page at a
3762 * time.
3763 */
3764 void
3765 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
3766 {
3767 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
3768 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
3769
3770 pagecopy((void *)src, (void *)dst);
3771 }
3772
3773 /*
3774 * Returns true if the pmap's pv is one of the first
3775 * 16 pvs linked to from this page. This count may
3776 * be changed upwards or downwards in the future; it
3777 * is only necessary that true be returned for a small
3778 * subset of pmaps for proper page aging.
3779 */
3780 boolean_t
3781 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
3782 {
3783 struct md_page *pvh;
3784 pv_entry_t pv;
3785 int loops = 0;
3786
3787 if (m->flags & PG_FICTITIOUS)
3788 return FALSE;
3789
3790 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3791 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3792 if (PV_PMAP(pv) == pmap) {
3793 return TRUE;
3794 }
3795 loops++;
3796 if (loops >= 16)
3797 break;
3798 }
3799 if (loops < 16) {
3800 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3801 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
3802 if (PV_PMAP(pv) == pmap)
3803 return (TRUE);
3804 loops++;
3805 if (loops >= 16)
3806 break;
3807 }
3808 }
3809 return (FALSE);
3810 }
3811
3812 /*
3813 * Returns TRUE if the given page is mapped individually or as part of
3814 * a 2mpage. Otherwise, returns FALSE.
3815 */
3816 boolean_t
3817 pmap_page_is_mapped(vm_page_t m)
3818 {
3819 struct md_page *pvh;
3820
3821 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
3822 return (FALSE);
3823 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3824 if (TAILQ_EMPTY(&m->md.pv_list)) {
3825 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3826 return (!TAILQ_EMPTY(&pvh->pv_list));
3827 } else
3828 return (TRUE);
3829 }
3830
3831 /*
3832 * Remove all pages from the specified address space;
3833 * this aids process exit speeds.  Also, this code is
3834 * special-cased for the current process only, but can
3835 * have the more generic (and slightly slower) mode
3836 * enabled.  This is much faster than pmap_remove in
3837 * the case of running down an entire address space.
3838 */
3839 void
3840 pmap_remove_pages(pmap_t pmap)
3841 {
3842 pd_entry_t ptepde;
3843 pt_entry_t *pte, tpte;
3844 vm_page_t free = NULL;
3845 vm_page_t m, mpte, mt;
3846 pv_entry_t pv;
3847 struct md_page *pvh;
3848 struct pv_chunk *pc, *npc;
3849 int field, idx;
3850 int64_t bit;
3851 uint64_t inuse, bitmask;
3852 int allfree;
3853
3854 if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
3855 printf("warning: pmap_remove_pages called with non-current pmap\n");
3856 return;
3857 }
3858 vm_page_lock_queues();
3859 PMAP_LOCK(pmap);
3860 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
3861 allfree = 1;
3862 for (field = 0; field < _NPCM; field++) {
3863 inuse = (~(pc->pc_map[field])) & pc_freemask[field];
3864 while (inuse != 0) {
3865 bit = bsfq(inuse);
3866 bitmask = 1UL << bit;
3867 idx = field * 64 + bit;
3868 pv = &pc->pc_pventry[idx];
3869 inuse &= ~bitmask;
3870
3871 pte = pmap_pdpe(pmap, pv->pv_va);
3872 ptepde = *pte;
3873 pte = pmap_pdpe_to_pde(pte, pv->pv_va);
3874 tpte = *pte;
3875 if ((tpte & (PG_PS | PG_V)) == PG_V) {
3876 ptepde = tpte;
3877 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
3878 PG_FRAME);
3879 pte = &pte[pmap_pte_index(pv->pv_va)];
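/*
 * Editorial note: the PAT bit is cleared when loading the 4KB PTE
 * below because in a PTE it occupies the same bit position that
 * PG_PS occupies in a PDE; masking it keeps the later PG_PS tests
 * from mistaking this 4KB mapping for a 2MB mapping.
 */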
3880 tpte = *pte & ~PG_PTE_PAT;
3881 }
3882 if ((tpte & PG_V) == 0)
3883 panic("bad pte");
3884
3885 /*
3886 * We cannot remove wired pages from a process' mapping at this time
3887 */
3888 if (tpte & PG_W) {
3889 allfree = 0;
3890 continue;
3891 }
3892
3893 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
3894 KASSERT(m->phys_addr == (tpte & PG_FRAME),
3895 ("vm_page_t %p phys_addr mismatch %016jx %016jx",
3896 m, (uintmax_t)m->phys_addr,
3897 (uintmax_t)tpte));
3898
3899 KASSERT(m < &vm_page_array[vm_page_array_size],
3900 ("pmap_remove_pages: bad tpte %#jx",
3901 (uintmax_t)tpte));
3902
3903 pte_clear(pte);
3904
3905 /*
3906 * Update the vm_page_t dirty field.
3907 */
3908 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3909 if ((tpte & PG_PS) != 0) {
3910 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
3911 vm_page_dirty(mt);
3912 } else
3913 vm_page_dirty(m);
3914 }
3915
3916 /* Mark free */
3917 PV_STAT(pv_entry_frees++);
3918 PV_STAT(pv_entry_spare++);
3919 pv_entry_count--;
3920 pc->pc_map[field] |= bitmask;
3921 if ((tpte & PG_PS) != 0) {
3922 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
3923 pvh = pa_to_pvh(tpte & PG_PS_FRAME);
3924 TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
3925 if (TAILQ_EMPTY(&pvh->pv_list)) {
3926 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
3927 if (TAILQ_EMPTY(&mt->md.pv_list))
3928 vm_page_flag_clear(mt, PG_WRITEABLE);
3929 }
3930 mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
3931 if (mpte != NULL) {
3932 pmap_remove_pt_page(pmap, mpte);
3933 pmap->pm_stats.resident_count--;
3934 KASSERT(mpte->wire_count == NPTEPG,
3935 ("pmap_remove_pages: pte page wire count error"));
3936 mpte->wire_count = 0;
3937 pmap_add_delayed_free_list(mpte, &free, FALSE);
3938 atomic_subtract_int(&cnt.v_wire_count, 1);
3939 }
3940 } else {
3941 pmap->pm_stats.resident_count--;
3942 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3943 if (TAILQ_EMPTY(&m->md.pv_list)) {
3944 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3945 if (TAILQ_EMPTY(&pvh->pv_list))
3946 vm_page_flag_clear(m, PG_WRITEABLE);
3947 }
3948 }
3949 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
3950 }
3951 }
3952 if (allfree) {
3953 PV_STAT(pv_entry_spare -= _NPCPV);
3954 PV_STAT(pc_chunk_count--);
3955 PV_STAT(pc_chunk_frees++);
3956 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3957 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3958 dump_drop_page(m->phys_addr);
3959 vm_page_unwire(m, 0);
3960 vm_page_free(m);
3961 }
3962 }
3963 pmap_invalidate_all(pmap);
3964 vm_page_unlock_queues();
3965 PMAP_UNLOCK(pmap);
3966 pmap_free_zero_pages(free);
3967 }
3968
3969 /*
3970 * pmap_is_modified:
3971 *
3972 * Return whether or not the specified physical page was modified
3973 * in any physical maps.
3974 */
3975 boolean_t
3976 pmap_is_modified(vm_page_t m)
3977 {
3978
3979 if (m->flags & PG_FICTITIOUS)
3980 return (FALSE);
3981 if (pmap_is_modified_pvh(&m->md))
3982 return (TRUE);
3983 return (pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
3984 }
3985
3986 /*
3987 * Returns TRUE if any of the given mappings were used to modify
3988 * physical memory. Otherwise, returns FALSE. Both page and 2mpage
3989 * mappings are supported.
3990 */
3991 static boolean_t
3992 pmap_is_modified_pvh(struct md_page *pvh)
3993 {
3994 pv_entry_t pv;
3995 pt_entry_t *pte;
3996 pmap_t pmap;
3997 boolean_t rv;
3998
3999 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4000 rv = FALSE;
4001 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4002 pmap = PV_PMAP(pv);
4003 PMAP_LOCK(pmap);
4004 pte = pmap_pte(pmap, pv->pv_va);
4005 rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
4006 PMAP_UNLOCK(pmap);
4007 if (rv)
4008 break;
4009 }
4010 return (rv);
4011 }
4012
4013 /*
4014 * pmap_is_prefaultable:
4015 *
4016 * Return whether or not the specified virtual address is eligible
4017 * for prefault.
4018 */
4019 boolean_t
4020 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4021 {
4022 pd_entry_t *pde;
4023 pt_entry_t *pte;
4024 boolean_t rv;
4025
4026 rv = FALSE;
4027 PMAP_LOCK(pmap);
4028 pde = pmap_pde(pmap, addr);
4029 if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
4030 pte = pmap_pde_to_pte(pde, addr);
4031 rv = (*pte & PG_V) == 0;
4032 }
4033 PMAP_UNLOCK(pmap);
4034 return (rv);
4035 }
4036
4037 /*
4038 * Clear the write and modified bits in each of the given page's mappings.
4039 */
4040 void
4041 pmap_remove_write(vm_page_t m)
4042 {
4043 struct md_page *pvh;
4044 pmap_t pmap;
4045 pv_entry_t next_pv, pv;
4046 pd_entry_t *pde;
4047 pt_entry_t oldpte, *pte;
4048 vm_offset_t va;
4049
4050 if ((m->flags & PG_FICTITIOUS) != 0 ||
4051 (m->flags & PG_WRITEABLE) == 0)
4052 return;
4053 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4054 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4055 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4056 va = pv->pv_va;
4057 pmap = PV_PMAP(pv);
4058 PMAP_LOCK(pmap);
4059 pde = pmap_pde(pmap, va);
4060 if ((*pde & PG_RW) != 0)
4061 (void)pmap_demote_pde(pmap, pde, va);
4062 PMAP_UNLOCK(pmap);
4063 }
4064 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4065 pmap = PV_PMAP(pv);
4066 PMAP_LOCK(pmap);
4067 pde = pmap_pde(pmap, pv->pv_va);
4068 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
4069 " a 2mpage in page %p's pv list", m));
4070 pte = pmap_pde_to_pte(pde, pv->pv_va);
4071 retry:
4072 oldpte = *pte;
4073 if (oldpte & PG_RW) {
4074 if (!atomic_cmpset_long(pte, oldpte, oldpte &
4075 ~(PG_RW | PG_M)))
4076 goto retry;
4077 if ((oldpte & PG_M) != 0)
4078 vm_page_dirty(m);
4079 pmap_invalidate_page(pmap, pv->pv_va);
4080 }
4081 PMAP_UNLOCK(pmap);
4082 }
4083 vm_page_flag_clear(m, PG_WRITEABLE);
4084 }
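/*
 * Illustrative usage sketch (editorial, not from the original source):
 * a caller about to write a managed page to backing store might revoke
 * write access first so that any later modification faults and sets
 * PG_M again:
 *
 *	vm_page_lock_queues();
 *	pmap_remove_write(m);
 *	vm_page_unlock_queues();
 */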
4085
4086 /*
4087 * pmap_ts_referenced:
4088 *
4089 * Return a count of reference bits for a page, clearing those bits.
4090 * It is not necessary for every reference bit to be cleared, but it
4091 * is necessary that 0 only be returned when there are truly no
4092 * reference bits set.
4093 *
4094 * XXX: The exact number of bits to check and clear is a matter that
4095 * should be tested and standardized at some point in the future for
4096 * optimal aging of shared pages.
4097 */
4098 int
4099 pmap_ts_referenced(vm_page_t m)
4100 {
4101 struct md_page *pvh;
4102 pv_entry_t pv, pvf, pvn;
4103 pmap_t pmap;
4104 pd_entry_t oldpde, *pde;
4105 pt_entry_t *pte;
4106 vm_offset_t va;
4107 int rtval = 0;
4108
4109 if (m->flags & PG_FICTITIOUS)
4110 return (rtval);
4111 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4112 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4113 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
4114 va = pv->pv_va;
4115 pmap = PV_PMAP(pv);
4116 PMAP_LOCK(pmap);
4117 pde = pmap_pde(pmap, va);
4118 oldpde = *pde;
4119 if ((oldpde & PG_A) != 0) {
4120 if (pmap_demote_pde(pmap, pde, va)) {
4121 if ((oldpde & PG_W) == 0) {
4122 /*
4123 * Remove the mapping to a single page
4124 * so that a subsequent access may
4125 * repromote. Since the underlying
4126 * page table page is fully populated,
4127 * this removal never frees a page
4128 * table page.
4129 */
4130 va += VM_PAGE_TO_PHYS(m) - (oldpde &
4131 PG_PS_FRAME);
4132 pmap_remove_page(pmap, va, pde, NULL);
4133 rtval++;
4134 if (rtval > 4) {
4135 PMAP_UNLOCK(pmap);
4136 return (rtval);
4137 }
4138 }
4139 }
4140 }
4141 PMAP_UNLOCK(pmap);
4142 }
4143 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4144 pvf = pv;
4145 do {
4146 pvn = TAILQ_NEXT(pv, pv_list);
4147 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4148 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
4149 pmap = PV_PMAP(pv);
4150 PMAP_LOCK(pmap);
4151 pde = pmap_pde(pmap, pv->pv_va);
4152 KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:"
4153 " found a 2mpage in page %p's pv list", m));
4154 pte = pmap_pde_to_pte(pde, pv->pv_va);
4155 if ((*pte & PG_A) != 0) {
4156 atomic_clear_long(pte, PG_A);
4157 pmap_invalidate_page(pmap, pv->pv_va);
4158 rtval++;
4159 if (rtval > 4)
4160 pvn = NULL;
4161 }
4162 PMAP_UNLOCK(pmap);
4163 } while ((pv = pvn) != NULL && pv != pvf);
4164 }
4165 return (rtval);
4166 }
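/*
 * Illustrative sketch (editorial): a page-aging scan might fold the
 * returned count into a page's activity estimate, for example
 *
 *	act = pmap_ts_referenced(m);
 *	if (act != 0)
 *		m->act_count = imin(m->act_count + act, ACT_MAX);
 *
 * "act" is a local variable introduced for the example; act_count and
 * ACT_MAX are the vm_page activity field and limit used by the pageout
 * code.
 */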
4167
4168 /*
4169 * Clear the modify bits on the specified physical page.
4170 */
4171 void
4172 pmap_clear_modify(vm_page_t m)
4173 {
4174 struct md_page *pvh;
4175 pmap_t pmap;
4176 pv_entry_t next_pv, pv;
4177 pd_entry_t oldpde, *pde;
4178 pt_entry_t oldpte, *pte;
4179 vm_offset_t va;
4180
4181 if ((m->flags & PG_FICTITIOUS) != 0)
4182 return;
4183 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4184 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4185 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4186 va = pv->pv_va;
4187 pmap = PV_PMAP(pv);
4188 PMAP_LOCK(pmap);
4189 pde = pmap_pde(pmap, va);
4190 oldpde = *pde;
4191 if ((oldpde & PG_RW) != 0) {
4192 if (pmap_demote_pde(pmap, pde, va)) {
4193 if ((oldpde & PG_W) == 0) {
4194 /*
4195 * Write protect the mapping to a
4196 * single page so that a subsequent
4197 * write access may repromote.
4198 */
4199 va += VM_PAGE_TO_PHYS(m) - (oldpde &
4200 PG_PS_FRAME);
4201 pte = pmap_pde_to_pte(pde, va);
4202 oldpte = *pte;
4203 if ((oldpte & PG_V) != 0) {
4204 while (!atomic_cmpset_long(pte,
4205 oldpte,
4206 oldpte & ~(PG_M | PG_RW)))
4207 oldpte = *pte;
4208 vm_page_dirty(m);
4209 pmap_invalidate_page(pmap, va);
4210 }
4211 }
4212 }
4213 }
4214 PMAP_UNLOCK(pmap);
4215 }
4216 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4217 pmap = PV_PMAP(pv);
4218 PMAP_LOCK(pmap);
4219 pde = pmap_pde(pmap, pv->pv_va);
4220 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
4221 " a 2mpage in page %p's pv list", m));
4222 pte = pmap_pde_to_pte(pde, pv->pv_va);
4223 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4224 atomic_clear_long(pte, PG_M);
4225 pmap_invalidate_page(pmap, pv->pv_va);
4226 }
4227 PMAP_UNLOCK(pmap);
4228 }
4229 }
4230
4231 /*
4232 * pmap_clear_reference:
4233 *
4234 * Clear the reference bit on the specified physical page.
4235 */
4236 void
4237 pmap_clear_reference(vm_page_t m)
4238 {
4239 struct md_page *pvh;
4240 pmap_t pmap;
4241 pv_entry_t next_pv, pv;
4242 pd_entry_t oldpde, *pde;
4243 pt_entry_t *pte;
4244 vm_offset_t va;
4245
4246 if ((m->flags & PG_FICTITIOUS) != 0)
4247 return;
4248 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4249 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4250 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4251 va = pv->pv_va;
4252 pmap = PV_PMAP(pv);
4253 PMAP_LOCK(pmap);
4254 pde = pmap_pde(pmap, va);
4255 oldpde = *pde;
4256 if ((oldpde & PG_A) != 0) {
4257 if (pmap_demote_pde(pmap, pde, va)) {
4258 /*
4259 * Remove the mapping to a single page so
4260 * that a subsequent access may repromote.
4261 * Since the underlying page table page is
4262 * fully populated, this removal never frees
4263 * a page table page.
4264 */
4265 va += VM_PAGE_TO_PHYS(m) - (oldpde &
4266 PG_PS_FRAME);
4267 pmap_remove_page(pmap, va, pde, NULL);
4268 }
4269 }
4270 PMAP_UNLOCK(pmap);
4271 }
4272 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4273 pmap = PV_PMAP(pv);
4274 PMAP_LOCK(pmap);
4275 pde = pmap_pde(pmap, pv->pv_va);
4276 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
4277 " a 2mpage in page %p's pv list", m));
4278 pte = pmap_pde_to_pte(pde, pv->pv_va);
4279 if (*pte & PG_A) {
4280 atomic_clear_long(pte, PG_A);
4281 pmap_invalidate_page(pmap, pv->pv_va);
4282 }
4283 PMAP_UNLOCK(pmap);
4284 }
4285 }
4286
4287 /*
4288 * Miscellaneous support routines follow
4289 */
4290
4291 /* Adjust the cache mode for a 4KB page mapped via a PTE. */
4292 static __inline void
4293 pmap_pte_attr(pt_entry_t *pte, int cache_bits)
4294 {
4295 u_int opte, npte;
4296
4297 /*
4298 * The cache mode bits are all in the low 32-bits of the
4299 * PTE, so we can just spin on updating the low 32-bits.
4300 */
4301 do {
4302 opte = *(u_int *)pte;
4303 npte = opte & ~PG_PTE_CACHE;
4304 npte |= cache_bits;
4305 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
4306 }
4307
4308 /* Adjust the cache mode for a 2MB page mapped via a PDE. */
4309 static __inline void
4310 pmap_pde_attr(pd_entry_t *pde, int cache_bits)
4311 {
4312 u_int opde, npde;
4313
4314 /*
4315 * The cache mode bits are all in the low 32-bits of the
4316 * PDE, so we can just spin on updating the low 32-bits.
4317 */
4318 do {
4319 opde = *(u_int *)pde;
4320 npde = opde & ~PG_PDE_CACHE;
4321 npde |= cache_bits;
4322 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
4323 }
4324
4325 /*
4326 * Map a set of physical memory pages into the kernel virtual
4327 * address space. Return a pointer to where it is mapped. This
4328 * routine is intended to be used for mapping device memory,
4329 * NOT real memory.
4330 */
4331 void *
4332 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
4333 {
4334 vm_offset_t va, offset;
4335 vm_size_t tmpsize;
4336
4337 /*
4338 * If the specified range of physical addresses fits within the direct
4339 * map window, use the direct map.
4340 */
4341 if (pa < dmaplimit && pa + size < dmaplimit) {
4342 va = PHYS_TO_DMAP(pa);
4343 if (!pmap_change_attr(va, size, mode))
4344 return ((void *)va);
4345 }
4346 offset = pa & PAGE_MASK;
4347 size = roundup(offset + size, PAGE_SIZE);
4348 va = kmem_alloc_nofault(kernel_map, size);
4349 if (!va)
4350 panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
4351 pa = trunc_page(pa);
4352 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
4353 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
4354 pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
4355 pmap_invalidate_cache_range(va, va + tmpsize);
4356 return ((void *)(va + offset));
4357 }
4358
4359 void *
4360 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
4361 {
4362
4363 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
4364 }
4365
4366 void *
4367 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4368 {
4369
4370 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
4371 }
4372
4373 void
4374 pmap_unmapdev(vm_offset_t va, vm_size_t size)
4375 {
4376 vm_offset_t base, offset, tmpva;
4377
4378 /* If pmap_mapdev() returned a direct map address, there is nothing to undo. */
4379 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
4380 return;
4381 base = trunc_page(va);
4382 offset = va & PAGE_MASK;
4383 size = roundup(offset + size, PAGE_SIZE);
4384 for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
4385 pmap_kremove(tmpva);
4386 pmap_invalidate_range(kernel_pmap, va, tmpva);
4387 kmem_free(kernel_map, base, size);
4388 }
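/*
 * Illustrative usage sketch (editorial, not from the original source):
 * a driver mapping a device register window and releasing it again
 * might do
 *
 *	regs = pmap_mapdev(rman_get_start(res), rman_get_size(res));
 *	...
 *	pmap_unmapdev((vm_offset_t)regs, rman_get_size(res));
 *
 * "regs" and "res" are hypothetical; most drivers reach these routines
 * indirectly through the bus_space(9) machinery rather than calling
 * them directly.
 */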
4389
4390 /*
4391 * Sets the memory attribute for the specified page.
4392 */
4393 void
4394 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
4395 {
4396
4397 m->md.pat_mode = ma;
4398
4399 /*
4400 * If "m" is a normal page, update its direct mapping. This update
4401 * can be relied upon to perform any cache operations that are
4402 * required for data coherence.
4403 */
4404 if ((m->flags & PG_FICTITIOUS) == 0 &&
4405 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
4406 m->md.pat_mode))
4407 panic("memory attribute change on the direct map failed");
4408 }
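/*
 * Illustrative example (editorial): a graphics driver that wants a
 * managed page mapped write-combining might call
 *
 *	pmap_page_set_memattr(m, VM_MEMATTR_WRITE_COMBINING);
 *
 * assuming VM_MEMATTR_WRITE_COMBINING is available on the platform;
 * later mappings of "m" then inherit the new mode via m->md.pat_mode.
 */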
4409
4410 /*
4411 * Changes the specified virtual address range's memory type to that given by
4412 * the parameter "mode". The specified virtual address range must be
4413 * completely contained within either the direct map or the kernel map. If
4414 * the virtual address range is contained within the kernel map, then the
4415 * memory type for each of the corresponding ranges of the direct map is also
4416 * changed. (The corresponding ranges of the direct map are those ranges that
4417 * map the same physical pages as the specified virtual address range.) These
4418 * changes to the direct map are necessary because Intel describes the
4419 * behavior of their processors as "undefined" if two or more mappings to the
4420 * same physical page have different memory types.
4421 *
4422 * Returns zero if the change completed successfully, and either EINVAL or
4423 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part
4424 * of the virtual address range was not mapped, and ENOMEM is returned if
4425 * there was insufficient memory available to complete the change. In the
4426 * latter case, the memory type may have been changed on some part of the
4427 * virtual address range or the direct map.
4428 */
4429 int
4430 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
4431 {
4432 int error;
4433
4434 PMAP_LOCK(kernel_pmap);
4435 error = pmap_change_attr_locked(va, size, mode);
4436 PMAP_UNLOCK(kernel_pmap);
4437 return (error);
4438 }
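/*
 * Illustrative sketch (editorial, not from the original source) of the
 * error handling described above:
 *
 *	error = pmap_change_attr(va, size, PAT_WRITE_COMBINING);
 *	if (error == EINVAL)
 *		printf("range not fully mapped\n");
 *	else if (error == ENOMEM)
 *		printf("out of memory demoting 2MB mappings\n");
 *
 * "va" and "size" are hypothetical; on ENOMEM part of the range (or of
 * the direct map) may already have been changed.
 */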
4439
4440 static int
4441 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
4442 {
4443 vm_offset_t base, offset, tmpva;
4444 vm_paddr_t pa_start, pa_end;
4445 pd_entry_t *pde;
4446 pt_entry_t *pte;
4447 int cache_bits_pte, cache_bits_pde, error;
4448 boolean_t changed;
4449
4450 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
4451 base = trunc_page(va);
4452 offset = va & PAGE_MASK;
4453 size = roundup(offset + size, PAGE_SIZE);
4454
4455 /*
4456 * Only supported on kernel virtual addresses, including the direct
4457 * map but excluding the recursive map.
4458 */
4459 if (base < DMAP_MIN_ADDRESS)
4460 return (EINVAL);
4461
4462 cache_bits_pde = pmap_cache_bits(mode, 1);
4463 cache_bits_pte = pmap_cache_bits(mode, 0);
4464 changed = FALSE;
4465
4466 /*
4467 * Pages that aren't mapped aren't supported. Also break down 2MB pages
4468 * into 4KB pages if required.
4469 */
4470 for (tmpva = base; tmpva < base + size; ) {
4471 pde = pmap_pde(kernel_pmap, tmpva);
4472 if (*pde == 0)
4473 return (EINVAL);
4474 if (*pde & PG_PS) {
4475 /*
4476 * If the current 2MB page already has the required
4477 * memory type, then we need not demote this page. Just
4478 * increment tmpva to the next 2MB page frame.
4479 */
4480 if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
4481 tmpva = trunc_2mpage(tmpva) + NBPDR;
4482 continue;
4483 }
4484
4485 /*
4486 * If the current offset aligns with a 2MB page frame
4487 * and there is at least 2MB left within the range, then
4488 * we need not break down this page into 4KB pages.
4489 */
4490 if ((tmpva & PDRMASK) == 0 &&
4491 tmpva + PDRMASK < base + size) {
4492 tmpva += NBPDR;
4493 continue;
4494 }
4495 if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
4496 return (ENOMEM);
4497 }
4498 pte = pmap_pde_to_pte(pde, tmpva);
4499 if (*pte == 0)
4500 return (EINVAL);
4501 tmpva += PAGE_SIZE;
4502 }
4503 error = 0;
4504
4505 /*
4506 * Ok, all the pages exist, so run through them updating their
4507 * cache mode if required.
4508 */
4509 pa_start = pa_end = 0;
4510 for (tmpva = base; tmpva < base + size; ) {
4511 pde = pmap_pde(kernel_pmap, tmpva);
4512 if (*pde & PG_PS) {
4513 if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
4514 pmap_pde_attr(pde, cache_bits_pde);
4515 changed = TRUE;
4516 }
4517 if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
4518 if (pa_start == pa_end) {
4519 /* Start physical address run. */
4520 pa_start = *pde & PG_PS_FRAME;
4521 pa_end = pa_start + NBPDR;
4522 } else if (pa_end == (*pde & PG_PS_FRAME))
4523 pa_end += NBPDR;
4524 else {
4525 /* Run ended, update direct map. */
4526 error = pmap_change_attr_locked(
4527 PHYS_TO_DMAP(pa_start),
4528 pa_end - pa_start, mode);
4529 if (error != 0)
4530 break;
4531 /* Start physical address run. */
4532 pa_start = *pde & PG_PS_FRAME;
4533 pa_end = pa_start + NBPDR;
4534 }
4535 }
4536 tmpva = trunc_2mpage(tmpva) + NBPDR;
4537 } else {
4538 pte = pmap_pde_to_pte(pde, tmpva);
4539 if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
4540 pmap_pte_attr(pte, cache_bits_pte);
4541 changed = TRUE;
4542 }
4543 if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
4544 if (pa_start == pa_end) {
4545 /* Start physical address run. */
4546 pa_start = *pte & PG_FRAME;
4547 pa_end = pa_start + PAGE_SIZE;
4548 } else if (pa_end == (*pte & PG_FRAME))
4549 pa_end += PAGE_SIZE;
4550 else {
4551 /* Run ended, update direct map. */
4552 error = pmap_change_attr_locked(
4553 PHYS_TO_DMAP(pa_start),
4554 pa_end - pa_start, mode);
4555 if (error != 0)
4556 break;
4557 /* Start physical address run. */
4558 pa_start = *pte & PG_FRAME;
4559 pa_end = pa_start + PAGE_SIZE;
4560 }
4561 }
4562 tmpva += PAGE_SIZE;
4563 }
4564 }
4565 if (error == 0 && pa_start != pa_end)
4566 error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
4567 pa_end - pa_start, mode);
4568
4569 /*
4570 * Flush CPU caches if required so that no stale data remains
4571 * cached under the old memory type.
4572 */
4573 if (changed) {
4574 pmap_invalidate_range(kernel_pmap, base, tmpva);
4575 pmap_invalidate_cache_range(base, tmpva);
4576 }
4577 return (error);
4578 }
4579
4580 /*
4581 * Perform the pmap work for mincore(2).
4582 */
4583 int
4584 pmap_mincore(pmap_t pmap, vm_offset_t addr)
4585 {
4586 pd_entry_t *pdep;
4587 pt_entry_t pte;
4588 vm_paddr_t pa;
4589 vm_page_t m;
4590 int val = 0;
4591
4592 PMAP_LOCK(pmap);
4593 pdep = pmap_pde(pmap, addr);
4594 if (pdep != NULL && (*pdep & PG_V)) {
4595 if (*pdep & PG_PS) {
4596 pte = *pdep;
4597 val = MINCORE_SUPER;
4598 /* Compute the physical address of the 4KB page. */
4599 pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
4600 PG_FRAME;
4601 } else {
4602 pte = *pmap_pde_to_pte(pdep, addr);
4603 pa = pte & PG_FRAME;
4604 }
4605 } else {
4606 pte = 0;
4607 pa = 0;
4608 }
4609 PMAP_UNLOCK(pmap);
4610
4611 if (pte != 0) {
4612 val |= MINCORE_INCORE;
4613 if ((pte & PG_MANAGED) == 0)
4614 return (val);
4615
4616 m = PHYS_TO_VM_PAGE(pa);
4617
4618 /*
4619 * Modified by us
4620 */
4621 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
4622 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
4623 else {
4624 /*
4625 * Modified by someone else
4626 */
4627 vm_page_lock_queues();
4628 if (m->dirty || pmap_is_modified(m))
4629 val |= MINCORE_MODIFIED_OTHER;
4630 vm_page_unlock_queues();
4631 }
4632 /*
4633 * Referenced by us
4634 */
4635 if (pte & PG_A)
4636 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
4637 else {
4638 /*
4639 * Referenced by someone else
4640 */
4641 vm_page_lock_queues();
4642 if ((m->flags & PG_REFERENCED) ||
4643 pmap_ts_referenced(m)) {
4644 val |= MINCORE_REFERENCED_OTHER;
4645 vm_page_flag_set(m, PG_REFERENCED);
4646 }
4647 vm_page_unlock_queues();
4648 }
4649 }
4650 return (val);
4651 }
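/*
 * Worked example (editorial): for a resident, managed 4KB page that the
 * calling process has itself dirtied and referenced, the PTE has PG_V,
 * PG_M, PG_RW, and PG_A set, so the value returned above is
 *
 *	MINCORE_INCORE | MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER |
 *	    MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER
 *
 * with MINCORE_SUPER also included when the address is covered by a
 * 2MB mapping.
 */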
4652
4653 void
4654 pmap_activate(struct thread *td)
4655 {
4656 pmap_t pmap, oldpmap;
4657 u_int64_t cr3;
4658
4659 critical_enter();
4660 pmap = vmspace_pmap(td->td_proc->p_vmspace);
4661 oldpmap = PCPU_GET(curpmap);
4662 #ifdef SMP
4663 if (oldpmap) /* XXX FIXME */
4664 atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
4665 atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
4666 #else
4667 if (oldpmap) /* XXX FIXME */
4668 oldpmap->pm_active &= ~PCPU_GET(cpumask);
4669 pmap->pm_active |= PCPU_GET(cpumask);
4670 #endif
4671 cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4);
4672 td->td_pcb->pcb_cr3 = cr3;
4673 load_cr3(cr3);
4674 critical_exit();
4675 }
4676
4677 vm_offset_t
4678 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
4679 {
4680
4681 if ((obj == NULL) || (size < NBPDR) ||
4682 (obj->type != OBJT_DEVICE && obj->type != OBJT_SG)) {
4683 return addr;
4684 }
4685
4686 addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
4687 return addr;
4688 }
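/*
 * Worked example (editorial): with NBPDR = 2MB the rounding above turns
 * a hint of 0x40012345 into
 *
 *	(0x40012345 + 0x1fffff) & ~0x1fffff = 0x40200000
 *
 * i.e. the next 2MB boundary, which makes a later superpage promotion
 * of the mapping possible.
 */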
4689
4690 /*
4691 * Increase the starting virtual address of the given mapping if a
4692 * different alignment might result in more superpage mappings.
4693 */
4694 void
4695 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
4696 vm_offset_t *addr, vm_size_t size)
4697 {
4698 vm_offset_t superpage_offset;
4699
4700 if (size < NBPDR)
4701 return;
4702 if (object != NULL && (object->flags & OBJ_COLORED) != 0)
4703 offset += ptoa(object->pg_color);
4704 superpage_offset = offset & PDRMASK;
4705 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
4706 (*addr & PDRMASK) == superpage_offset)
4707 return;
4708 if ((*addr & PDRMASK) < superpage_offset)
4709 *addr = (*addr & ~PDRMASK) + superpage_offset;
4710 else
4711 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
4712 }
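/*
 * Worked example (editorial): suppose the offset being mapped works out
 * to 0x356000, so superpage_offset = 0x356000 & PDRMASK = 0x156000, and
 * the caller proposed *addr = 0x40000000.  Provided the mapping is large
 * enough, the routine above slides the start to
 *
 *	(0x40000000 & ~PDRMASK) + 0x156000 = 0x40156000
 *
 * so that the virtual address and the offset are congruent modulo 2MB
 * and whole-2MB ranges of the object can later be promoted to
 * superpages.
 */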