sys/i386/i386/pmap.c
1 /*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
9 * All rights reserved.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * the Systems Programming Group of the University of Utah Computer
13 * Science Department and William Jolitz of UUNET Technologies Inc.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. All advertising materials mentioning features or use of this software
24 * must display the following acknowledgement:
25 * This product includes software developed by the University of
26 * California, Berkeley and its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 * may be used to endorse or promote products derived from this software
29 * without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 *
43 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
44 */
45 /*-
46 * Copyright (c) 2003 Networks Associates Technology, Inc.
47 * All rights reserved.
48 *
49 * This software was developed for the FreeBSD Project by Jake Burkholder,
50 * Safeport Network Services, and Network Associates Laboratories, the
51 * Security Research Division of Network Associates, Inc. under
52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53 * CHATS research program.
54 *
55 * Redistribution and use in source and binary forms, with or without
56 * modification, are permitted provided that the following conditions
57 * are met:
58 * 1. Redistributions of source code must retain the above copyright
59 * notice, this list of conditions and the following disclaimer.
60 * 2. Redistributions in binary form must reproduce the above copyright
61 * notice, this list of conditions and the following disclaimer in the
62 * documentation and/or other materials provided with the distribution.
63 *
64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * SUCH DAMAGE.
75 */
76
77 #include <sys/cdefs.h>
78 __FBSDID("$FreeBSD: releng/7.4/sys/i386/i386/pmap.c 215374 2010-11-16 06:20:12Z alc $");
79
80 /*
81 * Manages physical address maps.
82 *
83 * In addition to hardware address maps, this
84 * module is called upon to provide software-use-only
85 * maps which may or may not be stored in the same
86 * form as hardware maps. These pseudo-maps are
87 * used to store intermediate results from copy
88 * operations to and from address spaces.
89 *
90 * Since the information managed by this module is
91 * also stored by the logical address mapping module,
92 * this module may throw away valid virtual-to-physical
93 * mappings at almost any time. However, invalidations
94 * of virtual-to-physical mappings must be done as
95 * requested.
96 *
97 * In order to cope with hardware architectures which
98 * make virtual-to-physical map invalidations expensive,
99 * this module may delay invalidation or reduced-protection
100 * operations until such time as they are actually
101 * necessary. This module is given full information as
102 * to which processors are currently using which maps,
103 * and to when physical maps must be made correct.
104 */
105
106 #include "opt_cpu.h"
107 #include "opt_pmap.h"
108 #include "opt_msgbuf.h"
109 #include "opt_smp.h"
110 #include "opt_xbox.h"
111
112 #include <sys/param.h>
113 #include <sys/systm.h>
114 #include <sys/kernel.h>
115 #include <sys/ktr.h>
116 #include <sys/lock.h>
117 #include <sys/malloc.h>
118 #include <sys/mman.h>
119 #include <sys/msgbuf.h>
120 #include <sys/mutex.h>
121 #include <sys/proc.h>
122 #include <sys/sf_buf.h>
123 #include <sys/sx.h>
124 #include <sys/vmmeter.h>
125 #include <sys/sched.h>
126 #include <sys/sysctl.h>
127 #ifdef SMP
128 #include <sys/smp.h>
129 #endif
130
131 #include <vm/vm.h>
132 #include <vm/vm_param.h>
133 #include <vm/vm_kern.h>
134 #include <vm/vm_page.h>
135 #include <vm/vm_map.h>
136 #include <vm/vm_object.h>
137 #include <vm/vm_extern.h>
138 #include <vm/vm_pageout.h>
139 #include <vm/vm_pager.h>
140 #include <vm/vm_reserv.h>
141 #include <vm/uma.h>
142
143 #include <machine/cpu.h>
144 #include <machine/cputypes.h>
145 #include <machine/md_var.h>
146 #include <machine/pcb.h>
147 #include <machine/specialreg.h>
148 #ifdef SMP
149 #include <machine/smp.h>
150 #endif
151
152 #ifdef XBOX
153 #include <machine/xbox.h>
154 #endif
155
156 #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
157 #define CPU_ENABLE_SSE
158 #endif
159
160 #ifndef PMAP_SHPGPERPROC
161 #define PMAP_SHPGPERPROC 200
162 #endif
163
164 #if !defined(DIAGNOSTIC)
165 #define PMAP_INLINE __gnu89_inline
166 #else
167 #define PMAP_INLINE
168 #endif
169
170 #define PV_STATS
171 #ifdef PV_STATS
172 #define PV_STAT(x) do { x ; } while (0)
173 #else
174 #define PV_STAT(x) do { } while (0)
175 #endif
176
177 #define pa_index(pa) ((pa) >> PDRSHIFT)
178 #define pa_to_pvh(pa) (&pv_table[pa_index(pa)])
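
/*
 * Illustrative sketch (editor's example, not compiled): with non-PAE
 * 4 MB superpages, PDRSHIFT is 22, so every 4 KB frame within one
 * superpage shares a single md_page and thus a single pv list:
 */
#if 0
	struct md_page *pvh;

	pvh = pa_to_pvh(0x00400000);		/* pv_table[1] */
	KASSERT(pvh == pa_to_pvh(0x007ff000),
	    ("frames in the same 4 MB superpage share one md_page"));
#endif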
179
180 /*
181 * Get PDEs and PTEs for user/kernel address space
182 */
183 #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
184 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
185
186 #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0)
187 #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0)
188 #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0)
189 #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0)
190 #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0)
191
192 #define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
193 atomic_clear_int((u_int *)(pte), PG_W))
194 #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
195
196 struct pmap kernel_pmap_store;
197 LIST_HEAD(pmaplist, pmap);
198 static struct pmaplist allpmaps;
199 static struct mtx allpmaps_lock;
200
201 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
202 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
203 int pgeflag = 0; /* PG_G or-in */
204 int pseflag = 0; /* PG_PS or-in */
205
206 static int nkpt;
207 vm_offset_t kernel_vm_end;
208 extern u_int32_t KERNend;
209 extern u_int32_t KPTphys;
210
211 #ifdef PAE
212 pt_entry_t pg_nx;
213 static uma_zone_t pdptzone;
214 #endif
215
216 SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
217
218 static int pg_ps_enabled;
219 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
220 "Are large page mappings enabled?");
221
222 /*
223 * Data for the pv entry allocation mechanism
224 */
225 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
226 static struct md_page *pv_table;
227 static int shpgperproc = PMAP_SHPGPERPROC;
228
229 struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */
230 int pv_maxchunks; /* How many chunks we have KVA for */
231 vm_offset_t pv_vafree; /* freelist stored in the PTE */
232
233 /*
234 * All those kernel PT submaps that BSD is so fond of
235 */
236 struct sysmaps {
237 struct mtx lock;
238 pt_entry_t *CMAP1;
239 pt_entry_t *CMAP2;
240 caddr_t CADDR1;
241 caddr_t CADDR2;
242 };
243 static struct sysmaps sysmaps_pcpu[MAXCPU];
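
/*
 * Sketch of how these per-CPU mapping windows are typically used (an
 * assumed pattern after the style of pmap_zero_page(), which appears
 * later in this file; 'm' is a hypothetical vm_page_t):
 */
#if 0
	struct sysmaps *sysmaps;

	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
	mtx_lock(&sysmaps->lock);
	sched_pin();				/* stay on this CPU's window */
	*sysmaps->CMAP1 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
	invlcaddr(sysmaps->CADDR1);		/* flush any stale TLB entry */
	bzero(sysmaps->CADDR1, PAGE_SIZE);	/* touch the physical page */
	*sysmaps->CMAP1 = 0;
	sched_unpin();
	mtx_unlock(&sysmaps->lock);
#endif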
244 pt_entry_t *CMAP1 = 0;
245 static pt_entry_t *CMAP3;
246 static pd_entry_t *KPTD;
247 caddr_t CADDR1 = 0, ptvmmap = 0;
248 static caddr_t CADDR3;
249 struct msgbuf *msgbufp = 0;
250
251 /*
252 * Crashdump maps.
253 */
254 static caddr_t crashdumpmap;
255
256 static pt_entry_t *PMAP1 = 0, *PMAP2;
257 static pt_entry_t *PADDR1 = 0, *PADDR2;
258 #ifdef SMP
259 static int PMAP1cpu;
260 static int PMAP1changedcpu;
261 SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
262 &PMAP1changedcpu, 0,
263 "Number of times pmap_pte_quick changed CPU with same PMAP1");
264 #endif
265 static int PMAP1changed;
266 SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
267 &PMAP1changed, 0,
268 "Number of times pmap_pte_quick changed PMAP1");
269 static int PMAP1unchanged;
270 SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
271 &PMAP1unchanged, 0,
272 "Number of times pmap_pte_quick didn't change PMAP1");
273 static struct mtx PMAP2mutex;
274
275 static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
276 static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
277 static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
278 static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
279 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
280 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
281 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
282 vm_offset_t va);
283
284 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
285 static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
286 vm_prot_t prot);
287 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
288 vm_page_t m, vm_prot_t prot, vm_page_t mpte);
289 static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
290 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
291 static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
292 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
293 static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
294 static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
295 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
296 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
297 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
298 vm_prot_t prot);
299 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
300 static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
301 vm_page_t *free);
302 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
303 vm_page_t *free);
304 static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
305 static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
306 vm_page_t *free);
307 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
308 vm_offset_t va);
309 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
310 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
311 vm_page_t m);
312 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
313 pd_entry_t newpde);
314 static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
315
316 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
317
318 static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags);
319 static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free);
320 static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
321 static void pmap_pte_release(pt_entry_t *pte);
322 static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *);
323 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
324 #ifdef PAE
325 static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
326 #endif
327
328 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
329 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
330
331 /*
332 * If you get an error here, then you set KVA_PAGES wrong! See the
333 * description of KVA_PAGES in sys/i386/include/pmap.h. It must be
334 * a multiple of 4 for a normal kernel, or a multiple of 8 for a PAE kernel.
335 */
336 CTASSERT(KERNBASE % (1 << 24) == 0);
337
338 /*
339 * Move the kernel virtual free pointer to the next
340 * 4MB. This is used to help improve performance
341 * by using a large (4MB) page for much of the kernel
342 * (.text, .data, .bss)
343 */
344 static vm_offset_t
345 pmap_kmem_choose(vm_offset_t addr)
346 {
347 vm_offset_t newaddr = addr;
348
349 #ifndef DISABLE_PSE
350 if (cpu_feature & CPUID_PSE)
351 newaddr = (addr + PDRMASK) & ~PDRMASK;
352 #endif
353 return newaddr;
354 }
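
/*
 * For example: with 4 MB pages PDRMASK is 0x3fffff, so an addr of
 * 0x00c01234 becomes (0x00c01234 + 0x3fffff) & ~0x3fffff == 0x01000000,
 * i.e. the next 4 MB boundary.
 */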
355
356 /*
357 * Bootstrap the system enough to run with virtual memory.
358 *
359 * On the i386 this is called after mapping has already been enabled
360 * and just syncs the pmap module with what has already been done.
361 * [We can't call it easily with mapping off since the kernel is not
362 * mapped with PA == VA, hence we would have to relocate every address
363 * from the linked base (virtual) address "KERNBASE" to the actual
364 * (physical) address starting relative to 0]
365 */
366 void
367 pmap_bootstrap(vm_paddr_t firstaddr)
368 {
369 vm_offset_t va;
370 pt_entry_t *pte, *unused;
371 struct sysmaps *sysmaps;
372 int i;
373
374 /*
375 * Initialize the first available kernel virtual address. However,
376 * using "firstaddr" may waste a few pages of the kernel virtual
377 * address space, because locore may not have mapped every physical
378 * page that it allocated. Preferably, locore would provide a first
379 * unused virtual address in addition to "firstaddr".
380 */
381 virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
382 virtual_avail = pmap_kmem_choose(virtual_avail);
383
384 virtual_end = VM_MAX_KERNEL_ADDRESS;
385
386 /*
387 * Initialize the kernel pmap (which is statically allocated).
388 */
389 PMAP_LOCK_INIT(kernel_pmap);
390 kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
391 #ifdef PAE
392 kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
393 #endif
394 kernel_pmap->pm_root = NULL;
395 kernel_pmap->pm_active = -1; /* don't allow deactivation */
396 TAILQ_INIT(&kernel_pmap->pm_pvchunk);
397 LIST_INIT(&allpmaps);
398
399 /*
400 * Request a spin mutex so that changes to allpmaps cannot be
401 * preempted by smp_rendezvous_cpus(). Otherwise,
402 * pmap_update_pde_kernel() could access allpmaps while it is
403 * being changed.
404 */
405 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
406 mtx_lock_spin(&allpmaps_lock);
407 LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
408 mtx_unlock_spin(&allpmaps_lock);
409 nkpt = NKPT;
410
411 /*
412 * Reserve some special page table entries/VA space for temporary
413 * mapping of pages.
414 */
415 #define SYSMAP(c, p, v, n) \
416 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
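
/*
 * For example, SYSMAP(caddr_t, CMAP1, CADDR1, 1) expands (roughly) to:
 *
 *	CADDR1 = (caddr_t)va; va += 1 * PAGE_SIZE;
 *	CMAP1 = pte; pte += 1;
 *
 * i.e. it carves pages out of the bootstrap VA cursor and records the
 * PTE(s) that map them.
 */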
417
418 va = virtual_avail;
419 pte = vtopte(va);
420
421 /*
422 * CMAP1/CMAP2 are used for zeroing and copying pages.
423 * CMAP3 is used for the idle process page zeroing.
424 */
425 for (i = 0; i < MAXCPU; i++) {
426 sysmaps = &sysmaps_pcpu[i];
427 mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
428 SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
429 SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
430 }
431 SYSMAP(caddr_t, CMAP1, CADDR1, 1)
432 SYSMAP(caddr_t, CMAP3, CADDR3, 1)
433
434 /*
435 * Crashdump maps.
436 */
437 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
438
439 /*
440 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
441 */
442 SYSMAP(caddr_t, unused, ptvmmap, 1)
443
444 /*
445 * msgbufp is used to map the system message buffer.
446 */
447 SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))
448
449 /*
450 * KPTmap is used by pmap_kextract().
451 *
452 * KPTmap is first initialized by locore. However, that initial
453 * KPTmap can only support NKPT page table pages. Here, a larger
454 * KPTmap is created that can support KVA_PAGES page table pages.
455 */
456 SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)
457
458 for (i = 0; i < NKPT; i++)
459 KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;
460
461 /*
462 * Adjust the start of the KPTD and KPTmap so that the implementation
463 * of pmap_kextract() and pmap_growkernel() can be made simpler.
464 */
465 KPTD -= KPTDI;
466 KPTmap -= i386_btop(KPTDI << PDRSHIFT);
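
/*
 * After this adjustment a kernel VA can index KPTmap directly; a
 * minimal sketch of the lookup that pmap_kextract() is assumed to
 * perform for small mappings:
 *
 *	pt_entry_t pte = KPTmap[i386_btop(va)];
 *	vm_paddr_t pa = (pte & PG_FRAME) | (va & PAGE_MASK);
 */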
467
468 /*
469 * ptemap is used for pmap_pte_quick
470 */
471 SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
472 SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)
473
474 mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
475
476 virtual_avail = va;
477
478 /*
479 * Leave in place an identity mapping (virt == phys) for the low 1 MB
480 * physical memory region that is used by the ACPI wakeup code. This
481 * mapping must not have PG_G set.
482 */
483 #ifdef XBOX
484 /* FIXME: This is gross, but needed for the XBOX. Since we are in such
485 * an early stage, we cannot yet neatly map video memory ... :-(
486 * Better fixes are very welcome! */
487 if (!arch_i386_is_xbox)
488 #endif
489 for (i = 1; i < NKPT; i++)
490 PTD[i] = 0;
491
492 /* Initialize the PAT MSR if present. */
493 pmap_init_pat();
494
495 /* Turn on PG_G on kernel page(s) */
496 pmap_set_pg();
497 }
498
499 /*
500 * Setup the PAT MSR.
501 */
502 void
503 pmap_init_pat(void)
504 {
505 uint64_t pat_msr;
506
507 /* Bail if this CPU doesn't implement PAT. */
508 if (!(cpu_feature & CPUID_PAT))
509 return;
510
511 #ifdef PAT_WORKS
512 /*
513 * Leave the indices 0-3 at the default of WB, WT, UC, and UC-.
514 * Program 4 and 5 as WP and WC.
515 * Leave 6 and 7 as UC and UC-.
516 */
517 pat_msr = rdmsr(MSR_PAT);
518 pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5));
519 pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) |
520 PAT_VALUE(5, PAT_WRITE_COMBINING);
521 #else
522 /*
523 * Due to some Intel errata, we can only safely use the lower 4
524 * PAT entries. Thus, just replace PAT Index 2 with WC instead
525 * of UC-.
526 *
527 * Intel Pentium III Processor Specification Update
528 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
529 * or Mode C Paging)
530 *
531 * Intel Pentium IV Processor Specification Update
532 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
533 */
534 pat_msr = rdmsr(MSR_PAT);
535 pat_msr &= ~PAT_MASK(2);
536 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
537 #endif
538 wrmsr(MSR_PAT, pat_msr);
539 }
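
/*
 * On the default (!PAT_WORKS) path above the PAT ends up as:
 *
 *	index:	0	1	2	3
 *	type:	WB	WT	WC	UC
 *
 * with only index 2 changed from its power-on value of UC-; indices 4-7
 * keep their power-on values and are never used by pmap_cache_bits().
 */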
540
541 /*
542 * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on.
543 */
544 void
545 pmap_set_pg(void)
546 {
547 pt_entry_t *pte;
548 vm_offset_t va, endva;
549
550 if (pgeflag == 0)
551 return;
552
553 endva = KERNBASE + KERNend;
554
555 if (pseflag) {
556 va = KERNBASE + KERNLOAD;
557 while (va < endva) {
558 pdir_pde(PTD, va) |= pgeflag;
559 invltlb(); /* Play it safe, invltlb() every time */
560 va += NBPDR;
561 }
562 } else {
563 va = (vm_offset_t)btext;
564 while (va < endva) {
565 pte = vtopte(va);
566 if (*pte)
567 *pte |= pgeflag;
568 invltlb(); /* Play it safe, invltlb() every time */
569 va += PAGE_SIZE;
570 }
571 }
572 }
573
574 /*
575 * Initialize a vm_page's machine-dependent fields.
576 */
577 void
578 pmap_page_init(vm_page_t m)
579 {
580
581 TAILQ_INIT(&m->md.pv_list);
582 m->md.pat_mode = PAT_WRITE_BACK;
583 }
584
585 #ifdef PAE
586 static void *
587 pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
588 {
589
590 /* Inform UMA that this allocator uses kernel_map/object. */
591 *flags = UMA_SLAB_KERNEL;
592 return ((void *)kmem_alloc_contig(kernel_map, bytes, wait, 0x0ULL,
593 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
594 }
595 #endif
596
597 /*
598 * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
599 * Requirements:
600 * - Must deal with pages in order to ensure that none of the PG_* bits
601 * are ever set, PG_V in particular.
602 * - Assumes we can write to ptes without pte_store() atomic ops, even
603 * on PAE systems. This should be ok.
604 * - Assumes nothing will ever test these addresses for 0 to indicate
605 * no mapping instead of correctly checking PG_V.
606 * - Assumes a vm_offset_t will fit in a pte (true for i386).
607 * Because PG_V is never set, there can be no mappings to invalidate.
608 */
609 static vm_offset_t
610 pmap_ptelist_alloc(vm_offset_t *head)
611 {
612 pt_entry_t *pte;
613 vm_offset_t va;
614
615 va = *head;
616 if (va == 0)
617 return (va); /* Out of memory */
618 pte = vtopte(va);
619 *head = *pte;
620 if (*head & PG_V)
621 panic("pmap_ptelist_alloc: va with PG_V set!");
622 *pte = 0;
623 return (va);
624 }
625
626 static void
627 pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
628 {
629 pt_entry_t *pte;
630
631 if (va & PG_V)
632 panic("pmap_ptelist_free: freeing va with PG_V set!");
633 pte = vtopte(va);
634 *pte = *head; /* virtual! PG_V is 0 though */
635 *head = va;
636 }
637
638 static void
639 pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
640 {
641 int i;
642 vm_offset_t va;
643
644 *head = 0;
645 for (i = npages - 1; i >= 0; i--) {
646 va = (vm_offset_t)base + i * PAGE_SIZE;
647 pmap_ptelist_free(head, va);
648 }
649 }
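
/*
 * A minimal usage sketch of the freelist above (the real consumer is
 * the pv chunk allocator set up in pmap_init() below):
 */
#if 0
	vm_offset_t va;

	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
	va = pmap_ptelist_alloc(&pv_vafree);	/* take one page of KVA */
	if (va == 0)
		panic("out of pv chunk KVA");
	/* ... map a pv chunk page at va with pmap_qenter(), use it ... */
	pmap_ptelist_free(&pv_vafree, va);	/* thread it back on the list */
#endif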
650
651
652 /*
653 * Initialize the pmap module.
654 * Called by vm_init, to initialize any structures that the pmap
655 * system needs to map virtual memory.
656 */
657 void
658 pmap_init(void)
659 {
660 vm_page_t mpte;
661 vm_size_t s;
662 int i, pv_npg;
663
664 /*
665 * Initialize the vm page array entries for the kernel pmap's
666 * page table pages.
667 */
668 for (i = 0; i < NKPT; i++) {
669 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
670 KASSERT(mpte >= vm_page_array &&
671 mpte < &vm_page_array[vm_page_array_size],
672 ("pmap_init: page table page is out of range"));
673 mpte->pindex = i + KPTDI;
674 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
675 }
676
677 /*
678 * Initialize the address space (zone) for the pv entries. Set a
679 * high water mark so that the system can recover from excessive
680 * numbers of pv entries.
681 */
682 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
683 pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
684 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
685 pv_entry_max = roundup(pv_entry_max, _NPCPV);
686 pv_entry_high_water = 9 * (pv_entry_max / 10);
687
688 /*
689 * If the kernel is running in a virtual machine on an AMD Family 10h
690 * processor, then it must assume that MCA is enabled by the virtual
691 * machine monitor.
692 */
693 if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD &&
694 CPUID_TO_FAMILY(cpu_id) == 0x10)
695 workaround_erratum383 = 1;
696
697 /*
698 * Are large page mappings supported and enabled?
699 */
700 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
701 if (pseflag == 0)
702 pg_ps_enabled = 0;
703 else if (pg_ps_enabled) {
704 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
705 ("pmap_init: can't assign to pagesizes[1]"));
706 pagesizes[1] = NBPDR;
707 }
708
709 /*
710 * Calculate the size of the pv head table for superpages.
711 */
712 for (i = 0; phys_avail[i + 1]; i += 2);
713 pv_npg = round_4mpage(phys_avail[(i - 2) + 1]) / NBPDR;
714
715 /*
716 * Allocate memory for the pv head table for superpages.
717 */
718 s = (vm_size_t)(pv_npg * sizeof(struct md_page));
719 s = round_page(s);
720 pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
721 for (i = 0; i < pv_npg; i++)
722 TAILQ_INIT(&pv_table[i].pv_list);
723
724 pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
725 pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map,
726 PAGE_SIZE * pv_maxchunks);
727 if (pv_chunkbase == NULL)
728 panic("pmap_init: not enough kvm for pv chunks");
729 pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
730 #ifdef PAE
731 pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
732 NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
733 UMA_ZONE_VM | UMA_ZONE_NOFREE);
734 uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
735 #endif
736 }
737
738
739 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
740 "Max number of PV entries");
741 SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
742 "Page share factor per proc");
743
744 SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
745 "2/4MB page mapping counters");
746
747 static u_long pmap_pde_demotions;
748 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
749 &pmap_pde_demotions, 0, "2/4MB page demotions");
750
751 static u_long pmap_pde_mappings;
752 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
753 &pmap_pde_mappings, 0, "2/4MB page mappings");
754
755 static u_long pmap_pde_p_failures;
756 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
757 &pmap_pde_p_failures, 0, "2/4MB page promotion failures");
758
759 static u_long pmap_pde_promotions;
760 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
761 &pmap_pde_promotions, 0, "2/4MB page promotions");
762
763 /***************************************************
764 * Low level helper routines.....
765 ***************************************************/
766
767 /*
768 * Determine the appropriate bits to set in a PTE or PDE for a specified
769 * caching mode.
770 */
771 int
772 pmap_cache_bits(int mode, boolean_t is_pde)
773 {
774 int pat_flag, pat_index, cache_bits;
775
776 /* The PAT bit is different for PTE's and PDE's. */
777 pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
778
779 /* If we don't support PAT, map extended modes to older ones. */
780 if (!(cpu_feature & CPUID_PAT)) {
781 switch (mode) {
782 case PAT_UNCACHEABLE:
783 case PAT_WRITE_THROUGH:
784 case PAT_WRITE_BACK:
785 break;
786 case PAT_UNCACHED:
787 case PAT_WRITE_COMBINING:
788 case PAT_WRITE_PROTECTED:
789 mode = PAT_UNCACHEABLE;
790 break;
791 }
792 }
793
794 /* Map the caching mode to a PAT index. */
795 switch (mode) {
796 #ifdef PAT_WORKS
797 case PAT_UNCACHEABLE:
798 pat_index = 3;
799 break;
800 case PAT_WRITE_THROUGH:
801 pat_index = 1;
802 break;
803 case PAT_WRITE_BACK:
804 pat_index = 0;
805 break;
806 case PAT_UNCACHED:
807 pat_index = 2;
808 break;
809 case PAT_WRITE_COMBINING:
810 pat_index = 5;
811 break;
812 case PAT_WRITE_PROTECTED:
813 pat_index = 4;
814 break;
815 #else
816 case PAT_UNCACHED:
817 case PAT_UNCACHEABLE:
818 case PAT_WRITE_PROTECTED:
819 pat_index = 3;
820 break;
821 case PAT_WRITE_THROUGH:
822 pat_index = 1;
823 break;
824 case PAT_WRITE_BACK:
825 pat_index = 0;
826 break;
827 case PAT_WRITE_COMBINING:
828 pat_index = 2;
829 break;
830 #endif
831 default:
832 panic("Unknown caching mode %d\n", mode);
833 }
834
835 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
836 cache_bits = 0;
837 if (pat_index & 0x4)
838 cache_bits |= pat_flag;
839 if (pat_index & 0x2)
840 cache_bits |= PG_NC_PCD;
841 if (pat_index & 0x1)
842 cache_bits |= PG_NC_PWT;
843 return (cache_bits);
844 }
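
/*
 * For example, with the default (!PAT_WORKS) table above,
 * PAT_WRITE_COMBINING maps to pat_index 2 (binary 010), so the entry
 * gets PG_NC_PCD set and neither PG_NC_PWT nor the PAT bit.
 */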
845
846 /*
847 * The caller is responsible for maintaining TLB consistency.
848 */
849 static void
850 pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
851 {
852 pd_entry_t *pde;
853 pmap_t pmap;
854 boolean_t PTD_updated;
855
856 PTD_updated = FALSE;
857 mtx_lock_spin(&allpmaps_lock);
858 LIST_FOREACH(pmap, &allpmaps, pm_list) {
859 if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
860 PG_FRAME))
861 PTD_updated = TRUE;
862 pde = pmap_pde(pmap, va);
863 pde_store(pde, newpde);
864 }
865 mtx_unlock_spin(&allpmaps_lock);
866 KASSERT(PTD_updated,
867 ("pmap_kenter_pde: current page table is not in allpmaps"));
868 }
869
870 /*
871 * After changing the page size for the specified virtual address in the page
872 * table, flush the corresponding entries from the processor's TLB. Only the
873 * calling processor's TLB is affected.
874 *
875 * The calling thread must be pinned to a processor.
876 */
877 static void
878 pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
879 {
880 u_long cr4;
881
882 if ((newpde & PG_PS) == 0)
883 /* Demotion: flush a specific 2MB page mapping. */
884 invlpg(va);
885 else if ((newpde & PG_G) == 0)
886 /*
887 * Promotion: flush every 4KB page mapping from the TLB
888 * because there are too many to flush individually.
889 */
890 invltlb();
891 else {
892 /*
893 * Promotion: flush every 4KB page mapping from the TLB,
894 * including any global (PG_G) mappings.
895 */
896 cr4 = rcr4();
897 load_cr4(cr4 & ~CR4_PGE);
898 /*
899 * Although preemption at this point could be detrimental to
900 * performance, it would not lead to an error. PG_G is simply
901 * ignored if CR4.PGE is clear. Moreover, in case this block
902 * is re-entered, the load_cr4() either above or below will
903 * modify CR4.PGE flushing the TLB.
904 */
905 load_cr4(cr4 | CR4_PGE);
906 }
907 }
908 #ifdef SMP
909 /*
910 * For SMP, these functions have to use the IPI mechanism for coherence.
911 *
912 * N.B.: Before calling any of the following TLB invalidation functions,
913 * the calling processor must ensure that all stores updating a non-
914 * kernel page table are globally performed. Otherwise, another
915 * processor could cache an old, pre-update entry without being
916 * invalidated. This can happen one of two ways: (1) The pmap becomes
917 * active on another processor after its pm_active field is checked by
918 * one of the following functions but before a store updating the page
919 * table is globally performed. (2) The pmap becomes active on another
920 * processor before its pm_active field is checked but due to
921 * speculative loads one of the following functions still reads the
922 * pmap as inactive on the other processor.
923 *
924 * The kernel page table is exempt because its pm_active field is
925 * immutable. The kernel page table is always active on every
926 * processor.
927 */
928 void
929 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
930 {
931 u_int cpumask;
932 u_int other_cpus;
933
934 sched_pin();
935 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
936 invlpg(va);
937 smp_invlpg(va);
938 } else {
939 cpumask = PCPU_GET(cpumask);
940 other_cpus = PCPU_GET(other_cpus);
941 if (pmap->pm_active & cpumask)
942 invlpg(va);
943 if (pmap->pm_active & other_cpus)
944 smp_masked_invlpg(pmap->pm_active & other_cpus, va);
945 }
946 sched_unpin();
947 }
948
949 void
950 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
951 {
952 u_int cpumask;
953 u_int other_cpus;
954 vm_offset_t addr;
955
956 sched_pin();
957 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
958 for (addr = sva; addr < eva; addr += PAGE_SIZE)
959 invlpg(addr);
960 smp_invlpg_range(sva, eva);
961 } else {
962 cpumask = PCPU_GET(cpumask);
963 other_cpus = PCPU_GET(other_cpus);
964 if (pmap->pm_active & cpumask)
965 for (addr = sva; addr < eva; addr += PAGE_SIZE)
966 invlpg(addr);
967 if (pmap->pm_active & other_cpus)
968 smp_masked_invlpg_range(pmap->pm_active & other_cpus,
969 sva, eva);
970 }
971 sched_unpin();
972 }
973
974 void
975 pmap_invalidate_all(pmap_t pmap)
976 {
977 u_int cpumask;
978 u_int other_cpus;
979
980 sched_pin();
981 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
982 invltlb();
983 smp_invltlb();
984 } else {
985 cpumask = PCPU_GET(cpumask);
986 other_cpus = PCPU_GET(other_cpus);
987 if (pmap->pm_active & cpumask)
988 invltlb();
989 if (pmap->pm_active & other_cpus)
990 smp_masked_invltlb(pmap->pm_active & other_cpus);
991 }
992 sched_unpin();
993 }
994
995 void
996 pmap_invalidate_cache(void)
997 {
998
999 sched_pin();
1000 wbinvd();
1001 smp_cache_flush();
1002 sched_unpin();
1003 }
1004
1005 struct pde_action {
1006 cpumask_t store; /* processor that updates the PDE */
1007 cpumask_t invalidate; /* processors that invalidate their TLB */
1008 vm_offset_t va;
1009 pd_entry_t *pde;
1010 pd_entry_t newpde;
1011 };
1012
1013 static void
1014 pmap_update_pde_kernel(void *arg)
1015 {
1016 struct pde_action *act = arg;
1017 pd_entry_t *pde;
1018 pmap_t pmap;
1019
1020 if (act->store == PCPU_GET(cpumask))
1021 /*
1022 * Elsewhere, this operation requires allpmaps_lock for
1023 * synchronization. Here, it does not because it is being
1024 * performed in the context of an all_cpus rendezvous.
1025 */
1026 LIST_FOREACH(pmap, &allpmaps, pm_list) {
1027 pde = pmap_pde(pmap, act->va);
1028 pde_store(pde, act->newpde);
1029 }
1030 }
1031
1032 static void
1033 pmap_update_pde_user(void *arg)
1034 {
1035 struct pde_action *act = arg;
1036
1037 if (act->store == PCPU_GET(cpumask))
1038 pde_store(act->pde, act->newpde);
1039 }
1040
1041 static void
1042 pmap_update_pde_teardown(void *arg)
1043 {
1044 struct pde_action *act = arg;
1045
1046 if ((act->invalidate & PCPU_GET(cpumask)) != 0)
1047 pmap_update_pde_invalidate(act->va, act->newpde);
1048 }
1049
1050 /*
1051 * Change the page size for the specified virtual address in a way that
1052 * prevents any possibility of the TLB ever having two entries that map the
1053 * same virtual address using different page sizes. This is the recommended
1054 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a
1055 * machine check exception for a TLB state that is improperly diagnosed as a
1056 * hardware error.
1057 */
1058 static void
1059 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1060 {
1061 struct pde_action act;
1062 cpumask_t active, cpumask;
1063
1064 sched_pin();
1065 cpumask = PCPU_GET(cpumask);
1066 if (pmap == kernel_pmap)
1067 active = all_cpus;
1068 else
1069 active = pmap->pm_active;
1070 if ((active & PCPU_GET(other_cpus)) != 0) {
1071 act.store = cpumask;
1072 act.invalidate = active;
1073 act.va = va;
1074 act.pde = pde;
1075 act.newpde = newpde;
1076 smp_rendezvous_cpus(cpumask | active,
1077 smp_no_rendevous_barrier, pmap == kernel_pmap ?
1078 pmap_update_pde_kernel : pmap_update_pde_user,
1079 pmap_update_pde_teardown, &act);
1080 } else {
1081 if (pmap == kernel_pmap)
1082 pmap_kenter_pde(va, newpde);
1083 else
1084 pde_store(pde, newpde);
1085 if ((active & cpumask) != 0)
1086 pmap_update_pde_invalidate(va, newpde);
1087 }
1088 sched_unpin();
1089 }
1090 #else /* !SMP */
1091 /*
1092 * Normal, non-SMP, 486+ invalidation functions.
1093 * We inline these within pmap.c for speed.
1094 */
1095 PMAP_INLINE void
1096 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1097 {
1098
1099 if (pmap == kernel_pmap || pmap->pm_active)
1100 invlpg(va);
1101 }
1102
1103 PMAP_INLINE void
1104 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1105 {
1106 vm_offset_t addr;
1107
1108 if (pmap == kernel_pmap || pmap->pm_active)
1109 for (addr = sva; addr < eva; addr += PAGE_SIZE)
1110 invlpg(addr);
1111 }
1112
1113 PMAP_INLINE void
1114 pmap_invalidate_all(pmap_t pmap)
1115 {
1116
1117 if (pmap == kernel_pmap || pmap->pm_active)
1118 invltlb();
1119 }
1120
1121 PMAP_INLINE void
1122 pmap_invalidate_cache(void)
1123 {
1124
1125 wbinvd();
1126 }
1127
1128 static void
1129 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1130 {
1131
1132 if (pmap == kernel_pmap)
1133 pmap_kenter_pde(va, newpde);
1134 else
1135 pde_store(pde, newpde);
1136 if (pmap == kernel_pmap || pmap->pm_active)
1137 pmap_update_pde_invalidate(va, newpde);
1138 }
1139 #endif /* !SMP */
1140
1141 void
1142 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
1143 {
1144
1145 KASSERT((sva & PAGE_MASK) == 0,
1146 ("pmap_invalidate_cache_range: sva not page-aligned"));
1147 KASSERT((eva & PAGE_MASK) == 0,
1148 ("pmap_invalidate_cache_range: eva not page-aligned"));
1149
1150 if (cpu_feature & CPUID_SS)
1151 ; /* If "Self Snoop" is supported, do nothing. */
1152 else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1153 eva - sva < 2 * 1024 * 1024) {
1154
1155 /*
1156 * Otherwise, do a per-cache-line flush. Use the mfence
1157 * instruction to ensure that previous stores are
1158 * included in the write-back. The processor
1159 * propagates the flush to other processors in the cache
1160 * coherence domain.
1161 */
1162 mfence();
1163 for (; sva < eva; sva += cpu_clflush_line_size)
1164 clflush(sva);
1165 mfence();
1166 } else {
1167
1168 /*
1169 * No targeted cache flush methods are supported by the CPU,
1170 * or the supplied range is bigger than 2MB.
1171 * Globally invalidate cache.
1172 */
1173 pmap_invalidate_cache();
1174 }
1175 }
1176
1177 /*
1178 * Are we current address space or kernel? N.B. We return FALSE when
1179 * a pmap's page table is in use because a kernel thread is borrowing
1180 * it. The borrowed page table can change spontaneously, making any
1181 * dependence on its continued use subject to a race condition.
1182 */
1183 static __inline int
1184 pmap_is_current(pmap_t pmap)
1185 {
1186
1187 return (pmap == kernel_pmap ||
1188 (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
1189 (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
1190 }
1191
1192 /*
1193 * If the given pmap is not the current or kernel pmap, the returned pte must
1194 * be released by passing it to pmap_pte_release().
1195 */
1196 pt_entry_t *
1197 pmap_pte(pmap_t pmap, vm_offset_t va)
1198 {
1199 pd_entry_t newpf;
1200 pd_entry_t *pde;
1201
1202 pde = pmap_pde(pmap, va);
1203 if (*pde & PG_PS)
1204 return (pde);
1205 if (*pde != 0) {
1206 /* are we current address space or kernel? */
1207 if (pmap_is_current(pmap))
1208 return (vtopte(va));
1209 mtx_lock(&PMAP2mutex);
1210 newpf = *pde & PG_FRAME;
1211 if ((*PMAP2 & PG_FRAME) != newpf) {
1212 *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
1213 pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
1214 }
1215 return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
1216 }
1217 return (0);
1218 }
1219
1220 /*
1221 * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte
1222 * being NULL.
1223 */
1224 static __inline void
1225 pmap_pte_release(pt_entry_t *pte)
1226 {
1227
1228 if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
1229 mtx_unlock(&PMAP2mutex);
1230 }
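
/*
 * The usual pairing, as in pmap_extract() below ('pmap' and 'va' stand
 * for the caller's arguments):
 */
#if 0
	pt_entry_t *pte;
	vm_paddr_t pa;

	pte = pmap_pte(pmap, va);	/* may borrow PMAP2/PADDR2 */
	pa = (*pte & PG_FRAME) | (va & PAGE_MASK);
	pmap_pte_release(pte);		/* drops PMAP2mutex if it was taken */
#endif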
1231
1232 static __inline void
1233 invlcaddr(void *caddr)
1234 {
1235
1236 invlpg((u_int)caddr);
1237 }
1238
1239 /*
1240 * Super fast pmap_pte routine best used when scanning
1241 * the pv lists. This eliminates many coarse-grained
1242 * invltlb calls. Note that many of the pv list
1243 * scans are across different pmaps. It is very wasteful
1244 * to do an entire invltlb for checking a single mapping.
1245 *
1246 * If the given pmap is not the current pmap, vm_page_queue_mtx
1247 * must be held and curthread pinned to a CPU.
1248 */
1249 static pt_entry_t *
1250 pmap_pte_quick(pmap_t pmap, vm_offset_t va)
1251 {
1252 pd_entry_t newpf;
1253 pd_entry_t *pde;
1254
1255 pde = pmap_pde(pmap, va);
1256 if (*pde & PG_PS)
1257 return (pde);
1258 if (*pde != 0) {
1259 /* are we current address space or kernel? */
1260 if (pmap_is_current(pmap))
1261 return (vtopte(va));
1262 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1263 KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1264 newpf = *pde & PG_FRAME;
1265 if ((*PMAP1 & PG_FRAME) != newpf) {
1266 *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
1267 #ifdef SMP
1268 PMAP1cpu = PCPU_GET(cpuid);
1269 #endif
1270 invlcaddr(PADDR1);
1271 PMAP1changed++;
1272 } else
1273 #ifdef SMP
1274 if (PMAP1cpu != PCPU_GET(cpuid)) {
1275 PMAP1cpu = PCPU_GET(cpuid);
1276 invlcaddr(PADDR1);
1277 PMAP1changedcpu++;
1278 } else
1279 #endif
1280 PMAP1unchanged++;
1281 return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
1282 }
1283 return (0);
1284 }
1285
1286 /*
1287 * Routine: pmap_extract
1288 * Function:
1289 * Extract the physical page address associated
1290 * with the given map/virtual_address pair.
1291 */
1292 vm_paddr_t
1293 pmap_extract(pmap_t pmap, vm_offset_t va)
1294 {
1295 vm_paddr_t rtval;
1296 pt_entry_t *pte;
1297 pd_entry_t pde;
1298
1299 rtval = 0;
1300 PMAP_LOCK(pmap);
1301 pde = pmap->pm_pdir[va >> PDRSHIFT];
1302 if (pde != 0) {
1303 if ((pde & PG_PS) != 0)
1304 rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
1305 else {
1306 pte = pmap_pte(pmap, va);
1307 rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
1308 pmap_pte_release(pte);
1309 }
1310 }
1311 PMAP_UNLOCK(pmap);
1312 return (rtval);
1313 }
1314
1315 /*
1316 * Routine: pmap_extract_and_hold
1317 * Function:
1318 * Atomically extract and hold the physical page
1319 * with the given pmap and virtual address pair
1320 * if that mapping permits the given protection.
1321 */
1322 vm_page_t
1323 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1324 {
1325 pd_entry_t pde;
1326 pt_entry_t pte;
1327 vm_page_t m;
1328
1329 m = NULL;
1330 vm_page_lock_queues();
1331 PMAP_LOCK(pmap);
1332 pde = *pmap_pde(pmap, va);
1333 if (pde != 0) {
1334 if (pde & PG_PS) {
1335 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1336 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1337 (va & PDRMASK));
1338 vm_page_hold(m);
1339 }
1340 } else {
1341 sched_pin();
1342 pte = *pmap_pte_quick(pmap, va);
1343 if (pte != 0 &&
1344 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1345 m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1346 vm_page_hold(m);
1347 }
1348 sched_unpin();
1349 }
1350 }
1351 vm_page_unlock_queues();
1352 PMAP_UNLOCK(pmap);
1353 return (m);
1354 }
1355
1356 /***************************************************
1357 * Low level mapping routines.....
1358 ***************************************************/
1359
1360 /*
1361 * Add a wired page to the kva.
1362 * Note: not SMP coherent.
1363 *
1364 * This function may be used before pmap_bootstrap() is called.
1365 */
1366 PMAP_INLINE void
1367 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1368 {
1369 pt_entry_t *pte;
1370
1371 pte = vtopte(va);
1372 pte_store(pte, pa | PG_RW | PG_V | pgeflag);
1373 }
1374
1375 static __inline void
1376 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1377 {
1378 pt_entry_t *pte;
1379
1380 pte = vtopte(va);
1381 pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
1382 }
1383
1384 /*
1385 * Remove a page from the kernel pagetables.
1386 * Note: not SMP coherent.
1387 *
1388 * This function may be used before pmap_bootstrap() is called.
1389 */
1390 PMAP_INLINE void
1391 pmap_kremove(vm_offset_t va)
1392 {
1393 pt_entry_t *pte;
1394
1395 pte = vtopte(va);
1396 pte_clear(pte);
1397 }
1398
1399 /*
1400 * Used to map a range of physical addresses into kernel
1401 * virtual address space.
1402 *
1403 * The value passed in '*virt' is a suggested virtual address for
1404 * the mapping. Architectures which can support a direct-mapped
1405 * physical to virtual region can return the appropriate address
1406 * within that region, leaving '*virt' unchanged. Other
1407 * architectures should map the pages starting at '*virt' and
1408 * update '*virt' with the first usable address after the mapped
1409 * region.
1410 */
1411 vm_offset_t
1412 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1413 {
1414 vm_offset_t va, sva;
1415
1416 va = sva = *virt;
1417 while (start < end) {
1418 pmap_kenter(va, start);
1419 va += PAGE_SIZE;
1420 start += PAGE_SIZE;
1421 }
1422 pmap_invalidate_range(kernel_pmap, sva, va);
1423 *virt = va;
1424 return (sva);
1425 }
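
/*
 * A typical bootstrap-time use (illustrative sketch; 'phys_start' and
 * 'phys_end' are hypothetical physical bounds):
 */
#if 0
	vm_offset_t va, sva;

	va = virtual_avail;
	sva = pmap_map(&va, phys_start, phys_end,
	    VM_PROT_READ | VM_PROT_WRITE);
	virtual_avail = va;	/* 'va' was advanced past the mapping */
	/* 'sva' is now the KVA that maps 'phys_start' */
#endif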
1426
1427
1428 /*
1429 * Add a list of wired pages to the kva
1430 * this routine is only used for temporary
1431 * kernel mappings that do not need to have
1432 * page modification or references recorded.
1433 * Note that old mappings are simply written
1434 * over. The page *must* be wired.
1435 * Note: SMP coherent. Uses a ranged shootdown IPI.
1436 */
1437 void
1438 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1439 {
1440 pt_entry_t *endpte, oldpte, *pte;
1441
1442 oldpte = 0;
1443 pte = vtopte(sva);
1444 endpte = pte + count;
1445 while (pte < endpte) {
1446 oldpte |= *pte;
1447 pte_store(pte, VM_PAGE_TO_PHYS(*ma) | pgeflag |
1448 pmap_cache_bits((*ma)->md.pat_mode, 0) | PG_RW | PG_V);
1449 pte++;
1450 ma++;
1451 }
1452 if ((oldpte & PG_V) != 0)
1453 pmap_invalidate_range(kernel_pmap, sva, sva + count *
1454 PAGE_SIZE);
1455 }
1456
1457 /*
1458 * This routine tears out page mappings from the
1459 * kernel -- it is meant only for temporary mappings.
1460 * Note: SMP coherent. Uses a ranged shootdown IPI.
1461 */
1462 void
1463 pmap_qremove(vm_offset_t sva, int count)
1464 {
1465 vm_offset_t va;
1466
1467 va = sva;
1468 while (count-- > 0) {
1469 pmap_kremove(va);
1470 va += PAGE_SIZE;
1471 }
1472 pmap_invalidate_range(kernel_pmap, sva, va);
1473 }
1474
1475 /***************************************************
1476 * Page table page management routines.....
1477 ***************************************************/
1478 static __inline void
1479 pmap_free_zero_pages(vm_page_t free)
1480 {
1481 vm_page_t m;
1482
1483 while (free != NULL) {
1484 m = free;
1485 free = m->right;
1486 /* Preserve the page's PG_ZERO setting. */
1487 vm_page_free_toq(m);
1488 }
1489 }
1490
1491 /*
1492 * Schedule the specified unused page table page to be freed. Specifically,
1493 * add the page to the specified list of pages that will be released to the
1494 * physical memory manager after the TLB has been updated.
1495 */
1496 static __inline void
1497 pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
1498 {
1499
1500 if (set_PG_ZERO)
1501 m->flags |= PG_ZERO;
1502 else
1503 m->flags &= ~PG_ZERO;
1504 m->right = *free;
1505 *free = m;
1506 }
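
/*
 * These two helpers implement a deferred-free pattern (sketch; see
 * _pmap_unwire_pte_hold() and the remove routines below for real use):
 */
#if 0
	vm_page_t free = NULL;

	/*
	 * Clear PTEs, calling pmap_add_delayed_free_list(mpte, &free,
	 * TRUE) for each page table page that becomes unused ...
	 */
	pmap_invalidate_page(pmap, va);	/* complete the TLB shootdown */
	pmap_free_zero_pages(free);	/* only now release the pages */
#endif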
1507
1508 /*
1509 * Inserts the specified page table page into the specified pmap's collection
1510 * of idle page table pages. Each of a pmap's page table pages is responsible
1511 * for mapping a distinct range of virtual addresses. The pmap's collection is
1512 * ordered by this virtual address range.
1513 */
1514 static void
1515 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1516 {
1517 vm_page_t root;
1518
1519 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1520 root = pmap->pm_root;
1521 if (root == NULL) {
1522 mpte->left = NULL;
1523 mpte->right = NULL;
1524 } else {
1525 root = vm_page_splay(mpte->pindex, root);
1526 if (mpte->pindex < root->pindex) {
1527 mpte->left = root->left;
1528 mpte->right = root;
1529 root->left = NULL;
1530 } else if (mpte->pindex == root->pindex)
1531 panic("pmap_insert_pt_page: pindex already inserted");
1532 else {
1533 mpte->right = root->right;
1534 mpte->left = root;
1535 root->right = NULL;
1536 }
1537 }
1538 pmap->pm_root = mpte;
1539 }
1540
1541 /*
1542 * Looks for a page table page mapping the specified virtual address in the
1543 * specified pmap's collection of idle page table pages. Returns NULL if there
1544 * is no page table page corresponding to the specified virtual address.
1545 */
1546 static vm_page_t
1547 pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
1548 {
1549 vm_page_t mpte;
1550 vm_pindex_t pindex = va >> PDRSHIFT;
1551
1552 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1553 if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
1554 mpte = vm_page_splay(pindex, mpte);
1555 if ((pmap->pm_root = mpte)->pindex != pindex)
1556 mpte = NULL;
1557 }
1558 return (mpte);
1559 }
1560
1561 /*
1562 * Removes the specified page table page from the specified pmap's collection
1563 * of idle page table pages. The specified page table page must be a member of
1564 * the pmap's collection.
1565 */
1566 static void
1567 pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
1568 {
1569 vm_page_t root;
1570
1571 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1572 if (mpte != pmap->pm_root)
1573 vm_page_splay(mpte->pindex, pmap->pm_root);
1574 if (mpte->left == NULL)
1575 root = mpte->right;
1576 else {
1577 root = vm_page_splay(mpte->pindex, mpte->left);
1578 root->right = mpte->right;
1579 }
1580 pmap->pm_root = root;
1581 }
1582
1583 /*
1584 * This routine decrements a page table page's wire count; when the
1585 * count drops to zero, the page table page is unmapped and freed.
1586 */
1587 static __inline int
1588 pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free)
1589 {
1590
1591 --m->wire_count;
1592 if (m->wire_count == 0)
1593 return _pmap_unwire_pte_hold(pmap, m, free);
1594 else
1595 return 0;
1596 }
1597
1598 static int
1599 _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free)
1600 {
1601 vm_offset_t pteva;
1602
1603 /*
1604 * unmap the page table page
1605 */
1606 pmap->pm_pdir[m->pindex] = 0;
1607 --pmap->pm_stats.resident_count;
1608
1609 /*
1610 * This is a release store so that the ordinary store unmapping
1611 * the page table page is globally performed before TLB shoot-
1612 * down is begun.
1613 */
1614 atomic_subtract_rel_int(&cnt.v_wire_count, 1);
1615
1616 /*
1617 * Do an invltlb to make the invalidated mapping
1618 * take effect immediately.
1619 */
1620 pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1621 pmap_invalidate_page(pmap, pteva);
1622
1623 /*
1624 * Put page on a list so that it is released after
1625 * *ALL* TLB shootdown is done
1626 */
1627 pmap_add_delayed_free_list(m, free, TRUE);
1628
1629 return 1;
1630 }
1631
1632 /*
1633 * After removing a page table entry, this routine is used to
1634 * conditionally free the page, and manage the hold/wire counts.
1635 */
1636 static int
1637 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free)
1638 {
1639 pd_entry_t ptepde;
1640 vm_page_t mpte;
1641
1642 if (va >= VM_MAXUSER_ADDRESS)
1643 return 0;
1644 ptepde = *pmap_pde(pmap, va);
1645 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1646 return pmap_unwire_pte_hold(pmap, mpte, free);
1647 }
1648
1649 /*
1650 * Initialize the pmap for the swapper process.
1651 */
1652 void
1653 pmap_pinit0(pmap_t pmap)
1654 {
1655
1656 PMAP_LOCK_INIT(pmap);
1657 /*
1658 * Since the page table directory is shared with the kernel pmap,
1659 * which is already included in the list "allpmaps", this pmap does
1660 * not need to be inserted into that list.
1661 */
1662 pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1663 #ifdef PAE
1664 pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1665 #endif
1666 pmap->pm_root = NULL;
1667 pmap->pm_active = 0;
1668 PCPU_SET(curpmap, pmap);
1669 TAILQ_INIT(&pmap->pm_pvchunk);
1670 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1671 }
1672
1673 /*
1674 * Initialize a preallocated and zeroed pmap structure,
1675 * such as one in a vmspace structure.
1676 */
1677 int
1678 pmap_pinit(pmap_t pmap)
1679 {
1680 vm_page_t m, ptdpg[NPGPTD];
1681 vm_paddr_t pa;
1682 static int color;
1683 int i;
1684
1685 PMAP_LOCK_INIT(pmap);
1686
1687 /*
1688 * No need to allocate page table space yet but we do need a valid
1689 * page directory table.
1690 */
1691 if (pmap->pm_pdir == NULL) {
1692 pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
1693 NBPTD);
1694
1695 if (pmap->pm_pdir == NULL) {
1696 PMAP_LOCK_DESTROY(pmap);
1697 return (0);
1698 }
1699 #ifdef PAE
1700 pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1701 KASSERT(((vm_offset_t)pmap->pm_pdpt &
1702 ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1703 ("pmap_pinit: pdpt misaligned"));
1704 KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1705 ("pmap_pinit: pdpt above 4g"));
1706 #endif
1707 pmap->pm_root = NULL;
1708 }
1709 KASSERT(pmap->pm_root == NULL,
1710 ("pmap_pinit: pmap has reserved page table page(s)"));
1711
1712 /*
1713 * allocate the page directory page(s)
1714 */
1715 for (i = 0; i < NPGPTD;) {
1716 m = vm_page_alloc(NULL, color++,
1717 VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1718 VM_ALLOC_ZERO);
1719 if (m == NULL)
1720 VM_WAIT;
1721 else {
1722 ptdpg[i++] = m;
1723 }
1724 }
1725
1726 pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1727
1728 for (i = 0; i < NPGPTD; i++) {
1729 if ((ptdpg[i]->flags & PG_ZERO) == 0)
1730 bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE);
1731 }
1732
1733 mtx_lock_spin(&allpmaps_lock);
1734 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1735 /* Copy the kernel page table directory entries. */
1736 bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1737 mtx_unlock_spin(&allpmaps_lock);
1738
1739 /* install self-referential address mapping entry(s) */
1740 for (i = 0; i < NPGPTD; i++) {
1741 pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1742 pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1743 #ifdef PAE
1744 pmap->pm_pdpt[i] = pa | PG_V;
1745 #endif
1746 }
1747
1748 pmap->pm_active = 0;
1749 TAILQ_INIT(&pmap->pm_pvchunk);
1750 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1751
1752 return (1);
1753 }
1754
1755 /*
1756 * this routine is called if the page table page is not
1757 * mapped correctly.
1758 */
1759 static vm_page_t
1760 _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags)
1761 {
1762 vm_paddr_t ptepa;
1763 vm_page_t m;
1764
1765 KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1766 (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1767 ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1768
1769 /*
1770 * Allocate a page table page.
1771 */
1772 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1773 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1774 if (flags & M_WAITOK) {
1775 PMAP_UNLOCK(pmap);
1776 vm_page_unlock_queues();
1777 VM_WAIT;
1778 vm_page_lock_queues();
1779 PMAP_LOCK(pmap);
1780 }
1781
1782 /*
1783 * Indicate the need to retry. While waiting, the page table
1784 * page may have been allocated.
1785 */
1786 return (NULL);
1787 }
1788 if ((m->flags & PG_ZERO) == 0)
1789 pmap_zero_page(m);
1790
1791 /*
1792 * Map the pagetable page into the process address space, if
1793 * it isn't already there.
1794 */
1795
1796 pmap->pm_stats.resident_count++;
1797
1798 ptepa = VM_PAGE_TO_PHYS(m);
1799 pmap->pm_pdir[ptepindex] =
1800 (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1801
1802 return m;
1803 }
1804
1805 static vm_page_t
1806 pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1807 {
1808 unsigned ptepindex;
1809 pd_entry_t ptepa;
1810 vm_page_t m;
1811
1812 KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1813 (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1814 ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1815
1816 /*
1817 * Calculate pagetable page index
1818 */
1819 ptepindex = va >> PDRSHIFT;
1820 retry:
1821 /*
1822 * Get the page directory entry
1823 */
1824 ptepa = pmap->pm_pdir[ptepindex];
1825
1826 /*
1827 * This supports switching from a 4MB page to a
1828 * normal 4K page.
1829 */
1830 if (ptepa & PG_PS) {
1831 (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
1832 ptepa = pmap->pm_pdir[ptepindex];
1833 }
1834
1835 /*
1836 * If the page table page is mapped, we just increment the
1837 * hold count, and activate it.
1838 */
1839 if (ptepa) {
1840 m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
1841 m->wire_count++;
1842 } else {
1843 /*
1844 * Here if the pte page isn't mapped, or if it has
1845 * been deallocated.
1846 */
1847 m = _pmap_allocpte(pmap, ptepindex, flags);
1848 if (m == NULL && (flags & M_WAITOK))
1849 goto retry;
1850 }
1851 return (m);
1852 }
1853
1854
1855 /***************************************************
1856 * Pmap allocation/deallocation routines.
1857 ***************************************************/
1858
1859 #ifdef SMP
1860 /*
1861 * Deal with a SMP shootdown of other users of the pmap that we are
1862 * trying to dispose of. This can be a bit hairy.
1863 */
1864 static u_int *lazymask;
1865 static u_int lazyptd;
1866 static volatile u_int lazywait;
1867
1868 void pmap_lazyfix_action(void);
1869
1870 void
1871 pmap_lazyfix_action(void)
1872 {
1873 u_int mymask = PCPU_GET(cpumask);
1874
1875 #ifdef COUNT_IPIS
1876 (*ipi_lazypmap_counts[PCPU_GET(cpuid)])++;
1877 #endif
1878 if (rcr3() == lazyptd)
1879 load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1880 atomic_clear_int(lazymask, mymask);
1881 atomic_store_rel_int(&lazywait, 1);
1882 }
1883
1884 static void
1885 pmap_lazyfix_self(u_int mymask)
1886 {
1887
1888 if (rcr3() == lazyptd)
1889 load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1890 atomic_clear_int(lazymask, mymask);
1891 }
1892
1893
1894 static void
1895 pmap_lazyfix(pmap_t pmap)
1896 {
1897 u_int mymask;
1898 u_int mask;
1899 u_int spins;
1900
1901 while ((mask = pmap->pm_active) != 0) {
1902 spins = 50000000;
1903 mask = mask & -mask; /* Find least significant set bit */
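/* e.g., mask 0x0c & -0x0c == 0x04, i.e., CPU 2's bit */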
1904 mtx_lock_spin(&smp_ipi_mtx);
1905 #ifdef PAE
1906 lazyptd = vtophys(pmap->pm_pdpt);
1907 #else
1908 lazyptd = vtophys(pmap->pm_pdir);
1909 #endif
1910 mymask = PCPU_GET(cpumask);
1911 if (mask == mymask) {
1912 lazymask = &pmap->pm_active;
1913 pmap_lazyfix_self(mymask);
1914 } else {
1915 atomic_store_rel_int((u_int *)&lazymask,
1916 (u_int)&pmap->pm_active);
1917 atomic_store_rel_int(&lazywait, 0);
1918 ipi_selected(mask, IPI_LAZYPMAP);
1919 while (lazywait == 0) {
1920 ia32_pause();
1921 if (--spins == 0)
1922 break;
1923 }
1924 }
1925 mtx_unlock_spin(&smp_ipi_mtx);
1926 if (spins == 0)
1927 printf("pmap_lazyfix: spun for 50000000\n");
1928 }
1929 }
1930
1931 #else /* SMP */
1932
1933 /*
1934 * Cleaning up on a uniprocessor is easy.  For various reasons, we are
1935 * unlikely to ever have to execute this code; in particular, the
1936 * cleanup is deferred until the parent does a wait(2), which means
1937 * that another userland process has run in the meantime.
1938 */
1939 static void
1940 pmap_lazyfix(pmap_t pmap)
1941 {
1942 u_int cr3;
1943
1944 cr3 = vtophys(pmap->pm_pdir);
1945 if (cr3 == rcr3()) {
1946 load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1947 pmap->pm_active &= ~(PCPU_GET(cpumask));
1948 }
1949 }
1950 #endif /* SMP */
1951
1952 /*
1953 * Release any resources held by the given physical map.
1954 * Called when a pmap initialized by pmap_pinit is being released.
1955 * Should only be called if the map contains no valid mappings.
1956 */
1957 void
1958 pmap_release(pmap_t pmap)
1959 {
1960 vm_page_t m, ptdpg[NPGPTD];
1961 int i;
1962
1963 KASSERT(pmap->pm_stats.resident_count == 0,
1964 ("pmap_release: pmap resident count %ld != 0",
1965 pmap->pm_stats.resident_count));
1966 KASSERT(pmap->pm_root == NULL,
1967 ("pmap_release: pmap has reserved page table page(s)"));
1968
1969 pmap_lazyfix(pmap);
1970 mtx_lock_spin(&allpmaps_lock);
1971 LIST_REMOVE(pmap, pm_list);
1972 mtx_unlock_spin(&allpmaps_lock);
1973
1974 for (i = 0; i < NPGPTD; i++)
1975 ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
1976 PG_FRAME);
1977
1978 bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
1979 sizeof(*pmap->pm_pdir));
1980
1981 pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
1982
1983 for (i = 0; i < NPGPTD; i++) {
1984 m = ptdpg[i];
1985 #ifdef PAE
1986 KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
1987 ("pmap_release: got wrong ptd page"));
1988 #endif
1989 m->wire_count--;
1990 atomic_subtract_int(&cnt.v_wire_count, 1);
1991 vm_page_free_zero(m);
1992 }
1993 PMAP_LOCK_DESTROY(pmap);
1994 }
1995
1996 static int
1997 kvm_size(SYSCTL_HANDLER_ARGS)
1998 {
1999 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
2000
2001 return (sysctl_handle_long(oidp, &ksize, 0, req));
2002 }
2003 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2004 0, 0, kvm_size, "IU", "Size of KVM");
2005
2006 static int
2007 kvm_free(SYSCTL_HANDLER_ARGS)
2008 {
2009 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2010
2011 return (sysctl_handle_long(oidp, &kfree, 0, req));
2012 }
2013 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2014 0, 0, kvm_free, "IU", "Amount of KVM free");
2015
2016 /*
2017 * Grow the number of kernel page table entries, if needed.
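 * The expression (x + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1)
 * used below rounds x up to the next page directory boundary, which
 * is 4MB on non-PAE i386 and 2MB with PAE.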
2018 */
2019 void
2020 pmap_growkernel(vm_offset_t addr)
2021 {
2022 vm_paddr_t ptppaddr;
2023 vm_page_t nkpg;
2024 pd_entry_t newpdir;
2025
2026 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2027 if (kernel_vm_end == 0) {
2028 kernel_vm_end = KERNBASE;
2029 nkpt = 0;
2030 while (pdir_pde(PTD, kernel_vm_end)) {
2031 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
2032 nkpt++;
2033 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2034 kernel_vm_end = kernel_map->max_offset;
2035 break;
2036 }
2037 }
2038 }
2039 addr = roundup2(addr, PAGE_SIZE * NPTEPG);
2040 if (addr - 1 >= kernel_map->max_offset)
2041 addr = kernel_map->max_offset;
2042 while (kernel_vm_end < addr) {
2043 if (pdir_pde(PTD, kernel_vm_end)) {
2044 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
2045 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2046 kernel_vm_end = kernel_map->max_offset;
2047 break;
2048 }
2049 continue;
2050 }
2051
2052 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
2053 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2054 VM_ALLOC_ZERO);
2055 if (nkpg == NULL)
2056 panic("pmap_growkernel: no memory to grow kernel");
2057
2058 nkpt++;
2059
2060 if ((nkpg->flags & PG_ZERO) == 0)
2061 pmap_zero_page(nkpg);
2062 ptppaddr = VM_PAGE_TO_PHYS(nkpg);
2063 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
2064 pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir;
2065
2066 pmap_kenter_pde(kernel_vm_end, newpdir);
2067 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
2068 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2069 kernel_vm_end = kernel_map->max_offset;
2070 break;
2071 }
2072 }
2073 }
2074
2075
2076 /***************************************************
2077 * Page management routines.
2078 ***************************************************/
2079
2080 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2081 CTASSERT(_NPCM == 11);
2082
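/*
 * A pv entry is allocated from within a page-sized, page-aligned
 * pv_chunk, so masking off the page offset bits of a pv entry's
 * address recovers the address of its enclosing chunk.
 */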
2083 static __inline struct pv_chunk *
2084 pv_to_chunk(pv_entry_t pv)
2085 {
2086
2087 return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK);
2088 }
2089
2090 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2091
2092 #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */
2093 #define PC_FREE10 0x0000fffful /* Free values for index 10 */
2094
2095 static uint32_t pc_freemask[11] = {
2096 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2097 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2098 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2099 PC_FREE0_9, PC_FREE10
2100 };
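/*
 * A chunk's 11 32-bit maps track its pv entries: the first ten maps
 * cover 320 entries and the eleventh uses only its low 16 bits, so
 * each page-sized chunk holds 336 entries.  A set bit marks a free
 * entry.
 */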
2101
2102 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2103 "Current number of pv entries");
2104
2105 #ifdef PV_STATS
2106 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2107
2108 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2109 "Current number of pv entry chunks");
2110 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2111 "Current number of pv entry chunks allocated");
2112 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2113 "Current number of pv entry chunks frees");
2114 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2115 "Number of times tried to get a chunk page but failed.");
2116
2117 static long pv_entry_frees, pv_entry_allocs;
2118 static int pv_entry_spare;
2119
2120 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2121 "Current number of pv entry frees");
2122 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2123 "Current number of pv entry allocs");
2124 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2125 "Current number of spare pv entries");
2126
2127 static int pmap_collect_inactive, pmap_collect_active;
2128
2129 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
2130 "Current number times pmap_collect called on inactive queue");
2131 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
2132 "Current number times pmap_collect called on active queue");
2133 #endif
2134
2135 /*
2136 * We are in a serious low memory condition. Resort to
2137 * drastic measures to free some pages so we can allocate
2138 * another pv entry chunk. This is normally called to
2139 * unmap inactive pages, and if necessary, active pages.
2140 */
2141 static void
2142 pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
2143 {
2144 struct md_page *pvh;
2145 pd_entry_t *pde;
2146 pmap_t pmap;
2147 pt_entry_t *pte, tpte;
2148 pv_entry_t next_pv, pv;
2149 vm_offset_t va;
2150 vm_page_t m, free;
2151
2152 sched_pin();
2153 TAILQ_FOREACH(m, &vpq->pl, pageq) {
2154 if (m->hold_count || m->busy)
2155 continue;
2156 TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
2157 va = pv->pv_va;
2158 pmap = PV_PMAP(pv);
2159 /* Avoid deadlock and lock recursion. */
2160 if (pmap > locked_pmap)
2161 PMAP_LOCK(pmap);
2162 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
2163 continue;
2164 pmap->pm_stats.resident_count--;
2165 pde = pmap_pde(pmap, va);
2166 KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found"
2167 " a 4mpage in page %p's pv list", m));
2168 pte = pmap_pte_quick(pmap, va);
2169 tpte = pte_load_clear(pte);
2170 KASSERT((tpte & PG_W) == 0,
2171 ("pmap_collect: wired pte %#jx", (uintmax_t)tpte));
2172 if (tpte & PG_A)
2173 vm_page_flag_set(m, PG_REFERENCED);
2174 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2175 vm_page_dirty(m);
2176 free = NULL;
2177 pmap_unuse_pt(pmap, va, &free);
2178 pmap_invalidate_page(pmap, va);
2179 pmap_free_zero_pages(free);
2180 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2181 if (TAILQ_EMPTY(&m->md.pv_list)) {
2182 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2183 if (TAILQ_EMPTY(&pvh->pv_list))
2184 vm_page_flag_clear(m, PG_WRITEABLE);
2185 }
2186 free_pv_entry(pmap, pv);
2187 if (pmap != locked_pmap)
2188 PMAP_UNLOCK(pmap);
2189 }
2190 }
2191 sched_unpin();
2192 }
2193
2194
2195 /*
2196 * Free the pv entry back to its pv chunk's free bitmap.
2197 */
2198 static void
2199 free_pv_entry(pmap_t pmap, pv_entry_t pv)
2200 {
2201 vm_page_t m;
2202 struct pv_chunk *pc;
2203 int idx, field, bit;
2204
2205 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2206 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2207 PV_STAT(pv_entry_frees++);
2208 PV_STAT(pv_entry_spare++);
2209 pv_entry_count--;
2210 pc = pv_to_chunk(pv);
2211 idx = pv - &pc->pc_pventry[0];
2212 field = idx / 32;
2213 bit = idx % 32;
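/*
 * Mark the entry free in its chunk's bitmap; entry idx occupies
 * bit (idx % 32) of map (idx / 32), so, e.g., idx 70 sets bit 6
 * of pc_map[2].
 */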
2214 pc->pc_map[field] |= 1ul << bit;
2215 /* move to head of list */
2216 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2217 for (idx = 0; idx < _NPCM; idx++)
2218 if (pc->pc_map[idx] != pc_freemask[idx]) {
2219 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2220 return;
2221 }
2222 PV_STAT(pv_entry_spare -= _NPCPV);
2223 PV_STAT(pc_chunk_count--);
2224 PV_STAT(pc_chunk_frees++);
2225 /* entire chunk is free, return it */
2226 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2227 pmap_qremove((vm_offset_t)pc, 1);
2228 vm_page_unwire(m, 0);
2229 vm_page_free(m);
2230 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2231 }
2232
2233 /*
2234 * Get a new pv entry, allocating a whole chunk from the system
2235 * when needed.
2236 */
2237 static pv_entry_t
2238 get_pv_entry(pmap_t pmap, int try)
2239 {
2240 static const struct timeval printinterval = { 60, 0 };
2241 static struct timeval lastprint;
2242 static vm_pindex_t colour;
2243 struct vpgqueues *pq;
2244 int bit, field;
2245 pv_entry_t pv;
2246 struct pv_chunk *pc;
2247 vm_page_t m;
2248
2249 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2250 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2251 PV_STAT(pv_entry_allocs++);
2252 pv_entry_count++;
2253 if (pv_entry_count > pv_entry_high_water)
2254 if (ratecheck(&lastprint, &printinterval))
2255 printf("Approaching the limit on PV entries, consider "
2256 "increasing either the vm.pmap.shpgperproc or the "
2257 "vm.pmap.pv_entry_max tunable.\n");
2258 pq = NULL;
2259 retry:
2260 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2261 if (pc != NULL) {
2262 for (field = 0; field < _NPCM; field++) {
2263 if (pc->pc_map[field]) {
2264 bit = bsfl(pc->pc_map[field]);
2265 break;
2266 }
2267 }
2268 if (field < _NPCM) {
2269 pv = &pc->pc_pventry[field * 32 + bit];
2270 pc->pc_map[field] &= ~(1ul << bit);
2271 /* If this was the last free entry, move the chunk to the tail. */
2272 for (field = 0; field < _NPCM; field++)
2273 if (pc->pc_map[field] != 0) {
2274 PV_STAT(pv_entry_spare--);
2275 return (pv); /* not full, return */
2276 }
2277 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2278 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2279 PV_STAT(pv_entry_spare--);
2280 return (pv);
2281 }
2282 }
2283 /*
2284 * Access to the ptelist "pv_vafree" is synchronized by the page
2285 * queues lock. If "pv_vafree" is currently non-empty, it will
2286 * remain non-empty until pmap_ptelist_alloc() completes.
2287 */
2288 if (pv_vafree == 0 || (m = vm_page_alloc(NULL, colour, (pq ==
2289 &vm_page_queues[PQ_ACTIVE] ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) |
2290 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
2291 if (try) {
2292 pv_entry_count--;
2293 PV_STAT(pc_chunk_tryfail++);
2294 return (NULL);
2295 }
2296 /*
2297 * Reclaim pv entries: At first, destroy mappings to
2298 * inactive pages. After that, if a pv chunk entry
2299 * is still needed, destroy mappings to active pages.
2300 */
2301 if (pq == NULL) {
2302 PV_STAT(pmap_collect_inactive++);
2303 pq = &vm_page_queues[PQ_INACTIVE];
2304 } else if (pq == &vm_page_queues[PQ_INACTIVE]) {
2305 PV_STAT(pmap_collect_active++);
2306 pq = &vm_page_queues[PQ_ACTIVE];
2307 } else
2308 panic("get_pv_entry: increase vm.pmap.shpgperproc");
2309 pmap_collect(pmap, pq);
2310 goto retry;
2311 }
2312 PV_STAT(pc_chunk_count++);
2313 PV_STAT(pc_chunk_allocs++);
2314 colour++;
2315 pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
2316 pmap_qenter((vm_offset_t)pc, &m, 1);
2317 pc->pc_pmap = pmap;
2318 pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */
2319 for (field = 1; field < _NPCM; field++)
2320 pc->pc_map[field] = pc_freemask[field];
2321 pv = &pc->pc_pventry[0];
2322 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2323 PV_STAT(pv_entry_spare += _NPCPV - 1);
2324 return (pv);
2325 }
2326
2327 static __inline pv_entry_t
2328 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2329 {
2330 pv_entry_t pv;
2331
2332 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2333 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
2334 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2335 TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
2336 break;
2337 }
2338 }
2339 return (pv);
2340 }
2341
2342 static void
2343 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2344 {
2345 struct md_page *pvh;
2346 pv_entry_t pv;
2347 vm_offset_t va_last;
2348 vm_page_t m;
2349
2350 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2351 KASSERT((pa & PDRMASK) == 0,
2352 ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
2353
2354 /*
2355 * Transfer the 4mpage's pv entry for this mapping to the first
2356 * page's pv list.
2357 */
2358 pvh = pa_to_pvh(pa);
2359 va = trunc_4mpage(va);
2360 pv = pmap_pvh_remove(pvh, pmap, va);
2361 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2362 m = PHYS_TO_VM_PAGE(pa);
2363 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2364 /* Instantiate the remaining NPTEPG - 1 pv entries. */
2365 va_last = va + NBPDR - PAGE_SIZE;
2366 do {
2367 m++;
2368 KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
2369 ("pmap_pv_demote_pde: page %p is not managed", m));
2370 va += PAGE_SIZE;
2371 pmap_insert_entry(pmap, va, m);
2372 } while (va < va_last);
2373 }
2374
2375 static void
2376 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2377 {
2378 struct md_page *pvh;
2379 pv_entry_t pv;
2380 vm_offset_t va_last;
2381 vm_page_t m;
2382
2383 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2384 KASSERT((pa & PDRMASK) == 0,
2385 ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
2386
2387 /*
2388 * Transfer the first page's pv entry for this mapping to the
2389 * 4mpage's pv list. Aside from avoiding the cost of a call
2390 * to get_pv_entry(), a transfer avoids the possibility that
2391 * get_pv_entry() calls pmap_collect() and that pmap_collect()
2392 * removes one of the mappings that is being promoted.
2393 */
2394 m = PHYS_TO_VM_PAGE(pa);
2395 va = trunc_4mpage(va);
2396 pv = pmap_pvh_remove(&m->md, pmap, va);
2397 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
2398 pvh = pa_to_pvh(pa);
2399 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2400 /* Free the remaining NPTEPG - 1 pv entries. */
2401 va_last = va + NBPDR - PAGE_SIZE;
2402 do {
2403 m++;
2404 va += PAGE_SIZE;
2405 pmap_pvh_free(&m->md, pmap, va);
2406 } while (va < va_last);
2407 }
2408
2409 static void
2410 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2411 {
2412 pv_entry_t pv;
2413
2414 pv = pmap_pvh_remove(pvh, pmap, va);
2415 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2416 free_pv_entry(pmap, pv);
2417 }
2418
2419 static void
2420 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2421 {
2422 struct md_page *pvh;
2423
2424 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2425 pmap_pvh_free(&m->md, pmap, va);
2426 if (TAILQ_EMPTY(&m->md.pv_list)) {
2427 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2428 if (TAILQ_EMPTY(&pvh->pv_list))
2429 vm_page_flag_clear(m, PG_WRITEABLE);
2430 }
2431 }
2432
2433 /*
2434 * Create a pv entry for the page m mapped at
2435 * virtual address va in the given pmap.
2436 */
2437 static void
2438 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2439 {
2440 pv_entry_t pv;
2441
2442 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2443 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2444 pv = get_pv_entry(pmap, FALSE);
2445 pv->pv_va = va;
2446 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2447 }
2448
2449 /*
2450 * Conditionally create a pv entry.
2451 */
2452 static boolean_t
2453 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2454 {
2455 pv_entry_t pv;
2456
2457 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2458 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2459 if (pv_entry_count < pv_entry_high_water &&
2460 (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2461 pv->pv_va = va;
2462 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2463 return (TRUE);
2464 } else
2465 return (FALSE);
2466 }
2467
2468 /*
2469 * Create the pv entries for each of the pages within a superpage.
2470 */
2471 static boolean_t
2472 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2473 {
2474 struct md_page *pvh;
2475 pv_entry_t pv;
2476
2477 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2478 if (pv_entry_count < pv_entry_high_water &&
2479 (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2480 pv->pv_va = va;
2481 pvh = pa_to_pvh(pa);
2482 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2483 return (TRUE);
2484 } else
2485 return (FALSE);
2486 }
2487
2488 /*
2489 * Fills a page table page with mappings to consecutive physical pages.
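 * Adding PAGE_SIZE to "newpte" on each iteration advances the
 * physical frame number without disturbing the low-order flag bits,
 * since the frame occupies the bits above the 4KB page offset.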
2490 */
2491 static void
2492 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
2493 {
2494 pt_entry_t *pte;
2495
2496 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2497 *pte = newpte;
2498 newpte += PAGE_SIZE;
2499 }
2500 }
2501
2502 /*
2503 * Tries to demote a 2- or 4MB page mapping. If demotion fails, the
2504 * 2- or 4MB page mapping is invalidated.
2505 */
2506 static boolean_t
2507 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2508 {
2509 pd_entry_t newpde, oldpde;
2510 pt_entry_t *firstpte, newpte;
2511 vm_paddr_t mptepa;
2512 vm_page_t free, mpte;
2513
2514 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2515 oldpde = *pde;
2516 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
2517 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
2518 mpte = pmap_lookup_pt_page(pmap, va);
2519 if (mpte != NULL)
2520 pmap_remove_pt_page(pmap, mpte);
2521 else {
2522 KASSERT((oldpde & PG_W) == 0,
2523 ("pmap_demote_pde: page table page for a wired mapping"
2524 " is missing"));
2525
2526 /*
2527 * Invalidate the 2- or 4MB page mapping and return
2528 * "failure" if the mapping was never accessed or the
2529 * allocation of the new page table page fails.
2530 */
2531 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
2532 va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
2533 VM_ALLOC_WIRED)) == NULL) {
2534 free = NULL;
2535 pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free);
2536 pmap_invalidate_page(pmap, trunc_4mpage(va));
2537 pmap_free_zero_pages(free);
2538 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
2539 " in pmap %p", va, pmap);
2540 return (FALSE);
2541 }
2542 if (va < VM_MAXUSER_ADDRESS)
2543 pmap->pm_stats.resident_count++;
2544 }
2545 mptepa = VM_PAGE_TO_PHYS(mpte);
2546
2547 /*
2548 * If the page mapping is in the kernel's address space, then the
2549 * KPTmap can provide access to the page table page. Otherwise,
2550 * temporarily map the page table page (mpte) into the kernel's
2551 * address space at either PADDR1 or PADDR2.
2552 */
2553 if (va >= KERNBASE)
2554 firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
2555 else if (curthread->td_pinned > 0 && mtx_owned(&vm_page_queue_mtx)) {
2556 if ((*PMAP1 & PG_FRAME) != mptepa) {
2557 *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2558 #ifdef SMP
2559 PMAP1cpu = PCPU_GET(cpuid);
2560 #endif
2561 invlcaddr(PADDR1);
2562 PMAP1changed++;
2563 } else
2564 #ifdef SMP
2565 if (PMAP1cpu != PCPU_GET(cpuid)) {
2566 PMAP1cpu = PCPU_GET(cpuid);
2567 invlcaddr(PADDR1);
2568 PMAP1changedcpu++;
2569 } else
2570 #endif
2571 PMAP1unchanged++;
2572 firstpte = PADDR1;
2573 } else {
2574 mtx_lock(&PMAP2mutex);
2575 if ((*PMAP2 & PG_FRAME) != mptepa) {
2576 *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2577 pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
2578 }
2579 firstpte = PADDR2;
2580 }
2581 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2582 KASSERT((oldpde & PG_A) != 0,
2583 ("pmap_demote_pde: oldpde is missing PG_A"));
2584 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2585 ("pmap_demote_pde: oldpde is missing PG_M"));
2586 newpte = oldpde & ~PG_PS;
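/*
 * In a 2/4MB PDE the PAT bit is bit 12 (PG_PDE_PAT), but in a 4KB
 * PTE it is bit 7 (PG_PTE_PAT), where a PDE instead has PG_PS.  If
 * the PAT bit is set, the XOR below clears it from the PDE position
 * and sets it in the PTE position.
 */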
2587 if ((newpte & PG_PDE_PAT) != 0)
2588 newpte ^= PG_PDE_PAT | PG_PTE_PAT;
2589
2590 /*
2591 * If the page table page is new, initialize it.
2592 */
2593 if (mpte->wire_count == 1) {
2594 mpte->wire_count = NPTEPG;
2595 pmap_fill_ptp(firstpte, newpte);
2596 }
2597 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2598 ("pmap_demote_pde: firstpte and newpte map different physical"
2599 " addresses"));
2600
2601 /*
2602 * If the mapping has changed attributes, update the page table
2603 * entries.
2604 */
2605 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2606 pmap_fill_ptp(firstpte, newpte);
2607
2608 /*
2609 * Demote the mapping. This pmap is locked. The old PDE has
2610 * PG_A set. If the old PDE has PG_RW set, it also has PG_M
2611 * set. Thus, there is no danger of a race with another
2612 * processor changing the setting of PG_A and/or PG_M between
2613 * the read above and the store below.
2614 */
2615 if (workaround_erratum383)
2616 pmap_update_pde(pmap, va, pde, newpde);
2617 else if (pmap == kernel_pmap)
2618 pmap_kenter_pde(va, newpde);
2619 else
2620 pde_store(pde, newpde);
2621 if (firstpte == PADDR2)
2622 mtx_unlock(&PMAP2mutex);
2623
2624 /*
2625 * Invalidate the recursive mapping of the page table page.
2626 */
2627 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2628
2629 /*
2630 * Demote the pv entry. This depends on the earlier demotion
2631 * of the mapping. Specifically, the (re)creation of a per-
2632 * page pv entry might trigger the execution of pmap_collect(),
2633 * which might reclaim a newly (re)created per-page pv entry
2634 * and destroy the associated mapping. In order to destroy
2635 * the mapping, the PDE must have already changed from mapping
2636 * the 2mpage to referencing the page table page.
2637 */
2638 if ((oldpde & PG_MANAGED) != 0)
2639 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
2640
2641 pmap_pde_demotions++;
2642 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
2643 " in pmap %p", va, pmap);
2644 return (TRUE);
2645 }
2646
2647 /*
2648 * pmap_remove_pde: unmap a superpage from a process address space
2649 */
2650 static void
2651 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2652 vm_page_t *free)
2653 {
2654 struct md_page *pvh;
2655 pd_entry_t oldpde;
2656 vm_offset_t eva, va;
2657 vm_page_t m, mpte;
2658
2659 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2660 KASSERT((sva & PDRMASK) == 0,
2661 ("pmap_remove_pde: sva is not 4mpage aligned"));
2662 oldpde = pte_load_clear(pdq);
2663 if (oldpde & PG_W)
2664 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2665
2666 /*
2667 * Machines that don't support invlpg also don't support
2668 * PG_G.
2669 */
2670 if (oldpde & PG_G)
2671 pmap_invalidate_page(kernel_pmap, sva);
2672 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2673 if (oldpde & PG_MANAGED) {
2674 pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
2675 pmap_pvh_free(pvh, pmap, sva);
2676 eva = sva + NBPDR;
2677 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2678 va < eva; va += PAGE_SIZE, m++) {
2679 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2680 vm_page_dirty(m);
2681 if (oldpde & PG_A)
2682 vm_page_flag_set(m, PG_REFERENCED);
2683 if (TAILQ_EMPTY(&m->md.pv_list) &&
2684 TAILQ_EMPTY(&pvh->pv_list))
2685 vm_page_flag_clear(m, PG_WRITEABLE);
2686 }
2687 }
2688 if (pmap == kernel_pmap) {
2689 if (!pmap_demote_pde(pmap, pdq, sva))
2690 panic("pmap_remove_pde: failed demotion");
2691 } else {
2692 mpte = pmap_lookup_pt_page(pmap, sva);
2693 if (mpte != NULL) {
2694 pmap_remove_pt_page(pmap, mpte);
2695 pmap->pm_stats.resident_count--;
2696 KASSERT(mpte->wire_count == NPTEPG,
2697 ("pmap_remove_pde: pte page wire count error"));
2698 mpte->wire_count = 0;
2699 pmap_add_delayed_free_list(mpte, free, FALSE);
2700 atomic_subtract_int(&cnt.v_wire_count, 1);
2701 }
2702 }
2703 }
2704
2705 /*
2706 * pmap_remove_pte: unmap a single page from a process address space
2707 */
2708 static int
2709 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free)
2710 {
2711 pt_entry_t oldpte;
2712 vm_page_t m;
2713
2714 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2715 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2716 oldpte = pte_load_clear(ptq);
2717 if (oldpte & PG_W)
2718 pmap->pm_stats.wired_count -= 1;
2719 /*
2720 * Machines that don't support invlpg also don't support
2721 * PG_G.
2722 */
2723 if (oldpte & PG_G)
2724 pmap_invalidate_page(kernel_pmap, va);
2725 pmap->pm_stats.resident_count -= 1;
2726 if (oldpte & PG_MANAGED) {
2727 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2728 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2729 vm_page_dirty(m);
2730 if (oldpte & PG_A)
2731 vm_page_flag_set(m, PG_REFERENCED);
2732 pmap_remove_entry(pmap, m, va);
2733 }
2734 return (pmap_unuse_pt(pmap, va, free));
2735 }
2736
2737 /*
2738 * Remove a single page from a process address space
2739 */
2740 static void
2741 pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free)
2742 {
2743 pt_entry_t *pte;
2744
2745 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2746 KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
2747 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2748 if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
2749 return;
2750 pmap_remove_pte(pmap, pte, va, free);
2751 pmap_invalidate_page(pmap, va);
2752 }
2753
2754 /*
2755 * Remove the given range of addresses from the specified map.
2756 *
2757 * It is assumed that the start and end are properly
2758 * rounded to the page size.
2759 */
2760 void
2761 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2762 {
2763 vm_offset_t pdnxt;
2764 pd_entry_t ptpaddr;
2765 pt_entry_t *pte;
2766 vm_page_t free = NULL;
2767 int anyvalid;
2768
2769 /*
2770 * Perform an unsynchronized read. This is, however, safe.
2771 */
2772 if (pmap->pm_stats.resident_count == 0)
2773 return;
2774
2775 anyvalid = 0;
2776
2777 vm_page_lock_queues();
2778 sched_pin();
2779 PMAP_LOCK(pmap);
2780
2781 /*
2782 * Special handling for removing a single page: it is a very
2783 * common operation, and handling it separately lets us
2784 * short-circuit some code.
2785 */
2786 if ((sva + PAGE_SIZE == eva) &&
2787 ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
2788 pmap_remove_page(pmap, sva, &free);
2789 goto out;
2790 }
2791
2792 for (; sva < eva; sva = pdnxt) {
2793 unsigned pdirindex;
2794
2795 /*
2796 * Calculate index for next page table.
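 * If the addition below wraps past the top of the 32-bit address
 * space, "pdnxt" becomes smaller than "sva" and is clamped to "eva".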
2797 */
2798 pdnxt = (sva + NBPDR) & ~PDRMASK;
2799 if (pdnxt < sva)
2800 pdnxt = eva;
2801 if (pmap->pm_stats.resident_count == 0)
2802 break;
2803
2804 pdirindex = sva >> PDRSHIFT;
2805 ptpaddr = pmap->pm_pdir[pdirindex];
2806
2807 /*
2808 * Weed out invalid mappings. Note: we assume that the page
2809 * directory table is always allocated and mapped in kernel virtual address space.
2810 */
2811 if (ptpaddr == 0)
2812 continue;
2813
2814 /*
2815 * Check for large page.
2816 */
2817 if ((ptpaddr & PG_PS) != 0) {
2818 /*
2819 * Are we removing the entire large page? If not,
2820 * demote the mapping and fall through.
2821 */
2822 if (sva + NBPDR == pdnxt && eva >= pdnxt) {
2823 /*
2824 * The TLB entry for a PG_G mapping is
2825 * invalidated by pmap_remove_pde().
2826 */
2827 if ((ptpaddr & PG_G) == 0)
2828 anyvalid = 1;
2829 pmap_remove_pde(pmap,
2830 &pmap->pm_pdir[pdirindex], sva, &free);
2831 continue;
2832 } else if (!pmap_demote_pde(pmap,
2833 &pmap->pm_pdir[pdirindex], sva)) {
2834 /* The large page mapping was destroyed. */
2835 continue;
2836 }
2837 }
2838
2839 /*
2840 * Limit our scan to either the end of the va represented
2841 * by the current page table page, or to the end of the
2842 * range being removed.
2843 */
2844 if (pdnxt > eva)
2845 pdnxt = eva;
2846
2847 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
2848 sva += PAGE_SIZE) {
2849 if (*pte == 0)
2850 continue;
2851
2852 /*
2853 * The TLB entry for a PG_G mapping is invalidated
2854 * by pmap_remove_pte().
2855 */
2856 if ((*pte & PG_G) == 0)
2857 anyvalid = 1;
2858 if (pmap_remove_pte(pmap, pte, sva, &free))
2859 break;
2860 }
2861 }
2862 out:
2863 sched_unpin();
2864 if (anyvalid)
2865 pmap_invalidate_all(pmap);
2866 vm_page_unlock_queues();
2867 PMAP_UNLOCK(pmap);
2868 pmap_free_zero_pages(free);
2869 }
2870
2871 /*
2872 * Routine: pmap_remove_all
2873 * Function:
2874 * Removes this physical page from
2875 * all physical maps in which it resides.
2876 * Reflects back modify bits to the pager.
2877 *
2878 * Notes:
2879 * Original versions of this routine were very
2880 * inefficient because they iteratively called
2881 * pmap_remove, which is slow.
2882 */
2883
2884 void
2885 pmap_remove_all(vm_page_t m)
2886 {
2887 struct md_page *pvh;
2888 pv_entry_t pv;
2889 pmap_t pmap;
2890 pt_entry_t *pte, tpte;
2891 pd_entry_t *pde;
2892 vm_offset_t va;
2893 vm_page_t free;
2894
2895 KASSERT((m->flags & PG_FICTITIOUS) == 0,
2896 ("pmap_remove_all: page %p is fictitious", m));
2897 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2898 sched_pin();
2899 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2900 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
2901 va = pv->pv_va;
2902 pmap = PV_PMAP(pv);
2903 PMAP_LOCK(pmap);
2904 pde = pmap_pde(pmap, va);
2905 (void)pmap_demote_pde(pmap, pde, va);
2906 PMAP_UNLOCK(pmap);
2907 }
2908 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2909 pmap = PV_PMAP(pv);
2910 PMAP_LOCK(pmap);
2911 pmap->pm_stats.resident_count--;
2912 pde = pmap_pde(pmap, pv->pv_va);
2913 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
2914 " a 4mpage in page %p's pv list", m));
2915 pte = pmap_pte_quick(pmap, pv->pv_va);
2916 tpte = pte_load_clear(pte);
2917 if (tpte & PG_W)
2918 pmap->pm_stats.wired_count--;
2919 if (tpte & PG_A)
2920 vm_page_flag_set(m, PG_REFERENCED);
2921
2922 /*
2923 * Update the vm_page_t clean and reference bits.
2924 */
2925 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2926 vm_page_dirty(m);
2927 free = NULL;
2928 pmap_unuse_pt(pmap, pv->pv_va, &free);
2929 pmap_invalidate_page(pmap, pv->pv_va);
2930 pmap_free_zero_pages(free);
2931 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2932 free_pv_entry(pmap, pv);
2933 PMAP_UNLOCK(pmap);
2934 }
2935 vm_page_flag_clear(m, PG_WRITEABLE);
2936 sched_unpin();
2937 }
2938
2939 /*
2940 * pmap_protect_pde: set the protection on a 4mpage in a process
2941 */
2942 static boolean_t
2943 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
2944 {
2945 pd_entry_t newpde, oldpde;
2946 vm_offset_t eva, va;
2947 vm_page_t m;
2948 boolean_t anychanged;
2949
2950 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2951 KASSERT((sva & PDRMASK) == 0,
2952 ("pmap_protect_pde: sva is not 4mpage aligned"));
2953 anychanged = FALSE;
2954 retry:
2955 oldpde = newpde = *pde;
2956 if (oldpde & PG_MANAGED) {
2957 eva = sva + NBPDR;
2958 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2959 va < eva; va += PAGE_SIZE, m++) {
2960 /*
2961 * In contrast to the analogous operation on a 4KB page
2962 * mapping, the mapping's PG_A flag is not cleared and
2963 * the page's PG_REFERENCED flag is not set. The
2964 * reason is that pmap_demote_pde() expects that a 2/4MB
2965 * page mapping with a stored page table page has PG_A
2966 * set.
2967 */
2968 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2969 vm_page_dirty(m);
2970 }
2971 }
2972 if ((prot & VM_PROT_WRITE) == 0)
2973 newpde &= ~(PG_RW | PG_M);
2974 #ifdef PAE
2975 if ((prot & VM_PROT_EXECUTE) == 0)
2976 newpde |= pg_nx;
2977 #endif
2978 if (newpde != oldpde) {
2979 if (!pde_cmpset(pde, oldpde, newpde))
2980 goto retry;
2981 if (oldpde & PG_G)
2982 pmap_invalidate_page(pmap, sva);
2983 else
2984 anychanged = TRUE;
2985 }
2986 return (anychanged);
2987 }
2988
2989 /*
2990 * Set the physical protection on the
2991 * specified range of this map as requested.
2992 */
2993 void
2994 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2995 {
2996 vm_offset_t pdnxt;
2997 pd_entry_t ptpaddr;
2998 pt_entry_t *pte;
2999 int anychanged;
3000
3001 if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
3002 pmap_remove(pmap, sva, eva);
3003 return;
3004 }
3005
3006 #ifdef PAE
3007 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3008 (VM_PROT_WRITE|VM_PROT_EXECUTE))
3009 return;
3010 #else
3011 if (prot & VM_PROT_WRITE)
3012 return;
3013 #endif
3014
3015 anychanged = 0;
3016
3017 vm_page_lock_queues();
3018 sched_pin();
3019 PMAP_LOCK(pmap);
3020 for (; sva < eva; sva = pdnxt) {
3021 pt_entry_t obits, pbits;
3022 unsigned pdirindex;
3023
3024 pdnxt = (sva + NBPDR) & ~PDRMASK;
3025 if (pdnxt < sva)
3026 pdnxt = eva;
3027
3028 pdirindex = sva >> PDRSHIFT;
3029 ptpaddr = pmap->pm_pdir[pdirindex];
3030
3031 /*
3032 * Weed out invalid mappings. Note: we assume that the page
3033 * directory table is always allocated and mapped in kernel virtual address space.
3034 */
3035 if (ptpaddr == 0)
3036 continue;
3037
3038 /*
3039 * Check for large page.
3040 */
3041 if ((ptpaddr & PG_PS) != 0) {
3042 /*
3043 * Are we protecting the entire large page? If not,
3044 * demote the mapping and fall through.
3045 */
3046 if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3047 /*
3048 * The TLB entry for a PG_G mapping is
3049 * invalidated by pmap_protect_pde().
3050 */
3051 if (pmap_protect_pde(pmap,
3052 &pmap->pm_pdir[pdirindex], sva, prot))
3053 anychanged = 1;
3054 continue;
3055 } else if (!pmap_demote_pde(pmap,
3056 &pmap->pm_pdir[pdirindex], sva)) {
3057 /* The large page mapping was destroyed. */
3058 continue;
3059 }
3060 }
3061
3062 if (pdnxt > eva)
3063 pdnxt = eva;
3064
3065 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3066 sva += PAGE_SIZE) {
3067 vm_page_t m;
3068
3069 retry:
3070 /*
3071 * Regardless of whether a pte is 32 or 64 bits in
3072 * size, PG_RW, PG_A, and PG_M are among the least
3073 * significant 32 bits.
3074 */
3075 obits = pbits = *pte;
3076 if ((pbits & PG_V) == 0)
3077 continue;
3078 if (pbits & PG_MANAGED) {
3079 m = NULL;
3080 if (pbits & PG_A) {
3081 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3082 vm_page_flag_set(m, PG_REFERENCED);
3083 pbits &= ~PG_A;
3084 }
3085 if ((pbits & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3086 if (m == NULL)
3087 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3088 vm_page_dirty(m);
3089 }
3090 }
3091
3092 if ((prot & VM_PROT_WRITE) == 0)
3093 pbits &= ~(PG_RW | PG_M);
3094 #ifdef PAE
3095 if ((prot & VM_PROT_EXECUTE) == 0)
3096 pbits |= pg_nx;
3097 #endif
3098
3099 if (pbits != obits) {
3100 #ifdef PAE
3101 if (!atomic_cmpset_64(pte, obits, pbits))
3102 goto retry;
3103 #else
3104 if (!atomic_cmpset_int((u_int *)pte, obits,
3105 pbits))
3106 goto retry;
3107 #endif
3108 if (obits & PG_G)
3109 pmap_invalidate_page(pmap, sva);
3110 else
3111 anychanged = 1;
3112 }
3113 }
3114 }
3115 sched_unpin();
3116 if (anychanged)
3117 pmap_invalidate_all(pmap);
3118 vm_page_unlock_queues();
3119 PMAP_UNLOCK(pmap);
3120 }
3121
3122 /*
3123 * Tries to promote the 512 or 1024 contiguous 4KB page mappings that are
3124 * within a single page table page (PTP) to a single 2- or 4MB page mapping.
3125 * For promotion to occur, two conditions must be met: (1) the 4KB page
3126 * mappings must map aligned, contiguous physical memory and (2) the 4KB page
3127 * mappings must have identical characteristics.
3128 *
3129 * Managed (PG_MANAGED) mappings within the kernel address space are not
3130 * promoted. The reason is that kernel PDEs are replicated in each pmap but
3131 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
3132 * pmap.
3133 */
3134 static void
3135 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3136 {
3137 pd_entry_t newpde;
3138 pt_entry_t *firstpte, oldpte, pa, *pte;
3139 vm_offset_t oldpteva;
3140 vm_page_t mpte;
3141
3142 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3143
3144 /*
3145 * Examine the first PTE in the specified PTP. Abort if this PTE is
3146 * either invalid, unused, or does not map the first 4KB physical page
3147 * within a 2- or 4MB page.
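 * The test below folds all three checks into one comparison:
 * (PG_FRAME & PDRMASK) selects the frame bits below the superpage
 * size, which must be zero, while PG_A and PG_V must both be set.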
3148 */
3149 firstpte = pmap_pte_quick(pmap, trunc_4mpage(va));
3150 setpde:
3151 newpde = *firstpte;
3152 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
3153 pmap_pde_p_failures++;
3154 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3155 " in pmap %p", va, pmap);
3156 return;
3157 }
3158 if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
3159 pmap_pde_p_failures++;
3160 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3161 " in pmap %p", va, pmap);
3162 return;
3163 }
3164 if ((newpde & (PG_M | PG_RW)) == PG_RW) {
3165 /*
3166 * When PG_M is already clear, PG_RW can be cleared without
3167 * a TLB invalidation.
3168 */
3169 if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
3170 ~PG_RW))
3171 goto setpde;
3172 newpde &= ~PG_RW;
3173 }
3174
3175 /*
3176 * Examine each of the other PTEs in the specified PTP. Abort if this
3177 * PTE maps an unexpected 4KB physical page or does not have identical
3178 * characteristics to the first PTE.
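 * The scan runs from the last PTE in the PTP down to the second.
 * "pa" carries the expected frame plus the PG_A and PG_V bits, so a
 * single comparison checks contiguity, access, and validity at once.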
3179 */
3180 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
3181 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
3182 setpte:
3183 oldpte = *pte;
3184 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
3185 pmap_pde_p_failures++;
3186 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3187 " in pmap %p", va, pmap);
3188 return;
3189 }
3190 if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
3191 /*
3192 * When PG_M is already clear, PG_RW can be cleared
3193 * without a TLB invalidation.
3194 */
3195 if (!atomic_cmpset_int((u_int *)pte, oldpte,
3196 oldpte & ~PG_RW))
3197 goto setpte;
3198 oldpte &= ~PG_RW;
3199 oldpteva = (oldpte & PG_FRAME & PDRMASK) |
3200 (va & ~PDRMASK);
3201 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
3202 " in pmap %p", oldpteva, pmap);
3203 }
3204 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
3205 pmap_pde_p_failures++;
3206 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3207 " in pmap %p", va, pmap);
3208 return;
3209 }
3210 pa -= PAGE_SIZE;
3211 }
3212
3213 /*
3214 * Save the page table page in its current state until the PDE
3215 * mapping the superpage is demoted by pmap_demote_pde() or
3216 * destroyed by pmap_remove_pde().
3217 */
3218 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
3219 KASSERT(mpte >= vm_page_array &&
3220 mpte < &vm_page_array[vm_page_array_size],
3221 ("pmap_promote_pde: page table page is out of range"));
3222 KASSERT(mpte->pindex == va >> PDRSHIFT,
3223 ("pmap_promote_pde: page table page's pindex is wrong"));
3224 pmap_insert_pt_page(pmap, mpte);
3225
3226 /*
3227 * Promote the pv entries.
3228 */
3229 if ((newpde & PG_MANAGED) != 0)
3230 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
3231
3232 /*
3233 * Propagate the PAT index to its proper position.
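 * In a 4KB PTE the PAT bit is bit 7; in a 2/4MB PDE, bit 7 becomes
 * PG_PS and the PAT bit moves to bit 12.  The XOR below performs
 * the move, mirroring the inverse operation in pmap_demote_pde().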
3234 */
3235 if ((newpde & PG_PTE_PAT) != 0)
3236 newpde ^= PG_PDE_PAT | PG_PTE_PAT;
3237
3238 /*
3239 * Map the superpage.
3240 */
3241 if (workaround_erratum383)
3242 pmap_update_pde(pmap, va, pde, PG_PS | newpde);
3243 else if (pmap == kernel_pmap)
3244 pmap_kenter_pde(va, PG_PS | newpde);
3245 else
3246 pde_store(pde, PG_PS | newpde);
3247
3248 pmap_pde_promotions++;
3249 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
3250 " in pmap %p", va, pmap);
3251 }
3252
3253 /*
3254 * Insert the given physical page (m) at
3255 * the specified virtual address (va) in the
3256 * target physical map with the protection requested.
3257 *
3258 * If specified, the page will be wired down, meaning
3259 * that the related pte can not be reclaimed.
3260 *
3261 * NB: This is the only routine which MAY NOT lazy-evaluate
3262 * or lose information. That is, this routine must actually
3263 * insert this page into the given map NOW.
3264 */
3265 void
3266 pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
3267 vm_prot_t prot, boolean_t wired)
3268 {
3269 vm_paddr_t pa;
3270 pd_entry_t *pde;
3271 pt_entry_t *pte;
3272 vm_paddr_t opa;
3273 pt_entry_t origpte, newpte;
3274 vm_page_t mpte, om;
3275 boolean_t invlva;
3276
3277 va = trunc_page(va);
3278 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
3279 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
3280 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va));
3281
3282 mpte = NULL;
3283
3284 vm_page_lock_queues();
3285 PMAP_LOCK(pmap);
3286 sched_pin();
3287
3288 /*
3289 * In the case that a page table page is not
3290 * resident, we are creating it here.
3291 */
3292 if (va < VM_MAXUSER_ADDRESS) {
3293 mpte = pmap_allocpte(pmap, va, M_WAITOK);
3294 }
3295
3296 pde = pmap_pde(pmap, va);
3297 if ((*pde & PG_PS) != 0)
3298 panic("pmap_enter: attempted pmap_enter on 4MB page");
3299 pte = pmap_pte_quick(pmap, va);
3300
3301 /*
3302 * The page directory entry is not valid; we need a new page table page.
3303 */
3304 if (pte == NULL) {
3305 panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
3306 (uintmax_t)pmap->pm_pdir[PTDPTDI], va);
3307 }
3308
3309 pa = VM_PAGE_TO_PHYS(m);
3310 om = NULL;
3311 origpte = *pte;
3312 opa = origpte & PG_FRAME;
3313
3314 /*
3315 * Mapping has not changed, must be protection or wiring change.
3316 */
3317 if (origpte && (opa == pa)) {
3318 /*
3319 * Wiring change, just update stats. We don't worry about
3320 * wiring PT pages as they remain resident as long as there
3321 * are valid mappings in them. Hence, if a user page is wired,
3322 * the PT page will be also.
3323 */
3324 if (wired && ((origpte & PG_W) == 0))
3325 pmap->pm_stats.wired_count++;
3326 else if (!wired && (origpte & PG_W))
3327 pmap->pm_stats.wired_count--;
3328
3329 /*
3330 * Remove extra pte reference
3331 */
3332 if (mpte)
3333 mpte->wire_count--;
3334
3335 /*
3336 * We might be turning off write access to the page,
3337 * so we go ahead and sense modify status.
3338 */
3339 if (origpte & PG_MANAGED) {
3340 om = m;
3341 pa |= PG_MANAGED;
3342 }
3343 goto validate;
3344 }
3345 /*
3346 * Mapping has changed, invalidate old range and fall through to
3347 * handle validating new mapping.
3348 */
3349 if (opa) {
3350 if (origpte & PG_W)
3351 pmap->pm_stats.wired_count--;
3352 if (origpte & PG_MANAGED) {
3353 om = PHYS_TO_VM_PAGE(opa);
3354 pmap_remove_entry(pmap, om, va);
3355 }
3356 if (mpte != NULL) {
3357 mpte->wire_count--;
3358 KASSERT(mpte->wire_count > 0,
3359 ("pmap_enter: missing reference to page table page,"
3360 " va: 0x%x", va));
3361 }
3362 } else
3363 pmap->pm_stats.resident_count++;
3364
3365 /*
3366 * Enter on the PV list if part of our managed memory.
3367 */
3368 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
3369 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
3370 ("pmap_enter: managed mapping within the clean submap"));
3371 pmap_insert_entry(pmap, va, m);
3372 pa |= PG_MANAGED;
3373 }
3374
3375 /*
3376 * Increment counters
3377 */
3378 if (wired)
3379 pmap->pm_stats.wired_count++;
3380
3381 validate:
3382 /*
3383 * Now validate mapping with desired protection/wiring.
3384 */
3385 newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
3386 if ((prot & VM_PROT_WRITE) != 0) {
3387 newpte |= PG_RW;
3388 vm_page_flag_set(m, PG_WRITEABLE);
3389 }
3390 #ifdef PAE
3391 if ((prot & VM_PROT_EXECUTE) == 0)
3392 newpte |= pg_nx;
3393 #endif
3394 if (wired)
3395 newpte |= PG_W;
3396 if (va < VM_MAXUSER_ADDRESS)
3397 newpte |= PG_U;
3398 if (pmap == kernel_pmap)
3399 newpte |= pgeflag;
3400
3401 /*
3402 * If the mapping or permission bits are different, we need
3403 * to update the pte.
3404 */
3405 if ((origpte & ~(PG_M|PG_A)) != newpte) {
3406 newpte |= PG_A;
3407 if ((access & VM_PROT_WRITE) != 0)
3408 newpte |= PG_M;
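/*
 * If the old mapping was valid, install the new PTE with an
 * atomic swap and invalidate the TLB entry only when a stale
 * entry could otherwise be observed: the frame changed or
 * execute permission was revoked while PG_A was set, or write
 * permission was removed from a dirty mapping.
 */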
3409 if (origpte & PG_V) {
3410 invlva = FALSE;
3411 origpte = pte_load_store(pte, newpte);
3412 if (origpte & PG_A) {
3413 if (origpte & PG_MANAGED)
3414 vm_page_flag_set(om, PG_REFERENCED);
3415 if (opa != VM_PAGE_TO_PHYS(m))
3416 invlva = TRUE;
3417 #ifdef PAE
3418 if ((origpte & PG_NX) == 0 &&
3419 (newpte & PG_NX) != 0)
3420 invlva = TRUE;
3421 #endif
3422 }
3423 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3424 if ((origpte & PG_MANAGED) != 0)
3425 vm_page_dirty(om);
3426 if ((prot & VM_PROT_WRITE) == 0)
3427 invlva = TRUE;
3428 }
3429 if (invlva)
3430 pmap_invalidate_page(pmap, va);
3431 } else
3432 pte_store(pte, newpte);
3433 }
3434
3435 /*
3436 * If both the page table page and the reservation are fully
3437 * populated, then attempt promotion.
3438 */
3439 if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3440 pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0)
3441 pmap_promote_pde(pmap, pde, va);
3442
3443 sched_unpin();
3444 vm_page_unlock_queues();
3445 PMAP_UNLOCK(pmap);
3446 }
3447
3448 /*
3449 * Tries to create a 2- or 4MB page mapping. Returns TRUE if successful and
3450 * FALSE otherwise. Fails if (1) a page table page cannot be allocated without
3451 * blocking, (2) a mapping already exists at the specified virtual address, or
3452 * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3453 */
3454 static boolean_t
3455 pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3456 {
3457 pd_entry_t *pde, newpde;
3458
3459 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3460 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3461 pde = pmap_pde(pmap, va);
3462 if (*pde != 0) {
3463 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3464 " in pmap %p", va, pmap);
3465 return (FALSE);
3466 }
3467 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
3468 PG_PS | PG_V;
3469 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
3470 newpde |= PG_MANAGED;
3471
3472 /*
3473 * Abort this mapping if its PV entry could not be created.
3474 */
3475 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
3476 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3477 " in pmap %p", va, pmap);
3478 return (FALSE);
3479 }
3480 }
3481 #ifdef PAE
3482 if ((prot & VM_PROT_EXECUTE) == 0)
3483 newpde |= pg_nx;
3484 #endif
3485 if (va < VM_MAXUSER_ADDRESS)
3486 newpde |= PG_U;
3487
3488 /*
3489 * Increment counters.
3490 */
3491 pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
3492
3493 /*
3494 * Map the superpage.
3495 */
3496 pde_store(pde, newpde);
3497
3498 pmap_pde_mappings++;
3499 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3500 " in pmap %p", va, pmap);
3501 return (TRUE);
3502 }
3503
3504 /*
3505 * Maps a sequence of resident pages belonging to the same object.
3506 * The sequence begins with the given page m_start. This page is
3507 * mapped at the given virtual address start. Each subsequent page is
3508 * mapped at a virtual address that is offset from start by the same
3509 * amount as the page is offset from m_start within the object. The
3510 * last page in the sequence is the page with the largest offset from
3511 * m_start that can be mapped at a virtual address less than the given
3512 * virtual address end. Not every virtual page between start and end
3513 * is mapped; only those for which a resident page exists with the
3514 * corresponding offset from m_start are mapped.
3515 */
3516 void
3517 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3518 vm_page_t m_start, vm_prot_t prot)
3519 {
3520 vm_offset_t va;
3521 vm_page_t m, mpte;
3522 vm_pindex_t diff, psize;
3523
3524 VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
3525 psize = atop(end - start);
3526 mpte = NULL;
3527 m = m_start;
3528 PMAP_LOCK(pmap);
3529 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3530 va = start + ptoa(diff);
3531 if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3532 (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
3533 pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
3534 pmap_enter_pde(pmap, va, m, prot))
3535 m = &m[NBPDR / PAGE_SIZE - 1];
3536 else
3537 mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3538 mpte);
3539 m = TAILQ_NEXT(m, listq);
3540 }
3541 PMAP_UNLOCK(pmap);
3542 }
3543
3544 /*
3545 * This code makes some *MAJOR* assumptions:
3546 * 1. The current pmap and the target pmap exist.
3547 * 2. The mapping is not wired.
3548 * 3. Read access only.
3549 * 4. No page table pages.
3550 * It is, however, *MUCH* faster than pmap_enter...
3551 */
3552
3553 void
3554 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3555 {
3556
3557 PMAP_LOCK(pmap);
3558 (void) pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3559 PMAP_UNLOCK(pmap);
3560 }
3561
3562 static vm_page_t
3563 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3564 vm_prot_t prot, vm_page_t mpte)
3565 {
3566 pt_entry_t *pte;
3567 vm_paddr_t pa;
3568 vm_page_t free;
3569
3570 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3571 (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0,
3572 ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3573 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3574 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3575
3576 /*
3577 * In the case that a page table page is not
3578 * resident, we are creating it here.
3579 */
3580 if (va < VM_MAXUSER_ADDRESS) {
3581 unsigned ptepindex;
3582 pd_entry_t ptepa;
3583
3584 /*
3585 * Calculate the page table page index.
3586 */
3587 ptepindex = va >> PDRSHIFT;
3588 if (mpte && (mpte->pindex == ptepindex)) {
3589 mpte->wire_count++;
3590 } else {
3591 /*
3592 * Get the page directory entry
3593 */
3594 ptepa = pmap->pm_pdir[ptepindex];
3595
3596 /*
3597 * If the page table page is mapped, we just increment
3598 * the wire count and activate it.
3599 */
3600 if (ptepa) {
3601 if (ptepa & PG_PS)
3602 return (NULL);
3603 mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
3604 mpte->wire_count++;
3605 } else {
3606 mpte = _pmap_allocpte(pmap, ptepindex,
3607 M_NOWAIT);
3608 if (mpte == NULL)
3609 return (mpte);
3610 }
3611 }
3612 } else {
3613 mpte = NULL;
3614 }
3615
3616 /*
3617 * This call to vtopte makes the assumption that we are
3618 * entering the page into the current pmap. In order to support
3619 * quick entry into any pmap, one would likely use pmap_pte_quick.
3620 * But that isn't as quick as vtopte.
3621 */
3622 pte = vtopte(va);
3623 if (*pte) {
3624 if (mpte != NULL) {
3625 mpte->wire_count--;
3626 mpte = NULL;
3627 }
3628 return (mpte);
3629 }
3630
3631 /*
3632 * Enter on the PV list if part of our managed memory.
3633 */
3634 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 &&
3635 !pmap_try_insert_pv_entry(pmap, va, m)) {
3636 if (mpte != NULL) {
3637 free = NULL;
3638 if (pmap_unwire_pte_hold(pmap, mpte, &free)) {
3639 pmap_invalidate_page(pmap, va);
3640 pmap_free_zero_pages(free);
3641 }
3642
3643 mpte = NULL;
3644 }
3645 return (mpte);
3646 }
3647
3648 /*
3649 * Increment counters
3650 */
3651 pmap->pm_stats.resident_count++;
3652
3653 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
3654 #ifdef PAE
3655 if ((prot & VM_PROT_EXECUTE) == 0)
3656 pa |= pg_nx;
3657 #endif
3658
3659 /*
3660 * Now validate mapping with RO protection
3661 */
3662 if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
3663 pte_store(pte, pa | PG_V | PG_U);
3664 else
3665 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3666 return (mpte);
3667 }
3668
3669 /*
3670 * Make a temporary mapping for a physical address. This is only intended
3671 * to be used for panic dumps.
3672 */
3673 void *
3674 pmap_kenter_temporary(vm_paddr_t pa, int i)
3675 {
3676 vm_offset_t va;
3677
3678 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3679 pmap_kenter(va, pa);
3680 invlpg(va);
3681 return ((void *)crashdumpmap);
3682 }
3683
3684 /*
3685 * This code maps large physical mmap regions into the
3686 * processor address space. Note that some shortcuts
3687 * are taken, but the code works.
3688 */
3689 void
3690 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3691 vm_pindex_t pindex, vm_size_t size)
3692 {
3693 pd_entry_t *pde;
3694 vm_paddr_t pa, ptepa;
3695 vm_page_t p;
3696 int pat_mode;
3697
3698 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
3699 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3700 ("pmap_object_init_pt: non-device object"));
3701 if (pseflag &&
3702 (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
3703 if (!vm_object_populate(object, pindex, pindex + atop(size)))
3704 return;
3705 p = vm_page_lookup(object, pindex);
3706 KASSERT(p->valid == VM_PAGE_BITS_ALL,
3707 ("pmap_object_init_pt: invalid page %p", p));
3708 pat_mode = p->md.pat_mode;
3709
3710 /*
3711 * Abort the mapping if the first page is not physically
3712 * aligned to a 2/4MB page boundary.
3713 */
3714 ptepa = VM_PAGE_TO_PHYS(p);
3715 if (ptepa & (NBPDR - 1))
3716 return;
3717
3718 /*
3719 * Skip the first page. Abort the mapping if the rest of
3720 * the pages are not physically contiguous or have differing
3721 * memory attributes.
3722 */
3723 p = TAILQ_NEXT(p, listq);
3724 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3725 pa += PAGE_SIZE) {
3726 KASSERT(p->valid == VM_PAGE_BITS_ALL,
3727 ("pmap_object_init_pt: invalid page %p", p));
3728 if (pa != VM_PAGE_TO_PHYS(p) ||
3729 pat_mode != p->md.pat_mode)
3730 return;
3731 p = TAILQ_NEXT(p, listq);
3732 }
3733
3734 /*
3735 * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and
3736 * "size" is a multiple of 2/4M, adding the PAT setting to
3737 * "pa" will not affect the termination of this loop.
3738 */
3739 PMAP_LOCK(pmap);
3740 for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
3741 size; pa += NBPDR) {
3742 pde = pmap_pde(pmap, addr);
3743 if (*pde == 0) {
3744 pde_store(pde, pa | PG_PS | PG_M | PG_A |
3745 PG_U | PG_RW | PG_V);
3746 pmap->pm_stats.resident_count += NBPDR /
3747 PAGE_SIZE;
3748 pmap_pde_mappings++;
3749 }
3750 /* Else continue on if the PDE is already valid. */
3751 addr += NBPDR;
3752 }
3753 PMAP_UNLOCK(pmap);
3754 }
3755 }
3756
3757 /*
3758 * Routine: pmap_change_wiring
3759 * Function: Change the wiring attribute for a map/virtual-address
3760 * pair.
3761 * In/out conditions:
3762 * The mapping must already exist in the pmap.
3763 */
3764 void
3765 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
3766 {
3767 pd_entry_t *pde;
3768 pt_entry_t *pte;
3769 boolean_t are_queues_locked;
3770
3771 are_queues_locked = FALSE;
3772 retry:
3773 PMAP_LOCK(pmap);
3774 pde = pmap_pde(pmap, va);
3775 if ((*pde & PG_PS) != 0) {
3776 if (!wired != ((*pde & PG_W) == 0)) {
3777 if (!are_queues_locked) {
3778 are_queues_locked = TRUE;
3779 if (!mtx_trylock(&vm_page_queue_mtx)) {
3780 PMAP_UNLOCK(pmap);
3781 vm_page_lock_queues();
3782 goto retry;
3783 }
3784 }
3785 if (!pmap_demote_pde(pmap, pde, va))
3786 panic("pmap_change_wiring: demotion failed");
3787 } else
3788 goto out;
3789 }
3790 pte = pmap_pte(pmap, va);
3791
3792 if (wired && !pmap_pte_w(pte))
3793 pmap->pm_stats.wired_count++;
3794 else if (!wired && pmap_pte_w(pte))
3795 pmap->pm_stats.wired_count--;
3796
3797 /*
3798 * Wiring is not a hardware characteristic so there is no need to
3799 * invalidate the TLB.
3800 */
3801 pmap_pte_set_w(pte, wired);
3802 pmap_pte_release(pte);
3803 out:
3804 if (are_queues_locked)
3805 vm_page_unlock_queues();
3806 PMAP_UNLOCK(pmap);
3807 }
3808
3809
3810
3811 /*
3812 * Copy the range specified by src_addr/len
3813 * from the source map to the range dst_addr/len
3814 * in the destination map.
3815 *
3816 * This routine is only advisory and need not do anything.
3817 */
3818
3819 void
3820 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
3821 vm_offset_t src_addr)
3822 {
3823 vm_page_t free;
3824 vm_offset_t addr;
3825 vm_offset_t end_addr = src_addr + len;
3826 vm_offset_t pdnxt;
3827
3828 if (dst_addr != src_addr)
3829 return;
3830
3831 if (!pmap_is_current(src_pmap))
3832 return;
3833
3834 vm_page_lock_queues();
3835 if (dst_pmap < src_pmap) {
3836 PMAP_LOCK(dst_pmap);
3837 PMAP_LOCK(src_pmap);
3838 } else {
3839 PMAP_LOCK(src_pmap);
3840 PMAP_LOCK(dst_pmap);
3841 }
3842 sched_pin();
3843 for (addr = src_addr; addr < end_addr; addr = pdnxt) {
3844 pt_entry_t *src_pte, *dst_pte;
3845 vm_page_t dstmpte, srcmpte;
3846 pd_entry_t srcptepaddr;
3847 unsigned ptepindex;
3848
3849 KASSERT(addr < UPT_MIN_ADDRESS,
3850 ("pmap_copy: invalid to pmap_copy page tables"));
3851
3852 pdnxt = (addr + NBPDR) & ~PDRMASK;
3853 if (pdnxt < addr)
3854 pdnxt = end_addr;
3855 ptepindex = addr >> PDRSHIFT;
3856
3857 srcptepaddr = src_pmap->pm_pdir[ptepindex];
3858 if (srcptepaddr == 0)
3859 continue;
3860
3861 if (srcptepaddr & PG_PS) {
3862 if (dst_pmap->pm_pdir[ptepindex] == 0 &&
3863 ((srcptepaddr & PG_MANAGED) == 0 ||
3864 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
3865 PG_PS_FRAME))) {
3866 dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
3867 ~PG_W;
3868 dst_pmap->pm_stats.resident_count +=
3869 NBPDR / PAGE_SIZE;
3870 }
3871 continue;
3872 }
3873
3874 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
3875 KASSERT(srcmpte->wire_count > 0,
3876 ("pmap_copy: source page table page is unused"));
3877
3878 if (pdnxt > end_addr)
3879 pdnxt = end_addr;
3880
3881 src_pte = vtopte(addr);
3882 while (addr < pdnxt) {
3883 pt_entry_t ptetemp;
3884 ptetemp = *src_pte;
3885 /*
3886 * We only virtual-copy managed pages.
3887 */
3888 if ((ptetemp & PG_MANAGED) != 0) {
3889 dstmpte = pmap_allocpte(dst_pmap, addr,
3890 M_NOWAIT);
3891 if (dstmpte == NULL)
3892 goto out;
3893 dst_pte = pmap_pte_quick(dst_pmap, addr);
3894 if (*dst_pte == 0 &&
3895 pmap_try_insert_pv_entry(dst_pmap, addr,
3896 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
3897 /*
3898 * Clear the wired, modified, and
3899 * accessed (referenced) bits
3900 * during the copy.
3901 */
3902 *dst_pte = ptetemp & ~(PG_W | PG_M |
3903 PG_A);
3904 dst_pmap->pm_stats.resident_count++;
3905 } else {
3906 free = NULL;
3907 if (pmap_unwire_pte_hold(dst_pmap,
3908 dstmpte, &free)) {
3909 pmap_invalidate_page(dst_pmap,
3910 addr);
3911 pmap_free_zero_pages(free);
3912 }
3913 goto out;
3914 }
3915 if (dstmpte->wire_count >= srcmpte->wire_count)
3916 break;
3917 }
3918 addr += PAGE_SIZE;
3919 src_pte++;
3920 }
3921 }
3922 out:
3923 sched_unpin();
3924 vm_page_unlock_queues();
3925 PMAP_UNLOCK(src_pmap);
3926 PMAP_UNLOCK(dst_pmap);
3927 }
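/*
 * Note the deadlock-avoidance idiom near the top of pmap_copy(): when
 * two pmaps must be locked at once they are always taken in ascending
 * address order, so concurrent copies between the same pair of pmaps
 * cannot take the locks in opposite orders.  A minimal sketch:
 *
 *	if (a < b) {
 *		PMAP_LOCK(a);
 *		PMAP_LOCK(b);
 *	} else {
 *		PMAP_LOCK(b);
 *		PMAP_LOCK(a);
 *	}
 */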
3928
3929 static __inline void
3930 pagezero(void *page)
3931 {
3932 #if defined(I686_CPU)
3933 if (cpu_class == CPUCLASS_686) {
3934 #if defined(CPU_ENABLE_SSE)
3935 if (cpu_feature & CPUID_SSE2)
3936 sse2_pagezero(page);
3937 else
3938 #endif
3939 i686_pagezero(page);
3940 } else
3941 #endif
3942 bzero(page, PAGE_SIZE);
3943 }
3944
3945 /*
3946 * pmap_zero_page zeros the specified hardware page by mapping
3947 * the page into KVM and using bzero to clear its contents.
3948 */
3949 void
3950 pmap_zero_page(vm_page_t m)
3951 {
3952 struct sysmaps *sysmaps;
3953
3954 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
3955 mtx_lock(&sysmaps->lock);
3956 if (*sysmaps->CMAP2)
3957 panic("pmap_zero_page: CMAP2 busy");
3958 sched_pin();
3959 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
3960 pmap_cache_bits(m->md.pat_mode, 0);
3961 invlcaddr(sysmaps->CADDR2);
3962 pagezero(sysmaps->CADDR2);
3963 *sysmaps->CMAP2 = 0;
3964 sched_unpin();
3965 mtx_unlock(&sysmaps->lock);
3966 }
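/*
 * pmap_zero_page() above is an instance of the per-CPU transient
 * mapping idiom used throughout this file: pin the thread to its CPU,
 * install a private PTE (CMAP2), flush the stale TLB entry, operate
 * through the window address (CADDR2), then unmap.  An outline of the
 * steps for a hypothetical window pte/va pair:
 *
 *	sched_pin();			(stay on this CPU's window)
 *	*cmap = pa | PG_V | PG_RW | ...;
 *	invlcaddr(caddr);		(flush the old translation)
 *	... use caddr ...
 *	*cmap = 0;
 *	sched_unpin();
 */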
3967
3968 /*
3969 * pmap_zero_page_area zeros the specified hardware page by mapping
3970 * the page into KVM and using bzero to clear its contents.
3971 *
3972 * off and size may not cover an area beyond a single hardware page.
3973 */
3974 void
3975 pmap_zero_page_area(vm_page_t m, int off, int size)
3976 {
3977 struct sysmaps *sysmaps;
3978
3979 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
3980 mtx_lock(&sysmaps->lock);
3981 if (*sysmaps->CMAP2)
3982 panic("pmap_zero_page_area: CMAP2 busy");
3983 sched_pin();
3984 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
3985 pmap_cache_bits(m->md.pat_mode, 0);
3986 invlcaddr(sysmaps->CADDR2);
3987 if (off == 0 && size == PAGE_SIZE)
3988 pagezero(sysmaps->CADDR2);
3989 else
3990 bzero((char *)sysmaps->CADDR2 + off, size);
3991 *sysmaps->CMAP2 = 0;
3992 sched_unpin();
3993 mtx_unlock(&sysmaps->lock);
3994 }
3995
3996 /*
3997 * pmap_zero_page_idle zeros the specified hardware page by mapping
3998 * the page into KVM and using bzero to clear its contents. This
3999 * is intended to be called from the vm_pagezero process only and
4000 * outside of Giant.
4001 */
4002 void
4003 pmap_zero_page_idle(vm_page_t m)
4004 {
4005
4006 if (*CMAP3)
4007 panic("pmap_zero_page_idle: CMAP3 busy");
4008 sched_pin();
4009 *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4010 pmap_cache_bits(m->md.pat_mode, 0);
4011 invlcaddr(CADDR3);
4012 pagezero(CADDR3);
4013 *CMAP3 = 0;
4014 sched_unpin();
4015 }
4016
4017 /*
4018 * pmap_copy_page copies the specified (machine-independent)
4019 * page by mapping the page into virtual memory and using
4020 * bcopy to copy the page, one machine-dependent page at a
4021 * time.
4022 */
4023 void
4024 pmap_copy_page(vm_page_t src, vm_page_t dst)
4025 {
4026 struct sysmaps *sysmaps;
4027
4028 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4029 mtx_lock(&sysmaps->lock);
4030 if (*sysmaps->CMAP1)
4031 panic("pmap_copy_page: CMAP1 busy");
4032 if (*sysmaps->CMAP2)
4033 panic("pmap_copy_page: CMAP2 busy");
4034 sched_pin();
4035 invlpg((u_int)sysmaps->CADDR1);
4036 invlpg((u_int)sysmaps->CADDR2);
4037 *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A |
4038 pmap_cache_bits(src->md.pat_mode, 0);
4039 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M |
4040 pmap_cache_bits(dst->md.pat_mode, 0);
4041 bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
4042 *sysmaps->CMAP1 = 0;
4043 *sysmaps->CMAP2 = 0;
4044 sched_unpin();
4045 mtx_unlock(&sysmaps->lock);
4046 }
4047
4048 /*
4049 * Returns true if the pmap's pv is one of the first
4050 * 16 pvs linked to from this page. This count may
4051 * be changed upwards or downwards in the future; it
4052 * is only necessary that true be returned for a small
4053 * subset of pmaps for proper page aging.
4054 */
4055 boolean_t
4056 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4057 {
4058 struct md_page *pvh;
4059 pv_entry_t pv;
4060 int loops = 0;
4061
4062 if (m->flags & PG_FICTITIOUS)
4063 return (FALSE);
4064
4065 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4066 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4067 if (PV_PMAP(pv) == pmap) {
4068 return (TRUE);
4069 }
4070 loops++;
4071 if (loops >= 16)
4072 break;
4073 }
4074 if (loops < 16) {
4075 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4076 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4077 if (PV_PMAP(pv) == pmap)
4078 return (TRUE);
4079 loops++;
4080 if (loops >= 16)
4081 break;
4082 }
4083 }
4084 return (FALSE);
4085 }
4086
4087 /*
4088 * Returns TRUE if the given page is mapped individually or as part of
4089 * a 4mpage. Otherwise, returns FALSE.
4090 */
4091 boolean_t
4092 pmap_page_is_mapped(vm_page_t m)
4093 {
4094 struct md_page *pvh;
4095
4096 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
4097 return (FALSE);
4098 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4099 if (TAILQ_EMPTY(&m->md.pv_list)) {
4100 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4101 return (!TAILQ_EMPTY(&pvh->pv_list));
4102 } else
4103 return (TRUE);
4104 }
4105
4106 /*
4107 * Remove all pages from the specified address space;
4108 * this aids process exit speed. Also, this code is
4109 * special-cased for the current process only, but
4110 * can have the more generic (and slightly slower)
4111 * mode enabled. This is much faster than pmap_remove
4112 * in the case of running down an entire address space.
4113 */
4114 void
4115 pmap_remove_pages(pmap_t pmap)
4116 {
4117 pt_entry_t *pte, tpte;
4118 vm_page_t free = NULL;
4119 vm_page_t m, mpte, mt;
4120 pv_entry_t pv;
4121 struct md_page *pvh;
4122 struct pv_chunk *pc, *npc;
4123 int field, idx;
4124 int32_t bit;
4125 uint32_t inuse, bitmask;
4126 int allfree;
4127
4128 if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
4129 printf("warning: pmap_remove_pages called with non-current pmap\n");
4130 return;
4131 }
4132 vm_page_lock_queues();
4133 PMAP_LOCK(pmap);
4134 sched_pin();
4135 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4136 allfree = 1;
4137 for (field = 0; field < _NPCM; field++) {
4138 inuse = (~(pc->pc_map[field])) & pc_freemask[field];
4139 while (inuse != 0) {
4140 bit = bsfl(inuse);
4141 bitmask = 1UL << bit;
4142 idx = field * 32 + bit;
4143 pv = &pc->pc_pventry[idx];
4144 inuse &= ~bitmask;
4145
4146 pte = pmap_pde(pmap, pv->pv_va);
4147 tpte = *pte;
4148 if ((tpte & PG_PS) == 0) {
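/*
 * The PAT bit in a 4KB PTE occupies the same bit
 * position as PG_PS, so mask it off to keep the
 * PG_PS tests below from misreading a small mapping.
 */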
4149 pte = vtopte(pv->pv_va);
4150 tpte = *pte & ~PG_PTE_PAT;
4151 }
4152
4153 if (tpte == 0) {
4154 printf(
4155 "TPTE at %p IS ZERO @ VA %08x\n",
4156 pte, pv->pv_va);
4157 panic("bad pte");
4158 }
4159
4160 /*
4161 * We cannot remove wired pages from a process' mapping at this time
4162 */
4163 if (tpte & PG_W) {
4164 allfree = 0;
4165 continue;
4166 }
4167
4168 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
4169 KASSERT(m->phys_addr == (tpte & PG_FRAME),
4170 ("vm_page_t %p phys_addr mismatch %016jx %016jx",
4171 m, (uintmax_t)m->phys_addr,
4172 (uintmax_t)tpte));
4173
4174 KASSERT(m < &vm_page_array[vm_page_array_size],
4175 ("pmap_remove_pages: bad tpte %#jx",
4176 (uintmax_t)tpte));
4177
4178 pte_clear(pte);
4179
4180 /*
4181 * Update the vm_page_t clean/reference bits.
4182 */
4183 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4184 if ((tpte & PG_PS) != 0) {
4185 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4186 vm_page_dirty(mt);
4187 } else
4188 vm_page_dirty(m);
4189 }
4190
4191 /* Mark free */
4192 PV_STAT(pv_entry_frees++);
4193 PV_STAT(pv_entry_spare++);
4194 pv_entry_count--;
4195 pc->pc_map[field] |= bitmask;
4196 if ((tpte & PG_PS) != 0) {
4197 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
4198 pvh = pa_to_pvh(tpte & PG_PS_FRAME);
4199 TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
4200 if (TAILQ_EMPTY(&pvh->pv_list)) {
4201 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4202 if (TAILQ_EMPTY(&mt->md.pv_list))
4203 vm_page_flag_clear(mt, PG_WRITEABLE);
4204 }
4205 mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
4206 if (mpte != NULL) {
4207 pmap_remove_pt_page(pmap, mpte);
4208 pmap->pm_stats.resident_count--;
4209 KASSERT(mpte->wire_count == NPTEPG,
4210 ("pmap_remove_pages: pte page wire count error"));
4211 mpte->wire_count = 0;
4212 pmap_add_delayed_free_list(mpte, &free, FALSE);
4213 atomic_subtract_int(&cnt.v_wire_count, 1);
4214 }
4215 } else {
4216 pmap->pm_stats.resident_count--;
4217 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4218 if (TAILQ_EMPTY(&m->md.pv_list)) {
4219 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4220 if (TAILQ_EMPTY(&pvh->pv_list))
4221 vm_page_flag_clear(m, PG_WRITEABLE);
4222 }
4223 pmap_unuse_pt(pmap, pv->pv_va, &free);
4224 }
4225 }
4226 }
4227 if (allfree) {
4228 PV_STAT(pv_entry_spare -= _NPCPV);
4229 PV_STAT(pc_chunk_count--);
4230 PV_STAT(pc_chunk_frees++);
4231 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4232 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
4233 pmap_qremove((vm_offset_t)pc, 1);
4234 vm_page_unwire(m, 0);
4235 vm_page_free(m);
4236 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
4237 }
4238 }
4239 sched_unpin();
4240 pmap_invalidate_all(pmap);
4241 vm_page_unlock_queues();
4242 PMAP_UNLOCK(pmap);
4243 pmap_free_zero_pages(free);
4244 }
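/*
 * The chunk scan in pmap_remove_pages() walks live pv entries by
 * iterating over the set bits of each 32-bit map word with bsfl().
 * A minimal sketch of that bitmap-iteration idiom ("inuse" is any
 * word whose 1-bits mark live entries):
 *
 *	while (inuse != 0) {
 *		bit = bsfl(inuse);		(lowest set bit)
 *		inuse &= ~(1u << bit);
 *		... process entry "bit" ...
 *	}
 */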
4245
4246 /*
4247 * pmap_is_modified:
4248 *
4249 * Return whether or not the specified physical page was modified
4250 * in any physical maps.
4251 */
4252 boolean_t
4253 pmap_is_modified(vm_page_t m)
4254 {
4255
4256 if (m->flags & PG_FICTITIOUS)
4257 return (FALSE);
4258 if (pmap_is_modified_pvh(&m->md))
4259 return (TRUE);
4260 return (pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4261 }
4262
4263 /*
4264 * Returns TRUE if any of the given mappings were used to modify
4265 * physical memory. Otherwise, returns FALSE. Both 4KB and 2/4MB
4266 * page mappings are supported.
4267 */
4268 static boolean_t
4269 pmap_is_modified_pvh(struct md_page *pvh)
4270 {
4271 pv_entry_t pv;
4272 pt_entry_t *pte;
4273 pmap_t pmap;
4274 boolean_t rv;
4275
4276 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4277 rv = FALSE;
4278 sched_pin();
4279 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4280 pmap = PV_PMAP(pv);
4281 PMAP_LOCK(pmap);
4282 pte = pmap_pte_quick(pmap, pv->pv_va);
4283 rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
4284 PMAP_UNLOCK(pmap);
4285 if (rv)
4286 break;
4287 }
4288 sched_unpin();
4289 return (rv);
4290 }
4291
4292 /*
4293 * pmap_is_prefaultable:
4294 *
4295 * Return whether or not the specified virtual address is eligible
4296 * for prefault.
4297 */
4298 boolean_t
4299 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4300 {
4301 pd_entry_t *pde;
4302 pt_entry_t *pte;
4303 boolean_t rv;
4304
4305 rv = FALSE;
4306 PMAP_LOCK(pmap);
4307 pde = pmap_pde(pmap, addr);
4308 if (*pde != 0 && (*pde & PG_PS) == 0) {
4309 pte = vtopte(addr);
4310 rv = *pte == 0;
4311 }
4312 PMAP_UNLOCK(pmap);
4313 return (rv);
4314 }
4315
4316 /*
4317 * Clear the write and modified bits in each of the given page's mappings.
4318 */
4319 void
4320 pmap_remove_write(vm_page_t m)
4321 {
4322 struct md_page *pvh;
4323 pv_entry_t next_pv, pv;
4324 pmap_t pmap;
4325 pd_entry_t *pde;
4326 pt_entry_t oldpte, *pte;
4327 vm_offset_t va;
4328
4329 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4330 if ((m->flags & PG_FICTITIOUS) != 0 ||
4331 (m->flags & PG_WRITEABLE) == 0)
4332 return;
4333 sched_pin();
4334 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4335 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4336 va = pv->pv_va;
4337 pmap = PV_PMAP(pv);
4338 PMAP_LOCK(pmap);
4339 pde = pmap_pde(pmap, va);
4340 if ((*pde & PG_RW) != 0)
4341 (void)pmap_demote_pde(pmap, pde, va);
4342 PMAP_UNLOCK(pmap);
4343 }
4344 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4345 pmap = PV_PMAP(pv);
4346 PMAP_LOCK(pmap);
4347 pde = pmap_pde(pmap, pv->pv_va);
4348 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
4349 " a 4mpage in page %p's pv list", m));
4350 pte = pmap_pte_quick(pmap, pv->pv_va);
4351 retry:
4352 oldpte = *pte;
4353 if ((oldpte & PG_RW) != 0) {
4354 /*
4355 * Regardless of whether a pte is 32 or 64 bits
4356 * in size, PG_RW and PG_M are among the least
4357 * significant 32 bits.
4358 */
4359 if (!atomic_cmpset_int((u_int *)pte, oldpte,
4360 oldpte & ~(PG_RW | PG_M)))
4361 goto retry;
4362 if ((oldpte & PG_M) != 0)
4363 vm_page_dirty(m);
4364 pmap_invalidate_page(pmap, pv->pv_va);
4365 }
4366 PMAP_UNLOCK(pmap);
4367 }
4368 vm_page_flag_clear(m, PG_WRITEABLE);
4369 sched_unpin();
4370 }
4371
4372 /*
4373 * pmap_ts_referenced:
4374 *
4375 * Return a count of reference bits for a page, clearing those bits.
4376 * It is not necessary for every reference bit to be cleared, but it
4377 * is necessary that 0 only be returned when there are truly no
4378 * reference bits set.
4379 *
4380 * XXX: The exact number of bits to check and clear is a matter that
4381 * should be tested and standardized at some point in the future for
4382 * optimal aging of shared pages.
4383 */
4384 int
4385 pmap_ts_referenced(vm_page_t m)
4386 {
4387 struct md_page *pvh;
4388 pv_entry_t pv, pvf, pvn;
4389 pmap_t pmap;
4390 pd_entry_t oldpde, *pde;
4391 pt_entry_t *pte;
4392 vm_offset_t va;
4393 int rtval = 0;
4394
4395 if (m->flags & PG_FICTITIOUS)
4396 return (rtval);
4397 sched_pin();
4398 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4399 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4400 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
4401 va = pv->pv_va;
4402 pmap = PV_PMAP(pv);
4403 PMAP_LOCK(pmap);
4404 pde = pmap_pde(pmap, va);
4405 oldpde = *pde;
4406 if ((oldpde & PG_A) != 0) {
4407 if (pmap_demote_pde(pmap, pde, va)) {
4408 if ((oldpde & PG_W) == 0) {
4409 /*
4410 * Remove the mapping to a single page
4411 * so that a subsequent access may
4412 * repromote. Since the underlying
4413 * page table page is fully populated,
4414 * this removal never frees a page
4415 * table page.
4416 */
4417 va += VM_PAGE_TO_PHYS(m) - (oldpde &
4418 PG_PS_FRAME);
4419 pmap_remove_page(pmap, va, NULL);
4420 rtval++;
4421 if (rtval > 4) {
4422 PMAP_UNLOCK(pmap);
4423 goto out;
4424 }
4425 }
4426 }
4427 }
4428 PMAP_UNLOCK(pmap);
4429 }
4430 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4431 pvf = pv;
4432 do {
4433 pvn = TAILQ_NEXT(pv, pv_list);
4434 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4435 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
4436 pmap = PV_PMAP(pv);
4437 PMAP_LOCK(pmap);
4438 pde = pmap_pde(pmap, pv->pv_va);
4439 KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:"
4440 " found a 4mpage in page %p's pv list", m));
4441 pte = pmap_pte_quick(pmap, pv->pv_va);
4442 if ((*pte & PG_A) != 0) {
4443 atomic_clear_int((u_int *)pte, PG_A);
4444 pmap_invalidate_page(pmap, pv->pv_va);
4445 rtval++;
4446 if (rtval > 4)
4447 pvn = NULL;
4448 }
4449 PMAP_UNLOCK(pmap);
4450 } while ((pv = pvn) != NULL && pv != pvf);
4451 }
4452 out:
4453 sched_unpin();
4454 return (rtval);
4455 }
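/*
 * The 4KB loop in pmap_ts_referenced() rotates each sampled pv entry
 * to the list tail, so successive calls age a page's mappings
 * round-robin rather than always re-sampling the same first few:
 *
 *	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 *	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 */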
4456
4457 /*
4458 * Clear the modify bits on the specified physical page.
4459 */
4460 void
4461 pmap_clear_modify(vm_page_t m)
4462 {
4463 struct md_page *pvh;
4464 pv_entry_t next_pv, pv;
4465 pmap_t pmap;
4466 pd_entry_t oldpde, *pde;
4467 pt_entry_t oldpte, *pte;
4468 vm_offset_t va;
4469
4470 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4471 if ((m->flags & PG_FICTITIOUS) != 0)
4472 return;
4473 sched_pin();
4474 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4475 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4476 va = pv->pv_va;
4477 pmap = PV_PMAP(pv);
4478 PMAP_LOCK(pmap);
4479 pde = pmap_pde(pmap, va);
4480 oldpde = *pde;
4481 if ((oldpde & PG_RW) != 0) {
4482 if (pmap_demote_pde(pmap, pde, va)) {
4483 if ((oldpde & PG_W) == 0) {
4484 /*
4485 * Write protect the mapping to a
4486 * single page so that a subsequent
4487 * write access may repromote.
4488 */
4489 va += VM_PAGE_TO_PHYS(m) - (oldpde &
4490 PG_PS_FRAME);
4491 pte = pmap_pte_quick(pmap, va);
4492 oldpte = *pte;
4493 if ((oldpte & PG_V) != 0) {
4494 /*
4495 * Regardless of whether a pte is 32 or 64 bits
4496 * in size, PG_RW and PG_M are among the least
4497 * significant 32 bits.
4498 */
4499 while (!atomic_cmpset_int((u_int *)pte,
4500 oldpte,
4501 oldpte & ~(PG_M | PG_RW)))
4502 oldpte = *pte;
4503 vm_page_dirty(m);
4504 pmap_invalidate_page(pmap, va);
4505 }
4506 }
4507 }
4508 }
4509 PMAP_UNLOCK(pmap);
4510 }
4511 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4512 pmap = PV_PMAP(pv);
4513 PMAP_LOCK(pmap);
4514 pde = pmap_pde(pmap, pv->pv_va);
4515 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
4516 " a 4mpage in page %p's pv list", m));
4517 pte = pmap_pte_quick(pmap, pv->pv_va);
4518 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4519 /*
4520 * Regardless of whether a pte is 32 or 64 bits
4521 * in size, PG_M is among the least significant
4522 * 32 bits.
4523 */
4524 atomic_clear_int((u_int *)pte, PG_M);
4525 pmap_invalidate_page(pmap, pv->pv_va);
4526 }
4527 PMAP_UNLOCK(pmap);
4528 }
4529 sched_unpin();
4530 }
4531
4532 /*
4533 * pmap_clear_reference:
4534 *
4535 * Clear the reference bit on the specified physical page.
4536 */
4537 void
4538 pmap_clear_reference(vm_page_t m)
4539 {
4540 struct md_page *pvh;
4541 pv_entry_t next_pv, pv;
4542 pmap_t pmap;
4543 pd_entry_t oldpde, *pde;
4544 pt_entry_t *pte;
4545 vm_offset_t va;
4546
4547 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4548 if ((m->flags & PG_FICTITIOUS) != 0)
4549 return;
4550 sched_pin();
4551 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4552 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4553 va = pv->pv_va;
4554 pmap = PV_PMAP(pv);
4555 PMAP_LOCK(pmap);
4556 pde = pmap_pde(pmap, va);
4557 oldpde = *pde;
4558 if ((oldpde & PG_A) != 0) {
4559 if (pmap_demote_pde(pmap, pde, va)) {
4560 /*
4561 * Remove the mapping to a single page so
4562 * that a subsequent access may repromote.
4563 * Since the underlying page table page is
4564 * fully populated, this removal never frees
4565 * a page table page.
4566 */
4567 va += VM_PAGE_TO_PHYS(m) - (oldpde &
4568 PG_PS_FRAME);
4569 pmap_remove_page(pmap, va, NULL);
4570 }
4571 }
4572 PMAP_UNLOCK(pmap);
4573 }
4574 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4575 pmap = PV_PMAP(pv);
4576 PMAP_LOCK(pmap);
4577 pde = pmap_pde(pmap, pv->pv_va);
4578 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
4579 " a 4mpage in page %p's pv list", m));
4580 pte = pmap_pte_quick(pmap, pv->pv_va);
4581 if ((*pte & PG_A) != 0) {
4582 /*
4583 * Regardless of whether a pte is 32 or 64 bits
4584 * in size, PG_A is among the least significant
4585 * 32 bits.
4586 */
4587 atomic_clear_int((u_int *)pte, PG_A);
4588 pmap_invalidate_page(pmap, pv->pv_va);
4589 }
4590 PMAP_UNLOCK(pmap);
4591 }
4592 sched_unpin();
4593 }
4594
4595 /*
4596 * Miscellaneous support routines follow
4597 */
4598
4599 /* Adjust the cache mode for a 4KB page mapped via a PTE. */
4600 static __inline void
4601 pmap_pte_attr(pt_entry_t *pte, int cache_bits)
4602 {
4603 u_int opte, npte;
4604
4605 /*
4606 * The cache mode bits are all in the low 32-bits of the
4607 * PTE, so we can just spin on updating the low 32-bits.
4608 */
4609 do {
4610 opte = *(u_int *)pte;
4611 npte = opte & ~PG_PTE_CACHE;
4612 npte |= cache_bits;
4613 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
4614 }
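/*
 * The loop above is the usual lock-free read-modify-write idiom:
 * recompute the new value from a fresh snapshot until the CAS
 * succeeds, or until the update proves to be a no-op.  A generic
 * sketch for a hypothetical 32-bit word "w":
 *
 *	do {
 *		old = *w;
 *		new = (old & ~MASK) | bits;
 *	} while (new != old && !atomic_cmpset_int(w, old, new));
 */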
4615
4616 /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
4617 static __inline void
4618 pmap_pde_attr(pd_entry_t *pde, int cache_bits)
4619 {
4620 u_int opde, npde;
4621
4622 /*
4623 * The cache mode bits are all in the low 32-bits of the
4624 * PDE, so we can just spin on updating the low 32-bits.
4625 */
4626 do {
4627 opde = *(u_int *)pde;
4628 npde = opde & ~PG_PDE_CACHE;
4629 npde |= cache_bits;
4630 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
4631 }
4632
4633 /*
4634 * Map a set of physical memory pages into the kernel virtual
4635 * address space. Return a pointer to where it is mapped. This
4636 * routine is intended to be used for mapping device memory,
4637 * NOT real memory.
4638 */
4639 void *
4640 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
4641 {
4642 vm_offset_t va, offset;
4643 vm_size_t tmpsize;
4644
4645 offset = pa & PAGE_MASK;
4646 size = roundup(offset + size, PAGE_SIZE);
4647 pa = pa & PG_FRAME;
4648
4649 if (pa < KERNLOAD && pa + size <= KERNLOAD)
4650 va = KERNBASE + pa;
4651 else
4652 va = kmem_alloc_nofault(kernel_map, size);
4653 if (!va)
4654 panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
4655
4656 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
4657 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
4658 pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
4659 pmap_invalidate_cache_range(va, va + size);
4660 return ((void *)(va + offset));
4661 }
4662
4663 void *
4664 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
4665 {
4666
4667 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
4668 }
4669
4670 void *
4671 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4672 {
4673
4674 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
4675 }
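/*
 * A hedged usage sketch for the mapdev interface: a hypothetical
 * driver maps a device register window uncacheably and later releases
 * it.  The physical address and size below are illustrative only.
 *
 *	void *regs = pmap_mapdev(0xfeb00000, PAGE_SIZE);
 *	... access the device registers through "regs" ...
 *	pmap_unmapdev((vm_offset_t)regs, PAGE_SIZE);
 */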
4676
4677 void
4678 pmap_unmapdev(vm_offset_t va, vm_size_t size)
4679 {
4680 vm_offset_t base, offset, tmpva;
4681
4682 if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
4683 return;
4684 base = trunc_page(va);
4685 offset = va & PAGE_MASK;
4686 size = roundup(offset + size, PAGE_SIZE);
4687 for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
4688 pmap_kremove(tmpva);
4689 pmap_invalidate_range(kernel_pmap, va, tmpva);
4690 kmem_free(kernel_map, base, size);
4691 }
4692
4693 /*
4694 * Sets the memory attribute for the specified page.
4695 */
4696 void
4697 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
4698 {
4699 struct sysmaps *sysmaps;
4700 vm_offset_t sva, eva;
4701
4702 m->md.pat_mode = ma;
4703 if ((m->flags & PG_FICTITIOUS) != 0)
4704 return;
4705
4706 /*
4707 * If "m" is a normal page, flush it from the cache.
4708 * See pmap_invalidate_cache_range().
4709 *
4710 * First, try to find an existing mapping of the page by an sf
4711 * buffer. sf_buf_invalidate_cache() modifies the mapping and
4712 * flushes the cache.
4713 */
4714 if (sf_buf_invalidate_cache(m))
4715 return;
4716
4717 /*
4718 * If the page is not mapped by an sf buffer and the CPU does
4719 * not support self-snoop, map the page transiently and do the
4720 * invalidation. In the worst case, the whole cache is flushed
4721 * by pmap_invalidate_cache_range().
4722 */
4723 if ((cpu_feature & (CPUID_SS|CPUID_CLFSH)) == CPUID_CLFSH) {
4724 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4725 mtx_lock(&sysmaps->lock);
4726 if (*sysmaps->CMAP2)
4727 panic("pmap_page_set_memattr: CMAP2 busy");
4728 sched_pin();
4729 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
4730 PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0);
4731 invlcaddr(sysmaps->CADDR2);
4732 sva = (vm_offset_t)sysmaps->CADDR2;
4733 eva = sva + PAGE_SIZE;
4734 } else
4735 sva = eva = 0; /* gcc */
4736 pmap_invalidate_cache_range(sva, eva);
4737 if (sva != 0) {
4738 *sysmaps->CMAP2 = 0;
4739 sched_unpin();
4740 mtx_unlock(&sysmaps->lock);
4741 }
4742 }
4743
4744 /*
4745 * Changes the specified virtual address range's memory type to that given by
4746 * the parameter "mode". The specified virtual address range must be
4747 * completely contained within the kernel map.
4748 *
4749 * Returns zero if the change completed successfully, and either EINVAL or
4750 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part
4751 * of the virtual address range was not mapped, and ENOMEM is returned if
4752 * there was insufficient memory available to complete the change.
4753 */
4754 int
4755 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
4756 {
4757 vm_offset_t base, offset, tmpva;
4758 pd_entry_t *pde;
4759 pt_entry_t *pte;
4760 int cache_bits_pte, cache_bits_pde;
4761 boolean_t changed;
4762
4763 base = trunc_page(va);
4764 offset = va & PAGE_MASK;
4765 size = roundup(offset + size, PAGE_SIZE);
4766
4767 /*
4768 * Only supported on kernel virtual addresses above the recursive map.
4769 */
4770 if (base < VM_MIN_KERNEL_ADDRESS)
4771 return (EINVAL);
4772
4773 cache_bits_pde = pmap_cache_bits(mode, 1);
4774 cache_bits_pte = pmap_cache_bits(mode, 0);
4775 changed = FALSE;
4776
4777 /*
4778 * Pages that aren't mapped aren't supported. Also break down
4779 * 2/4MB pages into 4KB pages if required.
4780 */
4781 PMAP_LOCK(kernel_pmap);
4782 for (tmpva = base; tmpva < base + size; ) {
4783 pde = pmap_pde(kernel_pmap, tmpva);
4784 if (*pde == 0) {
4785 PMAP_UNLOCK(kernel_pmap);
4786 return (EINVAL);
4787 }
4788 if (*pde & PG_PS) {
4789 /*
4790 * If the current 2/4MB page already has
4791 * the required memory type, then we need not
4792 * demote this page. Just increment tmpva to
4793 * the next 2/4MB page frame.
4794 */
4795 if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
4796 tmpva = trunc_4mpage(tmpva) + NBPDR;
4797 continue;
4798 }
4799
4800 /*
4801 * If the current offset aligns with a 2/4MB
4802 * page frame and there is at least 2/4MB left
4803 * within the range, then we need not break
4804 * down this page into 4KB pages.
4805 */
4806 if ((tmpva & PDRMASK) == 0 &&
4807 tmpva + PDRMASK < base + size) {
4808 tmpva += NBPDR;
4809 continue;
4810 }
4811 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
4812 PMAP_UNLOCK(kernel_pmap);
4813 return (ENOMEM);
4814 }
4815 }
4816 pte = vtopte(tmpva);
4817 if (*pte == 0) {
4818 PMAP_UNLOCK(kernel_pmap);
4819 return (EINVAL);
4820 }
4821 tmpva += PAGE_SIZE;
4822 }
4823 PMAP_UNLOCK(kernel_pmap);
4824
4825 /*
4826 * Ok, all the pages exist, so run through them updating their
4827 * cache mode if required.
4828 */
4829 for (tmpva = base; tmpva < base + size; ) {
4830 pde = pmap_pde(kernel_pmap, tmpva);
4831 if (*pde & PG_PS) {
4832 if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
4833 pmap_pde_attr(pde, cache_bits_pde);
4834 changed = TRUE;
4835 }
4836 tmpva = trunc_4mpage(tmpva) + NBPDR;
4837 } else {
4838 pte = vtopte(tmpva);
4839 if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
4840 pmap_pte_attr(pte, cache_bits_pte);
4841 changed = TRUE;
4842 }
4843 tmpva += PAGE_SIZE;
4844 }
4845 }
4846
4847 /*
4848 * Flush CPU caches to make sure any data isn't cached that
4849 * shouldn't be, etc.
4850 */
4851 if (changed) {
4852 pmap_invalidate_range(kernel_pmap, base, tmpva);
4853 pmap_invalidate_cache_range(base, tmpva);
4854 }
4855 return (0);
4856 }
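/*
 * A hedged usage sketch: a hypothetical framebuffer driver switches
 * an already-mapped kernel range to write-combining and checks for
 * failure; "fb_va" and "fb_size" are assumptions, not names used in
 * this file.
 *
 *	if (pmap_change_attr(fb_va, fb_size, PAT_WRITE_COMBINING) != 0)
 *		printf("fb: leaving mapping uncached\n");
 */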
4857
4858 /*
4859 * perform the pmap work for mincore
4860 */
4861 int
4862 pmap_mincore(pmap_t pmap, vm_offset_t addr)
4863 {
4864 pd_entry_t *pdep;
4865 pt_entry_t *ptep, pte;
4866 vm_paddr_t pa;
4867 vm_page_t m;
4868 int val = 0;
4869
4870 PMAP_LOCK(pmap);
4871 pdep = pmap_pde(pmap, addr);
4872 if (*pdep != 0) {
4873 if (*pdep & PG_PS) {
4874 pte = *pdep;
4875 val = MINCORE_SUPER;
4876 /* Compute the physical address of the 4KB page. */
4877 pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
4878 PG_FRAME;
4879 } else {
4880 ptep = pmap_pte(pmap, addr);
4881 pte = *ptep;
4882 pmap_pte_release(ptep);
4883 pa = pte & PG_FRAME;
4884 }
4885 } else {
4886 pte = 0;
4887 pa = 0;
4888 }
4889 PMAP_UNLOCK(pmap);
4890
4891 if (pte != 0) {
4892 val |= MINCORE_INCORE;
4893 if ((pte & PG_MANAGED) == 0)
4894 return (val);
4895
4896 m = PHYS_TO_VM_PAGE(pa);
4897
4898 /*
4899 * Modified by us
4900 */
4901 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
4902 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
4903 else {
4904 /*
4905 * Modified by someone else
4906 */
4907 vm_page_lock_queues();
4908 if (m->dirty || pmap_is_modified(m))
4909 val |= MINCORE_MODIFIED_OTHER;
4910 vm_page_unlock_queues();
4911 }
4912 /*
4913 * Referenced by us
4914 */
4915 if (pte & PG_A)
4916 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
4917 else {
4918 /*
4919 * Referenced by someone else
4920 */
4921 vm_page_lock_queues();
4922 if ((m->flags & PG_REFERENCED) ||
4923 pmap_ts_referenced(m)) {
4924 val |= MINCORE_REFERENCED_OTHER;
4925 vm_page_flag_set(m, PG_REFERENCED);
4926 }
4927 vm_page_unlock_queues();
4928 }
4929 }
4930 return (val);
4931 }
4932
4933 void
4934 pmap_activate(struct thread *td)
4935 {
4936 pmap_t pmap, oldpmap;
4937 u_int32_t cr3;
4938
4939 critical_enter();
4940 pmap = vmspace_pmap(td->td_proc->p_vmspace);
4941 oldpmap = PCPU_GET(curpmap);
4942 #if defined(SMP)
4943 atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
4944 atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
4945 #else
4946 oldpmap->pm_active &= ~1;
4947 pmap->pm_active |= 1;
4948 #endif
4949 #ifdef PAE
4950 cr3 = vtophys(pmap->pm_pdpt);
4951 #else
4952 cr3 = vtophys(pmap->pm_pdir);
4953 #endif
4954 /*
4955 * pmap_activate is for the current thread on the current cpu
4956 */
4957 td->td_pcb->pcb_cr3 = cr3;
4958 load_cr3(cr3);
4959 PCPU_SET(curpmap, pmap);
4960 critical_exit();
4961 }
4962
4963 vm_offset_t
4964 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
4965 {
4966
4967 if ((obj == NULL) || (size < NBPDR) ||
4968 (obj->type != OBJT_DEVICE && obj->type != OBJT_SG)) {
4969 return (addr);
4970 }
4971
4972 addr = (addr + PDRMASK) & ~PDRMASK;
4973 return (addr);
4974 }
4975
4976 /*
4977 * Increase the starting virtual address of the given mapping if a
4978 * different alignment might result in more superpage mappings.
4979 */
4980 void
4981 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
4982 vm_offset_t *addr, vm_size_t size)
4983 {
4984 vm_offset_t superpage_offset;
4985
4986 if (size < NBPDR)
4987 return;
4988 if (object != NULL && (object->flags & OBJ_COLORED) != 0)
4989 offset += ptoa(object->pg_color);
4990 superpage_offset = offset & PDRMASK;
4991 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
4992 (*addr & PDRMASK) == superpage_offset)
4993 return;
4994 if ((*addr & PDRMASK) < superpage_offset)
4995 *addr = (*addr & ~PDRMASK) + superpage_offset;
4996 else
4997 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
4998 }
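/*
 * Worked example of the adjustment above (illustrative, assuming 4MB
 * superpages, PDRMASK == 0x3fffff): for an object offset of
 * 0x00100000, superpage_offset == 0x00100000.  A proposed *addr of
 * 0x20000000 has (*addr & PDRMASK) == 0, which is less than
 * superpage_offset, so *addr becomes 0x20100000.  Virtual and
 * physical addresses then share the same alignment relative to a
 * superpage boundary, which is what later promotion requires.
 */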
4999
5000
5001 #if defined(PMAP_DEBUG)
5002 int pmap_pid_dump(int pid)
5003 {
5004 pmap_t pmap;
5005 struct proc *p;
5006 int npte = 0;
5007 int index;
5008
5009 sx_slock(&allproc_lock);
5010 FOREACH_PROC_IN_SYSTEM(p) {
5011 if (p->p_pid != pid)
5012 continue;
5013
5014 if (p->p_vmspace) {
5015 int i, j;
5016 index = 0;
5017 pmap = vmspace_pmap(p->p_vmspace);
5018 for (i = 0; i < NPDEPTD; i++) {
5019 pd_entry_t *pde;
5020 pt_entry_t *pte;
5021 vm_offset_t base = i << PDRSHIFT;
5022
5023 pde = &pmap->pm_pdir[i];
5024 if (pde && pmap_pde_v(pde)) {
5025 for (j = 0; j < NPTEPG; j++) {
5026 vm_offset_t va = base + (j << PAGE_SHIFT);
5027 if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
5028 if (index) {
5029 index = 0;
5030 printf("\n");
5031 }
5032 sx_sunlock(&allproc_lock);
5033 return (npte);
5034 }
5035 pte = pmap_pte(pmap, va);
5036 if (pte && pmap_pte_v(pte)) {
5037 pt_entry_t pa;
5038 vm_page_t m;
5039 pa = *pte;
5040 m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
5041 printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
5042 va, pa, m->hold_count, m->wire_count, m->flags);
5043 npte++;
5044 index++;
5045 if (index >= 2) {
5046 index = 0;
5047 printf("\n");
5048 } else {
5049 printf(" ");
5050 }
5051 }
5052 }
5053 }
5054 }
5055 }
5056 }
5057 sx_sunlock(&allproc_lock);
5058 return (npte);
5059 }
5060 #endif
5061
5062 #if defined(DEBUG)
5063
5064 static void pads(pmap_t pm);
5065 void pmap_pvdump(vm_offset_t pa);
5066
5067 /* Print the address space of a pmap. */
5068 static void
5069 pads(pmap_t pm)
5070 {
5071 int i, j;
5072 vm_paddr_t va;
5073 pt_entry_t *ptep;
5074
5075 if (pm == kernel_pmap)
5076 return;
5077 for (i = 0; i < NPDEPTD; i++)
5078 if (pm->pm_pdir[i])
5079 for (j = 0; j < NPTEPG; j++) {
5080 va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
5081 if (pm == kernel_pmap && va < KERNBASE)
5082 continue;
5083 if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
5084 continue;
5085 ptep = pmap_pte(pm, va);
5086 if (pmap_pte_v(ptep))
5087 printf("%x:%x ", va, *ptep);
5088 }
5089
5090 }
5091
5092 void
5093 pmap_pvdump(vm_paddr_t pa)
5094 {
5095 pv_entry_t pv;
5096 pmap_t pmap;
5097 vm_page_t m;
5098
5099 printf("pa %x", pa);
5100 m = PHYS_TO_VM_PAGE(pa);
5101 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
5102 pmap = PV_PMAP(pv);
5103 printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);
5104 pads(pmap);
5105 }
5106 printf(" ");
5107 }
5108 #endif