FreeBSD/Linux Kernel Cross Reference
sys/i386/i386/pmap.c
1 /*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2005 Alan L. Cox <alc@cs.rice.edu>
9 * All rights reserved.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * the Systems Programming Group of the University of Utah Computer
13 * Science Department and William Jolitz of UUNET Technologies Inc.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. All advertising materials mentioning features or use of this software
24 * must display the following acknowledgement:
25 * This product includes software developed by the University of
26 * California, Berkeley and its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 * may be used to endorse or promote products derived from this software
29 * without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 *
43 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
44 */
45 /*-
46 * Copyright (c) 2003 Networks Associates Technology, Inc.
47 * All rights reserved.
48 *
49 * This software was developed for the FreeBSD Project by Jake Burkholder,
50 * Safeport Network Services, and Network Associates Laboratories, the
51 * Security Research Division of Network Associates, Inc. under
52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53 * CHATS research program.
54 *
55 * Redistribution and use in source and binary forms, with or without
56 * modification, are permitted provided that the following conditions
57 * are met:
58 * 1. Redistributions of source code must retain the above copyright
59 * notice, this list of conditions and the following disclaimer.
60 * 2. Redistributions in binary form must reproduce the above copyright
61 * notice, this list of conditions and the following disclaimer in the
62 * documentation and/or other materials provided with the distribution.
63 *
64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * SUCH DAMAGE.
75 */
76
77 #include <sys/cdefs.h>
78 __FBSDID("$FreeBSD: releng/6.1/sys/i386/i386/pmap.c 158179 2006-04-30 16:44:43Z cvs2svn $");
79
80 /*
81 * Manages physical address maps.
82 *
83 * In addition to hardware address maps, this
84 * module is called upon to provide software-use-only
85 * maps which may or may not be stored in the same
86 * form as hardware maps. These pseudo-maps are
87 * used to store intermediate results from copy
88 * operations to and from address spaces.
89 *
90 * Since the information managed by this module is
91 * also stored by the logical address mapping module,
92 * this module may throw away valid virtual-to-physical
93 * mappings at almost any time. However, invalidations
94 * of virtual-to-physical mappings must be done as
95 * requested.
96 *
97 * In order to cope with hardware architectures which
98 * make virtual-to-physical map invalidates expensive,
  99  * this module may delay invalidation or reduced-protection
100 * operations until such time as they are actually
101 * necessary. This module is given full information as
102 * to which processors are currently using which maps,
103 * and to when physical maps must be made correct.
104 */
105
106 #include "opt_cpu.h"
107 #include "opt_pmap.h"
108 #include "opt_msgbuf.h"
109
110 #include <sys/param.h>
111 #include <sys/systm.h>
112 #include <sys/kernel.h>
113 #include <sys/lock.h>
114 #include <sys/malloc.h>
115 #include <sys/mman.h>
116 #include <sys/msgbuf.h>
117 #include <sys/mutex.h>
118 #include <sys/proc.h>
119 #include <sys/sx.h>
120 #include <sys/vmmeter.h>
121 #include <sys/sched.h>
122 #include <sys/sysctl.h>
123 #ifdef SMP
124 #include <sys/smp.h>
125 #endif
126
127 #include <vm/vm.h>
128 #include <vm/vm_param.h>
129 #include <vm/vm_kern.h>
130 #include <vm/vm_page.h>
131 #include <vm/vm_map.h>
132 #include <vm/vm_object.h>
133 #include <vm/vm_extern.h>
134 #include <vm/vm_pageout.h>
135 #include <vm/vm_pager.h>
136 #include <vm/uma.h>
137
138 #include <machine/cpu.h>
139 #include <machine/cputypes.h>
140 #include <machine/md_var.h>
141 #include <machine/pcb.h>
142 #include <machine/specialreg.h>
143 #ifdef SMP
144 #include <machine/smp.h>
145 #endif
146
147 #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
148 #define CPU_ENABLE_SSE
149 #endif
150
151 #ifndef PMAP_SHPGPERPROC
152 #define PMAP_SHPGPERPROC 200
153 #endif
154
155 #if defined(DIAGNOSTIC)
156 #define PMAP_DIAGNOSTIC
157 #endif
158
159 #if !defined(PMAP_DIAGNOSTIC)
160 #define PMAP_INLINE __inline
161 #else
162 #define PMAP_INLINE
163 #endif
164
165 /*
166 * Get PDEs and PTEs for user/kernel address space
167 */
168 #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
169 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
170
171 #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0)
172 #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0)
173 #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0)
174 #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0)
175 #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0)
176
177 #define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
178 atomic_clear_int((u_int *)(pte), PG_W))
179 #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
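
/*
 * Illustrative note (added annotation, not part of the original source):
 * pmap_pde() simply indexes the page directory with the upper bits of the
 * virtual address.  Assuming the usual non-PAE constants (PDRSHIFT == 22,
 * i.e. 4MB spanned per page directory entry), for example:
 *
 *	va  = 0xc0401234;
 *	pde = &pmap->pm_pdir[va >> PDRSHIFT];	selects index 0x301 (769)
 *
 * With PAE the shift is smaller (2MB spans), but the idea is identical.
 */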
180
181 struct pmap kernel_pmap_store;
182 LIST_HEAD(pmaplist, pmap);
183 static struct pmaplist allpmaps;
184 static struct mtx allpmaps_lock;
185
186 vm_paddr_t avail_end; /* PA of last available physical page */
187 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
188 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
189 int pgeflag = 0; /* PG_G or-in */
190 int pseflag = 0; /* PG_PS or-in */
191
192 static int nkpt;
193 vm_offset_t kernel_vm_end;
194 extern u_int32_t KERNend;
195
196 #ifdef PAE
197 static uma_zone_t pdptzone;
198 #endif
199
200 /*
201 * Data for the pv entry allocation mechanism
202 */
203 static uma_zone_t pvzone;
204 static struct vm_object pvzone_obj;
205 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
206 int pmap_pagedaemon_waken;
207
208 /*
209 * All those kernel PT submaps that BSD is so fond of
210 */
211 struct sysmaps {
212 struct mtx lock;
213 pt_entry_t *CMAP1;
214 pt_entry_t *CMAP2;
215 caddr_t CADDR1;
216 caddr_t CADDR2;
217 };
218 static struct sysmaps sysmaps_pcpu[MAXCPU];
219 pt_entry_t *CMAP1 = 0;
220 static pt_entry_t *CMAP3;
221 caddr_t CADDR1 = 0, ptvmmap = 0;
222 static caddr_t CADDR3;
223 struct msgbuf *msgbufp = 0;
224
225 /*
226 * Crashdump maps.
227 */
228 static caddr_t crashdumpmap;
229
230 #ifdef SMP
231 extern pt_entry_t *SMPpt;
232 #endif
233 static pt_entry_t *PMAP1 = 0, *PMAP2;
234 static pt_entry_t *PADDR1 = 0, *PADDR2;
235 #ifdef SMP
236 static int PMAP1cpu;
237 static int PMAP1changedcpu;
238 SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
239 &PMAP1changedcpu, 0,
240 "Number of times pmap_pte_quick changed CPU with same PMAP1");
241 #endif
242 static int PMAP1changed;
243 SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
244 &PMAP1changed, 0,
245 "Number of times pmap_pte_quick changed PMAP1");
246 static int PMAP1unchanged;
247 SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
248 &PMAP1unchanged, 0,
249 "Number of times pmap_pte_quick didn't change PMAP1");
250 static struct mtx PMAP2mutex;
251
252 static PMAP_INLINE void free_pv_entry(pv_entry_t pv);
253 static pv_entry_t get_pv_entry(void);
254 static void pmap_clear_ptes(vm_page_t m, int bit);
255
256 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva);
257 static void pmap_remove_page(struct pmap *pmap, vm_offset_t va);
258 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
259 vm_offset_t va);
260 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
261
262 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
263
264 static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags);
265 static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m);
266 static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
267 static void pmap_pte_release(pt_entry_t *pte);
268 static int pmap_unuse_pt(pmap_t, vm_offset_t);
269 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
270 #ifdef PAE
271 static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
272 #endif
273
274 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
275 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
276
277 /*
278 * Move the kernel virtual free pointer to the next
279 * 4MB. This is used to help improve performance
280 * by using a large (4MB) page for much of the kernel
281 * (.text, .data, .bss)
282 */
283 static vm_offset_t
284 pmap_kmem_choose(vm_offset_t addr)
285 {
286 vm_offset_t newaddr = addr;
287
288 #ifndef DISABLE_PSE
289 if (cpu_feature & CPUID_PSE)
290 newaddr = (addr + PDRMASK) & ~PDRMASK;
291 #endif
292 return newaddr;
293 }
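
/*
 * Illustrative arithmetic (a sketch, not part of the original source),
 * assuming the usual non-PAE value PDRMASK == 0x3fffff (4MB - 1):
 *
 *	(0xc0745000 + PDRMASK) & ~PDRMASK == 0xc0800000
 *
 * i.e. the address is rounded up to the next 4MB (superpage) boundary.
 */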
294
295 /*
296 * Bootstrap the system enough to run with virtual memory.
297 *
298 * On the i386 this is called after mapping has already been enabled
299 * and just syncs the pmap module with what has already been done.
300 * [We can't call it easily with mapping off since the kernel is not
301 * mapped with PA == VA, hence we would have to relocate every address
302 * from the linked base (virtual) address "KERNBASE" to the actual
303 * (physical) address starting relative to 0]
304 */
305 void
306 pmap_bootstrap(firstaddr, loadaddr)
307 vm_paddr_t firstaddr;
308 vm_paddr_t loadaddr;
309 {
310 vm_offset_t va;
311 pt_entry_t *pte, *unused;
312 struct sysmaps *sysmaps;
313 int i;
314
315 /*
316 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
317 * large. It should instead be correctly calculated in locore.s and
318 * not based on 'first' (which is a physical address, not a virtual
319 * address, for the start of unused physical memory). The kernel
320 * page tables are NOT double mapped and thus should not be included
321 * in this calculation.
322 */
323 virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
324 virtual_avail = pmap_kmem_choose(virtual_avail);
325
326 virtual_end = VM_MAX_KERNEL_ADDRESS;
327
328 /*
329 * Initialize the kernel pmap (which is statically allocated).
330 */
331 PMAP_LOCK_INIT(kernel_pmap);
332 kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
333 #ifdef PAE
334 kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
335 #endif
336 kernel_pmap->pm_active = -1; /* don't allow deactivation */
337 TAILQ_INIT(&kernel_pmap->pm_pvlist);
338 LIST_INIT(&allpmaps);
339 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
340 mtx_lock_spin(&allpmaps_lock);
341 LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
342 mtx_unlock_spin(&allpmaps_lock);
343 nkpt = NKPT;
344
345 /*
346 * Reserve some special page table entries/VA space for temporary
347 * mapping of pages.
348 */
349 #define SYSMAP(c, p, v, n) \
350 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
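
/*
 * Illustrative expansion (added annotation, not part of the original
 * source): the invocation SYSMAP(caddr_t, CMAP1, CADDR1, 1) used below
 * expands to
 *
 *	CADDR1 = (caddr_t)va; va += ((1)*PAGE_SIZE); CMAP1 = pte; pte += (1);
 *
 * i.e. it hands out one page of KVA and remembers the pte that maps it.
 */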
351
352 va = virtual_avail;
353 pte = vtopte(va);
354
355 /*
356 * CMAP1/CMAP2 are used for zeroing and copying pages.
357 * CMAP3 is used for the idle process page zeroing.
358 */
359 for (i = 0; i < MAXCPU; i++) {
360 sysmaps = &sysmaps_pcpu[i];
361 mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
362 SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
363 SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
364 }
365 SYSMAP(caddr_t, CMAP1, CADDR1, 1)
366 SYSMAP(caddr_t, CMAP3, CADDR3, 1)
367 *CMAP3 = 0;
368
369 /*
370 * Crashdump maps.
371 */
372 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
373
374 /*
375 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
376 */
377 SYSMAP(caddr_t, unused, ptvmmap, 1)
378
379 /*
380 * msgbufp is used to map the system message buffer.
381 */
382 SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))
383
384 /*
385 * ptemap is used for pmap_pte_quick
386 */
387 SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1);
388 SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1);
389
390 mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
391
392 virtual_avail = va;
393
394 *CMAP1 = 0;
395 for (i = 0; i < NKPT; i++)
396 PTD[i] = 0;
397
398 /* Turn on PG_G on kernel page(s) */
399 pmap_set_pg();
400 }
401
402 /*
403 * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on.
404 */
405 void
406 pmap_set_pg(void)
407 {
408 pd_entry_t pdir;
409 pt_entry_t *pte;
410 vm_offset_t va, endva;
411 int i;
412
413 if (pgeflag == 0)
414 return;
415
416 i = KERNLOAD/NBPDR;
417 endva = KERNBASE + KERNend;
418
419 if (pseflag) {
420 va = KERNBASE + KERNLOAD;
421 while (va < endva) {
422 pdir = kernel_pmap->pm_pdir[KPTDI+i];
423 pdir |= pgeflag;
424 kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir;
425 invltlb(); /* Play it safe, invltlb() every time */
426 i++;
427 va += NBPDR;
428 }
429 } else {
430 va = (vm_offset_t)btext;
431 while (va < endva) {
432 pte = vtopte(va);
433 if (*pte)
434 *pte |= pgeflag;
435 invltlb(); /* Play it safe, invltlb() every time */
436 va += PAGE_SIZE;
437 }
438 }
439 }
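
/*
 * Note (added annotation): PG_G marks a translation as global, so the
 * hardware does not flush it from the TLB when %cr3 is reloaded on a
 * context switch; that is why it is applied only to kernel mappings.
 */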
440
441 /*
442 * Initialize a vm_page's machine-dependent fields.
443 */
444 void
445 pmap_page_init(vm_page_t m)
446 {
447
448 TAILQ_INIT(&m->md.pv_list);
449 m->md.pv_list_count = 0;
450 }
451
452 #ifdef PAE
453
454 static MALLOC_DEFINE(M_PMAPPDPT, "pmap", "pmap pdpt");
455
456 static void *
457 pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
458 {
459 *flags = UMA_SLAB_PRIV;
460 return (contigmalloc(PAGE_SIZE, M_PMAPPDPT, 0, 0x0ULL, 0xffffffffULL,
461 1, 0));
462 }
463 #endif
464
465 /*
466 * Initialize the pmap module.
467 * Called by vm_init, to initialize any structures that the pmap
468 * system needs to map virtual memory.
469 */
470 void
471 pmap_init(void)
472 {
473 int shpgperproc = PMAP_SHPGPERPROC;
474
475 /*
476 * Initialize the address space (zone) for the pv entries. Set a
477 * high water mark so that the system can recover from excessive
478 * numbers of pv entries.
479 */
480 pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL,
481 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
482 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
483 pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
484 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
485 pv_entry_high_water = 9 * (pv_entry_max / 10);
486 uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
487
488 #ifdef PAE
489 pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
490 NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
491 UMA_ZONE_VM | UMA_ZONE_NOFREE);
492 uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
493 #endif
494 }
495
496 void
497 pmap_init2()
498 {
499 }
500
501
502 /***************************************************
503 * Low level helper routines.....
504 ***************************************************/
505
506 #if defined(PMAP_DIAGNOSTIC)
507
508 /*
509 * This code checks for non-writeable/modified pages.
510 * This should be an invalid condition.
511 */
512 static int
513 pmap_nw_modified(pt_entry_t ptea)
514 {
515 int pte;
516
517 pte = (int) ptea;
518
519 if ((pte & (PG_M|PG_RW)) == PG_M)
520 return 1;
521 else
522 return 0;
523 }
524 #endif
525
526
527 /*
528 * this routine defines the region(s) of memory that should
529 * not be tested for the modified bit.
530 */
531 static PMAP_INLINE int
532 pmap_track_modified(vm_offset_t va)
533 {
534 if ((va < kmi.clean_sva) || (va >= kmi.clean_eva))
535 return 1;
536 else
537 return 0;
538 }
539
540 #ifdef SMP
541 /*
542 * For SMP, these functions have to use the IPI mechanism for coherence.
543 */
544 void
545 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
546 {
547 u_int cpumask;
548 u_int other_cpus;
549
550 if (smp_started) {
551 if (!(read_eflags() & PSL_I))
552 panic("%s: interrupts disabled", __func__);
553 mtx_lock_spin(&smp_ipi_mtx);
554 } else
555 critical_enter();
556 /*
557 * We need to disable interrupt preemption but MUST NOT have
558 * interrupts disabled here.
559 * XXX we may need to hold schedlock to get a coherent pm_active
560 * XXX critical sections disable interrupts again
561 */
562 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
563 invlpg(va);
564 smp_invlpg(va);
565 } else {
566 cpumask = PCPU_GET(cpumask);
567 other_cpus = PCPU_GET(other_cpus);
568 if (pmap->pm_active & cpumask)
569 invlpg(va);
570 if (pmap->pm_active & other_cpus)
571 smp_masked_invlpg(pmap->pm_active & other_cpus, va);
572 }
573 if (smp_started)
574 mtx_unlock_spin(&smp_ipi_mtx);
575 else
576 critical_exit();
577 }
578
579 void
580 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
581 {
582 u_int cpumask;
583 u_int other_cpus;
584 vm_offset_t addr;
585
586 if (smp_started) {
587 if (!(read_eflags() & PSL_I))
588 panic("%s: interrupts disabled", __func__);
589 mtx_lock_spin(&smp_ipi_mtx);
590 } else
591 critical_enter();
592 /*
593 * We need to disable interrupt preemption but MUST NOT have
594 * interrupts disabled here.
595 * XXX we may need to hold schedlock to get a coherent pm_active
596 * XXX critical sections disable interrupts again
597 */
598 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
599 for (addr = sva; addr < eva; addr += PAGE_SIZE)
600 invlpg(addr);
601 smp_invlpg_range(sva, eva);
602 } else {
603 cpumask = PCPU_GET(cpumask);
604 other_cpus = PCPU_GET(other_cpus);
605 if (pmap->pm_active & cpumask)
606 for (addr = sva; addr < eva; addr += PAGE_SIZE)
607 invlpg(addr);
608 if (pmap->pm_active & other_cpus)
609 smp_masked_invlpg_range(pmap->pm_active & other_cpus,
610 sva, eva);
611 }
612 if (smp_started)
613 mtx_unlock_spin(&smp_ipi_mtx);
614 else
615 critical_exit();
616 }
617
618 void
619 pmap_invalidate_all(pmap_t pmap)
620 {
621 u_int cpumask;
622 u_int other_cpus;
623
624 if (smp_started) {
625 if (!(read_eflags() & PSL_I))
626 panic("%s: interrupts disabled", __func__);
627 mtx_lock_spin(&smp_ipi_mtx);
628 } else
629 critical_enter();
630 /*
631 * We need to disable interrupt preemption but MUST NOT have
632 * interrupts disabled here.
633 * XXX we may need to hold schedlock to get a coherent pm_active
634 * XXX critical sections disable interrupts again
635 */
636 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
637 invltlb();
638 smp_invltlb();
639 } else {
640 cpumask = PCPU_GET(cpumask);
641 other_cpus = PCPU_GET(other_cpus);
642 if (pmap->pm_active & cpumask)
643 invltlb();
644 if (pmap->pm_active & other_cpus)
645 smp_masked_invltlb(pmap->pm_active & other_cpus);
646 }
647 if (smp_started)
648 mtx_unlock_spin(&smp_ipi_mtx);
649 else
650 critical_exit();
651 }
652 #else /* !SMP */
653 /*
654 * Normal, non-SMP, 486+ invalidation functions.
655 * We inline these within pmap.c for speed.
656 */
657 PMAP_INLINE void
658 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
659 {
660
661 if (pmap == kernel_pmap || pmap->pm_active)
662 invlpg(va);
663 }
664
665 PMAP_INLINE void
666 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
667 {
668 vm_offset_t addr;
669
670 if (pmap == kernel_pmap || pmap->pm_active)
671 for (addr = sva; addr < eva; addr += PAGE_SIZE)
672 invlpg(addr);
673 }
674
675 PMAP_INLINE void
676 pmap_invalidate_all(pmap_t pmap)
677 {
678
679 if (pmap == kernel_pmap || pmap->pm_active)
680 invltlb();
681 }
682 #endif /* !SMP */
683
684 /*
685 * Are we current address space or kernel? N.B. We return FALSE when
686 * a pmap's page table is in use because a kernel thread is borrowing
687 * it. The borrowed page table can change spontaneously, making any
688 * dependence on its continued use subject to a race condition.
689 */
690 static __inline int
691 pmap_is_current(pmap_t pmap)
692 {
693
694 return (pmap == kernel_pmap ||
695 (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
696 (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
697 }
698
699 /*
700 * If the given pmap is not the current or kernel pmap, the returned pte must
701 * be released by passing it to pmap_pte_release().
702 */
703 pt_entry_t *
704 pmap_pte(pmap_t pmap, vm_offset_t va)
705 {
706 pd_entry_t newpf;
707 pd_entry_t *pde;
708
709 pde = pmap_pde(pmap, va);
710 if (*pde & PG_PS)
711 return (pde);
712 if (*pde != 0) {
713 /* are we current address space or kernel? */
714 if (pmap_is_current(pmap))
715 return (vtopte(va));
716 mtx_lock(&PMAP2mutex);
717 newpf = *pde & PG_FRAME;
718 if ((*PMAP2 & PG_FRAME) != newpf) {
719 *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
720 pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
721 }
722 return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
723 }
724 return (0);
725 }
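
/*
 * Illustrative usage (a sketch, not part of the original source):
 *
 *	pt_entry_t *pte;
 *
 *	pte = pmap_pte(pmap, va);
 *	if (pte != NULL) {
 *		... examine or update *pte ...
 *		pmap_pte_release(pte);	releases PMAP2mutex only if the
 *					PADDR2 window was borrowed above
 *	}
 */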
726
727 /*
728 * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte
729 * being NULL.
730 */
731 static __inline void
732 pmap_pte_release(pt_entry_t *pte)
733 {
734
735 if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
736 mtx_unlock(&PMAP2mutex);
737 }
738
739 static __inline void
740 invlcaddr(void *caddr)
741 {
742
743 invlpg((u_int)caddr);
744 }
745
746 /*
747 * Super fast pmap_pte routine best used when scanning
748 * the pv lists. This eliminates many coarse-grained
749 * invltlb calls. Note that many of the pv list
750 * scans are across different pmaps. It is very wasteful
751 * to do an entire invltlb for checking a single mapping.
752 *
753 * If the given pmap is not the current pmap, vm_page_queue_mtx
754 * must be held and curthread pinned to a CPU.
755 */
756 static pt_entry_t *
757 pmap_pte_quick(pmap_t pmap, vm_offset_t va)
758 {
759 pd_entry_t newpf;
760 pd_entry_t *pde;
761
762 pde = pmap_pde(pmap, va);
763 if (*pde & PG_PS)
764 return (pde);
765 if (*pde != 0) {
766 /* are we current address space or kernel? */
767 if (pmap_is_current(pmap))
768 return (vtopte(va));
769 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
770 KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
771 newpf = *pde & PG_FRAME;
772 if ((*PMAP1 & PG_FRAME) != newpf) {
773 *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
774 #ifdef SMP
775 PMAP1cpu = PCPU_GET(cpuid);
776 #endif
777 invlcaddr(PADDR1);
778 PMAP1changed++;
779 } else
780 #ifdef SMP
781 if (PMAP1cpu != PCPU_GET(cpuid)) {
782 PMAP1cpu = PCPU_GET(cpuid);
783 invlcaddr(PADDR1);
784 PMAP1changedcpu++;
785 } else
786 #endif
787 PMAP1unchanged++;
788 return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
789 }
790 return (0);
791 }
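
/*
 * Illustrative calling pattern for a non-current pmap (a sketch, not part
 * of the original source); see pmap_remove() below for the real thing:
 *
 *	vm_page_lock_queues();
 *	sched_pin();			pin curthread to this CPU
 *	PMAP_LOCK(pmap);
 *	pte = pmap_pte_quick(pmap, va);
 *	...
 *	sched_unpin();
 *	vm_page_unlock_queues();
 *	PMAP_UNLOCK(pmap);
 */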
792
793 /*
794 * Routine: pmap_extract
795 * Function:
796 * Extract the physical page address associated
797 * with the given map/virtual_address pair.
798 */
799 vm_paddr_t
800 pmap_extract(pmap_t pmap, vm_offset_t va)
801 {
802 vm_paddr_t rtval;
803 pt_entry_t *pte;
804 pd_entry_t pde;
805
806 rtval = 0;
807 PMAP_LOCK(pmap);
808 pde = pmap->pm_pdir[va >> PDRSHIFT];
809 if (pde != 0) {
810 if ((pde & PG_PS) != 0) {
811 rtval = (pde & ~PDRMASK) | (va & PDRMASK);
812 PMAP_UNLOCK(pmap);
813 return rtval;
814 }
815 pte = pmap_pte(pmap, va);
816 rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
817 pmap_pte_release(pte);
818 }
819 PMAP_UNLOCK(pmap);
820 return (rtval);
821 }
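
/*
 * Illustrative usage (a sketch, not part of the original source):
 *
 *	vm_paddr_t pa;
 *
 *	pa = pmap_extract(kernel_pmap, va);
 *	if (pa == 0)
 *		... va is not currently mapped in kernel_pmap ...
 *
 * Note the 4MB case above: the physical address is reassembled from the
 * PDE frame plus the offset within the superpage,
 * (pde & ~PDRMASK) | (va & PDRMASK).
 */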
822
823 /*
824 * Routine: pmap_extract_and_hold
825 * Function:
826 * Atomically extract and hold the physical page
827 * with the given pmap and virtual address pair
828 * if that mapping permits the given protection.
829 */
830 vm_page_t
831 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
832 {
833 pd_entry_t pde;
834 pt_entry_t pte;
835 vm_page_t m;
836
837 m = NULL;
838 vm_page_lock_queues();
839 PMAP_LOCK(pmap);
840 pde = *pmap_pde(pmap, va);
841 if (pde != 0) {
842 if (pde & PG_PS) {
843 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
844 m = PHYS_TO_VM_PAGE((pde & ~PDRMASK) |
845 (va & PDRMASK));
846 vm_page_hold(m);
847 }
848 } else {
849 sched_pin();
850 pte = *pmap_pte_quick(pmap, va);
851 if (pte != 0 &&
852 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
853 m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
854 vm_page_hold(m);
855 }
856 sched_unpin();
857 }
858 }
859 vm_page_unlock_queues();
860 PMAP_UNLOCK(pmap);
861 return (m);
862 }
863
864 /***************************************************
865 * Low level mapping routines.....
866 ***************************************************/
867
868 /*
869 * Add a wired page to the kva.
870 * Note: not SMP coherent.
871 */
872 PMAP_INLINE void
873 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
874 {
875 pt_entry_t *pte;
876
877 pte = vtopte(va);
878 pte_store(pte, pa | PG_RW | PG_V | pgeflag);
879 }
880
881 /*
882 * Remove a page from the kernel pagetables.
883 * Note: not SMP coherent.
884 */
885 PMAP_INLINE void
886 pmap_kremove(vm_offset_t va)
887 {
888 pt_entry_t *pte;
889
890 pte = vtopte(va);
891 pte_clear(pte);
892 }
893
894 /*
895 * Used to map a range of physical addresses into kernel
896 * virtual address space.
897 *
898 * The value passed in '*virt' is a suggested virtual address for
899 * the mapping. Architectures which can support a direct-mapped
900 * physical to virtual region can return the appropriate address
901 * within that region, leaving '*virt' unchanged. Other
902 * architectures should map the pages starting at '*virt' and
903 * update '*virt' with the first usable address after the mapped
904 * region.
905 */
906 vm_offset_t
907 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
908 {
909 vm_offset_t va, sva;
910
911 va = sva = *virt;
912 while (start < end) {
913 pmap_kenter(va, start);
914 va += PAGE_SIZE;
915 start += PAGE_SIZE;
916 }
917 pmap_invalidate_range(kernel_pmap, sva, va);
918 *virt = va;
919 return (sva);
920 }
921
922
923 /*
 924  * Add a list of wired pages to the kva.
 925  * This routine is only used for temporary
926 * kernel mappings that do not need to have
927 * page modification or references recorded.
928 * Note that old mappings are simply written
929 * over. The page *must* be wired.
930 * Note: SMP coherent. Uses a ranged shootdown IPI.
931 */
932 void
933 pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
934 {
935 vm_offset_t va;
936
937 va = sva;
938 while (count-- > 0) {
939 pmap_kenter(va, VM_PAGE_TO_PHYS(*m));
940 va += PAGE_SIZE;
941 m++;
942 }
943 pmap_invalidate_range(kernel_pmap, sva, va);
944 }
945
946 /*
947 * This routine tears out page mappings from the
948 * kernel -- it is meant only for temporary mappings.
949 * Note: SMP coherent. Uses a ranged shootdown IPI.
950 */
951 void
952 pmap_qremove(vm_offset_t sva, int count)
953 {
954 vm_offset_t va;
955
956 va = sva;
957 while (count-- > 0) {
958 pmap_kremove(va);
959 va += PAGE_SIZE;
960 }
961 pmap_invalidate_range(kernel_pmap, sva, va);
962 }
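
/*
 * Illustrative pairing (a sketch, not part of the original source; the
 * KVA allocation shown is just one possibility):
 *
 *	vm_offset_t kva;
 *
 *	kva = kmem_alloc_nofault(kernel_map, npages * PAGE_SIZE);
 *	pmap_qenter(kva, pages, npages);	map the wired pages
 *	... access the pages through kva ...
 *	pmap_qremove(kva, npages);		tear the mappings down
 */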
963
964 /***************************************************
965 * Page table page management routines.....
966 ***************************************************/
967
968 /*
969 * This routine unholds page table pages, and if the hold count
970 * drops to zero, then it decrements the wire count.
971 */
972 static PMAP_INLINE int
973 pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
974 {
975
976 --m->wire_count;
977 if (m->wire_count == 0)
978 return _pmap_unwire_pte_hold(pmap, m);
979 else
980 return 0;
981 }
982
983 static int
984 _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
985 {
986 vm_offset_t pteva;
987
988 /*
989 * unmap the page table page
990 */
991 pmap->pm_pdir[m->pindex] = 0;
992 --pmap->pm_stats.resident_count;
993
994 /*
995 * Do an invltlb to make the invalidated mapping
996 * take effect immediately.
997 */
998 pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
999 pmap_invalidate_page(pmap, pteva);
1000
1001 vm_page_free_zero(m);
1002 atomic_subtract_int(&cnt.v_wire_count, 1);
1003 return 1;
1004 }
1005
1006 /*
1007 * After removing a page table entry, this routine is used to
1008 * conditionally free the page, and manage the hold/wire counts.
1009 */
1010 static int
1011 pmap_unuse_pt(pmap_t pmap, vm_offset_t va)
1012 {
1013 pd_entry_t ptepde;
1014 vm_page_t mpte;
1015
1016 if (va >= VM_MAXUSER_ADDRESS)
1017 return 0;
1018 ptepde = *pmap_pde(pmap, va);
1019 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1020 return pmap_unwire_pte_hold(pmap, mpte);
1021 }
1022
1023 void
1024 pmap_pinit0(pmap)
1025 struct pmap *pmap;
1026 {
1027
1028 PMAP_LOCK_INIT(pmap);
1029 pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1030 #ifdef PAE
1031 pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1032 #endif
1033 pmap->pm_active = 0;
1034 PCPU_SET(curpmap, pmap);
1035 TAILQ_INIT(&pmap->pm_pvlist);
1036 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1037 mtx_lock_spin(&allpmaps_lock);
1038 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1039 mtx_unlock_spin(&allpmaps_lock);
1040 }
1041
1042 /*
1043 * Initialize a preallocated and zeroed pmap structure,
1044 * such as one in a vmspace structure.
1045 */
1046 void
1047 pmap_pinit(pmap)
1048 register struct pmap *pmap;
1049 {
1050 vm_page_t m, ptdpg[NPGPTD];
1051 vm_paddr_t pa;
1052 static int color;
1053 int i;
1054
1055 PMAP_LOCK_INIT(pmap);
1056
1057 /*
1058 * No need to allocate page table space yet but we do need a valid
1059 * page directory table.
1060 */
1061 if (pmap->pm_pdir == NULL) {
1062 pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
1063 NBPTD);
1064 #ifdef PAE
1065 pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1066 KASSERT(((vm_offset_t)pmap->pm_pdpt &
1067 ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1068 ("pmap_pinit: pdpt misaligned"));
1069 KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1070 ("pmap_pinit: pdpt above 4g"));
1071 #endif
1072 }
1073
1074 /*
1075 * allocate the page directory page(s)
1076 */
1077 for (i = 0; i < NPGPTD;) {
1078 m = vm_page_alloc(NULL, color++,
1079 VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1080 VM_ALLOC_ZERO);
1081 if (m == NULL)
1082 VM_WAIT;
1083 else {
1084 ptdpg[i++] = m;
1085 }
1086 }
1087
1088 pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1089
1090 for (i = 0; i < NPGPTD; i++) {
1091 if ((ptdpg[i]->flags & PG_ZERO) == 0)
1092 bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE);
1093 }
1094
1095 mtx_lock_spin(&allpmaps_lock);
1096 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1097 mtx_unlock_spin(&allpmaps_lock);
1098 /* Wire in kernel global address entries. */
1099 /* XXX copies current process, does not fill in MPPTDI */
1100 bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1101 #ifdef SMP
1102 pmap->pm_pdir[MPPTDI] = PTD[MPPTDI];
1103 #endif
1104
1105 /* install self-referential address mapping entry(s) */
1106 for (i = 0; i < NPGPTD; i++) {
1107 pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1108 pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1109 #ifdef PAE
1110 pmap->pm_pdpt[i] = pa | PG_V;
1111 #endif
1112 }
1113
1114 pmap->pm_active = 0;
1115 TAILQ_INIT(&pmap->pm_pvlist);
1116 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1117 }
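
/*
 * Note (added annotation): the self-referential entries installed above
 * (pm_pdir[PTDPTDI + i] pointing at the page directory pages themselves)
 * are what make an address space's page table pages appear as a linear
 * array of ptes at a fixed virtual address while that pmap is active;
 * vtopte() and the PTD/PTDpde symbols used elsewhere in this file rely
 * on that recursive mapping.
 */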
1118
1119 /*
1120 * this routine is called if the page table page is not
1121 * mapped correctly.
1122 */
1123 static vm_page_t
1124 _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags)
1125 {
1126 vm_paddr_t ptepa;
1127 vm_page_t m;
1128
1129 KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1130 (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1131 ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1132
1133 /*
1134 * Allocate a page table page.
1135 */
1136 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1137 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1138 if (flags & M_WAITOK) {
1139 PMAP_UNLOCK(pmap);
1140 vm_page_unlock_queues();
1141 VM_WAIT;
1142 vm_page_lock_queues();
1143 PMAP_LOCK(pmap);
1144 }
1145
1146 /*
1147 * Indicate the need to retry. While waiting, the page table
1148 * page may have been allocated.
1149 */
1150 return (NULL);
1151 }
1152 if ((m->flags & PG_ZERO) == 0)
1153 pmap_zero_page(m);
1154
1155 /*
1156 * Map the pagetable page into the process address space, if
1157 * it isn't already there.
1158 */
1159
1160 pmap->pm_stats.resident_count++;
1161
1162 ptepa = VM_PAGE_TO_PHYS(m);
1163 pmap->pm_pdir[ptepindex] =
1164 (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1165
1166 return m;
1167 }
1168
1169 static vm_page_t
1170 pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1171 {
1172 unsigned ptepindex;
1173 pd_entry_t ptepa;
1174 vm_page_t m;
1175
1176 KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1177 (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1178 ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1179
1180 /*
1181 * Calculate pagetable page index
1182 */
1183 ptepindex = va >> PDRSHIFT;
1184 retry:
1185 /*
1186 * Get the page directory entry
1187 */
1188 ptepa = pmap->pm_pdir[ptepindex];
1189
1190 /*
1191 * This supports switching from a 4MB page to a
1192 * normal 4K page.
1193 */
1194 if (ptepa & PG_PS) {
1195 pmap->pm_pdir[ptepindex] = 0;
1196 ptepa = 0;
1197 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1198 pmap_invalidate_all(kernel_pmap);
1199 }
1200
1201 /*
1202 * If the page table page is mapped, we just increment the
1203 * hold count, and activate it.
1204 */
1205 if (ptepa) {
1206 m = PHYS_TO_VM_PAGE(ptepa);
1207 m->wire_count++;
1208 } else {
1209 /*
1210 * Here if the pte page isn't mapped, or if it has
1211 * been deallocated.
1212 */
1213 m = _pmap_allocpte(pmap, ptepindex, flags);
1214 if (m == NULL && (flags & M_WAITOK))
1215 goto retry;
1216 }
1217 return (m);
1218 }
1219
1220
1221 /***************************************************
1222 * Pmap allocation/deallocation routines.
1223 ***************************************************/
1224
1225 #ifdef SMP
1226 /*
1227 * Deal with a SMP shootdown of other users of the pmap that we are
1228 * trying to dispose of. This can be a bit hairy.
1229 */
1230 static u_int *lazymask;
1231 static u_int lazyptd;
1232 static volatile u_int lazywait;
1233
1234 void pmap_lazyfix_action(void);
1235
1236 void
1237 pmap_lazyfix_action(void)
1238 {
1239 u_int mymask = PCPU_GET(cpumask);
1240
1241 if (rcr3() == lazyptd)
1242 load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1243 atomic_clear_int(lazymask, mymask);
1244 atomic_store_rel_int(&lazywait, 1);
1245 }
1246
1247 static void
1248 pmap_lazyfix_self(u_int mymask)
1249 {
1250
1251 if (rcr3() == lazyptd)
1252 load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1253 atomic_clear_int(lazymask, mymask);
1254 }
1255
1256
1257 static void
1258 pmap_lazyfix(pmap_t pmap)
1259 {
1260 u_int mymask;
1261 u_int mask;
1262 register u_int spins;
1263
1264 while ((mask = pmap->pm_active) != 0) {
1265 spins = 50000000;
1266 mask = mask & -mask; /* Find least significant set bit */
1267 mtx_lock_spin(&smp_ipi_mtx);
1268 #ifdef PAE
1269 lazyptd = vtophys(pmap->pm_pdpt);
1270 #else
1271 lazyptd = vtophys(pmap->pm_pdir);
1272 #endif
1273 mymask = PCPU_GET(cpumask);
1274 if (mask == mymask) {
1275 lazymask = &pmap->pm_active;
1276 pmap_lazyfix_self(mymask);
1277 } else {
1278 atomic_store_rel_int((u_int *)&lazymask,
1279 (u_int)&pmap->pm_active);
1280 atomic_store_rel_int(&lazywait, 0);
1281 ipi_selected(mask, IPI_LAZYPMAP);
1282 while (lazywait == 0) {
1283 ia32_pause();
1284 if (--spins == 0)
1285 break;
1286 }
1287 }
1288 mtx_unlock_spin(&smp_ipi_mtx);
1289 if (spins == 0)
1290 printf("pmap_lazyfix: spun for 50000000\n");
1291 }
1292 }
1293
1294 #else /* SMP */
1295
1296 /*
1297 * Cleaning up on uniprocessor is easy. For various reasons, we're
1298 * unlikely to have to even execute this code, including the fact
1299 * that the cleanup is deferred until the parent does a wait(2), which
1300 * means that another userland process has run.
1301 */
1302 static void
1303 pmap_lazyfix(pmap_t pmap)
1304 {
1305 u_int cr3;
1306
1307 cr3 = vtophys(pmap->pm_pdir);
1308 if (cr3 == rcr3()) {
1309 load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1310 pmap->pm_active &= ~(PCPU_GET(cpumask));
1311 }
1312 }
1313 #endif /* SMP */
1314
1315 /*
1316 * Release any resources held by the given physical map.
1317 * Called when a pmap initialized by pmap_pinit is being released.
1318 * Should only be called if the map contains no valid mappings.
1319 */
1320 void
1321 pmap_release(pmap_t pmap)
1322 {
1323 vm_page_t m, ptdpg[NPGPTD];
1324 int i;
1325
1326 KASSERT(pmap->pm_stats.resident_count == 0,
1327 ("pmap_release: pmap resident count %ld != 0",
1328 pmap->pm_stats.resident_count));
1329
1330 pmap_lazyfix(pmap);
1331 mtx_lock_spin(&allpmaps_lock);
1332 LIST_REMOVE(pmap, pm_list);
1333 mtx_unlock_spin(&allpmaps_lock);
1334
1335 for (i = 0; i < NPGPTD; i++)
1336 ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i]);
1337
1338 bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
1339 sizeof(*pmap->pm_pdir));
1340 #ifdef SMP
1341 pmap->pm_pdir[MPPTDI] = 0;
1342 #endif
1343
1344 pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
1345
1346 vm_page_lock_queues();
1347 for (i = 0; i < NPGPTD; i++) {
1348 m = ptdpg[i];
1349 #ifdef PAE
1350 KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
1351 ("pmap_release: got wrong ptd page"));
1352 #endif
1353 m->wire_count--;
1354 atomic_subtract_int(&cnt.v_wire_count, 1);
1355 vm_page_free_zero(m);
1356 }
1357 vm_page_unlock_queues();
1358 PMAP_LOCK_DESTROY(pmap);
1359 }
1360
1361 static int
1362 kvm_size(SYSCTL_HANDLER_ARGS)
1363 {
1364 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1365
1366 return sysctl_handle_long(oidp, &ksize, 0, req);
1367 }
1368 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1369 0, 0, kvm_size, "IU", "Size of KVM");
1370
1371 static int
1372 kvm_free(SYSCTL_HANDLER_ARGS)
1373 {
1374 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1375
1376 return sysctl_handle_long(oidp, &kfree, 0, req);
1377 }
1378 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1379 0, 0, kvm_free, "IU", "Amount of KVM free");
1380
1381 /*
1382 * grow the number of kernel page table entries, if needed
1383 */
1384 void
1385 pmap_growkernel(vm_offset_t addr)
1386 {
1387 struct pmap *pmap;
1388 vm_paddr_t ptppaddr;
1389 vm_page_t nkpg;
1390 pd_entry_t newpdir;
1391 pt_entry_t *pde;
1392
1393 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1394 if (kernel_vm_end == 0) {
1395 kernel_vm_end = KERNBASE;
1396 nkpt = 0;
1397 while (pdir_pde(PTD, kernel_vm_end)) {
1398 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1399 nkpt++;
1400 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1401 kernel_vm_end = kernel_map->max_offset;
1402 break;
1403 }
1404 }
1405 }
1406 addr = roundup2(addr, PAGE_SIZE * NPTEPG);
1407 if (addr - 1 >= kernel_map->max_offset)
1408 addr = kernel_map->max_offset;
1409 while (kernel_vm_end < addr) {
1410 if (pdir_pde(PTD, kernel_vm_end)) {
1411 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1412 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1413 kernel_vm_end = kernel_map->max_offset;
1414 break;
1415 }
1416 continue;
1417 }
1418
1419 /*
1420 * This index is bogus, but out of the way
1421 */
1422 nkpg = vm_page_alloc(NULL, nkpt,
1423 VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1424 if (!nkpg)
1425 panic("pmap_growkernel: no memory to grow kernel");
1426
1427 nkpt++;
1428
1429 pmap_zero_page(nkpg);
1430 ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1431 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
1432 pdir_pde(PTD, kernel_vm_end) = newpdir;
1433
1434 mtx_lock_spin(&allpmaps_lock);
1435 LIST_FOREACH(pmap, &allpmaps, pm_list) {
1436 pde = pmap_pde(pmap, kernel_vm_end);
1437 pde_store(pde, newpdir);
1438 }
1439 mtx_unlock_spin(&allpmaps_lock);
1440 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1441 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1442 kernel_vm_end = kernel_map->max_offset;
1443 break;
1444 }
1445 }
1446 }
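
/*
 * Illustrative arithmetic (a sketch, not part of the original source),
 * assuming the usual non-PAE values PAGE_SIZE == 4096 and NPTEPG == 1024,
 * so each kernel page table page maps 4MB of KVA:
 *
 *	roundup2(0xc1234567, PAGE_SIZE * NPTEPG) == 0xc1400000
 *
 * kernel_vm_end therefore always advances in whole page-table-page steps.
 */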
1447
1448
1449 /***************************************************
1450 * page management routines.
1451 ***************************************************/
1452
1453 /*
1454 * free the pv_entry back to the free list
1455 */
1456 static PMAP_INLINE void
1457 free_pv_entry(pv_entry_t pv)
1458 {
1459 pv_entry_count--;
1460 uma_zfree(pvzone, pv);
1461 }
1462
1463 /*
1464 * get a new pv_entry, allocating a block from the system
1465 * when needed.
1466 * the memory allocation is performed bypassing the malloc code
1467 * because of the possibility of allocations at interrupt time.
1468 */
1469 static pv_entry_t
1470 get_pv_entry(void)
1471 {
1472 pv_entry_count++;
1473 if ((pv_entry_count > pv_entry_high_water) &&
1474 (pmap_pagedaemon_waken == 0)) {
1475 pmap_pagedaemon_waken = 1;
1476 wakeup (&vm_pages_needed);
1477 }
1478 return uma_zalloc(pvzone, M_NOWAIT);
1479 }
1480
1481
1482 static void
1483 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1484 {
1485 pv_entry_t pv;
1486
1487 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1488 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1489 if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1490 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1491 if (pmap == pv->pv_pmap && va == pv->pv_va)
1492 break;
1493 }
1494 } else {
1495 TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
1496 if (va == pv->pv_va)
1497 break;
1498 }
1499 }
1500 KASSERT(pv != NULL, ("pmap_remove_entry: pv not found"));
1501 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1502 m->md.pv_list_count--;
1503 if (TAILQ_EMPTY(&m->md.pv_list))
1504 vm_page_flag_clear(m, PG_WRITEABLE);
1505 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1506 free_pv_entry(pv);
1507 }
1508
1509 /*
1510 * Create a pv entry for page at pa for
1511 * (pmap, va).
1512 */
1513 static void
1514 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
1515 {
1516 pv_entry_t pv;
1517
1518 pv = get_pv_entry();
1519 if (pv == NULL)
1520 panic("no pv entries: increase vm.pmap.shpgperproc");
1521 pv->pv_va = va;
1522 pv->pv_pmap = pmap;
1523
1524 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1525 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1526 TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1527 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1528 m->md.pv_list_count++;
1529 }
1530
1531 /*
1532 * pmap_remove_pte: do the things to unmap a page in a process
1533 */
1534 static int
1535 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va)
1536 {
1537 pt_entry_t oldpte;
1538 vm_page_t m;
1539
1540 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1541 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1542 oldpte = pte_load_clear(ptq);
1543 if (oldpte & PG_W)
1544 pmap->pm_stats.wired_count -= 1;
1545 /*
1546 * Machines that don't support invlpg, also don't support
1547 * PG_G.
1548 */
1549 if (oldpte & PG_G)
1550 pmap_invalidate_page(kernel_pmap, va);
1551 pmap->pm_stats.resident_count -= 1;
1552 if (oldpte & PG_MANAGED) {
1553 m = PHYS_TO_VM_PAGE(oldpte);
1554 if (oldpte & PG_M) {
1555 #if defined(PMAP_DIAGNOSTIC)
1556 if (pmap_nw_modified((pt_entry_t) oldpte)) {
1557 printf(
1558 "pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n",
1559 va, oldpte);
1560 }
1561 #endif
1562 if (pmap_track_modified(va))
1563 vm_page_dirty(m);
1564 }
1565 if (oldpte & PG_A)
1566 vm_page_flag_set(m, PG_REFERENCED);
1567 pmap_remove_entry(pmap, m, va);
1568 }
1569 return (pmap_unuse_pt(pmap, va));
1570 }
1571
1572 /*
1573 * Remove a single page from a process address space
1574 */
1575 static void
1576 pmap_remove_page(pmap_t pmap, vm_offset_t va)
1577 {
1578 pt_entry_t *pte;
1579
1580 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1581 KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1582 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1583 if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
1584 return;
1585 pmap_remove_pte(pmap, pte, va);
1586 pmap_invalidate_page(pmap, va);
1587 }
1588
1589 /*
1590 * Remove the given range of addresses from the specified map.
1591 *
1592 * It is assumed that the start and end are properly
1593 * rounded to the page size.
1594 */
1595 void
1596 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1597 {
1598 vm_offset_t pdnxt;
1599 pd_entry_t ptpaddr;
1600 pt_entry_t *pte;
1601 int anyvalid;
1602
1603 /*
1604 * Perform an unsynchronized read. This is, however, safe.
1605 */
1606 if (pmap->pm_stats.resident_count == 0)
1607 return;
1608
1609 anyvalid = 0;
1610
1611 vm_page_lock_queues();
1612 sched_pin();
1613 PMAP_LOCK(pmap);
1614
1615 /*
1616  * Special handling of removing one page: a very
1617  * common operation and one that is easy to
1618  * short-circuit.
1619 */
1620 if ((sva + PAGE_SIZE == eva) &&
1621 ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
1622 pmap_remove_page(pmap, sva);
1623 goto out;
1624 }
1625
1626 for (; sva < eva; sva = pdnxt) {
1627 unsigned pdirindex;
1628
1629 /*
1630 * Calculate index for next page table.
1631 */
1632 pdnxt = (sva + NBPDR) & ~PDRMASK;
1633 if (pmap->pm_stats.resident_count == 0)
1634 break;
1635
1636 pdirindex = sva >> PDRSHIFT;
1637 ptpaddr = pmap->pm_pdir[pdirindex];
1638
1639 /*
1640 * Weed out invalid mappings. Note: we assume that the page
1641 * directory table is always allocated, and in kernel virtual.
1642 */
1643 if (ptpaddr == 0)
1644 continue;
1645
1646 /*
1647 * Check for large page.
1648 */
1649 if ((ptpaddr & PG_PS) != 0) {
1650 pmap->pm_pdir[pdirindex] = 0;
1651 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1652 anyvalid = 1;
1653 continue;
1654 }
1655
1656 /*
1657 * Limit our scan to either the end of the va represented
1658 * by the current page table page, or to the end of the
1659 * range being removed.
1660 */
1661 if (pdnxt > eva)
1662 pdnxt = eva;
1663
1664 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
1665 sva += PAGE_SIZE) {
1666 if (*pte == 0)
1667 continue;
1668 anyvalid = 1;
1669 if (pmap_remove_pte(pmap, pte, sva))
1670 break;
1671 }
1672 }
1673 out:
1674 sched_unpin();
1675 vm_page_unlock_queues();
1676 if (anyvalid)
1677 pmap_invalidate_all(pmap);
1678 PMAP_UNLOCK(pmap);
1679 }
1680
1681 /*
1682 * Routine: pmap_remove_all
1683 * Function:
1684 * Removes this physical page from
1685 * all physical maps in which it resides.
1686 * Reflects back modify bits to the pager.
1687 *
1688 * Notes:
1689 * Original versions of this routine were very
1690 * inefficient because they iteratively called
1691 * pmap_remove (slow...)
1692 */
1693
1694 void
1695 pmap_remove_all(vm_page_t m)
1696 {
1697 register pv_entry_t pv;
1698 pt_entry_t *pte, tpte;
1699
1700 #if defined(PMAP_DIAGNOSTIC)
1701 /*
1702 * XXX This makes pmap_remove_all() illegal for non-managed pages!
1703 */
1704 if (m->flags & PG_FICTITIOUS) {
1705 panic("pmap_remove_all: illegal for unmanaged page, va: 0x%x",
1706 VM_PAGE_TO_PHYS(m));
1707 }
1708 #endif
1709 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1710 sched_pin();
1711 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1712 PMAP_LOCK(pv->pv_pmap);
1713 pv->pv_pmap->pm_stats.resident_count--;
1714 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
1715 tpte = pte_load_clear(pte);
1716 if (tpte & PG_W)
1717 pv->pv_pmap->pm_stats.wired_count--;
1718 if (tpte & PG_A)
1719 vm_page_flag_set(m, PG_REFERENCED);
1720
1721 /*
1722 * Update the vm_page_t clean and reference bits.
1723 */
1724 if (tpte & PG_M) {
1725 #if defined(PMAP_DIAGNOSTIC)
1726 if (pmap_nw_modified((pt_entry_t) tpte)) {
1727 printf(
1728 "pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n",
1729 pv->pv_va, tpte);
1730 }
1731 #endif
1732 if (pmap_track_modified(pv->pv_va))
1733 vm_page_dirty(m);
1734 }
1735 pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
1736 TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
1737 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1738 m->md.pv_list_count--;
1739 pmap_unuse_pt(pv->pv_pmap, pv->pv_va);
1740 PMAP_UNLOCK(pv->pv_pmap);
1741 free_pv_entry(pv);
1742 }
1743 vm_page_flag_clear(m, PG_WRITEABLE);
1744 sched_unpin();
1745 }
1746
1747 /*
1748 * Set the physical protection on the
1749 * specified range of this map as requested.
1750 */
1751 void
1752 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1753 {
1754 vm_offset_t pdnxt;
1755 pd_entry_t ptpaddr;
1756 pt_entry_t *pte;
1757 int anychanged;
1758
1759 if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1760 pmap_remove(pmap, sva, eva);
1761 return;
1762 }
1763
1764 if (prot & VM_PROT_WRITE)
1765 return;
1766
1767 anychanged = 0;
1768
1769 vm_page_lock_queues();
1770 sched_pin();
1771 PMAP_LOCK(pmap);
1772 for (; sva < eva; sva = pdnxt) {
1773 unsigned obits, pbits, pdirindex;
1774
1775 pdnxt = (sva + NBPDR) & ~PDRMASK;
1776
1777 pdirindex = sva >> PDRSHIFT;
1778 ptpaddr = pmap->pm_pdir[pdirindex];
1779
1780 /*
1781 * Weed out invalid mappings. Note: we assume that the page
1782 * directory table is always allocated, and in kernel virtual.
1783 */
1784 if (ptpaddr == 0)
1785 continue;
1786
1787 /*
1788 * Check for large page.
1789 */
1790 if ((ptpaddr & PG_PS) != 0) {
1791 pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
1792 anychanged = 1;
1793 continue;
1794 }
1795
1796 if (pdnxt > eva)
1797 pdnxt = eva;
1798
1799 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
1800 sva += PAGE_SIZE) {
1801 vm_page_t m;
1802
1803 retry:
1804 /*
1805 * Regardless of whether a pte is 32 or 64 bits in
1806 * size, PG_RW, PG_A, and PG_M are among the least
1807 * significant 32 bits.
1808 */
1809 obits = pbits = *(u_int *)pte;
1810 if (pbits & PG_MANAGED) {
1811 m = NULL;
1812 if (pbits & PG_A) {
1813 m = PHYS_TO_VM_PAGE(*pte);
1814 vm_page_flag_set(m, PG_REFERENCED);
1815 pbits &= ~PG_A;
1816 }
1817 if ((pbits & PG_M) != 0 &&
1818 pmap_track_modified(sva)) {
1819 if (m == NULL)
1820 m = PHYS_TO_VM_PAGE(*pte);
1821 vm_page_dirty(m);
1822 }
1823 }
1824
1825 pbits &= ~(PG_RW | PG_M);
1826
1827 if (pbits != obits) {
1828 if (!atomic_cmpset_int((u_int *)pte, obits,
1829 pbits))
1830 goto retry;
1831 if (obits & PG_G)
1832 pmap_invalidate_page(pmap, sva);
1833 else
1834 anychanged = 1;
1835 }
1836 }
1837 }
1838 sched_unpin();
1839 vm_page_unlock_queues();
1840 if (anychanged)
1841 pmap_invalidate_all(pmap);
1842 PMAP_UNLOCK(pmap);
1843 }
1844
1845 /*
1846 * Insert the given physical page (p) at
1847 * the specified virtual address (v) in the
1848 * target physical map with the protection requested.
1849 *
1850 * If specified, the page will be wired down, meaning
1851 * that the related pte can not be reclaimed.
1852 *
1853 * NB: This is the only routine which MAY NOT lazy-evaluate
1854 * or lose information. That is, this routine must actually
1855 * insert this page into the given map NOW.
1856 */
1857 void
1858 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
1859 boolean_t wired)
1860 {
1861 vm_paddr_t pa;
1862 register pt_entry_t *pte;
1863 vm_paddr_t opa;
1864 pt_entry_t origpte, newpte;
1865 vm_page_t mpte, om;
1866 boolean_t invlva;
1867
1868 va &= PG_FRAME;
1869 #ifdef PMAP_DIAGNOSTIC
1870 if (va > VM_MAX_KERNEL_ADDRESS)
1871 panic("pmap_enter: toobig");
1872 if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
1873 panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
1874 #endif
1875
1876 mpte = NULL;
1877
1878 vm_page_lock_queues();
1879 PMAP_LOCK(pmap);
1880 sched_pin();
1881
1882 /*
1883 * In the case that a page table page is not
1884 * resident, we are creating it here.
1885 */
1886 if (va < VM_MAXUSER_ADDRESS) {
1887 mpte = pmap_allocpte(pmap, va, M_WAITOK);
1888 }
1889 #if 0 && defined(PMAP_DIAGNOSTIC)
1890 else {
1891 pd_entry_t *pdeaddr = pmap_pde(pmap, va);
1892 origpte = *pdeaddr;
1893 if ((origpte & PG_V) == 0) {
1894 panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n",
1895 pmap->pm_pdir[PTDPTDI], origpte, va);
1896 }
1897 }
1898 #endif
1899
1900 pte = pmap_pte_quick(pmap, va);
1901
1902 /*
1903 * Page Directory table entry not valid, we need a new PT page
1904 */
1905 if (pte == NULL) {
1906 panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n",
1907 (uintmax_t)pmap->pm_pdir[PTDPTDI], va);
1908 }
1909
1910 pa = VM_PAGE_TO_PHYS(m);
1911 om = NULL;
1912 origpte = *pte;
1913 opa = origpte & PG_FRAME;
1914
1915 if (origpte & PG_PS) {
1916 /*
1917 * Yes, I know this will truncate upper address bits for PAE,
1918 * but I'm actually more interested in the lower bits
1919 */
1920 printf("pmap_enter: va %p, pte %p, origpte %p\n",
1921 (void *)va, (void *)pte, (void *)(uintptr_t)origpte);
1922 panic("pmap_enter: attempted pmap_enter on 4MB page");
1923 }
1924
1925 /*
1926 * Mapping has not changed, must be protection or wiring change.
1927 */
1928 if (origpte && (opa == pa)) {
1929 /*
1930 * Wiring change, just update stats. We don't worry about
1931 * wiring PT pages as they remain resident as long as there
1932 * are valid mappings in them. Hence, if a user page is wired,
1933 * the PT page will be also.
1934 */
1935 if (wired && ((origpte & PG_W) == 0))
1936 pmap->pm_stats.wired_count++;
1937 else if (!wired && (origpte & PG_W))
1938 pmap->pm_stats.wired_count--;
1939
1940 /*
1941 * Remove extra pte reference
1942 */
1943 if (mpte)
1944 mpte->wire_count--;
1945
1946 /*
1947 * We might be turning off write access to the page,
1948 * so we go ahead and sense modify status.
1949 */
1950 if (origpte & PG_MANAGED) {
1951 om = m;
1952 pa |= PG_MANAGED;
1953 }
1954 goto validate;
1955 }
1956 /*
1957 * Mapping has changed, invalidate old range and fall through to
1958 * handle validating new mapping.
1959 */
1960 if (opa) {
1961 if (origpte & PG_W)
1962 pmap->pm_stats.wired_count--;
1963 if (origpte & PG_MANAGED) {
1964 om = PHYS_TO_VM_PAGE(opa);
1965 pmap_remove_entry(pmap, om, va);
1966 }
1967 if (mpte != NULL) {
1968 mpte->wire_count--;
1969 KASSERT(mpte->wire_count > 0,
1970 ("pmap_enter: missing reference to page table page,"
1971 " va: 0x%x", va));
1972 }
1973 } else
1974 pmap->pm_stats.resident_count++;
1975
1976 /*
1977 * Enter on the PV list if part of our managed memory.
1978 */
1979 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
1980 pmap_insert_entry(pmap, va, m);
1981 pa |= PG_MANAGED;
1982 }
1983
1984 /*
1985 * Increment counters
1986 */
1987 if (wired)
1988 pmap->pm_stats.wired_count++;
1989
1990 validate:
1991 /*
1992 * Now validate mapping with desired protection/wiring.
1993 */
1994 newpte = (pt_entry_t)(pa | PG_V);
1995 if ((prot & VM_PROT_WRITE) != 0)
1996 newpte |= PG_RW;
1997 if (wired)
1998 newpte |= PG_W;
1999 if (va < VM_MAXUSER_ADDRESS)
2000 newpte |= PG_U;
2001 if (pmap == kernel_pmap)
2002 newpte |= pgeflag;
2003
2004 /*
2005 * if the mapping or permission bits are different, we need
2006 * to update the pte.
2007 */
2008 if ((origpte & ~(PG_M|PG_A)) != newpte) {
2009 if (origpte & PG_V) {
2010 invlva = FALSE;
2011 origpte = pte_load_store(pte, newpte | PG_A);
2012 if (origpte & PG_A) {
2013 if (origpte & PG_MANAGED)
2014 vm_page_flag_set(om, PG_REFERENCED);
2015 if (opa != VM_PAGE_TO_PHYS(m))
2016 invlva = TRUE;
2017 }
2018 if (origpte & PG_M) {
2019 KASSERT((origpte & PG_RW),
2020 ("pmap_enter: modified page not writable:"
2021 " va: 0x%x, pte: 0x%x", va, origpte));
2022 if ((origpte & PG_MANAGED) &&
2023 pmap_track_modified(va))
2024 vm_page_dirty(om);
2025 if ((prot & VM_PROT_WRITE) == 0)
2026 invlva = TRUE;
2027 }
2028 if (invlva)
2029 pmap_invalidate_page(pmap, va);
2030 } else
2031 pte_store(pte, newpte | PG_A);
2032 }
2033 sched_unpin();
2034 vm_page_unlock_queues();
2035 PMAP_UNLOCK(pmap);
2036 }
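/*
 * Editorial sketch (not part of the original source): how the validate
 * step above composes a new PTE for a wired, writable user mapping of
 * physical address "pa" (PG_MANAGED may already be OR'ed into pa for
 * managed pages):
 *
 *	newpte = pa | PG_V;		// valid mapping
 *	newpte |= PG_RW;		// VM_PROT_WRITE was requested
 *	newpte |= PG_W;			// wired (software-only bit)
 *	newpte |= PG_U;			// va < VM_MAXUSER_ADDRESS
 *	// pgeflag (the global bit) is added only for kernel_pmap.
 *
 * PG_A and PG_M are masked out of the comparison with the old PTE
 * because the MMU may set them asynchronously.
 */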
2037
2038 /*
2039 * This code makes some *MAJOR* assumptions:
2040 * 1. The current pmap and the target pmap exist.
2041 * 2. Not wired.
2042 * 3. Read access.
2043 * 4. No page table pages.
2044 * but is *MUCH* faster than pmap_enter...
2045 */
2046
2047 vm_page_t
2048 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2049 vm_page_t mpte)
2050 {
2051 pt_entry_t *pte;
2052 vm_paddr_t pa;
2053
2054 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2055 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2056 PMAP_LOCK(pmap);
2057
2058 /*
2059 * In the case that a page table page is not
2060 * resident, we are creating it here.
2061 */
2062 if (va < VM_MAXUSER_ADDRESS) {
2063 unsigned ptepindex;
2064 pd_entry_t ptepa;
2065
2066 /*
2067 * Calculate pagetable page index
2068 */
2069 ptepindex = va >> PDRSHIFT;
2070 if (mpte && (mpte->pindex == ptepindex)) {
2071 mpte->wire_count++;
2072 } else {
2073 retry:
2074 /*
2075 * Get the page directory entry
2076 */
2077 ptepa = pmap->pm_pdir[ptepindex];
2078
2079 /*
2080 * If the page table page is mapped, we just increment
2081 * the hold count, and activate it.
2082 */
2083 if (ptepa) {
2084 if (ptepa & PG_PS)
2085 panic("pmap_enter_quick: unexpected mapping into 4MB page");
2086 mpte = PHYS_TO_VM_PAGE(ptepa);
2087 mpte->wire_count++;
2088 } else {
2089 mpte = _pmap_allocpte(pmap, ptepindex,
2090 M_NOWAIT);
2091 if (mpte == NULL) {
2092 PMAP_UNLOCK(pmap);
2093 vm_page_busy(m);
2094 vm_page_unlock_queues();
2095 VM_OBJECT_UNLOCK(m->object);
2096 VM_WAIT;
2097 VM_OBJECT_LOCK(m->object);
2098 vm_page_lock_queues();
2099 vm_page_wakeup(m);
2100 PMAP_LOCK(pmap);
2101 goto retry;
2102 }
2103 }
2104 }
2105 } else {
2106 mpte = NULL;
2107 }
2108
2109 /*
2110 * This call to vtopte makes the assumption that we are
2111 * entering the page into the current pmap. In order to support
2112 * quick entry into any pmap, one would likely use pmap_pte_quick.
2113 * But that isn't as quick as vtopte.
2114 */
2115 pte = vtopte(va);
2116 if (*pte) {
2117 if (mpte != NULL) {
2118 pmap_unwire_pte_hold(pmap, mpte);
2119 mpte = NULL;
2120 }
2121 goto out;
2122 }
2123
2124 /*
2125 * Enter on the PV list if part of our managed memory; the page
2126 * queues lock, which is held across this function, protects the
2127 * pv lists.
2128 */
2129 if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0)
2130 pmap_insert_entry(pmap, va, m);
2131
2132 /*
2133 * Increment counters
2134 */
2135 pmap->pm_stats.resident_count++;
2136
2137 pa = VM_PAGE_TO_PHYS(m);
2138
2139 /*
2140 * Now validate mapping with RO protection
2141 */
2142 if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2143 pte_store(pte, pa | PG_V | PG_U);
2144 else
2145 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
2146 out:
2147 PMAP_UNLOCK(pmap);
2148 return mpte;
2149 }
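/*
 * Editorial sketch (not part of the original source): pmap_enter_quick()
 * is meant for speculative, read-only prefaulting.  A hypothetical caller
 * threads the returned page table page ("mpte") through successive calls
 * so a shared PT page is looked up or allocated only once per
 * page-directory region:
 *
 *	mpte = NULL;
 *	for (va = start; va < end; va += PAGE_SIZE) {
 *		mpte = pmap_enter_quick(pmap, va, m, prot, mpte);
 *		m = next_resident_page(m);	// placeholder iterator
 *	}
 *
 * The loop and next_resident_page() are illustrative only; real callers
 * live in the machine-independent VM code (e.g. the fault-ahead path).
 */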
2150
2151 /*
2152 * Make a temporary mapping for a physical address. This is only intended
2153 * to be used for panic dumps.
2154 */
2155 void *
2156 pmap_kenter_temporary(vm_paddr_t pa, int i)
2157 {
2158 vm_offset_t va;
2159
2160 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
2161 pmap_kenter(va, pa);
2162 invlpg(va);
2163 return ((void *)crashdumpmap);
2164 }
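/*
 * Editorial sketch (not part of the original source): intended dump-time
 * use, mapping one physical page into slot "i" of crashdumpmap and
 * touching it through the returned KVA.  dump_one_page() is a
 * hypothetical consumer, not a real kernel routine.
 *
 *	void *va = pmap_kenter_temporary(pa, 0);
 *	dump_one_page(va);	// read the freshly mapped page
 *
 * Note that the function always returns the base of crashdumpmap, so with
 * i > 0 the caller must add i * PAGE_SIZE itself.
 */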
2165
2166 /*
2167 * This code maps large physical mmap regions into the
2168 * processor address space. Note that some shortcuts
2169 * are taken, but the code works.
2170 */
2171 void
2172 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
2173 vm_object_t object, vm_pindex_t pindex,
2174 vm_size_t size)
2175 {
2176 vm_page_t p;
2177
2178 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2179 KASSERT(object->type == OBJT_DEVICE,
2180 ("pmap_object_init_pt: non-device object"));
2181 if (pseflag &&
2182 ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
2183 int i;
2184 vm_page_t m[1];
2185 unsigned int ptepindex;
2186 int npdes;
2187 pd_entry_t ptepa;
2188
2189 PMAP_LOCK(pmap);
2190 if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
2191 goto out;
2192 PMAP_UNLOCK(pmap);
2193 retry:
2194 p = vm_page_lookup(object, pindex);
2195 if (p != NULL) {
2196 vm_page_lock_queues();
2197 if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
2198 goto retry;
2199 } else {
2200 p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2201 if (p == NULL)
2202 return;
2203 m[0] = p;
2204
2205 if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2206 vm_page_lock_queues();
2207 vm_page_free(p);
2208 vm_page_unlock_queues();
2209 return;
2210 }
2211
2212 p = vm_page_lookup(object, pindex);
2213 vm_page_lock_queues();
2214 vm_page_wakeup(p);
2215 }
2216 vm_page_unlock_queues();
2217
2218 ptepa = VM_PAGE_TO_PHYS(p);
2219 if (ptepa & (NBPDR - 1))
2220 return;
2221
2222 p->valid = VM_PAGE_BITS_ALL;
2223
2224 PMAP_LOCK(pmap);
2225 pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
2226 npdes = size >> PDRSHIFT;
2227 for(i = 0; i < npdes; i++) {
2228 pde_store(&pmap->pm_pdir[ptepindex],
2229 ptepa | PG_U | PG_RW | PG_V | PG_PS);
2230 ptepa += NBPDR;
2231 ptepindex += 1;
2232 }
2233 pmap_invalidate_all(pmap);
2234 out:
2235 PMAP_UNLOCK(pmap);
2236 }
2237 }
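/*
 * Editorial note (not part of the original source): the pseflag path above
 * runs only when both "addr" and "size" are aligned to the superpage size
 * NBPDR (4MB without PAE, 2MB with PAE).  For example, with 4MB superpages
 *
 *	(0x00800000 & (NBPDR - 1)) == 0		// 8MB: aligned, eligible
 *	(0x00801000 & (NBPDR - 1)) != 0		// 8MB + 4KB: rejected
 *
 * and each loop iteration stores one PG_PS page directory entry covering
 * NBPDR bytes of the device object.
 */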
2238
2239 /*
2240 * Routine: pmap_change_wiring
2241 * Function: Change the wiring attribute for a map/virtual-address
2242 * pair.
2243 * In/out conditions:
2244 * The mapping must already exist in the pmap.
2245 */
2246 void
2247 pmap_change_wiring(pmap, va, wired)
2248 register pmap_t pmap;
2249 vm_offset_t va;
2250 boolean_t wired;
2251 {
2252 register pt_entry_t *pte;
2253
2254 PMAP_LOCK(pmap);
2255 pte = pmap_pte(pmap, va);
2256
2257 if (wired && !pmap_pte_w(pte))
2258 pmap->pm_stats.wired_count++;
2259 else if (!wired && pmap_pte_w(pte))
2260 pmap->pm_stats.wired_count--;
2261
2262 /*
2263 * Wiring is not a hardware characteristic so there is no need to
2264 * invalidate TLB.
2265 */
2266 pmap_pte_set_w(pte, wired);
2267 pmap_pte_release(pte);
2268 PMAP_UNLOCK(pmap);
2269 }
2270
2271
2272
2273 /*
2274 * Copy the range specified by src_addr/len
2275 * from the source map to the range dst_addr/len
2276 * in the destination map.
2277 *
2278 * This routine is only advisory and need not do anything.
2279 */
2280
2281 void
2282 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2283 vm_offset_t src_addr)
2284 {
2285 vm_offset_t addr;
2286 vm_offset_t end_addr = src_addr + len;
2287 vm_offset_t pdnxt;
2288 vm_page_t m;
2289
2290 if (dst_addr != src_addr)
2291 return;
2292
2293 if (!pmap_is_current(src_pmap))
2294 return;
2295
2296 vm_page_lock_queues();
2297 if (dst_pmap < src_pmap) {
2298 PMAP_LOCK(dst_pmap);
2299 PMAP_LOCK(src_pmap);
2300 } else {
2301 PMAP_LOCK(src_pmap);
2302 PMAP_LOCK(dst_pmap);
2303 }
2304 sched_pin();
2305 for (addr = src_addr; addr < end_addr; addr = pdnxt) {
2306 pt_entry_t *src_pte, *dst_pte;
2307 vm_page_t dstmpte, srcmpte;
2308 pd_entry_t srcptepaddr;
2309 unsigned ptepindex;
2310
2311 if (addr >= UPT_MIN_ADDRESS)
2312 panic("pmap_copy: invalid to pmap_copy page tables");
2313
2314 /*
2315 * Don't let optional prefaulting of pages make us go
2316 * way below the low water mark of free pages or way above the
2317 * high water mark of used pv entries.
2318 */
2319 if (cnt.v_free_count < cnt.v_free_reserved ||
2320 pv_entry_count > pv_entry_high_water)
2321 break;
2322
2323 pdnxt = (addr + NBPDR) & ~PDRMASK;
2324 ptepindex = addr >> PDRSHIFT;
2325
2326 srcptepaddr = src_pmap->pm_pdir[ptepindex];
2327 if (srcptepaddr == 0)
2328 continue;
2329
2330 if (srcptepaddr & PG_PS) {
2331 if (dst_pmap->pm_pdir[ptepindex] == 0) {
2332 dst_pmap->pm_pdir[ptepindex] = srcptepaddr;
2333 dst_pmap->pm_stats.resident_count +=
2334 NBPDR / PAGE_SIZE;
2335 }
2336 continue;
2337 }
2338
2339 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
2340 if (srcmpte->wire_count == 0)
2341 panic("pmap_copy: source page table page is unused");
2342
2343 if (pdnxt > end_addr)
2344 pdnxt = end_addr;
2345
2346 src_pte = vtopte(addr);
2347 while (addr < pdnxt) {
2348 pt_entry_t ptetemp;
2349 ptetemp = *src_pte;
2350 /*
2351 * We only virtually copy managed pages.
2352 */
2353 if ((ptetemp & PG_MANAGED) != 0) {
2354 /*
2355 * We have to check after allocpte for the
2356 * pte still being around... allocpte can
2357 * block.
2358 */
2359 dstmpte = pmap_allocpte(dst_pmap, addr,
2360 M_NOWAIT);
2361 if (dstmpte == NULL)
2362 break;
2363 dst_pte = pmap_pte_quick(dst_pmap, addr);
2364 if (*dst_pte == 0) {
2365 /*
2366 * Clear the modified and
2367 * accessed (referenced) bits
2368 * during the copy.
2369 */
2370 m = PHYS_TO_VM_PAGE(ptetemp);
2371 *dst_pte = ptetemp & ~(PG_M | PG_A);
2372 dst_pmap->pm_stats.resident_count++;
2373 pmap_insert_entry(dst_pmap, addr, m);
2374 } else
2375 pmap_unwire_pte_hold(dst_pmap, dstmpte);
2376 if (dstmpte->wire_count >= srcmpte->wire_count)
2377 break;
2378 }
2379 addr += PAGE_SIZE;
2380 src_pte++;
2381 }
2382 }
2383 sched_unpin();
2384 vm_page_unlock_queues();
2385 PMAP_UNLOCK(src_pmap);
2386 PMAP_UNLOCK(dst_pmap);
2387 }
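/*
 * Editorial note (not part of the original source): the inner loop above
 * installs
 *
 *	*dst_pte = ptetemp & ~(PG_M | PG_A);
 *
 * so the child mapping starts with clean accessed/modified bits even when
 * the parent's PTE had them set; page aging and dirty tracking in the new
 * pmap are not polluted by the parent's history.  PG_RW is copied as-is;
 * any copy-on-write protection is arranged by the machine-independent VM
 * code, not by pmap_copy() itself.
 */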
2388
2389 static __inline void
2390 pagezero(void *page)
2391 {
2392 #if defined(I686_CPU)
2393 if (cpu_class == CPUCLASS_686) {
2394 #if defined(CPU_ENABLE_SSE)
2395 if (cpu_feature & CPUID_SSE2)
2396 sse2_pagezero(page);
2397 else
2398 #endif
2399 i686_pagezero(page);
2400 } else
2401 #endif
2402 bzero(page, PAGE_SIZE);
2403 }
2404
2405 /*
2406 * pmap_zero_page zeros the specified hardware page by mapping
2407 * the page into KVM and using bzero to clear its contents.
2408 */
2409 void
2410 pmap_zero_page(vm_page_t m)
2411 {
2412 struct sysmaps *sysmaps;
2413
2414 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
2415 mtx_lock(&sysmaps->lock);
2416 if (*sysmaps->CMAP2)
2417 panic("pmap_zero_page: CMAP2 busy");
2418 sched_pin();
2419 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2420 invlcaddr(sysmaps->CADDR2);
2421 pagezero(sysmaps->CADDR2);
2422 *sysmaps->CMAP2 = 0;
2423 sched_unpin();
2424 mtx_unlock(&sysmaps->lock);
2425 }
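/*
 * Editorial sketch (not part of the original source): pmap_zero_page(),
 * pmap_zero_page_area() and pmap_copy_page() all follow the same per-CPU
 * temporary-mapping pattern, roughly:
 *
 *	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];	// this CPU's window
 *	mtx_lock(&sysmaps->lock);
 *	sched_pin();					// stay on this CPU
 *	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
 *	invlcaddr(sysmaps->CADDR2);			// flush any stale TLB entry
 *	... operate on sysmaps->CADDR2 ...
 *	*sysmaps->CMAP2 = 0;
 *	sched_unpin();
 *	mtx_unlock(&sysmaps->lock);
 *
 * Pinning matters because CADDR2/CMAP2 belong to the CPU whose window was
 * taken; migrating mid-operation would dereference another CPU's mapping.
 */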
2426
2427 /*
2428 * pmap_zero_page_area zeros the specified hardware page by mapping
2429 * the page into KVM and using bzero to clear its contents.
2430 *
2431 * off and size may not cover an area beyond a single hardware page.
2432 */
2433 void
2434 pmap_zero_page_area(vm_page_t m, int off, int size)
2435 {
2436 struct sysmaps *sysmaps;
2437
2438 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
2439 mtx_lock(&sysmaps->lock);
2440 if (*sysmaps->CMAP2)
2441 panic("pmap_zero_page_area: CMAP2 busy");
2442 sched_pin();
2443 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2444 invlcaddr(sysmaps->CADDR2);
2445 if (off == 0 && size == PAGE_SIZE)
2446 pagezero(sysmaps->CADDR2);
2447 else
2448 bzero((char *)sysmaps->CADDR2 + off, size);
2449 *sysmaps->CMAP2 = 0;
2450 sched_unpin();
2451 mtx_unlock(&sysmaps->lock);
2452 }
2453
2454 /*
2455 * pmap_zero_page_idle zeros the specified hardware page by mapping
2456 * the page into KVM and using bzero to clear its contents. This
2457 * is intended to be called from the vm_pagezero process only and
2458 * outside of Giant.
2459 */
2460 void
2461 pmap_zero_page_idle(vm_page_t m)
2462 {
2463
2464 if (*CMAP3)
2465 panic("pmap_zero_page_idle: CMAP3 busy");
2466 sched_pin();
2467 *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2468 invlcaddr(CADDR3);
2469 pagezero(CADDR3);
2470 *CMAP3 = 0;
2471 sched_unpin();
2472 }
2473
2474 /*
2475 * pmap_copy_page copies the specified (machine independent)
2476 * page by mapping the page into virtual memory and using
2477 * bcopy to copy the page, one machine dependent page at a
2478 * time.
2479 */
2480 void
2481 pmap_copy_page(vm_page_t src, vm_page_t dst)
2482 {
2483 struct sysmaps *sysmaps;
2484
2485 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
2486 mtx_lock(&sysmaps->lock);
2487 if (*sysmaps->CMAP1)
2488 panic("pmap_copy_page: CMAP1 busy");
2489 if (*sysmaps->CMAP2)
2490 panic("pmap_copy_page: CMAP2 busy");
2491 sched_pin();
2492 invlpg((u_int)sysmaps->CADDR1);
2493 invlpg((u_int)sysmaps->CADDR2);
2494 *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A;
2495 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M;
2496 bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
2497 *sysmaps->CMAP1 = 0;
2498 *sysmaps->CMAP2 = 0;
2499 sched_unpin();
2500 mtx_unlock(&sysmaps->lock);
2501 }
2502
2503 /*
2504 * Returns true if the pmap's pv is one of the first
2505 * 16 pvs linked to from this page. This count may
2506 * be changed upwards or downwards in the future; it
2507 * is only necessary that true be returned for a small
2508 * subset of pmaps for proper page aging.
2509 */
2510 boolean_t
2511 pmap_page_exists_quick(pmap, m)
2512 pmap_t pmap;
2513 vm_page_t m;
2514 {
2515 pv_entry_t pv;
2516 int loops = 0;
2517
2518 if (m->flags & PG_FICTITIOUS)
2519 return FALSE;
2520
2521 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2522 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2523 if (pv->pv_pmap == pmap) {
2524 return TRUE;
2525 }
2526 loops++;
2527 if (loops >= 16)
2528 break;
2529 }
2530 return (FALSE);
2531 }
2532
2533 #define PMAP_REMOVE_PAGES_CURPROC_ONLY
2534 /*
2535 * Remove all pages from the specified address space; this aids
2536 * process exit speeds. Also, this code is special-cased for the
2537 * current process only, but the more generic (and slightly
2538 * slower) mode can be enabled. This is much faster than
2539 * pmap_remove in the case of running down an entire address
2540 * space.
2541 */
2542 void
2543 pmap_remove_pages(pmap, sva, eva)
2544 pmap_t pmap;
2545 vm_offset_t sva, eva;
2546 {
2547 pt_entry_t *pte, tpte;
2548 vm_page_t m;
2549 pv_entry_t pv, npv;
2550
2551 #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2552 if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
2553 printf("warning: pmap_remove_pages called with non-current pmap\n");
2554 return;
2555 }
2556 #endif
2557 vm_page_lock_queues();
2558 PMAP_LOCK(pmap);
2559 sched_pin();
2560 for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
2561
2562 if (pv->pv_va >= eva || pv->pv_va < sva) {
2563 npv = TAILQ_NEXT(pv, pv_plist);
2564 continue;
2565 }
2566
2567 #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2568 pte = vtopte(pv->pv_va);
2569 #else
2570 pte = pmap_pte_quick(pmap, pv->pv_va);
2571 #endif
2572 tpte = *pte;
2573
2574 if (tpte == 0) {
2575 printf("TPTE at %p IS ZERO @ VA %08x\n",
2576 pte, pv->pv_va);
2577 panic("bad pte");
2578 }
2579
2580 /*
2581 * We cannot remove wired pages from a process' mapping at this time
2582 */
2583 if (tpte & PG_W) {
2584 npv = TAILQ_NEXT(pv, pv_plist);
2585 continue;
2586 }
2587
2588 m = PHYS_TO_VM_PAGE(tpte);
2589 KASSERT(m->phys_addr == (tpte & PG_FRAME),
2590 ("vm_page_t %p phys_addr mismatch %016jx %016jx",
2591 m, (uintmax_t)m->phys_addr, (uintmax_t)tpte));
2592
2593 KASSERT(m < &vm_page_array[vm_page_array_size],
2594 ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte));
2595
2596 pmap->pm_stats.resident_count--;
2597
2598 pte_clear(pte);
2599
2600 /*
2601 * Update the vm_page_t clean and reference bits.
2602 */
2603 if (tpte & PG_M) {
2604 vm_page_dirty(m);
2605 }
2606
2607 npv = TAILQ_NEXT(pv, pv_plist);
2608 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
2609
2610 m->md.pv_list_count--;
2611 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2612 if (TAILQ_EMPTY(&m->md.pv_list))
2613 vm_page_flag_clear(m, PG_WRITEABLE);
2614
2615 pmap_unuse_pt(pmap, pv->pv_va);
2616 free_pv_entry(pv);
2617 }
2618 sched_unpin();
2619 pmap_invalidate_all(pmap);
2620 PMAP_UNLOCK(pmap);
2621 vm_page_unlock_queues();
2622 }
2623
2624 /*
2625 * pmap_is_modified:
2626 *
2627 * Return whether or not the specified physical page was modified
2628 * in any physical maps.
2629 */
2630 boolean_t
2631 pmap_is_modified(vm_page_t m)
2632 {
2633 pv_entry_t pv;
2634 pt_entry_t *pte;
2635 boolean_t rv;
2636
2637 rv = FALSE;
2638 if (m->flags & PG_FICTITIOUS)
2639 return (rv);
2640
2641 sched_pin();
2642 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2643 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2644 /*
2645 * Mappings within the kernel's clean submap are never
2646 * considered modified; pmap_track_modified() filters them
2647 * out.
2648 */
2649 if (!pmap_track_modified(pv->pv_va))
2650 continue;
2651 PMAP_LOCK(pv->pv_pmap);
2652 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2653 rv = (*pte & PG_M) != 0;
2654 PMAP_UNLOCK(pv->pv_pmap);
2655 if (rv)
2656 break;
2657 }
2658 sched_unpin();
2659 return (rv);
2660 }
2661
2662 /*
2663 * pmap_is_prefaultable:
2664 *
2665 * Return whether or not the specified virtual address is eligible
2666 * for prefault.
2667 */
2668 boolean_t
2669 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
2670 {
2671 pt_entry_t *pte;
2672 boolean_t rv;
2673
2674 rv = FALSE;
2675 PMAP_LOCK(pmap);
2676 if (*pmap_pde(pmap, addr)) {
2677 pte = vtopte(addr);
2678 rv = *pte == 0;
2679 }
2680 PMAP_UNLOCK(pmap);
2681 return (rv);
2682 }
2683
2684 /*
2685 * Clear the given bit in each of the given page's ptes. The bit is
2686 * expressed as a 32-bit mask. Consequently, if the pte is 64 bits in
2687 * size, only a bit within the least significant 32 can be cleared.
2688 */
2689 static __inline void
2690 pmap_clear_ptes(vm_page_t m, int bit)
2691 {
2692 register pv_entry_t pv;
2693 pt_entry_t pbits, *pte;
2694
2695 if ((m->flags & PG_FICTITIOUS) ||
2696 (bit == PG_RW && (m->flags & PG_WRITEABLE) == 0))
2697 return;
2698
2699 sched_pin();
2700 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2701 /*
2702 * Loop over all current mappings, setting/clearing as appropriate.
2703 * (If setting read-only, do we need to clear the VAC?)
2704 */
2705 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2706 /*
2707 * don't write protect pager mappings
2708 */
2709 if (bit == PG_RW) {
2710 if (!pmap_track_modified(pv->pv_va))
2711 continue;
2712 }
2713
2714 PMAP_LOCK(pv->pv_pmap);
2715 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2716 retry:
2717 pbits = *pte;
2718 if (pbits & bit) {
2719 if (bit == PG_RW) {
2720 /*
2721 * Regardless of whether a pte is 32 or 64 bits
2722 * in size, PG_RW and PG_M are among the least
2723 * significant 32 bits.
2724 */
2725 if (!atomic_cmpset_int((u_int *)pte, pbits,
2726 pbits & ~(PG_RW | PG_M)))
2727 goto retry;
2728 if (pbits & PG_M) {
2729 vm_page_dirty(m);
2730 }
2731 } else {
2732 atomic_clear_int((u_int *)pte, bit);
2733 }
2734 pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2735 }
2736 PMAP_UNLOCK(pv->pv_pmap);
2737 }
2738 if (bit == PG_RW)
2739 vm_page_flag_clear(m, PG_WRITEABLE);
2740 sched_unpin();
2741 }
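/*
 * Editorial note (not part of the original source): the PG_RW branch above
 * is a compare-and-swap retry loop.  Re-reading *pte into pbits and
 * retrying when atomic_cmpset_int() fails protects against the MMU setting
 * PG_A or PG_M between the load and the store; a plain
 *
 *	*pte = pbits & ~(PG_RW | PG_M);
 *
 * could silently discard a concurrent hardware PG_M update and lose a
 * page's dirty state.
 */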
2742
2743 /*
2744 * pmap_page_protect:
2745 *
2746 * Lower the permission for all mappings to a given page.
2747 */
2748 void
2749 pmap_page_protect(vm_page_t m, vm_prot_t prot)
2750 {
2751 if ((prot & VM_PROT_WRITE) == 0) {
2752 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
2753 pmap_clear_ptes(m, PG_RW);
2754 } else {
2755 pmap_remove_all(m);
2756 }
2757 }
2758 }
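/*
 * Editorial note (not part of the original source): the effect of
 * pmap_page_protect() by requested protection, as implemented above:
 *
 *	prot includes VM_PROT_WRITE	-> no change
 *	read and/or execute only	-> clear PG_RW (and PG_M) in every
 *					   mapping of the page
 *	VM_PROT_NONE			-> pmap_remove_all(m)
 *
 * For example, pmap_page_protect(m, VM_PROT_READ) write-protects every
 * existing mapping of "m" without tearing the mappings down.
 */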
2759
2760 /*
2761 * pmap_ts_referenced:
2762 *
2763 * Return a count of reference bits for a page, clearing those bits.
2764 * It is not necessary for every reference bit to be cleared, but it
2765 * is necessary that 0 only be returned when there are truly no
2766 * reference bits set.
2767 *
2768 * XXX: The exact number of bits to check and clear is a matter that
2769 * should be tested and standardized at some point in the future for
2770 * optimal aging of shared pages.
2771 */
2772 int
2773 pmap_ts_referenced(vm_page_t m)
2774 {
2775 register pv_entry_t pv, pvf, pvn;
2776 pt_entry_t *pte;
2777 pt_entry_t v;
2778 int rtval = 0;
2779
2780 if (m->flags & PG_FICTITIOUS)
2781 return (rtval);
2782
2783 sched_pin();
2784 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2785 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2786
2787 pvf = pv;
2788
2789 do {
2790 pvn = TAILQ_NEXT(pv, pv_list);
2791
2792 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2793
2794 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2795
2796 if (!pmap_track_modified(pv->pv_va))
2797 continue;
2798
2799 PMAP_LOCK(pv->pv_pmap);
2800 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2801
2802 if (pte && ((v = pte_load(pte)) & PG_A) != 0) {
2803 atomic_clear_int((u_int *)pte, PG_A);
2804 pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2805
2806 rtval++;
2807 if (rtval > 4) {
2808 PMAP_UNLOCK(pv->pv_pmap);
2809 break;
2810 }
2811 }
2812 PMAP_UNLOCK(pv->pv_pmap);
2813 } while ((pv = pvn) != NULL && pv != pvf);
2814 }
2815 sched_unpin();
2816
2817 return (rtval);
2818 }
2819
2820 /*
2821 * Clear the modify bits on the specified physical page.
2822 */
2823 void
2824 pmap_clear_modify(vm_page_t m)
2825 {
2826 pmap_clear_ptes(m, PG_M);
2827 }
2828
2829 /*
2830 * pmap_clear_reference:
2831 *
2832 * Clear the reference bit on the specified physical page.
2833 */
2834 void
2835 pmap_clear_reference(vm_page_t m)
2836 {
2837 pmap_clear_ptes(m, PG_A);
2838 }
2839
2840 /*
2841 * Miscellaneous support routines follow
2842 */
2843
2844 /*
2845 * Map a set of physical memory pages into the kernel virtual
2846 * address space. Return a pointer to where it is mapped. This
2847 * routine is intended to be used for mapping device memory,
2848 * NOT real memory.
2849 */
2850 void *
2851 pmap_mapdev(pa, size)
2852 vm_paddr_t pa;
2853 vm_size_t size;
2854 {
2855 vm_offset_t va, tmpva, offset;
2856
2857 offset = pa & PAGE_MASK;
2858 size = roundup(offset + size, PAGE_SIZE);
2859 pa = pa & PG_FRAME;
2860
2861 if (pa < KERNLOAD && pa + size <= KERNLOAD)
2862 va = KERNBASE + pa;
2863 else
2864 va = kmem_alloc_nofault(kernel_map, size);
2865 if (!va)
2866 panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
2867
2868 for (tmpva = va; size > 0; ) {
2869 pmap_kenter(tmpva, pa);
2870 size -= PAGE_SIZE;
2871 tmpva += PAGE_SIZE;
2872 pa += PAGE_SIZE;
2873 }
2874 pmap_invalidate_range(kernel_pmap, va, tmpva);
2875 return ((void *)(va + offset));
2876 }
2877
2878 void
2879 pmap_unmapdev(va, size)
2880 vm_offset_t va;
2881 vm_size_t size;
2882 {
2883 vm_offset_t base, offset, tmpva;
2884
2885 if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
2886 return;
2887 base = va & PG_FRAME;
2888 offset = va & PAGE_MASK;
2889 size = roundup(offset + size, PAGE_SIZE);
2890 for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
2891 pmap_kremove(tmpva);
2892 pmap_invalidate_range(kernel_pmap, va, tmpva);
2893 kmem_free(kernel_map, base, size);
2894 }
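/*
 * Editorial sketch (not part of the original source): typical pairing of
 * pmap_mapdev()/pmap_unmapdev() for a memory-mapped device region; the
 * physical address and size below are placeholders, not a real device.
 *
 *	volatile uint32_t *regs;
 *
 *	regs = pmap_mapdev(0xfed00000, 4096);	// map 4KB of register space
 *	(void)regs[0];				// device register access
 *	pmap_unmapdev((vm_offset_t)regs, 4096);
 *
 * pmap_mapdev() returns a pointer offset by (pa & PAGE_MASK), so the value
 * handed back need not be page aligned; pmap_unmapdev() recovers the
 * page-aligned base itself before freeing the KVA.
 */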
2895
2896 /*
2897 * perform the pmap work for mincore
2898 */
2899 int
2900 pmap_mincore(pmap, addr)
2901 pmap_t pmap;
2902 vm_offset_t addr;
2903 {
2904 pt_entry_t *ptep, pte;
2905 vm_page_t m;
2906 int val = 0;
2907
2908 PMAP_LOCK(pmap);
2909 ptep = pmap_pte(pmap, addr);
2910 pte = (ptep != NULL) ? *ptep : 0;
2911 pmap_pte_release(ptep);
2912 PMAP_UNLOCK(pmap);
2913
2914 if (pte != 0) {
2915 vm_paddr_t pa;
2916
2917 val = MINCORE_INCORE;
2918 if ((pte & PG_MANAGED) == 0)
2919 return val;
2920
2921 pa = pte & PG_FRAME;
2922
2923 m = PHYS_TO_VM_PAGE(pa);
2924
2925 /*
2926 * Modified by us
2927 */
2928 if (pte & PG_M)
2929 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
2930 else {
2931 /*
2932 * Modified by someone else
2933 */
2934 vm_page_lock_queues();
2935 if (m->dirty || pmap_is_modified(m))
2936 val |= MINCORE_MODIFIED_OTHER;
2937 vm_page_unlock_queues();
2938 }
2939 /*
2940 * Referenced by us
2941 */
2942 if (pte & PG_A)
2943 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
2944 else {
2945 /*
2946 * Referenced by someone else
2947 */
2948 vm_page_lock_queues();
2949 if ((m->flags & PG_REFERENCED) ||
2950 pmap_ts_referenced(m)) {
2951 val |= MINCORE_REFERENCED_OTHER;
2952 vm_page_flag_set(m, PG_REFERENCED);
2953 }
2954 vm_page_unlock_queues();
2955 }
2956 }
2957 return val;
2958 }
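/*
 * Editorial note (not part of the original source): the value computed above
 * is a bitwise OR of MINCORE_* flags consumed by the mincore(2) system call.
 * For a resident, managed page that this pmap has both written and
 * referenced, the result would be
 *
 *	MINCORE_INCORE | MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER |
 *	    MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER
 *
 * while an unmanaged mapping (PG_MANAGED clear) reports only
 * MINCORE_INCORE.
 */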
2959
2960 void
2961 pmap_activate(struct thread *td)
2962 {
2963 struct proc *p = td->td_proc;
2964 pmap_t pmap, oldpmap;
2965 u_int32_t cr3;
2966
2967 critical_enter();
2968 pmap = vmspace_pmap(td->td_proc->p_vmspace);
2969 oldpmap = PCPU_GET(curpmap);
2970 #if defined(SMP)
2971 atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
2972 atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
2973 #else
2974 oldpmap->pm_active &= ~1;
2975 pmap->pm_active |= 1;
2976 #endif
2977 #ifdef PAE
2978 cr3 = vtophys(pmap->pm_pdpt);
2979 #else
2980 cr3 = vtophys(pmap->pm_pdir);
2981 #endif
2982 /* XXXKSE this is wrong.
2983 * pmap_activate is for the current thread on the current cpu
2984 */
2985 if (p->p_flag & P_SA) {
2986 /* Make sure all other cr3 entries are updated. */
2987 /* what if they are running? XXXKSE (maybe abort them) */
2988 FOREACH_THREAD_IN_PROC(p, td) {
2989 td->td_pcb->pcb_cr3 = cr3;
2990 }
2991 } else {
2992 td->td_pcb->pcb_cr3 = cr3;
2993 }
2994 load_cr3(cr3);
2995 PCPU_SET(curpmap, pmap);
2996 critical_exit();
2997 }
2998
2999 vm_offset_t
3000 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
3001 {
3002
3003 if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3004 return addr;
3005 }
3006
3007 addr = (addr + PDRMASK) & ~PDRMASK;
3008 return addr;
3009 }
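/*
 * Editorial note (not part of the original source): the hint rounds the
 * suggested address up to the next superpage boundary so that large device
 * mappings can later use PG_PS page directory entries.  With 4MB
 * superpages (PDRMASK == 0x3fffff), for example:
 *
 *	addr = 0x10001000;
 *	addr = (addr + PDRMASK) & ~PDRMASK;	// -> 0x10400000
 *
 * An already-aligned address is returned unchanged, and objects smaller
 * than NBPDR or non-device objects get no hint at all.
 */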
3010
3011
3012 #if defined(PMAP_DEBUG)
3013 int pmap_pid_dump(int pid)
3014 {
3015 pmap_t pmap;
3016 struct proc *p;
3017 int npte = 0;
3018 int index;
3019
3020 sx_slock(&allproc_lock);
3021 LIST_FOREACH(p, &allproc, p_list) {
3022 if (p->p_pid != pid)
3023 continue;
3024
3025 if (p->p_vmspace) {
3026 int i,j;
3027 index = 0;
3028 pmap = vmspace_pmap(p->p_vmspace);
3029 for (i = 0; i < NPDEPTD; i++) {
3030 pd_entry_t *pde;
3031 pt_entry_t *pte;
3032 vm_offset_t base = i << PDRSHIFT;
3033
3034 pde = &pmap->pm_pdir[i];
3035 if (pde && pmap_pde_v(pde)) {
3036 for (j = 0; j < NPTEPG; j++) {
3037 vm_offset_t va = base + (j << PAGE_SHIFT);
3038 if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
3039 if (index) {
3040 index = 0;
3041 printf("\n");
3042 }
3043 sx_sunlock(&allproc_lock);
3044 return npte;
3045 }
3046 pte = pmap_pte(pmap, va);
3047 if (pte && pmap_pte_v(pte)) {
3048 pt_entry_t pa;
3049 vm_page_t m;
3050 pa = *pte;
3051 m = PHYS_TO_VM_PAGE(pa);
3052 printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
3053 va, pa, m->hold_count, m->wire_count, m->flags);
3054 npte++;
3055 index++;
3056 if (index >= 2) {
3057 index = 0;
3058 printf("\n");
3059 } else {
3060 printf(" ");
3061 }
3062 }
3063 }
3064 }
3065 }
3066 }
3067 }
3068 sx_sunlock(&allproc_lock);
3069 return npte;
3070 }
3071 #endif
3072
3073 #if defined(DEBUG)
3074
3075 static void pads(pmap_t pm);
3076 void pmap_pvdump(vm_offset_t pa);
3077
3078 /* print address space of pmap */
3079 static void
3080 pads(pm)
3081 pmap_t pm;
3082 {
3083 int i, j;
3084 vm_paddr_t va;
3085 pt_entry_t *ptep;
3086
3087 if (pm == kernel_pmap)
3088 return;
3089 for (i = 0; i < NPDEPTD; i++)
3090 if (pm->pm_pdir[i])
3091 for (j = 0; j < NPTEPG; j++) {
3092 va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
3093 if (pm == kernel_pmap && va < KERNBASE)
3094 continue;
3095 if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
3096 continue;
3097 ptep = pmap_pte(pm, va);
3098 if (pmap_pte_v(ptep))
3099 printf("%x:%x ", va, *ptep);
3100 };
3101
3102 }
3103
3104 void
3105 pmap_pvdump(pa)
3106 vm_paddr_t pa;
3107 {
3108 pv_entry_t pv;
3109 vm_page_t m;
3110
3111 printf("pa %x", pa);
3112 m = PHYS_TO_VM_PAGE(pa);
3113 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3114 printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va);
3115 pads(pv->pv_pmap);
3116 }
3117 printf(" ");
3118 }
3119 #endif