FreeBSD/Linux Kernel Cross Reference
sys/amd64/amd64/pmap.c
1 /*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * the Systems Programming Group of the University of Utah Computer
13 * Science Department and William Jolitz of UUNET Technologies Inc.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. All advertising materials mentioning features or use of this software
24 * must display the following acknowledgement:
25 * This product includes software developed by the University of
26 * California, Berkeley and its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 * may be used to endorse or promote products derived from this software
29 * without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 *
43 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
44 */
45 /*-
46 * Copyright (c) 2003 Networks Associates Technology, Inc.
47 * All rights reserved.
48 *
49 * This software was developed for the FreeBSD Project by Jake Burkholder,
50 * Safeport Network Services, and Network Associates Laboratories, the
51 * Security Research Division of Network Associates, Inc. under
52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53 * CHATS research program.
54 *
55 * Redistribution and use in source and binary forms, with or without
56 * modification, are permitted provided that the following conditions
57 * are met:
58 * 1. Redistributions of source code must retain the above copyright
59 * notice, this list of conditions and the following disclaimer.
60 * 2. Redistributions in binary form must reproduce the above copyright
61 * notice, this list of conditions and the following disclaimer in the
62 * documentation and/or other materials provided with the distribution.
63 *
64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * SUCH DAMAGE.
75 */
76
77 #include <sys/cdefs.h>
78 __FBSDID("$FreeBSD: releng/5.2/sys/amd64/amd64/pmap.c 123179 2003-12-06 23:18:42Z peter $");
79
80 /*
81 * Manages physical address maps.
82 *
83 * In addition to hardware address maps, this
84 * module is called upon to provide software-use-only
85 * maps which may or may not be stored in the same
86 * form as hardware maps. These pseudo-maps are
87 * used to store intermediate results from copy
88 * operations to and from address spaces.
89 *
90 * Since the information managed by this module is
91 * also stored by the logical address mapping module,
92 * this module may throw away valid virtual-to-physical
93 * mappings at almost any time. However, invalidations
94 * of virtual-to-physical mappings must be done as
95 * requested.
96 *
97 * In order to cope with hardware architectures which
98 * make virtual-to-physical map invalidates expensive,
99  * this module may delay invalidation or protection-reduction
100 * operations until such time as they are actually
101 * necessary. This module is given full information as
102 * to which processors are currently using which maps,
103 * and to when physical maps must be made correct.
104 */
105
106 #include "opt_msgbuf.h"
107 #include "opt_kstack_pages.h"
108
109 #include <sys/param.h>
110 #include <sys/systm.h>
111 #include <sys/kernel.h>
112 #include <sys/lock.h>
113 #include <sys/mman.h>
114 #include <sys/msgbuf.h>
115 #include <sys/mutex.h>
116 #include <sys/proc.h>
117 #include <sys/sx.h>
118 #include <sys/user.h>
119 #include <sys/vmmeter.h>
120 #include <sys/sysctl.h>
121 #ifdef SMP
122 #include <sys/smp.h>
123 #endif
124
125 #include <vm/vm.h>
126 #include <vm/vm_param.h>
127 #include <vm/vm_kern.h>
128 #include <vm/vm_page.h>
129 #include <vm/vm_map.h>
130 #include <vm/vm_object.h>
131 #include <vm/vm_extern.h>
132 #include <vm/vm_pageout.h>
133 #include <vm/vm_pager.h>
134 #include <vm/uma.h>
135
136 #include <machine/cpu.h>
137 #include <machine/cputypes.h>
138 #include <machine/md_var.h>
139 #include <machine/specialreg.h>
140 #ifdef SMP
141 #include <machine/smp.h>
142 #endif
143
144 #define PMAP_KEEP_PDIRS
145 #ifndef PMAP_SHPGPERPROC
146 #define PMAP_SHPGPERPROC 200
147 #endif
148
149 #if defined(DIAGNOSTIC)
150 #define PMAP_DIAGNOSTIC
151 #endif
152
153 #define MINPV 2048
154
155 #if !defined(PMAP_DIAGNOSTIC)
156 #define PMAP_INLINE __inline
157 #else
158 #define PMAP_INLINE
159 #endif
160
161 struct pmap kernel_pmap_store;
162 LIST_HEAD(pmaplist, pmap);
163 static struct pmaplist allpmaps;
164 static struct mtx allpmaps_lock;
165 #ifdef LAZY_SWITCH
166 #ifdef SMP
167 static struct mtx lazypmap_lock;
168 #endif
169 #endif
170
171 vm_paddr_t avail_start; /* PA of first available physical page */
172 vm_paddr_t avail_end; /* PA of last available physical page */
173 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
174 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
175 static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */
176
177 static int nkpt;
178 static int ndmpdp;
179 static vm_paddr_t dmaplimit;
180 vm_offset_t kernel_vm_end;
181
182 static u_int64_t KPTphys; /* phys addr of kernel level 1 */
183 static u_int64_t KPDphys; /* phys addr of kernel level 2 */
184 static u_int64_t KPDPphys; /* phys addr of kernel level 3 */
185 u_int64_t KPML4phys; /* phys addr of kernel level 4 */
186
187 static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */
188 static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */
189
190 /*
191 * Data for the pv entry allocation mechanism
192 */
193 static uma_zone_t pvzone;
194 static struct vm_object pvzone_obj;
195 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
196 int pmap_pagedaemon_waken;
197
198 /*
199 * All those kernel PT submaps that BSD is so fond of
200 */
201 pt_entry_t *CMAP1 = 0;
202 static pt_entry_t *ptmmap;
203 caddr_t CADDR1 = 0, ptvmmap = 0;
204 static pt_entry_t *msgbufmap;
205 struct msgbuf *msgbufp = 0;
206
207 /*
208 * Crashdump maps.
209 */
210 static pt_entry_t *pt_crashdumpmap;
211 static caddr_t crashdumpmap;
212
213 static PMAP_INLINE void free_pv_entry(pv_entry_t pv);
214 static pv_entry_t get_pv_entry(void);
215 static void pmap_clear_ptes(vm_page_t m, int bit)
216 __always_inline;
217
218 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva);
219 static void pmap_remove_page(struct pmap *pmap, vm_offset_t va);
220 static int pmap_remove_entry(struct pmap *pmap, vm_page_t m,
221 vm_offset_t va);
222 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va,
223 vm_page_t mpte, vm_page_t m);
224
225 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va);
226
227 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex);
228 static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t);
229 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
230 static void *pmap_pv_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
231
232 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
233 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
234
235 /*
236 * Move the kernel virtual free pointer to the next
237 * 2MB. This is used to help improve performance
238 * by using a large (2MB) page for much of the kernel
239 * (.text, .data, .bss)
240 */
241 static vm_offset_t
242 pmap_kmem_choose(vm_offset_t addr)
243 {
244 vm_offset_t newaddr = addr;
245
246 newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
247 return newaddr;
248 }
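
/*
 * Illustrative example of the rounding above (the address is made up):
 * with NBPDR = 2MB (0x200000), an input of 0xffffffff8043a000 is rounded
 * up to the next 2MB boundary, 0xffffffff80600000, so that dynamically
 * allocated KVA does not share a 2MB page with the statically mapped
 * kernel image.
 */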
249
250 /********************/
251 /* Inline functions */
252 /********************/
253
254 /* Return a non-clipped PD index for a given VA */
255 static __inline vm_pindex_t
256 pmap_pde_pindex(vm_offset_t va)
257 {
258 return va >> PDRSHIFT;
259 }
260
261
262 /* Return various clipped indexes for a given VA */
263 static __inline vm_pindex_t
264 pmap_pte_index(vm_offset_t va)
265 {
266
267 return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
268 }
269
270 static __inline vm_pindex_t
271 pmap_pde_index(vm_offset_t va)
272 {
273
274 return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
275 }
276
277 static __inline vm_pindex_t
278 pmap_pdpe_index(vm_offset_t va)
279 {
280
281 return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
282 }
283
284 static __inline vm_pindex_t
285 pmap_pml4e_index(vm_offset_t va)
286 {
287
288 return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
289 }
290
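/*
 * Worked example of the index extraction above (the address is purely
 * illustrative): for va = 0x00007f7fffe03010,
 *	pml4 index = (va >> PML4SHIFT) & 0x1ff = 254
 *	pdp  index = (va >> PDPSHIFT)  & 0x1ff = 511
 *	pd   index = (va >> PDRSHIFT)  & 0x1ff = 511
 *	pt   index = (va >> PAGE_SHIFT) & 0x1ff = 3
 * and the remaining low 12 bits (0x010) are the offset within the page.
 */
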
291 /* Return a pointer to the PML4 slot that corresponds to a VA */
292 static __inline pml4_entry_t *
293 pmap_pml4e(pmap_t pmap, vm_offset_t va)
294 {
295
296 if (!pmap)
297 return NULL;
298 return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
299 }
300
301 /* Return a pointer to the PDP slot that corresponds to a VA */
302 static __inline pdp_entry_t *
303 pmap_pdpe(pmap_t pmap, vm_offset_t va)
304 {
305 pml4_entry_t *pml4e;
306 pdp_entry_t *pdpe;
307
308 pml4e = pmap_pml4e(pmap, va);
309 if (pml4e == NULL || (*pml4e & PG_V) == 0)
310 return NULL;
311 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
312 return (&pdpe[pmap_pdpe_index(va)]);
313 }
314
315 /* Return a pointer to the PD slot that corresponds to a VA */
316 static __inline pd_entry_t *
317 pmap_pde(pmap_t pmap, vm_offset_t va)
318 {
319 pdp_entry_t *pdpe;
320 pd_entry_t *pde;
321
322 pdpe = pmap_pdpe(pmap, va);
323 if (pdpe == NULL || (*pdpe & PG_V) == 0)
324 return NULL;
325 pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
326 return (&pde[pmap_pde_index(va)]);
327 }
328
329 /* Return a pointer to the PT slot that corresponds to a VA */
330 static __inline pt_entry_t *
331 pmap_pte(pmap_t pmap, vm_offset_t va)
332 {
333 pd_entry_t *pde;
334 pt_entry_t *pte;
335
336 pde = pmap_pde(pmap, va);
337 if (pde == NULL || (*pde & PG_V) == 0)
338 return NULL;
339 if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */
340 return ((pt_entry_t *)pde);
341 pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
342 return (&pte[pmap_pte_index(va)]);
343 }
344
345
346 PMAP_INLINE pt_entry_t *
347 vtopte(vm_offset_t va)
348 {
349 u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
350
351 return (PTmap + (amd64_btop(va) & mask));
352 }
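
/*
 * Note on the arithmetic above: the mask keeps the low
 * NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT (4 * 9 = 36)
 * bits of the page number, so vtopte() selects one of the PTE slots
 * reachable through the recursive PML4PML4I self-mapping starting at
 * PTmap.  Unlike pmap_pte(), this shortcut is only meaningful for the
 * currently loaded address space.
 */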
353
354 static u_int64_t
355 allocpages(int n)
356 {
357 u_int64_t ret;
358
359 ret = avail_start;
360 bzero((void *)ret, n * PAGE_SIZE);
361 avail_start += n * PAGE_SIZE;
362 return (ret);
363 }
364
365 static void
366 create_pagetables(void)
367 {
368 int i;
369
370 /* Allocate pages */
371 KPTphys = allocpages(NKPT);
372 KPML4phys = allocpages(1);
373 KPDPphys = allocpages(NKPML4E);
374 KPDphys = allocpages(NKPDPE);
375
376 ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
377 if (ndmpdp < 4) /* Minimum 4GB of dirmap */
378 ndmpdp = 4;
379 DMPDPphys = allocpages(NDMPML4E);
380 DMPDphys = allocpages(ndmpdp);
381 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
382
383 /* Fill in the underlying page table pages */
384 /* Read-only from zero to physfree */
385 /* XXX not fully used, underneath 2M pages */
386 for (i = 0; (i << PAGE_SHIFT) < avail_start; i++) {
387 ((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
388 ((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V;
389 }
390
391 /* Now map the page tables at their location within PTmap */
392 for (i = 0; i < NKPT; i++) {
393 ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
394 ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
395 }
396
397 /* Map from zero to end of allocations under 2M pages */
398 /* This replaces some of the KPTphys entries above */
399 for (i = 0; (i << PDRSHIFT) < avail_start; i++) {
400 ((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
401 ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS;
402 }
403
404 /* And connect up the PD to the PDP */
405 for (i = 0; i < NKPDPE; i++) {
406 ((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys + (i << PAGE_SHIFT);
407 ((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
408 }
409
410
411 /* Now set up the direct map space using 2MB pages */
412 for (i = 0; i < NPDEPG * ndmpdp; i++) {
413 ((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT;
414 ((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS;
415 }
416
417 /* And the direct map space's PDP */
418 for (i = 0; i < ndmpdp; i++) {
419 ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (i << PAGE_SHIFT);
420 ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
421 }
422
423 /* And recursively map PML4 to itself in order to get PTmap */
424 ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
425 ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
426
427 /* Connect the Direct Map slot up to the PML4 */
428 ((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys;
429 ((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U;
430
431 /* Connect the KVA slot up to the PML4 */
432 ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
433 ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
434 }
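
/*
 * Illustrative sizing for the direct map built above: with 6GB of
 * physical memory, ptoa(Maxmem) = 0x180000000 and
 * ndmpdp = (0x180000000 + NBPDP - 1) >> PDPSHIFT = 6, so six PDP
 * entries (6GB) worth of 2MB mappings are created; machines with less
 * than 4GB still get the 4GB minimum enforced above.
 */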
435
436 /*
437 * Bootstrap the system enough to run with virtual memory.
438 *
439 * On amd64 this is called after mapping has already been enabled
440 * and just syncs the pmap module with what has already been done.
441 * [We can't call it easily with mapping off since the kernel is not
442 * mapped with PA == VA, hence we would have to relocate every address
443 * from the linked base (virtual) address "KERNBASE" to the actual
444 * (physical) address starting relative to 0]
445 */
446 void
447 pmap_bootstrap(firstaddr)
448 vm_paddr_t *firstaddr;
449 {
450 vm_offset_t va;
451 pt_entry_t *pte;
452
453 avail_start = *firstaddr;
454
455 /*
456 * Create an initial set of page tables to run the kernel in.
457 */
458 create_pagetables();
459 *firstaddr = avail_start;
460
461 virtual_avail = (vm_offset_t) KERNBASE + avail_start;
462 virtual_avail = pmap_kmem_choose(virtual_avail);
463
464 virtual_end = VM_MAX_KERNEL_ADDRESS;
465
466
467 /* XXX do %cr0 as well */
468 load_cr4(rcr4() | CR4_PGE | CR4_PSE);
469 load_cr3(KPML4phys);
470
471 /*
472 * Initialize the kernel pmap (which is statically allocated).
473 */
474 kernel_pmap->pm_pml4 = (pdp_entry_t *) (KERNBASE + KPML4phys);
475 kernel_pmap->pm_active = -1; /* don't allow deactivation */
476 TAILQ_INIT(&kernel_pmap->pm_pvlist);
477 LIST_INIT(&allpmaps);
478 #ifdef LAZY_SWITCH
479 #ifdef SMP
480 mtx_init(&lazypmap_lock, "lazypmap", NULL, MTX_SPIN);
481 #endif
482 #endif
483 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
484 mtx_lock_spin(&allpmaps_lock);
485 LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
486 mtx_unlock_spin(&allpmaps_lock);
487 nkpt = NKPT;
488
489 /*
490 * Reserve some special page table entries/VA space for temporary
491 * mapping of pages.
492 */
493 #define SYSMAP(c, p, v, n) \
494 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
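
/*
 * For reference, SYSMAP(caddr_t, CMAP1, CADDR1, 1) below expands to
 * roughly
 *	CADDR1 = (caddr_t)va; va += 1 * PAGE_SIZE; CMAP1 = pte; pte += 1;
 * i.e. it hands out a page of KVA and records the PTE that will map it,
 * without installing a mapping yet.
 */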
495
496 va = virtual_avail;
497 pte = vtopte(va);
498
499 /*
500 * CMAP1 is only used for the memory test.
501 */
502 SYSMAP(caddr_t, CMAP1, CADDR1, 1)
503
504 /*
505 * Crashdump maps.
506 */
507 SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);
508
509 /*
510 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
511 * XXX ptmmap is not used.
512 */
513 SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
514
515 /*
516 * msgbufp is used to map the system message buffer.
517 * XXX msgbufmap is not used.
518 */
519 SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
520 atop(round_page(MSGBUF_SIZE)))
521
522 virtual_avail = va;
523
524 *CMAP1 = 0;
525
526 invltlb();
527 }
528
529 static void *
530 pmap_pv_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
531 {
532 *flags = UMA_SLAB_PRIV;
533 return (void *)kmem_alloc(kernel_map, bytes);
534 }
535
536 /*
537 * Initialize the pmap module.
538 * Called by vm_init, to initialize any structures that the pmap
539 * system needs to map virtual memory.
540  *	pmap_init has been enhanced to support, in a fairly consistent
541  *	way, discontiguous physical memory.
542 */
543 void
544 pmap_init(phys_start, phys_end)
545 vm_paddr_t phys_start, phys_end;
546 {
547 int i;
548 int initial_pvs;
549
550 /*
551 * Allocate memory for random pmap data structures. Includes the
552 * pv_head_table.
553 */
554
555 for(i = 0; i < vm_page_array_size; i++) {
556 vm_page_t m;
557
558 m = &vm_page_array[i];
559 TAILQ_INIT(&m->md.pv_list);
560 m->md.pv_list_count = 0;
561 }
562
563 /*
564 * init the pv free list
565 */
566 initial_pvs = vm_page_array_size;
567 if (initial_pvs < MINPV)
568 initial_pvs = MINPV;
569 pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL,
570 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
571 uma_zone_set_allocf(pvzone, pmap_pv_allocf);
572 uma_prealloc(pvzone, initial_pvs);
573
574 /*
575 * Now it is safe to enable pv_table recording.
576 */
577 pmap_initialized = TRUE;
578 }
579
580 /*
581 * Initialize the address space (zone) for the pv_entries. Set a
582 * high water mark so that the system can recover from excessive
583 * numbers of pv entries.
584 */
585 void
586 pmap_init2()
587 {
588 int shpgperproc = PMAP_SHPGPERPROC;
589
590 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
591 pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
592 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
593 pv_entry_high_water = 9 * (pv_entry_max / 10);
594 uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
595 }
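
/*
 * Example of the sizing above (numbers are illustrative, not measured
 * defaults): with PMAP_SHPGPERPROC = 200 and maxproc = 1000,
 * pv_entry_max is 200000 plus vm_page_array_size, and
 * pv_entry_high_water is 90% of that; get_pv_entry() wakes the
 * pagedaemon once the entry count passes the high-water mark.
 */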
596
597
598 /***************************************************
599 * Low level helper routines.....
600 ***************************************************/
601
602 #if defined(PMAP_DIAGNOSTIC)
603
604 /*
605  * This code checks for pages that are modified but not writable.
606  * That combination should never occur.
607 */
608 static int
609 pmap_nw_modified(pt_entry_t ptea)
610 {
611 int pte;
612
613 pte = (int) ptea;
614
615 if ((pte & (PG_M|PG_RW)) == PG_M)
616 return 1;
617 else
618 return 0;
619 }
620 #endif
621
622
623 /*
624 * this routine defines the region(s) of memory that should
625 * not be tested for the modified bit.
626 */
627 static PMAP_INLINE int
628 pmap_track_modified(vm_offset_t va)
629 {
630 if ((va < kmi.clean_sva) || (va >= kmi.clean_eva))
631 return 1;
632 else
633 return 0;
634 }
635
636 #ifdef SMP
637 /*
638 * For SMP, these functions have to use the IPI mechanism for coherence.
639 */
640 void
641 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
642 {
643 u_int cpumask;
644 u_int other_cpus;
645
646 if (smp_started) {
647 if (!(read_rflags() & PSL_I))
648 panic("%s: interrupts disabled", __func__);
649 mtx_lock_spin(&smp_tlb_mtx);
650 } else
651 critical_enter();
652 /*
653 * We need to disable interrupt preemption but MUST NOT have
654 * interrupts disabled here.
655 * XXX we may need to hold schedlock to get a coherent pm_active
656 * XXX critical sections disable interrupts again
657 */
658 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
659 invlpg(va);
660 smp_invlpg(va);
661 } else {
662 cpumask = PCPU_GET(cpumask);
663 other_cpus = PCPU_GET(other_cpus);
664 if (pmap->pm_active & cpumask)
665 invlpg(va);
666 if (pmap->pm_active & other_cpus)
667 smp_masked_invlpg(pmap->pm_active & other_cpus, va);
668 }
669 if (smp_started)
670 mtx_unlock_spin(&smp_tlb_mtx);
671 else
672 critical_exit();
673 }
674
675 void
676 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
677 {
678 u_int cpumask;
679 u_int other_cpus;
680 vm_offset_t addr;
681
682 if (smp_started) {
683 if (!(read_rflags() & PSL_I))
684 panic("%s: interrupts disabled", __func__);
685 mtx_lock_spin(&smp_tlb_mtx);
686 } else
687 critical_enter();
688 /*
689 * We need to disable interrupt preemption but MUST NOT have
690 * interrupts disabled here.
691 * XXX we may need to hold schedlock to get a coherent pm_active
692 * XXX critical sections disable interrupts again
693 */
694 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
695 for (addr = sva; addr < eva; addr += PAGE_SIZE)
696 invlpg(addr);
697 smp_invlpg_range(sva, eva);
698 } else {
699 cpumask = PCPU_GET(cpumask);
700 other_cpus = PCPU_GET(other_cpus);
701 if (pmap->pm_active & cpumask)
702 for (addr = sva; addr < eva; addr += PAGE_SIZE)
703 invlpg(addr);
704 if (pmap->pm_active & other_cpus)
705 smp_masked_invlpg_range(pmap->pm_active & other_cpus,
706 sva, eva);
707 }
708 if (smp_started)
709 mtx_unlock_spin(&smp_tlb_mtx);
710 else
711 critical_exit();
712 }
713
714 void
715 pmap_invalidate_all(pmap_t pmap)
716 {
717 u_int cpumask;
718 u_int other_cpus;
719
720 if (smp_started) {
721 if (!(read_rflags() & PSL_I))
722 panic("%s: interrupts disabled", __func__);
723 mtx_lock_spin(&smp_tlb_mtx);
724 } else
725 critical_enter();
726 /*
727 * We need to disable interrupt preemption but MUST NOT have
728 * interrupts disabled here.
729 * XXX we may need to hold schedlock to get a coherent pm_active
730 * XXX critical sections disable interrupts again
731 */
732 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
733 invltlb();
734 smp_invltlb();
735 } else {
736 cpumask = PCPU_GET(cpumask);
737 other_cpus = PCPU_GET(other_cpus);
738 if (pmap->pm_active & cpumask)
739 invltlb();
740 if (pmap->pm_active & other_cpus)
741 smp_masked_invltlb(pmap->pm_active & other_cpus);
742 }
743 if (smp_started)
744 mtx_unlock_spin(&smp_tlb_mtx);
745 else
746 critical_exit();
747 }
748 #else /* !SMP */
749 /*
750 * Normal, non-SMP, invalidation functions.
751 * We inline these within pmap.c for speed.
752 */
753 PMAP_INLINE void
754 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
755 {
756
757 if (pmap == kernel_pmap || pmap->pm_active)
758 invlpg(va);
759 }
760
761 PMAP_INLINE void
762 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
763 {
764 vm_offset_t addr;
765
766 if (pmap == kernel_pmap || pmap->pm_active)
767 for (addr = sva; addr < eva; addr += PAGE_SIZE)
768 invlpg(addr);
769 }
770
771 PMAP_INLINE void
772 pmap_invalidate_all(pmap_t pmap)
773 {
774
775 if (pmap == kernel_pmap || pmap->pm_active)
776 invltlb();
777 }
778 #endif /* !SMP */
779
780 /*
781 * Are we current address space or kernel?
782 */
783 static __inline int
784 pmap_is_current(pmap_t pmap)
785 {
786 return (pmap == kernel_pmap ||
787 (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME));
788 }
789
790 /*
791 * Routine: pmap_extract
792 * Function:
793 * Extract the physical page address associated
794 * with the given map/virtual_address pair.
795 */
796 vm_paddr_t
797 pmap_extract(pmap, va)
798 register pmap_t pmap;
799 vm_offset_t va;
800 {
801 vm_paddr_t rtval;
802 pt_entry_t *pte;
803 pd_entry_t pde, *pdep;
804
805 if (pmap == 0)
806 return 0;
807 pdep = pmap_pde(pmap, va);
808 if (pdep) {
809 pde = *pdep;
810 if (pde) {
811 if ((pde & PG_PS) != 0) {
812 rtval = (pde & ~PDRMASK) | (va & PDRMASK);
813 return rtval;
814 }
815 pte = pmap_pte(pmap, va);
816 rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK));
817 return rtval;
818 }
819 }
820 return 0;
821
822 }
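
/*
 * Example of the 2MB-page case above (illustrative values): for a PDE
 * of 0x40000000 | PG_PS | PG_V and a va whose low 21 bits are 0x123456,
 * (pde & ~PDRMASK) | (va & PDRMASK) = 0x40000000 | 0x123456 =
 * 0x40123456, since PDRMASK covers the low 21 bits of a 2MB mapping.
 */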
823
824 /*
825 * Routine: pmap_extract_and_hold
826 * Function:
827 * Atomically extract and hold the physical page
828 * with the given pmap and virtual address pair
829 * if that mapping permits the given protection.
830 */
831 vm_page_t
832 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
833 {
834 vm_paddr_t pa;
835 vm_page_t m;
836
837 m = NULL;
838 mtx_lock(&Giant);
839 if ((pa = pmap_extract(pmap, va)) != 0) {
840 m = PHYS_TO_VM_PAGE(pa);
841 vm_page_lock_queues();
842 vm_page_hold(m);
843 vm_page_unlock_queues();
844 }
845 mtx_unlock(&Giant);
846 return (m);
847 }
848
849 vm_paddr_t
850 pmap_kextract(vm_offset_t va)
851 {
852 pd_entry_t *pde;
853 vm_paddr_t pa;
854
855 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
856 pa = DMAP_TO_PHYS(va);
857 } else {
858 pde = pmap_pde(kernel_pmap, va);
859 if (*pde & PG_PS) {
860 pa = (*pde & ~(NBPDR - 1)) | (va & (NBPDR - 1));
861 } else {
862 pa = *vtopte(va);
863 pa = (pa & PG_FRAME) | (va & PAGE_MASK);
864 }
865 }
866 return pa;
867 }
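
/*
 * Sketch of the fast path above: for a direct-map address,
 * pmap_kextract(PHYS_TO_DMAP(pa)) simply returns pa via DMAP_TO_PHYS()
 * with no page-table walk; only KVA outside the direct map falls
 * through to the PDE/PTE lookup.
 */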
868
869 /***************************************************
870 * Low level mapping routines.....
871 ***************************************************/
872
873 /*
874 * Add a wired page to the kva.
875 * Note: not SMP coherent.
876 */
877 PMAP_INLINE void
878 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
879 {
880 pt_entry_t *pte;
881
882 pte = vtopte(va);
883 pte_store(pte, pa | PG_RW | PG_V | PG_G);
884 }
885
886 /*
887 * Remove a page from the kernel pagetables.
888 * Note: not SMP coherent.
889 */
890 PMAP_INLINE void
891 pmap_kremove(vm_offset_t va)
892 {
893 pt_entry_t *pte;
894
895 pte = vtopte(va);
896 pte_clear(pte);
897 }
898
899 /*
900 * Used to map a range of physical addresses into kernel
901 * virtual address space.
902 *
903 * The value passed in '*virt' is a suggested virtual address for
904 * the mapping. Architectures which can support a direct-mapped
905 * physical to virtual region can return the appropriate address
906 * within that region, leaving '*virt' unchanged. Other
907 * architectures should map the pages starting at '*virt' and
908 * update '*virt' with the first usable address after the mapped
909 * region.
910 */
911 vm_offset_t
912 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
913 {
914 return PHYS_TO_DMAP(start);
915 }
916
917
918 /*
919  * Add a list of wired pages to the kva.
920  * This routine is only used for temporary
921 * kernel mappings that do not need to have
922 * page modification or references recorded.
923 * Note that old mappings are simply written
924 * over. The page *must* be wired.
925 * Note: SMP coherent. Uses a ranged shootdown IPI.
926 */
927 void
928 pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
929 {
930 vm_offset_t va;
931
932 va = sva;
933 while (count-- > 0) {
934 pmap_kenter(va, VM_PAGE_TO_PHYS(*m));
935 va += PAGE_SIZE;
936 m++;
937 }
938 pmap_invalidate_range(kernel_pmap, sva, va);
939 }
940
941 /*
942 * This routine tears out page mappings from the
943 * kernel -- it is meant only for temporary mappings.
944 * Note: SMP coherent. Uses a ranged shootdown IPI.
945 */
946 void
947 pmap_qremove(vm_offset_t sva, int count)
948 {
949 vm_offset_t va;
950
951 va = sva;
952 while (count-- > 0) {
953 pmap_kremove(va);
954 va += PAGE_SIZE;
955 }
956 pmap_invalidate_range(kernel_pmap, sva, va);
957 }
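
/*
 * Typical (hypothetical) caller pattern for the two routines above:
 *	pmap_qenter(kva, pages, npages);	-- map pages into a KVA window
 *	... access the pages through kva ...
 *	pmap_qremove(kva, npages);		-- tear the window back down
 * Both ends perform a ranged shootdown, so the window is SMP coherent.
 */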
958
959 /***************************************************
960 * Page table page management routines.....
961 ***************************************************/
962
963 /*
964 * This routine unholds page table pages, and if the hold count
965 * drops to zero, then it decrements the wire count.
966 */
967 static int
968 _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m)
969 {
970
971 while (vm_page_sleep_if_busy(m, FALSE, "pmuwpt"))
972 vm_page_lock_queues();
973
974 if (m->hold_count == 0) {
975 vm_offset_t pteva;
976
977 /*
978 * unmap the page table page
979 */
980 if (m->pindex >= (NUPDE + NUPDPE)) {
981 /* PDP page */
982 pml4_entry_t *pml4;
983 pml4 = pmap_pml4e(pmap, va);
984 pteva = (vm_offset_t) PDPmap + amd64_ptob(m->pindex - (NUPDE + NUPDPE));
985 *pml4 = 0;
986 } else if (m->pindex >= NUPDE) {
987 /* PD page */
988 pdp_entry_t *pdp;
989 pdp = pmap_pdpe(pmap, va);
990 pteva = (vm_offset_t) PDmap + amd64_ptob(m->pindex - NUPDE);
991 *pdp = 0;
992 } else {
993 /* PTE page */
994 pd_entry_t *pd;
995 pd = pmap_pde(pmap, va);
996 pteva = (vm_offset_t) PTmap + amd64_ptob(m->pindex);
997 *pd = 0;
998 }
999 --pmap->pm_stats.resident_count;
1000 if (m->pindex < NUPDE) {
1001 /* We just released a PT, unhold the matching PD */
1002 vm_page_t pdpg;
1003
1004 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va));
1005 vm_page_unhold(pdpg);
1006 if (pdpg->hold_count == 0)
1007 _pmap_unwire_pte_hold(pmap, va, pdpg);
1008 }
1009 if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
1010 /* We just released a PD, unhold the matching PDP */
1011 vm_page_t pdppg;
1012
1013 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va));
1014 vm_page_unhold(pdppg);
1015 if (pdppg->hold_count == 0)
1016 _pmap_unwire_pte_hold(pmap, va, pdppg);
1017 }
1018 if (pmap_is_current(pmap)) {
1019 /*
1020 * Do an invltlb to make the invalidated mapping
1021 * take effect immediately.
1022 */
1023 pmap_invalidate_page(pmap, pteva);
1024 }
1025
1026 /*
1027 * If the page is finally unwired, simply free it.
1028 */
1029 --m->wire_count;
1030 if (m->wire_count == 0) {
1031 vm_page_busy(m);
1032 vm_page_free_zero(m);
1033 atomic_subtract_int(&cnt.v_wire_count, 1);
1034 }
1035 return 1;
1036 }
1037 return 0;
1038 }
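
/*
 * The pindex ranges tested above encode which level of page table page
 * is being released: indices below NUPDE are PT pages, indices in
 * [NUPDE, NUPDE + NUPDPE) are PD pages, and NUPDE + NUPDPE and above
 * are PDP pages.  Dropping a lower-level page can bring its parent's
 * hold count to zero, which is why the routine calls itself again.
 */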
1039
1040 static PMAP_INLINE int
1041 pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m)
1042 {
1043 vm_page_unhold(m);
1044 if (m->hold_count == 0)
1045 return _pmap_unwire_pte_hold(pmap, va, m);
1046 else
1047 return 0;
1048 }
1049
1050 /*
1051 * After removing a page table entry, this routine is used to
1052 * conditionally free the page, and manage the hold/wire counts.
1053 */
1054 static int
1055 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
1056 {
1057
1058 if (va >= VM_MAXUSER_ADDRESS)
1059 return 0;
1060
1061 return pmap_unwire_pte_hold(pmap, va, mpte);
1062 }
1063
1064 void
1065 pmap_pinit0(pmap)
1066 struct pmap *pmap;
1067 {
1068
1069 pmap->pm_pml4 = (pml4_entry_t *)(KERNBASE + KPML4phys);
1070 pmap->pm_active = 0;
1071 TAILQ_INIT(&pmap->pm_pvlist);
1072 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1073 mtx_lock_spin(&allpmaps_lock);
1074 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1075 mtx_unlock_spin(&allpmaps_lock);
1076 }
1077
1078 /*
1079 * Initialize a preallocated and zeroed pmap structure,
1080 * such as one in a vmspace structure.
1081 */
1082 void
1083 pmap_pinit(pmap)
1084 register struct pmap *pmap;
1085 {
1086 vm_page_t pml4pg;
1087 static vm_pindex_t color;
1088
1089 /*
1090 * allocate the page directory page
1091 */
1092 while ((pml4pg = vm_page_alloc(NULL, color++, VM_ALLOC_NOOBJ |
1093 VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
1094 VM_WAIT;
1095 vm_page_lock_queues();
1096 vm_page_flag_clear(pml4pg, PG_BUSY);
1097 vm_page_unlock_queues();
1098
1099 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
1100
1101 if ((pml4pg->flags & PG_ZERO) == 0)
1102 bzero(pmap->pm_pml4, PAGE_SIZE);
1103
1104 mtx_lock_spin(&allpmaps_lock);
1105 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1106 mtx_unlock_spin(&allpmaps_lock);
1107
1108 /* Wire in kernel global address entries. */
1109 pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
1110 pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U;
1111
1112 /* install self-referential address mapping entry(s) */
1113 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
1114
1115 pmap->pm_active = 0;
1116 TAILQ_INIT(&pmap->pm_pvlist);
1117 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1118 }
1119
1120 /*
1121 * Wire in kernel global address entries. To avoid a race condition
1122 * between pmap initialization and pmap_growkernel, this procedure
1123 * should be called after the vmspace is attached to the process
1124 * but before this pmap is activated.
1125 */
1126 void
1127 pmap_pinit2(pmap)
1128 struct pmap *pmap;
1129 {
1130 /* XXX: Remove this stub when no longer called */
1131 }
1132
1133 /*
1134 * this routine is called if the page table page is not
1135 * mapped correctly.
1136 *
1137 * Note: If a page allocation fails at page table level two or three,
1138 * one or two pages may be held during the wait, only to be released
1139 * afterwards. This conservative approach is easily argued to avoid
1140 * race conditions.
1141 */
1142 static vm_page_t
1143 _pmap_allocpte(pmap, ptepindex)
1144 pmap_t pmap;
1145 vm_pindex_t ptepindex;
1146 {
1147 vm_page_t m, pdppg, pdpg;
1148
1149 /*
1150 * Allocate a page table page.
1151 */
1152 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1153 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1154 VM_WAIT;
1155 /*
1156 * Indicate the need to retry. While waiting, the page table
1157 * page may have been allocated.
1158 */
1159 return (NULL);
1160 }
1161 if ((m->flags & PG_ZERO) == 0)
1162 pmap_zero_page(m);
1163
1164 KASSERT(m->queue == PQ_NONE,
1165 ("_pmap_allocpte: %p->queue != PQ_NONE", m));
1166
1167 /*
1168 * Increment the hold count for the page table page
1169 * (denoting a new mapping.)
1170 */
1171 m->hold_count++;
1172
1173 /*
1174 * Map the pagetable page into the process address space, if
1175 * it isn't already there.
1176 */
1177
1178 pmap->pm_stats.resident_count++;
1179
1180 if (ptepindex >= (NUPDE + NUPDPE)) {
1181 pml4_entry_t *pml4;
1182 vm_pindex_t pml4index;
1183
1184 /* Wire up a new PDPE page */
1185 pml4index = ptepindex - (NUPDE + NUPDPE);
1186 pml4 = &pmap->pm_pml4[pml4index];
1187 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1188
1189 } else if (ptepindex >= NUPDE) {
1190 vm_pindex_t pml4index;
1191 vm_pindex_t pdpindex;
1192 pml4_entry_t *pml4;
1193 pdp_entry_t *pdp;
1194
1195 /* Wire up a new PDE page */
1196 pdpindex = ptepindex - NUPDE;
1197 pml4index = pdpindex >> NPML4EPGSHIFT;
1198
1199 pml4 = &pmap->pm_pml4[pml4index];
1200 if ((*pml4 & PG_V) == 0) {
1201 /* Have to allocate a new pdp, recurse */
1202 if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index) == NULL) {
1203 vm_page_lock_queues();
1204 vm_page_unhold(m);
1205 vm_page_free(m);
1206 vm_page_unlock_queues();
1207 return (NULL);
1208 }
1209 } else {
1210 /* Add reference to pdp page */
1211 pdppg = PHYS_TO_VM_PAGE(*pml4);
1212 pdppg->hold_count++;
1213 }
1214 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1215
1216 /* Now find the pdp page */
1217 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1218 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1219
1220 } else {
1221 vm_pindex_t pml4index;
1222 vm_pindex_t pdpindex;
1223 pml4_entry_t *pml4;
1224 pdp_entry_t *pdp;
1225 pd_entry_t *pd;
1226
1227 /* Wire up a new PTE page */
1228 pdpindex = ptepindex >> NPDPEPGSHIFT;
1229 pml4index = pdpindex >> NPML4EPGSHIFT;
1230
1231 /* First, find the pdp and check that it's valid. */
1232 pml4 = &pmap->pm_pml4[pml4index];
1233 if ((*pml4 & PG_V) == 0) {
1234 /* Have to allocate a new pd, recurse */
1235 if (_pmap_allocpte(pmap, NUPDE + pdpindex) == NULL) {
1236 vm_page_lock_queues();
1237 vm_page_unhold(m);
1238 vm_page_free(m);
1239 vm_page_unlock_queues();
1240 return (NULL);
1241 }
1242 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1243 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1244 } else {
1245 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1246 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1247 if ((*pdp & PG_V) == 0) {
1248 /* Have to allocate a new pd, recurse */
1249 if (_pmap_allocpte(pmap, NUPDE + pdpindex) == NULL) {
1250 vm_page_lock_queues();
1251 vm_page_unhold(m);
1252 vm_page_free(m);
1253 vm_page_unlock_queues();
1254 return (NULL);
1255 }
1256 } else {
1257 /* Add reference to the pd page */
1258 pdpg = PHYS_TO_VM_PAGE(*pdp);
1259 pdpg->hold_count++;
1260 }
1261 }
1262 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
1263
1264 /* Now we know where the page directory page is */
1265 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
1266 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1267 }
1268
1269 vm_page_lock_queues();
1270 vm_page_flag_clear(m, PG_ZERO);
1271 vm_page_wakeup(m);
1272 vm_page_unlock_queues();
1273
1274 return m;
1275 }
1276
1277 static vm_page_t
1278 pmap_allocpte(pmap_t pmap, vm_offset_t va)
1279 {
1280 vm_pindex_t ptepindex;
1281 pd_entry_t *pd;
1282 vm_page_t m;
1283
1284 /*
1285 * Calculate pagetable page index
1286 */
1287 ptepindex = pmap_pde_pindex(va);
1288 retry:
1289 /*
1290 * Get the page directory entry
1291 */
1292 pd = pmap_pde(pmap, va);
1293
1294 /*
1295 * This supports switching from a 2MB page to a
1296 * normal 4K page.
1297 */
1298 if (pd != 0 && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
1299 *pd = 0;
1300 pd = 0;
1301 pmap_invalidate_all(kernel_pmap);
1302 }
1303
1304 /*
1305 * If the page table page is mapped, we just increment the
1306 * hold count, and activate it.
1307 */
1308 if (pd != 0 && (*pd & PG_V) != 0) {
1309 m = PHYS_TO_VM_PAGE(*pd);
1310 m->hold_count++;
1311 } else {
1312 /*
1313 * Here if the pte page isn't mapped, or if it has been
1314 * deallocated.
1315 */
1316 m = _pmap_allocpte(pmap, ptepindex);
1317 if (m == NULL)
1318 goto retry;
1319 }
1320 return (m);
1321 }
1322
1323
1324 /***************************************************
1325 * Pmap allocation/deallocation routines.
1326 ***************************************************/
1327
1328 #ifdef LAZY_SWITCH
1329 #ifdef SMP
1330 /*
1331 * Deal with a SMP shootdown of other users of the pmap that we are
1332 * trying to dispose of. This can be a bit hairy.
1333 */
1334 static u_int *lazymask;
1335 static register_t lazyptd;
1336 static volatile u_int lazywait;
1337
1338 void pmap_lazyfix_action(void);
1339
1340 void
1341 pmap_lazyfix_action(void)
1342 {
1343 u_int mymask = PCPU_GET(cpumask);
1344
1345 if (rcr3() == lazyptd)
1346 load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1347 atomic_clear_int(lazymask, mymask);
1348 atomic_store_rel_int(&lazywait, 1);
1349 }
1350
1351 static void
1352 pmap_lazyfix_self(u_int mymask)
1353 {
1354
1355 if (rcr3() == lazyptd)
1356 load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1357 atomic_clear_int(lazymask, mymask);
1358 }
1359
1360
1361 static void
1362 pmap_lazyfix(pmap_t pmap)
1363 {
1364 u_int mymask = PCPU_GET(cpumask);
1365 u_int mask;
1366 register u_int spins;
1367
1368 while ((mask = pmap->pm_active) != 0) {
1369 spins = 50000000;
1370 mask = mask & -mask; /* Find least significant set bit */
1371 mtx_lock_spin(&lazypmap_lock);
1372 lazyptd = vtophys(pmap->pm_pml4);
1373 if (mask == mymask) {
1374 lazymask = &pmap->pm_active;
1375 pmap_lazyfix_self(mymask);
1376 } else {
1377 atomic_store_rel_long((u_long *)&lazymask,
1378 (u_long)&pmap->pm_active);
1379 atomic_store_rel_int(&lazywait, 0);
1380 ipi_selected(mask, IPI_LAZYPMAP);
1381 while (lazywait == 0) {
1382 ia32_pause();
1383 if (--spins == 0)
1384 break;
1385 }
1386 }
1387 mtx_unlock_spin(&lazypmap_lock);
1388 if (spins == 0)
1389 printf("pmap_lazyfix: spun for 50000000\n");
1390 }
1391 }
1392
1393 #else /* SMP */
1394
1395 /*
1396 * Cleaning up on uniprocessor is easy. For various reasons, we're
1397 * unlikely to have to even execute this code, including the fact
1398 * that the cleanup is deferred until the parent does a wait(2), which
1399 * means that another userland process has run.
1400 */
1401 static void
1402 pmap_lazyfix(pmap_t pmap)
1403 {
1404 u_long cr3;
1405
1406 cr3 = vtophys(pmap->pm_pml4);
1407 if (cr3 == rcr3()) {
1408 load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1409 pmap->pm_active &= ~(PCPU_GET(cpumask));
1410 }
1411 }
1412 #endif /* SMP */
1413 #endif
1414
1415 /*
1416 * Release any resources held by the given physical map.
1417 * Called when a pmap initialized by pmap_pinit is being released.
1418 * Should only be called if the map contains no valid mappings.
1419 */
1420 void
1421 pmap_release(pmap_t pmap)
1422 {
1423 vm_page_t m;
1424
1425 KASSERT(pmap->pm_stats.resident_count == 0,
1426 ("pmap_release: pmap resident count %ld != 0",
1427 pmap->pm_stats.resident_count));
1428
1429 #ifdef LAZY_SWITCH
1430 pmap_lazyfix(pmap);
1431 #endif
1432 mtx_lock_spin(&allpmaps_lock);
1433 LIST_REMOVE(pmap, pm_list);
1434 mtx_unlock_spin(&allpmaps_lock);
1435
1436 vm_page_lock_queues();
1437 m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I]);
1438 m->wire_count--;
1439 atomic_subtract_int(&cnt.v_wire_count, 1);
1440 vm_page_busy(m);
1441 vm_page_free(m);
1442 vm_page_unlock_queues();
1443 }
1444
1445 static int
1446 kvm_size(SYSCTL_HANDLER_ARGS)
1447 {
1448 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1449
1450 return sysctl_handle_long(oidp, &ksize, 0, req);
1451 }
1452 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1453 0, 0, kvm_size, "IU", "Size of KVM");
1454
1455 static int
1456 kvm_free(SYSCTL_HANDLER_ARGS)
1457 {
1458 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1459
1460 return sysctl_handle_long(oidp, &kfree, 0, req);
1461 }
1462 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1463 0, 0, kvm_free, "IU", "Amount of KVM free");
1464
1465 /*
1466 * grow the number of kernel page table entries, if needed
1467 */
1468 void
1469 pmap_growkernel(vm_offset_t addr)
1470 {
1471 int s;
1472 vm_paddr_t paddr;
1473 vm_page_t nkpg;
1474 pd_entry_t *pde, newpdir;
1475 pdp_entry_t newpdp;
1476
1477 s = splhigh();
1478 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1479 if (kernel_vm_end == 0) {
1480 kernel_vm_end = KERNBASE;
1481 nkpt = 0;
1482 while ((*pmap_pde(kernel_pmap, kernel_vm_end) & PG_V) != 0) {
1483 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1484 nkpt++;
1485 }
1486 }
1487 addr = roundup2(addr, PAGE_SIZE * NPTEPG);
1488 while (kernel_vm_end < addr) {
1489 pde = pmap_pde(kernel_pmap, kernel_vm_end);
1490 if (pde == NULL) {
1491 /* We need a new PDP entry */
1492 nkpg = vm_page_alloc(NULL, nkpt,
1493 VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1494 if (!nkpg)
1495 panic("pmap_growkernel: no memory to grow kernel");
1496 pmap_zero_page(nkpg);
1497 paddr = VM_PAGE_TO_PHYS(nkpg);
1498 newpdp = (pdp_entry_t)
1499 (paddr | PG_V | PG_RW | PG_A | PG_M);
1500 *pmap_pdpe(kernel_pmap, kernel_vm_end) = newpdp;
1501 continue; /* try again */
1502 }
1503 if ((*pde & PG_V) != 0) {
1504 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1505 continue;
1506 }
1507
1508 /*
1509 * This index is bogus, but out of the way
1510 */
1511 nkpg = vm_page_alloc(NULL, nkpt,
1512 VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1513 if (!nkpg)
1514 panic("pmap_growkernel: no memory to grow kernel");
1515
1516 nkpt++;
1517
1518 pmap_zero_page(nkpg);
1519 paddr = VM_PAGE_TO_PHYS(nkpg);
1520 newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M);
1521 *pmap_pde(kernel_pmap, kernel_vm_end) = newpdir;
1522
1523 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1524 }
1525 splx(s);
1526 }
1527
1528
1529 /***************************************************
1530 * page management routines.
1531 ***************************************************/
1532
1533 /*
1534 * free the pv_entry back to the free list
1535 */
1536 static PMAP_INLINE void
1537 free_pv_entry(pv_entry_t pv)
1538 {
1539 pv_entry_count--;
1540 uma_zfree(pvzone, pv);
1541 }
1542
1543 /*
1544 * get a new pv_entry, allocating a block from the system
1545 * when needed.
1546 * the memory allocation is performed bypassing the malloc code
1547 * because of the possibility of allocations at interrupt time.
1548 */
1549 static pv_entry_t
1550 get_pv_entry(void)
1551 {
1552 pv_entry_count++;
1553 if (pv_entry_high_water &&
1554 (pv_entry_count > pv_entry_high_water) &&
1555 (pmap_pagedaemon_waken == 0)) {
1556 pmap_pagedaemon_waken = 1;
1557 wakeup (&vm_pages_needed);
1558 }
1559 return uma_zalloc(pvzone, M_NOWAIT);
1560 }
1561
1562 /*
1563 * If it is the first entry on the list, it is actually
1564 * in the header and we must copy the following entry up
1565 * to the header. Otherwise we must search the list for
1566 * the entry. In either case we free the now unused entry.
1567 */
1568
1569 static int
1570 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1571 {
1572 pv_entry_t pv;
1573 int rtval;
1574 int s;
1575
1576 s = splvm();
1577 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1578 if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1579 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1580 if (pmap == pv->pv_pmap && va == pv->pv_va)
1581 break;
1582 }
1583 } else {
1584 TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
1585 if (va == pv->pv_va)
1586 break;
1587 }
1588 }
1589
1590 rtval = 0;
1591 if (pv) {
1592 rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem);
1593 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1594 m->md.pv_list_count--;
1595 if (TAILQ_FIRST(&m->md.pv_list) == NULL)
1596 vm_page_flag_clear(m, PG_WRITEABLE);
1597
1598 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1599 free_pv_entry(pv);
1600 }
1601
1602 splx(s);
1603 return rtval;
1604 }
1605
1606 /*
1607 * Create a pv entry for page at pa for
1608 * (pmap, va).
1609 */
1610 static void
1611 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m)
1612 {
1613
1614 int s;
1615 pv_entry_t pv;
1616
1617 s = splvm();
1618 pv = get_pv_entry();
1619 pv->pv_va = va;
1620 pv->pv_pmap = pmap;
1621 pv->pv_ptem = mpte;
1622
1623 vm_page_lock_queues();
1624 TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1625 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1626 m->md.pv_list_count++;
1627
1628 vm_page_unlock_queues();
1629 splx(s);
1630 }
1631
1632 /*
1633 * pmap_remove_pte: do the things to unmap a page in a process
1634 */
1635 static int
1636 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va)
1637 {
1638 pt_entry_t oldpte;
1639 vm_page_t m, mpte;
1640
1641 oldpte = pte_load_clear(ptq);
1642 if (oldpte & PG_W)
1643 pmap->pm_stats.wired_count -= 1;
1644 /*
1645  * Machines that don't support invlpg also don't support
1646 * PG_G.
1647 */
1648 if (oldpte & PG_G)
1649 pmap_invalidate_page(kernel_pmap, va);
1650 pmap->pm_stats.resident_count -= 1;
1651 if (oldpte & PG_MANAGED) {
1652 m = PHYS_TO_VM_PAGE(oldpte);
1653 if (oldpte & PG_M) {
1654 #if defined(PMAP_DIAGNOSTIC)
1655 if (pmap_nw_modified((pt_entry_t) oldpte)) {
1656 printf(
1657 "pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n",
1658 va, oldpte);
1659 }
1660 #endif
1661 if (pmap_track_modified(va))
1662 vm_page_dirty(m);
1663 }
1664 if (oldpte & PG_A)
1665 vm_page_flag_set(m, PG_REFERENCED);
1666 return pmap_remove_entry(pmap, m, va);
1667 } else {
1668 mpte = PHYS_TO_VM_PAGE(*pmap_pde(pmap, va));
1669 return pmap_unuse_pt(pmap, va, mpte);
1670 }
1671 }
1672
1673 /*
1674 * Remove a single page from a process address space
1675 */
1676 static void
1677 pmap_remove_page(pmap_t pmap, vm_offset_t va)
1678 {
1679 pt_entry_t *pte;
1680
1681 pte = pmap_pte(pmap, va);
1682 if (pte == NULL || (*pte & PG_V) == 0)
1683 return;
1684 pmap_remove_pte(pmap, pte, va);
1685 pmap_invalidate_page(pmap, va);
1686 }
1687
1688 /*
1689 * Remove the given range of addresses from the specified map.
1690 *
1691 * It is assumed that the start and end are properly
1692 * rounded to the page size.
1693 */
1694 void
1695 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1696 {
1697 vm_offset_t pdnxt;
1698 pd_entry_t ptpaddr, *pde;
1699 pt_entry_t *pte;
1700 int anyvalid;
1701
1702 if (pmap == NULL)
1703 return;
1704
1705 if (pmap->pm_stats.resident_count == 0)
1706 return;
1707
1708 /*
1709  * Special handling for removing a single page: this is a very
1710  * common operation, and it is easy to short-circuit some code
1711  * for it.
1712 */
1713 if (sva + PAGE_SIZE == eva) {
1714 pde = pmap_pde(pmap, sva);
1715 if (pde && (*pde & PG_PS) == 0) {
1716 pmap_remove_page(pmap, sva);
1717 return;
1718 }
1719 }
1720
1721 anyvalid = 0;
1722
1723 for (; sva < eva; sva = pdnxt) {
1724
1725 if (pmap->pm_stats.resident_count == 0)
1726 break;
1727
1728 /*
1729 * Calculate index for next page table.
1730 */
1731 pdnxt = (sva + NBPDR) & ~PDRMASK;
1732
1733 pde = pmap_pde(pmap, sva);
1734 if (pde == 0)
1735 continue;
1736 ptpaddr = *pde;
1737
1738 /*
1739 * Weed out invalid mappings. Note: we assume that the page
1740 * directory table is always allocated, and in kernel virtual.
1741 */
1742 if (ptpaddr == 0)
1743 continue;
1744
1745 /*
1746 * Check for large page.
1747 */
1748 if ((ptpaddr & PG_PS) != 0) {
1749 *pde = 0;
1750 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1751 anyvalid = 1;
1752 continue;
1753 }
1754
1755 /*
1756 * Limit our scan to either the end of the va represented
1757 * by the current page table page, or to the end of the
1758 * range being removed.
1759 */
1760 if (pdnxt > eva)
1761 pdnxt = eva;
1762
1763 for (; sva != pdnxt; sva += PAGE_SIZE) {
1764 pte = pmap_pte(pmap, sva);
1765 if (pte == NULL || *pte == 0)
1766 continue;
1767 anyvalid = 1;
1768 if (pmap_remove_pte(pmap, pte, sva))
1769 break;
1770 }
1771 }
1772
1773 if (anyvalid)
1774 pmap_invalidate_all(pmap);
1775 }
1776
1777 /*
1778 * Routine: pmap_remove_all
1779 * Function:
1780 * Removes this physical page from
1781 * all physical maps in which it resides.
1782 * Reflects back modify bits to the pager.
1783 *
1784 * Notes:
1785 * Original versions of this routine were very
1786 * inefficient because they iteratively called
1787 * pmap_remove (slow...)
1788 */
1789
1790 void
1791 pmap_remove_all(vm_page_t m)
1792 {
1793 register pv_entry_t pv;
1794 pt_entry_t *pte, tpte;
1795 int s;
1796
1797 #if defined(PMAP_DIAGNOSTIC)
1798 /*
1799 * XXX This makes pmap_remove_all() illegal for non-managed pages!
1800 */
1801 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
1802 panic("pmap_remove_all: illegal for unmanaged page, va: 0x%x",
1803 VM_PAGE_TO_PHYS(m));
1804 }
1805 #endif
1806 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1807 s = splvm();
1808 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1809 pv->pv_pmap->pm_stats.resident_count--;
1810 pte = pmap_pte(pv->pv_pmap, pv->pv_va);
1811 tpte = pte_load_clear(pte);
1812 if (tpte & PG_W)
1813 pv->pv_pmap->pm_stats.wired_count--;
1814 if (tpte & PG_A)
1815 vm_page_flag_set(m, PG_REFERENCED);
1816
1817 /*
1818 * Update the vm_page_t clean and reference bits.
1819 */
1820 if (tpte & PG_M) {
1821 #if defined(PMAP_DIAGNOSTIC)
1822 if (pmap_nw_modified((pt_entry_t) tpte)) {
1823 printf(
1824 "pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n",
1825 pv->pv_va, tpte);
1826 }
1827 #endif
1828 if (pmap_track_modified(pv->pv_va))
1829 vm_page_dirty(m);
1830 }
1831 pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
1832 TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
1833 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1834 m->md.pv_list_count--;
1835 pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
1836 free_pv_entry(pv);
1837 }
1838 vm_page_flag_clear(m, PG_WRITEABLE);
1839 splx(s);
1840 }
1841
1842 /*
1843 * Set the physical protection on the
1844 * specified range of this map as requested.
1845 */
1846 void
1847 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1848 {
1849 vm_offset_t pdnxt;
1850 pd_entry_t ptpaddr, *pde;
1851 int anychanged;
1852
1853 if (pmap == NULL)
1854 return;
1855
1856 if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1857 pmap_remove(pmap, sva, eva);
1858 return;
1859 }
1860
1861 if (prot & VM_PROT_WRITE)
1862 return;
1863
1864 anychanged = 0;
1865
1866 for (; sva < eva; sva = pdnxt) {
1867
1868 pdnxt = (sva + NBPDR) & ~PDRMASK;
1869
1870 pde = pmap_pde(pmap, sva);
1871 if (pde == NULL)
1872 continue;
1873 ptpaddr = *pde;
1874
1875 /*
1876 * Weed out invalid mappings. Note: we assume that the page
1877 * directory table is always allocated, and in kernel virtual.
1878 */
1879 if (ptpaddr == 0)
1880 continue;
1881
1882 /*
1883 * Check for large page.
1884 */
1885 if ((ptpaddr & PG_PS) != 0) {
1886 *pde &= ~(PG_M|PG_RW);
1887 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1888 anychanged = 1;
1889 continue;
1890 }
1891
1892 if (pdnxt > eva)
1893 pdnxt = eva;
1894
1895 for (; sva != pdnxt; sva += PAGE_SIZE) {
1896 pt_entry_t pbits;
1897 pt_entry_t *pte;
1898 vm_page_t m;
1899
1900 pte = pmap_pte(pmap, sva);
1901 if (pte == NULL)
1902 continue;
1903 pbits = *pte;
1904 if (pbits & PG_MANAGED) {
1905 m = NULL;
1906 if (pbits & PG_A) {
1907 m = PHYS_TO_VM_PAGE(pbits);
1908 vm_page_flag_set(m, PG_REFERENCED);
1909 pbits &= ~PG_A;
1910 }
1911 if ((pbits & PG_M) != 0 &&
1912 pmap_track_modified(sva)) {
1913 if (m == NULL)
1914 m = PHYS_TO_VM_PAGE(pbits);
1915 vm_page_dirty(m);
1916 pbits &= ~PG_M;
1917 }
1918 }
1919
1920 pbits &= ~PG_RW;
1921
1922 if (pbits != *pte) {
1923 pte_store(pte, pbits);
1924 anychanged = 1;
1925 }
1926 }
1927 }
1928 if (anychanged)
1929 pmap_invalidate_all(pmap);
1930 }
1931
1932 /*
1933 * Insert the given physical page (p) at
1934 * the specified virtual address (v) in the
1935 * target physical map with the protection requested.
1936 *
1937 * If specified, the page will be wired down, meaning
1938 * that the related pte can not be reclaimed.
1939 *
1940 * NB: This is the only routine which MAY NOT lazy-evaluate
1941 * or lose information. That is, this routine must actually
1942 * insert this page into the given map NOW.
1943 */
1944 void
1945 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
1946 boolean_t wired)
1947 {
1948 vm_paddr_t pa;
1949 register pt_entry_t *pte;
1950 vm_paddr_t opa;
1951 pt_entry_t origpte, newpte;
1952 vm_page_t mpte;
1953
1954 if (pmap == NULL)
1955 return;
1956
1957 va &= PG_FRAME;
1958 #ifdef PMAP_DIAGNOSTIC
1959 if (va > VM_MAX_KERNEL_ADDRESS)
1960 panic("pmap_enter: toobig");
1961 if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
1962 panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
1963 #endif
1964
1965 mpte = NULL;
1966 /*
1967 * In the case that a page table page is not
1968 * resident, we are creating it here.
1969 */
1970 if (va < VM_MAXUSER_ADDRESS) {
1971 mpte = pmap_allocpte(pmap, va);
1972 }
1973 #if 0 && defined(PMAP_DIAGNOSTIC)
1974 else {
1975 pd_entry_t *pdeaddr = pmap_pde(pmap, va);
1976 origpte = *pdeaddr;
1977 if ((origpte & PG_V) == 0) {
1978 panic("pmap_enter: invalid kernel page table page, pde=%p, va=%p\n",
1979 origpte, va);
1980 }
1981 }
1982 #endif
1983
1984 pte = pmap_pte(pmap, va);
1985
1986 /*
1987 * Page Directory table entry not valid, we need a new PT page
1988 */
1989 if (pte == NULL)
1990 panic("pmap_enter: invalid page directory va=%#lx\n", va);
1991
1992 pa = VM_PAGE_TO_PHYS(m) & PG_FRAME;
1993 origpte = *pte;
1994 opa = origpte & PG_FRAME;
1995
1996 if (origpte & PG_PS)
1997 panic("pmap_enter: attempted pmap_enter on 2MB page");
1998
1999 /*
2000 * Mapping has not changed, must be protection or wiring change.
2001 */
2002 if (origpte && (opa == pa)) {
2003 /*
2004 * Wiring change, just update stats. We don't worry about
2005 * wiring PT pages as they remain resident as long as there
2006 * are valid mappings in them. Hence, if a user page is wired,
2007 * the PT page will be also.
2008 */
2009 if (wired && ((origpte & PG_W) == 0))
2010 pmap->pm_stats.wired_count++;
2011 else if (!wired && (origpte & PG_W))
2012 pmap->pm_stats.wired_count--;
2013
2014 #if defined(PMAP_DIAGNOSTIC)
2015 if (pmap_nw_modified((pt_entry_t) origpte)) {
2016 printf(
2017 "pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n",
2018 va, origpte);
2019 }
2020 #endif
2021
2022 /*
2023 * Remove extra pte reference
2024 */
2025 if (mpte)
2026 mpte->hold_count--;
2027
2028 if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) {
2029 if ((origpte & PG_RW) == 0) {
2030 pte_store(pte, origpte | PG_RW);
2031 pmap_invalidate_page(pmap, va);
2032 }
2033 return;
2034 }
2035
2036 /*
2037 * We might be turning off write access to the page,
2038 * so we go ahead and sense modify status.
2039 */
2040 if (origpte & PG_MANAGED) {
2041 if ((origpte & PG_M) && pmap_track_modified(va)) {
2042 vm_page_t om;
2043 om = PHYS_TO_VM_PAGE(opa);
2044 vm_page_dirty(om);
2045 }
2046 pa |= PG_MANAGED;
2047 }
2048 goto validate;
2049 }
2050 /*
2051 * Mapping has changed, invalidate old range and fall through to
2052 * handle validating new mapping.
2053 */
2054 if (opa) {
2055 int err;
2056 vm_page_lock_queues();
2057 err = pmap_remove_pte(pmap, pte, va);
2058 vm_page_unlock_queues();
2059 if (err)
2060 panic("pmap_enter: pte vanished, va: 0x%lx", va);
2061 }
2062
2063 /*
2064 * Enter on the PV list if part of our managed memory. Note that we
2065 * raise IPL while manipulating pv_table since pmap_enter can be
2066 * called at interrupt time.
2067 */
2068 if (pmap_initialized &&
2069 (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
2070 pmap_insert_entry(pmap, va, mpte, m);
2071 pa |= PG_MANAGED;
2072 }
2073
2074 /*
2075 * Increment counters
2076 */
2077 pmap->pm_stats.resident_count++;
2078 if (wired)
2079 pmap->pm_stats.wired_count++;
2080
2081 validate:
2082 /*
2083 * Now validate mapping with desired protection/wiring.
2084 */
2085 newpte = (pt_entry_t)(pa | PG_V);
2086 if ((prot & VM_PROT_WRITE) != 0)
2087 newpte |= PG_RW;
2088 #ifdef PG_NX
2089 if ((prot & VM_PROT_EXECUTE) == 0)
2090 newpte |= PG_NX;
2091 #endif
2092 if (wired)
2093 newpte |= PG_W;
2094 if (va < VM_MAXUSER_ADDRESS)
2095 newpte |= PG_U;
2096 if (pmap == kernel_pmap)
2097 newpte |= PG_G;
2098
2099 /*
2100 * if the mapping or permission bits are different, we need
2101 * to update the pte.
2102 */
2103 if ((origpte & ~(PG_M|PG_A)) != newpte) {
2104 pte_store(pte, newpte | PG_A);
2105 /*if (origpte)*/ {
2106 pmap_invalidate_page(pmap, va);
2107 }
2108 }
2109 }
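/*
 * Illustrative usage sketch (never compiled -- note the #if 0 guard): how a
 * caller might use pmap_enter() to establish a mapping immediately, as the
 * comment above requires.  The helper name and its arguments are
 * hypothetical; only the pmap_enter() signature is taken from this file.
 */
#if 0
static void
example_map_page_wired(pmap_t pmap, vm_offset_t va, vm_page_t m)
{

	/* Insert the page now, writable and wired so its pte is not reclaimed. */
	pmap_enter(pmap, va, m, VM_PROT_READ | VM_PROT_WRITE, TRUE);
}
#endif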
2110
2111 /*
2112 * this code makes some *MAJOR* assumptions:
2113  * 1. The pmap is the current pmap and it exists.
2114 * 2. Not wired.
2115 * 3. Read access.
2116 * 4. No page table pages.
2117  * 5. The TLB flush is deferred to the calling procedure.
2118 * 6. Page IS managed.
2119 * but is *MUCH* faster than pmap_enter...
2120 */
2121
2122 vm_page_t
2123 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte)
2124 {
2125 pt_entry_t *pte;
2126 vm_paddr_t pa;
2127
2128 /*
2129 * In the case that a page table page is not
2130 * resident, we are creating it here.
2131 */
2132 if (va < VM_MAXUSER_ADDRESS) {
2133 vm_pindex_t ptepindex;
2134 pd_entry_t *ptepa;
2135
2136 /*
2137 * Calculate pagetable page index
2138 */
2139 ptepindex = pmap_pde_pindex(va);
2140 if (mpte && (mpte->pindex == ptepindex)) {
2141 mpte->hold_count++;
2142 } else {
2143 retry:
2144 /*
2145 * Get the page directory entry
2146 */
2147 ptepa = pmap_pde(pmap, va);
2148
2149 /*
2150 * If the page table page is mapped, we just increment
2151 * the hold count, and activate it.
2152 */
2153 if (ptepa && (*ptepa & PG_V) != 0) {
2154 if (*ptepa & PG_PS)
2155 panic("pmap_enter_quick: unexpected mapping into 2MB page");
2156 mpte = PHYS_TO_VM_PAGE(*ptepa);
2157 mpte->hold_count++;
2158 } else {
2159 mpte = _pmap_allocpte(pmap, ptepindex);
2160 if (mpte == NULL)
2161 goto retry;
2162 }
2163 }
2164 } else {
2165 mpte = NULL;
2166 }
2167
2168 /*
2169 * This call to vtopte makes the assumption that we are
2170 * entering the page into the current pmap. In order to support
2171 * quick entry into any pmap, one would likely use pmap_pte.
2172 * But that isn't as quick as vtopte.
2173 */
2174 pte = vtopte(va);
2175 if (*pte) {
2176 if (mpte != NULL) {
2177 vm_page_lock_queues();
2178 pmap_unwire_pte_hold(pmap, va, mpte);
2179 vm_page_unlock_queues();
2180 }
2181 return 0;
2182 }
2183
2184 /*
2185 * Enter on the PV list if part of our managed memory. Note that we
2186 * raise IPL while manipulating pv_table since pmap_enter can be
2187 * called at interrupt time.
2188 */
2189 if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0)
2190 pmap_insert_entry(pmap, va, mpte, m);
2191
2192 /*
2193 * Increment counters
2194 */
2195 pmap->pm_stats.resident_count++;
2196
2197 pa = VM_PAGE_TO_PHYS(m);
2198
2199 /*
2200 * Now validate mapping with RO protection
2201 */
2202 if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2203 pte_store(pte, pa | PG_V | PG_U);
2204 else
2205 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
2206
2207 return mpte;
2208 }
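/*
 * Illustrative usage sketch (never compiled): a prefault-style loop that
 * obeys the assumptions listed above -- current pmap, managed pages, read
 * access, no wiring -- and threads the returned page table page back into
 * the next call so its hold count is reused.  The helper name and the
 * "pages" array are hypothetical.
 */
#if 0
static void
example_prefault_range(pmap_t pmap, vm_offset_t va, vm_page_t *pages,
    int npages)
{
	vm_page_t mpte;
	int i;

	mpte = NULL;
	for (i = 0; i < npages; i++, va += PAGE_SIZE) {
		/* Read-only, unwired entry; the TLB flush is left to the caller. */
		mpte = pmap_enter_quick(pmap, va, pages[i], mpte);
	}
}
#endif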
2209
2210 /*
2211 * Make a temporary mapping for a physical address. This is only intended
2212 * to be used for panic dumps.
2213 */
2214 void *
2215 pmap_kenter_temporary(vm_offset_t pa, int i)
2216 {
2217 vm_offset_t va;
2218
2219 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
2220 pmap_kenter(va, pa);
2221 invlpg(va);
2222 return ((void *)crashdumpmap);
2223 }
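/*
 * Illustrative usage sketch (never compiled): walking physical memory one
 * page at a time through the crashdumpmap window, the way a dump routine
 * might.  The dump_write() callback and helper name are hypothetical;
 * pmap_kenter_temporary() is taken from this file.
 */
#if 0
static int
example_dump_physmem(vm_paddr_t start, vm_paddr_t end,
    int (*dump_write)(void *, size_t))
{
	vm_paddr_t pa;
	void *va;
	int error;

	for (pa = start; pa < end; pa += PAGE_SIZE) {
		/* Reuse slot 0 of crashdumpmap for every page. */
		va = pmap_kenter_temporary(pa, 0);
		error = dump_write(va, PAGE_SIZE);
		if (error != 0)
			return (error);
	}
	return (0);
}
#endif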
2224
2225 /*
2226 * This code maps large physical mmap regions into the
2227 * processor address space. Note that some shortcuts
2228 * are taken, but the code works.
2229 */
2230 void
2231 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
2232 vm_object_t object, vm_pindex_t pindex,
2233 vm_size_t size)
2234 {
2235 vm_page_t p;
2236
2237 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2238 KASSERT(object->type == OBJT_DEVICE,
2239 ("pmap_object_init_pt: non-device object"));
2240 if (((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
2241 int i;
2242 vm_page_t m[1];
2243 int npdes;
2244 pd_entry_t ptepa, *pde;
2245
2246 pde = pmap_pde(pmap, addr);
2247 if (pde != 0 && (*pde & PG_V) != 0)
2248 return;
2249 retry:
2250 p = vm_page_lookup(object, pindex);
2251 if (p != NULL) {
2252 vm_page_lock_queues();
2253 if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
2254 goto retry;
2255 } else {
2256 p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2257 if (p == NULL)
2258 return;
2259 m[0] = p;
2260
2261 if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2262 vm_page_lock_queues();
2263 vm_page_free(p);
2264 vm_page_unlock_queues();
2265 return;
2266 }
2267
2268 p = vm_page_lookup(object, pindex);
2269 vm_page_lock_queues();
2270 vm_page_wakeup(p);
2271 }
2272 vm_page_unlock_queues();
2273
2274 ptepa = VM_PAGE_TO_PHYS(p);
2275 if (ptepa & (NBPDR - 1))
2276 return;
2277
2278 p->valid = VM_PAGE_BITS_ALL;
2279
2280 pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
2281 npdes = size >> PDRSHIFT;
2282 for(i = 0; i < npdes; i++) {
2283 pde_store(pde, ptepa | PG_U | PG_RW | PG_V | PG_PS);
2284 ptepa += NBPDR;
2285 pde++;
2286 }
2287 pmap_invalidate_all(pmap);
2288 }
2289 }
2290
2291 /*
2292 * Routine: pmap_change_wiring
2293 * Function: Change the wiring attribute for a map/virtual-address
2294 * pair.
2295 * In/out conditions:
2296 * The mapping must already exist in the pmap.
2297 */
2298 void
2299 pmap_change_wiring(pmap, va, wired)
2300 register pmap_t pmap;
2301 vm_offset_t va;
2302 boolean_t wired;
2303 {
2304 register pt_entry_t *pte;
2305
2306 if (pmap == NULL)
2307 return;
2308
2309 /*
2310 * Wiring is not a hardware characteristic so there is no need to
2311 * invalidate TLB.
2312 */
2313 pte = pmap_pte(pmap, va);
2314 if (wired && (*pte & PG_W) == 0) {
2315 pmap->pm_stats.wired_count++;
2316 *pte |= PG_W;
2317 } else if (!wired && (*pte & PG_W) != 0) {
2318 pmap->pm_stats.wired_count--;
2319 *pte &= ~PG_W;
2320 }
2321 }
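/*
 * Illustrative usage sketch (never compiled): wiring and later unwiring an
 * already-existing mapping, e.g. around an I/O operation.  The helper name
 * is hypothetical; the mapping at "va" must already be present in "pmap",
 * as the in/out conditions above require.
 */
#if 0
static void
example_wire_for_io(pmap_t pmap, vm_offset_t va)
{

	pmap_change_wiring(pmap, va, TRUE);	/* pin the translation */
	/* ... perform the I/O against the wired mapping ... */
	pmap_change_wiring(pmap, va, FALSE);	/* and release it again */
}
#endif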
2322
2323
2324
2325 /*
2326 * Copy the range specified by src_addr/len
2327 * from the source map to the range dst_addr/len
2328 * in the destination map.
2329 *
2330 * This routine is only advisory and need not do anything.
2331 */
2332
2333 void
2334 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2335 vm_offset_t src_addr)
2336 {
2337 vm_offset_t addr;
2338 vm_offset_t end_addr = src_addr + len;
2339 vm_offset_t pdnxt;
2340 vm_page_t m;
2341
2342 if (dst_addr != src_addr)
2343 return;
2344
2345 if (!pmap_is_current(src_pmap))
2346 return;
2347
2348 for (addr = src_addr; addr < end_addr; addr = pdnxt) {
2349 pt_entry_t *src_pte, *dst_pte;
2350 vm_page_t dstmpte, srcmpte;
2351 pd_entry_t srcptepaddr, *pde;
2352 vm_pindex_t ptepindex;
2353
2354 if (addr >= UPT_MIN_ADDRESS)
2355 panic("pmap_copy: invalid to pmap_copy page tables\n");
2356
2357 /*
2358 * Don't let optional prefaulting of pages make us go
2359 * way below the low water mark of free pages or way
2360 * above high water mark of used pv entries.
2361 */
2362 if (cnt.v_free_count < cnt.v_free_reserved ||
2363 pv_entry_count > pv_entry_high_water)
2364 break;
2365
2366 pdnxt = (addr + NBPDR) & ~PDRMASK;
2367 ptepindex = pmap_pde_pindex(addr);
2368
2369 pde = pmap_pde(src_pmap, addr);
2370 if (pde)
2371 srcptepaddr = *pde;
2372 else
2373 continue;
2374 if (srcptepaddr == 0)
2375 continue;
2376
2377 if (srcptepaddr & PG_PS) {
2378 pde = pmap_pde(dst_pmap, addr);
2379 if (pde == 0) {
2380 /*
2381 * XXX should do an allocpte here to
2382 * instantiate the pde
2383 */
2384 continue;
2385 }
2386 if (*pde == 0) {
2387 *pde = srcptepaddr;
2388 dst_pmap->pm_stats.resident_count +=
2389 NBPDR / PAGE_SIZE;
2390 }
2391 continue;
2392 }
2393
2394 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
2395 if (srcmpte->hold_count == 0 || (srcmpte->flags & PG_BUSY))
2396 continue;
2397
2398 if (pdnxt > end_addr)
2399 pdnxt = end_addr;
2400
2401 src_pte = vtopte(addr);
2402 while (addr < pdnxt) {
2403 pt_entry_t ptetemp;
2404 ptetemp = *src_pte;
2405 /*
2406 * we only virtual copy managed pages
2407 */
2408 if ((ptetemp & PG_MANAGED) != 0) {
2409 /*
2410 * We have to check after allocpte for the
2411 * pte still being around... allocpte can
2412 * block.
2413 */
2414 dstmpte = pmap_allocpte(dst_pmap, addr);
2415 dst_pte = pmap_pte(dst_pmap, addr);
2416 if ((*dst_pte == 0) && (ptetemp = *src_pte)) {
2417 /*
2418 * Clear the modified and
2419 * accessed (referenced) bits
2420 * during the copy.
2421 */
2422 m = PHYS_TO_VM_PAGE(ptetemp);
2423 *dst_pte = ptetemp & ~(PG_M | PG_A);
2424 dst_pmap->pm_stats.resident_count++;
2425 pmap_insert_entry(dst_pmap, addr,
2426 dstmpte, m);
2427 } else {
2428 vm_page_lock_queues();
2429 pmap_unwire_pte_hold(dst_pmap, addr, dstmpte);
2430 vm_page_unlock_queues();
2431 }
2432 if (dstmpte->hold_count >= srcmpte->hold_count)
2433 break;
2434 }
2435 addr += PAGE_SIZE;
2436 src_pte++;
2437 }
2438 }
2439 }
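/*
 * Illustrative usage sketch (never compiled): the fork-time pattern the
 * comment above has in mind -- both address spaces use the same range, and
 * the call is purely an optimization that is free to do nothing.  The
 * helper name and vmspace arguments are hypothetical.
 */
#if 0
static void
example_copy_on_fork(struct vmspace *child, struct vmspace *parent,
    vm_offset_t start, vm_size_t len)
{

	/* dst_addr must equal src_addr or pmap_copy() returns immediately. */
	pmap_copy(vmspace_pmap(child), vmspace_pmap(parent), start, len, start);
}
#endif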
2440
2441 /*
2442  * pmap_zero_page zeros the specified hardware page by accessing
2443  * it through the direct map and clearing it with pagezero().
2444 */
2445 void
2446 pmap_zero_page(vm_page_t m)
2447 {
2448 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2449
2450 pagezero((void *)va);
2451 }
2452
2453 /*
2454  * pmap_zero_page_area zeros a sub-range of the specified hardware
2455  * page, accessing it through the direct map.
2456 *
2457 * off and size may not cover an area beyond a single hardware page.
2458 */
2459 void
2460 pmap_zero_page_area(vm_page_t m, int off, int size)
2461 {
2462 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2463
2464 if (off == 0 && size == PAGE_SIZE)
2465 pagezero((void *)va);
2466 else
2467 bzero((char *)va + off, size);
2468 }
2469
2470 /*
2471  * pmap_zero_page_idle zeros the specified hardware page through
2472  * the direct map, like pmap_zero_page().  This
2473 * is intended to be called from the vm_pagezero process only and
2474 * outside of Giant.
2475 */
2476 void
2477 pmap_zero_page_idle(vm_page_t m)
2478 {
2479 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2480
2481 pagezero((void *)va);
2482 }
2483
2484 /*
2485 * pmap_copy_page copies the specified (machine independent)
2486  * page by accessing both pages through the direct map and
2487  * copying the contents with bcopy(), one machine dependent
2488  * page at a time.
2489 */
2490 void
2491 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
2492 {
2493 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
2494 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
2495
2496 bcopy((void *)src, (void *)dst, PAGE_SIZE);
2497 }
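/*
 * Illustrative usage sketch (never compiled): typical use of the
 * direct-map based zero/copy helpers above.  The helper name and
 * "valid_bytes" are hypothetical; note that pmap_zero_page_area() must not
 * be asked to cross a page boundary.
 */
#if 0
static void
example_zero_tail_and_copy(vm_page_t src, vm_page_t dst, int valid_bytes)
{

	/* Clear the unused tail of the source page (stays within one page). */
	if (valid_bytes < PAGE_SIZE)
		pmap_zero_page_area(src, valid_bytes, PAGE_SIZE - valid_bytes);
	/* Then duplicate the whole page. */
	pmap_copy_page(src, dst);
}
#endif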
2498
2499 /*
2500 * Returns true if the pmap's pv is one of the first
2501 * 16 pvs linked to from this page. This count may
2502 * be changed upwards or downwards in the future; it
2503 * is only necessary that true be returned for a small
2504 * subset of pmaps for proper page aging.
2505 */
2506 boolean_t
2507 pmap_page_exists_quick(pmap, m)
2508 pmap_t pmap;
2509 vm_page_t m;
2510 {
2511 pv_entry_t pv;
2512 int loops = 0;
2513 int s;
2514
2515 if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2516 return FALSE;
2517
2518 s = splvm();
2519 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2520 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2521 if (pv->pv_pmap == pmap) {
2522 splx(s);
2523 return TRUE;
2524 }
2525 loops++;
2526 if (loops >= 16)
2527 break;
2528 }
2529 splx(s);
2530 return (FALSE);
2531 }
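/*
 * Illustrative usage sketch (never compiled): the kind of check the page
 * aging code makes.  The helper name is hypothetical; the caller must hold
 * the page queues lock (see the mtx_assert above), and a FALSE return only
 * means the pmap was not found among the first few pv entries.
 */
#if 0
static boolean_t
example_recently_mapped_by(pmap_t pmap, vm_page_t m)
{
	boolean_t mapped;

	vm_page_lock_queues();
	mapped = pmap_page_exists_quick(pmap, m);
	vm_page_unlock_queues();
	return (mapped);
}
#endif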
2532
2533 #define PMAP_REMOVE_PAGES_CURPROC_ONLY
2534 /*
2535  * Remove all pages from the specified address space;
2536  * this aids process exit speed.  Also, this code
2537 * is special cased for current process only, but
2538 * can have the more generic (and slightly slower)
2539 * mode enabled. This is much faster than pmap_remove
2540 * in the case of running down an entire address space.
2541 */
2542 void
2543 pmap_remove_pages(pmap, sva, eva)
2544 pmap_t pmap;
2545 vm_offset_t sva, eva;
2546 {
2547 pt_entry_t *pte, tpte;
2548 vm_page_t m;
2549 pv_entry_t pv, npv;
2550 int s;
2551
2552 #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2553 if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) {
2554 printf("warning: pmap_remove_pages called with non-current pmap\n");
2555 return;
2556 }
2557 #endif
2558 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2559 s = splvm();
2560 for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
2561
2562 if (pv->pv_va >= eva || pv->pv_va < sva) {
2563 npv = TAILQ_NEXT(pv, pv_plist);
2564 continue;
2565 }
2566
2567 #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2568 pte = vtopte(pv->pv_va);
2569 #else
2570 pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2571 #endif
2572 tpte = *pte;
2573
2574 if (tpte == 0) {
2575 printf("TPTE at %p IS ZERO @ VA %08lx\n",
2576 pte, pv->pv_va);
2577 panic("bad pte");
2578 }
2579
2580 /*
2581 * We cannot remove wired pages from a process' mapping at this time
2582 */
2583 if (tpte & PG_W) {
2584 npv = TAILQ_NEXT(pv, pv_plist);
2585 continue;
2586 }
2587
2588 m = PHYS_TO_VM_PAGE(tpte);
2589 KASSERT(m->phys_addr == (tpte & PG_FRAME),
2590 ("vm_page_t %p phys_addr mismatch %016jx %016jx",
2591 m, (uintmax_t)m->phys_addr, (uintmax_t)tpte));
2592
2593 KASSERT(m < &vm_page_array[vm_page_array_size],
2594 ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte));
2595
2596 pv->pv_pmap->pm_stats.resident_count--;
2597
2598 pte_clear(pte);
2599
2600 /*
2601 * Update the vm_page_t clean and reference bits.
2602 */
2603 if (tpte & PG_M) {
2604 vm_page_dirty(m);
2605 }
2606
2607 npv = TAILQ_NEXT(pv, pv_plist);
2608 TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
2609
2610 m->md.pv_list_count--;
2611 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2612 if (TAILQ_FIRST(&m->md.pv_list) == NULL) {
2613 vm_page_flag_clear(m, PG_WRITEABLE);
2614 }
2615
2616 pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
2617 free_pv_entry(pv);
2618 }
2619 splx(s);
2620 pmap_invalidate_all(pmap);
2621 }
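/*
 * Illustrative usage sketch (never compiled): how an exit path might use
 * pmap_remove_pages() to drop every user mapping of the current process at
 * once.  The helper name is hypothetical; the routine must be called on
 * the current pmap with the page queues lock held (see the warning printf
 * and mtx_assert above).
 */
#if 0
static void
example_teardown_current_vmspace(struct vmspace *vm)
{

	vm_page_lock_queues();
	pmap_remove_pages(vmspace_pmap(vm), 0, VM_MAXUSER_ADDRESS);
	vm_page_unlock_queues();
}
#endif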
2622
2623 /*
2624 * pmap_is_modified:
2625 *
2626 * Return whether or not the specified physical page was modified
2627 * in any physical maps.
2628 */
2629 boolean_t
2630 pmap_is_modified(vm_page_t m)
2631 {
2632 pv_entry_t pv;
2633 pt_entry_t *pte;
2634 int s;
2635
2636 if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2637 return FALSE;
2638
2639 s = splvm();
2640 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2641 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2642 /*
2643 		 * Mappings in ranges that are not tracked for modification
2644 		 * (e.g. the clean submap) are treated as never modified,
2645 		 * so skip them here.
2646 */
2647 if (!pmap_track_modified(pv->pv_va))
2648 continue;
2649 #if defined(PMAP_DIAGNOSTIC)
2650 if (!pv->pv_pmap) {
2651 			printf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va);
2652 continue;
2653 }
2654 #endif
2655 pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2656 if (*pte & PG_M) {
2657 splx(s);
2658 return TRUE;
2659 }
2660 }
2661 splx(s);
2662 return (FALSE);
2663 }
2664
2665 /*
2666 * pmap_is_prefaultable:
2667 *
2668  *	Return whether or not the specified virtual address is eligible
2669 * for prefault.
2670 */
2671 boolean_t
2672 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
2673 {
2674 pd_entry_t *pde;
2675 pt_entry_t *pte;
2676
2677 pde = pmap_pde(pmap, addr);
2678 if (pde == NULL || (*pde & PG_V) == 0)
2679 return (FALSE);
2680 pte = vtopte(addr);
2681 if ((*pte & PG_V) == 0)
2682 return (FALSE);
2683 return (TRUE);
2684 }
2685
2686 /*
2687 * Clear the given bit in each of the given page's ptes.
2688 */
2689 static __inline void
2690 pmap_clear_ptes(vm_page_t m, int bit)
2691 {
2692 register pv_entry_t pv;
2693 pt_entry_t pbits, *pte;
2694 int s;
2695
2696 if (!pmap_initialized || (m->flags & PG_FICTITIOUS) ||
2697 (bit == PG_RW && (m->flags & PG_WRITEABLE) == 0))
2698 return;
2699
2700 s = splvm();
2701 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2702 /*
2703 	 * Loop over all current mappings, setting/clearing as appropriate.
2704 	 * (If setting RO, do we need to clear the VAC?)
2705 */
2706 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2707 /*
2708 * don't write protect pager mappings
2709 */
2710 if (bit == PG_RW) {
2711 if (!pmap_track_modified(pv->pv_va))
2712 continue;
2713 }
2714
2715 #if defined(PMAP_DIAGNOSTIC)
2716 if (!pv->pv_pmap) {
2717 			printf("Null pmap (cb) at va: 0x%lx\n", pv->pv_va);
2718 continue;
2719 }
2720 #endif
2721
2722 pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2723 pbits = *pte;
2724 if (pbits & bit) {
2725 if (bit == PG_RW) {
2726 if (pbits & PG_M) {
2727 vm_page_dirty(m);
2728 }
2729 pte_store(pte, pbits & ~(PG_M|PG_RW));
2730 } else {
2731 pte_store(pte, pbits & ~bit);
2732 }
2733 pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2734 }
2735 }
2736 if (bit == PG_RW)
2737 vm_page_flag_clear(m, PG_WRITEABLE);
2738 splx(s);
2739 }
2740
2741 /*
2742 * pmap_page_protect:
2743 *
2744 * Lower the permission for all mappings to a given page.
2745 */
2746 void
2747 pmap_page_protect(vm_page_t m, vm_prot_t prot)
2748 {
2749 if ((prot & VM_PROT_WRITE) == 0) {
2750 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
2751 pmap_clear_ptes(m, PG_RW);
2752 } else {
2753 pmap_remove_all(m);
2754 }
2755 }
2756 }
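/*
 * Illustrative usage sketch (never compiled): the two typical downgrades
 * the VM system asks for.  The helper names are hypothetical; the page
 * queues lock must be held because pmap_clear_ptes() and pmap_remove_all()
 * assert it.
 */
#if 0
static void
example_write_protect_page(vm_page_t m)
{

	/* Keep the page readable but catch further writes via faults. */
	pmap_page_protect(m, VM_PROT_READ);
}

static void
example_unmap_page_everywhere(vm_page_t m)
{

	/* VM_PROT_NONE removes every mapping of the page. */
	pmap_page_protect(m, VM_PROT_NONE);
}
#endif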
2757
2758 /*
2759 * pmap_ts_referenced:
2760 *
2761 * Return a count of reference bits for a page, clearing those bits.
2762 * It is not necessary for every reference bit to be cleared, but it
2763 * is necessary that 0 only be returned when there are truly no
2764 * reference bits set.
2765 *
2766 * XXX: The exact number of bits to check and clear is a matter that
2767 * should be tested and standardized at some point in the future for
2768 * optimal aging of shared pages.
2769 */
2770 int
2771 pmap_ts_referenced(vm_page_t m)
2772 {
2773 register pv_entry_t pv, pvf, pvn;
2774 pt_entry_t *pte;
2775 pt_entry_t v;
2776 int s;
2777 int rtval = 0;
2778
2779 if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2780 return (rtval);
2781
2782 s = splvm();
2783 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2784 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2785
2786 pvf = pv;
2787
2788 do {
2789 pvn = TAILQ_NEXT(pv, pv_list);
2790
2791 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2792
2793 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2794
2795 if (!pmap_track_modified(pv->pv_va))
2796 continue;
2797
2798 pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2799
2800 if (pte && ((v = pte_load(pte)) & PG_A) != 0) {
2801 pte_store(pte, v & ~PG_A);
2802 pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2803
2804 rtval++;
2805 if (rtval > 4) {
2806 break;
2807 }
2808 }
2809 } while ((pv = pvn) != NULL && pv != pvf);
2810 }
2811 splx(s);
2812
2813 return (rtval);
2814 }
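/*
 * Illustrative usage sketch (never compiled): an aging decision in the
 * style of the pageout code.  The helper name and the activity counter are
 * hypothetical; only pmap_ts_referenced() and the page queues locking
 * requirement come from this file.
 */
#if 0
static int
example_page_activity(vm_page_t m)
{
	int act_count;

	vm_page_lock_queues();
	/* Count (and clear) reference bits across the page's mappings. */
	act_count = pmap_ts_referenced(m);
	if (m->flags & PG_REFERENCED) {
		vm_page_flag_clear(m, PG_REFERENCED);
		act_count++;
	}
	vm_page_unlock_queues();
	return (act_count);
}
#endif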
2815
2816 /*
2817 * Clear the modify bits on the specified physical page.
2818 */
2819 void
2820 pmap_clear_modify(vm_page_t m)
2821 {
2822 pmap_clear_ptes(m, PG_M);
2823 }
2824
2825 /*
2826 * pmap_clear_reference:
2827 *
2828 * Clear the reference bit on the specified physical page.
2829 */
2830 void
2831 pmap_clear_reference(vm_page_t m)
2832 {
2833 pmap_clear_ptes(m, PG_A);
2834 }
2835
2836 /*
2837 * Miscellaneous support routines follow
2838 */
2839
2840 /*
2841 * Map a set of physical memory pages into the kernel virtual
2842 * address space. Return a pointer to where it is mapped. This
2843 * routine is intended to be used for mapping device memory,
2844 * NOT real memory.
2845 */
2846 void *
2847 pmap_mapdev(pa, size)
2848 vm_paddr_t pa;
2849 vm_size_t size;
2850 {
2851 vm_offset_t va, tmpva, offset;
2852
2853 /* If this fits within the direct map window, use it */
2854 if (pa < dmaplimit && (pa + size) < dmaplimit)
2855 return ((void *)PHYS_TO_DMAP(pa));
2856 offset = pa & PAGE_MASK;
2857 size = roundup(offset + size, PAGE_SIZE);
2858 va = kmem_alloc_nofault(kernel_map, size);
2859 if (!va)
2860 panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
2861 pa = pa & PG_FRAME;
2862 for (tmpva = va; size > 0; ) {
2863 pmap_kenter(tmpva, pa);
2864 size -= PAGE_SIZE;
2865 tmpva += PAGE_SIZE;
2866 pa += PAGE_SIZE;
2867 }
2868 pmap_invalidate_range(kernel_pmap, va, tmpva);
2869 return ((void *)(va + offset));
2870 }
2871
2872 void
2873 pmap_unmapdev(va, size)
2874 vm_offset_t va;
2875 vm_size_t size;
2876 {
2877 vm_offset_t base, offset, tmpva;
2878
2879 /* If we gave a direct map region in pmap_mapdev, do nothing */
2880 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
2881 return;
2882 base = va & PG_FRAME;
2883 offset = va & PAGE_MASK;
2884 size = roundup(offset + size, PAGE_SIZE);
2885 for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
2886 pmap_kremove(tmpva);
2887 pmap_invalidate_range(kernel_pmap, va, tmpva);
2888 kmem_free(kernel_map, base, size);
2889 }
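/*
 * Illustrative usage sketch (never compiled): a driver mapping and
 * unmapping a device register window.  The register offset and helper name
 * are hypothetical; pmap_unmapdev() must be given the same va/size pair
 * and quietly does nothing for addresses that came out of the direct map.
 */
#if 0
static uint32_t
example_read_device_reg(vm_paddr_t bar_pa, vm_size_t bar_size,
    vm_size_t reg_off)
{
	volatile uint32_t *regs;
	uint32_t val;

	regs = pmap_mapdev(bar_pa, bar_size);
	val = regs[reg_off / sizeof(*regs)];
	pmap_unmapdev((vm_offset_t)regs, bar_size);
	return (val);
}
#endif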
2890
2891 /*
2892 * perform the pmap work for mincore
2893 */
2894 int
2895 pmap_mincore(pmap, addr)
2896 pmap_t pmap;
2897 vm_offset_t addr;
2898 {
2899 pt_entry_t *ptep, pte;
2900 vm_page_t m;
2901 int val = 0;
2902
2903 ptep = pmap_pte(pmap, addr);
2904 if (ptep == 0) {
2905 return 0;
2906 }
2907
2908 if ((pte = *ptep) != 0) {
2909 vm_paddr_t pa;
2910
2911 val = MINCORE_INCORE;
2912 if ((pte & PG_MANAGED) == 0)
2913 return val;
2914
2915 pa = pte & PG_FRAME;
2916
2917 m = PHYS_TO_VM_PAGE(pa);
2918
2919 /*
2920 * Modified by us
2921 */
2922 if (pte & PG_M)
2923 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
2924 else {
2925 /*
2926 * Modified by someone else
2927 */
2928 vm_page_lock_queues();
2929 if (m->dirty || pmap_is_modified(m))
2930 val |= MINCORE_MODIFIED_OTHER;
2931 vm_page_unlock_queues();
2932 }
2933 /*
2934 * Referenced by us
2935 */
2936 if (pte & PG_A)
2937 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
2938 else {
2939 /*
2940 * Referenced by someone else
2941 */
2942 vm_page_lock_queues();
2943 if ((m->flags & PG_REFERENCED) ||
2944 pmap_ts_referenced(m)) {
2945 val |= MINCORE_REFERENCED_OTHER;
2946 vm_page_flag_set(m, PG_REFERENCED);
2947 }
2948 vm_page_unlock_queues();
2949 }
2950 }
2951 return val;
2952 }
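/*
 * Illustrative usage sketch (never compiled): decoding the bits returned
 * by pmap_mincore().  The helper name is hypothetical; the MINCORE_* flags
 * are the ones used in the function above.
 */
#if 0
static void
example_describe_address(pmap_t pmap, vm_offset_t addr)
{
	int val;

	val = pmap_mincore(pmap, addr);
	if ((val & MINCORE_INCORE) == 0)
		printf("0x%lx: not resident\n", addr);
	else
		printf("0x%lx: resident%s%s\n", addr,
		    (val & (MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER)) != 0 ?
		    ", modified" : "",
		    (val & (MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER)) != 0 ?
		    ", referenced" : "");
}
#endif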
2953
2954 void
2955 pmap_activate(struct thread *td)
2956 {
2957 struct proc *p = td->td_proc;
2958 pmap_t pmap, oldpmap;
2959 u_int64_t cr3;
2960
2961 critical_enter();
2962 pmap = vmspace_pmap(td->td_proc->p_vmspace);
2963 oldpmap = PCPU_GET(curpmap);
2964 #ifdef SMP
2965 if (oldpmap) /* XXX FIXME */
2966 atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
2967 atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
2968 #else
2969 if (oldpmap) /* XXX FIXME */
2970 oldpmap->pm_active &= ~PCPU_GET(cpumask);
2971 pmap->pm_active |= PCPU_GET(cpumask);
2972 #endif
2973 cr3 = vtophys(pmap->pm_pml4);
2974 /* XXXKSE this is wrong.
2975 * pmap_activate is for the current thread on the current cpu
2976 */
2977 if (p->p_flag & P_SA) {
2978 /* Make sure all other cr3 entries are updated. */
2979 /* what if they are running? XXXKSE (maybe abort them) */
2980 FOREACH_THREAD_IN_PROC(p, td) {
2981 td->td_pcb->pcb_cr3 = cr3;
2982 }
2983 } else {
2984 td->td_pcb->pcb_cr3 = cr3;
2985 }
2986 load_cr3(cr3);
2987 critical_exit();
2988 }
2989
2990 vm_offset_t
2991 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
2992 {
2993
2994 if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
2995 return addr;
2996 }
2997
2998 addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
2999 return addr;
3000 }