sys/amd64/amd64/pmap.c
1 /*
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * the Systems Programming Group of the University of Utah Computer
11 * Science Department and William Jolitz of UUNET Technologies Inc.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 * 3. All advertising materials mentioning features or use of this software
22 * must display the following acknowledgement:
23 * This product includes software developed by the University of
24 * California, Berkeley and its contributors.
25 * 4. Neither the name of the University nor the names of its contributors
26 * may be used to endorse or promote products derived from this software
27 * without specific prior written permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * SUCH DAMAGE.
40 *
41 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
42 * $FreeBSD: releng/5.1/sys/amd64/amd64/pmap.c 115251 2003-05-23 05:04:54Z peter $
43 */
44 /*-
45 * Copyright (c) 2003 Networks Associates Technology, Inc.
46 * All rights reserved.
47 *
48 * This software was developed for the FreeBSD Project by Jake Burkholder,
49 * Safeport Network Services, and Network Associates Laboratories, the
50 * Security Research Division of Network Associates, Inc. under
51 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
52 * CHATS research program.
53 *
54 * Redistribution and use in source and binary forms, with or without
55 * modification, are permitted provided that the following conditions
56 * are met:
57 * 1. Redistributions of source code must retain the above copyright
58 * notice, this list of conditions and the following disclaimer.
59 * 2. Redistributions in binary form must reproduce the above copyright
60 * notice, this list of conditions and the following disclaimer in the
61 * documentation and/or other materials provided with the distribution.
62 *
63 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
64 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
65 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
66 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
67 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
68 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
69 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
70 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
71 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
72 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
73 * SUCH DAMAGE.
74 */
75
76 /*
77 * Manages physical address maps.
78 *
79 * In addition to hardware address maps, this
80 * module is called upon to provide software-use-only
81 * maps which may or may not be stored in the same
82 * form as hardware maps. These pseudo-maps are
83 * used to store intermediate results from copy
84 * operations to and from address spaces.
85 *
86 * Since the information managed by this module is
87 * also stored by the logical address mapping module,
88 * this module may throw away valid virtual-to-physical
89 * mappings at almost any time. However, invalidations
90 * of virtual-to-physical mappings must be done as
91 * requested.
92 *
93 * In order to cope with hardware architectures which
94 * make virtual-to-physical map invalidates expensive,
95  *	this module may delay invalidation or protection-reduction
96 * operations until such time as they are actually
97 * necessary. This module is given full information as
98 * to which processors are currently using which maps,
99 * and to when physical maps must be made correct.
100 */
101
102 #include "opt_msgbuf.h"
103 #include "opt_kstack_pages.h"
104
105 #include <sys/param.h>
106 #include <sys/systm.h>
107 #include <sys/kernel.h>
108 #include <sys/lock.h>
109 #include <sys/mman.h>
110 #include <sys/msgbuf.h>
111 #include <sys/mutex.h>
112 #include <sys/proc.h>
113 #include <sys/sx.h>
114 #include <sys/user.h>
115 #include <sys/vmmeter.h>
116 #include <sys/sysctl.h>
117
118 #include <vm/vm.h>
119 #include <vm/vm_param.h>
120 #include <vm/vm_kern.h>
121 #include <vm/vm_page.h>
122 #include <vm/vm_map.h>
123 #include <vm/vm_object.h>
124 #include <vm/vm_extern.h>
125 #include <vm/vm_pageout.h>
126 #include <vm/vm_pager.h>
127 #include <vm/uma.h>
128 #include <vm/uma_int.h>
129
130 #include <machine/cpu.h>
131 #include <machine/cputypes.h>
132 #include <machine/md_var.h>
133 #include <machine/specialreg.h>
134
135 #define PMAP_KEEP_PDIRS
136 #ifndef PMAP_SHPGPERPROC
137 #define PMAP_SHPGPERPROC 200
138 #endif
139
140 #if defined(DIAGNOSTIC)
141 #define PMAP_DIAGNOSTIC
142 #endif
143
144 #define MINPV 2048
145
146 #if !defined(PMAP_DIAGNOSTIC)
147 #define PMAP_INLINE __inline
148 #else
149 #define PMAP_INLINE
150 #endif
151
152 /*
153 * Given a map and a machine independent protection code,
154  * convert to an amd64 protection code.
155 */
156 #define pte_prot(m, p) (protection_codes[p])
157 static pt_entry_t protection_codes[8];
158
159 struct pmap kernel_pmap_store;
160 LIST_HEAD(pmaplist, pmap);
161 static struct pmaplist allpmaps;
162 static struct mtx allpmaps_lock;
163
164 vm_paddr_t avail_start; /* PA of first available physical page */
165 vm_paddr_t avail_end; /* PA of last available physical page */
166 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
167 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
168 static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */
169
170 static int nkpt;
171 static int ndmpdp;
172 vm_offset_t kernel_vm_end;
173
174 static u_int64_t KPTphys; /* phys addr of kernel level 1 */
175 static u_int64_t KPDphys; /* phys addr of kernel level 2 */
176 static u_int64_t KPDPphys; /* phys addr of kernel level 3 */
177 u_int64_t KPML4phys; /* phys addr of kernel level 4 */
178
179 static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */
180 static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */
181
182 /*
183 * Data for the pv entry allocation mechanism
184 */
185 static uma_zone_t pvzone;
186 static struct vm_object pvzone_obj;
187 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
188 int pmap_pagedaemon_waken;
189
190 /*
191 * All those kernel PT submaps that BSD is so fond of
192 */
193 pt_entry_t *CMAP1 = 0;
194 static pt_entry_t *ptmmap;
195 caddr_t CADDR1 = 0, ptvmmap = 0;
196 static pt_entry_t *msgbufmap;
197 struct msgbuf *msgbufp = 0;
198
199 /*
200 * Crashdump maps.
201 */
202 static pt_entry_t *pt_crashdumpmap;
203 static caddr_t crashdumpmap;
204
205 static PMAP_INLINE void free_pv_entry(pv_entry_t pv);
206 static pv_entry_t get_pv_entry(void);
207 static void amd64_protection_init(void);
208 static __inline void pmap_changebit(vm_page_t m, int bit, boolean_t setem);
209
210 static vm_page_t pmap_enter_quick(pmap_t pmap, vm_offset_t va,
211 vm_page_t m, vm_page_t mpte);
212 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva);
213 static void pmap_remove_page(struct pmap *pmap, vm_offset_t va);
214 static int pmap_remove_entry(struct pmap *pmap, vm_page_t m,
215 vm_offset_t va);
216 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va,
217 vm_page_t mpte, vm_page_t m);
218
219 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va);
220
221 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex);
222 static vm_page_t pmap_page_lookup(vm_object_t object, vm_pindex_t pindex);
223 static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t);
224 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
225 static void *pmap_pv_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
226
227 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
228 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
229
230 /*
231 * Move the kernel virtual free pointer to the next
232 * 2MB. This is used to help improve performance
233 * by using a large (2MB) page for much of the kernel
234 * (.text, .data, .bss)
235 */
236 static vm_offset_t
237 pmap_kmem_choose(vm_offset_t addr)
238 {
239 vm_offset_t newaddr = addr;
240
241 newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
242 return newaddr;
243 }
244
245 /********************/
246 /* Inline functions */
247 /********************/
248
249 /* Return a non-clipped PD index for a given VA */
250 static __inline vm_pindex_t
251 pmap_pde_pindex(vm_offset_t va)
252 {
253 return va >> PDRSHIFT;
254 }
255
256
257 /* Return various clipped indexes for a given VA */
258 static __inline vm_pindex_t
259 pmap_pte_index(vm_offset_t va)
260 {
261
262 return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
263 }
264
265 static __inline vm_pindex_t
266 pmap_pde_index(vm_offset_t va)
267 {
268
269 return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
270 }
271
272 static __inline vm_pindex_t
273 pmap_pdpe_index(vm_offset_t va)
274 {
275
276 return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
277 }
278
279 static __inline vm_pindex_t
280 pmap_pml4e_index(vm_offset_t va)
281 {
282
283 return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
284 }
285
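/*
 * Worked example (illustrative, not part of the original comments): on amd64
 * with 4 KB pages and 512 eight-byte entries per page-table page, the four
 * clipped indexes above are consecutive 9-bit fields of the virtual address
 * (bits 12-20, 21-29, 30-38 and 39-47).  For va = 0x00007fffffe00000:
 *
 *	pmap_pml4e_index(va) = (va >> 39) & 511 = 255
 *	pmap_pdpe_index(va)  = (va >> 30) & 511 = 511
 *	pmap_pde_index(va)   = (va >> 21) & 511 = 511
 *	pmap_pte_index(va)   = (va >> 12) & 511 = 0
 *
 * while the unclipped pmap_pde_pindex(va) = va >> 21 = 0x3ffffff selects
 * which page table page (one per 2 MB of address space) backs the mapping.
 */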
286 /* Return a pointer to the PML4 slot that corresponds to a VA */
287 static __inline pml4_entry_t *
288 pmap_pml4e(pmap_t pmap, vm_offset_t va)
289 {
290
291 if (!pmap)
292 return NULL;
293 return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
294 }
295
296 /* Return a pointer to the PDP slot that corresponds to a VA */
297 static __inline pdp_entry_t *
298 pmap_pdpe(pmap_t pmap, vm_offset_t va)
299 {
300 pml4_entry_t *pml4e;
301 pdp_entry_t *pdpe;
302
303 pml4e = pmap_pml4e(pmap, va);
304 if (pml4e == NULL || (*pml4e & PG_V) == 0)
305 return NULL;
306 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
307 return (&pdpe[pmap_pdpe_index(va)]);
308 }
309
310 /* Return a pointer to the PD slot that corresponds to a VA */
311 static __inline pd_entry_t *
312 pmap_pde(pmap_t pmap, vm_offset_t va)
313 {
314 pdp_entry_t *pdpe;
315 pd_entry_t *pde;
316
317 pdpe = pmap_pdpe(pmap, va);
318 if (pdpe == NULL || (*pdpe & PG_V) == 0)
319 return NULL;
320 pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
321 return (&pde[pmap_pde_index(va)]);
322 }
323
324 /* Return a pointer to the PT slot that corresponds to a VA */
325 static __inline pt_entry_t *
326 pmap_pte(pmap_t pmap, vm_offset_t va)
327 {
328 pd_entry_t *pde;
329 pt_entry_t *pte;
330
331 pde = pmap_pde(pmap, va);
332 if (pde == NULL || (*pde & PG_V) == 0)
333 return NULL;
334 pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
335 return (&pte[pmap_pte_index(va)]);
336 }
337
338
339 PMAP_INLINE pt_entry_t *
340 vtopte(vm_offset_t va)
341 {
342 u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
343
344 return (PTmap + (amd64_btop(va) & mask));
345 }
346
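/*
 * Note on the recursive mapping used by vtopte() above (editor's sketch,
 * not from the original file): because PML4 slot PML4PML4I points back at
 * the PML4 page itself (see create_pagetables() below), the hardware page
 * walk through that slot resolves the page tables themselves as data.
 * PTmap is the base of that window, and the mask (1 << 36) - 1 keeps only
 * the 4 x 9 = 36 index bits of the page number.
 */
#if 0
/*
 * A slower but equivalent way to reach the same PTE for kernel addresses,
 * using the direct-map based walkers defined earlier; vtopte_alt is a
 * hypothetical name and the two pointers are merely different virtual
 * aliases of the same physical PTE (sketch only):
 */
static pt_entry_t *
vtopte_alt(vm_offset_t va)
{
	return (pmap_pte(kernel_pmap, va));
}
#endif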
347 static u_int64_t
348 allocpages(int n)
349 {
350 u_int64_t ret;
351
352 ret = avail_start;
353 bzero((void *)ret, n * PAGE_SIZE);
354 avail_start += n * PAGE_SIZE;
355 return (ret);
356 }
357
358 static void
359 create_pagetables(void)
360 {
361 int i;
362
363 /* Allocate pages */
364 KPTphys = allocpages(NKPT);
365 KPML4phys = allocpages(1);
366 KPDPphys = allocpages(NKPML4E);
367 KPDphys = allocpages(NKPDPE);
368
369 ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
370 if (ndmpdp < 1)
371 ndmpdp = 1;
372 DMPDPphys = allocpages(NDMPML4E);
373 DMPDphys = allocpages(ndmpdp);
374
375 /* Fill in the underlying page table pages */
376 /* Read-only from zero to physfree */
377 /* XXX not fully used, underneath 2M pages */
378 for (i = 0; (i << PAGE_SHIFT) < avail_start; i++) {
379 ((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
380 ((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V;
381 }
382
383 /* Now map the page tables at their location within PTmap */
384 for (i = 0; i < NKPT; i++) {
385 ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
386 ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
387 }
388
389 #if 0
390 /* Map from zero to end of allocations under 2M pages */
391 /* This replaces some of the KPTphys entries above */
392 for (i = 0; (i << PDRSHIFT) < avail_start; i++) {
393 ((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
394 ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS;
395 }
396 #endif
397
398 /* And connect up the PD to the PDP */
399 for (i = 0; i < NKPDPE; i++) {
400 ((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys + (i << PAGE_SHIFT);
401 ((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
402 }
403
404
405 /* Now set up the direct map space using 2MB pages */
406 for (i = 0; i < NPDEPG * ndmpdp; i++) {
407 ((pd_entry_t *)DMPDphys)[i] = i << PDRSHIFT;
408 ((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS;
409 }
410
411 /* And the direct map space's PDP */
412 for (i = 0; i < ndmpdp; i++) {
413 ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (i << PAGE_SHIFT);
414 ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
415 }
416
417 /* And recursively map PML4 to itself in order to get PTmap */
418 ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
419 ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
420
421 /* Connect the Direct Map slot up to the PML4 */
422 ((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys;
423 ((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U;
424
425 /* Connect the KVA slot up to the PML4 */
426 ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
427 ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
428 }
429
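/*
 * Sizing note for create_pagetables() (illustrative arithmetic, editor's
 * summary): one page of 512 PTEs maps 2 MB, one page of 512 PDEs maps 1 GB,
 * and one page of 512 PDPEs maps 512 GB.  The direct map is built from
 * 2 MB PG_PS mappings, so it needs ndmpdp = howmany(ptoa(Maxmem), 1 GB)
 * PD pages; e.g. with 4 GB of RAM, ndmpdp = 4, giving 4 * 512 = 2048
 * two-megabyte PDEs hooked into the first four slots of the direct-map
 * PDP page (DMPDPphys).
 */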
430 /*
431 * Bootstrap the system enough to run with virtual memory.
432 *
433 * On amd64 this is called after mapping has already been enabled
434 * and just syncs the pmap module with what has already been done.
435 * [We can't call it easily with mapping off since the kernel is not
436 * mapped with PA == VA, hence we would have to relocate every address
437 * from the linked base (virtual) address "KERNBASE" to the actual
438 * (physical) address starting relative to 0]
439 */
440 void
441 pmap_bootstrap(firstaddr)
442 vm_paddr_t *firstaddr;
443 {
444 vm_offset_t va;
445 pt_entry_t *pte;
446
447 avail_start = *firstaddr;
448
449 /*
450 * Create an initial set of page tables to run the kernel in.
451 */
452 create_pagetables();
453 *firstaddr = avail_start;
454
455 virtual_avail = (vm_offset_t) KERNBASE + avail_start;
456 virtual_avail = pmap_kmem_choose(virtual_avail);
457
458 virtual_end = VM_MAX_KERNEL_ADDRESS;
459
460
461 /* XXX do %cr0 as well */
462 load_cr4(rcr4() | CR4_PGE | CR4_PSE);
463 load_cr3(KPML4phys);
464
465 /*
466 * Initialize protection array.
467 */
468 amd64_protection_init();
469
470 /*
471 * Initialize the kernel pmap (which is statically allocated).
472 */
473 kernel_pmap->pm_pml4 = (pdp_entry_t *) (KERNBASE + KPML4phys);
474 kernel_pmap->pm_active = -1; /* don't allow deactivation */
475 TAILQ_INIT(&kernel_pmap->pm_pvlist);
476 LIST_INIT(&allpmaps);
477 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
478 mtx_lock_spin(&allpmaps_lock);
479 LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
480 mtx_unlock_spin(&allpmaps_lock);
481 nkpt = NKPT;
482
483 /*
484 * Reserve some special page table entries/VA space for temporary
485 * mapping of pages.
486 */
487 #define SYSMAP(c, p, v, n) \
488 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
489
490 va = virtual_avail;
491 pte = vtopte(va);
492
493 /*
494 * CMAP1 is only used for the memory test.
495 */
496 SYSMAP(caddr_t, CMAP1, CADDR1, 1)
497
498 /*
499 * Crashdump maps.
500 */
501 SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);
502
503 /*
504 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
505 * XXX ptmmap is not used.
506 */
507 SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
508
509 /*
510 * msgbufp is used to map the system message buffer.
511 * XXX msgbufmap is not used.
512 */
513 SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
514 atop(round_page(MSGBUF_SIZE)))
515
516 virtual_avail = va;
517
518 *CMAP1 = 0;
519
520 invltlb();
521 }
522
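/*
 * Example of how the SYSMAP() macro in pmap_bootstrap() expands
 * (illustrative only):
 *
 *	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
 *
 * becomes
 *
 *	CADDR1 = (caddr_t)va; va += ((1)*PAGE_SIZE); CMAP1 = pte; pte += (1);
 *
 * i.e. CADDR1 receives one page of kernel VA and CMAP1 points at the PTE
 * (obtained from vtopte()) that will later be loaded to back that VA.
 */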
523 static void *
524 pmap_pv_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
525 {
526 *flags = UMA_SLAB_PRIV;
527 return (void *)kmem_alloc(kernel_map, bytes);
528 }
529
530 void *
531 uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
532 {
533 static vm_pindex_t colour;
534 vm_page_t m;
535 int pflags;
536 void *va;
537
538 *flags = UMA_SLAB_PRIV;
539
540 if ((wait & (M_NOWAIT|M_USE_RESERVE)) == M_NOWAIT)
541 pflags = VM_ALLOC_INTERRUPT;
542 else
543 pflags = VM_ALLOC_SYSTEM;
544
545 if (wait & M_ZERO)
546 pflags |= VM_ALLOC_ZERO;
547
548 for (;;) {
549 m = vm_page_alloc(NULL, colour++, pflags | VM_ALLOC_NOOBJ);
550 if (m == NULL) {
551 if (wait & M_NOWAIT)
552 return (NULL);
553 else
554 VM_WAIT;
555 } else
556 break;
557 }
558
559 va = (void *)PHYS_TO_DMAP(m->phys_addr);
560 if ((m->flags & PG_ZERO) == 0)
561 pagezero(va);
562 return (va);
563 }
564
565 void
566 uma_small_free(void *mem, int size, u_int8_t flags)
567 {
568 vm_page_t m;
569
570 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)mem));
571 vm_page_lock_queues();
572 vm_page_free(m);
573 vm_page_unlock_queues();
574 }
575
576 /*
577 * Initialize the pmap module.
578 * Called by vm_init, to initialize any structures that the pmap
579 * system needs to map virtual memory.
580  *	pmap_init has been enhanced to support discontiguous physical
581  *	memory in a fairly consistent way.
582 */
583 void
584 pmap_init(phys_start, phys_end)
585 vm_paddr_t phys_start, phys_end;
586 {
587 int i;
588 int initial_pvs;
589
590 /*
591 * Allocate memory for random pmap data structures. Includes the
592 * pv_head_table.
593 */
594
595 for(i = 0; i < vm_page_array_size; i++) {
596 vm_page_t m;
597
598 m = &vm_page_array[i];
599 TAILQ_INIT(&m->md.pv_list);
600 m->md.pv_list_count = 0;
601 }
602
603 /*
604 * init the pv free list
605 */
606 initial_pvs = vm_page_array_size;
607 if (initial_pvs < MINPV)
608 initial_pvs = MINPV;
609 pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL,
610 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM);
611 uma_zone_set_allocf(pvzone, pmap_pv_allocf);
612 uma_prealloc(pvzone, initial_pvs);
613
614 /*
615 * Now it is safe to enable pv_table recording.
616 */
617 pmap_initialized = TRUE;
618 }
619
620 /*
621 * Initialize the address space (zone) for the pv_entries. Set a
622 * high water mark so that the system can recover from excessive
623 * numbers of pv entries.
624 */
625 void
626 pmap_init2()
627 {
628 int shpgperproc = PMAP_SHPGPERPROC;
629
630 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
631 pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
632 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
633 pv_entry_high_water = 9 * (pv_entry_max / 10);
634 uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
635 }
636
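/*
 * Worked example of the pv entry limits computed in pmap_init2() above
 * (illustrative numbers only): with the default PMAP_SHPGPERPROC of 200,
 * maxproc = 1000 and roughly 262144 vm_page_array entries (1 GB of RAM),
 *
 *	pv_entry_max        = 200 * 1000 + 262144 = 462144
 *	pv_entry_high_water = 9 * (462144 / 10)   = 415926
 *
 * Both inputs can be overridden with the vm.pmap.shpgperproc and
 * vm.pmap.pv_entries loader tunables, as fetched above.
 */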
637
638 /***************************************************
639 * Low level helper routines.....
640 ***************************************************/
641
642 #if defined(PMAP_DIAGNOSTIC)
643
644 /*
645 * This code checks for non-writeable/modified pages.
646 * This should be an invalid condition.
647 */
648 static int
649 pmap_nw_modified(pt_entry_t ptea)
650 {
651 int pte;
652
653 pte = (int) ptea;
654
655 if ((pte & (PG_M|PG_RW)) == PG_M)
656 return 1;
657 else
658 return 0;
659 }
660 #endif
661
662
663 /*
664 * this routine defines the region(s) of memory that should
665 * not be tested for the modified bit.
666 */
667 static PMAP_INLINE int
668 pmap_track_modified(vm_offset_t va)
669 {
670 if ((va < kmi.clean_sva) || (va >= kmi.clean_eva))
671 return 1;
672 else
673 return 0;
674 }
675
676 /*
677 * Normal invalidation functions.
678 * We inline these within pmap.c for speed.
679 */
680 PMAP_INLINE void
681 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
682 {
683
684 if (pmap == kernel_pmap || pmap->pm_active)
685 invlpg(va);
686 }
687
688 PMAP_INLINE void
689 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
690 {
691 vm_offset_t addr;
692
693 if (pmap == kernel_pmap || pmap->pm_active)
694 for (addr = sva; addr < eva; addr += PAGE_SIZE)
695 invlpg(addr);
696 }
697
698 PMAP_INLINE void
699 pmap_invalidate_all(pmap_t pmap)
700 {
701
702 if (pmap == kernel_pmap || pmap->pm_active)
703 invltlb();
704 }
705
706 /*
707 * Are we current address space or kernel?
708 */
709 static __inline int
710 pmap_is_current(pmap_t pmap)
711 {
712 return (pmap == kernel_pmap ||
713 (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME));
714 }
715
716 /*
717 * Routine: pmap_extract
718 * Function:
719 * Extract the physical page address associated
720 * with the given map/virtual_address pair.
721 */
722 vm_paddr_t
723 pmap_extract(pmap, va)
724 register pmap_t pmap;
725 vm_offset_t va;
726 {
727 vm_paddr_t rtval;
728 pt_entry_t *pte;
729 pd_entry_t pde, *pdep;
730
731 if (pmap == 0)
732 return 0;
733 pdep = pmap_pde(pmap, va);
734 if (pdep) {
735 pde = *pdep;
736 if (pde) {
737 if ((pde & PG_PS) != 0) {
738 rtval = (pde & ~PDRMASK) | (va & PDRMASK);
739 return rtval;
740 }
741 pte = pmap_pte(pmap, va);
742 rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK));
743 return rtval;
744 }
745 }
746 return 0;
747
748 }
749
750 vm_paddr_t
751 pmap_kextract(vm_offset_t va)
752 {
753 pd_entry_t *pde;
754 vm_paddr_t pa;
755
756 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
757 pa = DMAP_TO_PHYS(va);
758 } else {
759 pde = pmap_pde(kernel_pmap, va);
760 if (*pde & PG_PS) {
761 pa = (*pde & ~(NBPDR - 1)) | (va & (NBPDR - 1));
762 } else {
763 pa = *vtopte(va);
764 pa = (pa & PG_FRAME) | (va & PAGE_MASK);
765 }
766 }
767 return pa;
768 }
769
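/*
 * The DMAP fast path in pmap_kextract() relies on the direct map being a
 * simple linear alias of physical memory, so the translation is pure
 * arithmetic.  A minimal round-trip sketch follows; pmap_dmap_example is a
 * hypothetical name and 0x100000 an arbitrary physical address
 * (illustrative only, not part of the original file):
 */
#if 0
static void
pmap_dmap_example(void)
{
	vm_paddr_t pa = 0x100000;			/* arbitrary PA */
	vm_offset_t va = PHYS_TO_DMAP(pa);		/* direct-mapped alias */

	KASSERT(pmap_kextract(va) == pa, ("DMAP round trip failed"));
}
#endif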
770 /***************************************************
771 * Low level mapping routines.....
772 ***************************************************/
773
774 /*
775 * Add a wired page to the kva.
776 * Note: not SMP coherent.
777 */
778 PMAP_INLINE void
779 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
780 {
781 pt_entry_t *pte;
782
783 pte = vtopte(va);
784 pte_store(pte, pa | PG_RW | PG_V | PG_G);
785 }
786
787 /*
788 * Remove a page from the kernel pagetables.
789 * Note: not SMP coherent.
790 */
791 PMAP_INLINE void
792 pmap_kremove(vm_offset_t va)
793 {
794 pt_entry_t *pte;
795
796 pte = vtopte(va);
797 pte_clear(pte);
798 }
799
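/*
 * Minimal usage sketch for the two routines above (illustrative only; "va"
 * and "pa" are placeholders, not names from this file).  Neither routine
 * flushes the TLB, so callers that may have a stale translation follow up
 * with an invalidation, as pmap_map() and pmap_qenter() below do:
 */
#if 0
	pmap_kenter(va, pa);				/* install the mapping */
	pmap_invalidate_page(kernel_pmap, va);		/* flush any stale entry */
	/* ... use the page through va ... */
	pmap_kremove(va);				/* tear the mapping down */
	pmap_invalidate_page(kernel_pmap, va);
#endif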
800 /*
801 * Used to map a range of physical addresses into kernel
802 * virtual address space.
803 *
804 * The value passed in '*virt' is a suggested virtual address for
805 * the mapping. Architectures which can support a direct-mapped
806 * physical to virtual region can return the appropriate address
807 * within that region, leaving '*virt' unchanged. Other
808 * architectures should map the pages starting at '*virt' and
809 * update '*virt' with the first usable address after the mapped
810 * region.
811 */
812 vm_offset_t
813 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
814 {
815 vm_offset_t va, sva;
816
817 va = sva = *virt;
818 while (start < end) {
819 pmap_kenter(va, start);
820 va += PAGE_SIZE;
821 start += PAGE_SIZE;
822 }
823 pmap_invalidate_range(kernel_pmap, sva, va);
824 *virt = va;
825 return (sva);
826 }
827
828
829 /*
830  * Add a list of wired pages to the kva.
831  * This routine is only used for temporary
832 * kernel mappings that do not need to have
833 * page modification or references recorded.
834 * Note that old mappings are simply written
835 * over. The page *must* be wired.
836 * Note: SMP coherent. Uses a ranged shootdown IPI.
837 */
838 void
839 pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
840 {
841 vm_offset_t va;
842
843 va = sva;
844 while (count-- > 0) {
845 pmap_kenter(va, VM_PAGE_TO_PHYS(*m));
846 va += PAGE_SIZE;
847 m++;
848 }
849 pmap_invalidate_range(kernel_pmap, sva, va);
850 }
851
852 /*
853 * This routine tears out page mappings from the
854 * kernel -- it is meant only for temporary mappings.
855 * Note: SMP coherent. Uses a ranged shootdown IPI.
856 */
857 void
858 pmap_qremove(vm_offset_t sva, int count)
859 {
860 vm_offset_t va;
861
862 va = sva;
863 while (count-- > 0) {
864 pmap_kremove(va);
865 va += PAGE_SIZE;
866 }
867 pmap_invalidate_range(kernel_pmap, sva, va);
868 }
869
870 static vm_page_t
871 pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
872 {
873 vm_page_t m;
874
875 retry:
876 m = vm_page_lookup(object, pindex);
877 if (m != NULL) {
878 vm_page_lock_queues();
879 if (vm_page_sleep_if_busy(m, FALSE, "pplookp"))
880 goto retry;
881 vm_page_unlock_queues();
882 }
883 return m;
884 }
885
886 #ifndef KSTACK_MAX_PAGES
887 #define KSTACK_MAX_PAGES 32
888 #endif
889
890 /*
891 * Create the kernel stack (including pcb for amd64) for a new thread.
892  * This routine directly affects fork performance for a process and
893  * creation performance for a thread.
894 */
895 void
896 pmap_new_thread(struct thread *td, int pages)
897 {
898 int i;
899 vm_page_t ma[KSTACK_MAX_PAGES];
900 vm_object_t ksobj;
901 vm_page_t m;
902 vm_offset_t ks;
903
904 /* Bounds check */
905 if (pages <= 1)
906 pages = KSTACK_PAGES;
907 else if (pages > KSTACK_MAX_PAGES)
908 pages = KSTACK_MAX_PAGES;
909
910 /*
911 * allocate object for the kstack
912 */
913 ksobj = vm_object_allocate(OBJT_DEFAULT, pages);
914 td->td_kstack_obj = ksobj;
915
916 /* get a kernel virtual address for the kstack for this thread */
917 #ifdef KSTACK_GUARD
918 ks = kmem_alloc_nofault(kernel_map, (pages + 1) * PAGE_SIZE);
919 if (ks == 0)
920 panic("pmap_new_thread: kstack allocation failed");
921 if (*vtopte(ks) != 0)
922 pmap_qremove(ks, 1);
923 ks += PAGE_SIZE;
924 td->td_kstack = ks;
925 #else
926 /* get a kernel virtual address for the kstack for this thread */
927 ks = kmem_alloc_nofault(kernel_map, pages * PAGE_SIZE);
928 if (ks == 0)
929 panic("pmap_new_thread: kstack allocation failed");
930 td->td_kstack = ks;
931 #endif
932 /*
933 * Knowing the number of pages allocated is useful when you
934 * want to deallocate them.
935 */
936 td->td_kstack_pages = pages;
937
938 /*
939 * For the length of the stack, link in a real page of ram for each
940 * page of stack.
941 */
942 for (i = 0; i < pages; i++) {
943 /*
944 * Get a kernel stack page
945 */
946 m = vm_page_grab(ksobj, i,
947 VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED);
948 ma[i] = m;
949
950 vm_page_lock_queues();
951 vm_page_wakeup(m);
952 vm_page_flag_clear(m, PG_ZERO);
953 m->valid = VM_PAGE_BITS_ALL;
954 vm_page_unlock_queues();
955 }
956 pmap_qenter(ks, ma, pages);
957 }
958
959 /*
960 * Dispose the kernel stack for a thread that has exited.
961  * This routine directly impacts the exit performance of a process and thread.
962 */
963 void
964 pmap_dispose_thread(td)
965 struct thread *td;
966 {
967 int i;
968 int pages;
969 vm_object_t ksobj;
970 vm_offset_t ks;
971 vm_page_t m;
972
973 pages = td->td_kstack_pages;
974 ksobj = td->td_kstack_obj;
975 ks = td->td_kstack;
976 pmap_qremove(ks, pages);
977 for (i = 0; i < pages; i++) {
978 m = vm_page_lookup(ksobj, i);
979 if (m == NULL)
980 panic("pmap_dispose_thread: kstack already missing?");
981 vm_page_lock_queues();
982 vm_page_busy(m);
983 vm_page_unwire(m, 0);
984 vm_page_free(m);
985 vm_page_unlock_queues();
986 }
987 /*
988 * Free the space that this stack was mapped to in the kernel
989 * address map.
990 */
991 #ifdef KSTACK_GUARD
992 kmem_free(kernel_map, ks - PAGE_SIZE, (pages + 1) * PAGE_SIZE);
993 #else
994 kmem_free(kernel_map, ks, pages * PAGE_SIZE);
995 #endif
996 vm_object_deallocate(ksobj);
997 }
998
999 /*
1000 * Set up a variable sized alternate kstack. Though it may look MI, it may
1001 * need to be different on certain arches like ia64.
1002 */
1003 void
1004 pmap_new_altkstack(struct thread *td, int pages)
1005 {
1006 /* shuffle the original stack */
1007 td->td_altkstack_obj = td->td_kstack_obj;
1008 td->td_altkstack = td->td_kstack;
1009 td->td_altkstack_pages = td->td_kstack_pages;
1010
1011 pmap_new_thread(td, pages);
1012 }
1013
1014 void
1015 pmap_dispose_altkstack(td)
1016 struct thread *td;
1017 {
1018 pmap_dispose_thread(td);
1019
1020 /* restore the original kstack */
1021 td->td_kstack = td->td_altkstack;
1022 td->td_kstack_obj = td->td_altkstack_obj;
1023 td->td_kstack_pages = td->td_altkstack_pages;
1024 td->td_altkstack = 0;
1025 td->td_altkstack_obj = NULL;
1026 td->td_altkstack_pages = 0;
1027 }
1028
1029 /*
1030 * Allow the Kernel stack for a thread to be prejudicially paged out.
1031 */
1032 void
1033 pmap_swapout_thread(td)
1034 struct thread *td;
1035 {
1036 int i;
1037 int pages;
1038 vm_object_t ksobj;
1039 vm_offset_t ks;
1040 vm_page_t m;
1041
1042 pages = td->td_kstack_pages;
1043 ksobj = td->td_kstack_obj;
1044 ks = td->td_kstack;
1045 pmap_qremove(ks, pages);
1046 for (i = 0; i < pages; i++) {
1047 m = vm_page_lookup(ksobj, i);
1048 if (m == NULL)
1049 panic("pmap_swapout_thread: kstack already missing?");
1050 vm_page_lock_queues();
1051 vm_page_dirty(m);
1052 vm_page_unwire(m, 0);
1053 vm_page_unlock_queues();
1054 }
1055 }
1056
1057 /*
1058 * Bring the kernel stack for a specified thread back in.
1059 */
1060 void
1061 pmap_swapin_thread(td)
1062 struct thread *td;
1063 {
1064 int i, rv;
1065 int pages;
1066 vm_page_t ma[KSTACK_MAX_PAGES];
1067 vm_object_t ksobj;
1068 vm_offset_t ks;
1069 vm_page_t m;
1070
1071 pages = td->td_kstack_pages;
1072 ksobj = td->td_kstack_obj;
1073 ks = td->td_kstack;
1074 for (i = 0; i < pages; i++) {
1075 m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
1076 if (m->valid != VM_PAGE_BITS_ALL) {
1077 rv = vm_pager_get_pages(ksobj, &m, 1, 0);
1078 if (rv != VM_PAGER_OK)
1079 panic("pmap_swapin_thread: cannot get kstack for proc: %d\n", td->td_proc->p_pid);
1080 m = vm_page_lookup(ksobj, i);
1081 m->valid = VM_PAGE_BITS_ALL;
1082 }
1083 ma[i] = m;
1084 vm_page_lock_queues();
1085 vm_page_wire(m);
1086 vm_page_wakeup(m);
1087 vm_page_unlock_queues();
1088 }
1089 pmap_qenter(ks, ma, pages);
1090 }
1091
1092 /***************************************************
1093 * Page table page management routines.....
1094 ***************************************************/
1095
1096 /*
1097 * This routine unholds page table pages, and if the hold count
1098 * drops to zero, then it decrements the wire count.
1099 */
1100 static int
1101 _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m)
1102 {
1103
1104 while (vm_page_sleep_if_busy(m, FALSE, "pmuwpt"))
1105 vm_page_lock_queues();
1106
1107 if (m->hold_count == 0) {
1108 vm_offset_t pteva;
1109 /*
1110 * unmap the page table page
1111 */
1112 if (m->pindex >= (NUPDE + NUPDPE)) {
1113 /* PDP page */
1114 pml4_entry_t *pml4;
1115 pml4 = pmap_pml4e(pmap, va);
1116 pteva = (vm_offset_t) PDPmap + amd64_ptob(m->pindex - (NUPDE + NUPDPE));
1117 *pml4 = 0;
1118 } else if (m->pindex >= NUPDE) {
1119 /* PD page */
1120 pdp_entry_t *pdp;
1121 pdp = pmap_pdpe(pmap, va);
1122 pteva = (vm_offset_t) PDmap + amd64_ptob(m->pindex - NUPDE);
1123 *pdp = 0;
1124 } else {
1125 /* PTE page */
1126 pd_entry_t *pd;
1127 pd = pmap_pde(pmap, va);
1128 pteva = (vm_offset_t) PTmap + amd64_ptob(m->pindex);
1129 *pd = 0;
1130 }
1131 --pmap->pm_stats.resident_count;
1132 if (m->pindex < NUPDE) {
1133 /* Unhold the PD page */
1134 vm_page_t pdpg;
1135 pdpg = vm_page_lookup(pmap->pm_pteobj, NUPDE + pmap_pdpe_index(va));
1136 while (vm_page_sleep_if_busy(pdpg, FALSE, "pulook"))
1137 vm_page_lock_queues();
1138 vm_page_unhold(pdpg);
1139 if (pdpg->hold_count == 0)
1140 _pmap_unwire_pte_hold(pmap, va, pdpg);
1141 }
1142 if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
1143 /* Unhold the PDP page */
1144 vm_page_t pdppg;
1145 pdppg = vm_page_lookup(pmap->pm_pteobj, NUPDE + NUPDPE + pmap_pml4e_index(va));
1146 while (vm_page_sleep_if_busy(pdppg, FALSE, "pulooK"))
1147 vm_page_lock_queues();
1148 vm_page_unhold(pdppg);
1149 if (pdppg->hold_count == 0)
1150 _pmap_unwire_pte_hold(pmap, va, pdppg);
1151 }
1152 if (pmap_is_current(pmap)) {
1153 /*
1154 * Do an invltlb to make the invalidated mapping
1155 * take effect immediately.
1156 */
1157 pmap_invalidate_page(pmap, pteva);
1158 }
1159
1160 /*
1161 * If the page is finally unwired, simply free it.
1162 */
1163 --m->wire_count;
1164 if (m->wire_count == 0) {
1165 vm_page_busy(m);
1166 vm_page_free_zero(m);
1167 atomic_subtract_int(&cnt.v_wire_count, 1);
1168 }
1169 return 1;
1170 }
1171 return 0;
1172 }
1173
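/*
 * Layout of pm_pteobj page indexes assumed by _pmap_unwire_pte_hold()
 * above and by _pmap_allocpte() below (editor's summary of the code, not
 * part of the original comments):
 *
 *	[0, NUPDE)			page table (PT) pages, one per
 *					2 MB of user address space
 *	[NUPDE, NUPDE + NUPDPE)		page directory (PD) pages
 *	[NUPDE + NUPDPE,
 *	 NUPDE + NUPDPE + NUPML4E)	page directory pointer (PDP) pages
 *	NUPDE + NUPDPE + NUPML4E	the PML4 page itself (see pmap_pinit())
 *
 * Releasing a PT page unholds its parent PD page, which may in turn
 * release the PD and then the PDP page; that is why the routine recurses.
 */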
1174 static PMAP_INLINE int
1175 pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m)
1176 {
1177 vm_page_unhold(m);
1178 if (m->hold_count == 0)
1179 return _pmap_unwire_pte_hold(pmap, va, m);
1180 else
1181 return 0;
1182 }
1183
1184 /*
1185 * After removing a page table entry, this routine is used to
1186 * conditionally free the page, and manage the hold/wire counts.
1187 */
1188 static int
1189 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
1190 {
1191 vm_pindex_t ptepindex;
1192
1193 if (va >= VM_MAXUSER_ADDRESS)
1194 return 0;
1195
1196 if (mpte == NULL) {
1197 ptepindex = pmap_pde_pindex(va);
1198 if (pmap->pm_pteobj->root &&
1199 pmap->pm_pteobj->root->pindex == ptepindex) {
1200 mpte = pmap->pm_pteobj->root;
1201 } else {
1202 while ((mpte = vm_page_lookup(pmap->pm_pteobj, ptepindex)) != NULL &&
1203 vm_page_sleep_if_busy(mpte, FALSE, "pulook"))
1204 vm_page_lock_queues();
1205 }
1206 }
1207
1208 return pmap_unwire_pte_hold(pmap, va, mpte);
1209 }
1210
1211 void
1212 pmap_pinit0(pmap)
1213 struct pmap *pmap;
1214 {
1215
1216 pmap->pm_pml4 = (pml4_entry_t *)(KERNBASE + KPML4phys);
1217 pmap->pm_active = 0;
1218 TAILQ_INIT(&pmap->pm_pvlist);
1219 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1220 mtx_lock_spin(&allpmaps_lock);
1221 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1222 mtx_unlock_spin(&allpmaps_lock);
1223 }
1224
1225 /*
1226 * Initialize a preallocated and zeroed pmap structure,
1227 * such as one in a vmspace structure.
1228 */
1229 void
1230 pmap_pinit(pmap)
1231 register struct pmap *pmap;
1232 {
1233 vm_page_t pml4pg;
1234
1235 /*
1236 * allocate object for the ptes
1237 */
1238 if (pmap->pm_pteobj == NULL)
1239 pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, NUPDE + NUPDPE + NUPML4E + 1);
1240
1241 /*
1242 * allocate the page directory page
1243 */
1244 pml4pg = vm_page_grab(pmap->pm_pteobj, NUPDE + NUPDPE + NUPML4E,
1245 VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1246 vm_page_lock_queues();
1247 vm_page_flag_clear(pml4pg, PG_BUSY);
1248 pml4pg->valid = VM_PAGE_BITS_ALL;
1249 vm_page_unlock_queues();
1250
1251 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
1252
1253 if ((pml4pg->flags & PG_ZERO) == 0)
1254 bzero(pmap->pm_pml4, PAGE_SIZE);
1255
1256 mtx_lock_spin(&allpmaps_lock);
1257 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1258 mtx_unlock_spin(&allpmaps_lock);
1259
1260 /* Wire in kernel global address entries. */
1261 pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
1262 pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U;
1263
1264 /* install self-referential address mapping entry(s) */
1265 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
1266
1267 pmap->pm_active = 0;
1268 TAILQ_INIT(&pmap->pm_pvlist);
1269 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1270 }
1271
1272 /*
1273 * Wire in kernel global address entries. To avoid a race condition
1274 * between pmap initialization and pmap_growkernel, this procedure
1275 * should be called after the vmspace is attached to the process
1276 * but before this pmap is activated.
1277 */
1278 void
1279 pmap_pinit2(pmap)
1280 struct pmap *pmap;
1281 {
1282 /* XXX: Remove this stub when no longer called */
1283 }
1284
1285 /*
1286 * this routine is called if the page table page is not
1287 * mapped correctly.
1288 */
1289 static vm_page_t
1290 _pmap_allocpte(pmap, ptepindex)
1291 pmap_t pmap;
1292 vm_pindex_t ptepindex;
1293 {
1294 vm_page_t m, pdppg, pdpg;
1295
1296 /*
1297 * Find or fabricate a new pagetable page
1298 */
1299 m = vm_page_grab(pmap->pm_pteobj, ptepindex,
1300 VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
1301
1302 KASSERT(m->queue == PQ_NONE,
1303 ("_pmap_allocpte: %p->queue != PQ_NONE", m));
1304
1305 /*
1306 * Increment the hold count for the page table page
1307 * (denoting a new mapping.)
1308 */
1309 m->hold_count++;
1310
1311 /*
1312 * Map the pagetable page into the process address space, if
1313 * it isn't already there.
1314 */
1315
1316 pmap->pm_stats.resident_count++;
1317
1318 if (ptepindex >= (NUPDE + NUPDPE)) {
1319 pml4_entry_t *pml4;
1320 vm_pindex_t pml4index;
1321
1322 /* Wire up a new PDPE page */
1323 pml4index = ptepindex - (NUPDE + NUPDPE);
1324 pml4 = &pmap->pm_pml4[pml4index];
1325 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1326
1327 } else if (ptepindex >= NUPDE) {
1328 vm_pindex_t pml4index;
1329 vm_pindex_t pdpindex;
1330 pml4_entry_t *pml4;
1331 pdp_entry_t *pdp;
1332
1333 /* Wire up a new PDE page */
1334 pdpindex = ptepindex - NUPDE;
1335 pml4index = pdpindex >> NPML4EPGSHIFT;
1336
1337 pml4 = &pmap->pm_pml4[pml4index];
1338 if ((*pml4 & PG_V) == 0) {
1339 /* Have to allocate a new pdp, recurse */
1340 _pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index);
1341 } else {
1342 /* Add reference to pdp page */
1343 pdppg = pmap_page_lookup(pmap->pm_pteobj, NUPDE + NUPDPE + pml4index);
1344 pdppg->hold_count++;
1345 }
1346 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1347
1348 /* Now find the pdp page */
1349 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
1350 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1351
1352 } else {
1353 vm_pindex_t pml4index;
1354 vm_pindex_t pdpindex;
1355 pml4_entry_t *pml4;
1356 pdp_entry_t *pdp;
1357 pd_entry_t *pd;
1358
1359 /* Wire up a new PTE page */
1360 pdpindex = ptepindex >> NPDPEPGSHIFT;
1361 pml4index = pdpindex >> NPML4EPGSHIFT;
1362
1363 		/* First, find the pdp and check that it's valid. */
1364 pml4 = &pmap->pm_pml4[pml4index];
1365 if ((*pml4 & PG_V) == 0) {
1366 /* Have to allocate a new pd, recurse */
1367 _pmap_allocpte(pmap, NUPDE + pdpindex);
1368 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1369 pdp = &pdp[pdpindex];
1370 } else {
1371 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
1372 pdp = &pdp[pdpindex];
1373 if ((*pdp & PG_V) == 0) {
1374 /* Have to allocate a new pd, recurse */
1375 _pmap_allocpte(pmap, NUPDE + pdpindex);
1376 } else {
1377 /* Add reference to the pd page */
1378 pdpg = pmap_page_lookup(pmap->pm_pteobj, NUPDE + pdpindex);
1379 pdpg->hold_count++;
1380 }
1381 }
1382 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
1383
1384 /* Now we know where the page directory page is */
1385 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
1386 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
1387 }
1388
1389 /*
1390 * Try to use the new mapping, but if we cannot, then
1391 * do it with the routine that maps the page explicitly.
1392 */
1393 if ((m->flags & PG_ZERO) == 0)
1394 pmap_zero_page(m);
1395 vm_page_lock_queues();
1396 m->valid = VM_PAGE_BITS_ALL;
1397 vm_page_flag_clear(m, PG_ZERO);
1398 vm_page_wakeup(m);
1399 vm_page_unlock_queues();
1400
1401 return m;
1402 }
1403
1404 static vm_page_t
1405 pmap_allocpte(pmap_t pmap, vm_offset_t va)
1406 {
1407 vm_pindex_t ptepindex;
1408 pd_entry_t *pd;
1409 vm_page_t m;
1410
1411 /*
1412 * Calculate pagetable page index
1413 */
1414 ptepindex = pmap_pde_pindex(va);
1415
1416 /*
1417 * Get the page directory entry
1418 */
1419 pd = pmap_pde(pmap, va);
1420
1421 /*
1422 * This supports switching from a 2MB page to a
1423 * normal 4K page.
1424 */
1425 if (pd != 0 && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
1426 *pd = 0;
1427 pd = 0;
1428 pmap_invalidate_all(kernel_pmap);
1429 }
1430
1431 /*
1432 * If the page table page is mapped, we just increment the
1433 * hold count, and activate it.
1434 */
1435 if (pd != 0 && (*pd & PG_V) != 0) {
1436 /*
1437 * In order to get the page table page, try the
1438 * hint first.
1439 */
1440 if (pmap->pm_pteobj->root &&
1441 (pmap->pm_pteobj->root->pindex == ptepindex)) {
1442 m = pmap->pm_pteobj->root;
1443 } else {
1444 m = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
1445 }
1446 m->hold_count++;
1447 return m;
1448 }
1449 /*
1450 * Here if the pte page isn't mapped, or if it has been deallocated.
1451 */
1452 m = _pmap_allocpte(pmap, ptepindex);
1453 return m;
1454 }
1455
1456
1457 /***************************************************
1458 * Pmap allocation/deallocation routines.
1459 ***************************************************/
1460
1461 /*
1462 * Release any resources held by the given physical map.
1463 * Called when a pmap initialized by pmap_pinit is being released.
1464 * Should only be called if the map contains no valid mappings.
1465 */
1466 void
1467 pmap_release(pmap_t pmap)
1468 {
1469 vm_object_t object;
1470 vm_page_t m;
1471
1472 object = pmap->pm_pteobj;
1473
1474 KASSERT(object->ref_count == 1,
1475 ("pmap_release: pteobj reference count %d != 1",
1476 object->ref_count));
1477 KASSERT(pmap->pm_stats.resident_count == 0,
1478 ("pmap_release: pmap resident count %ld != 0",
1479 pmap->pm_stats.resident_count));
1480
1481 mtx_lock_spin(&allpmaps_lock);
1482 LIST_REMOVE(pmap, pm_list);
1483 mtx_unlock_spin(&allpmaps_lock);
1484
1485 vm_page_lock_queues();
1486 while ((m = TAILQ_FIRST(&object->memq)) != NULL) {
1487 m->wire_count--;
1488 atomic_subtract_int(&cnt.v_wire_count, 1);
1489 vm_page_busy(m);
1490 vm_page_free(m);
1491 }
1492 KASSERT(TAILQ_EMPTY(&object->memq),
1493 ("pmap_release: leaking page table pages"));
1494 vm_page_unlock_queues();
1495 }
1496
1497 static int
1498 kvm_size(SYSCTL_HANDLER_ARGS)
1499 {
1500 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1501
1502 return sysctl_handle_long(oidp, &ksize, 0, req);
1503 }
1504 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1505 0, 0, kvm_size, "IU", "Size of KVM");
1506
1507 static int
1508 kvm_free(SYSCTL_HANDLER_ARGS)
1509 {
1510 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1511
1512 return sysctl_handle_long(oidp, &kfree, 0, req);
1513 }
1514 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1515 0, 0, kvm_free, "IU", "Amount of KVM free");
1516
1517 /*
1518 * grow the number of kernel page table entries, if needed
1519 */
1520 void
1521 pmap_growkernel(vm_offset_t addr)
1522 {
1523 int s;
1524 vm_paddr_t ptppaddr;
1525 vm_page_t nkpg;
1526 pd_entry_t newpdir;
1527
1528 s = splhigh();
1529 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1530 if (kernel_vm_end == 0) {
1531 kernel_vm_end = KERNBASE;
1532 nkpt = 0;
1533 while ((*pmap_pde(kernel_pmap, kernel_vm_end) & PG_V) != 0) {
1534 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1535 nkpt++;
1536 }
1537 }
1538 addr = roundup2(addr, PAGE_SIZE * NPTEPG);
1539 while (kernel_vm_end < addr) {
1540 if ((*pmap_pde(kernel_pmap, kernel_vm_end) & PG_V) != 0) {
1541 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1542 continue;
1543 }
1544
1545 /*
1546 * This index is bogus, but out of the way
1547 */
1548 nkpg = vm_page_alloc(NULL, nkpt,
1549 VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1550 if (!nkpg)
1551 panic("pmap_growkernel: no memory to grow kernel");
1552
1553 nkpt++;
1554
1555 pmap_zero_page(nkpg);
1556 ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1557 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
1558 *pmap_pde(kernel_pmap, kernel_vm_end) = newpdir;
1559
1560 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1561 }
1562 splx(s);
1563 }
1564
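/*
 * Granularity note for pmap_growkernel() above (illustrative arithmetic):
 * each new page table page maps NPTEPG * PAGE_SIZE = 512 * 4096 bytes =
 * 2 MB of kernel VA, so kernel_vm_end always advances in 2 MB steps and
 * the target address is first rounded up to that boundary.  Growing the
 * kernel map by, say, 5 MB past the current end therefore allocates three
 * new page table pages (assuming none of the three slots was already
 * populated).
 */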
1565
1566 /***************************************************
1567 * page management routines.
1568 ***************************************************/
1569
1570 /*
1571 * free the pv_entry back to the free list
1572 */
1573 static PMAP_INLINE void
1574 free_pv_entry(pv_entry_t pv)
1575 {
1576 pv_entry_count--;
1577 uma_zfree(pvzone, pv);
1578 }
1579
1580 /*
1581  * Get a new pv_entry, allocating a block from the system
1582  * when needed.
1583  * The memory allocation bypasses the malloc code
1584  * because of the possibility of allocations at interrupt time.
1585 */
1586 static pv_entry_t
1587 get_pv_entry(void)
1588 {
1589 pv_entry_count++;
1590 if (pv_entry_high_water &&
1591 (pv_entry_count > pv_entry_high_water) &&
1592 (pmap_pagedaemon_waken == 0)) {
1593 pmap_pagedaemon_waken = 1;
1594 wakeup (&vm_pages_needed);
1595 }
1596 return uma_zalloc(pvzone, M_NOWAIT);
1597 }
1598
1599 /*
1600 * If it is the first entry on the list, it is actually
1601 * in the header and we must copy the following entry up
1602 * to the header. Otherwise we must search the list for
1603 * the entry. In either case we free the now unused entry.
1604 */
1605
1606 static int
1607 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1608 {
1609 pv_entry_t pv;
1610 int rtval;
1611 int s;
1612
1613 s = splvm();
1614 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1615 if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1616 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1617 if (pmap == pv->pv_pmap && va == pv->pv_va)
1618 break;
1619 }
1620 } else {
1621 TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
1622 if (va == pv->pv_va)
1623 break;
1624 }
1625 }
1626
1627 rtval = 0;
1628 if (pv) {
1629 rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem);
1630 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1631 m->md.pv_list_count--;
1632 if (TAILQ_FIRST(&m->md.pv_list) == NULL)
1633 vm_page_flag_clear(m, PG_WRITEABLE);
1634
1635 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1636 free_pv_entry(pv);
1637 }
1638
1639 splx(s);
1640 return rtval;
1641 }
1642
1643 /*
1644 * Create a pv entry for page at pa for
1645 * (pmap, va).
1646 */
1647 static void
1648 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m)
1649 {
1650
1651 int s;
1652 pv_entry_t pv;
1653
1654 s = splvm();
1655 pv = get_pv_entry();
1656 pv->pv_va = va;
1657 pv->pv_pmap = pmap;
1658 pv->pv_ptem = mpte;
1659
1660 TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1661 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1662 m->md.pv_list_count++;
1663
1664 splx(s);
1665 }
1666
1667 /*
1668  * pmap_remove_pte: unmap one page in a process, updating statistics,
1668  *		    the PV list and the page table page reference counts.
1669 */
1670 static int
1671 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va)
1672 {
1673 pt_entry_t oldpte;
1674 vm_page_t m;
1675
1676 oldpte = pte_load_clear(ptq);
1677 if (oldpte & PG_W)
1678 pmap->pm_stats.wired_count -= 1;
1679 /*
1680 	 * Machines that don't support invlpg also don't support
1681 * PG_G.
1682 */
1683 if (oldpte & PG_G)
1684 pmap_invalidate_page(kernel_pmap, va);
1685 pmap->pm_stats.resident_count -= 1;
1686 if (oldpte & PG_MANAGED) {
1687 m = PHYS_TO_VM_PAGE(oldpte);
1688 if (oldpte & PG_M) {
1689 #if defined(PMAP_DIAGNOSTIC)
1690 if (pmap_nw_modified((pt_entry_t) oldpte)) {
1691 printf(
1692 				"pmap_remove: modified page not writable: va: 0x%lx, pte: 0x%lx\n",
1693 va, oldpte);
1694 }
1695 #endif
1696 if (pmap_track_modified(va))
1697 vm_page_dirty(m);
1698 }
1699 if (oldpte & PG_A)
1700 vm_page_flag_set(m, PG_REFERENCED);
1701 return pmap_remove_entry(pmap, m, va);
1702 } else {
1703 return pmap_unuse_pt(pmap, va, NULL);
1704 }
1705
1706 return 0;
1707 }
1708
1709 /*
1710 * Remove a single page from a process address space
1711 */
1712 static void
1713 pmap_remove_page(pmap_t pmap, vm_offset_t va)
1714 {
1715 pt_entry_t *pte;
1716
1717 pte = pmap_pte(pmap, va);
1718 if (pte == NULL || (*pte & PG_V) == 0)
1719 return;
1720 pmap_remove_pte(pmap, pte, va);
1721 pmap_invalidate_page(pmap, va);
1722 }
1723
1724 /*
1725 * Remove the given range of addresses from the specified map.
1726 *
1727 * It is assumed that the start and end are properly
1728 * rounded to the page size.
1729 */
1730 void
1731 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1732 {
1733 vm_offset_t pdnxt;
1734 pd_entry_t ptpaddr, *pde;
1735 pt_entry_t *pte;
1736 int anyvalid;
1737
1738 if (pmap == NULL)
1739 return;
1740
1741 if (pmap->pm_stats.resident_count == 0)
1742 return;
1743
1744 /*
1745 	 * Special handling for removing a single page: a very
1746 	 * common operation for which we can easily short-circuit
1747 	 * some code.
1748 */
1749 if (sva + PAGE_SIZE == eva) {
1750 pde = pmap_pde(pmap, sva);
1751 if (pde && (*pde & PG_PS) == 0) {
1752 pmap_remove_page(pmap, sva);
1753 return;
1754 }
1755 }
1756
1757 anyvalid = 0;
1758
1759 for (; sva < eva; sva = pdnxt) {
1760
1761 if (pmap->pm_stats.resident_count == 0)
1762 break;
1763
1764 /*
1765 * Calculate index for next page table.
1766 */
1767 pdnxt = (sva + NBPDR) & ~PDRMASK;
1768
1769 pde = pmap_pde(pmap, sva);
1770 if (pde == 0)
1771 continue;
1772 ptpaddr = *pde;
1773
1774 /*
1775 * Weed out invalid mappings. Note: we assume that the page
1776 * directory table is always allocated, and in kernel virtual.
1777 */
1778 if (ptpaddr == 0)
1779 continue;
1780
1781 /*
1782 * Check for large page.
1783 */
1784 if ((ptpaddr & PG_PS) != 0) {
1785 *pde = 0;
1786 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1787 anyvalid = 1;
1788 continue;
1789 }
1790
1791 /*
1792 * Limit our scan to either the end of the va represented
1793 * by the current page table page, or to the end of the
1794 * range being removed.
1795 */
1796 if (pdnxt > eva)
1797 pdnxt = eva;
1798
1799 for (; sva != pdnxt; sva += PAGE_SIZE) {
1800 pte = pmap_pte(pmap, sva);
1801 if (pte == NULL || *pte == 0)
1802 continue;
1803 anyvalid = 1;
1804 if (pmap_remove_pte(pmap, pte, sva))
1805 break;
1806 }
1807 }
1808
1809 if (anyvalid)
1810 pmap_invalidate_all(pmap);
1811 }
1812
1813 /*
1814 * Routine: pmap_remove_all
1815 * Function:
1816 * Removes this physical page from
1817 * all physical maps in which it resides.
1818 * Reflects back modify bits to the pager.
1819 *
1820 * Notes:
1821 * Original versions of this routine were very
1822 * inefficient because they iteratively called
1823 * pmap_remove (slow...)
1824 */
1825
1826 void
1827 pmap_remove_all(vm_page_t m)
1828 {
1829 register pv_entry_t pv;
1830 pt_entry_t *pte, tpte;
1831 int s;
1832
1833 #if defined(PMAP_DIAGNOSTIC)
1834 /*
1835 * XXX This makes pmap_remove_all() illegal for non-managed pages!
1836 */
1837 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
1838 		panic("pmap_remove_all: illegal for unmanaged page, va: 0x%lx",
1839 VM_PAGE_TO_PHYS(m));
1840 }
1841 #endif
1842 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1843 s = splvm();
1844 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1845 pv->pv_pmap->pm_stats.resident_count--;
1846 pte = pmap_pte(pv->pv_pmap, pv->pv_va);
1847 tpte = pte_load_clear(pte);
1848 if (tpte & PG_W)
1849 pv->pv_pmap->pm_stats.wired_count--;
1850 if (tpte & PG_A)
1851 vm_page_flag_set(m, PG_REFERENCED);
1852
1853 /*
1854 * Update the vm_page_t clean and reference bits.
1855 */
1856 if (tpte & PG_M) {
1857 #if defined(PMAP_DIAGNOSTIC)
1858 if (pmap_nw_modified((pt_entry_t) tpte)) {
1859 printf(
1860 				"pmap_remove_all: modified page not writable: va: 0x%lx, pte: 0x%lx\n",
1861 pv->pv_va, tpte);
1862 }
1863 #endif
1864 if (pmap_track_modified(pv->pv_va))
1865 vm_page_dirty(m);
1866 }
1867 pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
1868 TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
1869 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1870 m->md.pv_list_count--;
1871 pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
1872 free_pv_entry(pv);
1873 }
1874 vm_page_flag_clear(m, PG_WRITEABLE);
1875 splx(s);
1876 }
1877
1878 /*
1879 * Set the physical protection on the
1880 * specified range of this map as requested.
1881 */
1882 void
1883 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1884 {
1885 vm_offset_t pdnxt;
1886 pd_entry_t ptpaddr, *pde;
1887 int anychanged;
1888
1889 if (pmap == NULL)
1890 return;
1891
1892 if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1893 pmap_remove(pmap, sva, eva);
1894 return;
1895 }
1896
1897 if (prot & VM_PROT_WRITE)
1898 return;
1899
1900 anychanged = 0;
1901
1902 for (; sva < eva; sva = pdnxt) {
1903
1904 pdnxt = (sva + NBPDR) & ~PDRMASK;
1905
1906 pde = pmap_pde(pmap, sva);
1907 if (pde == NULL)
1908 continue;
1909 ptpaddr = *pde;
1910
1911 /*
1912 * Weed out invalid mappings. Note: we assume that the page
1913 * directory table is always allocated, and in kernel virtual.
1914 */
1915 if (ptpaddr == 0)
1916 continue;
1917
1918 /*
1919 * Check for large page.
1920 */
1921 if ((ptpaddr & PG_PS) != 0) {
1922 *pde &= ~(PG_M|PG_RW);
1923 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1924 anychanged = 1;
1925 continue;
1926 }
1927
1928 if (pdnxt > eva)
1929 pdnxt = eva;
1930
1931 for (; sva != pdnxt; sva += PAGE_SIZE) {
1932 pt_entry_t pbits;
1933 pt_entry_t *pte;
1934 vm_page_t m;
1935
1936 pte = pmap_pte(pmap, sva);
1937 if (pte == NULL)
1938 continue;
1939 pbits = *pte;
1940 if (pbits & PG_MANAGED) {
1941 m = NULL;
1942 if (pbits & PG_A) {
1943 m = PHYS_TO_VM_PAGE(pbits);
1944 vm_page_flag_set(m, PG_REFERENCED);
1945 pbits &= ~PG_A;
1946 }
1947 if ((pbits & PG_M) != 0 &&
1948 pmap_track_modified(sva)) {
1949 if (m == NULL)
1950 m = PHYS_TO_VM_PAGE(pbits);
1951 vm_page_dirty(m);
1952 pbits &= ~PG_M;
1953 }
1954 }
1955
1956 pbits &= ~PG_RW;
1957
1958 if (pbits != *pte) {
1959 pte_store(pte, pbits);
1960 anychanged = 1;
1961 }
1962 }
1963 }
1964 if (anychanged)
1965 pmap_invalidate_all(pmap);
1966 }
1967
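/*
 * A minimal sketch of how the VM system typically calls pmap_enter() below
 * when resolving a fault (illustrative only; "vms", "fault_va" and "m" are
 * placeholders, not names from this file):
 */
#if 0
	pmap_enter(&vms->vm_pmap, trunc_page(fault_va), m,
	    VM_PROT_READ | VM_PROT_WRITE, FALSE);
#endif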
1968 /*
1969 * Insert the given physical page (p) at
1970 * the specified virtual address (v) in the
1971 * target physical map with the protection requested.
1972 *
1973 * If specified, the page will be wired down, meaning
1974 * that the related pte can not be reclaimed.
1975 *
1976 * NB: This is the only routine which MAY NOT lazy-evaluate
1977 * or lose information. That is, this routine must actually
1978 * insert this page into the given map NOW.
1979 */
1980 void
1981 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
1982 boolean_t wired)
1983 {
1984 vm_paddr_t pa;
1985 register pt_entry_t *pte;
1986 vm_paddr_t opa;
1987 pt_entry_t origpte, newpte;
1988 vm_page_t mpte;
1989
1990 if (pmap == NULL)
1991 return;
1992
1993 va &= PG_FRAME;
1994 #ifdef PMAP_DIAGNOSTIC
1995 if (va > VM_MAX_KERNEL_ADDRESS)
1996 panic("pmap_enter: toobig");
1997 if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
1998 		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va);
1999 #endif
2000
2001 mpte = NULL;
2002 /*
2003 * In the case that a page table page is not
2004 * resident, we are creating it here.
2005 */
2006 if (va < VM_MAXUSER_ADDRESS) {
2007 mpte = pmap_allocpte(pmap, va);
2008 }
2009 #if 0 && defined(PMAP_DIAGNOSTIC)
2010 else {
2011 pd_entry_t *pdeaddr = pmap_pde(pmap, va);
2012 origpte = *pdeaddr;
2013 if ((origpte & PG_V) == 0) {
2014 panic("pmap_enter: invalid kernel page table page, pde=%p, va=%p\n",
2015 origpte, va);
2016 }
2017 }
2018 #endif
2019
2020 pte = pmap_pte(pmap, va);
2021
2022 /*
2023 	 * Page directory table entry not valid; we need a new PT page
2024 */
2025 if (pte == NULL)
2026 panic("pmap_enter: invalid page directory va=%#lx\n", va);
2027
2028 pa = VM_PAGE_TO_PHYS(m) & PG_FRAME;
2029 origpte = *pte;
2030 opa = origpte & PG_FRAME;
2031
2032 if (origpte & PG_PS)
2033 panic("pmap_enter: attempted pmap_enter on 2MB page");
2034
2035 /*
2036 * Mapping has not changed, must be protection or wiring change.
2037 */
2038 if (origpte && (opa == pa)) {
2039 /*
2040 * Wiring change, just update stats. We don't worry about
2041 * wiring PT pages as they remain resident as long as there
2042 * are valid mappings in them. Hence, if a user page is wired,
2043 * the PT page will be also.
2044 */
2045 if (wired && ((origpte & PG_W) == 0))
2046 pmap->pm_stats.wired_count++;
2047 else if (!wired && (origpte & PG_W))
2048 pmap->pm_stats.wired_count--;
2049
2050 #if defined(PMAP_DIAGNOSTIC)
2051 if (pmap_nw_modified((pt_entry_t) origpte)) {
2052 printf(
2053 "pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n",
2054 va, origpte);
2055 }
2056 #endif
2057
2058 /*
2059 * Remove extra pte reference
2060 */
2061 if (mpte)
2062 mpte->hold_count--;
2063
2064 if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) {
2065 if ((origpte & PG_RW) == 0) {
2066 pte_store(pte, origpte | PG_RW);
2067 pmap_invalidate_page(pmap, va);
2068 }
2069 return;
2070 }
2071
2072 /*
2073 * We might be turning off write access to the page,
2074 * so we go ahead and sense modify status.
2075 */
2076 if (origpte & PG_MANAGED) {
2077 if ((origpte & PG_M) && pmap_track_modified(va)) {
2078 vm_page_t om;
2079 om = PHYS_TO_VM_PAGE(opa);
2080 vm_page_dirty(om);
2081 }
2082 pa |= PG_MANAGED;
2083 }
2084 goto validate;
2085 }
2086 /*
2087 * Mapping has changed, invalidate old range and fall through to
2088 * handle validating new mapping.
2089 */
2090 if (opa) {
2091 int err;
2092 vm_page_lock_queues();
2093 err = pmap_remove_pte(pmap, pte, va);
2094 vm_page_unlock_queues();
2095 if (err)
2096 panic("pmap_enter: pte vanished, va: 0x%lx", va);
2097 }
2098
2099 /*
2100 * Enter on the PV list if part of our managed memory. Note that we
2101 * raise IPL while manipulating pv_table since pmap_enter can be
2102 * called at interrupt time.
2103 */
2104 if (pmap_initialized &&
2105 (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
2106 pmap_insert_entry(pmap, va, mpte, m);
2107 pa |= PG_MANAGED;
2108 }
2109
2110 /*
2111 * Increment counters
2112 */
2113 pmap->pm_stats.resident_count++;
2114 if (wired)
2115 pmap->pm_stats.wired_count++;
2116
2117 validate:
2118 /*
2119 * Now validate mapping with desired protection/wiring.
2120 */
2121 newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | PG_V);
2122
2123 if (wired)
2124 newpte |= PG_W;
2125 if (va < VM_MAXUSER_ADDRESS)
2126 newpte |= PG_U;
2127 if (pmap == kernel_pmap)
2128 newpte |= PG_G;
2129
2130 /*
2131 * if the mapping or permission bits are different, we need
2132 * to update the pte.
2133 */
2134 if ((origpte & ~(PG_M|PG_A)) != newpte) {
2135 pte_store(pte, newpte | PG_A);
2136 /*if (origpte)*/ {
2137 pmap_invalidate_page(pmap, va);
2138 }
2139 }
2140 }
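/*
 * Illustrative example (hypothetical caller, sketched from the interface
 * above): a fault handler that wants page m resident at va with
 * read/write access, unwired, might do roughly
 *
 *	pmap_enter(pmap, trunc_page(va), m, VM_PROT_READ | VM_PROT_WRITE,
 *	    FALSE);
 *
 * relying on pmap_enter to allocate any missing page table page via
 * pmap_allocpte above.
 */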
2141
2142 /*
2143 * This code makes some *MAJOR* assumptions:
2144 * 1. The pmap is the current pmap and it exists.
2145 * 2. Not wired.
2146 * 3. Read access.
2147 * 4. No page table pages.
2148 * 5. The TLB flush is deferred to the calling procedure.
2149 * 6. The page IS managed.
2150 * but it is *MUCH* faster than pmap_enter...
2151 */
2152
2153 static vm_page_t
2154 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte)
2155 {
2156 pt_entry_t *pte;
2157 vm_paddr_t pa;
2158
2159 /*
2160 * In the case that a page table page is not
2161 * resident, we are creating it here.
2162 */
2163 if (va < VM_MAXUSER_ADDRESS) {
2164 vm_pindex_t ptepindex;
2165 pd_entry_t *ptepa;
2166
2167 /*
2168 * Calculate pagetable page index
2169 */
2170 ptepindex = pmap_pde_pindex(va);
2171 if (mpte && (mpte->pindex == ptepindex)) {
2172 mpte->hold_count++;
2173 } else {
2174 retry:
2175 /*
2176 * Get the page directory entry
2177 */
2178 ptepa = pmap_pde(pmap, va);
2179
2180 /*
2181 * If the page table page is mapped, we just increment
2182 * the hold count, and activate it.
2183 */
2184 if (ptepa && (*ptepa & PG_V) != 0) {
2185 if (*ptepa & PG_PS)
2186 panic("pmap_enter_quick: unexpected mapping into 2MB page");
2187 if (pmap->pm_pteobj->root &&
2188 (pmap->pm_pteobj->root->pindex == ptepindex)) {
2189 mpte = pmap->pm_pteobj->root;
2190 } else {
2191 mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
2192 }
2193 if (mpte == NULL)
2194 goto retry;
2195 mpte->hold_count++;
2196 } else {
2197 mpte = _pmap_allocpte(pmap, ptepindex);
2198 }
2199 }
2200 } else {
2201 mpte = NULL;
2202 }
2203
2204 /*
2205 * This call to vtopte makes the assumption that we are
2206 * entering the page into the current pmap. In order to support
2207 * quick entry into any pmap, one would likely use pmap_pte.
2208 * But that isn't as quick as vtopte.
2209 */
2210 pte = vtopte(va);
2211 if (*pte) {
2212 if (mpte != NULL) {
2213 vm_page_lock_queues();
2214 pmap_unwire_pte_hold(pmap, va, mpte);
2215 vm_page_unlock_queues();
2216 }
2217 return 0;
2218 }
2219
2220 /*
2221 * Enter on the PV list if part of our managed memory. Note that we
2222 * raise IPL while manipulating pv_table since pmap_enter can be
2223 * called at interrupt time.
2224 */
2225 if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0)
2226 pmap_insert_entry(pmap, va, mpte, m);
2227
2228 /*
2229 * Increment counters
2230 */
2231 pmap->pm_stats.resident_count++;
2232
2233 pa = VM_PAGE_TO_PHYS(m);
2234
2235 /*
2236 * Now validate mapping with RO protection
2237 */
2238 if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2239 pte_store(pte, pa | PG_V | PG_U);
2240 else
2241 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
2242
2243 return mpte;
2244 }
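/*
 * Note: pmap_enter_quick is the workhorse of the prefaulting paths
 * below; both pmap_object_init_pt and pmap_prefault use it to install
 * read-only mappings without a per-page TLB invalidation.
 */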
2245
2246 /*
2247 * Make a temporary mapping for a physical address. This is only intended
2248 * to be used for panic dumps.
2249 */
2250 void *
2251 pmap_kenter_temporary(vm_offset_t pa, int i)
2252 {
2253 vm_offset_t va;
2254
2255 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
2256 pmap_kenter(va, pa);
2257 invlpg(va);
2258 return ((void *)crashdumpmap);
2259 }
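/*
 * Example (hypothetical dump-time usage): to read the i-th page of a
 * chunk starting at physical address pa, a caller might do
 *
 *	p = (char *)pmap_kenter_temporary(pa + i * PAGE_SIZE, i) +
 *	    i * PAGE_SIZE;
 *
 * Note that the return value is always the base of crashdumpmap, so the
 * caller adds the slot offset itself; mapping the same slot again simply
 * replaces the previous entry.
 */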
2260
2261 #define MAX_INIT_PT (96)
2262 /*
2263 * pmap_object_init_pt preloads the ptes for a given object
2264 * into the specified pmap. This eliminates the blast of soft
2265 * faults on process startup and immediately after an mmap.
2266 */
2267 void
2268 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
2269 vm_object_t object, vm_pindex_t pindex,
2270 vm_size_t size, int limit)
2271 {
2272 vm_pindex_t tmpidx;
2273 int psize;
2274 vm_page_t p, mpte;
2275
2276 if (pmap == NULL || object == NULL)
2277 return;
2278
2279 /*
2280 * This code maps large physical mmap regions into the
2281 * processor address space. Note that some shortcuts
2282 * are taken, but the code works.
2283 */
2284 if ((object->type == OBJT_DEVICE) &&
2285 ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
2286 int i;
2287 vm_page_t m[1];
2288 int npdes;
2289 pd_entry_t ptepa, *pde;
2290
2291 pde = pmap_pde(pmap, addr);
2292 if (pde != 0 && (*pde & PG_V) != 0)
2293 return;
2294 retry:
2295 p = vm_page_lookup(object, pindex);
2296 if (p != NULL) {
2297 vm_page_lock_queues();
2298 if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
2299 goto retry;
2300 } else {
2301 p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2302 if (p == NULL)
2303 return;
2304 m[0] = p;
2305
2306 if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2307 vm_page_lock_queues();
2308 vm_page_free(p);
2309 vm_page_unlock_queues();
2310 return;
2311 }
2312
2313 p = vm_page_lookup(object, pindex);
2314 vm_page_lock_queues();
2315 vm_page_wakeup(p);
2316 }
2317 vm_page_unlock_queues();
2318
2319 ptepa = VM_PAGE_TO_PHYS(p);
2320 if (ptepa & (NBPDR - 1)) {
2321 return;
2322 }
2323
2324 p->valid = VM_PAGE_BITS_ALL;
2325
2326 pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
2327 npdes = size >> PDRSHIFT;
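/*
 * Each iteration below installs one 2MB superpage mapping: PG_PS in
 * the pde makes the CPU treat the entry as a large page, so no 4KB
 * page table is needed for this range.
 */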
2328 for(i = 0; i < npdes; i++) {
2329 pde_store(pde, ptepa | PG_U | PG_RW | PG_V | PG_PS);
2330 ptepa += NBPDR;
2331 pde++;
2332 }
2333 pmap_invalidate_all(kernel_pmap);
2334 return;
2335 }
2336
2337 psize = amd64_btop(size);
2338
2339 if ((object->type != OBJT_VNODE) ||
2340 ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
2341 (object->resident_page_count > MAX_INIT_PT))) {
2342 return;
2343 }
2344
2345 if (psize + pindex > object->size) {
2346 if (object->size < pindex)
2347 return;
2348 psize = object->size - pindex;
2349 }
2350
2351 mpte = NULL;
2352
2353 if ((p = TAILQ_FIRST(&object->memq)) != NULL) {
2354 if (p->pindex < pindex) {
2355 p = vm_page_splay(pindex, object->root);
2356 if ((object->root = p)->pindex < pindex)
2357 p = TAILQ_NEXT(p, listq);
2358 }
2359 }
2360 /*
2361 * Assert: the variable p is either (1) the page with the
2362 * least pindex greater than or equal to the parameter pindex
2363 * or (2) NULL.
2364 */
2365 for (;
2366 p != NULL && (tmpidx = p->pindex - pindex) < psize;
2367 p = TAILQ_NEXT(p, listq)) {
2368 /*
2369 * Don't allow an madvise to blow away our really
2370 * free pages by allocating pv entries.
2371 */
2372 if ((limit & MAP_PREFAULT_MADVISE) &&
2373 cnt.v_free_count < cnt.v_free_reserved) {
2374 break;
2375 }
2376 vm_page_lock_queues();
2377 if ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL &&
2378 (p->busy == 0) &&
2379 (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2380 if ((p->queue - p->pc) == PQ_CACHE)
2381 vm_page_deactivate(p);
2382 vm_page_busy(p);
2383 vm_page_unlock_queues();
2384 mpte = pmap_enter_quick(pmap,
2385 addr + amd64_ptob(tmpidx), p, mpte);
2386 vm_page_lock_queues();
2387 vm_page_wakeup(p);
2388 }
2389 vm_page_unlock_queues();
2390 }
2391 }
2392
2393 /*
2394 * pmap_prefault provides a quick way of clustering
2395 * page faults into a process's address space. It is a "cousin"
2396 * of pmap_object_init_pt, except it runs at page fault time instead
2397 * of mmap time.
2398 */
2399 #define PFBAK 4
2400 #define PFFOR 4
2401 #define PAGEORDER_SIZE (PFBAK+PFFOR)
2402
2403 static int pmap_prefault_pageorder[] = {
2404 -1 * PAGE_SIZE, 1 * PAGE_SIZE,
2405 -2 * PAGE_SIZE, 2 * PAGE_SIZE,
2406 -3 * PAGE_SIZE, 3 * PAGE_SIZE,
2407 -4 * PAGE_SIZE, 4 * PAGE_SIZE
2408 };
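/*
 * The table above walks outward from the faulting address, alternating
 * between one page behind and one page ahead, out to PFBAK pages back
 * and PFFOR pages forward, so nearer pages are tried first.
 */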
2409
2410 void
2411 pmap_prefault(pmap, addra, entry)
2412 pmap_t pmap;
2413 vm_offset_t addra;
2414 vm_map_entry_t entry;
2415 {
2416 int i;
2417 vm_offset_t starta;
2418 vm_offset_t addr;
2419 vm_pindex_t pindex;
2420 vm_page_t m, mpte;
2421 vm_object_t object;
2422 pd_entry_t *pde;
2423
2424 if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)))
2425 return;
2426
2427 object = entry->object.vm_object;
2428
2429 starta = addra - PFBAK * PAGE_SIZE;
2430 if (starta < entry->start) {
2431 starta = entry->start;
2432 } else if (starta > addra) {
2433 starta = 0;
2434 }
2435
2436 mpte = NULL;
2437 for (i = 0; i < PAGEORDER_SIZE; i++) {
2438 vm_object_t lobject;
2439 pt_entry_t *pte;
2440
2441 addr = addra + pmap_prefault_pageorder[i];
2442 if (addr > addra + (PFFOR * PAGE_SIZE))
2443 addr = 0;
2444
2445 if (addr < starta || addr >= entry->end)
2446 continue;
2447
2448 pde = pmap_pde(pmap, addr);
2449 if (pde == NULL || (*pde & PG_V) == 0)
2450 continue;
2451
2452 pte = vtopte(addr);
2453 if ((*pte & PG_V) == 0)
2454 continue;
2455
2456 pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
2457 lobject = object;
2458 for (m = vm_page_lookup(lobject, pindex);
2459 (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object));
2460 lobject = lobject->backing_object) {
2461 if (lobject->backing_object_offset & PAGE_MASK)
2462 break;
2463 pindex += (lobject->backing_object_offset >> PAGE_SHIFT);
2464 m = vm_page_lookup(lobject->backing_object, pindex);
2465 }
2466
2467 /*
2468 * Give up when a page is not in memory.
2469 */
2470 if (m == NULL)
2471 break;
2472 vm_page_lock_queues();
2473 if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2474 (m->busy == 0) &&
2475 (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2476
2477 if ((m->queue - m->pc) == PQ_CACHE) {
2478 vm_page_deactivate(m);
2479 }
2480 vm_page_busy(m);
2481 vm_page_unlock_queues();
2482 mpte = pmap_enter_quick(pmap, addr, m, mpte);
2483 vm_page_lock_queues();
2484 vm_page_wakeup(m);
2485 }
2486 vm_page_unlock_queues();
2487 }
2488 }
2489
2490 /*
2491 * Routine: pmap_change_wiring
2492 * Function: Change the wiring attribute for a map/virtual-address
2493 * pair.
2494 * In/out conditions:
2495 * The mapping must already exist in the pmap.
2496 */
2497 void
2498 pmap_change_wiring(pmap, va, wired)
2499 register pmap_t pmap;
2500 vm_offset_t va;
2501 boolean_t wired;
2502 {
2503 register pt_entry_t *pte;
2504
2505 if (pmap == NULL)
2506 return;
2507
2508 /*
2509 * Wiring is not a hardware characteristic, so there is no need to
2510 * invalidate the TLB.
2511 */
2512 pte = pmap_pte(pmap, va);
2513 if (wired && (*pte & PG_W) == 0) {
2514 pmap->pm_stats.wired_count++;
2515 *pte |= PG_W;
2516 } else if (!wired && (*pte & PG_W) != 0) {
2517 pmap->pm_stats.wired_count--;
2518 *pte &= ~PG_W;
2519 }
2520 }
2521
2522
2523
2524 /*
2525 * Copy the range specified by src_addr/len
2526 * from the source map to the range dst_addr/len
2527 * in the destination map.
2528 *
2529 * This routine is only advisory and need not do anything.
2530 */
2531
2532 void
2533 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2534 vm_offset_t src_addr)
2535 {
2536 vm_offset_t addr;
2537 vm_offset_t end_addr = src_addr + len;
2538 vm_offset_t pdnxt;
2539 vm_page_t m;
2540
2541 if (dst_addr != src_addr)
2542 return;
2543
2544 if (!pmap_is_current(src_pmap))
2545 return;
2546
2547 for (addr = src_addr; addr < end_addr; addr = pdnxt) {
2548 pt_entry_t *src_pte, *dst_pte;
2549 vm_page_t dstmpte, srcmpte;
2550 pd_entry_t srcptepaddr, *pde;
2551 vm_pindex_t ptepindex;
2552
2553 if (addr >= UPT_MIN_ADDRESS)
2554 panic("pmap_copy: invalid to pmap_copy page tables\n");
2555
2556 /*
2557 * Don't let optional prefaulting of pages make us go
2558 * way below the low water mark of free pages or way
2559 * above the high water mark of used pv entries.
2560 */
2561 if (cnt.v_free_count < cnt.v_free_reserved ||
2562 pv_entry_count > pv_entry_high_water)
2563 break;
2564
2565 pdnxt = (addr + NBPDR) & ~PDRMASK;
2566 ptepindex = pmap_pde_pindex(addr);
2567
2568 pde = pmap_pde(src_pmap, addr);
2569 if (pde)
2570 srcptepaddr = *pde;
2571 else
2572 continue;
2573 if (srcptepaddr == 0)
2574 continue;
2575
2576 if (srcptepaddr & PG_PS) {
2577 pde = pmap_pde(dst_pmap, addr);
2578 if (pde == 0) {
2579 /*
2580 * XXX should do an allocpte here to
2581 * instantiate the pde
2582 */
2583 continue;
2584 }
2585 if (*pde == 0) {
2586 *pde = srcptepaddr;
2587 dst_pmap->pm_stats.resident_count +=
2588 NBPDR / PAGE_SIZE;
2589 }
2590 continue;
2591 }
2592
2593 srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex);
2594 if ((srcmpte == NULL) ||
2595 (srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY))
2596 continue;
2597
2598 if (pdnxt > end_addr)
2599 pdnxt = end_addr;
2600
2601 src_pte = vtopte(addr);
2602 while (addr < pdnxt) {
2603 pt_entry_t ptetemp;
2604 ptetemp = *src_pte;
2605 /*
2606 * We only virtual-copy managed pages.
2607 */
2608 if ((ptetemp & PG_MANAGED) != 0) {
2609 /*
2610 * We have to check after allocpte for the
2611 * pte still being around... allocpte can
2612 * block.
2613 */
2614 dstmpte = pmap_allocpte(dst_pmap, addr);
2615 dst_pte = pmap_pte(dst_pmap, addr);
2616 if ((*dst_pte == 0) && (ptetemp = *src_pte)) {
2617 /*
2618 * Clear the modified and
2619 * accessed (referenced) bits
2620 * during the copy.
2621 */
2622 m = PHYS_TO_VM_PAGE(ptetemp);
2623 *dst_pte = ptetemp & ~(PG_M | PG_A);
2624 dst_pmap->pm_stats.resident_count++;
2625 pmap_insert_entry(dst_pmap, addr,
2626 dstmpte, m);
2627 } else {
2628 vm_page_lock_queues();
2629 pmap_unwire_pte_hold(dst_pmap, addr, dstmpte);
2630 vm_page_unlock_queues();
2631 }
2632 if (dstmpte->hold_count >= srcmpte->hold_count)
2633 break;
2634 }
2635 addr += PAGE_SIZE;
2636 src_pte++;
2637 }
2638 }
2639 }
2640
2641 /*
2642 * pmap_zero_page zeros the specified hardware page through its
2643 * direct map address, using pagezero to clear its contents.
2644 */
2645 void
2646 pmap_zero_page(vm_page_t m)
2647 {
2648 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2649
2650 pagezero((void *)va);
2651 }
2652
2653 /*
2654 * pmap_zero_page_area zeros part of the specified hardware page
2655 * through its direct map address, using pagezero or bzero.
2656 *
2657 * off and size may not cover an area beyond a single hardware page.
2658 */
2659 void
2660 pmap_zero_page_area(vm_page_t m, int off, int size)
2661 {
2662 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2663
2664 if (off == 0 && size == PAGE_SIZE)
2665 pagezero((void *)va);
2666 else
2667 bzero((char *)va + off, size);
2668 }
2669
2670 /*
2671 * pmap_zero_page_idle zeros the specified hardware page through its
2672 * direct map address using pagezero. This
2673 * is intended to be called from the vm_pagezero process only and
2674 * outside of Giant.
2675 */
2676 void
2677 pmap_zero_page_idle(vm_page_t m)
2678 {
2679 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2680
2681 pagezero((void *)va);
2682 }
2683
2684 /*
2685 * pmap_copy_page copies the specified (machine independent)
2686 * page by using the direct map addresses of the source and
2687 * destination pages and bcopy to copy the page, one machine
2688 * dependent page at a time.
2689 */
2690 void
2691 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
2692 {
2693 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
2694 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
2695
2696 bcopy((void *)src, (void *)dst, PAGE_SIZE);
2697 }
2698
2699 /*
2700 * Returns true if the pmap's pv is one of the first
2701 * 16 pvs linked to from this page. This count may
2702 * be changed upwards or downwards in the future; it
2703 * is only necessary that true be returned for a small
2704 * subset of pmaps for proper page aging.
2705 */
2706 boolean_t
2707 pmap_page_exists_quick(pmap, m)
2708 pmap_t pmap;
2709 vm_page_t m;
2710 {
2711 pv_entry_t pv;
2712 int loops = 0;
2713 int s;
2714
2715 if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2716 return FALSE;
2717
2718 s = splvm();
2719 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2720 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2721 if (pv->pv_pmap == pmap) {
2722 splx(s);
2723 return TRUE;
2724 }
2725 loops++;
2726 if (loops >= 16)
2727 break;
2728 }
2729 splx(s);
2730 return (FALSE);
2731 }
2732
2733 #define PMAP_REMOVE_PAGES_CURPROC_ONLY
2734 /*
2735 * Remove all pages from the specified address space;
2736 * this aids process exit speeds. Also, this code
2737 * is special-cased for the current process only, but
2738 * can have the more generic (and slightly slower)
2739 * mode enabled. This is much faster than pmap_remove
2740 * in the case of running down an entire address space.
2741 */
2742 void
2743 pmap_remove_pages(pmap, sva, eva)
2744 pmap_t pmap;
2745 vm_offset_t sva, eva;
2746 {
2747 pt_entry_t *pte, tpte;
2748 vm_page_t m;
2749 pv_entry_t pv, npv;
2750 int s;
2751
2752 #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2753 if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) {
2754 printf("warning: pmap_remove_pages called with non-current pmap\n");
2755 return;
2756 }
2757 #endif
2758 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2759 s = splvm();
2760 for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
2761
2762 if (pv->pv_va >= eva || pv->pv_va < sva) {
2763 npv = TAILQ_NEXT(pv, pv_plist);
2764 continue;
2765 }
2766
2767 #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2768 pte = vtopte(pv->pv_va);
2769 #else
2770 pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2771 #endif
2772 tpte = *pte;
2773
2774 if (tpte == 0) {
2775 printf("TPTE at %p IS ZERO @ VA %08lx\n",
2776 pte, pv->pv_va);
2777 panic("bad pte");
2778 }
2779
2780 /*
2781 * We cannot remove wired pages from a process' mapping at this time
2782 */
2783 if (tpte & PG_W) {
2784 npv = TAILQ_NEXT(pv, pv_plist);
2785 continue;
2786 }
2787
2788 m = PHYS_TO_VM_PAGE(tpte);
2789 KASSERT(m->phys_addr == (tpte & PG_FRAME),
2790 ("vm_page_t %p phys_addr mismatch %016jx %016jx",
2791 m, (uintmax_t)m->phys_addr, (uintmax_t)tpte));
2792
2793 KASSERT(m < &vm_page_array[vm_page_array_size],
2794 ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte));
2795
2796 pv->pv_pmap->pm_stats.resident_count--;
2797
2798 pte_clear(pte);
2799
2800 /*
2801 * Update the vm_page_t clean and reference bits.
2802 */
2803 if (tpte & PG_M) {
2804 vm_page_dirty(m);
2805 }
2806
2807 npv = TAILQ_NEXT(pv, pv_plist);
2808 TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
2809
2810 m->md.pv_list_count--;
2811 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2812 if (TAILQ_FIRST(&m->md.pv_list) == NULL) {
2813 vm_page_flag_clear(m, PG_WRITEABLE);
2814 }
2815
2816 pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
2817 free_pv_entry(pv);
2818 }
2819 splx(s);
2820 pmap_invalidate_all(pmap);
2821 }
2822
2823 /*
2824 * pmap_is_modified:
2825 *
2826 * Return whether or not the specified physical page was modified
2827 * in any physical maps.
2828 */
2829 boolean_t
2830 pmap_is_modified(vm_page_t m)
2831 {
2832 pv_entry_t pv;
2833 pt_entry_t *pte;
2834 int s;
2835
2836 if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2837 return FALSE;
2838
2839 s = splvm();
2840 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2841 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2842 /*
2843 * Mappings within the kernel's clean submap are never
2844 * tracked as modified, so skip them here (see
2845 * pmap_track_modified).
2846 */
2847 if (!pmap_track_modified(pv->pv_va))
2848 continue;
2849 #if defined(PMAP_DIAGNOSTIC)
2850 if (!pv->pv_pmap) {
2851 printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va);
2852 continue;
2853 }
2854 #endif
2855 pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2856 if (*pte & PG_M) {
2857 splx(s);
2858 return TRUE;
2859 }
2860 }
2861 splx(s);
2862 return (FALSE);
2863 }
2864
2865 /*
2866 * this routine is used to modify bits in ptes
2867 */
2868 static __inline void
2869 pmap_changebit(vm_page_t m, int bit, boolean_t setem)
2870 {
2871 register pv_entry_t pv;
2872 register pt_entry_t *pte;
2873 int s;
2874
2875 if (!pmap_initialized || (m->flags & PG_FICTITIOUS) ||
2876 (!setem && bit == PG_RW && (m->flags & PG_WRITEABLE) == 0))
2877 return;
2878
2879 s = splvm();
2880 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2881 /*
2882 * Loop over all current mappings, setting/clearing as appropriate.
2883 * If setting RO, do we need to clear the VAC?
2884 */
2885 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2886 /*
2887 * don't write protect pager mappings
2888 */
2889 if (!setem && (bit == PG_RW)) {
2890 if (!pmap_track_modified(pv->pv_va))
2891 continue;
2892 }
2893
2894 #if defined(PMAP_DIAGNOSTIC)
2895 if (!pv->pv_pmap) {
2896 printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va);
2897 continue;
2898 }
2899 #endif
2900
2901 pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2902
2903 if (setem) {
2904 *pte |= bit;
2905 pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2906 } else {
2907 pt_entry_t pbits = *pte;
2908 if (pbits & bit) {
2909 if (bit == PG_RW) {
2910 if (pbits & PG_M) {
2911 vm_page_dirty(m);
2912 }
2913 pte_store(pte, pbits & ~(PG_M|PG_RW));
2914 } else {
2915 pte_store(pte, pbits & ~bit);
2916 }
2917 pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2918 }
2919 }
2920 }
2921 if (!setem && bit == PG_RW)
2922 vm_page_flag_clear(m, PG_WRITEABLE);
2923 splx(s);
2924 }
2925
2926 /*
2927 * pmap_page_protect:
2928 *
2929 * Lower the permission for all mappings to a given page.
2930 */
2931 void
2932 pmap_page_protect(vm_page_t m, vm_prot_t prot)
2933 {
2934 if ((prot & VM_PROT_WRITE) == 0) {
2935 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
2936 pmap_changebit(m, PG_RW, FALSE);
2937 } else {
2938 pmap_remove_all(m);
2939 }
2940 }
2941 }
2942
2943 /*
2944 * pmap_ts_referenced:
2945 *
2946 * Return a count of reference bits for a page, clearing those bits.
2947 * It is not necessary for every reference bit to be cleared, but it
2948 * is necessary that 0 only be returned when there are truly no
2949 * reference bits set.
2950 *
2951 * XXX: The exact number of bits to check and clear is a matter that
2952 * should be tested and standardized at some point in the future for
2953 * optimal aging of shared pages.
2954 */
2955 int
2956 pmap_ts_referenced(vm_page_t m)
2957 {
2958 register pv_entry_t pv, pvf, pvn;
2959 pt_entry_t *pte;
2960 pt_entry_t v;
2961 int s;
2962 int rtval = 0;
2963
2964 if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2965 return (rtval);
2966
2967 s = splvm();
2968 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2969 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2970
2971 pvf = pv;
2972
2973 do {
2974 pvn = TAILQ_NEXT(pv, pv_list);
2975
2976 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2977
2978 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2979
2980 if (!pmap_track_modified(pv->pv_va))
2981 continue;
2982
2983 pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2984
2985 if (pte && ((v = pte_load(pte)) & PG_A) != 0) {
2986 pte_store(pte, v & ~PG_A);
2987 pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2988
2989 rtval++;
2990 if (rtval > 4) {
2991 break;
2992 }
2993 }
2994 } while ((pv = pvn) != NULL && pv != pvf);
2995 }
2996 splx(s);
2997
2998 return (rtval);
2999 }
3000
3001 /*
3002 * Clear the modify bits on the specified physical page.
3003 */
3004 void
3005 pmap_clear_modify(vm_page_t m)
3006 {
3007 pmap_changebit(m, PG_M, FALSE);
3008 }
3009
3010 /*
3011 * pmap_clear_reference:
3012 *
3013 * Clear the reference bit on the specified physical page.
3014 */
3015 void
3016 pmap_clear_reference(vm_page_t m)
3017 {
3018 pmap_changebit(m, PG_A, FALSE);
3019 }
3020
3021 /*
3022 * Miscellaneous support routines follow
3023 */
3024
3025 static void
3026 amd64_protection_init()
3027 {
3028 register long *kp, prot;
3029
3030 #if 0
3031 #define PG_NX (1ul << 63)
3032 #else
3033 #define PG_NX 0
3034 #endif
3035
3036 kp = protection_codes;
3037 for (prot = 0; prot < 8; prot++) {
3038 switch (prot) {
3039 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE:
3040 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE:
3041 *kp++ = PG_NX;
3042 break;
3043 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE:
3044 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE:
3045 *kp++ = 0;
3046 break;
3047 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE:
3048 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE:
3049 *kp++ = PG_RW | PG_NX;
3050 break;
3051 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE:
3052 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE:
3053 *kp++ = PG_RW;
3054 break;
3055 }
3056 }
3057 }
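/*
 * The protection_codes[] table built above is indexed by a vm_prot_t
 * value (presumably via pte_prot() in pmap_enter above). For example,
 * VM_PROT_READ | VM_PROT_WRITE selects PG_RW | PG_NX, which with PG_NX
 * currently defined to 0 is simply PG_RW.
 */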
3058
3059 /*
3060 * Map a set of physical memory pages into the kernel virtual
3061 * address space. Return a pointer to where it is mapped. This
3062 * routine is intended to be used for mapping device memory,
3063 * NOT real memory.
3064 */
3065 void *
3066 pmap_mapdev(pa, size)
3067 vm_paddr_t pa;
3068 vm_size_t size;
3069 {
3070 vm_offset_t va, tmpva, offset;
3071
3072 offset = pa & PAGE_MASK;
3073 size = roundup(offset + size, PAGE_SIZE);
3074
3075 GIANT_REQUIRED;
3076
3077 va = kmem_alloc_pageable(kernel_map, size);
3078 if (!va)
3079 panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
3080
3081 pa = pa & PG_FRAME;
3082 for (tmpva = va; size > 0; ) {
3083 pmap_kenter(tmpva, pa);
3084 size -= PAGE_SIZE;
3085 tmpva += PAGE_SIZE;
3086 pa += PAGE_SIZE;
3087 }
3088 pmap_invalidate_range(kernel_pmap, va, tmpva);
3089 return ((void *)(va + offset));
3090 }
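/*
 * Example (hypothetical driver usage): mapping a page-sized register
 * window at physical address 0xfee00000 could look like
 *
 *	regs = pmap_mapdev(0xfee00000, PAGE_SIZE);
 *	...
 *	pmap_unmapdev((vm_offset_t)regs, PAGE_SIZE);
 *
 * The byte offset within the first page is preserved in the returned
 * pointer, so unaligned physical addresses work as expected.
 */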
3091
3092 void
3093 pmap_unmapdev(va, size)
3094 vm_offset_t va;
3095 vm_size_t size;
3096 {
3097 vm_offset_t base, offset, tmpva;
3098 pt_entry_t *pte;
3099
3100 base = va & PG_FRAME;
3101 offset = va & PAGE_MASK;
3102 size = roundup(offset + size, PAGE_SIZE);
3103 for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) {
3104 pte = vtopte(tmpva);
3105 pte_clear(pte);
3106 }
3107 pmap_invalidate_range(kernel_pmap, va, tmpva);
3108 kmem_free(kernel_map, base, size);
3109 }
3110
3111 /*
3112 * perform the pmap work for mincore
3113 */
3114 int
3115 pmap_mincore(pmap, addr)
3116 pmap_t pmap;
3117 vm_offset_t addr;
3118 {
3119 pt_entry_t *ptep, pte;
3120 vm_page_t m;
3121 int val = 0;
3122
3123 ptep = pmap_pte(pmap, addr);
3124 if (ptep == 0) {
3125 return 0;
3126 }
3127
3128 if ((pte = *ptep) != 0) {
3129 vm_paddr_t pa;
3130
3131 val = MINCORE_INCORE;
3132 if ((pte & PG_MANAGED) == 0)
3133 return val;
3134
3135 pa = pte & PG_FRAME;
3136
3137 m = PHYS_TO_VM_PAGE(pa);
3138
3139 /*
3140 * Modified by us
3141 */
3142 if (pte & PG_M)
3143 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
3144 else {
3145 /*
3146 * Modified by someone else
3147 */
3148 vm_page_lock_queues();
3149 if (m->dirty || pmap_is_modified(m))
3150 val |= MINCORE_MODIFIED_OTHER;
3151 vm_page_unlock_queues();
3152 }
3153 /*
3154 * Referenced by us
3155 */
3156 if (pte & PG_A)
3157 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
3158 else {
3159 /*
3160 * Referenced by someone else
3161 */
3162 vm_page_lock_queues();
3163 if ((m->flags & PG_REFERENCED) ||
3164 pmap_ts_referenced(m)) {
3165 val |= MINCORE_REFERENCED_OTHER;
3166 vm_page_flag_set(m, PG_REFERENCED);
3167 }
3168 vm_page_unlock_queues();
3169 }
3170 }
3171 return val;
3172 }
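/*
 * The MINCORE_* bits accumulated in val above correspond to the
 * per-page status flags returned to userland by the mincore(2)
 * system call.
 */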
3173
3174 void
3175 pmap_activate(struct thread *td)
3176 {
3177 struct proc *p = td->td_proc;
3178 pmap_t pmap;
3179 u_int64_t cr3;
3180
3181 critical_enter();
3182 pmap = vmspace_pmap(td->td_proc->p_vmspace);
3183 pmap->pm_active |= PCPU_GET(cpumask);
3184 cr3 = vtophys(pmap->pm_pml4);
3185 /* XXXKSE this is wrong.
3186 * pmap_activate is for the current thread on the current cpu
3187 */
3188 if (p->p_flag & P_THREADED) {
3189 /* Make sure all other cr3 entries are updated. */
3190 /* what if they are running? XXXKSE (maybe abort them) */
3191 FOREACH_THREAD_IN_PROC(p, td) {
3192 td->td_pcb->pcb_cr3 = cr3;
3193 }
3194 } else {
3195 td->td_pcb->pcb_cr3 = cr3;
3196 }
3197 load_cr3(cr3);
3198 critical_exit();
3199 }
3200
3201 vm_offset_t
3202 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
3203 {
3204
3205 if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3206 return addr;
3207 }
3208
3209 addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
3210 return addr;
3211 }